In [None]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
pathtoJK = "../JKBio"
sys.path.insert(0, pathtoJK)
import TerraFunction as terra
import CCLF_processing as cclf
from IPython.core.debugger import set_trace
import ipdb

from Helper import *
import numpy as np
from gsheets import Sheets
# https://github.com/jkobject/JKBIO

"""
Log into the Google Developers Console with the Google account whose spreadsheets you want to access.
Create (or select) a project and enable the Drive API and Sheets API (under Google Apps APIs).

https://console.developers.google.com/

Go to the Credentials for your project and create New credentials > OAuth client ID > of type Other.
In the list of your OAuth 2.0 client IDs click Download JSON for the Client ID you just created.
Save the file as .client_secret.json in your home directory (user directory).
Another file, named .storage.json in this example, will be created after successful authorization
to cache OAuth data.

On your first usage of gsheets with this file (holding the client secrets),
your web browser will be opened, asking you to log in with your Google account to authorize
this client read access to all its Google Drive files and Google Sheets.
"""
# Google Sheets client: the first file holds the OAuth client secrets, the second one is
# where the authorization is cached (see the setup notes above).
client_secret_path = '~/.client_secret.json'
oauth_cache_path = '~/.storage.json'
sheets = Sheets.from_files(client_secret_path, oauth_cache_path)
# One-letter code -> label lookup (presumably sample-type codes; not used in the visible cells).
replace = dict(T='Tumor', N='Normal', m='Unknown', L='Unknown')

# Initialization

In [None]:
# One base sample-set name per batch; the derived lists below keep the same order and
# add the suffix of the Terra set they name.
samplesetnames = ["CCLF_TWIST1", "CCLF_TWIST2", "CCLF_TWIST3", "CCLF_TWIST4"]
samplesetnames_normals = ["{}_normals".format(name) for name in samplesetnames]
samplesetnames_tumors = ["{}_tumors".format(name) for name in samplesetnames]
samplesetnames_pairs = ["{}_pairs".format(name) for name in samplesetnames]
samplesetnames_all = ["{}_all".format(name) for name in samplesetnames]

# not using currently; could also get this from the release_date column
# (in Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq)
date = "2019"

# workspace the raw data is read from
data_namespace = "broad-genomics-delivery"
data_workspace = "Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq"

# workspace the processing happens in
proc_namespace = "nci-mimoun-bi-org"
# proc_workspace="CCLF_Targeted"
proc_workspace = "PANCAN_TWIST copy"  ## check: change later

source = "CCLF"
site = "HT33MBCX2"  # is this used? where to get this info?
tsca_id = "TWIST1-4"  # is this used?

# where can we find this information? The bait_set column in
# Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq
TSCA_version = "TWISTv1"
picard_aggregation_type_validation = "PCR"
# sample ids to keep even when they already exist in the processing workspace
forcekeep = []

# cohort dict
cohorts2id = "https://docs.google.com/spreadsheets/d/1R97pgzoX0YClGDr5nmQYQwimnKXxDBGnGzg7YPlhZJU"

# external sheets produced for each batch you want to run through the pipeline,
# listed in the same order as samplesetnames
gsheeturllist = [
    "https://docs.google.com/spreadsheets/d/1LR8OFylVClxf0kmZpAdlVjrn3RBcfZKpNoDYtKdnHB8",  # PanCan 1
    "https://docs.google.com/spreadsheets/d/1S3DqBdVkd9dLP1PDYcdSWuD2Iy2gJpzuYBhvmP37UxU",  # TWIST 2 New
    "https://docs.google.com/spreadsheets/d/1kVIeIw66AxWLhAZlqUnAY17S87Rtfhijf1o3x0hG3Jw",  # TWIST 3 New
    "https://docs.google.com/spreadsheets/d/1tZQpxag7BO46pei3s_KaoHvxwN9EVESk3xYvzW7f7Uo/",  # TWIST 4
]
# wfrom: the workspace the raw samples are read from; wto: the processing workspace
# that new samples, pairs and sets are uploaded to.
wfrom = dm.WorkspaceManager(data_namespace, data_workspace)
wto = dm.WorkspaceManager(proc_namespace, proc_workspace)

# Getting the samples

- we load the samples from data workspace and load the metadata files
- we remove data that has already been processed
- we create the final ids

In [None]:
# Samples that already exist in the processing workspace (wto); their index (refids) is
# used further down to skip anything that was processed before.
refsamples = wto.get_samples()
refids = refsamples.index

In [None]:
# Load the first worksheet of every batch spreadsheet as a DataFrame, in gsheeturllist order.
gsheets = []
for sheet_url in gsheeturllist:
    first_worksheet = sheets.get(sheet_url).sheets[0]
    gsheets.append(first_worksheet.to_frame())

In [None]:
# Stack the sheets of all batches into one frame to check the total number of rows.
# NOTE: this plain concat does not add any batch column; `metadata` is rebuilt with batch
# labels (pd.concat(..., keys=...)) in a later cell.
metadata = pd.concat(gsheets,sort=False)
len(metadata)

In [None]:
# Column names of the combined sheets.
# NOTE: a cell only displays its last expression, so this sorted list is computed but not shown.
sorted(metadata.columns.values.tolist())
# the existing Batch Number column is clearly not well-maintained. This is why I add in a column with batch information.
metadata['Batch Number'].unique()

In [None]:
# Add a column with batch information (e.g. CCLF_TWIST1 vs CCLF_TWIST2).
# The labels come from samplesetnames instead of a second hard-coded list, so they always
# match the values used later to select the samples of each batch.
if len(gsheets) != len(samplesetnames):
    raise ValueError("expected one google sheet per sample set name, got "
                     + str(len(gsheets)) + " sheets for "
                     + str(len(samplesetnames)) + " names")
metadata = pd.concat(gsheets, sort=False, keys=samplesetnames)
# after reset_index the concat keys are in 'level_0' and the per-sheet row labels in 'level_1'
metadata = (metadata.reset_index()
            .rename(columns={'level_0': 'batch'})
            .drop(['level_1'], axis='columns'))

In [None]:
# cohort dictionary: first worksheet of the cohort spreadsheet
cohorts = sheets.get(cohorts2id).sheets[0].to_frame()

# all samples of the data workspace; missing values are replaced by empty strings
samples1 = wfrom.get_samples().replace(np.nan, '', regex=True)

# A row can only get a sample_id (like in the processing workspace) when these three fields
# are present, so rows missing any of them are dropped.
# axis is passed by keyword: recent pandas versions no longer accept it positionally.
required_id_columns = ['Collaborator Sample ID', 'Sample Type', 'Exported DNA SM-ID']
newmetadata = metadata.dropna(axis=0, subset=required_id_columns)
print("dropped indices: "+str(set(metadata.index.tolist())-set(newmetadata.index.tolist())))
print('new length: '+str(len(newmetadata)))
metadata = newmetadata

In [None]:
# raw sample types of the sheet rows, as a plain list
ttype = list(metadata["Sample Type"])

# sample_id of a sheet row: collaborator sample id without its last character, then the
# sample type and the exported SM-ID, joined with '-'
sheet_sample_ids = []
for _, sheet_row in metadata.iterrows():
    id_parts = [str(sheet_row['Collaborator Sample ID'][:-1]),
                str(sheet_row['Sample Type']),
                str(sheet_row['Exported DNA SM-ID'])]
    sheet_sample_ids.append('-'.join(id_parts))
metadata['sample_id'] = sheet_sample_ids

# re-index the workspace samples by the third '_'-separated field of their current id
samples1.index = [label.split('_')[2] for label in samples1.index]

# sample_id of a workspace sample: individual alias, sample type and the new index label
workspace_sample_ids = []
for sm_id, sample_row in samples1.iterrows():
    workspace_sample_ids.append(
        str(sample_row["individual_alias"]) + '-' + str(sample_row['sample_type']) + '-' + sm_id)
samples1['sample_id'] = workspace_sample_ids

# both frames are now indexed by SM-ID
metadata.index = metadata['Exported DNA SM-ID']

In [None]:
# Keep the workspace samples that are not in the processing workspace yet (refids),
# plus anything explicitly listed in forcekeep.
# but what if a sample was run on TSCA and is later run on TWIST? We may need to think this through more.
already_processed = samples1.index.isin(refids)
forced = samples1.index.isin(forcekeep)
newsamples = samples1[~already_processed | forced]
# ids for which we have both a new sample and a metadata row
tokeep = set(metadata.index).intersection(newsamples.index)
len(tokeep)

In [None]:
# Restrict both frames to the shared ids (sm-id is one of the only unique ids here) so
# they can be merged; new samples without a metadata row are reported and left out.
without_metadata = newsamples[~newsamples.index.isin(tokeep)]
if len(without_metadata) > 0:
    print('we could not add these as we dont have metadata for them: ' + '\n' + str(without_metadata.index))
newsamples = newsamples[newsamples.index.isin(tokeep)]
metadata_to_keep = metadata[metadata.index.isin(tokeep)]
newmetadata = metadata_to_keep.sort_index().drop_duplicates("Exported DNA SM-ID")

In [None]:
# (rows, columns) of the new samples that have metadata
newsamples.shape

In [None]:
# distinct bait sets among the new samples (this column later fills aggregation_product_name_validation)
newsamples['bait_set'].unique()

# Creating the sample information dataframe

In [None]:
print('creating new df')
# Column-wise join of the sheet metadata and the workspace samples on their index
# (both frames were indexed by SM-ID in the earlier cells).
df = pd.concat([newmetadata, newsamples], axis=1, sort=True)

# From this merged frame we build the table that gets uploaded to Terra.
# .copy() makes sample_info an independent frame, so the column assignments below do not
# write into a slice of df (avoids pandas' SettingWithCopyWarning).
sample_info = df[['crai_or_bai_path', 'cram_or_bam_path']].copy()
sample_info['batch'] = df['batch'].astype(str)
sample_info['individual_id'] = df['Collaborator Participant ID'].astype(str)
sample_info['reference_id'] = df['Exported DNA SM-ID'].astype(str)
sample_info['participant'] = df['Collaborator Participant ID'].astype(str)
# sample_info['aggregation_product_name_validation'] = [TSCA_version] * sample_info.shape[0]
sample_info['aggregation_product_name_validation'] = df['bait_set'].astype(str)

# The reference id might already be present (possibly many times) for other samples in the
# processing workspace; in that case the number of existing occurrences is appended.
# (The previous code appended .shape[1], i.e. the number of columns of refsamples, which is
# the same for every id.)
existing_counts = refsamples['external_id_validation'].value_counts()
external_ids = []
for reference_id in sample_info['reference_id']:
    n_existing = int(existing_counts.get(reference_id, 0))
    if n_existing > 0:
        external_ids.append(reference_id + '_' + str(n_existing))
    else:
        external_ids.append(reference_id)
sample_info['external_id_validation'] = external_ids

sample_info['bsp_sample_id_validation'] = df.index.astype(str)
sample_info['stock_sample_id_validation'] = df['Stock DNA SM-ID'].astype(str)
sample_info['sample_type'] = df['Sample Type'].astype(str)
sample_info['picard_aggregation_type_validation'] = [picard_aggregation_type_validation] * sample_info.shape[0]
sample_info['tumor_subtype'] = df['Tumor Type'].astype(str)
sample_info['squid_sample_id_validation'] = sample_info['external_id_validation']
sample_info['source_subtype_validation'] = df['Original Material Type'].astype(str)
sample_info['processed_subtype_validation'] = df['Material Type'].astype(str)
sample_info['primary_disease'] = df['Primary Disease'].astype(str)
sample_info['media'] = df['Media on Tube'].astype(str)
sample_info['Collection'] = df['Collection'].astype(str)

# Translate each collection name into its cohort ID; collections without an entry in the
# cohort sheet are reported and get the placeholder 'nan'.
# Series.items() replaces iteritems(), which no longer exists in pandas 2.x.
cohortlist = []
for k, val in sample_info['Collection'].items():
    res = cohorts[cohorts['Name'] == val]
    if len(res) == 0:
        print("we do not have a corresponding cohort for this collection for sample: " + str(k))
        cohortlist.append('nan')
    else:
        cohortlist.append(res['ID'].values[0])
sample_info['cohorts'] = cohortlist

sample_info['tissue_site'] = df['Tissue Site'].astype(str)
sample_info['source'] = [source] * sample_info.shape[0]
sample_info['sample_id'] = df.index.astype(str)

sample_info = sample_info.set_index('sample_id')

In [None]:
# all column names of the merged metadata + samples frame, alphabetically
sorted(df.columns.values.tolist())

In [None]:
# Fields that must all be filled in for a sample to stay in df.
required_columns = ['Collaborator Participant ID', 'Exported DNA SM-ID',
                    'Stock DNA SM-ID', 'Sample Type', 'Tumor Type',
                    'Original Material Type', 'Material Type', 'Primary Disease',
                    'Media on Tube', 'Collection', 'Tissue Site']
# number of missing required fields for every row, in row order
missing_per_row = df[required_columns].isna().values.sum(1)
incomplete_positions = [pos for pos, n_missing in enumerate(missing_per_row) if n_missing != 0]
complete_positions = [pos for pos, n_missing in enumerate(missing_per_row) if n_missing == 0]

print('Since they don\'t have full data, we are dropping: \n' +
      str(df.iloc[incomplete_positions].index.tolist()))
# NOTE(review): sample_info was already built from the unfiltered df in the cell above, so
# this filter only changes df itself — confirm whether sample_info should be filtered too.
df = df.iloc[complete_positions]

In [None]:
# Missing values per column.
# NOTE: only the last expression of a cell is displayed, so the df counts are computed but not shown.
df.isna().sum()
sample_info.isna().sum()

# Creating the sample_sets

In [None]:
# refsamples['sample_type'].head()
# refsamples.columns.values.tolist()

# for k, val in refsamples.iterrows():
#     print(val.sample_type)

In [None]:
def _sample_ids_for(samples, participant, sample_type):
    """Return the index labels of the rows of `samples` whose 'participant' and
    'sample_type' columns equal the given values, in row order.

    Both conditions are combined into a single mask; the previous chained form
    (samples[mask_a][mask_b]) made pandas re-align the second mask and emit a warning.
    """
    selected = (samples['participant'] == participant) & (samples['sample_type'] == sample_type)
    return samples[selected].index.tolist()


def _record_pair(pairs, tumor_id, normal_id, participant, match_type):
    """Append one pair (named '<tumor_id>_<normal_id>') to the column lists held in `pairs`."""
    pairs['pair_id'].append(tumor_id + '_' + normal_id)
    pairs['case_sample'].append(tumor_id)
    pairs['control_sample'].append(normal_id)
    pairs['participant'].append(participant)
    pairs['match_type'].append(match_type)


# participants / sample ids of the new normals and tumors
new_normal_rows = sample_info[sample_info['sample_type'] == "Normal"]
new_tumor_rows = sample_info[sample_info['sample_type'] == "Tumor"]
normals = new_normal_rows["participant"].tolist()
normalsid = new_normal_rows.index.tolist()
tumors = new_tumor_rows["participant"].tolist()
tumorsid = new_tumor_rows.index.tolist()
# participants that already have a tumor / normal in the processing workspace
prevtumors = [val["participant"] for k, val in refsamples.iterrows() if val.sample_type == "Tumor"]
prevnormals = [val["participant"] for k, val in refsamples.iterrows() if val.sample_type == "Normal"]

print("creating new pairs...")
pair_columns = {'pair_id': [], 'case_sample': [], 'control_sample': [], 'participant': [], 'match_type':[]}

# 1) new tumors whose participant already has a normal in the processing workspace:
#    each one is paired with the first such existing normal
toreprocess_normals = set(tumors) & set(prevnormals)
for val in toreprocess_normals:
    if val != 'nan':
        for tumor_id in _sample_ids_for(sample_info, val, 'Tumor'):
            normal_id = _sample_ids_for(refsamples, val, 'Normal')[0]
            _record_pair(pair_columns, tumor_id, normal_id, val, "Tumor_Normal")

# 2) remaining new tumors: paired with the first new normal of the same participant when
#    there is one, otherwise with the placeholder sample 'NA'
paired = set(tumors) & set(normals)
for val in set(tumors) - toreprocess_normals:
    if val != 'nan':
        for tumor_id in _sample_ids_for(sample_info, val, 'Tumor'):
            if val in paired:
                normal_id = _sample_ids_for(sample_info, val, 'Normal')[0]
                match_type = "Tumor_Normal"
            else:
                normal_id = 'NA'
                match_type = 'Tumor_NA'
            _record_pair(pair_columns, tumor_id, normal_id, val, match_type)

newpairs = pd.DataFrame(pair_columns).set_index('pair_id')

# Uploads to Terra

## all the entities (e.g. sample tsv) need to exist! Else it will raise an error and block further uploads to Terra

In [None]:
## test / scratch
# print(sample_info[sample_info['batch'] == 'CCLF_TWIST1']['batch'].head())
# for i in range(len(samplesetnames)):
#     batch_sample_info = sample_info[sample_info['batch'] == i]
#     print("Printing "+samplesetnames[i])
#     print(tmp['batch'].head())

# batch_sample_info = sample_info[sample_info['batch'] == 'CCLF_TWIST1' ]
# for val in cohorts['ID'].values:
#     cohortsamples=batch_sample_info[batch_sample_info["cohorts"] == val].index.tolist()
#     print(batch_sample_info['cohorts'].unique())
#     print(batch_sample_info[batch_sample_info["cohorts"] == val])
#     print('\n')
# print(cohortsamples)
    
    

In [None]:
print("All the entities need to exist! Else it will raise an error and block further uploads to Terra")
print("uploading new samples...")
wto.upload_samples(sample_info)
# tumors without a normal are paired with the placeholder sample 'NA', so it has to exist
if "NA" not in wto.get_samples().index.tolist():
    wto.upload_samples(pd.DataFrame({'sample_id':['NA'], 'participant_id':['NA']}).set_index('sample_id'))

print("creating pairs and pairsets...")
wto.upload_entities('pair', newpairs)

# One pair set per batch, plus per-cohort sample sets and pair sets.
# cohorts_per_batch maps each batch name to the cohort IDs that have samples in that batch.
cohorts_per_batch = {}
for batch_pos, batch_name in enumerate(samplesetnames):
    # NOTE(review): every batch pair set receives ALL new pairs, not only the pairs of this
    # batch — confirm this is intended.
    wto.update_pair_set(samplesetnames_pairs[batch_pos], newpairs.index.tolist())

    # samples of this batch only
    batch_sample_info = sample_info[sample_info['batch'] == batch_name]
    cohorts_in_batch = []
    cohorts_with_pairs = [] # check: do we use this for anything?
    for val in cohorts['ID'].values:
        in_cohort = batch_sample_info["cohorts"] == val
        is_tumor = batch_sample_info['sample_type'] == "Tumor"
        cohortsamples = batch_sample_info[in_cohort].index.tolist()
        tumorsamplesincohort = batch_sample_info[in_cohort & is_tumor].index.tolist()
        pairsamples = newpairs[newpairs['case_sample'].isin(tumorsamplesincohort)].index.tolist()
        if len(cohortsamples)>0:
            cohorts_in_batch.append(val)
            try:
                terra.addToSampleSet(wto, val, cohortsamples)
            except KeyError: # we may not have this set yet
                print("KeyError for sampleset: " + str(val))
                wto.update_sample_set(val, cohortsamples)
        if len(pairsamples)>0:
            cohorts_with_pairs.append(val)
            try:
                terra.addToPairSet(wto,val, pairsamples)
            except KeyError: # we may not have this set yet
                print("KeyError for pairset: " + str(val))
                wto.update_pair_set(val, pairsamples)
    # keyed by the batch name itself; dict.update(batch_name=...) would store everything
    # under the literal key 'batch_name'
    cohorts_per_batch[batch_name] = cohorts_in_batch

print("creating sample sets...")
# One "_all", "_tumors" and "_normals" sample set per batch.
# The batch index has its own name (batch_pos) so that the per-row loops below cannot
# overwrite it.
for batch_pos, batch_name in enumerate(samplesetnames):
    batch_sample_info = sample_info[sample_info['batch'] == batch_name]
    batch_normal_rows = batch_sample_info[batch_sample_info['sample_type'] == "Normal"]
    batch_tumor_rows = batch_sample_info[batch_sample_info['sample_type'] == "Tumor"]
    batch_normals = batch_normal_rows["participant"].tolist()
    batch_normalsid = batch_normal_rows.index.tolist()
    batch_tumors = batch_tumor_rows["participant"].tolist()
    batch_tumorsid = batch_tumor_rows.index.tolist()
    wto.update_sample_set(sample_set_id=samplesetnames_all[batch_pos], sample_ids=batch_sample_info.index.tolist())
    wto.update_sample_set(sample_set_id=samplesetnames_tumors[batch_pos], sample_ids=batch_tumorsid)
    wto.update_sample_set(sample_set_id=samplesetnames_normals[batch_pos], sample_ids=batch_normalsid)

# Workspace-wide sets: all normals (new + pre-existing) and all samples.
# Same as cum pon but better
# A new list is built instead of extending normalsid in place, so re-running this cell does
# not add the pre-existing normals a second time.
all_normalsid = normalsid + [k for k, val in refsamples.iterrows() if val.sample_type == "Normal"]
# the workspace manager is passed first, like in the addToSampleSet calls above
terra.addToSampleSet(wto, "All_normals_TWIST", all_normalsid)
# the placeholder sample 'NA' is not a real sample
all_samples = [sample_id for sample_id in wto.get_samples().index.tolist() if sample_id != 'NA']
terra.addToSampleSet(wto, "All_samples_TWIST", all_samples)

# Running Terra Workflows

In [None]:
# Remind the user where submissions can be followed; spaces in the workspace names are
# percent-encoded for the URL.
namespace_in_url = proc_namespace.replace(" ", "%20")
workspace_in_url = proc_workspace.replace(" ", "%20")
print("Creating Terra submissions: remember you can only cancel \
    or interact with terra submissions from the Terra website. \
    https://app.terra.bio/#workspaces/" + namespace_in_url + "/" + workspace_in_url + "/job_history")

# Submit RenameBAM_TWIST for every set name in samplesetnames_all, then wait for it.
rename_args = {'entity': 'sample_set', 'expression': 'this.samples'}
RenameBAM_TWIST = terra.createManySubmissions(wto, "RenameBAM_TWIST", samplesetnames_all, **rename_args)
print("waiting for 'Rename'")
terra.waitForSubmission(wto, [RenameBAM_TWIST])

In [None]:
# Coverage workflows: both are submitted for every set name in samplesetnames_all
# (expression 'this.samples'), then waited for together.
coverage_args = {'entity': 'sample_set', 'expression': 'this.samples'}
CalculateTargetCoverage_PANCAN = terra.createManySubmissions(
    wto, "CalculateTargetCoverage", samplesetnames_all, **coverage_args)
DepthOfCov_PANCAN = terra.createManySubmissions(
    wto, "DepthOfCov_PANCAN", samplesetnames_all, **coverage_args)
print("waiting for 'CalculateTargetCoverage' & 'DepthOfCov_PANCAN'")
coverage_submissions = [CalculateTargetCoverage_PANCAN, DepthOfCov_PANCAN]
terra.waitForSubmission(wto, coverage_submissions)

In [None]:
# The panel of normals is now created from the normals of each batch (samplesetnames_normals),
# not from all normals ('All_normals_TWIST').
CreatePanelOfNormalsGATK_PANCAN = terra.createManySubmissions(
    wto, "CreatePanelOfNormalsGATK_PANCAN", samplesetnames_normals)
# Depth-of-coverage QC for every set name in samplesetnames_all.
qc_args = {'entity': 'sample_set', 'expression': 'this.samples'}
DepthOfCovQC_PANCAN = terra.createManySubmissions(
    wto, "DepthOfCovQC_PANCAN", samplesetnames_all, **qc_args)

print("waiting for 'DepthOfCovQC_PANCAN' & 'CNV_CreatePoNForCNV'")
qc_and_pon_submissions = [DepthOfCovQC_PANCAN, CreatePanelOfNormalsGATK_PANCAN]
terra.waitForSubmission(wto, qc_and_pon_submissions)

In [None]:
# DepthOfCovQC_PANCAN = wto.create_submission('DepthOfCovQC_PANCAN', samplesetname + "_all", 'sample_set', expression='this.samples')
# print("waiting for 'DepthOfCovQC_PANCAN' & 'CNV_CreatePoNForCNV'")
# terra.waitForSubmission(wto, [DepthOfCovQC_PANCAN, CreatePanelOfNormalsGATK_PANCAN])

In [None]:
# Submit CallSomaticCNV_PANCAN for every set name in samplesetnames_all, then wait for it.
cnv_args = {'entity': 'sample_set', 'expression': 'this.samples'}
CallSomaticCNV_PANCAN = terra.createManySubmissions(
    wto, "CallSomaticCNV_PANCAN", samplesetnames_all, **cnv_args)
print("waiting for 'CallSomaticCNV_PANCAN'")
terra.waitForSubmission(wto, [CallSomaticCNV_PANCAN])

In [None]:
# Submit MutationCalling_Normals_TWIST for every set name in samplesetnames_normals, then wait.
normals_calling_args = {'entity': 'sample_set', 'expression': 'this.samples'}
MutationCalling_Normals_TWIST = terra.createManySubmissions(
    wto, "MutationCalling_Normals_TWIST", samplesetnames_normals, **normals_calling_args)
print("waiting for 'MutationCalling_Normals_TWIST'")
terra.waitForSubmission(wto, [MutationCalling_Normals_TWIST])

In [None]:
# Submit FilterGermlineVariants_NormalSample_TWIST for every set name in
# samplesetnames_normals, then wait for it.
germline_filter_args = {'entity': 'sample_set', 'expression': 'this.samples'}
FilterGermlineVariants_NormalSample_TWIST = terra.createManySubmissions(
    wto, "FilterGermlineVariants_NormalSample_TWIST", samplesetnames_normals, **germline_filter_args)
print("waiting for 'SNV_FilterGermline'")
terra.waitForSubmission(wto, [FilterGermlineVariants_NormalSample_TWIST])

In [None]:
# Create the SNV panels of normals from all the normals we have in the workspace so far.
# createManySubmissions is given a list of set names everywhere else in this notebook, so the
# single set name is wrapped in a list here as well (a bare string would be handled
# character by character by any code that iterates over it).
all_normals_sets = ['All_normals_TWIST']
CreatePoNSNV_Mutect1 = terra.createManySubmissions(wto, "CreatePoNSNV_Mutect1", all_normals_sets)
CreatePoN_SNV_MuTect2 = terra.createManySubmissions(wto, "CreatePoN_SNV_MuTect2", all_normals_sets)
print("waiting for 'CreatePoN_SNV_MuTect2' & 'CreatePoNSNV_Mutect1'")
terra.waitForSubmission(wto, [CreatePoNSNV_Mutect1, CreatePoN_SNV_MuTect2])

In [None]:
# create CNV map for each batch
PlotSomaticCNVMaps_PANCAN = terra.createManySubmissions(wto, "PlotSomaticCNVMaps_PANCAN", samplesetnames_all)

# create CNV map for each cohort in a batch
# The loop variable is called batch_cohorts (not cohorts) so that it does not overwrite the
# `cohorts` DataFrame loaded from the cohort sheet, which earlier cells rely on when re-run.
for batch, batch_cohorts in cohorts_per_batch.items():
    for val in batch_cohorts:
        wto.create_submission("PlotSomaticCNVMaps_PANCAN", val)
print("submitted final jobs for CNV pipeline")

In [None]:
# Post-processing of the normals (per "<batch>_normals" set name) and tumor mutation calling
# (per "<batch>_pairs" set name, expression 'this.pairs') are submitted together and then
# waited for.
SNV_PostProcessing_Normals = terra.createManySubmissions(
    wto, "SNV_PostProcessing_Normals", samplesetnames_normals)
tumor_calling_args = {'entity': 'pair_set', 'expression': 'this.pairs'}
MutationCalling_Tumors_TWIST = terra.createManySubmissions(
    wto, "MutationCalling_Tumors_TWIST", samplesetnames_pairs, **tumor_calling_args)
print("waiting for 'SNV_PostProcessing' & 'MutationCalling_Tumors_TWIST'")
snv_submissions = [SNV_PostProcessing_Normals, MutationCalling_Tumors_TWIST]
terra.waitForSubmission(wto, snv_submissions)

In [None]:
# Ids of the pairs in `b` whose case sample is listed in `a`.
# NOTE(review): `a` is not defined anywhere in the visible notebook and `b` is only assigned in
# a later cell (b = wto.get_pairs()), so this cell fails when the notebook is run top to bottom.
b[b['case_sample'].isin(a)].index.tolist()

In [None]:
# Deletes from the workspace the pairs whose case sample is listed in `a`.
# NOTE(review): `a` is not defined in the visible notebook and `b` is only assigned in a later
# cell — make sure both hold what you expect before running this.
wto.delete_entity('pair',b[b['case_sample'].isin(a)].index.tolist())

In [None]:
# Deletes the samples listed in `a` from the workspace.
# NOTE(review): `a` is not defined anywhere in the visible notebook.
wto.delete_sample(a)

In [None]:
## two cohorts have not worked because they contained just one acceptable cell line (the workflow needs cohorts with at least 2 acceptable CL, here both had one rejected one)

# Submit FilterGermlineEvents_TumorSample for every set name in samplesetnames_pairs.
# The workspace manager is passed first and the entity by keyword, like in every other
# createManySubmissions call of this notebook (wto was missing here, which shifted all arguments).
FilterGermlineEvents_TumorSample = terra.createManySubmissions(
    wto, 'FilterGermlineEvents_TumorSample', samplesetnames_pairs,
    entity='pair_set', expression='this.pairs')
print("waiting for 'FilterGermlineEvents_TumorSample'")
# NOTE(review): the other cells wrap the submissions in a list before waiting; here the value
# is passed as is — confirm which form waitForSubmission expects.
terra.waitForSubmission(wto, FilterGermlineEvents_TumorSample)

In [None]:
# all pairs currently in the processing workspace
b = wto.get_pairs()

In [None]:
# True for the pairs whose case sample is listed in `a`.
# NOTE(review): `a` is not defined anywhere in the visible notebook.
b['case_sample'].isin(a)

In [None]:
# Recompute match_type for the pairs whose case sample is in `a`: a pair id containing '_NA'
# is a tumor without normal, any other pair is a tumor/normal pair.
case_in_a = b['case_sample'].isin(a)
recomputed_match_types = []
for pair_id in b[case_in_a].index:
    recomputed_match_types.append("Tumor_NA" if '_NA' in pair_id else "Tumor_Normal")
b.loc[case_in_a, 'match_type'] = recomputed_match_types

In [None]:
# Push the pair table edited in the previous cell back to the workspace. The call used to be
# made without any dataframe; `b` is the pair table that was just modified.
# NOTE(review): confirm that uploading all of `b` (rather than only the edited rows) is intended.
wto.upload_entities('pair', b)

In [None]:
# Last SNV step: SNVPostProcessing_TWIST for every set name in samplesetnames_pairs.
snv_post_args = {'entity': 'pair_set', 'expression': 'this.pairs'}
SNVPostProcessing_TWIST = terra.createManySubmissions(
    wto, "SNVPostProcessing_TWIST", samplesetnames_pairs, **snv_post_args)
print("Submitted final jobs for SNV pipeline")

# FNG: FNG_Compile_Pileup_Cnt for every set name in samplesetnames_all, then wait for it.
pileup_args = {'entity': 'sample_set', 'expression': 'this.samples'}
FNG_Compile_Pileup_Cnt = terra.createManySubmissions(
    wto, "FNG_Compile_Pileup_Cnt", samplesetnames_all, **pileup_args)
print("waiting for 'FNG_Compile_Pileup_Cnt'")
terra.waitForSubmission(wto, [FNG_Compile_Pileup_Cnt])


## check: I don't think we have a "All_samples" set yet... we do have a "All_samples_TWIST" though.
# so the database compilation is submitted on "All_samples_TWIST"
fng_db_set = "All_samples_TWIST"
FNG_Compile_db_slow_download = wto.create_submission("FNG_Compile_db_slow_download", fng_db_set)
print("waiting for 'FNG_Compile_db'")
terra.waitForSubmission(wto, [FNG_Compile_db_slow_download])

In [None]:
# Final FNG step: FNG_Query_db for every set name in samplesetnames_all (not waited for).
fng_query_sets = samplesetnames_all
FNG_Query_db = terra.createManySubmissions(wto, "FNG_Query_db", fng_query_sets)
print("Submitted final FNG Job")
print('Done')

## Create and upload nice folders with data per sample or per cohort or per any list provided

In [None]:
# Workspace the report data is taken from.
namespace1 = "nci-mimoun-bi-org"
workspace1 = "CCLF_TSCA_2_0_2"
wm1 = dm.WorkspaceManager(namespace1, workspace1)

# Names pointing at the per-sample outputs (presumably workspace columns — TODO confirm).
pathto_cnvpng = 'segmented_copy_ratio_img'
pathto_stats = 'sample_statistics'
pathto_snv = 'filtered_variants'
pathto_seg = 'cnv_calls'
is_from_pairs = True
# bucket folder for the report
datadir = 'gs://cclf_results/targeted/kim_sept/'

# entries to build a report for
specificlist = ['CCLF_PEDS1012', 'PEDS172', 'PEDS182', 'PEDS196', 'PEDS204']

In [None]:
# Build the report for the entries in specificlist.
# NOTE(review): the datadir passed here ('.../kim_sept_2/') differs from the `datadir` variable
# set in the previous cell ('.../kim_sept/'), and none of the other variables of that cell
# (wm1, pathto_*, is_from_pairs) are passed — confirm which values are intended.
cclf.getReport(datadir='gs://cclf_results/targeted/kim_sept_2/',
               specificlist=specificlist)