In [None]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
pathtoJK = "../JKBio"
sys.path.insert(0, pathtoJK)
import TerraFunction as terra
import CCLF_processing as cclf
from IPython.core.debugger import set_trace
import ipdb

from Helper import *
import numpy as np
from gsheets import Sheets
# https://github.com/jkobject/JKBIO

"""
Log into the Google Developers Console with the Google account whose spreadsheets you want to access.
Create (or select) a project and enable the Drive API and Sheets API (under Google Apps APIs).

https://console.developers.google.com/

Go to the Credentials for your project and create New credentials > OAuth client ID > of type Other.
In the list of your OAuth 2.0 client IDs click Download JSON for the Client ID you just created.
Save the file as client_secrets.json in your home directory (user directory).
Another file, named storage.json in this example, will be created after successful authorization
to cache OAuth data.

On you first usage of gsheets with this file (holding the client secrets),
your webbrowser will be opened, asking you to log in with your Google account to authorize
this client read access to all its Google Drive files and Google Sheets.
"""
# Build the Google Sheets client from the OAuth client-secret file and the cached-token file.
# NOTE(review): the instructions string above says to save the secrets as client_secrets.json,
# while the path used here is ~/.client_secret.json — confirm which filename is intended.
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
# Sample-type abbreviation -> full label.
# NOTE(review): `replace` is not referenced by any other cell visible in this notebook.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

# CCLF TWIST Pipeline

*go to the [readme](./README.md) to see more about execution*



This pipeline has the following major steps:
1. Pull in information about the TWIST batch(es) from Google sheet(s).
2. Create a TSV of the new sample information
3. Create a TSV of the new sample set information (e.g. cohorts)
4. Upload the sample information and sample set TSVs to the Terra workspace 
5. Run Terra workflows to get copy number (CNV) and mutation (SNV) information, and to create copy number heat maps by batch and by cohort.


# Initialization
Pull in information about the TWIST batch(es) from Google sheet(s).

**Note:** The following cell contains a lot of information that needs to be changed each time this pipeline is run.

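Set `samplesetnames` to the batches you are interested in, and list the matching external sheet for each batch in `gsheeturllist` (one URL per batch, in the same order as `samplesetnames`).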

In [None]:
# create sample set names for each batch
# if you only have one batch to run, still make it a list e.g. ["CCLF_TWIST1"]
# this ensures that the pipeline will run as designed
# NOTE: the order must match `gsheeturllist` below; the batch label of every sheet row is taken
# from the name at the same position in this list.
samplesetnames = ["CCLF_TWIST1","CCLF_TWIST2","CCLF_TWIST3","CCLF_TWIST4"]
# derived sample set / pair set names, one per batch
samplesetnames_normals = [s + '_normals' for s in samplesetnames]
samplesetnames_tumors = [s + '_tumors' for s in samplesetnames]
samplesetnames_pairs = [s + '_pairs' for s in samplesetnames]
samplesetnames_all = [s + '_all' for s in samplesetnames]

date="2019" # not using currently; could also get this from release_date column (in Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq). Might be useful to include?

# workspace where we are pulling in the data from
data_workspace="broad-genomics-delivery/Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq"
# workspace where we are running the workflows
proc_workspace="nci-mimoun-bi-org/PANCAN_TWIST copy"
# proc_workspace="CCLF_Targeted"

# constant written to the `source` column of every new sample
source="CCLF"

# constant written to the `picard_aggregation_type_validation` column of every new sample
picard_aggregation_type_validation="PCR"
# sample ids listed here are kept as "new" even if they already exist in the processing workspace
forcekeep=[]
# sheet mapping collection names ('Name' column) to cohort ids ('ID' column),
# i.e. mapping abbreviations to full names/descriptions
cohorts2id="https://docs.google.com/spreadsheets/d/1R97pgzoX0YClGDr5nmQYQwimnKXxDBGnGzg7YPlhZJU"
#mapping abbreviations to full names/descriptions

# list of the external sheets produced for each batch you want to run through the pipeline
# (one URL per entry of `samplesetnames`, in the same order)
gsheeturllist = ["https://docs.google.com/spreadsheets/d/1LR8OFylVClxf0kmZpAdlVjrn3RBcfZKpNoDYtKdnHB8",
"https://docs.google.com/spreadsheets/d/1S3DqBdVkd9dLP1PDYcdSWuD2Iy2gJpzuYBhvmP37UxU",
"https://docs.google.com/spreadsheets/d/1kVIeIw66AxWLhAZlqUnAY17S87Rtfhijf1o3x0hG3Jw",
"https://docs.google.com/spreadsheets/d/1tZQpxag7BO46pei3s_KaoHvxwN9EVESk3xYvzW7f7Uo/"]


In [None]:
# Workspace manager for the delivery workspace we read samples from.
wfrom = dm.WorkspaceManager(data_workspace)
# Workspace manager for the processing workspace we upload to and run workflows in.
wto = dm.WorkspaceManager(proc_workspace)
# Later cells pass `wto_namespace_workspace` as the first argument of the TerraFunction helpers
# (addToSampleSet, addToPairSet, waitForSubmission), but no visible cell defines it.
# NOTE(review): assumes those helpers expect the "namespace/workspace" name string, not the
# manager object — confirm against TerraFunction.
wto_namespace_workspace = proc_workspace

# Getting the samples

- we load the samples from data workspace and load the metadata files
- we remove data that has already been processed
- we create the final ids

In [None]:
# Samples already present in the processing workspace; their ids are used later to skip
# anything that has been processed before.
refsamples = wto.get_samples()
refids = refsamples.index

# Download the first tab of every external sheet as a DataFrame (one frame per batch).
gsheets = []
for url in gsheeturllist:
    gsheets.append(sheets.get(url).sheets[0].to_frame())

# Plain concatenation, only to check how many rows we have in total.
# The batch label column is added when `metadata` is rebuilt in a later cell.
metadata = pd.concat(gsheets, sort=False)
len(metadata)

In [None]:
# Inspect the distinct values of the sheets' own 'Batch Number' column.
# the existing Batch Number column is clearly not well-maintained. This is why we add in a column with batch information when creating sample_info.
metadata['Batch Number'].unique()

In [None]:
# add a column with batch information (e.g. CCLF_TWIST1 vs CCLF_TWIST2):
# concatenating with `keys` labels every row with the batch name of the sheet it came from,
# which is why `gsheeturllist` and `samplesetnames` must be in the same order.
metadata = pd.concat(gsheets, sort=False, keys=samplesetnames)
metadata = metadata.reset_index().rename(columns={'level_0': 'batch'}).drop(['level_1'], axis='columns')

# cohort sheet: maps a collection 'Name' to a cohort 'ID'
cohorts = sheets.get(cohorts2id).sheets[0].to_frame()

# samples delivered in the data workspace; NaN replaced by '' as a precaution
samples1 = wfrom.get_samples().replace(np.nan, '', regex=True)

# creating sample_id (like in processing workspace) for metadata and samples1
# Rows missing any of these columns cannot be given an id and are dropped.
# `axis` is passed by keyword: the positional form `dropna(0, ...)` is rejected by pandas >= 2.
id_columns = ['Collaborator Sample ID', 'Sample Type', 'Exported DNA SM-ID']
newmetadata = metadata.dropna(axis=0, subset=id_columns).copy()
print("dropped indices: "+str(set(metadata.index.tolist())-set(newmetadata.index.tolist())))
print('new length: '+str(len(newmetadata)))
metadata = newmetadata

ttype = metadata["Sample Type"].tolist()
# sample_id = <collaborator sample id without its last character>-<sample type>-<SM-ID>
metadata['sample_id'] = [
    str(collab_id[:-1]) + '-' + str(sample_type) + '-' + str(sm_id)
    for collab_id, sample_type, sm_id in zip(metadata['Collaborator Sample ID'],
                                             metadata['Sample Type'],
                                             metadata['Exported DNA SM-ID'])
]

# the third '_'-separated field of the delivery sample id becomes the new index
samples1.index = [sample_name.split('_')[2] for sample_name in samples1.index]

samples1['sample_id'] = [
    str(alias) + '-' + str(sample_type) + '-' + sm_id
    for alias, sample_type, sm_id in zip(samples1["individual_alias"],
                                         samples1['sample_type'],
                                         samples1.index)
]
metadata.index = metadata['Exported DNA SM-ID']
# filtering on what already exists in the processing workspace (refids);
# ids listed in `forcekeep` are kept regardless
newsamples = samples1[(~samples1.index.isin(refids)) | samples1.index.isin(forcekeep)]
# only samples that have both sheet metadata and a delivered sample can be processed
tokeep = set(metadata.index) & set(newsamples.index)
len(tokeep)

In [None]:
# The SM-ID is one of the only unique ids shared by both frames, so it is what we filter on.
# Report delivered samples that have no row in the external sheets ...
without_metadata = newsamples[~newsamples.index.isin(tokeep)]
if len(without_metadata) > 0:
    print('we could not add these as we dont have metadata for them: ' + '\n' + str(without_metadata.index))
# ... then keep only the ids present on both sides.
newsamples = newsamples[newsamples.index.isin(tokeep)]
kept_metadata = metadata[metadata.index.isin(tokeep)]
newmetadata = kept_metadata.sort_index().drop_duplicates("Exported DNA SM-ID")
newsamples.shape

In [None]:
# Distinct bait sets among the new samples (this column is later copied into
# sample_info['aggregation_product_name_validation']).
newsamples['bait_set'].unique()

# Creating the sample information dataframe
Create a dataframe of the new sample information

**Note:** It can be difficult to recreate the sample_info variable below after you have already uploaded TSVs to Terra since this pipeline specifically looks for samples that do not already exist in the workspace. When running the pipeline on a new batch of data, **I recommend writing the final sample_info to a file.**

**Note 2:** We replace all "/" in the External IDs with "_". This prevents errors when filepaths are created using the external IDs in Terra.

In [None]:
# Get all the external IDs into one column: where 'external_id_validation' is empty,
# fall back to the value in 'External ID'.
merged_external_ids = newmetadata['external_id_validation'].fillna(newmetadata['External ID'])
newmetadata['external_id_validation'] = merged_external_ids
# Show the result next to the batch and the original column for a visual check.
print(newmetadata[['batch','external_id_validation','External ID']].to_string())

In [None]:
print('creating new sample information df')
# Column-wise join of the sheet metadata and the delivered sample attributes (both indexed by SM-ID).
df = pd.concat([newmetadata, newsamples], axis=1, sort=True)
# from this new set we create a dataframe which will get uploaded to terra
# .copy() gives an independent frame, so the column assignments below are not chained
# assignments on a slice of `df`.
sample_info = df[['crai_or_bai_path', 'cram_or_bam_path']].copy()
sample_info['batch'] = df['batch'].astype(str)
# sample_info['pt_id'] = df['PT_ID'].astype(str) # add the patient ID; currently not in the External Sheets; Remi will add
sample_info['individual_id'] = df['Collaborator Participant ID'].astype(str)
sample_info['reference_id'] = df['Exported DNA SM-ID'].astype(str)
sample_info['participant'] = df['Collaborator Participant ID'].astype(str)
sample_info['aggregation_product_name_validation'] = df['bait_set'].astype(str)

# External ids: the same external id may already be present in the processing workspace
# (possibly several times, for different samples). In that case we append a counter so the
# new id stays distinguishable.
# External ids already used in the processing workspace (empty if the column does not exist yet).
workspace_ext_ids = refsamples.get('external_id_validation', pd.Series(dtype=object))
new_ext_ids = []
for ref_id in sample_info['reference_id']:
    raw_ext_id = df.loc[df.index == ref_id, 'external_id_validation'].values[0]
    # replace any "/" with "_"; otherwise the id looks like a new directory when file paths are built
    ext_id = str(raw_ext_id).replace('/', '_')
    # substring match (not ==) so previously suffixed ids such as "<id>_2" are counted too;
    # regex=False treats the id literally, na=False ignores samples without an external id
    num_in_workspace = int(workspace_ext_ids.str.contains(ext_id, regex=False, na=False).sum())
    if num_in_workspace > 0:  # maybe do this for all so we're consistent?
        # tack on a number to distinguish external IDs that we have run more than once
        num_already_seen_here = sum(ext_id in seen for seen in new_ext_ids)
        new_ext_ids.append(ext_id + '_' + str(num_in_workspace + num_already_seen_here + 1))
    else:
        new_ext_ids.append(ext_id)
sample_info['external_id_validation'] = new_ext_ids

sample_info['bsp_sample_id_validation'] = df.index.astype(str)
sample_info['stock_sample_id_validation'] = df['Stock DNA SM-ID'].astype(str)
sample_info['sample_type'] = df['Sample Type'].astype(str)
sample_info['picard_aggregation_type_validation'] = [picard_aggregation_type_validation] * sample_info.shape[0]
sample_info['tumor_subtype'] = df['Tumor Type'].astype(str)
sample_info['squid_sample_id_validation'] = sample_info['external_id_validation']
sample_info['source_subtype_validation'] = df['Original Material Type'].astype(str)
sample_info['processed_subtype_validation'] = df['Material Type'].astype(str)
sample_info['primary_disease'] = df['Primary Disease'].astype(str)
sample_info['media'] = df['Media on Tube'].astype(str)
sample_info['Collection'] = df['Collection'].astype(str)

# Map every sample's collection name to its cohort id; report collections we cannot map.
# Series.items() replaces iteritems(), which no longer exists in pandas >= 2.
cohortlist = []
for sample_key, collection in sample_info['Collection'].items():
    matching_cohorts = cohorts[cohorts['Name'] == collection]
    if len(matching_cohorts) == 0:
        print("we do not have a corresponding cohort for this collection for sample: " + str(sample_key))
        cohortlist.append('nan')
    else:
        cohortlist.append(matching_cohorts['ID'].values[0])
sample_info['cohorts'] = cohortlist

sample_info['tissue_site'] = df['Tissue Site'].astype(str)
sample_info['source'] = [source] * sample_info.shape[0]
sample_info['sample_id'] = df.index.astype(str)

sample_info = sample_info.set_index('sample_id')

In [None]:
# Run this chunk to save the sample_info TSV to a file. I highly recommend this when running a pipeline on a new batch.
# This way, if anything goes wrong in the workspace, you can fall back to this.
# The file is tab-separated and includes the sample_id index as its first column.
filepath = './sample_info.tsv' # edit this if you want this to save to a different location
sample_info.to_csv(filepath, sep='\t')

We do not include samples that were missing information in any of the following columns in the external sheet:
- Collaborator Participant ID
- Exported DNA SM-ID
- Stock DNA SM-ID
- Sample Type
- Tumor Type
- Original Material Type
- Material Type
- Primary Disease
- Media on Tube
- Collection
- Tissue Site

Without this list of metadata, the samples will not be added to Terra.

In [None]:
# Columns of the external sheet that must all be filled in for a sample to be uploaded.
required_columns = ['Collaborator Participant ID', 'Exported DNA SM-ID',
                    'Stock DNA SM-ID', 'Sample Type', 'Tumor Type',
                    'Original Material Type', 'Material Type', 'Primary Disease',
                    'Media on Tube', 'Collection', 'Tissue Site']
# True for every row that misses at least one required value.
has_missing_info = df[required_columns].isna().any(axis=1).values
print('Since they don\'t have full data, we are dropping: \n' +
      str(df.index[has_missing_info].tolist()))
df = df.loc[~has_missing_info]
# sample_info (the frame that is actually uploaded) was built from the unfiltered df and stores
# missing values as the string 'nan', so the same samples have to be removed from it explicitly.
sample_info = sample_info.loc[sample_info.index.isin(df.index.astype(str))]

In [None]:
# Per-column count of missing values in both frames.
# Note: the sample_info columns were built with .astype(str), so a missing value there shows up
# as the string 'nan' and is not counted by isna().
df.isna().sum(),sample_info.isna().sum()

# Creating the pairs
Create a TSV of the new pairs information.

In [None]:
# Boolean masks reused below; combining masks with & (instead of chaining df[m1][m2])
# avoids pandas' "Boolean Series key will be reindexed" warning.
new_is_normal = sample_info['sample_type'] == "Normal"
new_is_tumor = sample_info['sample_type'] == "Tumor"
prev_is_normal = refsamples['sample_type'] == "Normal"
prev_is_tumor = refsamples['sample_type'] == "Tumor"

# participants / sample ids of the new normals and tumors
normals = sample_info.loc[new_is_normal, "participant"].tolist()
normalsid = sample_info.index[new_is_normal].tolist()
tumors = sample_info.loc[new_is_tumor, "participant"].tolist()
tumorsid = sample_info.index[new_is_tumor].tolist()
# participants of the tumors / normals already in the processing workspace
prevtumors = refsamples.loc[prev_is_tumor, "participant"].tolist()
prevnormals = refsamples.loc[prev_is_normal, "participant"].tolist()

print("creating new pairs...")
# do we have new tumors/normals for our previous ones
newpairs = {'pair_id': [], 'case_sample': [], 'control_sample': [], 'participant': [], 'match_type':[]}


def _add_pair(tumor_id, normal_id, participant, match_type):
    """Append one tumor/normal pair to the `newpairs` column lists."""
    newpairs['pair_id'].append(tumor_id + '_' + normal_id)
    newpairs['case_sample'].append(tumor_id)
    newpairs['control_sample'].append(normal_id)
    newpairs['participant'].append(participant)
    newpairs['match_type'].append(match_type)


# 1) new tumors whose participant already has a normal in the processing workspace:
#    pair them with the first such normal. Sorted so the pair order is reproducible.
toreprocess_normals = set(tumors) & set(prevnormals)
for participant in sorted(toreprocess_normals):
    if participant != 'nan':
        normal_id = refsamples.index[(refsamples['participant'] == participant) & prev_is_normal].tolist()[0]
        for tumor_id in sample_info.index[(sample_info['participant'] == participant) & new_is_tumor].tolist():
            _add_pair(tumor_id, normal_id, participant, "Tumor_Normal")

# 2) all other new tumors: pair with the first new normal of the same participant if there is
#    one, otherwise with the placeholder 'NA'.
paired = set(tumors) & set(normals)
for participant in sorted(set(tumors) - toreprocess_normals):
    if participant != 'nan':
        has_new_normal = participant in paired
        if has_new_normal:
            normal_id = sample_info.index[(sample_info['participant'] == participant) & new_is_normal].tolist()[0]
        else:
            normal_id = 'NA'
        for tumor_id in sample_info.index[(sample_info['participant'] == participant) & new_is_tumor].tolist():
            _add_pair(tumor_id, normal_id, participant, "Tumor_Normal" if has_new_normal else 'Tumor_NA')

newpairs = pd.DataFrame(newpairs).set_index('pair_id')

# Create pair sets and sample sets

In the following cell, we create:
- a pair set for each batch
- sample sets for each batch 
- sample sets for each cohort

And then we upload these entities to the Terra workspace.

**Note:** all the entities (e.g. sample, sample set, participant tsv) need to exist! Else it will raise an error and block further uploads to Terra. You can do this by just uploading TSVs with NA. The below code does this automatically for the sample TSV.

In [None]:
print("uploading new samples...")
wto.upload_samples(sample_info)
# Unmatched tumors reference the control sample 'NA', so a placeholder sample with that id
# has to exist in the workspace.
if "NA" not in wto.get_samples().index.tolist():
    wto.upload_samples(pd.DataFrame({'sample_id':['NA'], 'participant_id':['NA']}).set_index('sample_id'))

print("creating pairs and pairsets...")
wto.upload_entities('pair', newpairs)

samplesetnames_normals = [s + '_normals' for s in samplesetnames]
samplesetnames_tumors = [s + '_tumors' for s in samplesetnames]
samplesetnames_pairs = [s + '_pairs' for s in samplesetnames]
samplesetnames_all = [s + '_all' for s in samplesetnames]

cohorts_per_batch = {} # dict of cohorts in each batch; we do not actually use this for anything. Could be used for QC.
for i, batch_name in enumerate(samplesetnames):
    # get appropriate subset of the samples for each batch
    batch_sample_info = sample_info[sample_info['batch'] == batch_name]
    batch_is_tumor = batch_sample_info['sample_type'] == "Tumor"

    # create a pair set for each batch, containing only the pairs whose tumor belongs to it
    batch_pairs = newpairs[newpairs['case_sample'].isin(batch_sample_info.index[batch_is_tumor])].index.tolist()
    if len(batch_pairs) > 0:
        wto.update_pair_set(samplesetnames_pairs[i], batch_pairs)
    else:
        print("no new pairs for batch: " + str(batch_name))

    cohorts_in_batch = []
    # for each batch, add its samples / pairs to the cohort-level sets
    for cohort_id in cohorts['ID'].values:
        in_cohort = batch_sample_info["cohorts"] == cohort_id
        cohortsamples = batch_sample_info.index[in_cohort].tolist()
        tumorsamplesincohort = batch_sample_info.index[in_cohort & batch_is_tumor].tolist()
        pairsamples = newpairs[newpairs['case_sample'].isin(tumorsamplesincohort)].index.tolist()
        if len(cohortsamples)>0:
            cohorts_in_batch.append(cohort_id)
            try:
                terra.addToSampleSet(wto_namespace_workspace, cohort_id, cohortsamples)
            except KeyError: # we may not have this set yet
                print("KeyError for sampleset: " + str(cohort_id))
                wto.update_sample_set(cohort_id, cohortsamples)
        if len(pairsamples)>0:
            try:
                terra.addToPairSet(wto_namespace_workspace, cohort_id, pairsamples)
            except KeyError: # we may not have this set yet
                print("KeyError for pairset: " + str(cohort_id))
                wto.update_pair_set(cohort_id, pairsamples)
    # key by the batch's name (dict.update(batch_name=...) would store the literal key 'batch_name')
    cohorts_per_batch[batch_name] = cohorts_in_batch

print("creating sample sets...")
# want to create a sample set for each batch
for i, batch_name in enumerate(samplesetnames):
    # get appropriate subset of the samples
    batch_sample_info = sample_info[sample_info['batch'] == batch_name]
    # define batch-specific tumors and normals (ids selected on each row's own sample_type)
    batch_normalsid = batch_sample_info.index[batch_sample_info['sample_type'] == "Normal"].tolist()
    batch_tumorsid = batch_sample_info.index[batch_sample_info['sample_type'] == "Tumor"].tolist()
    # create batch-level sample sets
    wto.update_sample_set(sample_set_id=samplesetnames_all[i], sample_ids=batch_sample_info.index.tolist())
    wto.update_sample_set(sample_set_id=samplesetnames_tumors[i], sample_ids=batch_tumorsid)
    wto.update_sample_set(sample_set_id=samplesetnames_normals[i], sample_ids=batch_normalsid)

# create sample sets for all samples in workspace, and all normals in workspace
# Same as cum pon but better
# New normals plus the pre-existing normals of the workspace, de-duplicated in order.
# Built as a new list so that re-running this cell does not keep growing `normalsid`.
prev_normalsid = refsamples.index[refsamples['sample_type'] == "Normal"].tolist()
all_normalsid = list(dict.fromkeys(normalsid + prev_normalsid))

try:
    terra.addToSampleSet(wto_namespace_workspace, samplesetid="All_normals_TWIST", samples=all_normalsid)
except KeyError:
    wto.update_sample_set(sample_set_id="All_normals_TWIST", sample_ids=all_normalsid)
# every sample of the workspace except the 'NA' placeholder
all_samples = [sample_id for sample_id in wto.get_samples().index.tolist() if sample_id != 'NA']
try:
    terra.addToSampleSet(wto_namespace_workspace, samplesetid="All_samples_TWIST", samples=all_samples)
except KeyError:
    wto.update_sample_set(sample_set_id="All_samples_TWIST", sample_ids=all_samples)

# Running Terra Workflows
Run Terra workflows to get copy number (CNV) and mutation (SNV) information, and to create copy number heat maps by batch and by cohort.

The order of running the workflows is as follows:
- RenameBAM_TWIST
- CalculateTargetCoverage_PANCAN, 
    + DepthOfCov_PANCAN
- CreatePanelOfNormalsGATK_PANCAN, 
    + DepthOfCovQC_PANCAN
- CallSomaticCNV_PANCAN
- MutationCalling_Normals_TWIST
- FilterGermlineVariants_NormalSample_TWIST
- CreatePoNSNV_Mutect1, 
    + CreatePoNSNV_Mutect2
- PlotSomaticCNVMaps_PANCAN: we plot CN heat maps for each batch and also for each cohort
- SNV_PostProcessing_Normals, 
    + MutationCalling_Tumors_TWIST
- FilterGermlineEvents_TumorSample
- SNVPostProcessing_TWIST, 
    + FNG_Compile_Pileup_Cnt
- FNG_Compile_db_slow_download
- FNG_Query_db

More information about the pipeline exists here: https://cclf.gitbook.io/tsca/

**Note 1:** If, for some reason, one of the Terra submission functions gives no output and does not seem to submit anything to Terra, you may have been logged out of Terra. In that case, reload the workspace manager and the package.

**Note 2:** If you get the preflight error "expression and etype must BOTH be None or a string value", check the workflow configuration. This occurs when you pass in expression and etype information, but the etype is already set as the "rootEntity" aka the default for the workflow. You can fix this by either changing the workflow configuration in Terra, or by not passing in the etype or expression. If you want to see why this error occurs, look at the preflight function in lapdog.py (https://github.com/broadinstitute/lapdog/blob/master/lapdog/lapdog.py).

In [None]:
# Display the result of get_configs() for the processing workspace
# (presumably the workflow configurations available to submit — confirm in dalmatian).
wto.get_configs()

In [None]:
# Remind the user where submissions can be monitored / cancelled (spaces in the workspace
# name are URL-encoded).
print("Creating Terra submissions: remember you can only cancel \
    or interact with terra submissions from the Terra website. \
    https://app.terra.bio/#workspaces/"+proc_workspace.replace(" ", "%20")+"/job_history")

# One RenameBAM_TWIST submission per "<batch>_all" sample set, expanded with 'this.samples'.
rename_options = {'entity': 'sample_set', 'expression': 'this.samples'}
RenameBAM_TWIST = terra.createManySubmissions(wto, "RenameBAM_TWIST", samplesetnames_all, **rename_options)

print("waiting for 'Rename'")
terra.waitForSubmission(wto_namespace_workspace, RenameBAM_TWIST)

In [None]:
# Both coverage workflows are submitted for every "<batch>_all" sample set with the same options.
coverage_options = {'entity': 'sample_set', 'expression': 'this.samples'}
CalculateTargetCoverage_PANCAN = terra.createManySubmissions(
    wto, "CalculateTargetCoverage_PANCAN", samplesetnames_all, **coverage_options)
DepthOfCov_PANCAN = terra.createManySubmissions(
    wto, "DepthOfCov_PANCAN", samplesetnames_all, **coverage_options)

print("waiting for 'CalculateTargetCoverage' & 'DepthOfCov_PANCAN'")
# wait for the submissions of both workflows together
combined_list = CalculateTargetCoverage_PANCAN + DepthOfCov_PANCAN
terra.waitForSubmission(wto_namespace_workspace, combined_list)

In [None]:
# The CNV panel of normals is built from each batch's own normals ("<batch>_normals"),
# not from all normals in the workspace.
CreatePanelOfNormalsGATK_PANCAN = terra.createManySubmissions(
    wto, "CreatePanelOfNormalsGATK_PANCAN", samplesetnames_normals)
# Depth-of-coverage QC is submitted for every "<batch>_all" sample set.
depth_qc_options = {'entity': 'sample_set', 'expression': 'this.samples'}
DepthOfCovQC_PANCAN = terra.createManySubmissions(
    wto, "DepthOfCovQC_PANCAN", samplesetnames_all, **depth_qc_options)

print("waiting for 'DepthOfCovQC_PANCAN' & 'CNV_CreatePoNForCNV'")
combined_list = DepthOfCovQC_PANCAN + CreatePanelOfNormalsGATK_PANCAN
terra.waitForSubmission(wto_namespace_workspace, combined_list)

In [None]:
# Display the configuration of the CallSomaticCNV_PANCAN workflow before submitting it.
wto.get_config('CallSomaticCNV_PANCAN')

In [None]:
# Somatic CNV calling for every "<batch>_all" sample set; call caching is switched off.
cnv_options = {'entity': 'sample_set', 'expression': 'this.samples', 'use_callcache': False}
CallSomaticCNV_PANCAN = terra.createManySubmissions(
    wto, "CallSomaticCNV_PANCAN", samplesetnames_all, **cnv_options)

print("waiting for 'CallSomaticCNV_PANCAN'")
terra.waitForSubmission(wto_namespace_workspace, CallSomaticCNV_PANCAN)

In [None]:
# Mutation calling on the normals of each batch ("<batch>_normals" sample sets).
normals_calling_options = {'entity': 'sample_set', 'expression': 'this.samples'}
MutationCalling_Normals_TWIST = terra.createManySubmissions(
    wto, "MutationCalling_Normals_TWIST", samplesetnames_normals, **normals_calling_options)

print("waiting for 'MutationCalling_Normals_TWIST'")
terra.waitForSubmission(wto_namespace_workspace, MutationCalling_Normals_TWIST)

In [None]:
# Call caching is disabled here: it caused errors on TWIST1-3 (no errors for TWIST4).
germline_filter_options = {'entity': 'sample_set', 'expression': 'this.samples', 'use_callcache': False}
FilterGermlineVariants_NormalSample_TWIST = terra.createManySubmissions(
    wto, "FilterGermlineVariants_NormalSample_TWIST", samplesetnames_normals, **germline_filter_options)

print("waiting for 'SNV_FilterGermline'")
terra.waitForSubmission(wto_namespace_workspace, FilterGermlineVariants_NormalSample_TWIST)

In [None]:
# create PON for SNV from all the normals we have in the workspace so far
CreatePoNSNV_Mutect1 = wto.create_submission('CreatePoNSNV_Mutect1', "All_normals_TWIST")
CreatePoN_SNV_MuTect2 = wto.create_submission('CreatePoN_SNV_MuTect2', "All_normals_TWIST")

print("waiting for 'CreatePoN_SNV_MuTect2' & 'CreatePoNSNV_Mutect1'")
# Collect the submissions into one flat list. Using `+` on the two results would glue two
# submission-id strings into a single, invalid id.
# NOTE(review): create_submission is assumed to return one submission id; a list/tuple of ids
# is also handled in case it does not.
combined_list = []
for submission in (CreatePoNSNV_Mutect1, CreatePoN_SNV_MuTect2):
    if isinstance(submission, (list, tuple)):
        combined_list.extend(submission)
    else:
        combined_list.append(submission)
terra.waitForSubmission(wto_namespace_workspace, combined_list)

In [None]:
## re-create sample_info by pulling in sample data from the Terra workspace
## (this replaces the locally built frame, so the cells below also work in a fresh session;
## they read its 'batch' and 'cohorts' columns)
sample_info = wto.get_samples()

In [None]:
## create / re-create cohorts_per_batch dictionary
cohorts_per_batch = {} # dict: batch name -> cohort ids that have at least one sample in that batch
all_changed_cohorts = set() # every cohort touched by any of the batches
for batch_name in samplesetnames:
    # get appropriate subset of the samples for each batch
    batch_sample_info = sample_info[sample_info['batch'] == batch_name]
    # keep the cohorts (in sheet order) that contain at least one sample of this batch
    cohorts_in_batch = [cohort_id for cohort_id in cohorts['ID'].values
                        if (batch_sample_info["cohorts"] == cohort_id).any()]
    cohorts_per_batch[batch_name] = cohorts_in_batch
    all_changed_cohorts.update(cohorts_in_batch) # add all the new cohorts in this batch to the full list
# cohorts_per_batch
all_changed_cohorts

In [None]:
## note: the workflow will fail when running on a cohort that has only 1 sample (requires 2+)

# One CNV heat map per cohort that received new samples (regardless of the batch) ...
for cohort_id in all_changed_cohorts:
    wto.create_submission("PlotSomaticCNVMaps_PANCAN", cohort_id)

# ... and one CNV heat map per batch ("<batch>_all" sample sets).
PlotSomaticCNVMaps_PANCAN = terra.createManySubmissions(
    wto, "PlotSomaticCNVMaps_PANCAN", samplesetnames_all)

print("submitted final jobs for CNV pipeline")
print("you don't need to wait before moving onto the next cell")

In [None]:
# Post-process the normals of each batch and, in parallel, call mutations on each batch's pairs.
SNV_PostProcessing_Normals = terra.createManySubmissions(
    wto, "SNV_PostProcessing_Normals", samplesetnames_normals)
tumor_calling_options = {'entity': 'pair_set', 'expression': 'this.pairs'}
MutationCalling_Tumors_TWIST = terra.createManySubmissions(
    wto, "MutationCalling_Tumors_TWIST", samplesetnames_pairs, **tumor_calling_options)

print("waiting for 'SNV_PostProcessing' & 'MutationCalling_Tumors_TWIST'")
combined_list = SNV_PostProcessing_Normals + MutationCalling_Tumors_TWIST
terra.waitForSubmission(wto_namespace_workspace, combined_list)

In [None]:
## note: the workflow needs cohorts with at least 2 acceptable CL to run (if only 1, then the workflow will fail)
# Germline-event filtering on the pairs of each batch ("<batch>_pairs" pair sets).
FilterGermlineEvents_TumorSample = terra.createManySubmissions(
    wto,
    'FilterGermlineEvents_TumorSample',
    samplesetnames_pairs,
    'pair_set',
    expression='this.pairs',
)
print("waiting for 'FilterGermlineEvents_TumorSample'")
terra.waitForSubmission(wto_namespace_workspace, FilterGermlineEvents_TumorSample)

In [None]:
# SNV post-processing is submitted on the pair sets themselves
# (no entity='pair_set' / expression='this.pairs' override here).
SNVPostProcessing_TWIST = terra.createManySubmissions(
    wto, "SNVPostProcessing_TWIST", samplesetnames_pairs)
print("Submitted final jobs for SNV pipeline")

# sometimes get space errors when run FNG_Compile_Pileup_Cnt if use 4 GB; changed to 10 GB
# NOTE(review): `to_run` is not used by any visible cell.
to_run = ['SM-IF6GU', 'SM-IF6H4', 'SM-IF6HX']
pileup_options = {'entity': 'sample_set', 'expression': 'this.samples'}
FNG_Compile_Pileup_Cnt = terra.createManySubmissions(
    wto, "FNG_Compile_Pileup_Cnt", samplesetnames_all, **pileup_options)
print("waiting for 'FNG_Compile_Pileup_Cnt'")
terra.waitForSubmission(wto_namespace_workspace, FNG_Compile_Pileup_Cnt)

In [None]:
# Compile the fingerprinting database from the set holding every sample of the workspace.
fng_db_sample_set = "All_samples_TWIST"
FNG_Compile_db_slow_download = wto.create_submission("FNG_Compile_db_slow_download", fng_db_sample_set)
print("waiting for 'FNG_Compile_db'")
terra.waitForSubmission(wto_namespace_workspace, FNG_Compile_db_slow_download)

In [None]:
# Last step: query the fingerprinting database for every "<batch>_all" sample set.
FNG_Query_db = terra.createManySubmissions(
    wto,
    "FNG_Query_db",
    samplesetnames_all,
)
print("Submitted final FNG Job")
print('Done')