# Mutation Pipeline

In [None]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from JKBio.helper.google_sheet import GSheet
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display



%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()

my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
sheets = Sheets.from_files(my_id, mystorage_id)
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

## boot up

We are instantiating all the parameters needed for this pipeline to run.

In [None]:
samplesetname = "20Q4"
prevname="20Q3"
prevversion = 24
prevprevname ='20Q2'
prevprevversion= 22

workspace1="broad-genomics-delivery/Getz_IBM_CellLines_Exomes"
workspace2="broad-firecloud-ccle/CCLE_DepMap_WES"
workspace3="broad-genomics-delivery/CCLE_DepMap_WES"

workspace6="terra-broad-cancer-prod/CCLE_DepMap_WES"

refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"

rnaworkspace="broad-firecloud-ccle/DepMap_hg38_RNAseq"

source1="ibm"
source2="ccle"
source3="ccle"
source6="ccle"
source7="ibm"

refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
refsheet_id = "555466897"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

release = samplesetname


In [None]:
wm1 = dm.WorkspaceManager(workspace1)
wm2 = dm.WorkspaceManager(workspace2)
wm3 = dm.WorkspaceManager(workspace3)

wm6 = dm.WorkspaceManager(workspace6)

refwm = dm.WorkspaceManager(refworkspace)

In [None]:
extract_to_change = {'from_arxspan_id': 'participant'}

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame().set_index("cds_sample_id")

In [None]:
# Resolve the '~'-prefixed credentials path against the current user's home directory
# rather than a hard-coded '/home/jeremie/' prefix, so the notebook runs for any user.
refsheet = GSheet(os.path.expanduser(mystorage_id), refsheet_url, refsheet_id)

## Adding new data

We are looking for new samples in a range of workspaces.

They are quite messy and might contain duplicates, broken file paths...

- We are thus looking at the bam files one by one and comparing them with our own bams. 
- We remove broken files, duplicates and add new version of a cell line's bam if we find some.

In [None]:
# we will be missing "primary disease","sm_id", "cellosaurus_id", "gender, "age", "primary_site", "primary_disease", "subtype", "subsubtype", "origin", "comments"
#when SMid: match== 
samples, pairs, noarxspan = GetNewCellLinesFromWorkspaces(refworkspace, stype='wes', refurl=refsheet_url, wmfroms = [workspace1, workspace2, workspace6], sources=[source1, source2, source6], match=['ACH-','CDS-'], participantslicepos=10, accept_unknowntypes=True, extract=extract_to_change, recomputedate=True)

In [None]:
noarxspan = noarxspan.sort_values(by='stripped_cell_line_name')

In [None]:
noarxspan.to_csv('temp/noarxspan_wes_'+release+'.csv')

### finding back arxspan

In [None]:
noarxspan['arxspan_id'] = ["ACH-001001", "ACH-000520", "ACH-000740", "ACH-000283", "ACH-000757", "ACH-001328", "ACH-000157", "ACH-000557", "ACH-000593", "ACH-001454", "ACH-001456", "ACH-001458", "ACH-001459", "ACH-001460", "ACH-001461", "ACH-000511", "ACH-000025", "ACH-000458", "ACH-001339", "ACH-000662", "ACH-000695", "ACH-000278", "ACH-000123", "ACH-000608", "ACH-001061", "ACH-000698", "ACH-000877", "ACH-001496", "ACH-001497", "ACH-000487", "ACH-001067", "ACH-001500", "ACH-000047", "ACH-001345", "ACH-000840", "ACH-000868", "ACH-000901", "ACH-000143", "ACH-000150", "ACH-000339", "ACH-000872", "ACH-000029", "ACH-000941", "ACH-000946", "ACH-000954", "ACH-000004", "ACH-000005", "ACH-000393", "ACH-001522", "ACH-000274", "ACH-001523", "ACH-000509", "ACH-000672", "ACH-000310", "ACH-000577", "ACH-000237", "ACH-000993", "ACH-001098", "ACH-000167", "ACH-000419", "ACH-000028", "ACH-000823", "ACH-001113", "ACH-000596", "ACH-000591", "ACH-000634", "ACH-000673", "ACH-000676", "ACH-000128", "ACH-000215", "ACH-000760", "ACH-000007", "ACH-000152", "ACH-001550", "ACH-001551", "ACH-001552", "ACH-000019", "ACH-000884", "ACH-001554", "ACH-001555", "ACH-001556", "ACH-001557", "ACH-001558", "ACH-001559", "ACH-001560", "ACH-001561", "ACH-001562", "ACH-000758", "ACH-001563", "ACH-001566", "ACH-001567", "ACH-001568", "ACH-001569", "ACH-001570", "ACH-001129", "ACH-000866", "ACH-000514", "ACH-000921", "ACH-000434", "ACH-000010", "ACH-000912", "ACH-000700", "ACH-000251", "ACH-001075", "ACH-000337", "ACH-000837", "ACH-000800", "ACH-000767", "ACH-000378", "ACH-000200", "ACH-001368", "ACH-000436", "ACH-000247", "ACH-000544", "ACH-000296", "ACH-001373", "ACH-001151", "ACH-000022", "ACH-000606", "ACH-000960", "ACH-000791", "ACH-000774", "ACH-000261", "ACH-000398", "ACH-000473", "ACH-001386", "ACH-001645", "ACH-000887", "ACH-000655", "ACH-000490", "ACH-001190", "ACH-000312", "ACH-001194", "ACH-000017", "ACH-001654", "ACH-000127", "ACH-000302", "ACH-000461", "ACH-000466", "ACH-000736", 
"ACH-000898", "ACH-000537", "ACH-000460", "ACH-000280", "ACH-000316", "ACH-001390", "ACH-001391", "ACH-001394", "ACH-000122", "ACH-000677", "ACH-000820", "ACH-001402", "ACH-000452", "ACH-001210", "ACH-000036", "ACH-000262", "ACH-000304", "ACH-001709", "ACH-000836",]

In [None]:
noarxspan.loc[noarxspan[noarxspan['stripped_cell_line_name']=="SUM299PE1"].index,"stripped_cell_line_name"] = "SUM299PE"

In [None]:
noarxspan = resolveFromWorkspace(noarxspan, refsamples = ccle_refsamples[ccle_refsamples['datatype'] == 'wes'], match = ['ACH','CDS'], participantslicepos = 10, accept_unknowntypes = True, extract = extract_to_change)

In [None]:
#assess any potential issues
set(noarxspan.arxspan_id) & set(samples.arxspan_id)

In [None]:
samples = pd.concat([samples, noarxspan], sort=False)

In [None]:
samples = assessAllSamples(samples, ccle_refsamples, stype='wes', rename={}, extract={})

In [None]:
set(pairs.control_sample)

In [None]:
#TODO: manage the match normals in noarxspan samples

## getting the additional data and writing it here in the right order 'as shown above'
- use the stripped_cell_line_name to find the samples on https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE/edit#gid=356471436. 
- Make sure that we don't have duplicate cell lines in there. Otherwise, use the duplicate renaming function
- copy Primary Site, Primary Disease, Subtype, Comments, Disease Sub-subtype, if they exist (sometimes subtype and subsubtype are the same; don't use subsubtype then).
- look for the cell line in cellosaurus, you might need to use one of the aliases given in master depmap pv..
- copy the cellosaurus_id, gender and age info, or write 'U' if they don't exist. Age can be a number or one of {Embryonic, Children, Adult, Fetus, U}.
- check that it does not say this cell line is a duplicate of another cell line
- check that if it says this cell line is derived/children/father/samepatient from other cell lines, and that if we have any of the other cell lines, that the patient id is changed to be the same one for all (be sure that you are updating everywhere these patient ids are used)

In [None]:
len(ccle_refsamples)

In [None]:
# If a sample's cell line already exists in the reference sheet, copy its known
# annotations onto the new sample instead of leaving them unknown.
# TODO: to functionalize
index = []     # new samples whose arxspan_id was found in the reference
notfound = []  # new samples with no match; they are annotated by hand further down
# Every list starts empty so that it ends up exactly as long as `index`.
# (Seeding 'parent_cell_line' / 'matched_normal' with [''] made them one element
# too long, which breaks the aligned `.loc` assignment below.)
toupdate = {col: [] for col in [
    "sex",
    "primary_disease",
    "sm_id",
    "cellosaurus_id",
    "age",
    "primary_site",
    "subtype",
    "subsubtype",
    "origin",
    "parent_cell_line",
    "matched_normal",
    "comments",
    "participant_id",
]}
for sample_id, val in samples.iterrows():
    dat = ccle_refsamples[ccle_refsamples['arxspan_id'] == val['arxspan_id']]
    if len(dat) > 0:
        index.append(sample_id)
        # take the annotation from the first matching reference row
        for col in toupdate:
            toupdate[col].append(dat[col].tolist()[0])
    else:
        notfound.append(sample_id)
# doing so..
for col, values in toupdate.items():
    samples.loc[index, col] = values
len(samples.loc[notfound].patient_id), samples.loc[notfound].patient_id.tolist()

In [None]:
# found same patient: each list groups arxspan ids that were found to share a patient
a = ["ACH-000635","ACH-000717", "ACH-000864", "ACH-001042", "ACH-001547"]
b = ["ACH-002291","ACH-001672"]
# The participant-consistency check later in the notebook iterates over `samepatient`,
# which was never defined; `a` is also reassigned to a DataFrame before that check runs,
# so the groups are captured here.
samepatient = [a, b]

In [None]:
# duplicate ach-id
dup = {"ACH-001620": "ACH-001605",
"ACH-001621": "ACH-001606"}

In [None]:
if any([i in samples.arxspan_id.tolist() for i in dup.keys()]):
    samples = changeCellLineNameInNewSet(new = samples, ref=ccle_refsamples, datatype="wes", dupdict=dup)

In [None]:
# Second pass of the reference lookup, restricted to the samples that were not
# found the first time (run after the duplicate ACH ids have been handled).
# Snapshot the previous misses BEFORE resetting `notfound`: the original reset the
# list first and then iterated over `samples.loc[notfound]`, i.e. over nothing.
tocheck = list(notfound)
index = []
notfound = []
# All lists start empty so they stay aligned with `index` (no [''] seeds).
toupdate = {col: [] for col in [
    "sex",
    "primary_disease",
    "sm_id",
    "cellosaurus_id",
    "age",
    "primary_site",
    "subtype",
    "subsubtype",
    "parent_cell_line",
    "matched_normal",
    "origin",
    "comments",
    "participant_id",
]}
for sample_id, val in samples.loc[tocheck].iterrows():
    dat = ccle_refsamples[ccle_refsamples['arxspan_id'] == val['arxspan_id']]
    if len(dat) > 0:
        index.append(sample_id)
        # take the annotation from the first matching reference row
        for col in toupdate:
            toupdate[col].append(dat[col].tolist()[0])
    else:
        notfound.append(sample_id)
# doing so..
for col, values in toupdate.items():
    samples.loc[index, col] = values
len(samples.loc[notfound].patient_id), samples.loc[notfound].patient_id.tolist()

In [None]:
samples.loc[notfound]

In [None]:
toupdate = {"sex":["Male"],
"primary_disease":["Leukemia"],
"cellosaurus_id":["CVCL_Y549"],
"age":['Adult'],
"primary_site":["haematopoietic_and_lymphoid_tissue"],
"subtype":["CLL"],
"subsubtype":["b_cell"],
"comments":["B-type chronic lymphocytic leukemia (CLL, Rai stage I at diagnosis)"],
"stripped_cell_line_name":["21MT2"],
"parent_cell_line":[''],
"matched_normal":[''],
"participant_id":['PT-y3RbI7uD']}

In [None]:
a  = pd.DataFrame(toupdate)
a['name'] = samples.loc[notfound,"stripped_cell_line_name"].tolist()
a

In [None]:
# updating..
for k, v in toupdate.items():
    samples.loc[notfound,k] =v

In [None]:
# uploading to our bucket (now a new function)
h.changeToBucket(samples,'gs://cclebams/wes/', name_col= "index" , values=['internal_bam_filepath','internal_bai_filepath'], filetypes=['bam', 'bai'], catchdup=True, test=False)

In [None]:
# Tag every new sample with its bait set (`sampes` was a typo and raised NameError).
# NOTE(review): the 'all_ice' sets built later match on the upper-case string "ICE";
# confirm whether this lower-case 'ice' value is intended.
samples['baits'] = 'ice'

## Check that we have all the cell lines we expect for this release

This involves comparing to the list in the Google sheet "Cell Line Profiling Status."

_As the list cannot be parsed, we are not comparing it for now_

In [None]:
# Give each new sample a version number: the count of WES reference samples already
# recorded for its cell line, plus its rank among the new samples of that cell line.
subccle_refsamples = ccle_refsamples[ccle_refsamples['datatype'] == "wes"]
names = []
for k, row in samples.iterrows():
    arxspan = row["arxspan_id"]
    names.append(arxspan)
    already_in_ref = len(subccle_refsamples[subccle_refsamples['arxspan_id'] == arxspan])
    seen_in_new = names.count(arxspan)
    samples.loc[k, 'version'] = already_in_ref + seen_in_new
# the column is created through .loc, so cast it back to integers
samples['version'] = samples['version'].astype(int)

In [None]:
ccle_refsamples = pd.read_csv('temp/updated_ref_samples.csv', index_col=0)

In [None]:
# Re-order versions so that, within a cell line, they follow the sequencing date
# (or the file size when the new sample has no usable date).
# NOTE(review): `subccle_refsamples` is a slice of `ccle_refsamples` taken in an earlier
# cell; assigning a column on it may trigger pandas' SettingWithCopyWarning.
# presumably h.datetoint returns values comparable with `>` — verify against the helper
subccle_refsamples.sequencing_date = h.datetoint(subccle_refsamples.sequencing_date.values, split='/', order = "asc")
for k, val in samples.iterrows():
    # reference WES samples belonging to the same cell line as the new sample
    loc = subccle_refsamples[subccle_refsamples.arxspan_id==val.arxspan_id]
    if len(loc)>0:
        if val.sequencing_date > 0:
            # every reference sample with a later date moves up one version
            # and the new sample moves down one
            for i, v in loc.iterrows():
                if v.sequencing_date > val.sequencing_date:
                    ccle_refsamples.loc[i,'version']+=1
                    samples.loc[k, 'version']-=1
        else:
            # no usable date: if any reference sample is larger, the new sample
            # becomes version 1 and all reference versions are shifted up by one
            if max(loc['size']) > val['size']:
                samples.loc[k, 'version'] = 1
                ccle_refsamples.loc[loc.index,'version'] = ccle_refsamples.loc[loc.index,'version'].values+1     

In [None]:
# Add the new samples to the reference table. pd.concat replaces the deprecated
# DataFrame.append and matches the concat call used earlier in this notebook.
ccle_refsamples = pd.concat([ccle_refsamples, samples], sort=False)

In [None]:
# Sanity check on participant ids for groups of cell lines listed in `samepatient`.
# `samepatient` is expected to be an iterable of lists of arxspan ids; it must be
# defined before this cell is run.
for val in samepatient:
    sub = ccle_refsamples[ccle_refsamples.arxspan_id.isin(val)]
    # NOTE(review): the threshold is >2 distinct ids, so a group with exactly two
    # different participant ids is not reported — confirm this is intended
    if len(set(sub.participant_id))>2:
        print('we found a missig participant relationship')
        # ccle_refsamples.loc[ccle_refsamples.index, "participant_id"]=sub.participant_id[0]

In [None]:
ccle_refsamples.to_csv('temp/updated_ref_samples.csv')

In [None]:
pairs = setupPairsFromSamples(samples, subccle_refsamples, extract={'patient_id':'participant_id'})

In [None]:
# Upload the new samples and pairs to the mutation-calling workspace (`refwm`),
# then refresh the pair sets and sample sets.
refwm = refwm.disable_hound()
refwm.upload_samples(samples)
refwm.upload_entities('pairs', pairs)
# pair set named after this release, containing only the new pairs
refwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
# re-read the samples from the workspace so the sets below cover old and new entries
sam = refwm.get_samples()

pair = refwm.get_pairs()
refwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
# pairs whose case sample has baits == "AGILENT"
refwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
# pairs whose case sample has baits == "ICE" or no baits value; the literal id 'nan' is excluded
refwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
# creating the sample sets with the same partitioning as the pair sets
refwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
refwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
refwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
refwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])

In [None]:
# Same upload for the copy-number workspace: push the new samples and pairs,
# then refresh the pair sets and sample sets.
cnwm = dm.WorkspaceManager('broad-firecloud-ccle/DepMap_WES_CN_hg38')
cnwm = cnwm.disable_hound()
cnwm.upload_samples(samples)
cnwm.upload_entities('pairs', pairs)
# pair set named after this release, containing only the new pairs
cnwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
# re-read the samples from the workspace so the sets below cover old and new entries
sam = cnwm.get_samples()

pair = cnwm.get_pairs()
cnwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
# pairs whose case sample has baits == "AGILENT"
cnwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
# pairs whose case sample has baits == "ICE" or no baits value; the literal id 'nan' is excluded
cnwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
# creating the sample sets with the same partitioning as the pair sets
cnwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
cnwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
cnwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
cnwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])

# run the pipeline

We are using Dalmatian to send requests to Terra. We are running a set of 5 functions to generate the mutation dataset:

*   For new samples in DepMap, run the ICE version of this task. CCLE2 samples used Agilent targets, so this pipeline should be used instead. The pipelines are identical in terms of their outputs, but the proper targets, baits, and pseudo normal should be used based on how the samples were sequenced.

    **ICE_CGA_Production_Analysis_Pipeline_Cell_Lines_copy** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22) OR


    **AGILENT_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22)

*   **common_variant_filter** (breardon/common_variant_filterSnapshot ID: 3)
*   **filterMAF_on_CGA_pipeline** (gkugener/filterMAF_on_CGA_pipelineSnapshot ID: 8)
*   **aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1)

The outputs to be downloaded will be saved in the sample set that was run. The output we use for the release is:


*   **passedCGA_filteredMAF_aggregated** 

There are several other tasks in this workspace. In brief:



*   **CGA_Production_Analysis_Pipeline_Cell_Lines** (lelagina/CGA_Production_Analysis_Pipeline_Cell_LinesSnapshot ID: 12). This task is the same as the ICE and AGILENT prefixed version above, except that it relied on pulling the baits and targets to use from the metadata stored for the samples. Having AGILENT and ICE versions specified made the uploading and running process easier.
*   **SANGER_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22). This task was trying to run the CGA pipeline on the Sanger WES data, using a Sanger pseudo normal. In its current implementation, this task fails to complete for the samples.
*   **UNFILTERED_aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1). Aggregates the MAF outputted by the CGA cell line pipeline prior to the common variant filter and germline filtering tasks. This can give us insight to which mutations are getting filtered out when. We may want to potentially include this MAF in the release so people can see why certain mutations of interest may be getting filtered out.
*   WES_DM_Mutation_Calling_Pipeline_(standard |expensive) (gkugener/WES_DM_Mutation_Calling_PipelineSnapshot ID: 2). This was a previous mutation calling pipeline implemented for CCLE. We do not use this pipeline any more as the CGA pipeline looks better.
*   aggregate_filterMAF_CGA (CCLE/aggregate_filterMAF_CGASnapshot ID: 1). An aggregation MAF task that we used in the past. We do not use this task anymore.
*   calculate_mutational_burden (breardon/calculate_mutational_burdenSnapshot ID: 21). This task can be used to calculate the mutational rate of the samples. We do not make use of this data in the release although it could be of interest.
*   summarizeWigFile (breardon/summarizeWigFileSnapshot ID: 5). CCLF ran this task (might be necessary for the mutational burden task). For our workflow, we do not run it.

## On Terra

In [None]:
samplesetname

In [None]:
submission_id1 = refwm.create_submission("CGA_WES_CCLE_ICE", samplesetname, 'sample_set', expression='this.samples')

### Germline

In [None]:
submission_id2 = refwm.create_submission("cnn-variant-filter", samplesetname, 'sample_set', expression='this.samples')

### copy pairs data to sample data

In [None]:
pairs = refwm.get_pairs()

In [None]:
# Prepare pair-level outputs so they can be written onto the samples.
# NOTE(review): `tokeep` is not defined in any cell visible in this notebook — it must
# be set (presumably to the pair ids to copy) before running this cell.
pairs = pairs[pairs.index.isin(tokeep)]
# keep only the pairs that have a validated MAF
pairs = pairs[~pairs['mutation_validator_validated_maf'].isna()]
# drop the pair-specific columns
pairs = pairs.drop(columns=['case_sample','control_sample','participant_id'])
# re-index by the part of the pair id before the first '_' (presumably the sample id — verify)
pairs.index = [i.split('_')[0] for i in pairs.index]

In [None]:
refwm.update_sample_attributes(pairs)

continuing

In [None]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("common_variant_filter", samplesetname, 'sample_set', expression='this.samples')

In [None]:
terra.waitForSubmission(refworkspace, submission_id2)
submission_id2 = refwm.create_submission("aggregate_vcfs", "all")

In [None]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("filterMAF_on_CGA_pipeline", samplesetname,'sample_set',expression='this.samples')

### filtered

In [None]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("aggregateMAFs_selectFields_filtered", "all")

### unfiltered

In [None]:
submission_id3 = refwm.create_submission("aggregateMAFs_selectFields_unfiltered", "all")

In [None]:
terra.waitForSubmission(refworkspace, [submission_id1,submission_id2, submission_id3])

### Save the workflow configurations used

In [None]:
terra.saveConfigs(refworkspace,'./data/'+samplesetname+'/Mutconfig')

## On local


### Remove some data files to save money

In [None]:
res = refwm.get_samples()
toremove = ["fixedmate_bam"]
for val in toremove:
    refwm.disable_hound().delete_entity_attributes('sample', res[val], delete_files=True)

In [None]:
! gsutil -m rm "gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b76b912/9e3cc501-3f08-47fb-87a5-0359febb833c/**/call-tumorMM_Task/*.cleaned.bam"

In [None]:
# sometimes it does not work; so better check again
# pd.notna drops every kind of missing value (NaN, None), whereas the identity test
# `is not np.nan` only catches the np.nan singleton object itself.
a = [i for i in res.fixedmate_bam if pd.notna(i)]
# NOTE(review): `gcp` is not among the imports visible in this notebook — make sure it
# is imported before running this cell.
gcp.rmFiles(a)

### downloading from terra

In [None]:
sam = refwm.get_samples()

In [None]:
nowes = set(mutations.DepMap_ID)-set(sam.arxspan_id)
nowes

In [None]:
mutations.columns

In [None]:
# cell lines that have mutation calls but appear neither in the WES workspace nor in
# the reference sheet (`nows` was a typo for `nowes` and raised NameError)
nothing = nowes - set(ccle_refsamples.arxspan_id)
nothing

In [None]:
set(mutations[mutations.DepMap_ID.isin(nothing) & ~mutations.SangerWES_AC.isna()].DepMap_ID)

### get QC files

In [None]:
dataMut = getQC(workspace=refworkspace ,only=[], qcname=["gatk_cnv_all_plots", "lego_plotter_pngs", "copy_number_qc_report", "ffpe_OBF_figures", "mut_legos_html", "oxoG_OBF_figures", "tumor_bam_base_distribution_by_cycle_metrics", "tumor_bam_converted_oxog_metrics"])

In [None]:
dataBam = getQC(workspace=refworkspace ,only=[], qcname=[ "tumor_bam_alignment_summary_metrics", "tumor_bam_bait_bias_summary_metrics", "tumor_bam_gc_bias_summary_metrics", "tumor_bam_hybrid_selection_metrics", "tumor_bam_insert_size_histogram", "tumor_bam_insert_size_metrics", "tumor_bam_pre_adapter_summary_metrics", "tumor_bam_quality_by_cycle_metrics", "tumor_bam_quality_distribution_metrics", "tumor_bam_quality_yield_metrics"])

In [None]:
wrongsamples = ['CDS-jqOvtj', 'CDS-FRxdcH', 'CDS-6Yy3Yj', 'CDS-CuJ0f8', 'CDS-rLRUbG', 'CDS-PdUZxY', 'CDS-eUqT7L', 'CDS-KbbgMb', 'CDS-6da3hu', 'CDS-fXMRF9', 'CDS-CMenCH', 'CDS-MLJbT2', 'CDS-QVhVDT', 'CDS-XevQNc', 'CDS-0pZb0j', 'CDS-6l3V79', 'CDS-MnF3x8', 'CDS-ihI7Dp', 'CDS-34hKv3', 'CDS-TyWjJs', 'CDS-4sr6RL', 'CDS-M8xDMS', 'CDS-TpDBjm', 'CDS-W80jkV', 'CDS-agZcmk', 'CDS-cYWYp7', 'CDS-IJnjkY', 'CDS-0aJ4Yh', 'CDS-txTRwz', 'CDS-gIMBax', 'CDS-1p2nnc', 'CDS-KQDgIV', 'CDS-Eq9UNX', 'CDS-3M6Pq9', 'CDS-qZsCuJ', 'CDS-0lfqVz', 'CDS-o4dXGr', 'CDS-uQ8nnX', 'CDS-iqPqOr', 'CDS-Dkl8OF', 'CDS-Hj3xAa', 'CDS-3WygAj', 'CDS-oHu1Ik', 'CDS-X3c4UY', 'CDS-PYw8ID', 'CDS-Sp18uD', 'CDS-leGxSD', 'CDS-SJq3p4', 'CDS-no7ysz', 'CDS-UnDaBI', 'CDS-eowEZF', 'CDS-HNytLD', 'CDS-KYkMDa', 'CDS-OgPf0h', 'CDS-OCkOqy', 'CDS-QU7ftt', 'CDS-iEULQm', 'CDS-ODmXrP', 'CDS-YMIv9D', 'CDS-5rD8XC', 'CDS-QXBhht', 'CDS-9XPgHB', 'CDS-Ig6N9S', 'CDS-UtrDTK', 'CDS-nby0QM', 'CDS-49azaP', 'CDS-9qDPiX', 'CDS-KgRznV', 'CDS-picEuX', 'CDS-L0pDPl', 'CDS-kxNZ5S', 'CDS-1djAlo', 'CDS-YYLKZ0', 'CDS-pXMN9C', 'CDS-gRA4SM', 'CDS-QHp4h4', 'CDS-B0qAaq', 'CDS-1b1Hxk', 'CDS-5wYxZS', 'CDS-cyuMYb', 'CDS-XQkXf4', 'CDS-7PFldq',
'CDS-3EBt51', 'CDS-UV1pVE', 'CDS-WedVJA', 'CDS-WfjTcJ', 'CDS-bntBUl', 'CDS-cAEii6', 'CDS-d18Xie', 'CDS-dpub1O', 'CDS-yPSmxb']

In [None]:
# Merge the freshly retrieved QC file paths into the stringified lists stored in the
# reference table: mutation QC goes to 'processing_qc', BAM QC goes to 'bam_qc'.
for qcdata, col in ((dataMut, 'processing_qc'), (dataBam, 'bam_qc')):
    for k, v in qcdata.items():
        if k == 'nan':
            continue
        # Each cell holds str(list). Read the SAME column that is written back: the
        # BAM loop used to read 'processing_qc', which leaked the mutation QC paths
        # into 'bam_qc' and discarded whatever 'bam_qc' already contained.
        curr = ccle_refsamples.loc[k, col]
        curr = set(curr[1:-1].replace("'", "").split(', '))
        ccle_refsamples.loc[k, col] = str(list(set(v) | curr))
#ccle_refsamples.to_csv('temp/newrefWES.csv')

In [None]:
res = refwm.get_sample_sets().loc["all"]
res

### retrieving RNAseq vcfs

In [None]:
rnamutations = dm.WorkspaceManager(rnaworkspace).get_sample_sets().loc['All_samples']['merged_vcf']
! gsutil cp $rnamutations "temp/rna_mutation_unfiltered_terra_merged.vcf"

### retrieving germline mutations

In [None]:
snps = res['merged_vcf']
! gsutil cp $snps gs://cclebams/germline_data/wes.all.called.vcf
! gsutil -m acl ch -ru taiga-892@cds-logging.iam.gserviceaccount.com:R gs://cclebams/germline_data/wes.all.called.vcf

### retrieving filtered mutations

In [None]:
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

In [None]:
mutations = mutations[~mutations.Tumor_Sample_Barcode.isin(wrongsamples)]

In [None]:
mutations = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(mutations.columns[:10])
renaming = removeOlderVersions(names = set(mutations['Tumor_Sample_Barcode']), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")
print(len(mutations[mutations['Chromosome']=='0']))

# postprocessing


Here, rather than rerunning the entire analysis, because we know we are adding only WES samples, we can download the previous release's MAF, add the samples, update any annotations, and perform any global filters at the end.

First we need to do an additional step of filtering on coverage and number 

- readMutations
- createSNPs
- addToMainMutation
- filterAllelicFraction
- filterMinCoverage
- mergeAnnotations
- addAnnotation
- maf_add_variant_annotations
- mutation_maf_to_binary_matrix (x3)

In [None]:
mutations = mutations.rename(columns={"i_ExAC_AF":"ExAC_AF","Tumor_Sample_Barcode":'DepMap_ID',"Tumor_Seq_Allele2":"Tumor_Allele"}).drop(columns=['Center','Tumor_Seq_Allele1'])

In [None]:
mutations = annotate_likely_immortalized(mutations, TCGAlocs = ['TCGAhsCnt',
'COSMIChsCnt'], max_recurrence=0.05 ,min_tcga_true_cancer=5)

In [None]:
mutations['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in np.nan_to_num(mutations[['t_alt_count','t_ref_count']].values,0).astype(int)]

In [None]:
mutations = filterCoverage(mutations, loc=['CGA_WES_AC'], sep=':',cov=2)

In [None]:
mutations = filterAllelicFraction(mutations, loc=['CGA_WES_AC'], sep=':',frac=0.1)

In [None]:
#Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
#Count the total number of mutations observed by position

In [None]:
mutations = addAnnotation(mutations, NCBI_Build='37', Strand="+")

In [None]:
mutations.to_csv('temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv', index=False)

In [None]:
mutations = mutations[mutations.DepMap_ID.isin(renaming.keys())].replace(renaming)

### Adding WGS's exonic mutation

In [None]:
wgsemutations = pd.read_csv('temp/wgs_somatic_mutations_'+samplesetname+'.csv')

In [None]:
#for now we keep WES if we have them
toadd = set(wgsemutations.DepMap_ID) - set(mutations.DepMap_ID)
toadd

In [None]:
# Add the WGS exonic calls for cell lines that have no WES data. pd.concat replaces
# the deprecated DataFrame.append and keeps the original row labels, as append did.
mutations = pd.concat([mutations, wgsemutations[wgsemutations.DepMap_ID.isin(toadd)]])

In [None]:
mutations.to_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv', index=False)

In [None]:
mutations = pd.read_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv')

In [None]:
mutations

In [None]:
# Build the three gene x cell-line matrices (deleterious / other / hotspot) from calls
# with an allelic fraction above 0.25.
high_af = mutations['tumor_f'] > 0.25
is_hotspot = mutations.isCOSMIChotspot | mutations.isTCGAhotspot
# Named mask on purpose: `|` binds tighter than `==`, so the original un-parenthesised
# `a | b | c | mutations['Variant_Classification']=='Silent'` compared the whole
# OR-chain against 'Silent' instead of testing the classification column.
is_silent = mutations['Variant_Classification'] == 'Silent'
mafToMat(mutations[mutations.isDeleterious & high_af]).T.to_csv('temp/wes_somatic_mutations_deleterious_matrix.csv')
mafToMat(mutations[~(mutations.isDeleterious | is_hotspot | is_silent) & high_af]).T.to_csv('temp/wes_somatic_mutations_other_matrix.csv')
mafToMat(mutations[is_hotspot & high_af]).T.to_csv('temp/wes_somatic_mutations_hotspot_matrix.csv')

In [None]:
# Boolean (0/1) versions of the deleterious / other / hotspot matrices, again limited
# to calls with an allelic fraction above 0.25.
high_af = mutations['tumor_f'] > 0.25
is_hotspot = mutations.isCOSMIChotspot | mutations.isTCGAhotspot
# Named mask on purpose: `|` binds tighter than `==`, so the original un-parenthesised
# `... | mutations['Variant_Classification']=='Silent'` compared the whole OR-chain
# against 'Silent' instead of testing the classification column.
is_silent = mutations['Variant_Classification'] == 'Silent'
mafToMat(mutations[mutations.isDeleterious & high_af], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_deleterious_boolmatrix.csv')
mafToMat(mutations[~(mutations.isDeleterious | is_hotspot | is_silent) & high_af], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_other_boolmatrix.csv')
mafToMat(mutations[is_hotspot & high_af], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_hotspot_boolmatrix.csv')

In [None]:
legacy_hybridcapture = tc.get(name='mutations-da6a', file='legacy_hybridcapture_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_raindance = tc.get(name='mutations-da6a', file='legacy_raindance_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_rna = tc.get(name='mutations-da6a', file='legacy_rna_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_wes_sanger = tc.get(name='mutations-da6a', file='legacy_wes_sanger_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_wgs_exoniconly = tc.get(name='mutations-da6a', file='legacy_wgs_exoniconly_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})

solving issues with the legacy datasets

In [None]:
legacy_hybridcapture[legacy_hybridcapture.DepMap_ID=='ACH-001011']

In [None]:
legacy_hybridcapture.loc[legacy_hybridcapture[legacy_hybridcapture['Variant_Classification'].isna()].index,'Variant_Classification']='Missense_Mutation'

In [None]:
legacy_wes_sanger[legacy_wes_sanger.DepMap_ID=="ACH-002396"]

In [None]:
legacy_wgs_exoniconly.loc[legacy_wgs_exoniconly[legacy_wgs_exoniconly['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_wgs_exoniconly[legacy_wgs_exoniconly['Genome_Change'].isna()].iterrows()]

In [None]:
legacy_wes_sanger.loc[legacy_wes_sanger[legacy_wes_sanger['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_wes_sanger[legacy_wes_sanger['Genome_Change'].isna()].iterrows()]

In [None]:
legacy_raindance.loc[legacy_raindance[legacy_raindance['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_raindance[legacy_raindance['Genome_Change'].isna()].iterrows()]

In [None]:
legacy_hybridcapture.loc[legacy_hybridcapture[legacy_hybridcapture['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_hybridcapture[legacy_hybridcapture['Genome_Change'].isna()].iterrows()]

In [None]:
legacy_rna.loc[legacy_rna[legacy_rna['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_rna[legacy_rna['Genome_Change'].isna()].iterrows()]

In [None]:
# For every duplicated locus in the legacy RNA calls, drop its first occurrence.
todrop = [
    legacy_rna[legacy_rna.loci == locus].index[0]
    for locus in h.dups(legacy_rna.loci)
]
legacy_rna = legacy_rna.drop(todrop)

In [None]:
merged = mergeAnnotations(mutations, legacy_hybridcapture, useSecondForConflict=True, dry_run=False)
merged = mergeAnnotations(merged, legacy_raindance, useSecondForConflict=True, dry_run=False)
merged = mergeAnnotations(merged, legacy_rna, useSecondForConflict=False, dry_run=False)
merged = mergeAnnotations(merged, legacy_wes_sanger, useSecondForConflict=False, dry_run=False)

In [None]:
merged = mergeAnnotations(merged, legacy_wgs_exoniconly, useSecondForConflict=False, dry_run=False)

In [None]:
mutation_groups={
"other conserving": ["5'Flank", "Intron", "IGR", "3'UTR", "5'UTR"],
"other non-conserving":["In_Frame_Del", "In_Frame_Ins", "Stop_Codon_Del", "Stop_Codon_Ins", "Missense_Mutation", "Nonstop_Mutation"],
'silent': ['Silent'],
"damaging":['De_novo_Start_OutOfFrame','Frame_Shift_Del','Frame_Shift_Ins', 'Splice_Site', 'Start_Codon_Del', 'Start_Codon_Ins', 'Start_Codon_SNP','Nonsense_Mutation']
}

In [None]:
# Invert mutation_groups (group -> classifications) into classification -> group,
# then annotate each row; an unlisted classification raises KeyError.
rename = {
    classification: group
    for group, classifications in mutation_groups.items()
    for classification in classifications
}
merged['Variant_annotation'] = [rename[i] for i in merged['Variant_Classification'].tolist()]

### Compare to previous release

I would run some checks here comparing the results to the previous releases MAF. Namely:

- Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
- Count the total number of mutations observed by position (group by chromosome, start position, end position and count the number of mutations)
- Look at specific differences between the two MAFs (join on DepMap_ID, Chromosome, Start position, End position, Variant_Type). I would do this for WES only

In [None]:
prevmut = tc.get(name='depmap-mutation-calls-9be3', version=prevversion, file='depmap_'+prevname+'_mutation_calls')

In [None]:
prevprev= tc.get(name='depmap-mutation-calls-9be3', file= "depmap_"+prevprevname+"_mutation_calls", version = prevprevversion)

In [None]:
set(merged.DepMap_ID) - set(merged[~(merged['CGA_WES_AC'].isna() & merged['SangerWES_AC'].isna() & merged['WGS_AC'].isna())].DepMap_ID)

In [None]:
merged[merged.DepMap_ID=="ACH-000458"].sum(0)

In [None]:
merged[(merged["Hugo_Symbol"]=="ACOT4")&(merged['Start_position']==74058831)]

In [None]:
merged[merged.DepMap_ID=="ACH-001546"][merged.columns[-17:]]

### Do some checks and manual rescuing

In [None]:
mutations[mutations.DepMap_ID=="ACH-003000"]

### check important mutations

In [None]:
# check MOLM13, MV411 cell lines- The well known mutation status of FLT3

In [None]:
# check TP53 mutation 

Are mutations consistent?

QC mutations: for a known dependency, check if it matches the mutation of this gene (if P53 is mutated, there cannot be a dependency on P53 or MDM2/MDM4; the inverse for BRAF and KRAF with respect to themselves).

### saving this version

In [None]:
merged[merged.DepMap_ID=="ACH-002055"]

In [None]:
merged.to_csv('temp/wes_somatic_mutations_withlegacy_'+samplesetname+'.csv', index=False)

In [None]:
merged = pd.read_csv('temp/wes_somatic_mutations_withlegacy_'+samplesetname+'.csv')

In [None]:
mafToMat(merged[merged.Variant_annotation=="damaging"], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_damaging_'+samplesetname+".csv")
mafToMat(merged[merged.Variant_annotation=="other conserving"], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_othercons_'+samplesetname+".csv")
mafToMat(merged[merged.Variant_annotation=="other non-conserving"], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_othernoncons_'+samplesetname+".csv")
mafToMat(merged[(merged.isCOSMIChotspot | merged.isTCGAhotspot)], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_hotspot_'+samplesetname+'.csv')

### saving samples used for this release

should be the same as in CN otherwise need to do something more complex

In [None]:
#ccle_refsamples.loc[renaming.keys(),samplesetname]=1
#ccle_refsamples.loc[ccle_refsamples[ccle_refsamples.arxspan_id.isin(toadd) & ccle_refsamples.datatype=="wgs"].index,samplesetname]=1
#ccle_refsamples.to_csv('temp/newrefWES.csv')

## retrieving unfiltered mutations

In [None]:
# `res` is defined outside this cell; take its 'unfiltered_CGA_MAF_aggregated' entry
# (presumably a gs:// path to the aggregated unfiltered MAF -- confirm against where `res` is built).
unfiltered = res['unfiltered_CGA_MAF_aggregated']
# Copy that file into temp/ with gsutil; IPython expands $unfiltered into the shell command.
! gsutil cp $unfiltered "temp/wes_mutation_unfiltered_terra_merged.txt"

In [None]:
# Parse the locally copied unfiltered MAF: tab-separated, every column read as str,
# with "__UNKNOWN__" and "." additionally treated as missing values.
unfiltered = pd.read_csv(
    'temp/wes_mutation_unfiltered_terra_merged.txt',
    sep='\t',
    encoding='L6',
    na_values=["__UNKNOWN__", '.'],
    engine='c',
    dtype=str,
)

In [None]:
# Give missing annotations an explicit default. The table is loaded with
# pd.read_csv, which produces real NaN for missing fields even with dtype=str,
# so a bare .replace('nan', ...) never matches; fillna() applies the default.
# The replace() is kept so that a literal 'nan' string is still handled as before.
for _column, _default in (
    ('somatic', 'False'),
    ('HGNC_Status', 'Unapproved'),
    ('judgement', 'REMOVE'),
):
    unfiltered[_column] = unfiltered[_column].fillna(_default).replace('nan', _default)

# Harmonise column names and drop the first tumor allele column.
unfiltered = unfiltered.rename(columns={
    "i_ExAC_AF": "ExAC_AF",
    "Tumor_Sample_Barcode": 'DepMap_ID',
    "Tumor_Seq_Allele2": "Tumor_Allele",
}).drop(columns=['Tumor_Seq_Allele1'])

# Build "alt:ref" allele-count strings. nan_to_num maps NaN to 0 by default;
# its second positional argument is `copy`, not the fill value, so it is omitted.
_allele_counts = np.nan_to_num(
    unfiltered[['t_alt_count', 't_ref_count']].values.astype(float)
).astype(int)
unfiltered['CGA_WES_AC'] = [str(_alt) + ':' + str(_ref) for _alt, _ref in _allele_counts]

In [None]:
# Drop every column whose set of values, once the string 'nan' is excluded,
# contains exactly one element.
_ignored = {'nan'}
_n_columns = len(unfiltered.columns)
subunfilt = unfiltered.iloc[:10000]
toremove = []
for _position, _column in enumerate(unfiltered.columns):
    h.showcount(_position, _n_columns)
    # Test the first 10000 rows first; the full column is only scanned when
    # that preview already looks constant.
    if (len(set(subunfilt[_column]) - _ignored) == 1
            and len(set(unfiltered[_column]) - _ignored) == 1):
        toremove.append(_column)
unfiltered = unfiltered.drop(columns=set(toremove))

In [None]:
# The table was read with every column as str; cast the position columns to int.
toint = ["Start_position", "End_position"]
unfiltered = unfiltered.astype({_column: int for _column in toint})

In [None]:
# Save the unfiltered table before replicates are filtered out. The file name
# carries the 'wes_' prefix because the gunzip and Taiga-upload cells further
# down read 'temp/wes_mutation_somatic_unfiltered_withreplicates.csv(.gz)';
# pandas infers gzip compression from the '.gz' suffix.
unfiltered.to_csv('temp/wes_mutation_somatic_unfiltered_withreplicates.csv.gz', index=False)

In [None]:
# Keep only the rows whose DepMap_ID is a key of `renaming`. The explicit copy
# makes the result an independent frame, so the assignment below does not
# raise pandas' SettingWithCopyWarning.
unfiltered = unfiltered[unfiltered.DepMap_ID.isin(renaming.keys())].copy()
# Map each remaining ID to the value `renaming` assigns to it.
unfiltered['DepMap_ID'] = unfiltered['DepMap_ID'].replace(renaming)

In [None]:
# Save the renamed unfiltered table. The file name carries the 'wes_' prefix
# because the gunzip and Taiga-upload cells further down read
# 'temp/wes_mutation_somatic_unfiltered_all.csv(.gz)'; pandas infers gzip
# compression from the '.gz' suffix.
unfiltered.to_csv('temp/wes_mutation_somatic_unfiltered_all.csv.gz', index=False)

In [None]:
# The table has been written to disk above; drop the notebook's reference to it.
del unfiltered

# uploading on taiga

## Saving to latest version

In [None]:
# Decompress the two unfiltered tables in place: the upload cell below refers
# to them by their plain '.csv' names.
!gunzip temp/wes_mutation_somatic_unfiltered_withreplicates.csv.gz
!gunzip temp/wes_mutation_somatic_unfiltered_all.csv.gz

In [None]:
# Push a new version of the "mutations-latest-ed72" Taiga dataset.
# upload_file_path_dict maps each local file to the Taiga format it is uploaded as.
# The description is a raw string: it contains backslashes ("/!\ ", "\_") that are
# not valid escape sequences in a normal literal and would trigger an
# invalid-escape warning; the text sent to Taiga is unchanged.
tc.update_dataset(
    dataset_permaname="mutations-latest-ed72",
    upload_file_path_dict={
        'temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv': 'TableCSV',
        'temp/wgs_somatic_mutations_'+samplesetname+'.csv': 'TableCSV',
        'temp/wes_somatic_mutations_deleterious_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_other_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_hotspot_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_all_'+samplesetname+'.csv': 'TableCSV',
        'temp/wes_somatic_mutations_deleterious_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_other_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_hotspot_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_mutation_somatic_unfiltered_all.csv': 'TableCSV',
        'temp/wes_mutation_somatic_unfiltered_withreplicates.csv': 'TableCSV',
    },
    dataset_description=r"""
# Mutations

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE Mutatios data.
The data is most likely of a better quality that what is on other folder. It is however in beta version as not all changes have either been confirmed or accepted by the DepMap Ops and the DepMap Portal Team.

#Version:

v1: first version from 20Q4. with new ways to compute the binary matrices (a binary and continuous version), version with and without duplicates. the unfiltered list of mutations with all available annotations from the CGA pipeline.


# Notations:

all: every cell lines we have

WES: all data comes from the WExomeS samples we posses

WGS: all data comes from the WGenomeS samples we posses

withreplicates: if we have two different sequencing from a sample, we kept both, see the depmap sample tracker for annotations [https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE](https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE). this dataset is more geared toward QC or in-depth analysis of a particular cell line.

merged: everything from both WGS and WES

latest: only the latest sequencing versions of the samples were kept

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
 """)