# Mutation Pipeline

In [40]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display



%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
# Shared clients used throughout the notebook.
tc = TaigaClient()
# Send bokeh plots to the notebook output.
output_notebook()
# Google Sheets client; both credential files are read from the user's home directory.
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
# Maps one-letter codes to sample-type labels.
# NOTE(review): not referenced in the visible part of this notebook — confirm it is still needed.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## boot up

We are instantiating all the parameters needed for this pipeline to run.

In [41]:
# Name of the release being built; also used below as the Terra sample set / pair set id.
samplesetname = "20Q3"
# Previous release and the dataset version associated with it.
prevname="20Q2"
prevversion = 22
# Release before the previous one (used in the validation section).
prevprevname ='20Q1'
prevprevversion= 20
# Per-audience dataset identifiers for this release.
# NOTE(review): presumably Taiga virtual dataset names — confirm.
virtual_public='public-20q3-3d35'
virtual_dmc='dmc-20q3-033d'
virtual_internal='internal-20q3-00d0'

# Terra workspaces searched for new WES samples (numbering is not contiguous: there is no workspace4/5).
workspace1="broad-genomics-delivery/Getz_IBM_CellLines_Exomes"
workspace2="broad-firecloud-ccle/CCLE_DepMap_WES"
workspace3="broad-genomics-delivery/CCLE_DepMap_WES"

workspace6="terra-broad-cancer-prod/CCLE_DepMap_WES"

# Workspace where samples are uploaded and the mutation-calling workflows are submitted.
refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"

# Workspace from which the merged RNAseq VCF is read.
rnaworkspace="broad-firecloud-ccle/DepMap_hg38_RNAseq"

# Source label for each workspace above (source1 goes with workspace1, and so on).
source1="ibm"
source2="ccle"
source3="ccle"
source6="ccle"
# NOTE(review): there is no workspace7 in this cell and source7 is not used in the visible code — confirm it can be removed.
source7="ibm"

# Google Sheets: the reference sample sheet and the sheet holding embargo / blacklist columns.
refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

# Alias used when building release file names.
release = samplesetname

In [3]:
%%R
# R-side copies of the release parameters; they are set by hand here and must be kept in sync with the Python configuration cell.
release <- '20Q3'
prevname <- '20Q2'
genome_version <- 'hg19'
taiga_version <- 10
# NOTE(review): the Python configuration cell sets prevversion = 22 while this is 13; this value is the one passed to load.from.taiga in the R cells — confirm it is correct.
prevversion <-13

In [42]:
# One dalmatian WorkspaceManager per source workspace.
wm1 = dm.WorkspaceManager(workspace1)
wm2 = dm.WorkspaceManager(workspace2)
wm3 = dm.WorkspaceManager(workspace3)

wm6 = dm.WorkspaceManager(workspace6)

# Manager for the reference workspace: samples are uploaded to it and workflows are submitted through it below.
refwm = dm.WorkspaceManager(refworkspace)

In [None]:
extract_to_change = {'from_arxspan_id': 'participant'}

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame()

## Adding new data

We are looking for new samples in a range of workspaces.

They are quite messy and might contain duplicates, broken file paths, etc.

- We are thus looking at the bam files one by one and comparing them with our own bams. 
- We remove broken files, duplicates and add new version of a cell line's bam if we find some.

In [None]:
# The new samples will still be missing these fields, which are filled in further down:
# "primary_disease", "sm_id", "cellosaurus_id", "gender", "age", "primary_site",
# "subtype", "subsubtype", "origin", "comments"
# NOTE(review): the original comment ended with an unfinished remark ("when SMid: match==").
# Returns the new samples, their pairs, and the samples for which no arxspan id could be found.
samples, pairs, noarxspan = GetNewCellLinesFromWorkspaces(refworkspace, stype='wes', refurl=refsheet_url, wmfroms = [workspace1, workspace2, workspace3, workspace6], sources=[source1, source2, source3, source6], match=['ACH-','CDS-'], participantslicepos=10, accept_unknowntypes=True, extract=extract_to_change, recomputedate=True)

In [None]:
# I am trying to remove duplicates from samples without arxspan ids to then look more into them and see if I have to get data for them or if I should just throw them out
# Collect the index labels of repeated `sample_id` rows among the samples without
# arxspan ids, keeping the first occurrence of each id, so they can be dropped
# before looking at these samples more closely.
# `duplicated` does this in one pass instead of re-filtering the frame for every row.
toremov = set(noarxspan.index[noarxspan.duplicated(subset="sample_id", keep="first")])

In [None]:
# Drop all the duplicated rows collected above in a single call.
noarxspan = noarxspan.drop(list(toremov))

In [None]:
noarxspan

In [None]:
noarxspan.sample_id = [i.split('_Exom')[0] for i in noarxspan.sample_id]

In [None]:
len(noarxspan)

In [None]:
# Derive a cell line name from the middle underscore-separated fields of sample_id
# (the first and last fields are discarded).
# NOTE(review): the fields are joined with '' so no '_' remains in the result, and the
# following .split('_v') can never match — confirm whether '_'.join was intended.
noarxspan['ccle_name'] = [''.join(i.split('_')[1:-1]).split('_v')[0] for i in noarxspan.sample_id]
# The read group is the first underscore-separated field of sample_id.
noarxspan['readgroup'] = [i.split('_')[0] for i in noarxspan.sample_id]

In [None]:
# Print and discard every sample for which gcp.exists reports its cram/bam path as absent.
for i, v in noarxspan.iterrows():
    if gcp.exists(v['cram_or_bam_path']):
        continue
    print(v.ccle_name)
    noarxspan = noarxspan.drop(i)

In [None]:
noarxspan['ccle_name'].tolist()

In [None]:
len(noarxspan)

In [None]:
# Metadata fields to copy from the reference sheet; each list is filled (one value per
# matched sample) by the lookup loop further down.
toupdate = {"gender":[],
"primary_disease":[],
"sm_id":[],
"cellosaurus_id":[],
"age":[],
"primary_site":[],
"subtype":[],
"subsubtype":[],
"origin":[],
"comments":[],
"patient_id":[]}

In [None]:
samples

In [None]:
# If a new sample's arxspan_id already exists in the reference sheet, its metadata can
# be copied from the first matching reference row instead of being curated by hand.
# `index` collects the labels of samples that were matched, `notfound` the others.
index=[]
notfound=[]
# Start from empty lists so that re-running this cell does not append a second copy.
toupdate = {field: [] for field in toupdate}
for k, val in samples.iterrows():
    dat = ccle_refsamples[ccle_refsamples['arxspan_id']==val['arxspan_id']]
    if len(dat)>0:
        index.append(k)
        # A separate name is used here so the sample label `k` is not overwritten.
        for field in toupdate:
            toupdate[field].append(dat[field].tolist()[0])
    else:
        notfound.append(k)

In [None]:
len(index)

In [None]:
# Write the values collected from the reference sheet into the matched samples.
# Each list in `toupdate` is expected to hold one value per label in `index`, in the same order.
for k, v in toupdate.items():
    samples.loc[index,k] =v

In [None]:
len(samples.loc[notfound].patient_id)

In [None]:
samples.loc[notfound].patient_id.tolist()

In [None]:
# for these samples I will need to check and manually add the data in the list 
samples.loc[notfound]

In [None]:
# found same patient
a = ["ACH-000635","ACH-000717", "ACH-000864", "ACH-001042", "ACH-001547"]
b = ["ACH-002291","ACH-001672"]

In [None]:
ccle_refsamples[ccle_refsamples.arxspan_id.isin(a)].patient_id

In [None]:
# duplicate ach-id
dup = {"ACH-001620": "ACH-001605",
"ACH-001621": "ACH-001606"}

In [None]:
# Apply the duplicate ACH-id mapping in `dup` to the new samples.
# NOTE(review): datatype="rna" is passed although this notebook processes WES samples (stype='wes' above) — confirm this is intended.
samples = changeCellLineNameInNewSet(new = samples, ref=ccle_refsamples, datatype="rna", dupdict=dup)

In [None]:
#rename ccle_name TODO: ask becky what to do
rename = {"PEDS117": "CCLFPEDS0009T"}

In [None]:
len(notfound)

## Getting the additional data and writing it here in the right order (as shown above)
- use the stripped_cell_line_name to find the samples on https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE/edit#gid=356471436. 
- Make sure that we don't have duplicate cell lines in there. Otherwise, use the duplicate renaming function
- copy Primary Site, Primary Disease, Subtype, Comments, Disease Sub-subtype, if they exist (sometimes subtype and subsubtype are the same; don't use subsubtype then).
- look for the cell line in cellosaurus, you might need to use one of the aliases given in master depmap pv..
- copy the cellosaurus_id, gender and age info, or write 'U' if they don't exist (age can be a number or one of {Embryonic, Children, Adult, Fetus, U})
- check that it does not say this cell line is a duplicate of another cell line
- check that if it says this cell line is derived/children/father/samepatient from other cell lines, and that if we have any of the other cell lines, that the patient id is changed to be the same one for all (be sure that you are updating everywhere these patient ids are used)

In [None]:
# Hand-curated metadata for the samples that were not found in the reference sheet.
# Every list must contain one entry per sample in `notfound`, in the same order.
# NOTE(review): unlike the earlier `toupdate` template, this dict has no "sm_id" or "origin"
# key and adds "stripped_cell_line_name" — confirm this is intended.
toupdate = {"gender":["Female","Female","Female"],
"primary_disease":["Breast Cancer","Breast Cancer","Breast Cancer"],
"cellosaurus_id":["CVCL_7932","CVCL_7931","CVCL_7933"],
"age":[37,37,37],
"primary_site":["pleural_effusion","pleural_effusion","breast"],
"subtype":["Carcinoma","Carcinoma","Carcinoma"],
"subsubtype":["","",""],
"comments":["HER2+; Received from Academic lab (Polyak, DFCI)","HER2+; Received from Academic lab (Polyak, DFCI)","HER2+; Received from Academic lab (Polyak, DFCI)"],
"stripped_cell_line_name":["21MT2","21MT1", "21NT"],
"patient_id":['PT-y3RbI7uD', 'PT-y3RbI7uD', 'PT-y3RbI7uD']}

In [None]:
# Side-by-side preview: the curated values next to the names of the samples they will be
# assigned to, so the ordering can be checked by eye before updating.
# NOTE: this rebinds `a`, which earlier held a list of ACH ids.
a  = pd.DataFrame(toupdate)
a['name'] = samples.loc[notfound,"stripped_cell_line_name"].tolist()
a

In [None]:
# Write the hand-curated values into the samples that had no match in the reference sheet.
# The assignment is positional: list order must follow the order of `notfound`.
for k, v in toupdate.items():
    samples.loc[notfound,k] =v

In [None]:
# uploading to our bucket (now a new function)
# The two listed columns are the ones handed to h.changeToBucket, with gs://cclebams/wes/ as target;
# `samples` is replaced by the function's return value.
samples = h.changeToBucket(samples,'gs://cclebams/wes/', values=['internal_bam_filepath','internal_bai_filepath'], catchdup=False)

In [None]:
samples

In [None]:
# Save the new samples locally; the Google Sheet itself is NOT updated by this cell and
# must be edited by hand from the saved CSV.
print("YOU NOW NEED TO UPDATE THE GOOGLE SHEET!")
samples.to_csv('temp/new_ccle_samples.csv')

In [None]:
samples['arxspan_id'].tolist()

In [None]:
samples

In [None]:
samples = samples.rename(columns={'patient_id':'participant_id'})

In [None]:
pairs

In [None]:
pairs = pairs.rename(columns={'patient_id':'participant_id'}).set_index('pair_id')

In [None]:
# Positional copy of participant ids from samples to pairs.
# NOTE(review): assumes `pairs` and `samples` have the same length and row order — confirm.
pairs.participant_id = samples.participant_id.tolist()

In [None]:
# List the samples whose baits are AGILENT.
# NOTE(review): `cnwm` is only created in a later cell (the copy-number workspace manager),
# so this cell fails on a fresh kernel run from top to bottom.
sam = cnwm.get_samples()
sam[sam['baits']=="AGILENT"]

In [None]:
#uploading new samples to mut
# NOTE(review): disable_hound() presumably turns off dalmatian's change tracking — confirm.
refwm = refwm.disable_hound()
refwm.upload_samples(samples)
refwm.upload_entities('pairs', pairs)
# Pair set for this release, named after the release.
refwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
sam = refwm.get_samples()

pair = refwm.get_pairs()
# Rebuild the global pair sets from everything now in the workspace.
# Samples with no `baits` value are grouped with the ICE samples.
refwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
refwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
refwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
#creating a sample set
# Same grouping for sample sets; a sample whose id is the string 'nan' is excluded.
refwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
refwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
refwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
refwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])

In [None]:
#and CN
# Same upload as for the mutation workspace, applied to the copy-number workspace.
# NOTE(review): this cell duplicates the previous one almost line for line; a shared helper
# taking the workspace manager as argument would keep the two in sync.
cnwm = dm.WorkspaceManager('broad-firecloud-ccle/DepMap_WES_CN_hg38')
cnwm = cnwm.disable_hound()
cnwm.upload_samples(samples)
cnwm.upload_entities('pairs', pairs)
cnwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
sam = cnwm.get_samples()

pair = cnwm.get_pairs()
# Samples with no `baits` value are grouped with the ICE samples.
cnwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
cnwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
cnwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
#creating a sample set
# NOTE(review): only the 'all_ice' sample set is updated here; the other three updates are commented out — confirm this is intended.
#cnwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
#cnwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
#cnwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
cnwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])

## Check that we have all the cell lines we expect for this release

This involves comparing to the list in the Google sheet "Cell Line Profiling Status."

_As the list cannot be parsed, we are not comparing it for now_

In [None]:
# this function may not work - it hasn't been tested
# NOTE(review): `newsample` is not defined anywhere in the visible notebook, so this cell
# raises NameError as written; the column name also refers to CN rather than mutations.
url = 'https://docs.google.com/spreadsheets/d/1qus-9TKzqzwUMNWp8S1QP4s4-3SsMo2vuQRZrNXf7ag/edit?ts=5db85e27#gid=0&fvid=1627883727'

compareToCuratedGS(url, sample = newsample[0], samplesetname = samplesetname, colname = 'CN New to internal')

# run the pipeline

We are using Dalmatian to send requests to Terra. We are running a set of 5 tasks to generate the mutation dataset:

*   For new samples in DepMap, run the ICE version of this task. CCLE2 samples used Agilent targets, so this pipeline should be used instead. The pipelines are identical in terms of their outputs, but the proper targets, baits, and pseudo normal should be used based on how the samples were sequenced.

    **ICE_CGA_Production_Analysis_Pipeline_Cell_Lines_copy** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22) OR


    **AGILENT_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22)

*   **common_variant_filter** (breardon/common_variant_filterSnapshot ID: 3)
*   **filterMAF_on_CGA_pipeline** (gkugener/filterMAF_on_CGA_pipelineSnapshot ID: 8)
*   **aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1)

The outputs to be downloaded will be saved in the sample set that was run. The output we use for the release is:


*   **passedCGA_filteredMAF_aggregated** 

There are several other tasks in this workspace. In brief:



*   **CGA_Production_Analysis_Pipeline_Cell_Lines** (lelagina/CGA_Production_Analysis_Pipeline_Cell_LinesSnapshot ID: 12). This task is the same as the ICE and AGILENT prefixed version above, except that it relied on pulling the baits and targets to use from the metadata stored for the samples. Having AGILENT and ICE versions specified made the uploading and running process easier.
*   **SANGER_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22). This task was trying to run the CGA pipeline on the Sanger WES data, using a Sanger pseudo normal. In its current implementation, this task fails to complete for the samples.
*   **UNFILTERED_aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1). Aggregates the MAF outputted by the CGA cell line pipeline prior to the common variant filter and germline filtering tasks. This can give us insight to which mutations are getting filtered out when. We may want to potentially include this MAF in the release so people can see why certain mutations of interest may be getting filtered out.
*   WES_DM_Mutation_Calling_Pipeline_(standard |expensive) (gkugener/WES_DM_Mutation_Calling_PipelineSnapshot ID: 2). This was a previous mutation calling pipeline implemented for CCLE. We do not use this pipeline any more as the CGA pipeline looks better.
*   aggregate_filterMAF_CGA (CCLE/aggregate_filterMAF_CGASnapshot ID: 1). An aggregation MAF task that we used in the past. We do not use this task anymore.
*   calculate_mutational_burden (breardon/calculate_mutational_burdenSnapshot ID: 21). This task can be used to calculate the mutational rate of the samples. We do not make use of this data in the release although it could be of interest.
*   summarizeWigFile (breardon/summarizeWigFileSnapshot ID: 5). CCLF ran this task (might be necessary for the mutational burden task). For our workflow, we do not run it.

## On Terra

In [None]:
submission_id1 = refwm.create_submission("CGA_WES_CCLE_ICE", samplesetname,'pair_set',expression='this.pairs')


### copy pairs data to sample data

In [None]:
pairs = refwm.get_pairs()

In [None]:
# Keep only the selected pairs that have a validated MAF, drop the pair-specific columns,
# and re-index by the part of the pair id before the first '_' so the remaining columns
# can be written onto the samples.
# NOTE(review): `tokeep` is not defined anywhere in the visible notebook — confirm where it comes from.
pairs = pairs[pairs.index.isin(tokeep)]
pairs = pairs[~pairs['mutation_validator_validated_maf'].isna()]
pairs = pairs.drop(columns=['case_sample','control_sample','participant_id'])
pairs.index = [i.split('_')[0] for i in pairs.index]

In [None]:
refwm.update_sample_attributes(pairs)

### Germline

In [1]:
submission_id2 = refwm.create_submission("cnn-variant-filter", samplesetname, 'sample_set', expression='this.samples')

SyntaxError: invalid syntax (<ipython-input-1-1cbc8561191b>, line 2)

Continuing

In [None]:
# Wait on the CGA pipeline submission, then start the common variant filter.
# `submission_id1` is reused: from here on it refers to the new submission.
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("common_variant_filter", samplesetname, 'sample_set', expression='this.samples')

In [None]:
# Germline branch: wait on the cnn-variant-filter submission, then aggregate the VCFs.
# `submission_id2` is reused for the aggregation submission.
terra.waitForSubmission(refworkspace, submission_id2)
submission_id2 = refwm.create_submission("aggregate_vcfs", samplesetname)

In [None]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("filterMAF_on_CGA_pipeline", samplesetname,'sample_set',expression='this.samples')

### filtered

In [None]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("aggregateMAFs_selectFields_copy", samplesetname)

### unfiltered

In [None]:
submission_id3 = refwm.create_submission("aggregateMAFs_selectFields_unfiltered", samplesetname)

In [None]:
terra.waitForSubmission(refworkspace, [submission_id1,submission_id2, submission_id3])

### Save the workflow configurations used

In [None]:
terra.saveConfigs(refworkspace,'./data/'+samplesetname+'/Mutconfig')

## On local


### Remove some data files to save money

In [None]:
# Delete the listed sample attributes from the workspace together with the files they
# point to (delete_files=True). This is destructive and cannot be undone from the notebook.
res = refwm.get_samples()
toremove = ["fixedmate_bam"]
for val in toremove:
    refwm.disable_hound().delete_entity_attributes('sample', res[val], delete_files=True)

In [None]:
! gsutil -m rm "gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b76b912/9e3cc501-3f08-47fb-87a5-0359febb833c/**/call-tumorMM_Task/*.cleaned.bam"

In [None]:
# sometimes it does not work so better check again
# Missing values are removed with dropna(): an `is not np.nan` identity test only
# filters the np.nan singleton and lets other NaN float objects through.
a = res.fixedmate_bam.dropna().tolist()
gcp.rmFiles(a)

### downloading from terra

In [None]:
sam = refwm.get_samples()

In [None]:
# Cell lines present in the mutation table but absent from the workspace samples.
# NOTE(review): `mutations` is only loaded in the Validation section further down, so this
# cell depends on out-of-order execution.
nowes = set(mutations.DepMap_ID)-set(sam.arxspan_id)
nowes

In [None]:
mutations.columns

In [None]:
# Cell lines with mutations that are neither in the workspace nor in the reference sheet.
# (The set computed in the earlier cell is named `nowes`; `nows` was a typo raising NameError.)
nothing = nowes - set(ccle_refsamples.arxspan_id)
nothing

In [None]:
set(mutations[mutations.DepMap_ID.isin(nothing) & ~mutations.SangerWES_AC.isna()].DepMap_ID)

In [None]:
res = refwm.get_sample_sets().loc["all"]
res

In [None]:
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

### get QC files

In [None]:
dataMut = getWESQC(workspace=refworkspace ,only=[], qcname=["gatk_cnv_all_plots", "lego_plotter_pngs", "copy_number_qc_report", "ffpe_OBF_figures", "mut_legos_html", "oxoG_OBF_figures", "tumor_bam_base_distribution_by_cycle_metrics", "tumor_bam_converted_oxog_metrics"])

In [None]:
dataBam = getWESQC(workspace=refworkspace ,only=[], qcname=[ "tumor_bam_alignment_summary_metrics", "tumor_bam_bait_bias_summary_metrics", "tumor_bam_gc_bias_summary_metrics", "tumor_bam_hybrid_selection_metrics", "tumor_bam_insert_size_histogram", "tumor_bam_insert_size_metrics", "tumor_bam_pre_adapter_summary_metrics", "tumor_bam_quality_by_cycle_metrics", "tumor_bam_quality_distribution_metrics", "tumor_bam_quality_yield_metrics"])

In [None]:
# Reference samples indexed by cds_sample_id, read from temp/newrefCN.csv.
# NOTE(review): this file is not written anywhere in this notebook — presumably it is
# produced by the copy-number notebook, which must then be run first; confirm.
new_refsamples = pd.read_csv('temp/newrefCN.csv',index_col="cds_sample_id")

In [None]:
def _prependQC(refsamples, qcfiles, column):
    """Prepend each sample's QC file list to `column` of `refsamples`, in place.

    Args:
        refsamples: DataFrame indexed by sample id.
        qcfiles: dict mapping sample id -> QC files; the key 'nan' is skipped.
        column: name of the column holding the comma-separated QC values.
    """
    for sample_id, files in qcfiles.items():
        if sample_id == 'nan':
            continue
        previous = refsamples.loc[sample_id, column]
        # A sample with no earlier QC entry holds NaN; concatenating a str with it
        # would raise TypeError, so the new value is written on its own.
        if pd.isna(previous):
            refsamples.loc[sample_id, column] = str(files)
        else:
            refsamples.loc[sample_id, column] = str(files) + ',' + previous


# Mutation-pipeline QC goes to 'processing_qc', BAM metrics to 'bam_qc'.
_prependQC(new_refsamples, dataMut, 'processing_qc')
_prependQC(new_refsamples, dataBam, 'bam_qc')
new_refsamples.to_csv('temp/newrefWES.csv')

### retrieving unfiltered mutations

In [None]:
unfiltered = res['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/mutation_unfiltered_terra_merged.txt"

In [None]:
# Load the aggregated unfiltered MAF (tab-separated).
# 'L6' is a Python codec alias for ISO-8859-10 (Latin-6).
# "__UNKNOWN__" and "." are read as missing values, in addition to pandas' defaults.
unfiltered = pd.read_csv('temp/mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__",'.'])

In [None]:
# Select uninformative columns to drop: those missing in more than 99% of rows, and
# those holding a single distinct value.
# read_csv stores missing entries as real NaN, so they are detected with isna() rather
# than by comparing with the string 'nan' (which never matches).
# Missing is counted as a value in the constancy test (dropna=False), so a column with
# one value plus gaps is kept; the cell below fills gaps in such columns.
toremove = []
for val in unfiltered.columns:
    column = unfiltered[val]
    mostly_missing = column.isna().mean() > 0.99
    constant = column.nunique(dropna=False) == 1
    if mostly_missing or constant:
        toremove.append(val)

In [None]:
# Drop a fixed list of unwanted annotation columns plus the uninformative ones found above.
# NOTE: drop() raises KeyError if any listed column is absent from the file.
unfiltered = unfiltered.drop(columns=["UniProt_Site","alt_allele_seen","CCLE_ONCOMAP_overlapping_mutations","failure_reasons","ESP_CA","SVTYPE","id","gnomADg_GT","ESP_GWAS_PUBMED", 'dbSNP_Val_Status', 'qual', 'iHpol', 'QSI_ref', 'BCNoise', 'score', 'Familial_Cancer_Genes_Reference', 'NT']+toremove)

In [None]:
# Give missing entries an explicit default value.
# Missing entries are real NaN after read_csv, so fillna() is needed; replacing the
# string 'nan' leaves them untouched.
unfiltered['somatic'] = unfiltered['somatic'].fillna('False')
unfiltered['HGNC_Status'] = unfiltered['HGNC_Status'].fillna('Unapproved')
unfiltered['judgement'] = unfiltered['judgement'].fillna('REMOVE')

In [None]:
toint =  ["Start_position", "End_position"]
for val in toint:
    unfiltered[val]  = unfiltered[val].astype('Int64')

### retrieving RNAseq vcfs

In [None]:
rnamutations = dm.WorkspaceManager(rnaworkspace).get_sample_sets().loc['All_samples']['merged_vcf']

In [None]:
# TODO: load the merged RNAseq VCF whose path was stored in `rnamutations` by the previous cell.
# The assignment that stood here had no right-hand side (a SyntaxError); it is left as a
# comment so the path held in `rnamutations` is kept until the loading code is written.

In [None]:
mutations[mutations.DepMap_ID=="ACH-000045"]

### retrieving germline mutations

### postprocessing


Here, rather than rerunning the entire analysis, because we know we are adding only WES samples, we can download the previous release's MAF, add the samples, update any annotations, and perform any global filters at the end.

First we need to do an additional step of filtering on coverage and number 

- readMutations
- createSNPs
- addToMainMutation
- filterAllelicFraction
- filterMinCoverage
- mergeAnnotations
- addAnnotation
- maf_add_variant_annotations
- mutation_maf_to_binary_matrix (x3)

In [None]:
# Load the aggregated filtered MAF and build `renaming`, the mapping returned by
# removeOlderVersions for the barcodes present in the file.
file = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(file.columns[:10])
renaming = removeOlderVersions(names = set(file['Tumor_Sample_Barcode']), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")
# Sanity check: rows whose chromosome is '0'.
print(file[file['Chromosome']=='0'])
# NOTE(review): the filtered/renamed frame below is only displayed, never assigned or saved,
# and the R cells re-read the original text file — confirm whether a to_csv was intended.
file[file['Tumor_Sample_Barcode'].isin(renaming.keys())].replace({'Tumor_Sample_Barcode':renaming}).reset_index(drop=True)

In [None]:
# Keep only the samples selected in `renaming`, map their barcodes to the new ids and
# save the result with the id column named DepMap_ID.
# The filter has to use the barcode column as it exists before the rename; if the frame
# already carries DepMap_ID instead, that column is used.
barcode_col = 'Tumor_Sample_Barcode' if 'Tumor_Sample_Barcode' in unfiltered.columns else 'DepMap_ID'
(
    unfiltered[unfiltered[barcode_col].isin(renaming.keys())]
    .replace({barcode_col: renaming})
    .rename(columns={'Tumor_Sample_Barcode': 'DepMap_ID'})
    .to_csv('temp/mutation_unfiltered_terra_merged.csv.gz', index=False)
)

#### saving samples used for 20Q2

In [None]:
# NOTE(review): `version` is a bare name that is not defined in the visible notebook; a
# quoted column name (or the release name) was probably intended — confirm.
ccle_refsamples.loc[renaming.keys(),version]=1
# NOTE(review): the frame saved here is `new_refsamples`, not the `ccle_refsamples` frame
# modified on the line above — confirm which one should carry the flag.
new_refsamples.to_csv('temp/newrefWES.csv')

In [None]:
%%R
newly_merged_maf <- readMutations('temp/mutation_filtered_terra_merged.txt')
new_release <- createSNPs(newly_merged_maf)
names(new_release)

In [None]:
%%R
# Load the previous release MAF from Taiga, using the R-side prevname / prevversion.
previous.release.maf <- load.from.taiga(data.name='depmap-mutations-maf-35fe', data.file=paste0('mutations.',prevname),data.version=prevversion)
# Drop the first column when it is an unnamed leftover row index ('X1' or empty name).
if (colnames(previous.release.maf)[1] == 'X1' || colnames(previous.release.maf)[1] == "") {
 previous.release.maf[,1] <- NULL 
}
prevnames <- names(previous.release.maf)
prevnames

In [None]:
%%R
merged <- addToMainMutation(previous.release.maf, new_release)

In [None]:
%%R
## Adding more
# NOTE(review): this reads the same file that was already merged in the previous cells;
# unless the file was replaced in between, the same samples are added to `merged` a
# second time — confirm addToMainMutation tolerates that, or skip this cell.
newly_merged_maf <- readMutations('temp/mutation_filtered_terra_merged.txt')
new_release <- createSNPs(newly_merged_maf)
print(names(new_release))
merged <- addToMainMutation(merged, new_release)
nrow(merged)

In [None]:
%%R
filtered <- filterAllelicFraction(merged)

In [None]:
%%R
filtered <- filterMinCoverage(filtered$merged, filtered$removed_from_maf)

In [None]:
%%R
clean_annotations <- mergeAnnotations(merged,previous.release.maf)

In [None]:
%%R
# Two annotation passes are chained: the output of the first is the input of the second.
# Guillaume's version
new_release <- addAnnotation(filtered$merged, clean_annotations, colnames(previous.release.maf))
# Allie's version
new_release <- maf_add_variant_annotations(new_release)

In [None]:
def filterCoverage(maf, loc=['CGA_WES_AC'], sep=':',cov=4):
    """Keep the rows of `maf` whose summed second allele-count field is at least `cov`.

    Each column in `loc` holds strings made of two counts joined by `sep`
    (for example "5:10"). Missing entries count as "0<sep>0", and a pair
    containing 'NA' contributes 0 for its second field. Counts are summed
    over all columns in `loc`.

    Args:
        maf: DataFrame of mutations.
        loc: names of the allele-count columns to sum.
        sep: separator between the two counts.
        cov: minimum value required for the summed second field.

    Returns:
        The filtered DataFrame.
    """
    # NOTE(review): only the second field is compared with `cov`; if the columns are
    # "alt:ref", total coverage would be the sum of both fields — confirm.
    muts = np.zeros((len(maf), 2))
    for val in loc:
        pairs = maf[val].fillna('0'+sep+'0').astype(str).str.split(sep).tolist()
        counts = np.array([[v[0], 0] if 'NA' in v else v for v in pairs]).astype(int)
        # reshape keeps the (n, 2) shape when `maf` is empty
        muts += counts.reshape(-1, 2)
    return maf[muts[:, 1] >= cov]

def filterAllelicFraction(maf, loc=['CGA_WES_AC'], sep=':',frac=0.3):
    """Keep the rows of `maf` whose ratio of summed allele counts is at least `frac`.

    Each column in `loc` holds strings made of two counts joined by `sep`
    (for example "5:10"). Missing entries count as "0<sep>0", and a pair
    containing 'NA' contributes 0 for its second field. Counts are summed
    over all columns in `loc`; the ratio is first field / second field.

    Args:
        maf: DataFrame of mutations.
        loc: names of the allele-count columns to sum.
        sep: separator between the two counts.
        frac: minimum ratio required to keep a row.

    Returns:
        The filtered DataFrame. Rows whose two summed counts are both 0 are dropped.
    """
    # NOTE(review): the ratio is first/second; if the columns are "alt:ref", an allelic
    # fraction would be alt/(alt+ref) — confirm the intended definition.
    muts = np.zeros((len(maf), 2))
    for val in loc:
        pairs = maf[val].fillna('0'+sep+'0').astype(str).str.split(sep).tolist()
        counts = np.array([[v[0], 0] if 'NA' in v else v for v in pairs]).astype(int)
        # reshape keeps the (n, 2) shape when `maf` is empty
        muts += counts.reshape(-1, 2)
    # 0/0 gives NaN (row dropped) and x/0 gives inf (row kept); silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = muts[:, 0] / muts[:, 1]
    return maf[ratio >= frac]

def mergeAnnotations(newmaf, additionalmaf, additionalonmerge=[]):
    """Left-merge the annotation columns of `additionalmaf` onto `newmaf`.

    Rows are matched on Chromosome, Start_position, End_position,
    Reference_Allele and Tumor_Seq_Allele1, plus any column named in
    `additionalonmerge`. Every row of `newmaf` is kept.

    Args:
        newmaf: DataFrame of mutations to annotate.
        additionalmaf: DataFrame carrying the extra annotation columns.
        additionalonmerge: extra column names to match on.

    Returns:
        The merged DataFrame. Non-key columns present in both inputs come
        back with pandas' `_x` / `_y` suffixes.
    """
    on = ['Chromosome', 'Start_position', 'End_position', 'Reference_Allele', 'Tumor_Seq_Allele1']
    on.extend(additionalonmerge)

    merged = newmaf.merge(additionalmaf, how='left', on=on)
    # TODO: solve issues with Hugo_Symbol, Entrez_Gene_Id (conflicting values between the
    # two inputs are not reconciled yet and show up as suffixed columns).
    return merged
    
def mergeXY():
    """Unfinished placeholder.

    The body only names `dbSNP_RS.x` and `dbSNP_RS.y`; `dbSNP_RS` is not defined
    in this notebook, so calling this function raises NameError.
    NOTE(review): presumably meant to reconcile the `.x` / `.y` column pairs left by a
    merge — confirm before implementing.
    """
    dbSNP_RS.x, dbSNP_RS.y


def addAnnotation(maf, NCBI_Build='37', Strand="+"):
    """Add constant `NCBI_Build` and `Strand` columns and move the source columns last.

    The draft of this function ended in an unclosed column selection and
    returned nothing. The listed columns are treated here as the trailing
    column order: those present in `maf` are moved to the end in that order
    and every other column is kept in place.

    Args:
        maf: DataFrame of mutations.
        NCBI_Build: value written to the `NCBI_Build` column of every row.
        Strand: value written to the `Strand` column of every row.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # NOTE(review): the original column list was truncated — confirm the full intended order.
    maf = maf.assign(NCBI_Build=NCBI_Build, Strand=Strand)
    trailing = ['current', 'SangerWES_AC', 'SangerRecalibWES_AC', 'RNAseq_AC', 'HC_AC', 'RD_AC', 'WGS_AC']
    tail = [name for name in trailing if name in maf.columns]
    head = [name for name in maf.columns if name not in tail]
    return maf[head + tail]

def mafToMat(maf, col, boolify = False, samplesCol = "DepMap_ID", mutNameCol="Hugo_Symbol"):
    """Turn a MAF table into a (mutation name x sample) matrix of `col` values.

    When a sample has several rows for the same mutation name, the first one
    is used. Cells with no mutation are 0 (False when `boolify` is set).

    Args:
        maf: DataFrame of mutations.
        col: name of the column whose values fill the matrix.
        boolify: return booleans (value != 0) instead of floats.
        samplesCol: column holding the sample ids (matrix columns).
        mutNameCol: column holding the mutation names (matrix rows).

    Returns:
        DataFrame indexed by mutation name with one column per sample, both sorted.
    """
    # One pivot replaces the per-sample join loop (and the 'fake' seed column it needed).
    firsts = maf.drop_duplicates(subset=[samplesCol, mutNameCol])
    mut = firsts.pivot(index=mutNameCol, columns=samplesCol, values=col)
    # Clear the axis names so the frame has the same layout as before.
    mut = mut.rename_axis(index=None, columns=None)
    # DataFrame has no nan_to_num method; fillna is the pandas equivalent.
    return mut.fillna(0).astype(bool if boolify else float)

In [None]:
# NOTE(review): unfinished draft of the Python post-processing. It does not parse as written:
# `CCLE2othermutations = ` has no right-hand side, mafToMat is called without its required
# `col` argument, and all output file names are placeholders ('.csv', '' + muttype + ".csv").
filtered_mutations = filterCoverage(mutations)
filtered_mutations = filterAllelicFraction(filtered_mutations)

# NOTE(review): this annotates the unfiltered `mutations`, and `merged_mutations` is not used afterwards.
merged_mutations = addAnnotation(mutations)

mafToMat(filtered_mutations[filtered_mutations.damaging]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.other]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.hotspot]).to_csv('.csv')


CCLE2othermutations = 

mutations = mergeAnnotations(filtered_mutations, CCLE2othermutations)

#making 
for muttype in ['']:
    mafToMat(CCLE2othermutations[CCLE2othermutations.damaging & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.other & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.hotspot & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")

# Validation

## Compare to previous release

I would run some checks here comparing the results to the previous release's MAF. Namely:

- Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
- Count the total number of mutations observed by position (group by chromosome, start position, end position and count the number of mutations)
- Look at specific differences between the two MAFs (join on DepMap_ID, Chromosome, Start position, End position, Variant_Type). I would do this for WES only

In [48]:
# Load the release files produced by the post-processing step: the full MAF and the
# three derived tables, printing the number of rows of each derived table.
mutations = pd.read_csv('temp/mutations.'+release+'.all.csv')
damaging_mutation = pd.read_csv('temp/damaging_mutation.'+release+'.all.csv')
print(len(damaging_mutation))
other_mutation = pd.read_csv('temp/other_mutation.'+release+'.all.csv')
print(len(other_mutation))
hotspot_mutation = pd.read_csv('temp/hotspot_mutation.'+release+'.all.csv')
print(len(hotspot_mutation))

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
mutations.columns

In [None]:
# Cell lines for which no mutation has a value in any of the four listed columns
# (CGA_WES_AC, SangerWES_AC, WGS_AC, SangerRecalibWES_AC).
set(mutations.DepMap_ID) - set(mutations[~(mutations['CGA_WES_AC'].isna() & mutations['SangerWES_AC'].isna() & mutations['WGS_AC'].isna() & mutations['SangerRecalibWES_AC'].isna())].DepMap_ID)

In [None]:
len(a)

In [None]:
mutations[mutations.DepMap_ID=="ACH-000458"].sum(0)

In [None]:
# Rows for ACOT4 at position 74058831. Both conditions are combined into a single mask:
# applying the second mask to the already-filtered frame makes pandas reindex it and
# emit a "Boolean Series key will be reindexed" warning.
mutations[(mutations["Hugo_Symbol"]=="ACOT4") & (mutations['Start_position']==74058831)]

In [None]:
ac_data = mutations[[val for val in mutations.columns.values if '_AC' in val]]

In [None]:
ac_names = ac_data.columns.values
ac_data = ac_data.values

In [None]:
ac_data.shape[0]

## Do some checks and manual rescuing

In [None]:
mutations[mutations.DepMap_ID=="ACH-003000"]

## check important mutations

In [None]:
# check MOLM13, MV411 cell lines- The well known mutation status of FLT3

In [None]:
# check TP53 mutation 

In [None]:
# Remove the mutations that have no value in any allele-count (`*_AC`) column, and keep
# the number of removed rows in `allnan`.
# The earlier loop subscripted the `drop` method (`mutations.drop[pos]`, a TypeError),
# tested for NaN by identity with np.nan, and hard-coded the number of AC columns as 7.
toofew = 0
ac_columns = [col for col in mutations.columns if '_AC' in col]
allnan = 0
if ac_columns:  # with no AC column there is nothing to test and nothing is removed
    no_evidence = mutations[ac_columns].isna().all(axis=1)
    allnan = int(no_evidence.sum())
    mutations = mutations[~no_evidence]

In [None]:
allnan

### basic counts

In [None]:
#Count the total number of mutations per cell line, split by type (SNP, INS, DEL)

In [None]:
# Count the total number of mutations observed by position

Are mutations consistent?

In [None]:
# To check this: group the mutations table by the Chromosome, Start_position, End_position, Reference_Allele and Tumor_Seq_Allele1 columns; every row in a group should carry the same annotation in the other columns (protein change, exac_af, etc.)

QC mutations: for a known dependency, check that it matches the mutation status of this gene (if P53 is mutated, the line cannot have a dependency on P53 or MDM2/MDM4; the inverse holds for BRAF and KRAS with respect to themselves).

In [None]:
# Show the name and version configured for the release before the previous one.
prevprevname,prevprevversion 

In [None]:
# Last 17 columns of the rows belonging to cell line ACH-001546.
mutations[mutations.DepMap_ID=="ACH-001546"][mutations.columns[-17:]]

In [31]:
# Set of DepMap_IDs found in the mutation calls file of release `prevprevname`
# (version `prevprevversion` of depmap-mutation-calls-9be3).
# The Public filtering cell further down keeps only cell lines contained in this set.
prevprev= set(tc.get(name='depmap-mutation-calls-9be3', file= "depmap_"+prevprevname+"_mutation_calls", version = prevprevversion).DepMap_ID.tolist())

# Uploading to Taiga

In [25]:
# Load tab index 6 of the spreadsheet at `sheeturl` and build the three exclusion
# lists, skipping every cell whose string form is "nan" (empty cells).
gsheets = sheets.get(sheeturl).sheets[6].to_frame()
wes_dmc_embargo = [entry for entry in gsheets['WES_DMC_embargo'].tolist() if str(entry) != "nan"]
wes_embargo = [entry for entry in gsheets['WES_embargo'].tolist() if str(entry) != "nan"]
blacklist = [entry for entry in gsheets['blacklist'].tolist() if str(entry) != "nan"]

In [None]:
# Show the three exclusion lists read from the spreadsheet.
wes_embargo, wes_dmc_embargo, blacklist

In [None]:
# Clone the release-readmes repository next to this repository (one directory up),
# then return to the previous directory.
! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -

In [None]:
# Update the local clone of the release-readmes repository.
! cd ../depmap-release-readmes && git pull

In [None]:
# Run make_new_release.py for `release`, then commit and push the result
# (the commit message is the release name).
!cd ../depmap-release-readmes/ && python3 make_new_release.py $release && git add . && git commit -m $release && git push 

In [None]:
# Pull the readmes repository and move the Internal README of this release to
# ../ccle_processing/temp/README (path relative to the readmes clone).
# The value shown by the cell is the exit status returned by os.system; it is not checked.
os.system('cd ../depmap-release-readmes && git pull && mv release-'+release+'/internal-'+release+'.txt ../ccle_processing/temp/README && cd -')

In [None]:
# Upload temp/mutations.<release>.all.csv as a table to the Taiga dataset
# depmap-mutations-maf-35fe; the text below is passed as the dataset description.
# NOTE(review): the tail of the description (rpkm / counts / CN rows and columns) looks
# like it was copied from other datasets — confirm it belongs in this one.
tc.update_dataset(dataset_permaname="depmap-mutations-maf-35fe",
                 upload_file_path_dict={'temp/mutations.'+release+'.all.csv': 'TableCSV'}, 
                 dataset_description="""
# Mutations

filtered and unfiltered mutation files from Broad WES and Sanger WES data mapped to hg19
The MAF file for DepMap that includes all of the latest WES samples. This MAF is generated by merging CCLE (WGS, RNAseq, RD, HC) and Sanger (WES) data.

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal. Must use subsetted dataset instead. These data will not make it on the portal starting 19Q1. With the DMC portal, there is new cell line release prioritization as to which lines can be included, so a new taiga dataset will be created containing CN for the portal.

version 1:  In 19Q1 the WES_AC column has been replaced by two columns, VA_WES_AC and CGA_WES_AC. We are currently using the Van Allen and CGA based pipeline to generate mutation calls. The CGA pipeline includes more filtering on the MAFs than VA and has a better INDEL caller. However, some of these filters may be removing some variants of interest that are still capture by the VA pipeline, which is why both a retained for now. DEPRECATED:  Missing the VA_WES_AC, CGA_WES_AC columns
version 2: 19Q1 data
version 3: 19Q2 data. We are no longer using the CCLE_WES_AC column. We are only using the CGA pipeline for mutation calls.
version 4: Updating to 19Q3interim DEPRECATED
version 5: Updating to 19Q3interim DEPRECATED
version 6: Updating to 19Q3interim
version 7: Updating to 19Q3 DEPRECATED
version 8: reparing the missing mutation problem DEPRECATED
version 9: reparing the missing column problem


version10:
Adding 52 new cell lines. 
Some cells lines have been flagged as:

version11:
adding missing cell lines

Adding 52 new cell lines. 
Some cells lines have been flagged as:

 - having bad looking copy ration plots = 
 - Genes having a similar CN value accross all []

version 12:

adding 8 new cell lines

version 13:

removing a wrong column

version 14:

adding 8 new cell lines. Adding .all. since we are soon going to release a restricted set of mutations. this one contains everything which is not necessarily what we want


genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
 """)

## Internal

In [None]:
# Display the hotspot matrix as currently held in memory.
hotspot_mutation

In [None]:
# Compare the cell lines of the current table with the previous release's
# '.all' file in the Internal dataset (depmap-mutation-calls-9be3).
# NOTE(review): version=24 is hard-coded here rather than taken from `prevversion` — confirm it is the intended baseline.
prevmut = tc.get(name='depmap-mutation-calls-9be3', version=24, file='depmap_'+prevname+'_mutation_calls.all')
print('shoud be None')
# Lines that were in the previous upload but are missing now.
print(set(prevmut.DepMap_ID) - set(mutations.DepMap_ID))
print("new lines")
# Lines that are new in this release; appended to the description in the upload cell below.
newlines = set(mutations.DepMap_ID) - set(prevmut.DepMap_ID) 
newlines

In [None]:
# Upload the MAF table and the three binary matrices from temp/ as a new version of the
# Internal dataset (depmap-mutation-calls-9be3); the text below is the dataset description.
# `newlines` is a set, so it is converted with str() before being appended to the
# description (concatenating a str and a set raises TypeError); this matches the
# Public upload cell.
# NOTE(review): the temp/*.all files uploaded here are written by the filtering cells of
# the DMC and Public sections; no cell in the Internal section writes them — confirm the
# files on disk are the Internal versions before running.
tc.update_dataset(dataset_permaname="depmap-mutation-calls-9be3",
                 upload_file_path_dict={'temp/depmap_'+release+'_mutation_calls.all': 'TableCSV',
                                        'temp/damaging_mutation.all': 'NumericMatrixCSV',
                                        'temp/other_mutation.all': 'NumericMatrixCSV',
                                        'temp/hotspot_mutation.all': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# Internal Mutations

Mutation calls for Internal DepMap data

* Version 1 Internal 18Q1*

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18Q1_maf_20180202.txt`
* Version 2-4 Internal 18Q2*

merged mutations and indels file (1,606 cell lines, including CCLE and Sanger WES reanalysis)
original source: 
`/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18q2_maf_20180502.txt`
Binary matrices:
- damaging: if isDeleterious is true
- missense: if isDeleterious is false
- hotspot: if missense and either TCGA or COSMIC hotspot
Version 2 contains the MAF file
* Version 5-6 Internal 18Q3*

version 5 deprecated

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18q3_maf_20180716.txt`

Binary matrices:
- damaging: if isDeleterious is true
- missense: if isDeleterious is false
- hotspot: if missense and either TCGA or COSMIC hotspot
- Rows: cell line, Broad (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

MAF file

* Version 7-8 Internal 18Q4*

version 8 just changes a column name in the MAF file from Broad_ID to DepMap_ID

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_DepMap_18Q4_maf_20181028.txt`

* Version 9-12 Internal 19Q1*

version 12 updates the column name from VA_WES_AC to CCLE_WES_AC

version 11+ uses an updated definition for hotspot mutations

version 12 contains the correct data for 19Q1

* Version 13 Internal 19Q2*

* Version 14-15 Internal 19Q3*

version 15 fixed entrez ids

* Version 16 Internal 19Q4*

adding 35 new cell lines.

* Version 16 Internal 19Q4*
uploading as matrices

* Version 17 Internal 19Q4*
removing unauthorized lines and setting as matrices

* Version 18 Internal 19Q4*
removing unauthorized lines and setting as matrices

* Version 19 Internal 20Q1*
uploading 8 new lines

* Version 20 Internal 20Q1*
removing unauthorized cl

* Version 21 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 22 Internal 20Q2*
removing 2 cell lines

* Version 23 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 24 Internal 20Q2*
updating the blacklists

*** Variant annotation column ***

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:

- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines))

In [None]:
# To add to a virtual dataset
AddToVirtual(virtual_internal, 'depmap-mutation-calls-9be3', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls'),])#('README','README')])
# To add to a eternal dataset
AddToVirtual('depmap-a0ab', 'depmap-mutation-calls-9be3', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls')])

## DMC

In [None]:
# Pull the readmes repository and move the DMC README of this release to
# ../ccle_processing/temp/README (path relative to the readmes clone).
# The variable is `release`; the previous spelling `releAse` is not defined anywhere
# and raised a NameError before the command could run.
os.system('cd ../depmap-release-readmes && git pull && mv release-'+release+'/dmc-'+release+'.txt ../ccle_processing/temp/README && cd -')

In [9]:
# DMC subset: remove every cell line listed in `wes_embargo` from the MAF table and from
# the three matrices, write each result to its temp/ upload file, and print the length
# of each table before and after filtering.
# The MAF table is filtered on its DepMap_ID column; the matrices are filtered on their index.
# NOTE: the four names are rebound to the filtered tables, so the unfiltered data is no
# longer available under these names after this cell has run.
print(len(mutations))
mutations = mutations[~mutations.DepMap_ID.isin(wes_embargo)]
print(len(mutations))
mutations.to_csv('temp/depmap_'+release+'_mutation_calls.all', index=False)
print(len(damaging_mutation))
damaging_mutation = damaging_mutation[~damaging_mutation.index.isin(wes_embargo)]
print(len(damaging_mutation))
damaging_mutation.to_csv('temp/damaging_mutation.all')
print(len(other_mutation))
other_mutation = other_mutation[~other_mutation.index.isin(wes_embargo)]
print(len(other_mutation))
other_mutation.to_csv('temp/other_mutation.all',)
print(len(hotspot_mutation))
hotspot_mutation = hotspot_mutation[~hotspot_mutation.index.isin(wes_embargo)]
print(len(hotspot_mutation))
hotspot_mutation.to_csv('temp/hotspot_mutation.all',)

1301656
1297418
1757
1744
1758
1745
1725
1712


In [12]:
# Compare the cell lines of the DMC-filtered table with the previous release's file
# in the DMC dataset (depmap-mutation-calls-dfce).
# NOTE(review): version=15 is hard-coded — confirm it is the intended baseline.
prevmut = tc.get(name='depmap-mutation-calls-dfce', version=15, file='depmap_'+prevname+'_mutation_calls')
print('shoud be None')
# Lines that were in the previous upload but are missing now.
print(set(prevmut.DepMap_ID) - set(mutations.DepMap_ID))
print("new lines")
# Lines that are new in this release; appended to the description in the upload cell below.
newlines = set(mutations.DepMap_ID) - set(prevmut.DepMap_ID) 
newlines

[##################]100% |  40.4 MiB/s | 277.3 MiB / 277.3 MiB | Time:  0:00:06


shoud be None
set()
new lines


{'ACH-001533', 'ACH-001574', 'ACH-002021', 'ACH-002065'}

In [None]:
# Upload the MAF table and the three binary matrices from temp/ as a new version of the
# DMC dataset (depmap-mutation-calls-dfce); the text below is the dataset description.
# `newlines` is a set, so it is converted with str() before being appended to the
# description (concatenating a str and a set raises TypeError); this matches the
# Public upload cell.
tc.update_dataset(dataset_permaname="depmap-mutation-calls-dfce",
                 upload_file_path_dict={'temp/depmap_'+release+'_mutation_calls.all': 'TableCSV',
                                        'temp/damaging_mutation.all': 'NumericMatrixCSV',
                                        'temp/other_mutation.all': 'NumericMatrixCSV',
                                        'temp/hotspot_mutation.all': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# DMC Mutations

* Version 1-5 DMC 19Q1*

version 5 is a one-off portal thing because dmc wanted to be able to plot if a gene has any mutation as one-hot encoded value in the x/y axes of the data explorer It adds the any_mutation matrix, but does not change the others. Code used to generate:

```
from taigapy import TaigaClient

c = TaigaClient()

dmc_19q1_mutation_taiga_root = "depmap-mutation-calls-dfce.3/"
other_matrix = c.get(dmc_19q1_mutation_taiga_root + "other_mutation")
damaging_matrix = c.get(dmc_19q1_mutation_taiga_root + "damaging_mutation")
hotspot_matrix = c.get(dmc_19q1_mutation_taiga_root + "hotspot_mutation")

df = other_matrix.append(damaging_matrix)
df = df.groupby(level=0).sum()

df = df.append(hotspot_matrix)
df = df.groupby(level=0).sum()

df[df > 1] = 1

df.to_csv('any_mutation.csv')
```
The code uses version 3 because the dmc portal was using version 3

version 4 updates the column name from VA_WES_AC to CCLE_WES_AC

version 3 has an updated definition for hotspot mutations

version 2+ contains the correct data for 19Q1

* Version 6 DMC 19Q2*

* Version 7-8 DMC 19Q3*
version 8 fixed entrez ids

* Version 9 DMC 19Q4*
adding 52 new cell lines.

* Version 10 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 11 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 12 Internal 20Q1*
uploading 8 new lines

* Version 13 Internal 20Q1*
removing unauthorized cl

* Version 14 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 15 Internal 20Q2*
removing 2 lines

* Version 15 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 15 Internal 20Q3*
updating the blacklists


MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:
- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines))

In [None]:
# To add to a virtual dataset
# Register the newly uploaded mutation calls file under the name CCLE_mutations
# in the DMC virtual dataset (the README entry is commented out).
AddToVirtual(virtual_dmc, 'depmap-mutation-calls-dfce', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls'),])#('README','README')])

## Public

In [None]:
# Pull the readmes repository and move the Public README of this release to a file
# named README.
# The variable is `release`; the previous spelling `releAse` is not defined anywhere
# and raised a NameError before the command could run.
# NOTE(review): the destination is a bare `README`, resolved inside the readmes clone,
# whereas the Internal and DMC cells move theirs to ../ccle_processing/temp/README —
# confirm which location is intended.
os.system('cd ../depmap-release-readmes && git pull && mv release-'+release+'/public-'+release+'.txt README && cd -')

In [27]:

# Rebind `mutations` to a table held under a release-specific name.
# NOTE(review): `depmap_20Q3_mutation_calls` is not assigned in any cell visible here and
# hard-codes the release name — confirm where it comes from before re-running.
#damaging_mutation
mutations=depmap_20Q3_mutation_calls
#hotspot_mutation
#other_mutation

In [35]:
# Public subset: keep only the cell lines contained in `prevprev`, then remove those listed
# in `wes_dmc_embargo`, for the MAF table and the three matrices. Each result overwrites its
# temp/ upload file, and the length of each table is printed before and after filtering.
# The MAF table is filtered on its DepMap_ID column; the matrices are filtered on their index.
# NOTE: the four names are rebound to the filtered tables, so this cell operates on whatever
# earlier filtering cells left in them.
print(len(mutations))
mutations = mutations[mutations.DepMap_ID.isin(prevprev)]
mutations = mutations[~mutations.DepMap_ID.isin(wes_dmc_embargo)]
print(len(mutations))
mutations.to_csv('temp/depmap_'+release+'_mutation_calls.all', index=False)
print(len(damaging_mutation))
damaging_mutation = damaging_mutation[damaging_mutation.index.isin(prevprev)]
damaging_mutation = damaging_mutation[~damaging_mutation.index.isin(wes_dmc_embargo)]
print(len(damaging_mutation))
damaging_mutation.to_csv('temp/damaging_mutation.all')
print(len(other_mutation))
other_mutation = other_mutation[other_mutation.index.isin(prevprev)]
other_mutation = other_mutation[~other_mutation.index.isin(wes_dmc_embargo)]
print(len(other_mutation))
other_mutation.to_csv('temp/other_mutation.all')
print(len(hotspot_mutation))
hotspot_mutation = hotspot_mutation[hotspot_mutation.index.isin(prevprev)]
hotspot_mutation = hotspot_mutation[~hotspot_mutation.index.isin(wes_dmc_embargo)]
print(len(hotspot_mutation))
hotspot_mutation.to_csv('temp/hotspot_mutation.all')

1297418
1297418
1744
1744
1745
1745
1712
1712


In [36]:
# Compare the cell lines of the Public-filtered table with the previous release's file
# in the Public dataset (depmap-mutation-calls-9a1a).
# NOTE(review): version=18 is hard-coded — confirm it is the intended baseline.
prevmut = tc.get(name='depmap-mutation-calls-9a1a', version=18, file='depmap_'+prevname+'_mutation_calls')
print('shoud be None')
# Lines that were in the previous upload but are missing now; the upload cell below
# adds them to the description when this set is not empty.
ermgency_removed = set(prevmut.DepMap_ID) - set(mutations.DepMap_ID)
print(ermgency_removed) 
print("new lines")
# Lines that are new in this release; appended to the description in the upload cell below.
newlines = set(mutations.DepMap_ID) - set(prevmut.DepMap_ID) 
newlines

shoud be None
set()
new lines


{'ACH-001533', 'ACH-001574', 'ACH-002021', 'ACH-002065'}

In [38]:
# Description text for the Public dataset upload: version history, the
# Variant_annotation mapping, and finally the string form of the `newlines` set.
description="""
# Public Mutations

Mutation calls for Public DepMap data

* Version 1 Public 18Q1*

original source: CCLE data portal
* Version 2 Public 18Q2*

merged mutations and indels file (1,549 cell lines total, including data for 63 newly released cell lines)
original source: `/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_maf_20180502.txt`
* Version 3-4 Public 18Q3*

version 3 deprecated

original source: `/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_maf_20180718.txt`

Binary matrices:
damaging: if isDeleterious is true
missense: if isDeleterious is false
hotspot: if missense and either TCGA or COSMIC hotspot
Rows: cell line, Broad (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

MAF file

* Version 5 Public 18Q4*

original source: `/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_maf_20181029.txt`

* Version 6-9 Public 19Q1*

version 9 updates the column name from VA_WES_AC to CCLE_WES_AC

version 8 uses an updated definition for hotspot mutations

version 9 contains the correct data for 19Q1

* Version 10 Public 19Q2*

* Version 11-12 Public 19Q3*

version 12 fixed entrez ids

* Version 13 Public 19Q4*

adding 52 new cell lines

* Version 14 Public 19Q4*
removing unauthorized lines and setting matrices

* Version 15 Public 20Q1*
adding 8 new lines 

* Version 16 Public 20Q1*
removing an unauthorized line

* Version 17 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 18 Internal 20Q2*
removing 2 lines

* Version 19 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 20 Internal 20Q3*
updating the blacklists

* Version 21 Internal 20Q3*
updating the dmc

* Version 22 Internal 20Q3*
readding two already released samples to the public list

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:

- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines)

# When some previously released lines are missing from this release, append them
# to the description under a "WE REMOVED" marker.
if len(ermgency_removed):
    description+="""
    
    !! WE REMOVED!!:
    """+str(ermgency_removed)

# Upload the MAF table and the three binary matrices from temp/ as a new version of the
# Public dataset (depmap-mutation-calls-9a1a); the README entry is commented out.
tc.update_dataset(dataset_permaname="depmap-mutation-calls-9a1a",
                 upload_file_path_dict={'temp/depmap_'+release+'_mutation_calls.all': 'TableCSV',
                                        'temp/damaging_mutation.all': 'NumericMatrixCSV',
                                        'temp/other_mutation.all': 'NumericMatrixCSV',
                                        'temp/hotspot_mutation.all': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description=description)

Uploading depmap_20Q3_mutation_calls...
hitting https://cds.team/taiga/api/datafile/00b151ca35a14f6d9f1be95ef24ea368
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the fi

'8cc4900a50874d8593b0bfc591001360'

In [39]:
# Register the newly uploaded mutation calls file under the name CCLE_mutations
# in the Public virtual dataset (the README entry is commented out).
AddToVirtual(virtual_public, 'depmap-mutation-calls-9a1a', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls'),])#('README','README')])

[('CCLE_mutations', 'depmap-mutation-calls-9a1a.22/depmap_20Q3_mutation_calls'), ('CCLE_gene_cn', 'depmap-wes-cn-data-97cc.34/public_20Q3_gene_cn'), ('Achilles_gene_effect_unscaled', 'avana-public-tentative-20q3-3e73.5/gene_effect_unscaled'), ('Achilles_high_variance_genes', 'avana-public-tentative-20q3-3e73.5/high_variance_genes'), ('Achilles_guide_efficacy', 'avana-public-tentative-20q3-3e73.5/guide_efficacy'), ('CCLE_fusions_unfiltered', 'gene-fusions-6212.14/unfiltered_fusions_20Q3'), ('common_essentials', 'avana-public-tentative-20q3-3e73.5/essential_genes'), ('Achilles_logfold_change_failures', 'avana-public-tentative-20q3-3e73.5/logfold_change_failures'), ('CCLE_expression', 'depmap-rnaseq-expression-data-ccd0.25/public_20Q3_proteincoding_tpm'), ('Achilles_raw_readcounts', 'avana-public-tentative-20q3-3e73.5/raw_readcounts'), ('Achilles_raw_readcounts_failures', 'avana-public-tentative-20q3-3e73.5/raw_readcounts_failing'), ('README', 'public-20q3-3d35.22/README'), ('CCLE_segment