# Mutation Pipeline

In [177]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import terra
from JKBio.utils import helper as h
from JKBio.google import gcp
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm
from JKBio.google.google_sheet import dfToSheet

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import pearsonr,spearmanr

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()

my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
# do the first steps of https://medium.com/craftsmenltd/from-csv-to-google-sheet-using-python-ef097cb014f9
creds = '../.credentials.json'

sheets = Sheets.from_files(my_id, mystorage_id)
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## boot up

we are instanciating all the parameters needed for this pipeline to run

In [154]:
samplesetname = "21Q1"

workspace1="broad-genomics-delivery/Getz_IBM_CellLines_Exomes"
workspace2="broad-firecloud-ccle/CCLE_DepMap_WES"
workspace3="broad-genomics-delivery/CCLE_DepMap_WES"

workspace6="terra-broad-cancer-prod/CCLE_DepMap_WES"
workspace7="terra-broad-cancer-prod/Getz_IBM_CellLines_Exomes"
refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"

rnaworkspace="broad-firecloud-ccle/DepMap_hg38_RNAseq"

source1="ibm"
source2="ccle"
source3="ccle"
source6="ccle"
source7="ibm"

refsheet_url = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
refsheet_id = "555466897"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"
extract_to_change = {'from_arxspan_id': 'participant'}

MutationWESmethods=[
    "broadinstitute_gtex/samtofastq_v1-0_BETA/6",
    "broadinstitute_gtex/star_v1-0_BETA/7",
    "broadinstitute_gtex/rsem_v1-0_BETA/6",
    "jkobject/rsem_aggregate_results/5",
    "jkobject/rnaseq-germline-snps-indels/7",
    "broadinstitute_gtex/rnaseqc2_v1-0_BETA/2",
    "gkugener/STAR_fusion/17",
    "jkobject/aggregate_vcfs/22",
    "gkugener/Aggregate_files_set/2",
]

maxage='2020-09-10'

In [145]:
release = samplesetname

ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
#wm1 = dm.WorkspaceManager(workspace1)
#wm2 = dm.WorkspaceManager(workspace2)
#wm3 = dm.WorkspaceManager(workspace3)

wm6 = dm.WorkspaceManager(workspace6)
wm7 = dm.WorkspaceManager(workspace6)

refwm = dm.WorkspaceManager(refworkspace)

## Adding new data

We are looking for new samples in a range of workspaces.

They are quite messy and might contains duplicates, contain broken file paths...

- We are thus looking at the bam files one by one and comparing them with our own bams. 
- We remove broken files, duplicates and add new version of a cell line's bam if we find some.

In [146]:
ccle_refsamples

Unnamed: 0_level_0,arxspan_id,version,sm_id,PDO,datatype,size,ccle_name,stripped_cell_line_name,participant_id,cellosaurus_id,...,18q1,18q2,18q3,18q4,19q1,19q2,19q3,19q4,20q1,20q2
cds_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CDS-Np8hgB,ACH-001036,1,,,hybrid_capture,6158697647,,CMK115,PT-W5WB8L8R,CVCL_0217,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDS-U2X6Jq,ACH-000096,1,,,hybrid_capture,1974163413,,G401,PT-paHVRsA6,CVCL_0270,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDS-Fm81Sq,ACH-001106,1,,,hybrid_capture,1504510077,,KOPN8,PT-DtWi0a1e,CVCL_1866,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDS-ai9nhG,ACH-000602,1,,,hybrid_capture,2268020309,,M07E,PT-DDVzRDiZ,CVCL_2106,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDS-76I1en,ACH-000375,1,,,hybrid_capture,2801470212,,G402,PT-irXMLyDH,CVCL_1221,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CDS-VVWDLi,ACH-001338,1,,,wgs,111261029620,,CHP134,PT-sqRdd0QF,CVCL_1124,...,,,,,,,,,,
CDS-WgAbdT,ACH-001703,1,,,wgs,118478462240,,VAL,PT-iFueiW0E,CVCL_1819,...,,,,,,,,,,
CDS-X7YJtK,ACH-001562,1,,,wgs,117193717024,,MERO95,PT-qbPTkrGa,CVCL_2597,...,,,,,,,,,,
CDS-YLQJ19,ACH-002511,1,,,wgs,125056253166,,M140325,PT-ELEMMjP8,U,...,,,,,,,,,,


In [157]:
# we will be missing "primary disease","sm_id", "cellosaurus_id", "gender, "age", "primary_site", "primary_disease", "subtype", "subsubtype", "origin", "comments"
#when SMid: match== 
samples, pairs, noarxspan = GetNewCellLinesFromWorkspaces(refworkspace, stype='wes', maxage=maxage, refurl=refsheet_url, wmfroms = [workspace6, workspace7], sources=[source6,source7], match=['ACH-','CDS-'], participantslicepos=10, accept_unknowntypes=True, extract=extract_to_change, recomputedate=True)

you need to have JKBio in your path:
e.g. have installed JKBio in the same folder as ccle_processing
refsamples is overrided by a refurl
Getting sample infos...

The shape of the sample tsv from <dalmatian.wmanager.WorkspaceManager terra-broad-cancer-prod/CCLE_DepMap_WES>: (232, 243)
Identifying any true duplicates by checking file hashes (this runs for each data source)...
This step can take a while as we need to use gsutil to check the size of each potential duplicate...
listing files in gs
These 14 bam file path do not exist: {'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/UMUC11/v3/UMUC11.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/UMUC6/v3/UMUC6.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/UMUC9/v3/UMUC9.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/UMUC4/v3/UMUC4.bam', 'gs://fc-9d2e10ea-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples[extract['hash']] = [gcp.extractHash(val) for val in gcp.lsFiles(samples[extract["bam"]].tolist(), "-L", 200)]


listing files in gs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples[extract['size']] = [gcp.extractSize(i)[1] for i in lis]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples[extract['update_time']] = [gcp.extractTime(i)[1] for i in lis]


listing files in gs6
listing files in gs
we found and removed 0 samples which did not match our id names: ['ACH-', 'CDS-']
found 150 likely replicate
listing files in gs
Len of samples before removal: 152
Dups from this workspace has len 121:
 ['gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/JEG3/v1/JEG3.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/MP46/v2/MP46.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/CCLE_DepMap_WES_July_Dec_2018/RP-1561/Exome/ICC12/v1/ICC12.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/DepMap_CellLine_WES_June2019/RP-1561/Exome/HO1U1/v2/HO1U1.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/DepMap_CellLine_WES_Oct2019/RP-1561/Exome/OSC19/v2/OSC19.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/DepMap_CellLine_WES_Nov2019/RP-1561/Exome/T3M3/v1/T3M3.bam', 'gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/DepMap_CellLine_WES_batch2_Feb2020/RP-1561/Exome/SNU23


The shape of the sample tsv from <dalmatian.wmanager.WorkspaceManager terra-broad-cancer-prod/Getz_IBM_CellLines_Exomes>: (198, 233)
Identifying any true duplicates by checking file hashes (this runs for each data source)...
This step can take a while as we need to use gsutil to check the size of each potential duplicate...
listing files in gs
These 1 bam file path do not exist: {'gs://fc-ea03c0c1-9d38-4d76-881b-0679ea6167c4/Getz_IBM_CellLines_Exomes_6samples_04282018/C836/NA/KYO-1/v1/KYO-1.bam'}
listing files in gs
listing files in gs
listing files in gs5
listing files in gs
we found and removed 0 samples which did not match our id names: ['ACH-', 'CDS-']
found 103 likely replicate
listing files in gs
Len of samples before removal: 104
Dups from this workspace has len 100:
 ['gs://fc-ea03c0c1-9d38-4d76-881b-0679ea6167c4/Getz_IBM_DMX_CellLines_12samples_deepcov_01302018/RP-1561/Exome/HT144SKINFV2/v1/HT144SKINFV2.bam', 'gs://fc-ea03c0c1-9d38-4d76-881b-0679ea6167c4/Getz_IBM_CellLine_WES

found 0 matched normals
removed: 1 samples from size alone (too similar to a replicate)
removed: 0 samples with duplicat PDO ids 
removed: 51 samples that have not changed since last time (likely duplicate having been removed)


In [158]:
samples

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time,datatype,version
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CDS-H8AM79,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,CCLFPEDS0010T,ACH-001428,737573,PT-pTcRdaXV,Jkq3/A==,90248287750,PDO-21575,2020-11-19,wes,1


In [159]:
noarxspan

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


### finding back arxspan

In [119]:
noarxspan = retrieveFromCellLineName(noarxspan, ccle_refsamples, datatype='wes', depmappvlink = "https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE", extract=extract_to_change)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  def updateFromTracker(samples, ccle_refsamples, arxspan_id='arxspan_id', participant_id='participant_id', toupdate={"sex": [],


we found and removed 0 samples which did not match our id names: ['ACH', 'CDS']
found 42 likely replicate
listing files in gs
Len of samples before removal: 42
Dups from this workspace has len 0:
 []
Len of samples after removal: 42


In [120]:
#assess any potential issues
set(noarxspan.arxspan_id) & set(samples.arxspan_id)

set()

In [121]:
noarxspan.arxspan_id.tolist()

['ACH-000283',
 'ACH-000877',
 'ACH-000047',
 'ACH-000868',
 'ACH-000004',
 'ACH-000509',
 'ACH-000419',
 'ACH-000007',
 'ACH-000019',
 'ACH-000010',
 'ACH-000837',
 'ACH-000436',
 'ACH-000544',
 'ACH-001151',
 'ACH-000261',
 'ACH-000312',
 'ACH-000122',
 'ACH-000452',
 'ACH-001210',
 'ACH-000304',
 'ACH-000337',
 'ACH-000740',
 'ACH-000698',
 'ACH-001496',
 'ACH-001497',
 'ACH-001500',
 'ACH-001345',
 'ACH-000672',
 'ACH-000514',
 'ACH-000434',
 'ACH-001075',
 'ACH-000800',
 'ACH-000767',
 'ACH-001368',
 'ACH-000247',
 'ACH-001654',
 'ACH-000577',
 'ACH-000596',
 'ACH-000200',
 'ACH-000655',
 'ACH-001190',
 'ACH-001402']

In [122]:
samples = pd.concat([samples, noarxspan[noarxspan.arxspan_id!='0']], sort=False)

In [123]:
noarxspan = noarxspan[noarxspan.arxspan_id=='0']

In [160]:
samples = assessAllSamples(samples, ccle_refsamples, stype='wes', rename={}, extract={})

we had 0 duplicates in the release buckets


In [161]:
samples

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time,datatype,version
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CDS-H8AM79,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,CCLFPEDS0010T,ACH-001428,737573,PT-pTcRdaXV,Jkq3/A==,90248287750,PDO-21575,2020-11-19,wes,1


In [184]:
#TODO: manage the match normals in noarxspan samples

## getting the addtional data and writing it here in the right order 'as shown above'
- use the stripped_cell_line_name to find the samples on https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE/edit#gid=356471436. 
- Make sure that we don't have duplicate cell lines in there. Otherwise, use the duplicate renaming function
- copy Primary Site, Primary Disease, Subtype, Comments, Disease Sub-subtype, if they exist. (sometimes subtype and subsubtype are the same.. don't use subsubtype then.
- look for the cell line in cellosaurus, you might need to use one of the aliases given in master depmap pv..
- copy  cellosaurus_id gender age info or write 'U' if they don't exist. 'can be a number or {Embryonic, Children, Adult, Fetus, U} 
- check that it does not say this cell line is not a duplicate from another cell line
- check that if it says this cell line is derived/children/father/samepatient from other cell lines, and that if we have any of the other cell lines, that the patient id is changed to be the same one for all (be sure that you are updating everywhere these patient ids are used)

In [163]:
# If I have a previous samples I can update unknown data directly
samples, notfound = updateFromTracker(samples, ccle_refsamples)

In [164]:
noarxspan

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [165]:
noarxspan.sort_values(by = 'stripped_cell_line_name').to_csv('temp/noarxspan_wes_' + release + '.csv')
samples.loc[notfound].to_csv('temp/notfound_wes_'+release+'.csv')
samples.to_csv('temp/new_wes_'+release+'.csv')

In [166]:
samples.loc[notfound]

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time,...,age,primary_site,subtype,subsubtype,origin,parent_cell_line,matched_normal,comments,mediatype,condition
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CDS-H8AM79,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,gs://fc-9d2e10ea-1be3-4a23-a772-57854dbd1659/D...,CCLFPEDS0010T,ACH-001428,737573,PT-pTcRdaXV,Jkq3/A==,90248287750,PDO-21575,2020-11-19,...,,,,,,,,,,


In [167]:
toupdate = {
"primary_disease":['Bone Cancer'],
"cellosaurus_id":[''],
"age":['C'],
"primary_site":['bone'],
"subtype":["Ewings Sarcoma"],
"subsubtype":[""],
"origin":["bone"],
"parent_cell_line":[""],
"matched_normal":[""],
"comments":["(PCR) EWS/FLI"],
"sex":["U"],
"mediatype":["Adherent, CM2:051117: 100.0 %"],
"condition":[""],
'stripped_cell_line_name':['CCLFPEDS0010T'],
"participant_id":['PT-hrwIWtZC']
}
pd.DataFrame(toupdate)

Unnamed: 0,primary_disease,cellosaurus_id,age,primary_site,subtype,subsubtype,origin,parent_cell_line,matched_normal,comments,sex,mediatype,condition,stripped_cell_line_name,participant_id
0,Bone Cancer,,C,bone,Ewings Sarcoma,,bone,,,(PCR) EWS/FLI,U,"Adherent, CM2:051117: 100.0 %",,CCLFPEDS0010T,PT-hrwIWtZC


In [168]:
# updating..
for k, v in toupdate.items():
    samples.loc[notfound,k] =v

In [169]:
samples['baits'] = 'ice'

In [172]:
# uploading to our bucket (now a new function)
terra.changeToBucket(samples,'gs://cclebams/wes/', name_col= "index" , values=['internal_bam_filepath','internal_bai_filepath'], filetypes=['bam', 'bai'], catchdup=True, test=False)

Unnamed: 0_level_0,internal_bam_filepath,internal_bai_filepath,stripped_cell_line_name,arxspan_id,sequencing_date,participant_id,crc32c_hash,size,PDO,update_time,...,primary_site,subtype,subsubtype,origin,parent_cell_line,matched_normal,comments,mediatype,condition,baits
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CDS-H8AM79,gs://cclebams/wes/CDS-H8AM79.bam,gs://cclebams/wes/CDS-H8AM79.bai,CCLFPEDS0010T,ACH-001428,737573,PT-hrwIWtZC,Jkq3/A==,90248287750,PDO-21575,2020-11-19,...,bone,Ewings Sarcoma,,bone,,,(PCR) EWS/FLI,"Adherent, CM2:051117: 100.0 %",,ice


In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)


In [173]:
#setting the right version TODO: func
names=[]
subccle_refsamples = ccle_refsamples[ccle_refsamples['datatype'] == "wes"]
for k, val in samples.iterrows():
    val = val["arxspan_id"]
    names.append(val)
    samples.loc[k, 'version'] = len(subccle_refsamples[subccle_refsamples['arxspan_id'] == val]) + names.count(val)
samples['version'] = samples['version'].astype(int)

In [174]:
ccle_refsamples = ccle_refsamples.append(samples, sort=False)

In [178]:
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

In [182]:
pairs = setupPairsFromSamples(samples, subccle_refsamples, extract={'patient_id':'participant_id'})

found 0 matched normals


In [183]:
#uploading new samples to mut
refwm = refwm.disable_hound()
refwm.upload_samples(samples)
refwm.upload_entities('pairs', pairs)
refwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
sam = refwm.get_samples()

pair = refwm.get_pairs()
refwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
refwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
refwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
#creating a sample set
refwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
refwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
refwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
refwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])



Successfully imported 1 participants.
Successfully imported 1 samples.
Successfully imported 1 pairss.
Successfully imported 1 pair sets:
  * 21Q1 (1 pairs)
Pair set "all" (2243 pairs) successfully updated.
Pair set "all_agilent" (1392 pairs) successfully updated.
Pair set "all_ice" (850 pairs) successfully updated.
Successfully imported 1 sample sets:
  * 21Q1 (1 samples)
Sample set "all" (2243 samples) successfully updated.
Sample set "all_agilent" (1392 samples) successfully updated.
Sample set "all_ice" (850 samples) successfully updated.


In [184]:
#and CN
cnwm = dm.WorkspaceManager('broad-firecloud-ccle/DepMap_WES_CN_hg38')
cnwm = cnwm.disable_hound()
cnwm.upload_samples(samples)
cnwm.upload_entities('pairs', pairs)
cnwm.update_pair_set(pair_set_id=samplesetname,pair_ids=pairs.index)
sam = cnwm.get_samples()

pair = cnwm.get_pairs()
cnwm.update_pair_set(pair_set_id='all',pair_ids=pair.index)
cnwm.update_pair_set(pair_set_id='all_agilent',pair_ids=pair[pair["case_sample"].isin(sam[sam['baits']=="AGILENT"].index.tolist())].index)
cnwm.update_pair_set(pair_set_id='all_ice',pair_ids=pair[pair["case_sample"].isin([i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])].index)
#creating a sample set
cnwm.update_sample_set(sample_set_id=samplesetname, sample_ids=samples.index)
cnwm.update_sample_set(sample_set_id='all', sample_ids=[i for i in sam.index.tolist() if i!='nan'])
cnwm.update_sample_set(sample_set_id='all_agilent', sample_ids = sam[sam['baits'] == "AGILENT"].index.tolist())
cnwm.update_sample_set(sample_set_id='all_ice', sample_ids=[i for i in sam[(sam['baits'] == "ICE") |(sam['baits'].isna())].index.tolist() if i != 'nan'])

Successfully imported 1 participants.
Successfully imported 1 samples.
Successfully imported 1 pairss.
Successfully imported 1 pair sets:
  * 21Q1 (1 pairs)
Successfully imported 1 pair sets:
  * all (2235 pairs)
Pair set "all_agilent" (1392 pairs) successfully updated.
Successfully imported 1 pair sets:
  * all_ice (842 pairs)
Successfully imported 1 sample sets:
  * 21Q1 (1 samples)
Sample set "all" (2235 samples) successfully updated.
Sample set "all_agilent" (1392 samples) successfully updated.
Sample set "all_ice" (842 samples) successfully updated.


# run the pipeline

We are using Dalmatian to send request to Terra, we are running a set of 5 functions To generate the mutation dataset:

*   For new samples in DepMap, run the ICE version of this task. CCLE2 samples used Agilent targets, so this pipeline should be used instead. The pipelines are identical in terms of their outputs, but the proper targets, baits, and pseudo normal should be used based on how the samples were sequenced.

    **ICE_CGA_Production_Analysis_Pipeline_Cell_Lines_copy** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22) OR


    **AGILENT_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Anablysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22)

*   **common_variant_filter** (breardon/common_variant_filterSnapshot ID: 3)
*   **filterMAF_on_CGA_pipeline** (gkugener/filterMAF_on_CGA_pipelineSnapshot ID: 8)
*   **aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1)

This outputs to be downloaded will be saved in the sample set that was run. The output we use for the release is:


*   **passedCGA_filteredMAF_aggregated** 

There are several other tasks in this workspace. In brief:



*   **CGA_Production_Analysis_Pipeline_Cell_Lines** (lelagina/CGA_Production_Analysis_Pipeline_Cell_LinesSnapshot ID: 12). This task is the same as the ICE and AGILENT prefixed version above, except that it relied on pulling the baits and targets to use from the metadata stored for the samples. Having AGILENT and ICE versions specified made the uploading and running process easier.
*   **SANGER_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22). This task was trying to run the CGA pipeline on the Sanger WES data, using a Sanger pseudo normal. In its current implementation, this task fails to complete for the samples.
*   **UNFILTERED_aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1). Aggregates the MAF outputted by the CGA cell line pipeline prior to the common variant filter and germline filtering tasks. This can give us insight to which mutations are getting filtered out when. We may want to potentially include this MAF in the release so people can see why certain mutations of interest may be getting filtered out.
*   WES_DM_Mutation_Calling_Pipeline_(standard |expensive) (gkugener/WES_DM_Mutation_Calling_PipelineSnapshot ID: 2). This was a previous mutation calling pipeline implemented for CCLE. We do not use this pipeline any more as the CGA pipeline looks better.
*   aggregate_filterMAF_CGA (CCLE/aggregate_filterMAF_CGASnapshot ID: 1). An aggregation MAF task that we used in the past. We do not use this task anymore.
*   calculate_mutational_burden (breardon/calculate_mutational_burdenSnapshot ID: 21). This task can be used to calculate the mutational rate of the samples. We do not make use of this data in the release although it could be of interest.
*   summarizeWigFile (breardon/summarizeWigFileSnapshot ID: 5). CCLF ran this task (might be necessary for the mutational burden task). For our workflow, we do not run it.

### Cleaning workspaces

In [185]:
torm = terra.listHeavyFiles(refworkspace)
h.parrun(['gsutil rm '+i for i in torm], cores=8)
terra.removeFromFailedWorkflows(refworkspace, dryrun=False, everythingFor=['Realign_WES_GATK4','Generate_uBAM_File_List','BamToUnmappedRGBams_MC','CGA_WES_CCLE_ICE','CGA_WES_CCLE_AGILENT'])

KeyboardInterrupt: 

## On Terra

In [None]:
samplesetname

In [189]:
submission_id1 = refwm.create_submission("CGA_WES_CCLE_ICE", samplesetname, 'sample_set', expression='this.samples')

Successfully created submission 8475669c-0807-4d95-934a-24491045d788.


### Germline

In [192]:
submission_id2 = refwm.create_submission("cnn_variant_filter_ice", samplesetname, 'sample_set', expression='this.samples')

Successfully created submission 5beed30b-1e0b-4247-9378-f64a97606ef9.


### copy pairs data to sample data

In [193]:
terra.waitForSubmission(refworkspace, submission_id1)
pairs = refwm.get_pairs()

1.0 of jobs Succeeded in submission 0.


In [194]:
pairs = pairs[pairs.index.isin(tokeep)]
pairs = pairs[~pairs['mutation_validator_validated_maf'].isna()]
pairs = pairs.drop(columns=['case_sample','control_sample','participant_id'])
pairs.index = [i.split('_')[0] for i in pairs.index]

NameError: name 'tokeep' is not defined

In [None]:
refwm.update_sample_attributes(pairs)

continuing

In [195]:
submission_id1 = refwm.create_submission("common_variant_filter", samplesetname, 'sample_set', expression='this.samples')

Successfully created submission 29f2aee9-75d6-4a5e-ac0d-c1ff6c3c56a6.


In [196]:
#terra.waitForSubmission(refworkspace, submission_id2)
#submission_id2 = refwm.create_submission("aggregate_vcfs", "all")

CDS-H8AM79 Failed for 0 jobs in submission 0. 9 mn elapsed.
0.0 of jobs Succeeded in submission 0.


RuntimeError: 1 failed submission

In [197]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("filterMAF_on_CGA_pipeline", samplesetname,'sample_set',expression='this.samples')

1.0 of jobs Succeeded in submission 0.
Successfully created submission f6087a29-91d7-4d2a-b3cb-9f4b2fb67ca4.


### filtered

In [198]:
terra.waitForSubmission(refworkspace, submission_id1)
submission_id1 = refwm.create_submission("aggregateMAFs_selectFields_filtered", "all")

1.0 of jobs Succeeded in submission 0.sion 0. 5 mn elapsed.
Successfully created submission 833057d2-a472-4332-a8e2-7f1a52fa35d1.


### unfiltered

In [199]:
submission_id3 = refwm.create_submission("aggregateMAFs_selectFields_unfiltered", "all")

Successfully created submission e760ae71-6d9b-4d2a-aadc-e10b32a1ab73.


In [None]:
terra.waitForSubmission(refworkspace, [submission_id1,submission_id2, submission_id3])

### Save the workflow configurations used

In [201]:
terra.saveConfigs(refworkspace,'./data/'+samplesetname+'/Mutconfig')

  df = pd.io.json.json_normalize(r)


## On local


### Remove some datafile to save money¶

In [202]:
res = refwm.get_samples()
toremove = ["fixedmate_bam"]
for val in toremove:
    refwm.disable_hound().delete_entity_attributes('sample', res[val], delete_files=True)

KeyError: 'fixedmate_bam'

In [203]:
# sometimes it does not work; so better check again
a = res.fixedmate_bam
a = [i for i in a if i is not np.nan]
gcp.rmFiles(a)

AttributeError: 'DataFrame' object has no attribute 'fixedmate_bam'

### downloading from terra

In [205]:
sam = refwm.get_samples()

In [206]:
nowes = set(mutations.DepMap_ID)-set(sam.arxspan_id)
nowes

NameError: name 'mutations' is not defined

In [None]:
nothing = nows -set(ccle_refsamples.arxspan_id)
nothing

### get QC files

In [209]:
only = samples.index.tolist()

In [210]:
dataMut = getQC(workspace=refworkspace ,only=only, qcname=["gatk_cnv_all_plots", "lego_plotter_pngs", "copy_number_qc_report", "ffpe_OBF_figures", "mut_legos_html", "oxoG_OBF_figures", "tumor_bam_base_distribution_by_cycle_metrics", "tumor_bam_converted_oxog_metrics"])

In [211]:
dataBam = getQC(workspace=refworkspace ,only=only, qcname=[ "tumor_bam_alignment_summary_metrics", "tumor_bam_bait_bias_summary_metrics", "tumor_bam_gc_bias_summary_metrics", "tumor_bam_hybrid_selection_metrics", "tumor_bam_insert_size_histogram", "tumor_bam_insert_size_metrics", "tumor_bam_pre_adapter_summary_metrics", "tumor_bam_quality_by_cycle_metrics", "tumor_bam_quality_distribution_metrics", "tumor_bam_quality_yield_metrics"])

In [215]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [221]:
for k,v in dataMut.items():
    if k =='nan':
        continue
    curr = ccle_refsamples.loc[k,'processing_qc']
    if curr is np.nan or curr=="":
        curr=set(v)
    else:
        curr = set(curr[1:-1].replace("'","").split(', '))
        curr = set(v) | curr
    ccle_refsamples.loc[k,'processing_qc'] = str(list(curr))
for k,v in dataBam.items():
    if k =='nan':
        continue
    curr = ccle_refsamples.loc[k,'bam_qc']
    if curr is np.nan or curr=="":
        curr=set(v)
    else:
        curr = set(curr[1:-1].replace("'","").split(', '))
        curr = set(v) | curr
    ccle_refsamples.loc[k,'bam_qc'] = str(list(curr))


In [224]:
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

In [225]:
res = refwm.get_sample_sets().loc["all"]
res

filtered_CGA_MAF_aggregated      gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b7...
samples                          [CDS-00rz9N, CDS-01bI6z, CDS-02waxZ, CDS-04TUV...
merged_mutations                 gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b7...
merged_vcf                       gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b7...
unfiltered_CGA_MAF_aggregated    gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b7...
Name: all, dtype: object

### retrieving RNAseq vcfs

In [None]:
rnamutations = dm.WorkspaceManager(rnaworkspace).get_sample_sets().loc['All_samples']['merged_vcf']
! gsutil cp $rnamutations "temp/rna_mutation_unfiltered_terra_merged.vcf"

### retrieving germline mutations

In [8]:
snps = res['merged_vcf']
! gsutil cp $snps gs://cclebams/germline_data/wes.all.called.vcf
! gsutil -m acl ch -ru taiga-892@cds-logging.iam.gserviceaccount.com:R gs://cclebams/germline_data/wes.all.called.vcf

Copying gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b76b912/a7c71e70-dc82-4b95-92c5-3d98b7438837/aggregate_vcfs/434a1e9d-487e-41a7-a3ee-eaaea7233573/call-aggregate/all.called.vcf [Content-Type=text/vcard]...
/ [1 files][ 49.6 GiB/ 49.6 GiB]                                                
Operation completed over 1 objects/49.6 GiB.                                     
Updated ACL on gs://cclebams/germline_data/wes.all.called.vcf


### rertrievinng filtered mutations

In [226]:
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

Copying gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b76b912/833057d2-a472-4332-a8e2-7f1a52fa35d1/aggregateMAFs_selectFields_workflow/95316a15-b77d-439b-a5ee-5a25d9618b26/call-aggregateMAFs_selectFields/all.mergedMAF.txt...
| [1 files][296.0 MiB/296.0 MiB]                                                
Operation completed over 1 objects/296.0 MiB.                                    


In [417]:
mutations = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 

KeyboardInterrupt: 

In [None]:
mutations = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(mutations.columns[:10])
renaming = removeOlderVersions(names = set(mutations['Tumor_Sample_Barcode']), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")
print(len(mutations[mutations['Chromosome']=='0']))

# postprocessing


Here, rather than rerunning the entire analysis, because we know we are adding only WES samples, we can download the previous release's MAF, add the samples, update any annotations, and perform any global filters at the end.

First we need to do an additional step of filtering on coverage and number 

- readMutations
- createSNPs
- addToMainMutation
- filterAllelicFraction
- filterMinCoverage
- mergeAnnotations
- addAnnotation
- maf_add_variant_annotations
- mutation_maf_to_binary_matrix (x3)

In [None]:
mutations = mutations.rename(columns={"i_ExAC_AF":"ExAC_AF","Tumor_Sample_Barcode":'DepMap_ID',"Tumor_Seq_Allele2":"Tumor_Allele"}).drop(columns=['Center','Tumor_Seq_Allele1'])

In [None]:
mutations = annotate_likely_immortalized(mutations, TCGAlocs = ['TCGAhsCnt',
'COSMIChsCnt'], max_recurrence=0.05 ,min_tcga_true_cancer=5)

In [None]:
mutations['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in np.nan_to_num(mutations[['t_alt_count','t_ref_count']].values,0).astype(int)]

In [None]:
mutations = filterCoverage(mutations, loc=['CGA_WES_AC'], sep=':',cov=2)

In [None]:
mutations = filterAllelicFraction(mutations, loc=['CGA_WES_AC'], sep=':',frac=0.1)

In [241]:
#Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
#Count the total number of mutations observed by position

In [None]:
mutations = addAnnotation(mutations, NCBI_Build='37', Strand="+")

In [243]:
mutations.to_csv('temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv', index=False)

In [418]:
mutations= pd.read_csv('temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [300]:
mutations

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,...,tumor_f,isDeleterious,isTCGAhotspot,TCGAhsCnt,isCOSMIChotspot,COSMIChsCnt,ExAC_AF,PASS,is_likely_immortalization,CGA_WES_AC
0,PRDM2,7799,37,1,14108651,14108651,+,Missense_Mutation,SNP,C,...,0.492647,False,False,,False,0.0,,True,False,335:345
1,TMEM51,55092,37,1,15541888,15541888,+,Missense_Mutation,SNP,A,...,0.192308,False,False,,False,0.0,,True,False,5:21
2,EPHA2,1969,37,1,16464806,16464806,+,Missense_Mutation,SNP,G,...,0.304348,False,False,,False,0.0,0.000016,True,False,7:16
3,SPATA21,374955,37,1,16727249,16727249,+,Silent,SNP,G,...,0.104396,False,False,,False,0.0,,True,False,19:163
4,EPHB2,2048,37,1,23107922,23107922,+,Missense_Mutation,SNP,A,...,0.409091,False,False,,False,0.0,,True,False,18:26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1341259,AKAP1,8165,37,17,55184280,55184288,+,In_Frame_Del,DEL,CACCTGTGT,...,0.944882,False,False,,False,0.0,,True,False,120:7
1341260,EPS8L1,54869,37,19,55593836,55593836,+,Frame_Shift_Del,DEL,G,...,1.000000,True,False,,False,0.0,,True,False,11:0
1341261,RANGAP1,5905,37,22,41650387,41650388,+,In_Frame_Ins,INS,-,...,0.586207,False,False,,False,0.0,,True,False,17:12
1341262,BCOR,54880,37,X,39913178,39913179,+,Frame_Shift_Ins,INS,-,...,0.620690,True,False,,False,1.0,,True,False,18:11


In [419]:
# based on QC from the CN pipeline: can't be done in mut because not a quantitativ pipeline
%store -r wesfailed
%store -r wesdup
%store -r renaming
%store -r issues
#removing dups from replicates
mutations = mutations[~mutations.DepMap_ID.isin(set(wesdup[:,1])|set(wesfailed))]

In [420]:
renaming.update({"CDS-mys9Dm":"ACH-001955","CDS-Rl87Z1":"ACH-001956","CDS-TzQAjG":"ACH-001957"})

In [421]:
mutations = mutations[mutations.DepMap_ID.isin(renaming.keys())].replace(renaming)

In [422]:
mutations[mutations.DepMap_ID=='ACH-001995']

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,...,tumor_f,isDeleterious,isTCGAhotspot,TCGAhsCnt,isCOSMIChotspot,COSMIChsCnt,ExAC_AF,PASS,is_likely_immortalization,CGA_WES_AC


### Adding WGS's exonic mutation

In [None]:
wgsemutations = pd.read_csv('temp/wgs_somatic_mutations_'+samplesetname+'.csv')

In [306]:
#for now we keep WES if we have them
toadd = set(wgsemutations.DepMap_ID) - set(mutations.DepMap_ID)
toadd

{'ACH-000195',
 'ACH-001046',
 'ACH-001078',
 'ACH-001188',
 'ACH-001447',
 'ACH-001492',
 'ACH-001503',
 'ACH-001553',
 'ACH-001571',
 'ACH-001675',
 'ACH-001683',
 'ACH-001691',
 'ACH-001697',
 'ACH-001700',
 'ACH-001701',
 'ACH-001705',
 'ACH-001813',
 'ACH-001817',
 'ACH-001828',
 'ACH-001949',
 'ACH-001953',
 'ACH-001974',
 'ACH-001975',
 'ACH-001976',
 'ACH-001977',
 'ACH-001978',
 'ACH-001980',
 'ACH-001981',
 'ACH-001982',
 'ACH-001983',
 'ACH-001984',
 'ACH-001985',
 'ACH-001986',
 'ACH-001992',
 'ACH-001994',
 'ACH-002013',
 'ACH-002014',
 'ACH-002034',
 'ACH-002047',
 'ACH-002060',
 'ACH-002061',
 'ACH-002074',
 'ACH-002080',
 'ACH-002084',
 'ACH-002504',
 'ACH-002512',
 'ACH-002699',
 'ACH-002810',
 'ACH-002818',
 'ACH-002822',
 'ACH-002825',
 'ACH-002828',
 'ACH-002851'}

In [307]:
mutations = mutations.append(wgsemutations[wgsemutations.DepMap_ID.isin(toadd)])

In [357]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [408]:
normals = set(ccle_refsamples[ccle_refsamples.primary_disease=="normal"].arxspan_id)
wrong = set(['ACH-001189', 'ACH-002303', 'ACH-002315', 'ACH-002341', 'ACH-001011', 'ACH-001108', 'ACH-001187','ACH-003000', 'ACH-002875', 'ACH-002874'])

In [386]:
mutations = mutations[~mutations.DepMap_ID.isin(wrong)]

In [387]:
mutations = mutations[~mutations.DepMap_ID.isin(normals)]

In [354]:
mutations.to_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv', index=False)

In [385]:
mutations = pd.read_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [355]:
mafToMat(mutations[(mutations.isDeleterious) & (mutations['tumor_f']>0.25)]).T.to_csv('temp/wes_somatic_mutations_deleterious_matrix.csv')
mafToMat(mutations[~(mutations.isDeleterious | mutations.isCOSMIChotspot | mutations.isTCGAhotspot | mutations['Variant_Classification']=='Silent') & (mutations['tumor_f']>0.25)]).T.to_csv('temp/wes_somatic_mutations_other_matrix.csv')
mafToMat(mutations[(mutations.isCOSMIChotspot | mutations.isTCGAhotspot) &(mutations['tumor_f']>0.25)]).T.to_csv('temp/wes_somatic_mutations_hotspot_matrix.csv')

100%

In [356]:
mafToMat(mutations[(mutations.isDeleterious) & (mutations['tumor_f']>0.25)], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_deleterious_boolmatrix.csv')
mafToMat(mutations[~(mutations.isDeleterious | mutations.isCOSMIChotspot | mutations.isTCGAhotspot | mutations['Variant_Classification']=='Silent') & (mutations['tumor_f']>0.25)], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_other_boolmatrix.csv')
mafToMat(mutations[(mutations.isCOSMIChotspot | mutations.isTCGAhotspot) & (mutations['tumor_f']>0.25)], boolify=True).astype(int).T.to_csv('temp/wes_somatic_mutations_hotspot_boolmatrix.csv')

100%

In [388]:
legacy_hybridcapture = tc.get(name='mutations-da6a', file='legacy_hybridcapture_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_raindance = tc.get(name='mutations-da6a', file='legacy_raindance_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_rna = tc.get(name='mutations-da6a', file='legacy_rna_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_wes_sanger = tc.get(name='mutations-da6a', file='legacy_wes_sanger_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})
legacy_wgs_exoniconly = tc.get(name='mutations-da6a', file='legacy_wgs_exoniconly_somatic_mutations').drop(columns=['Unnamed: 0',"Tumor_Sample_Barcode"]).rename(columns={'Tumor_Seq_Allele1':'Tumor_Allele'})

solving issues with the legacy datasets

In [389]:
legacy_hybridcapture = legacy_hybridcapture[~legacy_hybridcapture.DepMap_ID.isin(['ACH-001189'])]

In [390]:
legacy_wes_sanger = legacy_wes_sanger[~legacy_wes_sanger.DepMap_ID.isin([
 #'ACH-001131',
 'ACH-001189',
 #'ACH-002217',
 #'ACH-002315',
 #'ACH-002341',
 #'ACH-002390',
 #'ACH-002391',
 #'ACH-002393',
 #'ACH-002394',
 #'ACH-002395',
 #'ACH-002396'
])]

In [391]:
legacy_hybridcapture.loc[legacy_hybridcapture[legacy_hybridcapture['Variant_Classification'].isna()].index,'Variant_Classification']='Missense_Mutation'

In [392]:
legacy_wgs_exoniconly.loc[legacy_wgs_exoniconly[legacy_wgs_exoniconly['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_wgs_exoniconly[legacy_wgs_exoniconly['Genome_Change'].isna()].iterrows()]

In [393]:
legacy_wes_sanger.loc[legacy_wes_sanger[legacy_wes_sanger['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_wes_sanger[legacy_wes_sanger['Genome_Change'].isna()].iterrows()]

In [394]:
legacy_raindance.loc[legacy_raindance[legacy_raindance['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_raindance[legacy_raindance['Genome_Change'].isna()].iterrows()]

In [395]:
legacy_hybridcapture = legacy_hybridcapture[~legacy_hybridcapture.DepMap_ID.isin(normals)]

In [396]:
legacy_raindance = legacy_raindance[~legacy_raindance.DepMap_ID.isin(normals)]

In [397]:
legacy_wes_sanger = legacy_wes_sanger[~legacy_wes_sanger.DepMap_ID.isin(normals)]

In [398]:
legacy_hybridcapture.loc[legacy_hybridcapture[legacy_hybridcapture['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_hybridcapture[legacy_hybridcapture['Genome_Change'].isna()].iterrows()]

In [399]:
legacy_rna.loc[legacy_rna[legacy_rna['Genome_Change'].isna()].index, 'Genome_Change'] = ['g.chr'+str(i.Chromosome)+":"+str(i.Start_position)+i.Reference_Allele+">"+i.Tumor_Allele for _, i in legacy_rna[legacy_rna['Genome_Change'].isna()].iterrows()]

In [400]:
todrop = []
legacy_rna['loci'] = legacy_rna['DepMap_ID']+"_"+legacy_rna['Chromosome']+"_"+legacy_rna['Start_position'].astype(str)
for val in h.dups(legacy_rna.loci):
    todrop.append(legacy_rna[legacy_rna.loci==val].index[0])
legacy_rna = legacy_rna.drop(todrop)

In [401]:
merged = mergeAnnotations(mutations, legacy_hybridcapture, useSecondForConflict=True, dry_run=False)
merged = mergeAnnotations(merged, legacy_raindance, useSecondForConflict=True, dry_run=False)
merged = mergeAnnotations(merged, legacy_wgs_exoniconly, useSecondForConflict=False, dry_run=False)
merged = mergeAnnotations(merged, legacy_wes_sanger, useSecondForConflict=False, dry_run=False)
merged = mergeAnnotations(merged, legacy_rna, useSecondForConflict=False, dry_run=False)

found 461 nonmatching mutations
found 3 nonmatching mutations
found 177 nonmatching mutations
found 248 nonmatching mutations
found 295 nonmatching mutations


In [402]:
mutation_groups={
"other conserving": ["5'Flank", "Intron", "IGR", "3'UTR", "5'UTR"],
"other non-conserving":["In_Frame_Del", "In_Frame_Ins", "Stop_Codon_Del", "Stop_Codon_Ins", "Missense_Mutation", "Nonstop_Mutation"],
'silent': ['Silent'],
"damaging":['De_novo_Start_OutOfFrame','Frame_Shift_Del','Frame_Shift_Ins', 'Splice_Site', 'Start_Codon_Del', 'Start_Codon_Ins', 'Start_Codon_SNP','Nonsense_Mutation']
}

In [403]:
rename = {}
for k,v in mutation_groups.items():
    for e in v:
        rename[e] = k
merged['Variant_annotation'] = [rename[i] for i in merged['Variant_Classification'].tolist()]

### Compare to previous release

I would run some checks here comparing the results to the previous releases MAF. Namely:

- Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
- Count the total number of mutations observed by position (group by chromosome, start position, end position and count the number of mutations)
- Look at specific differences between the two MAFs (join on DepMap_ID, Chromosome, Start position, End position, Variant_Type). I would do this for WES only

### check important mutations

In [None]:
# check MOLM13, MV411 cell lines- The well known mutation status of FLT3

In [None]:
# check TP53 mutation 

Are mutation consistent?

QC mutations, for a known dependency, check if it matches mutation of this gene. (if P53 is mutated, cannot have dependency on P53 or MDM2 MDM4/ inverse fir BRAF and KRAF to themselves)

### saving this version

In [None]:
h.compareDfs(merged, tc.get(name='depmap-a0ab', file='CCLE_mutations'))

In [404]:
merged.to_csv('temp/wes_somatic_mutations_withlegacy_'+samplesetname+'.csv', index=False)

In [None]:
merged = pd.read_csv('temp/wes_somatic_mutations_withlegacy_'+samplesetname+'.csv')

In [405]:
mafToMat(merged[merged.Variant_annotation=="damaging"], boolify=True).astype(int).T.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_damaging_'+samplesetname+".csv")
mafToMat(merged[merged.Variant_annotation=="other conserving"], boolify=True).astype(int).T.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othercons_'+samplesetname+".csv")
mafToMat(merged[merged.Variant_annotation=="other non-conserving"], boolify=True).astype(int).T.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons_'+samplesetname+".csv")
mafToMat(merged[(merged.isCOSMIChotspot | merged.isTCGAhotspot)], boolify=True).astype(int).T.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot_'+samplesetname+'.csv')

100%

### saving samples used for this release

should be the same as in CN otherwise need to do something more complex

In [347]:
#ccle_refsamples.loc[renaming.keys(),samplesetname]=1
#ccle_refsamples.loc[ccle_refsamples[ccle_refsamples.arxspan_id.isin(toadd) & ccle_refsamples.datatype=="wgs"].index,samplesetname]=1
#ccle_refsamples.to_csv('temp/newrefWES.csv')

## retrieving unfiltered mutations

In [135]:
unfiltered = res['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/wes_mutation_unfiltered_terra_merged.txt"

NameError: name 'res' is not defined

In [140]:
unfiltered = pd.read_csv('temp/wes_mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__",'.'], engine='c', dtype=str)

In [141]:
unfiltered['somatic'] = unfiltered['somatic'].replace('nan','False')
unfiltered['HGNC_Status'] = unfiltered['HGNC_Status'].replace('nan','Unapproved')
unfiltered['judgement'] = unfiltered['judgement'].replace('nan','REMOVE')
unfiltered = unfiltered.rename(columns={"i_ExAC_AF":"ExAC_AF","Tumor_Sample_Barcode":'DepMap_ID',"Tumor_Seq_Allele2":"Tumor_Allele"}).drop(columns=['Tumor_Seq_Allele1'])
unfiltered['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in np.nan_to_num(unfiltered[['t_alt_count','t_ref_count']].values.astype(float),0).astype(int)]

ValueError: cannot convert float NaN to integer

In [150]:
toremove = []
subunfilt = unfiltered.iloc[:10000]
for i, val in enumerate(unfiltered.columns):
    h.showcount(i,len(unfiltered.columns))
    if len(set(subunfilt[val])-set(['nan']))==1:
        if len(set(unfiltered[val])-set(['nan']))==1:
            toremove.append(val)
unfiltered = unfiltered.drop(columns=set(toremove))

100%

In [152]:
toint =  ["Start_position", "End_position"]
for val in toint:
    unfiltered[val]  = unfiltered[val].astype(int)

In [153]:
unfiltered.to_csv('temp/mutation_somatic_unfiltered_withreplicates.csv.gz', index=False)

In [154]:
unfiltered = unfiltered[unfiltered.DepMap_ID.isin(renaming.keys())]
unfiltered['DepMap_ID'] = unfiltered['DepMap_ID'].replace(renaming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [155]:
unfiltered.to_csv('temp/mutation_somatic_unfiltered_all.csv.gz', index=False)

In [382]:
del unfiltered

NameError: name 'unfiltered' is not defined

# uploading on taiga

## Saving to latest version

In [384]:
#!gunzip temp/wes_mutation_somatic_unfiltered_withreplicates.csv.gz
!gunzip temp/wes_mutation_somatic_unfiltered_all.csv.gz

gzip: temp/wes_mutation_somatic_unfiltered_all.csv.gz: No such file or directory


In [406]:
tc.update_dataset(dataset_permaname="mutations-latest-ed72",
                 upload_file_path_dict={
        'temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv': 'TableCSV',
        'temp/wgs_somatic_mutations_'+samplesetname+'.csv': 'TableCSV',
        'temp/wes_somatic_mutations_deleterious_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_other_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_hotspot_matrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_all_'+samplesetname+'.csv': 'TableCSV',
        'temp/wes_somatic_mutations_deleterious_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_other_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_somatic_mutations_hotspot_boolmatrix.csv': 'NumericMatrixCSV',
        'temp/wes_mutation_somatic_unfiltered_all.csv': 'TableCSV',
        #'temp/wes_mutation_somatic_unfiltered_withreplicates.csv': 'TableCSV',
        }, 
                 dataset_description="""
# Mutations

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE Mutatios data.
The data is most likely of a better quality that what is on other folder. It is however in beta version as not all changes have either been confirmed or accepted by the DepMap Ops and the DepMap Portal Team.

# Notations:

all: every cell lines we have

WES: all data comes from the WExomeS samples we posses

WGS: all data comes from the WGenomeS samples we posses

withreplicates: if we have two different sequencing from a sample, we kept both, see the depmap sample tracker for annotations [https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE](https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE). this dataset is more geared toward QC or in-depth analysis of a particular cell line.

merged: everything from both WGS and WES

latest: only the latest sequencing versions of the samples were kept

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
 """)

Uploading wes_somatic_mutations_withduplicates_21Q1...
hitting https://cds.team/taiga/api/datafile/3a8319a192204c9cad6da6c6899ca885
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Dow

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading 

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3

	 Done: wes_mutation_somatic_unfiltered_all p

'4a2eb597da5a4d5aa26ae6123faf02df'