# Preprocessing and loading data

In [None]:
from __future__ import print_function

import asyncio

import dalmatian as dm
import numpy as np
import pandas as pd
from bokeh.plotting import output_notebook
from gsheets import Sheets
from scipy.stats import pearsonr
from taigapy import TaigaClient

from depmapomics import loading, tracker
from depmapomics import mutations as omics_mut
from depmapomics import terra as myterra
from genepy import terra
from genepy import mutations as mut
from genepy.utils import helper as h

# Automatically reload imported modules before each cell runs, so edits to the
# project libraries are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Taiga client, used further down to fetch the previous release's datasets.
tc = TaigaClient()
# Send bokeh plots to the notebook output.
output_notebook()

## boot up

We instantiate all the parameters needed for this pipeline to run.

In [None]:
# ---- General parameters ----
# Gates the CCLE-specific cells below.
isCCLE = True
# Gates the Terra workspace cleanup cell.
doCleanup = True
# Release name; used for the Terra sample set and in every temp file name.
samplesetname = "21Q2"

# Cut-off date handed to the sample loader as `maxage`.
maxage = "2020-11-01"

# Mapping from one-letter codes to sample-type labels.
# NOTE(review): not referenced elsewhere in the visible notebook; presumably
# consumed by the loading helpers — confirm.
replace = dict(T="Tumor", N="Normal", m="Unknown", L="Unknown")

# Terra workflow names, each written as "namespace/method/snapshot".

WGSmethods = [
    "gatk/PreProcessingForVariantDiscovery_GATK4/8",
    "GP-TAG/Manta_SomaticSV/9",
    "gkugener/ArrayOfFilesToTxt/1",
    "vdauwera/BamToUnmappedRGBams/4",
    "gatk/CNV_Somatic_Pair_Workflow/9",
    "gkugener/Aggregate_CN_seg_files/2",
]

# NOTE(review): these entries look like RNA-seq workflows (STAR, RSEM, ...)
# despite the variable name — confirm this list is the intended one.
MutationWESmethods = [
    "broadinstitute_gtex/samtofastq_v1-0_BETA/6",
    "broadinstitute_gtex/star_v1-0_BETA/7",
    "broadinstitute_gtex/rsem_v1-0_BETA/6",
    "jkobject/rsem_aggregate_results/5",
    "jkobject/rnaseq-germline-snps-indels/7",
    "broadinstitute_gtex/rnaseqc2_v1-0_BETA/2",
    "gkugener/STAR_fusion/17",
    "jkobject/aggregate_vcfs/22",
    "gkugener/Aggregate_files_set/2",
]

# The WES copy-number workflows are the same six as the WGS ones; an
# independent copy is kept so the two lists can diverge later.
CNWESmethods = list(WGSmethods)

## version 102
# Ensembl archive BioMart endpoint (November 2020 archive).
ensemblserver = "http://nov2020.archive.ensembl.org/biomart" 

# NOTE(review): set to 'rna' although this notebook processes WGS/WES data — confirm.
datatype = 'rna'

# USER SPECIFIC

# Paths to the Google API client secret and token storage files used by gsheets.
my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"

## do the first steps of https://medium.com/craftsmenltd/from-csv-to-google-sheet-using-python-ef097cb014f9
# Credentials file passed as `creds` / `secret` to the tracker upload helpers below.
creds = '../.credentials.json'
# Google Sheets client used to read the sample tracker.
sheets = Sheets.from_files(my_id, mystorage_id)

## lines that have issues
# Sequencing ids (CDS-...) excluded when loading aggregated WES results
# (passed as part of `todrop` together with `deletedwes`).
# Fixes: a missing comma after 'CDS-yPSmxb' used to fuse it with the next id,
# and two ids were split across physical lines inside their string literal.
wrongwes = {
    'CDS-VnMBYD', 'CDS-TGTiB8', 'CDS-8Ut3sT', 'CDS-BNyMCM', 'CDS-dgxjAa', 'CDS-ZJh6UN',
    'CDS-Ip02tY', 'CDS-Rd4nMx', 'CDS-YSRYLi', 'CDS-0qPmaJ', 'CDS-0aJ4Yh', 'CDS-0lfqVz',
    'CDS-0pZb0j', 'CDS-1b1Hxk', 'CDS-1djAlo', 'CDS-1p2nnc', 'CDS-34hKv3', 'CDS-3EBt51',
    'CDS-3M6Pq9', 'CDS-3WygAj', 'CDS-49azaP', 'CDS-4sr6RL', 'CDS-5rD8XC', 'CDS-5wYxZS',
    'CDS-6Yy3Yj', 'CDS-6da3hu', 'CDS-6l3V79', 'CDS-7PFldq', 'CDS-9XPgHB', 'CDS-9qDPiX',
    'CDS-B0qAaq', 'CDS-CMenCH', 'CDS-CuJ0f8', 'CDS-Dkl8OF', 'CDS-Eq9UNX', 'CDS-FRxdcH',
    'CDS-HNytLD', 'CDS-Hj3xAa', 'CDS-IJnjkY', 'CDS-Ig6N9S', 'CDS-KQDgIV', 'CDS-KYkMDa',
    'CDS-KbbgMb', 'CDS-KgRznV', 'CDS-L0pDPl', 'CDS-M8xDMS', 'CDS-MLJbT2', 'CDS-MnF3x8',
    'CDS-OCkOqy', 'CDS-ODmXrP', 'CDS-OgPf0h', 'CDS-PYw8ID', 'CDS-PdUZxY', 'CDS-QHp4h4',
    'CDS-QU7ftt', 'CDS-QVhVDT', 'CDS-QXBhht', 'CDS-SJq3p4', 'CDS-Sp18uD', 'CDS-TpDBjm',
    'CDS-TyWjJs', 'CDS-UV1pVE', 'CDS-UnDaBI', 'CDS-UtrDTK', 'CDS-W80jkV', 'CDS-WedVJA',
    'CDS-WfjTcJ', 'CDS-X3c4UY', 'CDS-XQkXf4', 'CDS-XevQNc', 'CDS-YMIv9D', 'CDS-YYLKZ0',
    'CDS-agZcmk', 'CDS-bntBUl', 'CDS-cAEii6', 'CDS-cYWYp7', 'CDS-cyuMYb', 'CDS-d18Xie',
    'CDS-dpub1O', 'CDS-eUqT7L', 'CDS-eowEZF', 'CDS-fXMRF9', 'CDS-gIMBax', 'CDS-gRA4SM',
    'CDS-iEULQm', 'CDS-ihI7Dp', 'CDS-iqPqOr', 'CDS-jqOvtj', 'CDS-kxNZ5S', 'CDS-leGxSD',
    'CDS-nby0QM', 'CDS-no7ysz', 'CDS-o4dXGr', 'CDS-oHu1Ik', 'CDS-pXMN9C', 'CDS-picEuX',
    'CDS-qZsCuJ', 'CDS-rLRUbG', 'CDS-txTRwz', 'CDS-uQ8nnX', 'CDS-yPSmxb', 'CDS-0qPmaJ',
    'CDS-1PXzlf', 'CDS-1uWUTi', 'CDS-294bk6', 'CDS-2JxT1P', 'CDS-2LFZYm', 'CDS-2Q2Kia',
    'CDS-2hGt1N', 'CDS-2lAFkD', 'CDS-2xSJmZ', 'CDS-3DHwSX', 'CDS-3FueNQ', 'CDS-3VNhFC',
    'CDS-3jIdRa', 'CDS-3mvYnW', 'CDS-3pZIvU', 'CDS-49xzNU', 'CDS-4BrJr7', 'CDS-4S6juQ',
    'CDS-4ZOQQF', 'CDS-4l9BUT', 'CDS-5H2go6', 'CDS-5IcijG', 'CDS-5LNjjI', 'CDS-5PXB9Y',
    'CDS-5ViPeM', 'CDS-5bQzF2', 'CDS-5hbofu', 'CDS-6EyvRQ', 'CDS-6Fc0S5', 'CDS-6PZKz8',
    'CDS-6mq2Or', 'CDS-6xyqy9', 'CDS-75psAH', 'CDS-7JWzyA', 'CDS-7nEZFG', 'CDS-7rcFYn',
    'CDS-83LhEq', 'CDS-8aHSii', 'CDS-8mpXJa', 'CDS-8sQWae', 'CDS-8yHnJv', 'CDS-8z476r',
    'CDS-96DdrP', 'CDS-9JpX07', 'CDS-9M8GNS', 'CDS-9sg0Pm', 'CDS-9u5DMn', 'CDS-9zidMf',
    'CDS-AJMYsd', 'CDS-AOWMF3', 'CDS-AjRIMt', 'CDS-Awmxa5', 'CDS-BRxHbu', 'CDS-BnszE4',
    'CDS-Bojgi7', 'CDS-C3hSav', 'CDS-C7o0op', 'CDS-CRPZeK', 'CDS-CZstO2', 'CDS-D6mIfI',
    'CDS-DIckeT', 'CDS-DZMoWW', 'CDS-Eh7ost', 'CDS-Eo5oAR', 'CDS-EpURcL', 'CDS-EzZEgz',
    'CDS-Fz0HXE', 'CDS-G1sVsw', 'CDS-GINQfy', 'CDS-GnBdHN', 'CDS-H1oKTL', 'CDS-H4hPhD',
    'CDS-HEoDm7', 'CDS-HOVBCg', 'CDS-HjGCvC', 'CDS-HkZUmY', 'CDS-HoW111', 'CDS-Hv0i3y',
    'CDS-Hw6KuA', 'CDS-Hx6zuD', 'CDS-I7bMcd', 'CDS-I97Uzq', 'CDS-IGOgCK', 'CDS-Iu8c04',
    'CDS-IzeN7a', 'CDS-J3jfZW', 'CDS-J6kDsZ', 'CDS-JMfP1M', 'CDS-JvOeJK', 'CDS-K2tTmq',
    'CDS-Kswf83', 'CDS-LCfY0q', 'CDS-LNTGnh', 'CDS-LOW19e', 'CDS-LUm1Vn', 'CDS-LVeuLY',
    'CDS-LifesX', 'CDS-LnV7QY', 'CDS-M1sAGX', 'CDS-M8aV3P', 'CDS-MOOIHL', 'CDS-Md89va',
    'CDS-MhXQX3', 'CDS-N83rwD', 'CDS-NBnCDl', 'CDS-NPG23x', 'CDS-NXnWiI', 'CDS-NZsio7',
    'CDS-NjunRu', 'CDS-O1ShTQ', 'CDS-O8dfj7', 'CDS-OLgoE4', 'CDS-OWJaXi', 'CDS-OjLMVy',
    'CDS-OnIxUL', 'CDS-OxQgBw', 'CDS-P79y6z', 'CDS-PHI8VT', 'CDS-PYWxsh', 'CDS-Pkk9e2',
    'CDS-Pku96X', 'CDS-PyELSk', 'CDS-QE7bdY', 'CDS-Qbfoau', 'CDS-Ql8GJZ', 'CDS-QtTdY6',
    'CDS-QxeMJW', 'CDS-R3txwY', 'CDS-R6ehaT', 'CDS-RFBAY6', 'CDS-RWYJ02', 'CDS-RnsUHX',
    'CDS-RxQhcq', 'CDS-SO3AhH', 'CDS-SvzhGj', 'CDS-T10Uph', 'CDS-TCqSJW', 'CDS-TDblpN',
    'CDS-TSDUCK', 'CDS-Twv1kD', 'CDS-Ty3mgt', 'CDS-UL1jLm', 'CDS-UVxUrF', 'CDS-UfC2Dz',
    'CDS-Uru0Mh', 'CDS-UvBswk', 'CDS-UxKEaK', 'CDS-V2ZEuP', 'CDS-V6Kk5q', 'CDS-VBr00g',
    'CDS-VCuHjJ', 'CDS-WAPQGk', 'CDS-WHZolj', 'CDS-WP95Oi', 'CDS-Ww1LC7', 'CDS-XJDBDj',
    'CDS-Xgu4mi', 'CDS-XqaEOX', 'CDS-Y27yfi', 'CDS-YYd4ww', 'CDS-YnodyM', 'CDS-ZGlgTf',
    'CDS-ZMsoXe', 'CDS-aDUHcI', 'CDS-aGMcvr', 'CDS-aXqwpM', 'CDS-allHxr', 'CDS-awunD8',
    'CDS-b9sdh9', 'CDS-bPT1F0', 'CDS-bdb5iE', 'CDS-bons31', 'CDS-c2Sowd', 'CDS-cBOy2Z',
    'CDS-cKMeDY', 'CDS-cMvnjL', 'CDS-ck9vpG', 'CDS-cmV75B', 'CDS-ctVpqU', 'CDS-dJqQ4g',
    'CDS-dNVjOc', 'CDS-dPlJzz', 'CDS-dWHWU3', 'CDS-eGQYXr', 'CDS-eZg4P8', 'CDS-fLsYaB',
    'CDS-fRpNQH', 'CDS-frzvLf', 'CDS-fs8moU', 'CDS-g0KUGN', 'CDS-gCSYjV', 'CDS-gKIdjs',
    'CDS-gsqqAz', 'CDS-h4mOdz', 'CDS-hOI086', 'CDS-iKXYuH', 'CDS-iRstNJ', 'CDS-iX8vqU',
    'CDS-ik526H', 'CDS-jHqXGP', 'CDS-kAARUi', 'CDS-kFiHZk', 'CDS-kt2Gne', 'CDS-ktRRkc',
    'CDS-l1OClV', 'CDS-lSpYo6', 'CDS-lTogDX', 'CDS-ldrQm3', 'CDS-leyYAD', 'CDS-loy9vi',
    'CDS-m49nRz', 'CDS-mGHY2S', 'CDS-mazUYU', 'CDS-mtMTts', 'CDS-n7Fqfe', 'CDS-nOKbmw',
    'CDS-nTW67d', 'CDS-nYIBWR', 'CDS-ocw0rP', 'CDS-ogUnWk', 'CDS-ohjYlg', 'CDS-opnGD7',
    'CDS-qIc5x3', 'CDS-qP2MBQ', 'CDS-qUtkjN', 'CDS-qaOoHQ', 'CDS-qeIIoY', 'CDS-qv2bpJ',
    'CDS-r5Ym7C', 'CDS-rLadW7', 'CDS-rQIdNN', 'CDS-rQMY3G', 'CDS-rUs3FP', 'CDS-rVAuin',
    'CDS-ragHOy', 'CDS-s7pOQR', 'CDS-sCWLGL', 'CDS-sbwn0P', 'CDS-sieIuO', 'CDS-soTPPi',
    'CDS-tORJC8', 'CDS-tPR3fn', 'CDS-tYXity', 'CDS-tgnRyK', 'CDS-u1AlUI', 'CDS-uGZguG',
    'CDS-w7i5l7', 'CDS-w8wJvh', 'CDS-wSV3OM', 'CDS-wWwBMZ', 'CDS-wbPtTZ', 'CDS-wlTAAF',
    'CDS-wpXVQk', 'CDS-x21VqU', 'CDS-x7srFK', 'CDS-xCyamv', 'CDS-xI8ZAZ', 'CDS-xIv1KJ',
    'CDS-xKNh7Q', 'CDS-yCSYHi', 'CDS-ycD9px', 'CDS-ydPJEM', 'CDS-z8Bvmk', 'CDS-ziEOXJ',
    'CDS-zwAn7G',
}

# TODO: rerun aggregate wes with new All (without those bams)
# Sequencing ids whose data were deleted; dropped together with `wrongwes`.
deletedwes = {
    'CDS-phR2eo', 'CDS-5x4qLj', 'CDS-9TDRpv', 'CDS-NUlX3d', 'CDS-yu1s5X', 'CDS-jSUD4f',
    'CDS-2jBQ8n', 'CDS-up4Vo5', 'CDS-BNyMCM', 'CDS-vrqu12', 'CDS-c51IFr', 'CDS-VS9XDY',
    'CDS-363TYH', 'CDS-k9Qfva', 'CDS-RLVrVE', 'CDS-gLz8Kz', 'CDS-gJupgp', 'CDS-lIXOWR',
    'CDS-pBashm', 'CDS-b5ElTm', 'CDS-6i6dRP', 'CDS-g2J7MD', 'CDS-0qPmaJ', 'CDS-59uKc2',
    'CDS-ljFuDX', 'CDS-GuKG2u', 'CDS-oRM8DN', 'CDS-IQuj9W', 'CDS-bb2V33', 'CDS-pIy1EQ',
    'CDS-6YFjST', 'CDS-T5BcdG', 'CDS-2HO10g',
}

# Cell lines (arxspan ids) we know have issues.
wrongwes_arxspan = {
    'ACH-001189', 'ACH-002303', 'ACH-002315', 'ACH-002341', 'ACH-001011',
    'ACH-001108', 'ACH-001187', 'ACH-002875', 'ACH-002874',
    # chordoma lines
    "ACH-001955",
    "ACH-001956",
    "ACH-001957",
}

# Samples that failed QC but that we still want to keep for now
# (sequencing id -> arxspan id).
wes_toprefer = {
    "CDS-Ckptje": "ACH-001672",
    "CDS-pgDmZb": "ACH-002291",
    "CDS-mys9Dm": "ACH-001955",
    "CDS-Rl87Z1": "ACH-001956",
    "CDS-TzQAjG": "ACH-001957",
}

# WES sequencing ids that failed QC; recomputed later by the copy-number QC step.
wesfailed = [
    'CDS-mys9Dm', 'CDS-8GqFo5', 'CDS-Rl87Z1', 'CDS-H8AM79',
    'CDS-C2RlCj', 'CDS-Qcyabl', 'CDS-TzQAjG',
]

# WGS sequencing ids that failed QC; recomputed later by the copy-number QC step.
wgsfailed = [
    'CDS-mYOC4j',
    'CDS-Wszh2o',
    'CDS-3PGQ84',
]

tokeep_specific_hybrid_capture = [
    "ACH-001187",
    "ACH-001011",
    "ACH-001108",
]

# Bucket to save in.
# NOTE(review): points at the rna folder although this notebook handles WGS/WES — confirm.
bucket = "gs://cclebams/rna/"

# ---- CCLE specific ----

# Current GP (Genomics Platform) Terra workspaces that deliver new WGS data ...
workspace1 = "terra-broad-cancer-prod/DepMap_WGS"
workspace2 = "terra-broad-cancer-prod/Getz_IBM_CellLines_WGS"

# ... and their corresponding sample sources.
source1 = "ccle"
source2 = "ibm"

# Our working (reference) workspaces.
refworkspace = "broad-firecloud-ccle/DepMap_WGS_CN"
wescnworkspace = "broad-firecloud-ccle/DepMap_WES_CN_hg38"
wesmutworkspace = "broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"

# Google spreadsheets holding cell-line info/metadata.
refsheet_url = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
privacy_release_url = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"
depmap_pv = "https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE"
depmap_taiga = "arxspan-cell-line-export-f808"

sampletrackername = "ccle sample tracker"

# Values we need to rename from the GP workspaces.
extract_to_change = {"from_arxspan_id": "participant"}

# Things to match to from the GP workspaces.
match = ["ACH-", "CDS-"]

In [None]:
# Read the first tab of the reference spreadsheet into a DataFrame indexed by its first column.
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

## Adding new data

We are looking for new samples in a range of workspaces.

They are quite messy and might contain duplicates, broken file paths, etc.

- We are thus looking at the bam files one by one and comparing them with our own bams. 
- We remove broken files, duplicates and add new version of a cell line's bam if we find some.

In [None]:
if isCCLE:
    print("loading new WGS data")
    # Workspaces and sources come from the parameter cell so the two lists
    # cannot drift apart (the sources used to be re-typed here as literals).
    # NOTE(review): `toraise` is not defined anywhere in the visible notebook —
    # it must be set before this cell can run.
    samples = loading.loadWGS(
        samplesetname,
        workspaces=[workspace1, workspace2],
        sources=[source1, source2],
        maxage=maxage,
        baits='genome',
        stype="wgs",
        toraise=toraise,
    )

In [None]:
if isCCLE:
    print("uploading samples to the tracker and Terra")
    # `refworkspace` used to be passed positionally after keyword arguments,
    # which is a SyntaxError; it is now passed by keyword.
    # NOTE(review): assumes the parameter is named `refworkspace` — confirm
    # against loading.update's signature.
    loading.update(
        samples,
        samplesetname,
        stype="wgs",
        bucket="gs://cclebams/wgs/",
        refworkspace=refworkspace,
        name_col="index",
        values=['legacy_bam_filepath', 'legacy_bai_filepath'],
        filetypes=['bam', 'bai'],
        my_id=my_id,
        mystorage_id=mystorage_id,
        creds=creds,
        sampletrackername=sampletrackername,
        refsheet_url=refsheet_url,
    )

# run the pipeline

We use Dalmatian to send requests to Terra. We run a set of 5 tasks to generate the mutation dataset:

*   For new samples in DepMap, run the ICE version of this task. CCLE2 samples used Agilent targets, so this pipeline should be used instead. The pipelines are identical in terms of their outputs, but the proper targets, baits, and pseudo normal should be used based on how the samples were sequenced.

    **ICE_CGA_Production_Analysis_Pipeline_Cell_Lines_copy** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22) OR


    **AGILENT_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22)

*   **common_variant_filter** (breardon/common_variant_filterSnapshot ID: 3)
*   **filterMAF_on_CGA_pipeline** (gkugener/filterMAF_on_CGA_pipelineSnapshot ID: 8)
*   **aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1)

The outputs to be downloaded will be saved in the sample set that was run. The output we use for the release is:


*   **passedCGA_filteredMAF_aggregated** 

There are several other tasks in this workspace. In brief:



*   **CGA_Production_Analysis_Pipeline_Cell_Lines** (lelagina/CGA_Production_Analysis_Pipeline_Cell_LinesSnapshot ID: 12). This task is the same as the ICE and AGILENT prefixed version above, except that it relied on pulling the baits and targets to use from the metadata stored for the samples. Having AGILENT and ICE versions specified made the uploading and running process easier.
*   **SANGER_CGA_Production_Analysis_Pipeline_Cell_Lines** (cclf/CGA_Production_Analysis_Pipeline_Cell_Lines_debuggingSnapshot ID: 22). This task was trying to run the CGA pipeline on the Sanger WES data, using a Sanger pseudo normal. In its current implementation, this task fails to complete for the samples.
*   **UNFILTERED_aggregateMAFs_selectFields** (ccle_mg/aggregateMAFs_selectFieldsSnapshot ID: 1). Aggregates the MAF outputted by the CGA cell line pipeline prior to the common variant filter and germline filtering tasks. This can give us insight to which mutations are getting filtered out when. We may want to potentially include this MAF in the release so people can see why certain mutations of interest may be getting filtered out.
*   WES_DM_Mutation_Calling_Pipeline_(standard |expensive) (gkugener/WES_DM_Mutation_Calling_PipelineSnapshot ID: 2). This was a previous mutation calling pipeline implemented for CCLE. We do not use this pipeline any more as the CGA pipeline looks better.
*   aggregate_filterMAF_CGA (CCLE/aggregate_filterMAF_CGASnapshot ID: 1). An aggregation MAF task that we used in the past. We do not use this task anymore.
*   calculate_mutational_burden (breardon/calculate_mutational_burdenSnapshot ID: 21). This task can be used to calculate the mutational rate of the samples. We do not make use of this data in the release although it could be of interest.
*   summarizeWigFile (breardon/summarizeWigFileSnapshot ID: 5). CCLF ran this task (might be necessary for the mutational burden task). For our workflow, we do not run it.

### cleaning workspaces

In [None]:
if doCleanup:
    print("cleaning workspaces")
    # List the heavy intermediate files in the reference workspace, then delete them in parallel.
    torm = asyncio.run(terra.deleteHeavyFiles(refworkspace))
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    # The call used to end with a bare `everythingFor` after a keyword argument:
    # a SyntaxError and an undefined name. It is dropped here.
    # NOTE(review): if specific workflows were meant to be targeted, pass them
    # explicitly as `everythingFor=[...]`.
    terra.removeFromFailedWorkflows(refworkspace, dryrun=False)

## On Terra

In [None]:
# TODO: update with latest workspace parameters from our repo

In [None]:
print("running Terra pipeline")
# `refwm` was used without ever being created in this notebook; build the
# Dalmatian workspace manager for the reference workspace here.
# NOTE(review): assumes refwm is meant to wrap `refworkspace` — confirm.
refwm = dm.WorkspaceManager(refworkspace)
# Run the per-sample WGS pipeline on every sample of the release's sample set.
submission_id = refwm.create_submission("WGS_pipeline", samplesetname,'sample_set',expression='this.samples')
# Block until the Terra submission has finished.
asyncio.run(terra.waitForSubmission(refworkspace, submission_id))

In [None]:
# Submit the "WGS_aggregate" configuration on the 'all' entity, then wait for the submission.
submission_id = refwm.create_submission("WGS_aggregate", 'all')
asyncio.run(terra.waitForSubmission(refworkspace, submission_id))

### Save the workflow configurations used

In [None]:
# Save the reference workspace under data/<release>/WGSconfig/.
terra.saveWorkspace(refworkspace,'data/'+samplesetname+'/WGSconfig/')

## On local


### Remove some data files to save money

In [None]:
# Path component substituted into the bucket URL below; empty as committed.
# NOTE(review): presumably has to be set to a submission/workflow folder before running — confirm.
val = ""
# Delete the *.cleaned.bam outputs of call-tumorMM_Task under that path (parallel rm).
! gsutil -m rm gs://fc-secure-012d088c-f039-4d36-bde5-ee9b1b76b912/$val/**/call-tumorMM_Task/*.cleaned.bam

In [None]:
# sometimes it does not work so better check again
# Sample attributes whose values (and files) should be removed.
toremove = ["readgroup_ubams",]

# NOTE(review): the loop variable `val` is unused — the whole `toremove` list is
# passed on every iteration. Confirm whether `val` was intended here.
for val in toremove:
    refwm.disable_hound().delete_entity_attributes('sample', toremove)
    
# Re-read the sample table and collect whatever is still listed under those attributes.
a = refwm.get_samples()
e = []
# NOTE(review): each `i` is a row (a list) from `.values.tolist()`, so the
# identity test against np.nan never filters anything — confirm intent.
for i in a[toremove].values.tolist():
    if i is not np.nan:
        e.extend(i)
# Remove the collected files.
gcp.rmFiles(e)

In [None]:
### move HG38 to our bucket
# Columns whose file locations will be moved.
onlycol = ['internal_bam_filepath', 'internal_bai_filepath']
# Names of the samples that belong to this release's sample set in the reference workspace.
samplesinset= [i['entityName'] for i in refwm.get_entities('sample_set').loc[samplesetname].samples]
# Display the count and the names for a visual check.
len(samplesinset), samplesinset

In [None]:
# Destination bucket folder for the hg38 WGS bams.
wgs_newgs = 'gs://cclebams/wgs_hg38/'
# Relocate the `onlycol` files of the release's samples (dry_run=False); the first
# returned value is kept as `wgs_res` and used below to update the tracker.
wgs_res, _ = terra.changeGSlocation(refworkspace, newgs=wgs_newgs, onlycol=onlycol, entity='sample', keeppath=False, dry_run = False, onlysamples=samplesinset)

In [None]:
# Re-read the sample tracker so the updates below start from its current content.
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [None]:
# Rows of the tracker that correspond to the relocated WGS samples.
wgs_rows = wgs_res.index.tolist()
wgs_bams = wgs_res['internal_bam_filepath'].tolist()

# The first two assignments used chained indexing (`.loc[rows][[cols]] = ...`),
# which writes into a temporary copy and leaves the tracker unchanged.
# A single `.loc[rows, cols]` assigns in place.

# Keep the previous size/hash as the "legacy" values before overwriting them.
ccle_refsamples.loc[wgs_rows, ['legacy_size', 'legacy_crc32c_hash']] = ccle_refsamples.loc[wgs_rows, ['size', 'crc32c_hash']].values

# Point the tracker at the relocated files.
ccle_refsamples.loc[wgs_rows, ['internal_bam_filepath', 'internal_bai_filepath']] = wgs_res[['internal_bam_filepath', 'internal_bai_filepath']].values

# Refresh size and hashes from the new bam files.
ccle_refsamples.loc[wgs_rows, 'size'] = [gcp.extractSize(i)[1] for i in gcp.lsFiles(wgs_bams, '-l')]

ccle_refsamples.loc[wgs_rows, 'crc32c_hash'] = [gcp.extractHash(i) for i in gcp.lsFiles(wgs_bams, '-L')]

ccle_refsamples.loc[wgs_rows, 'md5_hash'] = gcp.catFiles(refwm.get_samples().loc[samplesinset, 'analysis_ready_bam_md5'].tolist(), cut=32)

### get QC files

In [None]:
# Collect the mutation-pipeline QC outputs listed in `qcname` for the release's samples.
# NOTE(review): `getQC` and `cgaworkspace` are not defined or imported in the visible notebook — confirm where they come from.
dataMut = getQC(workspace=cgaworkspace ,only=samplesinset, qcname=["gatk_cnv_all_plots", "lego_plotter_pngs", "copy_number_qc_report", "ffpe_OBF_figures", "mut_legos_html", "oxoG_OBF_figures", "tumor_bam_base_distribution_by_cycle_metrics", "tumor_bam_converted_oxog_metrics"])

In [None]:
# Collect the bam-level QC metrics listed in `qcname` for the release's samples.
# NOTE(review): `getQC` and `cgaworkspace` are not defined or imported in the visible notebook — confirm where they come from.
dataBamCGA = getQC(workspace=cgaworkspace ,only=samplesinset, qcname=["tumor_bam_alignment_summary_metrics", "tumor_bam_bait_bias_summary_metrics", "tumor_bam_gc_bias_summary_metrics", "tumor_bam_hybrid_selection_metrics", "tumor_bam_insert_size_histogram", "tumor_bam_insert_size_metrics", "tumor_bam_pre_adapter_summary_metrics", "tumor_bam_quality_by_cycle_metrics", "tumor_bam_quality_distribution_metrics", "tumor_bam_quality_yield_metrics"])

In [None]:
# Prepend the newly collected QC entries to what the tracker already holds:
# mutation QC goes to 'processing_qc', bam QC to 'bam_qc'.
for qc_data, column in ((dataMut, 'processing_qc'), (dataBamCGA, 'bam_qc')):
    for k, v in qc_data.items():
        if k == 'nan':
            continue
        a = ccle_refsamples.loc[k, column]
        # `a is np.nan` only matches the np.nan singleton; a missing cell read
        # back from a DataFrame can be a different NaN object, which would
        # then crash the string concatenation. pd.isna covers every NaN/None.
        a = '' if pd.isna(a) else a
        ccle_refsamples.loc[k, column] = str(v) + ',' + a

In [None]:
# Collect the GATK preprocessing QC outputs from the reference workspace for the release's samples.
# NOTE(review): `getQC` is not defined or imported in the visible notebook — confirm where it comes from.
dataBamGATK = getQC(workspace=refworkspace, only=samplesinset, qcname=["duplication_metrics", "bqsr_report"])

In [None]:
# Collect the copy-number QC outputs from the reference workspace for the release's samples.
# NOTE(review): `getQC` is not defined or imported in the visible notebook — confirm where it comes from.
dataCN = getQC(workspace=refworkspace,only=samplesinset, qcname=["allelic_counts_tumor","delta_MAD_tumor","denoised_MAD_tumor","scaled_delta_MAD_tumor","denoised_copy_ratios_lim_4_plot_tumor","denoised_copy_ratios_plot_tumor","modeled_segments_plot_tumor"])

In [None]:
# Prepend the newly collected QC entries to what the tracker already holds:
# copy-number QC goes to 'processing_qc', GATK bam QC to 'bam_qc'.
for qc_data, column in ((dataCN, 'processing_qc'), (dataBamGATK, 'bam_qc')):
    for k, v in qc_data.items():
        if k == 'nan':
            continue
        a = ccle_refsamples.loc[k, column]
        # `a is np.nan` only matches the np.nan singleton; a missing cell read
        # back from a DataFrame can be a different NaN object, which would
        # then crash the string concatenation. pd.isna covers every NaN/None.
        a = '' if pd.isna(a) else a
        ccle_refsamples.loc[k, column] = str(v) + ',' + a

In [None]:
# Write the updated tracker back to the 'ccle sample tracker' spreadsheet.
# NOTE(review): `dfToSheet` is not imported in the visible notebook, and the sheet name duplicates `sampletrackername` — confirm.
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

# Post processing

## postprocessing Copy Number

In [None]:
# Build the gene-name mappings from the Ensembl BioMart server set in the parameter cell.
# The argument used to reference `ensemble_server`, which is never defined; the
# parameter cell names it `ensemblserver`.
# NOTE(review): `utils` is not imported in the visible notebook — confirm its origin.
gene_rename, protcod_rename, ensembltohgnc = utils.generateGeneNames(ensemble_server=ensemblserver)

### loading WES

In [None]:
print('loading WES from Terra')
# `wesrefwm` was used without ever being created in this notebook; build the
# Dalmatian workspace manager for the WES copy-number workspace here.
# NOTE(review): assumes wesrefwm is meant to wrap `wescnworkspace` — confirm.
# NOTE(review): `pp_cn` is not imported in the visible notebook — confirm its origin.
wesrefwm = dm.WorkspaceManager(wescnworkspace)
# Load the aggregated segments, dropping sequencing ids known to be wrong or deleted.
segments = pp_cn.loadFromGATKAggregation(wesrefwm, sampleset="all", sortby=["DepMap_ID", 'Chromosome', "Start", "End"], todrop = wrongwes|deletedwes)

In [None]:
print('making gene level copy number')
genecn = mut.toGeneMatrix(mut.manageGapsInSegments(segments), gene_mapping)

# validation step
print('summary of the gene cn data:')
print(genecn.values.min(), genecn.values.mean(), genecn.values.max())
mut.checkGeneChangeAccrossAll(genecn, thresh=0.025)
wesfailed = mut.checkAmountOfSegments(segments,thresh = 2000)
print("failed our QC")
print(wesfailed)
%store wesfailed

segments = segments[~segments.DepMap_ID.isin(set(wesfailed)-set(wes_toprefer.keys()))].reset_index(drop=True)
genecn = genecn[~genecn.index.isin(set(wesfailed)-set(wes_toprefer.keys()))]

#resetting the source
for v in set(segments.DepMap_ID):
    segments.loc[segments[segments.DepMap_ID==v].index,'Source']= ccle_refsamples[ccle_refsamples.index==v].source.values[0]
segments.Source = segments.Source.replace({'CCLF':'Broad WES', 'CHORDOMA':'Chordoma WES', 'SANGER':'Sanger WES', 'IBM':'Broad WES', np.nan:'Broad WES', 'DEPMAP':'Broad WES', 'IBM WES': "Broad WES", 'Broad CCLF':"Broad WES"})

#saving
print('saving files')
segments.to_csv('temp/segments_allWES_withreplicates_'+samplesetname+'.csv', index=False)
genecn.to_csv('temp/gene_cn_allWES_withreplicates_'+samplesetname+".csv")

In [None]:
# Reload the replicate-level WES tables saved by the previous cell (lets the notebook resume from here).
segments = pd.read_csv('temp/segments_allWES_withreplicates_'+samplesetname+'.csv')
genecn = pd.read_csv('temp/gene_cn_allWES_withreplicates_'+samplesetname+".csv", index_col=0)

In [None]:
if isCCLE:
    # Select one sequencing id per arxspan id (latest version).
    renaming = tracker.removeOlderVersions(names=set(segments.DepMap_ID.tolist()), refsamples=ccle_refsamples[ccle_refsamples.datatype=="wes"], arxspan_id="arxspan_id", version="version")

    # Repair QC: when the selected sample failed QC, swap it for another
    # sample of the same cell line that did not fail.
    ref=pd.DataFrame(ccle_refsamples[ccle_refsamples.datatype=="wes"]['arxspan_id'])
    # This dict used to be called `replace`, which overwrote the `replace`
    # parameter dict defined in the parameter cell.
    qc_swaps = {}
    for val in wesfailed:
        if val in renaming:
            a = ref[ref.arxspan_id==ref.loc[val].arxspan_id].index
            for v in a:
                if v not in wesfailed:
                    qc_swaps[val] = v
                    break
    print(len(qc_swaps), len(wesfailed))
    # Move the arxspan mapping from the failed id to its replacement.
    for k, val in qc_swaps.items():
        renaming[val] = renaming.pop(k)

    print("failed: ", wesfailed)
    # Force-include the samples we decided to keep despite failing QC.
    renaming.update(wes_toprefer)

    # Finally drop every sample that maps to a cell line known to be wrong.
    torm=[]
    for k, val in renaming.items():
        if val in wrongwes_arxspan:
            torm.append(k)
    print('wrong wes:', torm)
    for v in torm:
        renaming.pop(v)

### QCing and saving replicate level data

In [None]:
# Previous release's gene-level copy number, transformed with 2**x - 1
# (presumably undoing a log2(x+1) storage transform — confirm).
prevgenecn = (2**tc.get(name='depmap-a0ab', file='CCLE_gene_cn'))-1 
# Previous release's segment table, plus the list of lines it contained.
prevsegments = tc.get(name='depmap-a0ab', file='CCLE_segment_cn')
prev = prevgenecn.index.tolist()

In [None]:
# Ad-hoc inspection: stack the previous release's row for ACH-002291 with two current sequencing ids.
pd.concat([prevgenecn.loc[["ACH-002291"]], genecn.loc[['CDS-SyBYYw',"CDS-PavpH4"]]])

In [None]:
# Ad-hoc inspection: pairwise correlation of the gene-level profiles of three sequencing ids.
np.corrcoef(genecn.loc[['CDS-SyBYYw',"CDS-PavpH4", "CDS-l4hN5a"]])

In [None]:
# (disabled) removal of normal samples:
#ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
#normals = ccle_refsamples[ccle_refsamples['primary_disease']=='normal'].index.tolist()
#wespriosegments = wespriosegments[~wespriosegments.DepMap_ID.isin(normals)]
#wespriogenecn = wespriogenecn.drop(index=normals)

# Prioritization: keep only the selected sequencing ids and relabel them
# with their arxspan ids.
if isCCLE:
    print('renaming')
    selected_wes = set(renaming)

    kept_segments = segments[segments.DepMap_ID.isin(selected_wes)]
    wespriosegments = kept_segments.replace({'DepMap_ID': renaming}).reset_index(drop = True)

    kept_genecn = genecn[genecn.index.isin(selected_wes)]
    wespriogenecn = kept_genecn.rename(index=renaming)

    # Save the prioritized tables.
    wespriosegments.to_csv("temp/segments_allWES_latest_"+samplesetname+".csv", index=False)
    wespriogenecn.to_csv('temp/gene_cn_allWES_latest_'+samplesetname+".csv")

In [None]:
# comparing
# Compare the prioritized WES gene-level table against the previous release's CCLE_gene_cn.
# NOTE(review): an earlier cell transforms this Taiga file with 2**x - 1 before use;
# here it is compared untransformed — confirm the scales match.
if isCCLE:
    print('comparing to previous version')
    #h.compareDfs(priosegments, tc.get(name='depmap-a0ab', file='CCLE_segment_cn'))
    h.compareDfs(wespriogenecn, tc.get(name='depmap-a0ab', file='CCLE_gene_cn'))

### Loading WGS

In [None]:
print('loading WGS from Terra')
# The loaded table used to be discarded, leaving `wgssegments` undefined for
# the cells below; it is now assigned.
# NOTE(review): `todrop` reuses the WES exclusion sets — confirm this is intended for WGS.
wgssegments = pp_cn.loadFromGATKAggregation(refwm, sampleset="all", sortby=["DepMap_ID", 'Chromosome', "Start", "End"], todrop = wrongwes|deletedwes)

In [None]:
print('making gene level copy number')
# Fill the gaps between segments, then project the segments onto genes.
# NOTE(review): `gene_mapping` is not defined in the visible notebook — confirm its origin.
wgsgenecn = mut.toGeneMatrix(mut.manageGapsInSegments(wgssegments), gene_mapping)

# validation step
print('summary of the gene cn data:')
# Print the minimum segment mean of the WGS and of the prioritized WES segments side by side.
print(wgssegments.Segment_Mean.min(), wespriosegments.Segment_Mean.min())

mut.checkGeneChangeAccrossAll(wgsgenecn, thresh=0.025)
# Segment-count QC; note the threshold is 3000 here versus 2000 for WES.
wgsfailed = mut.checkAmountOfSegments(wgssegments,thresh = 3000)
print("failed our QC")
print(wgsfailed)
# Persist the failed list across kernel restarts.
%store wgsfailed

#removing failed samples
wgssegments = wgssegments[~wgssegments.DepMap_ID.isin(wgsfailed)].reset_index(drop=True)
wgsgenecn = wgsgenecn[~wgsgenecn.index.isin(wgsfailed)]

#saving
print('saving files')
wgssegments.to_csv('temp/segments_allWGS_withreplicates_'+samplesetname+'.csv', index=False)
wgsgenecn.to_csv('temp/gene_cn_allWGS_withreplicates_'+samplesetname+".csv")

In [None]:
# Reload the replicate-level WGS tables saved by the previous cell (lets the notebook resume from here).
wgssegments = pd.read_csv('temp/segments_allWGS_withreplicates_'+samplesetname+'.csv')
wgsgenecn = pd.read_csv('temp/gene_cn_allWGS_withreplicates_'+samplesetname+".csv", index_col=0)

In [None]:
if isCCLE:
    # Select one sequencing id per arxspan id (latest version).
    wgsrenaming = tracker.removeOlderVersions(names = set(wgssegments['DepMap_ID']), refsamples = ccle_refsamples[ccle_refsamples.datatype=="wgs"], arxspan_id = "arxspan_id", version="version")
    # Repair QC: when the selected sample failed QC, swap it for another
    # sample of the same cell line that did not fail.
    ref=pd.DataFrame(ccle_refsamples[ccle_refsamples.datatype=="wgs"]['arxspan_id'])
    # This dict used to be called `replace`, which overwrote the `replace`
    # parameter dict defined in the parameter cell.
    wgs_qc_swaps = {}
    for val in wgsfailed:
        if val in wgsrenaming:
            a = ref[ref.arxspan_id==ref.loc[val].arxspan_id].index
            for v in a:
                if v not in wgsfailed:
                    wgs_qc_swaps[val] = v
                    break
    print(len(wgs_qc_swaps))
    # Move the arxspan mapping from the failed id to its replacement.
    for k, val in wgs_qc_swaps.items():
        wgsrenaming[val] = wgsrenaming.pop(k)

### Saving and merging CN

In [None]:
if isCCLE:
    # Keep only the selected WGS sequencing ids and relabel them with their arxspan ids.
    print('renaming')
    selected_wgs = set(wgsrenaming)

    kept_wgs_segments = wgssegments[wgssegments.DepMap_ID.isin(selected_wgs)]
    wgspriosegments = kept_wgs_segments.replace({'DepMap_ID':wgsrenaming}).reset_index(drop=True)

    kept_wgs_genecn = wgsgenecn[wgsgenecn.index.isin(selected_wgs)]
    wgspriogenecn = kept_wgs_genecn.rename(index=wgsrenaming)

    # Save the prioritized tables.
    wgspriosegments.to_csv("temp/segments_allWGS_latest_"+samplesetname+".csv", index=False)
    wgspriogenecn.to_csv('temp/gene_cn_allWGS_latest_'+samplesetname+".csv")

In [None]:
# Compare the prioritized WGS gene-level table against the previous release's CCLE_gene_cn.
if isCCLE:
    print('comparing to previous version')
    #h.compareDfs(priosegments, tc.get(name='depmap-a0ab', file='CCLE_segment_cn'))
    h.compareDfs(wgspriogenecn, tc.get(name='depmap-a0ab', file='CCLE_gene_cn'))

In [None]:
# Record in the sample tracker which sequencings were selected for this
# release and which ones failed QC.
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

# Start with every WES/WGS row flagged as not selected for this release.
dna_rows = ccle_refsamples[ccle_refsamples.datatype.isin(['wes',"wgs"])].index
ccle_refsamples.loc[dna_rows, samplesetname] = 0

# arxspan id -> chosen sequencing id; WGS entries are applied last, so they
# replace a WES entry for the same cell line.
selected = {arxspan: seqid for seqid, arxspan in renaming.items()}
selected.update({arxspan: seqid for seqid, arxspan in wgsrenaming.items()})
ccle_refsamples.loc[selected.values(), samplesetname] = 1

# Flag the QC failures.
failed_ids = list(wesfailed) + list(wgsfailed)
ccle_refsamples.loc[failed_ids, 'low_quality'] = 1

dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

### Comparison and merging with WES

In [None]:
# Cell lines and genes present in both the WES and the WGS gene-level tables.
# They are stored as lists because pandas no longer accepts a set as an
# indexer (deprecated in 1.5, an error from 2.0).
ind = list(set(wespriogenecn.index) & set(wgspriogenecn.index))
cols = list(set(wespriogenecn.columns) & set(wgspriogenecn.columns))
# log2(1 + x) of both tables, restricted to the shared genes in the same order.
ge = np.log2(1+wespriogenecn[cols])
ce = np.log2(1+wgspriogenecn[cols])
# Pearson correlation between the WES and WGS profiles of each shared line.
corr={}
for val in ind:
    corr[val] = pearsonr(ge.loc[val],ce.loc[val])[0]

In [None]:
# Report the lines whose WES and WGS profiles disagree (correlation < 0.5),
# with the sources of their segments.
for k,v in corr.items():
    if v<0.5:
        print(k)
        print(set(wgspriosegments[wgspriosegments.DepMap_ID==k].Source), set(wespriosegments[wespriosegments.DepMap_ID==k].Source))

# `.loc` needs a list, not a set (set indexers are rejected by pandas >= 2.0).
shared_lines = list(ind)
# First 100000 flattened values of each table, enough for the plots.
wes_vals = ge.loc[shared_lines].values.ravel()[:100000]
wgs_vals = ce.loc[shared_lines].values.ravel()[:100000]

# Distribution of the per-line correlations.
a = np.array(list(corr.values()))
sns.kdeplot(a)
plt.show()
# WES vs WGS values, as a scatter and as a 2D density.
sns.scatterplot(x=wes_vals, y=wgs_vals)
plt.show()
sns.kdeplot(data=np.array([wes_vals, wgs_vals]).T, fill=True)
plt.show()

In [None]:
# Merge WES and WGS: WGS takes priority, WES only contributes the lines that
# have no WGS data.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
segment_columns = ['DepMap_ID', 'Chromosome', 'Start', 'End', 'Segment_Mean', 'Num_Probes', 'Status', 'Source']
wes_only_segments = wespriosegments[~wespriosegments.DepMap_ID.isin(set(wgspriosegments.DepMap_ID))]
mergedsegments = pd.concat([wgspriosegments, wes_only_segments])[segment_columns]

wes_only_genecn = wespriogenecn[~wespriogenecn.index.isin(set(wgspriogenecn.index))]
mergedgenecn = pd.concat([wgspriogenecn, wes_only_genecn])

mergedgenecn.to_csv('temp/gene_cn_all_merged_'+samplesetname+".csv")
mergedsegments.to_csv('temp/segments_all_merged_'+samplesetname+".csv",index=False)

## postprocessing Mutations


Here, rather than rerunning the entire analysis, because we know we are adding only WES samples, we can download the previous release's MAF, add the samples, update any annotations, and perform any global filters at the end.

First we need to do an additional step of filtering on coverage and number 

- readMutations
- createSNPs
- addToMainMutation
- filterAllelicFraction
- filterMinCoverage
- mergeAnnotations
- addAnnotation
- maf_add_variant_annotations
- mutation_maf_to_binary_matrix (x3)

In [None]:
# Dalmatian manager for the WES mutation-calling workspace.
wesres = dm.WorkspaceManager(wesmutworkspace)

### WES Somatic

In [None]:
# Make the 'all' sample set contain every sample currently in the workspace.
wesres.update_sample_set('all', wesres.get_samples().index.tolist())

In [None]:
filtered = wesres.get_sample_sets().loc['all','filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"
mutations = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(mutations.columns[:10])
mutations = mutations.rename(columns={"i_ExAC_AF":"ExAC_AF","Tumor_Sample_Barcode":'DepMap_ID',"Tumor_Seq_Allele2":"Tumor_Allele"}).drop(columns=['Center','Tumor_Seq_Allele1'])
mutations['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in np.nan_to_num(mutations[['t_alt_count','t_ref_count']].values,0).astype(int)]
mutations = mutations[~mutations['DepMap_ID'].isin(wrong|deletedwes)]
#renaming = tracker.removeOlderVersions(names = set(mutations['DepMap_ID']), refsamples = wescgawm.get_samples(), arxspan_id = "arxspan_id", version="version")

In [None]:
# Filter on the "alt:ref" allele counts: coverage (cov=2), then allelic fraction (frac=0.1).
mutations = mut.filterCoverage(mutations, loc=['CGA_WES_AC'], sep=':',cov=2)
mutations = mut.filterAllelicFraction(mutations, loc=['CGA_WES_AC'], sep=':',frac=0.1)
# Add the annotation columns, then flag likely immortalization artefacts using the
# TCGA/COSMIC hotspot count columns.
mutations = omics_mut.addAnnotation(mutations, NCBI_Build='37', Strand="+")
mutations = omics_mut.annotateLikelyImmortalized(mutations, TCGAlocs = ['TCGAhsCnt',
'COSMIChsCnt'], max_recurrence=0.05 ,min_tcga_true_cancer=5)
# Save the replicate-level table (duplicates still included).
mutations.to_csv('temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv', index=False)

In [None]:
# Reload the replicate-level WES somatic mutations saved by the previous cell.
mutations = pd.read_csv('temp/wes_somatic_mutations_withduplicates_'+samplesetname+'.csv')

In [None]:
#TODO: Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
#TODO: Count the total number of mutations observed by position

In [None]:
%store -r failed 
failed

In [None]:
# based on QC from the CN pipeline:
#TODO: in the future merge QC results from mutations/ remapping & CN to filter wrong
# Keep only the sequencing ids selected in `renaming` (built in the copy-number section)
# and relabel them with their arxspan ids.
priomutations = mutations[mutations.DepMap_ID.isin(renaming.keys())].replace({'DepMap_ID':renaming})

### WGS Somatic

In [None]:
# Download the aggregated, filtered MAF of the "allcurrent" sample set and load it.
# NOTE(review): `cgawm` is not defined in the visible notebook — confirm which workspace it wraps.
res = cgawm.get_sample_sets().loc["allcurrent"]
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged_wgs.txt"
wgsmutations = pd.read_csv('temp/mutation_filtered_terra_merged_wgs.txt',sep='\t') 
print(wgsmutations.columns[:10])
# Harmonize column names and drop the columns we do not release.
wgsmutations = wgsmutations.rename(columns={"i_ExAC_AF":"ExAC_AF","Tumor_Sample_Barcode":'DepMap_ID',"Tumor_Seq_Allele2":"Tumor_Allele"}).drop(columns=['Center','Tumor_Seq_Allele1'])
# Unlike the WES cell, the immortalization flag is computed before the coverage / allelic-fraction filters.
wgsmutations = omics_mut.annotateLikelyImmortalized(wgsmutations, TCGAlocs = ['TCGAhsCnt', 'COSMIChsCnt'], max_recurrence=0.05 ,min_tcga_true_cancer=5)

# Allele counts as "alt:ref"; missing counts become 0. The column keeps the WES name
# so it lines up with the WES table when the two are merged.
wgsmutations['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in np.nan_to_num(wgsmutations[['t_alt_count','t_ref_count']].values,0).astype(int)]

wgsmutations = mut.filterCoverage(wgsmutations, loc=['CGA_WES_AC'], sep=':',cov=2)
wgsmutations = mut.filterAllelicFraction(wgsmutations, loc=['CGA_WES_AC'], sep=':',frac=0.1)
wgsmutations = omics_mut.addAnnotation(wgsmutations, NCBI_Build='37', Strand="+")

# Keep only the selected WGS sequencing ids and relabel them with their arxspan ids.
wgsmutations = wgsmutations[wgsmutations.DepMap_ID.isin(wgsrenaming.keys())].replace({'DepMap_ID':wgsrenaming})

wgsmutations.to_csv('temp/wgs_somatic_mutations_'+samplesetname+'.csv',index=None)

In [None]:
# Look for duplicated (Genome_Change, DepMap_ID) pairs in the prioritized WES mutations.
h.dups(priomutations.Genome_Change+priomutations.DepMap_ID)

In [None]:
# Reload the WGS somatic mutations saved above.
wgsmutations = pd.read_csv('temp/wgs_somatic_mutations_'+samplesetname+'.csv')

### Merge [WES / WGS exonic] somatic

In [None]:
# Cell lines that only have WGS mutation calls.
toadd = set(wgsmutations.DepMap_ID) - set(priomutations.DepMap_ID)
# Add their WGS mutations to the WES table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
priomutations = pd.concat([priomutations, wgsmutations[wgsmutations.DepMap_ID.isin(toadd)]]).reset_index(drop = True)
#normals = set(ccle_refsamples[ccle_refsamples.primary_disease=="normal"].arxspan_id)
#mutations = mutations[~mutations.DepMap_ID.isin(normals)]
priomutations.to_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv', index=False)

In [None]:
# Reload the merged WES + WGS somatic mutation table from its temp/ CSV.
priomutations = pd.read_csv('temp/wes_somatic_mutations_all_'+samplesetname+'.csv')

### making mutations matrices

In [None]:
# binary mutations matrices
# NOTE: in Python `|` binds tighter than `==`, so the 'Silent' comparison has
# to be evaluated on its own before being OR-ed with the other flags.
hotspot = priomutations.isCOSMIChotspot | priomutations.isTCGAhotspot
silent = priomutations['Variant_Classification'] == 'Silent'
other = ~(priomutations.isDeleterious | hotspot | silent)

mut.mafToMat(priomutations[priomutations.isDeleterious]).astype(int).T.to_csv('temp/wes_somatic_mutations_deleterious_boolmatrix.csv')
mut.mafToMat(priomutations[other]).astype(int).T.to_csv('temp/wes_somatic_mutations_other_boolmatrix.csv')
mut.mafToMat(priomutations[hotspot]).astype(int).T.to_csv('temp/wes_somatic_mutations_hotspot_boolmatrix.csv')

In [None]:
# genotyped mutations matrices
# NOTE: in Python `|` binds tighter than `==`, so the 'Silent' comparison has
# to be evaluated on its own before being OR-ed with the other flags.
hotspot = priomutations.isCOSMIChotspot | priomutations.isTCGAhotspot
silent = priomutations['Variant_Classification'] == 'Silent'
other = ~(priomutations.isDeleterious | hotspot | silent)

mut.mafToMat(priomutations[priomutations.isDeleterious], mode="genotype", minfreqtocall=0.05).T.to_csv('temp/wes_somatic_mutations_deleterious_matrix.csv')
mut.mafToMat(priomutations[other], mode="genotype", minfreqtocall=0.05).T.to_csv('temp/wes_somatic_mutations_other_matrix.csv')
mut.mafToMat(priomutations[hotspot], mode="genotype", minfreqtocall=0.05).T.to_csv('temp/wes_somatic_mutations_hotspot_matrix.csv')

### adding legacy datasets

In [None]:
def _load_legacy(file):
    """Fetch one legacy MAF from the 'mutations-da6a' Taiga dataset.

    The 'Unnamed: 0' and 'Tumor_Sample_Barcode' columns are dropped and
    'Tumor_Seq_Allele1' is renamed to 'Tumor_Allele'.
    """
    table = tc.get(name='mutations-da6a', file=file)
    table = table.drop(columns=['Unnamed: 0', "Tumor_Sample_Barcode"])
    return table.rename(columns={'Tumor_Seq_Allele1': 'Tumor_Allele'})


legacy_hybridcapture = _load_legacy('legacy_hybridcapture_somatic_mutations')
legacy_raindance = _load_legacy('legacy_raindance_somatic_mutations')
legacy_rna = _load_legacy('legacy_rna_somatic_mutations')
legacy_wes_sanger = _load_legacy('legacy_wes_sanger_somatic_mutations')
legacy_wgs_exoniconly = _load_legacy('legacy_wgs_exoniconly_somatic_mutations')

### solving issues with the legacy datasets

In [None]:
# Rebuild missing Genome_Change values as g.chr<Chromosome>:<Start_position><ref>><alt>.
missing = legacy_wgs_exoniconly[legacy_wgs_exoniconly['Genome_Change'].isna()]
legacy_wgs_exoniconly.loc[missing.index, 'Genome_Change'] = [
    'g.chr' + str(row.Chromosome) + ":" + str(row.Start_position) + row.Reference_Allele + ">" + row.Tumor_Allele
    for _, row in missing.iterrows()
]
# WGS_AC is "<alt>:<ref>"; split it once to get the counts and the allelic fraction.
counts = legacy_wgs_exoniconly.WGS_AC.str.split(':')
legacy_wgs_exoniconly['t_alt_count'] = counts.str[0].astype(int)
legacy_wgs_exoniconly['t_ref_count'] = counts.str[1].astype(int)
depth = legacy_wgs_exoniconly['t_ref_count'] + legacy_wgs_exoniconly['t_alt_count']
legacy_wgs_exoniconly['tumor_f'] = legacy_wgs_exoniconly['t_alt_count'] / depth

In [None]:
# Rebuild missing Genome_Change values as g.chr<Chromosome>:<Start_position><ref>><alt>.
missing = legacy_wes_sanger[legacy_wes_sanger['Genome_Change'].isna()]
legacy_wes_sanger.loc[missing.index, 'Genome_Change'] = [
    'g.chr' + str(row.Chromosome) + ":" + str(row.Start_position) + row.Reference_Allele + ">" + row.Tumor_Allele
    for _, row in missing.iterrows()
]
# SangerWES_AC is "<alt>:<ref>"; split it once to get the counts and the allelic fraction.
counts = legacy_wes_sanger.SangerWES_AC.str.split(':')
legacy_wes_sanger['t_alt_count'] = counts.str[0].astype(int)
legacy_wes_sanger['t_ref_count'] = counts.str[1].astype(int)
depth = legacy_wes_sanger['t_ref_count'] + legacy_wes_sanger['t_alt_count']
legacy_wes_sanger['tumor_f'] = legacy_wes_sanger['t_alt_count'] / depth

In [None]:
# Rebuild missing Genome_Change values as g.chr<Chromosome>:<Start_position><ref>><alt>.
missing = legacy_raindance[legacy_raindance['Genome_Change'].isna()]
legacy_raindance.loc[missing.index, 'Genome_Change'] = [
    'g.chr' + str(row.Chromosome) + ":" + str(row.Start_position) + row.Reference_Allele + ">" + row.Tumor_Allele
    for _, row in missing.iterrows()
]
# RD_AC is "<alt>:<ref>"; split it once to get the counts and the allelic fraction.
counts = legacy_raindance.RD_AC.str.split(':')
legacy_raindance['t_alt_count'] = counts.str[0].astype(int)
legacy_raindance['t_ref_count'] = counts.str[1].astype(int)
depth = legacy_raindance['t_ref_count'] + legacy_raindance['t_alt_count']
legacy_raindance['tumor_f'] = legacy_raindance['t_alt_count'] / depth

In [None]:
# Records without a Variant_Classification are labelled 'Missense_Mutation'.
unclassified = legacy_hybridcapture[legacy_hybridcapture['Variant_Classification'].isna()].index
legacy_hybridcapture.loc[unclassified, 'Variant_Classification'] = 'Missense_Mutation'
# Rebuild missing Genome_Change values as g.chr<Chromosome>:<Start_position><ref>><alt>.
missing = legacy_hybridcapture[legacy_hybridcapture['Genome_Change'].isna()]
legacy_hybridcapture.loc[missing.index, 'Genome_Change'] = [
    'g.chr' + str(row.Chromosome) + ":" + str(row.Start_position) + row.Reference_Allele + ">" + row.Tumor_Allele
    for _, row in missing.iterrows()
]
# HC_AC is "<alt>:<ref>"; split it once to get the counts and the allelic fraction.
counts = legacy_hybridcapture.HC_AC.str.split(':')
legacy_hybridcapture['t_alt_count'] = counts.str[0].astype(int)
legacy_hybridcapture['t_ref_count'] = counts.str[1].astype(int)
depth = legacy_hybridcapture['t_ref_count'] + legacy_hybridcapture['t_alt_count']
legacy_hybridcapture['tumor_f'] = legacy_hybridcapture['t_alt_count'] / depth

In [None]:
# Rebuild missing Genome_Change values as g.chr<Chromosome>:<Start_position><ref>><alt>.
missing = legacy_rna[legacy_rna['Genome_Change'].isna()]
legacy_rna.loc[missing.index, 'Genome_Change'] = [
    'g.chr' + str(row.Chromosome) + ":" + str(row.Start_position) + row.Reference_Allele + ">" + row.Tumor_Allele
    for _, row in missing.iterrows()
]
# RNAseq_AC is "<alt>:<ref>"; split it once to get the counts and the allelic fraction.
counts = legacy_rna.RNAseq_AC.str.split(':')
legacy_rna['t_alt_count'] = counts.str[0].astype(int)
legacy_rna['t_ref_count'] = counts.str[1].astype(int)
depth = legacy_rna['t_ref_count'] + legacy_rna['t_alt_count']
legacy_rna['tumor_f'] = legacy_rna['t_alt_count'] / depth

In [None]:
# Key every record by sample, chromosome and start position.
legacy_rna['loci'] = legacy_rna['DepMap_ID'] + "_" + legacy_rna['Chromosome'] + "_" + legacy_rna['Start_position'].astype(str)
# For each key reported by h.dups, drop the first row carrying that key.
todrop = [
    legacy_rna[legacy_rna.loci == key].index[0]
    for key in h.dups(legacy_rna.loci)
]
legacy_rna = legacy_rna.drop(todrop)

In [None]:
#legacy_hybridcapture = legacy_hybridcapture[~legacy_hybridcapture.DepMap_ID.isin(normals)]
#legacy_raindance = legacy_raindance[~legacy_raindance.DepMap_ID.isin(normals)]
#legacy_wes_sanger = legacy_wes_sanger[~legacy_wes_sanger.DepMap_ID.isin(normals)]

In [None]:
# Remove the lines listed in wrongwes_arxspan from every legacy dataset.
# Hybrid capture keeps the lines named in tokeep_specific_hybrid_capture.
hc_excluded = set(wrongwes_arxspan) - set(tokeep_specific_hybrid_capture)
legacy_hybridcapture = legacy_hybridcapture.loc[~legacy_hybridcapture.DepMap_ID.isin(hc_excluded)]
legacy_rna = legacy_rna.loc[~legacy_rna.DepMap_ID.isin(wrongwes_arxspan)]
legacy_raindance = legacy_raindance.loc[~legacy_raindance.DepMap_ID.isin(wrongwes_arxspan)]
legacy_wes_sanger = legacy_wes_sanger.loc[~legacy_wes_sanger.DepMap_ID.isin(wrongwes_arxspan)]
legacy_wgs_exoniconly = legacy_wgs_exoniconly.loc[~legacy_wgs_exoniconly.DepMap_ID.isin(wrongwes_arxspan)]

In [None]:
# Fold each legacy dataset into the current calls, one after the other.
# Each entry: (legacy table, its allele-count column, useSecondForConflict flag).
merged = priomutations
for legacy_table, ac_column, use_second in (
    (legacy_hybridcapture, "HC_AC", True),
    (legacy_raindance, "RD_AC", True),
    (legacy_wgs_exoniconly, "WGS_AC", False),
    (legacy_wes_sanger, "SangerWES_AC", False),
    (legacy_rna, "RNAseq_AC", False),
):
    merged = mut.mergeAnnotations(merged, legacy_table, ac_column, useSecondForConflict=use_second, dry_run=False)

In [None]:
# Keep only the mutations whose tumor_f is strictly above 0.05.
merged = merged[merged['tumor_f']>0.05]

In [None]:
# Annotate likely-immortalization mutations on the merged table, with the same
# parameters used for the WGS-only table earlier in this notebook.
merged = omics_mut.annotateLikelyImmortalized(merged, TCGAlocs = ['TCGAhsCnt', 'COSMIChsCnt'], max_recurrence=0.05 ,min_tcga_true_cancer=5)

### changing variant annotations

In [None]:
# Maps each coarse annotation group (key) to the Variant_Classification values
# (list) that belong to it.
mutation_groups={
"other conserving": ["5'Flank", "Intron", "IGR", "3'UTR", "5'UTR"],
"other non-conserving":["In_Frame_Del", "In_Frame_Ins", "Stop_Codon_Del", "Stop_Codon_Ins", "Missense_Mutation", "Nonstop_Mutation"],
'silent': ['Silent'],
"damaging":['De_novo_Start_OutOfFrame','Frame_Shift_Del','Frame_Shift_Ins', 'Splice_Site', 'Start_Codon_Del', 'Start_Codon_Ins', 'Start_Codon_SNP','Nonsense_Mutation']
}

In [None]:
# Invert mutation_groups into a Variant_Classification -> group lookup.
rename = {
    classification: group
    for group, classifications in mutation_groups.items()
    for classification in classifications
}
# A classification that is absent from mutation_groups raises KeyError here.
merged['Variant_annotation'] = [rename[c] for c in merged['Variant_Classification'].tolist()]

### Compare to previous release

I would run some checks here, comparing the results to the previous release's MAF. Namely:

- Count the total number of mutations per cell line, split by type (SNP, INS, DEL)
- Count the total number of mutations observed by position (group by chromosome, start position, end position and count the number of mutations)
- Look at specific differences between the two MAFs (join on DepMap_ID, Chromosome, Start position, End position, Variant_Type). I would do this for WES only

In [None]:
# Compare the set of cell lines in the new table with the CCLE_mutations file
# of the 'depmap-a0ab' Taiga dataset.
a = set(merged.DepMap_ID) 
prev = tc.get(name='depmap-a0ab', file='CCLE_mutations')#tc.get(name='internal-20q2-7f46', version=18, file='CCLE_mutations')
b = set(prev.DepMap_ID)
# Lines present now but absent from the previous release.
print("new lines:")
print(a-b)
# Lines present in the previous release but missing now.
print('lost lines:')
print(b-a)

In [None]:
# Show the DepMap_IDs obtained after relabelling the samples listed in `e`.
# NOTE(review): `e` is assigned in the cell that follows this one in the file,
# so that cell has to be run first.
set(mutations[mutations.DepMap_ID.isin(e.keys())].replace({'DepMap_ID':e}).DepMap_ID)

In [None]:
# Manual mapping from sample id (CDS-*) to cell-line id (ACH-*) for three
# samples that are appended to `merged` by hand below.
e = {"CDS-mys9Dm":"ACH-001955",
"CDS-Rl87Z1":"ACH-001956",
"CDS-TzQAjG":"ACH-001957"}

In [None]:
# Append the mutations of the samples listed in `e`, relabelled with their ACH ids.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# with default arguments it yields the same rows, columns and index.
merged = pd.concat([
    merged,
    mutations[mutations.DepMap_ID.isin(e.keys())].replace({'DepMap_ID': e}),
])

### check important mutations

In [None]:
# check MOLM13, MV411 cell lines- The well known mutation status of FLT3

In [None]:
# check TP53 mutation 

Are the mutations consistent?

QC the mutations: for a known dependency, check whether it matches the mutation status of that gene (if TP53 is mutated, the line cannot depend on TP53, MDM2 or MDM4; the inverse holds for BRAF and KRAS, where mutated lines are expected to depend on the mutated gene itself).

### saving this version

In [None]:
# Save the merged table (current calls + legacy datasets) without the index.
merged.to_csv('temp/all_somatic_mutations_withlegacy_'+samplesetname+'.csv', index=False)

In [None]:
# Reload the merged table from its temp/ CSV.
merged = pd.read_csv('temp/all_somatic_mutations_withlegacy_'+samplesetname+'.csv')

### making binary matrices

In [None]:
# Label every record "<Hugo_Symbol> (<Entrez_Gene_Id>)"; used as the mutation
# name column of the matrices built next.
merged['mutname'] = merged['Hugo_Symbol'] + " (" + merged["Entrez_Gene_Id"].astype(str) + ")"

# Drop records with Entrez id 0 and, for now, the ones flagged as likely
# immortalization mutations.
has_entrez_id = merged['Entrez_Gene_Id'] != 0
not_immortalized = merged.is_likely_immortalization != True
merged = merged[has_entrez_id & not_immortalized]


In [None]:
# One 0/1 matrix per mutation category, written transposed to temp/.
for selection, path in (
    (merged.Variant_annotation == "damaging", 'temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv'),
    (merged.Variant_annotation == "other conserving", 'temp/all_somatic_mutations_boolmatrix_fordepmap_othercons.csv'),
    (merged.Variant_annotation == "other non-conserving", 'temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv'),
    (merged.isCOSMIChotspot | merged.isTCGAhotspot, 'temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv'),
):
    matrix = mut.mafToMat(merged[selection], mode='bool', mutNameCol="mutname")
    matrix.astype(int).T.to_csv(path)

### retrieving unfiltered mutations [OPTIONAL]

In [None]:
####### WES
unfiltered = wesres['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/wes_mutation_unfiltered_terra_merged.txt"
unfiltered = pd.read_csv('temp/wes_mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__",'.'], engine='c', dtype=str)

In [None]:
# Give missing values an explicit default in the three status columns.
# The table was read with dtype=str, which still leaves missing cells as real
# NaN, so replacing only the literal string 'nan' never filled them. Both the
# string and true NaN are handled here.
for column, default in (
    ('somatic', 'False'),
    ('HGNC_Status', 'Unapproved'),
    ('judgement', 'REMOVE'),
):
    unfiltered[column] = unfiltered[column].replace('nan', default).fillna(default)

# Harmonise column names with the rest of the pipeline.
unfiltered = unfiltered.rename(columns={
    "i_ExAC_AF": "ExAC_AF",
    "Tumor_Sample_Barcode": 'DepMap_ID',
    "Tumor_Seq_Allele2": "Tumor_Allele",
}).drop(columns=['Tumor_Seq_Allele1'])

# Build the "<alt>:<ref>" allele-count string. Missing counts become 0.
# The fill value is passed by keyword: the second positional argument of
# np.nan_to_num is `copy`, not the NaN replacement.
allele_counts = np.nan_to_num(
    unfiltered[['t_alt_count', 't_ref_count']].values.astype(float), nan=0).astype(int)
unfiltered['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in allele_counts]

In [None]:
# Drop the columns holding a single distinct value (the string 'nan' aside).
# The first 10,000 rows are screened first, so a full column is only scanned
# when its sample already looks constant.
subunfilt = unfiltered.iloc[:10000]
ncols = len(unfiltered.columns)
toremove = []
for pos, col in enumerate(unfiltered.columns):
    h.showcount(pos, ncols)
    sample_is_constant = len(set(subunfilt[col]) - {'nan'}) == 1
    if sample_is_constant and len(set(unfiltered[col]) - {'nan'}) == 1:
        toremove.append(col)
unfiltered = unfiltered.drop(columns=set(toremove))

In [None]:
# List the unfiltered mutation outputs already present in temp/.
ls temp/*_mutation_somatic_unfiltered_withreplicates.csv*

In [None]:
# The table was read with dtype=str, so the position columns are cast back to
# int before writing.
toint =  ["Start_position", "End_position"]
for val in toint:
    unfiltered[val]  = unfiltered[val].astype(int)
# pandas infers gzip compression from the .gz extension.
unfiltered.to_csv('temp/wes_mutation_somatic_unfiltered_withreplicates.csv.gz', index=False)

In [None]:
# Drop the reference to the unfiltered WES table now that it has been written.
del unfiltered

In [None]:
####### WGS
unfiltered = res['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/wes_mutation_unfiltered_terra_merged.txt"
unfiltered = pd.read_csv('temp/wes_mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__",'.'], engine='c', dtype=str)

In [None]:
# Give missing values an explicit default in the three status columns.
# The table was read with dtype=str, which still leaves missing cells as real
# NaN, so replacing only the literal string 'nan' never filled them. Both the
# string and true NaN are handled here.
for column, default in (
    ('somatic', 'False'),
    ('HGNC_Status', 'Unapproved'),
    ('judgement', 'REMOVE'),
):
    unfiltered[column] = unfiltered[column].replace('nan', default).fillna(default)

# Harmonise column names with the rest of the pipeline.
unfiltered = unfiltered.rename(columns={
    "i_ExAC_AF": "ExAC_AF",
    "Tumor_Sample_Barcode": 'DepMap_ID',
    "Tumor_Seq_Allele2": "Tumor_Allele",
}).drop(columns=['Tumor_Seq_Allele1'])

# Build the "<alt>:<ref>" allele-count string. Missing counts become 0.
# The fill value is passed by keyword: the second positional argument of
# np.nan_to_num is `copy`, not the NaN replacement.
allele_counts = np.nan_to_num(
    unfiltered[['t_alt_count', 't_ref_count']].values.astype(float), nan=0).astype(int)
unfiltered['CGA_WES_AC'] = [str(i[0]) + ':' + str(i[1]) for i in allele_counts]

In [None]:
# Drop the columns holding a single distinct value (the string 'nan' aside).
# The first 10,000 rows are screened first, so a full column is only scanned
# when its sample already looks constant.
subunfilt = unfiltered.iloc[:10000]
ncols = len(unfiltered.columns)
toremove = []
for pos, col in enumerate(unfiltered.columns):
    h.showcount(pos, ncols)
    sample_is_constant = len(set(subunfilt[col]) - {'nan'}) == 1
    if sample_is_constant and len(set(unfiltered[col]) - {'nan'}) == 1:
        toremove.append(col)
unfiltered = unfiltered.drop(columns=set(toremove))

In [None]:
# The table was read with dtype=str, so the position columns are cast back to
# int before writing.
toint =  ["Start_position", "End_position"]
for val in toint:
    unfiltered[val]  = unfiltered[val].astype(int)
# pandas infers gzip compression from the .gz extension.
unfiltered.to_csv('temp/wgs_mutation_somatic_unfiltered_withreplicates.csv.gz', index=False)

In [None]:
# Drop the reference to the unfiltered WGS table now that it has been written.
del unfiltered

# uploading on taiga [CCLE ONLY]

## Somatic mutations

In [None]:
#reverting to previous versions
merged = merged[['Hugo_Symbol', 'Entrez_Gene_Id', 'NCBI_Build', 'Chromosome',
       'Start_position', 'End_position', 'Strand', 'Variant_Classification',
       'Variant_Type', 'Reference_Allele', 'Tumor_Allele', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Genome_Change', 'Annotation_Transcript',
       'DepMap_ID', 'cDNA_Change', 'Codon_Change', 'Protein_Change', 'isDeleterious',
       'isTCGAhotspot', 'TCGAhsCnt', 'isCOSMIChotspot', 'COSMIChsCnt',
       'ExAC_AF',"Variant_annotation", 'CGA_WES_AC', 'HC_AC',
       'RD_AC', 'RNAseq_AC', 'SangerWES_AC', 'WGS_AC']].rename(columns={"Tumor_Allele":"Tumor_Seq_Allele1"})
merged.to_csv('temp/all_somatic_mutations_withlegacy_'+samplesetname+"_depmapversion.csv", index=False)

In [None]:
# Decompress the unfiltered WES and WGS tables that were written as .csv.gz.
!gunzip temp/wes_mutation_somatic_unfiltered_withreplicates.csv.gz
!gunzip temp/wgs_mutation_somatic_unfiltered_withreplicates.csv.gz

In [None]:
# NOTE(review): this lists an expression file although this section uploads
# mutation data — possibly a leftover from another notebook; confirm.
a = 'temp/expression_' + samplesetname + '_transcripts_tpm_logp1.csv'
! ls -al $a

In [None]:
# Upload the somatic mutation outputs of this release as a new version of the
# "mutations-latest-ed72" Taiga dataset, keeping the files already present
# (add_all_existing_files=True).
# The description is a raw string: it contains the backslash sequences "\ " and
# "\_", which are invalid escapes in a normal string literal and trigger a
# warning on recent Python versions. The resulting text is unchanged.
tc.update_dataset(changes_description="new "+samplesetname+" release!",
                 dataset_permaname="mutations-latest-ed72",
                 upload_files=[
                    {
                       "path": "temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv",
                       "name": "all_somatic_mutations_boolmatrix_fordepmap_hotspot",
                       "format": "NumericMatrixCSV",
                       "encoding": "utf-8"
                    },
                    {
                       "path": "temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv",
                       "name": "all_somatic_mutations_boolmatrix_fordepmap_othernoncons",
                       "format": "NumericMatrixCSV",
                       "encoding": "utf-8"
                    },
                    {
                        "path": "temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv",
                        "name": "all_somatic_mutations_boolmatrix_fordepmap_damaging",
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/all_somatic_mutations_withlegacy_"+samplesetname+"_depmapversion.csv",
                        "name": 'all_somatic_mutations_all_'+samplesetname+"_depmapversion",
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/all_somatic_mutations_withlegacy_"+samplesetname+".csv",
                        "name": 'all_somatic_mutations_all_'+samplesetname,
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                 ],
                 add_all_existing_files=True,
                  upload_async=False,
                 dataset_description=r"""
# Mutations

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE Mutatios data.
The data is most likely of a better quality that what is on other folder. It is however in beta version as not all changes have either been confirmed or accepted by the DepMap Ops and the DepMap Portal Team.

# Notations:

all: every cell lines we have

WES: all data comes from the WExomeS samples we posses

WGS: all data comes from the WGenomeS samples we posses

withreplicates: if we have two different sequencing from a sample, we kept both, see the depmap sample tracker for annotations [https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE](https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE). this dataset is more geared toward QC or in-depth analysis of a particular cell line.

merged: everything from both WGS and WES

latest: only the latest sequencing versions of the samples were kept

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
 """)

## Copy Number

 We push the full version of the dataset to the DepMap Taiga CN dataset.

In [None]:
# Upload the copy-number outputs of this release as a new version of the
# "cn-latest-d8d4" Taiga dataset.
# Segment files are tables (DepMap_ID, Chromosome, Start, End, ...), so
# segments_all_merged is declared as TableCSV like the other segments files
# (it was previously declared as NumericMatrixCSV).
# The description is a raw string: it contains the backslash sequences "\ " and
# "\_", which are invalid escapes in a normal string literal and trigger a
# warning on recent Python versions. The resulting text is unchanged.
tc.update_dataset(changes_description="new "+samplesetname+" release! (removed misslabellings, see changelog)",
                  dataset_permaname="cn-latest-d8d4",
                  upload_files=[
                    {
                        "path": "temp/segments_allWES_latest_"+samplesetname+".csv",
                        "name": "segments_allWES_latest_"+samplesetname,
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/gene_cn_allWES_latest_"+samplesetname+".csv",
                        "name": "gene_cn_allWES_latest_"+samplesetname,
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/segments_allWES_withreplicates_"+samplesetname+".csv",
                        "name": "segments_allWES_withreplicates_"+samplesetname,
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/gene_cn_allWES_withreplicates_"+samplesetname+".csv",
                        "name": "gene_cn_allWES_withreplicates_"+samplesetname,
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/gene_cn_all_merged_"+samplesetname+".csv",
                        "name": "gene_cn_all_merged_"+samplesetname,
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/segments_allWGS_withreplicates_"+samplesetname+".csv",
                        "name": "segments_allWGS_withreplicates_"+samplesetname,
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/gene_cn_allWGS_withreplicates_"+samplesetname+".csv",
                        "name": "gene_cn_allWGS_withreplicates_"+samplesetname,
                        "format": "NumericMatrixCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/segments_all_merged_"+samplesetname+".csv",
                        "name": "segments_all_merged_"+samplesetname,
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                  ],
                  dataset_description=r"""
# Copy Number

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE CN data.

# Notations:

all: everything

allWES: all data comes from the WExomeS samples we posses

allWGS: all data comes from the WGenomeS samples we posses

withreplicates: if we have two different sequencing from a sample, we kept both, see the depmap sample tracker for annotations [https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE](https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE). this dataset is more geared toward QC or in-depth analysis of a particular cell line.

merged: everything from both WGS and WES

latest: only the latest sequencing versions of the samples were kept


Gene level CN data:

__Rows__: cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Segment\_Mean, Num\_Probes, Calls""")

## Structural variants

## germline Mutations