# Copy Number Pipeline

In [124]:
from __future__ import print_function
import pandas as pd
import numpy as np

from src.CCLE_postp_function import *
from genepy import terra
from genepy.utils import helper as h
from genepy.google import gcp
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm
from genepy.google.google_sheet import dfToSheet

from scipy.stats import pearsonr,spearmanr

from bokeh.plotting import *
from IPython.display import Image,display
import seaborn as sns

from biomart import BiomartServer
import io

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()

my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
# do the first steps of https://medium.com/craftsmenltd/from-csv-to-google-sheet-using-python-ef097cb014f9
creds = '../.credentials.json'

sheets = Sheets.from_files(my_id, mystorage_id)
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## boot up

- you first need to go to [taiga](https://cds.team/taiga/dataset) and create some new datasets for the virtual release
- the easiest way to create a new dataset is to upload an empty file (since at least one file is required). This empty file can be deleted when you update the dataset with a new version

We instantiate all the parameters needed for this pipeline to run.

In [125]:
samplesetname = "21Q2"

refworkspace="broad-firecloud-ccle/DepMap_WES_CN_hg38"
genelist_hg38 = 'ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.20180614.txt'

refsheet_url = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"
potential_list_url = "https://docs.google.com/spreadsheets/d/1BEgH03V4OmGhYeciLCZV00h6hp3WkO0basahS93akCE"

CNWESmethods = [
    "gatk/PreProcessingForVariantDiscovery_GATK4/8",
    "GP-TAG/Manta_SomaticSV/9",
    "gkugener/ArrayOfFilesToTxt/1",
    "vdauwera/BamToUnmappedRGBams/4",
    "gatk/CNV_Somatic_Pair_Workflow/9",
    "gkugener/Aggregate_CN_seg_files/2"
]

#version 102
ensemblserver = "http://nov2020.archive.ensembl.org/biomart" 

In [126]:
release = samplesetname
# we initialize the workspaces manager from dalmatian
refwm = dm.WorkspaceManager(refworkspace)

potential_list = sheets.get(potential_list_url).sheets[0].to_frame().values.T[0].tolist()

ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

len(ccle_refsamples)

6771

## Check that we have all the cell lines we expect for this release

This involves comparing to the list in the Google sheet "Cell Line Profiling Status."

_As the list cannot be parsed, we are not comparing it for now_

In [None]:
# this function may not work - it hasn't been tested
url = 'https://docs.google.com/spreadsheets/d/1qus-9TKzqzwUMNWp8S1QP4s4-3SsMo2vuQRZrNXf7ag/edit?ts=5db85e27#gid=0&fvid=1627883727'

compareToCuratedGS(url, sample = newsample[0], samplesetname = samplesetname, colname = 'CN New to internal')

# run the pipeline

We use Dalmatian to send requests to Terra. We run a set of 5 workflows to generate the copy number dataset:

*   **BamToUnmappedRGBams_MC** vdauwera/BamToUnmappedRGBamsSnapshot ID: 3
*   **Generate_uBAM_File_List** gkugener/ArrayOfFilesToTxtSnapshot ID: 1
*   **Realign_WES_GATK4** gatk/PreProcessingForVariantDiscovery_GATK4Snapshot ID: 7
*   **CNV_sample_XX** gatk/CNV_Somatic_Pair_WorkflowSnapshot ID: 9
*   **Aggregate_CN_seg_files** gkugener/Aggregate_CN_seg_filesSnapshot ID: 2

This output file for download will be saved under the sample set under the combined_seg_file attribute.

There are several other tasks in this workspace. In brief:

*   **CNV_Somatic_Panel_Workflow_Agilent_XX** gatk/CNV_Somatic_Panel_WorkflowSnapshot ID: 11. This task was used in this workspace to generate the Sanger PON. In the Sanger dataset, there is a set of 40 normal cell lines samples (cell lines derived from matched normal tissue). We can use these to generate a PON to normalize to rather than using the Agilent PON we use for the other CCLE cell lines. This leads to less noisy results. HOWEVER, results using the PON from this workflow should not use the X chromosome, as the sanger normals are not exclusively female or male (it is likely a mix).
*   **SANGER_PON_CNV_sample_XX** gatk/CNV_Somatic_Pair_WorkflowSnapshot ID: 9. Same as the CNV_sample_XX_gatk, except that it uses the Sanger-based PON. Should be used only for the Sanger cell lines.
*   **Sanger_PON_Aggregate_CN_seg_files** gkugener/Aggregate_CN_seg_filesSnapshot ID: 2. Aggregates the segment files for the samples that were run using the Sanger PON based CNV workflow.

### cleaning workspaces

In [None]:
torm = terra.listHeavyFiles(refworkspace)
h.parrun(['gsutil rm '+i for i in torm], cores=8)
terra.removeFromFailedWorkflows(refworkspace, dryrun=False, everythingFor=['Realign_WES_GATK4','Generate_uBAM_File_List','BamToUnmappedRGBams_MC','CGA_WES_CCLE_ICE','CGA_WES_CCLE_AGILENT'])

## On Terra

In [None]:
# a list of Terra workflows that are in the workspace and that we will call sequentially
bamtoubam= "BamToUnmappedRGBams_MC"
ubamtofilelist = "Generate_uBAM_File_List"
realign="Realign_WES_GATK4"

In [None]:
# see dalmatian
subid = refwm.create_submission(bamtoubam,samplesetname,"sample_set","this.samples")
terra.waitForSubmission(refworkspace, subid)

In [None]:
subid = refwm.create_submission(ubamtofilelist,samplesetname,"sample_set","this.samples")
terra.waitForSubmission(refworkspace, subid)

In [None]:
subid = refwm.create_submission(realign,samplesetname,"sample_set","this.samples")
terra.waitForSubmission(refworkspace, subid)

In [None]:
# Testing out the XY PoN for CN characterization. Will test by producing an output in a different column from usual so it's easy to delete the column attribute later
# Also, need to make a split between Agilent and ICE samples..
submission_id= refwm.create_submission("CNV_sample_XY_ice",etype='sample_set',entity=samplesetname, expression='this.samples')
terra.waitForSubmission(refworkspace,submission_id)

### copy pairs data to sample data

In [None]:
pairs = refwm.get_pairs()

In [None]:
pairs = pairs[~pairs['called_copy_ratio_segments_tumor'].isna()]
pairs = pairs.drop(columns=['case_sample','control_sample','participant'])
pairs.index = [i.split('_')[0] for i in pairs.index]

In [None]:
refwm.update_sample_attributes(pairs)

continuing

In [None]:
submission_id = refwm.create_submission("Aggregate_CN_seg_files",entity="all")
terra.waitForSubmission(refworkspace,submission_id)

__we are getting the results file path__

In [None]:
terra.waitForSubmission(refworkspace,submission_id)
aggregated = refwm.get_entities('sample_set').loc['all']["combined_seg_file"]
aggregated

## On local

__We then save the workflow configurations used__

In [None]:
terra.saveConfigs(refworkspace,'data/'+samplesetname+'/CNVconfig')

__delete unmapped bams generated during the process__

In [None]:
toremove = ["readgroup_ubams",]
res = refwm.get_samples()
for val in toremove:
    refwm.disable_hound().delete_entity_attributes('sample', res[val], delete_files=True)

In [None]:
# Sometimes the previous step does not work and the unmapped BAMs have to be
# removed manually (running this is also a way to check that it worked).
# NOTE(review): this reads `samplesinset.readgroup_ubams`, i.e. it expects
# `samplesinset` to be a samples table, while a later cell rebinds
# `samplesinset` to a plain list of sample names -- confirm which object is
# meant before running.
import subprocess

for val in samplesinset.readgroup_ubams:
    # only list-valued attributes hold file paths; skip anything else (and empty lists)
    if not isinstance(val, list) or not val:
        continue
    # pass every path as its own argument instead of concatenating a shell string
    subprocess.run(['gsutil', '-m', 'rm'] + list(val))

__and move the hg38 aligned bams to our own datastorage bucket__

Note that we may encounter some WGS files, which need to go to a different folder from the WES bam files.

In [None]:
samplesetname

In [None]:
#samplesinset = samples.index.tolist()
samplesinset= [i['entityName'] for i in refwm.get_entities('sample_set').loc[samplesetname].samples]
samplesinset

In [None]:
onlycol = ['hg38_analysis_ready_bam', 'hg38_analysis_ready_bam_index', 'hg38_analysis_ready_bam_md5']
wes_newgs = 'gs://cclebams/hg38_wes/'
wes_res, flagged = terra.changeGSlocation(refworkspace, newgs=wes_newgs, onlysamples=samplesinset, onlycol=onlycol, entity='sample', keeppath=False, dry_run = False)

#### set it this way in our sample tracker

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [None]:
ccle_refsamples.loc[samplesinset,['legacy_bam_filepath','legacy_bai_filepath', 'legacy_size', 'legacy_crc32c_hash']] = ccle_refsamples.loc[samplesinset][['internal_bam_filepath', 'internal_bai_filepath', 'size', 'crc32c_hash']].values

ccle_refsamples.loc[samplesinset,'internal_bam_filepath'] = wes_res['hg38_analysis_ready_bam'].values

ccle_refsamples.loc[samplesinset,'internal_bai_filepath'] = wes_res['hg38_analysis_ready_bam_index'].values

ccle_refsamples.loc[wes_res.index.tolist(),'size'] = [gcp.extractSize(i)[1] for i in gcp.lsFiles(wes_res['hg38_analysis_ready_bam'].tolist(),'-l')]

ccle_refsamples.loc[wes_res.index.tolist(),'crc32c_hash'] = [gcp.extractHash(i) for i in gcp.lsFiles(wes_res['hg38_analysis_ready_bam'].tolist(),'-L')]

ccle_refsamples.loc[wes_res.index.tolist(),'md5_hash'] = gcp.catFiles(wes_res['hg38_analysis_ready_bam_md5'].tolist(), cut=32)

In [121]:
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

### Get QC files

In [None]:
dataBam = getQC(workspace=refworkspace ,only=samplesinset, qcname=["hg38_duplication_metrics","hg38_bqsr_report"])
dataCN = getQC(workspace=refworkspace ,only=samplesinset, qcname=["allelic_counts_tumor","delta_MAD_tumor","denoised_MAD_tumor","scaled_delta_MAD_tumor","denoised_copy_ratios_lim_4_plot_tumor","denoised_copy_ratios_plot_tumor","modeled_segments_plot_tumor"])

In [None]:
for k,v in dataCN.items():
    if k =='nan':
        continue
    ccle_refsamples.loc[k,'processing_qc'] = str(v)
for k,v in dataBam.items():
    if k =='nan':
        continue
    ccle_refsamples.loc[k,'bam_qc'] = str(v)

In [None]:
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

__We download and reprocess removing the appended version and keeping only the newest versions__

In [None]:
! gsutil cp $aggregated "temp/cnv_ccle.called.seg"
segments = pd.read_csv("temp/cnv_ccle.called.seg", sep='\t')
segments = segments.rename(columns={'CONTIG':'Chromosome',
'START':'Start',
'END':'End',
'Sample':"DepMap_ID",
'NUM_POINTS_COPY_RATIO':'Num_Probes',
'MEAN_LOG2_COPY_RATIO':'Segment_Mean',
'CALL':'Status'})
# TODO: copy allelic calls as well
len(set(segments.Sample))

In [None]:
# Drop the segments of every sample listed in `wrongswes`.
# NOTE: `wrongswes` is only defined in the following cell, so that cell has to be run first.
segments = segments[~segments.DepMap_ID.isin(wrongswes)]

In [None]:
# Sample IDs whose segments are filtered out of `segments`.
# Every ID is its own string literal followed by a comma: two adjacent literals
# without a comma are concatenated by Python into a single bogus ID.
wrongswes = {
    'CDS-8Ut3sT','CDS-Ip02tY','CDS-Rd4nMx','CDS-TGTiB8','CDS-VnMBYD','CDS-YSRYLi','CDS-ZJh6UN','CDS-dgxjAa','CDS-0aJ4Yh','CDS-0lfqVz',
    'CDS-0pZb0j','CDS-1b1Hxk','CDS-1djAlo','CDS-1p2nnc','CDS-34hKv3','CDS-3EBt51','CDS-3M6Pq9','CDS-3WygAj','CDS-49azaP','CDS-4sr6RL',
    'CDS-5rD8XC','CDS-5wYxZS','CDS-6Yy3Yj','CDS-6da3hu','CDS-6l3V79','CDS-7PFldq','CDS-9XPgHB','CDS-9qDPiX','CDS-B0qAaq','CDS-CMenCH',
    'CDS-CuJ0f8','CDS-Dkl8OF','CDS-Eq9UNX','CDS-FRxdcH','CDS-HNytLD','CDS-Hj3xAa','CDS-IJnjkY','CDS-Ig6N9S','CDS-KQDgIV','CDS-KYkMDa',
    'CDS-KbbgMb','CDS-KgRznV','CDS-L0pDPl','CDS-M8xDMS','CDS-MLJbT2','CDS-MnF3x8','CDS-OCkOqy','CDS-ODmXrP','CDS-OgPf0h','CDS-PYw8ID',
    'CDS-PdUZxY','CDS-QHp4h4','CDS-QU7ftt','CDS-QVhVDT','CDS-QXBhht','CDS-SJq3p4','CDS-Sp18uD','CDS-TpDBjm','CDS-TyWjJs','CDS-UV1pVE',
    'CDS-UnDaBI','CDS-UtrDTK','CDS-W80jkV','CDS-WedVJA','CDS-WfjTcJ','CDS-X3c4UY','CDS-XQkXf4','CDS-XevQNc','CDS-YMIv9D','CDS-YYLKZ0',
    'CDS-agZcmk','CDS-bntBUl','CDS-cAEii6','CDS-cYWYp7','CDS-cyuMYb','CDS-d18Xie','CDS-dpub1O','CDS-eUqT7L','CDS-eowEZF','CDS-fXMRF9',
    'CDS-gIMBax','CDS-gRA4SM','CDS-iEULQm','CDS-ihI7Dp','CDS-iqPqOr','CDS-jqOvtj','CDS-kxNZ5S','CDS-leGxSD','CDS-nby0QM','CDS-no7ysz',
    'CDS-o4dXGr','CDS-oHu1Ik','CDS-pXMN9C','CDS-picEuX','CDS-qZsCuJ','CDS-rLRUbG','CDS-txTRwz','CDS-uQ8nnX','CDS-yPSmxb','CDS-0qPmaJ',
    'CDS-1PXzlf','CDS-1uWUTi','CDS-294bk6','CDS-2JxT1P','CDS-2LFZYm','CDS-2Q2Kia','CDS-2hGt1N','CDS-2lAFkD','CDS-2xSJmZ','CDS-3DHwSX',
    'CDS-3FueNQ','CDS-3VNhFC','CDS-3jIdRa','CDS-3mvYnW','CDS-3pZIvU','CDS-49xzNU','CDS-4BrJr7','CDS-4S6juQ','CDS-4ZOQQF','CDS-4l9BUT',
    'CDS-5H2go6','CDS-5IcijG','CDS-5LNjjI','CDS-5PXB9Y','CDS-5ViPeM','CDS-5bQzF2','CDS-5hbofu','CDS-6EyvRQ','CDS-6Fc0S5','CDS-6PZKz8',
    'CDS-6mq2Or','CDS-6xyqy9','CDS-75psAH','CDS-7JWzyA','CDS-7nEZFG','CDS-7rcFYn','CDS-83LhEq','CDS-8aHSii','CDS-8mpXJa','CDS-8sQWae',
    'CDS-8yHnJv','CDS-8z476r','CDS-96DdrP','CDS-9JpX07','CDS-9M8GNS','CDS-9sg0Pm','CDS-9u5DMn','CDS-9zidMf','CDS-AJMYsd','CDS-AOWMF3',
    'CDS-AjRIMt','CDS-Awmxa5','CDS-BRxHbu',
    'CDS-BnszE4','CDS-Bojgi7','CDS-C3hSav','CDS-C7o0op','CDS-CRPZeK','CDS-CZstO2','CDS-D6mIfI','CDS-DIckeT','CDS-DZMoWW','CDS-Eh7ost',
    'CDS-Eo5oAR','CDS-EpURcL','CDS-EzZEgz','CDS-Fz0HXE','CDS-G1sVsw','CDS-GINQfy','CDS-GnBdHN','CDS-H1oKTL','CDS-H4hPhD','CDS-HEoDm7',
    'CDS-HOVBCg','CDS-HjGCvC','CDS-HkZUmY','CDS-HoW111','CDS-Hv0i3y','CDS-Hw6KuA','CDS-Hx6zuD','CDS-I7bMcd','CDS-I97Uzq','CDS-IGOgCK',
    'CDS-Iu8c04','CDS-IzeN7a','CDS-J3jfZW','CDS-J6kDsZ','CDS-JMfP1M','CDS-JvOeJK','CDS-K2tTmq','CDS-Kswf83','CDS-LCfY0q','CDS-LNTGnh',
    'CDS-LOW19e','CDS-LUm1Vn','CDS-LVeuLY','CDS-LifesX','CDS-LnV7QY','CDS-M1sAGX','CDS-M8aV3P','CDS-MOOIHL','CDS-Md89va','CDS-MhXQX3',
    'CDS-N83rwD','CDS-NBnCDl','CDS-NPG23x','CDS-NXnWiI','CDS-NZsio7','CDS-NjunRu','CDS-O1ShTQ','CDS-O8dfj7','CDS-OLgoE4','CDS-OWJaXi',
    'CDS-OjLMVy','CDS-OnIxUL','CDS-OxQgBw','CDS-P79y6z','CDS-PHI8VT','CDS-PYWxsh','CDS-Pkk9e2','CDS-Pku96X','CDS-PyELSk','CDS-QE7bdY',
    'CDS-Qbfoau','CDS-Ql8GJZ','CDS-QtTdY6','CDS-QxeMJW','CDS-R3txwY','CDS-R6ehaT','CDS-RFBAY6','CDS-RWYJ02','CDS-RnsUHX','CDS-RxQhcq',
    'CDS-SO3AhH','CDS-SvzhGj','CDS-T10Uph','CDS-TCqSJW','CDS-TDblpN','CDS-TSDUCK','CDS-Twv1kD','CDS-Ty3mgt','CDS-UL1jLm','CDS-UVxUrF',
    'CDS-UfC2Dz','CDS-Uru0Mh','CDS-UvBswk','CDS-UxKEaK','CDS-V2ZEuP','CDS-V6Kk5q','CDS-VBr00g','CDS-VCuHjJ','CDS-WAPQGk','CDS-WHZolj',
    'CDS-WP95Oi','CDS-Ww1LC7','CDS-XJDBDj','CDS-Xgu4mi','CDS-XqaEOX','CDS-Y27yfi','CDS-YYd4ww','CDS-YnodyM','CDS-ZGlgTf','CDS-ZMsoXe',
    'CDS-aDUHcI','CDS-aGMcvr','CDS-aXqwpM','CDS-allHxr','CDS-awunD8','CDS-b9sdh9','CDS-bPT1F0','CDS-bdb5iE','CDS-bons31','CDS-c2Sowd',
    'CDS-cBOy2Z','CDS-cKMeDY','CDS-cMvnjL','CDS-ck9vpG','CDS-cmV75B','CDS-ctVpqU','CDS-dJqQ4g','CDS-dNVjOc','CDS-dPlJzz','CDS-dWHWU3',
    'CDS-eGQYXr','CDS-eZg4P8','CDS-fLsYaB','CDS-fRpNQH','CDS-frzvLf','CDS-fs8moU','CDS-g0KUGN','CDS-gCSYjV','CDS-gKIdjs','CDS-gsqqAz',
    'CDS-h4mOdz','CDS-hOI086','CDS-iKXYuH','CDS-iRstNJ','CDS-iX8vqU','CDS-ik526H','CDS-jHqXGP','CDS-kAARUi','CDS-kFiHZk','CDS-kt2Gne',
    'CDS-ktRRkc','CDS-l1OClV','CDS-lSpYo6','CDS-lTogDX',
    'CDS-ldrQm3','CDS-leyYAD','CDS-loy9vi','CDS-m49nRz','CDS-mGHY2S','CDS-mazUYU','CDS-mtMTts','CDS-n7Fqfe','CDS-nOKbmw','CDS-nTW67d',
    'CDS-nYIBWR','CDS-ocw0rP','CDS-ogUnWk','CDS-ohjYlg','CDS-opnGD7','CDS-qIc5x3','CDS-qP2MBQ','CDS-qUtkjN','CDS-qaOoHQ','CDS-qeIIoY',
    'CDS-qv2bpJ','CDS-r5Ym7C','CDS-rLadW7','CDS-rQIdNN','CDS-rQMY3G','CDS-rUs3FP','CDS-rVAuin','CDS-ragHOy','CDS-s7pOQR','CDS-sCWLGL',
    'CDS-sbwn0P','CDS-sieIuO','CDS-soTPPi','CDS-tORJC8','CDS-tPR3fn','CDS-tYXity','CDS-tgnRyK','CDS-u1AlUI','CDS-uGZguG','CDS-w7i5l7',
    'CDS-w8wJvh','CDS-wSV3OM','CDS-wWwBMZ','CDS-wbPtTZ','CDS-wlTAAF','CDS-wpXVQk','CDS-x21VqU','CDS-x7srFK','CDS-xCyamv','CDS-xI8ZAZ',
    'CDS-xIv1KJ','CDS-xKNh7Q','CDS-yCSYHi','CDS-ycD9px','CDS-ydPJEM','CDS-z8Bvmk','CDS-ziEOXJ','CDS-zwAn7G',
}

### Prioritization

Add columns to the seg file with the arxspan ID and version, and only keep the newest version for any given arxspan ID.
The process to keep the newest version of any given line is a little different from 20Q2 onwards, because we don't have any dataset that uses the CDS-IDs for the data from 20Q1 or earlier.

We have to download the Taiga datasets from the previous quarter, see if we have any arxspan IDs with new data, and then replace with that data. We use the function called "removeOlderVersions" to do this.

In [None]:
renaming = removeOlderVersions(names=set(segments.DepMap_ID.tolist()), refsamples=ccle_refsamples[ccle_refsamples.datatype=="wes"], arxspan_id="arxspan_id", version="version")

## Post-processing

The post-processing happens in R using Guillaume's functions. In brief:

- processSegments
- filterForCCLE
- interpolateGapsInSegmented
- extendEndsOfSegments
- reprioritizeData

In [None]:
notWESnotlegacy = notWES - set(legacy_segments.DepMap_ID)
%store notWESnotlegacy
notWESnotlegacy

In [71]:
server = BiomartServer(ensemblserver)
ensmbl = server.datasets['hsapiens_gene_ensembl']
server.show_databases()

{'ENSEMBL_MART_ENSEMBL': Ensembl Genes 102,
 'ENSEMBL_MART_FUNCGEN': Ensembl Regulation 102,
 'ENSEMBL_MART_GENOMIC': Genomic features 102,
 'ENSEMBL_MART_MOUSE': Mouse strains 102,
 'ENSEMBL_MART_ONTOLOGY': Ontology,
 'ENSEMBL_MART_SEQUENCE': Sequence,
 'ENSEMBL_MART_SNP': Ensembl Variation 102}


In [64]:
server = BiomartServer( "http://www.nov2020.archive.ensembl.org/biomart" )
ensmbl = server.datasets['hsapiens_gene_ensembl']
server.show_databases()



In [69]:
genemapping[genemapping.gene_biotype=="protein_coding"]

Unnamed: 0_level_0,clone_based_ensembl_gene,hgnc_symbol,gene_biotype,entrezgene_id,start,end,chr
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000198888,,MT-ND1,protein_coding,4535.0,3307,4262,MT
ENSG00000198763,,MT-ND2,protein_coding,4536.0,4470,5511,MT
ENSG00000198804,,MT-CO1,protein_coding,4512.0,5904,7445,MT
ENSG00000198712,,MT-CO2,protein_coding,4513.0,7586,8269,MT
ENSG00000228253,,MT-ATP8,protein_coding,4509.0,8366,8572,MT
...,...,...,...,...,...,...,...
ENSG00000189181,,OR14I1,protein_coding,401994.0,248681322,248682328,1
ENSG00000175137,,SH3BP5L,protein_coding,80851.0,248810446,248825915,1
ENSG00000171161,,ZNF672,protein_coding,79894.0,248838210,248849517,1
ENSG00000171163,,ZNF692,protein_coding,55657.0,248850006,248859144,1


In [67]:
gene_rename

{'ENSG00000210049': 'MT-TF (ENSG00000210049)',
 'ENSG00000211459': 'MT-RNR1 (ENSG00000211459)',
 'ENSG00000210077': 'MT-TV (ENSG00000210077)',
 'ENSG00000210082': 'MT-RNR2 (ENSG00000210082)',
 'ENSG00000209082': 'MT-TL1 (ENSG00000209082)',
 'ENSG00000198888': 'MT-ND1 (ENSG00000198888)',
 'ENSG00000210100': 'MT-TI (ENSG00000210100)',
 'ENSG00000210107': 'MT-TQ (ENSG00000210107)',
 'ENSG00000210112': 'MT-TM (ENSG00000210112)',
 'ENSG00000198763': 'MT-ND2 (ENSG00000198763)',
 'ENSG00000210117': 'MT-TW (ENSG00000210117)',
 'ENSG00000210127': 'MT-TA (ENSG00000210127)',
 'ENSG00000210135': 'MT-TN (ENSG00000210135)',
 'ENSG00000210140': 'MT-TC (ENSG00000210140)',
 'ENSG00000210144': 'MT-TY (ENSG00000210144)',
 'ENSG00000198804': 'MT-CO1 (ENSG00000198804)',
 'ENSG00000210151': 'MT-TS1 (ENSG00000210151)',
 'ENSG00000210154': 'MT-TD (ENSG00000210154)',
 'ENSG00000198712': 'MT-CO2 (ENSG00000198712)',
 'ENSG00000210156': 'MT-TK (ENSG00000210156)',
 'ENSG00000228253': 'MT-ATP8 (ENSG00000228253)',
 

In [None]:
# drop the first three characters of every contig name (presumably the 'chr' prefix)
segments['Chromosome'] = [contig[3:] for contig in segments['Chromosome']]
# reverting the log2 transform of GATK
segments['Segment_Mean'] = 2 ** segments['Segment_Mean']
for coord in ('Start', 'End'):
    segments[coord] = segments[coord].astype(int)
# setting sex genes to half of their value for it to match relative concentration of other genes.
sex_rows = segments.index[segments.Chromosome.isin(['X', 'Y'])]
segments.loc[sex_rows, 'Segment_Mean'] = segments.loc[sex_rows, 'Segment_Mean'] / 2
segments = segments.sort_values(by=['DepMap_ID', 'Chromosome', 'Start', 'End'])
genemapping = genemapping.sort_values(by=['Chromosome', 'start', 'end'])

In [None]:
# TODO: check on IGV maxvalue

## Validation step

Once the files are saved, we load them back in python and do some validations, in brief:

- mean,max,var...
- to previous version: same mean,max,var...
- checkAmountOfSegments: flag any samples with a very high number of segments
- checkGeneChangeAccrossAll: flag any genes which stay at a similar value across all samples

In [None]:
len(segments[segments.Segment_Mean>1000])

In [None]:
# Build the gene-level matrix from the gap-managed segments and flag genes
# that barely change across samples.
gapmergedsegs = manageGapsInSegments(segments)
# use `genemapping`, the gene table the preprocessing cell sorts;
# NOTE(review): the previous spelling `gene_mapping` is not bound anywhere in
# the visible notebook -- confirm no other cell defines it on purpose.
genecn = toGeneMatrix(gapmergedsegs, genemapping)
checkGeneChangeAccrossAll(genecn, thresh=0.025)

In [None]:
genecn.values.min(), genecn.values.mean(), genecn.values.max()

In [None]:
failed = checkAmountOfSegments(segments,thresh = 2000)
failed

In [None]:
# reparing QC when we have a better duplicate
ref=pd.DataFrame(ccle_refsamples[ccle_refsamples.datatype=="wes"]['arxspan_id'])
replace={}
for val in failed:
    if val in list(renaming.keys()):
        a = ref[ref.arxspan_id==ref.loc[val].arxspan_id].index
        for v in a:
            if v not in failed:
                replace.update({val:v})
                break
print(len(replace), len(failed))
for k, val in replace.items():
    renaming[val] = renaming.pop(k)
wesfailed = set(failed) - set(replace.keys())
%store wesfailed

In [None]:
%store renaming

In [None]:
%store -r wesfailed

In [None]:
for i, (k, val) in enumerate(refwm.get_samples().loc[refwm.get_sample_sets().loc["all"].samples].iterrows()):
    if i>100:
        continue
    plot = val["modeled_segments_plot_tumor"]
    ! gsutil cp $plot temp/
    print(k)
    print(val['arxspan_id'], val['sex'])
    display(Image('temp/'+plot.split('/')[-1]))

These look bad in 20Q1: 
ACH-002511 (M140325), ACH-001370 (OCIP5X)

These CN plots subjectively appear to have too many segments in new 20Q2 samples: 
ACH-002399 (CDS-sukIAT, 21NT_1), ACH-002401 (CDS-tVy3GF, 21MT2_1), ACH-002400 (CDS-VUHMHG, 21MT1_1)

In [None]:
prevgenecn = tc.get(name='internal-20q3-00d0', file='CCLE_gene_cn')

In [None]:
# getting the previous versions to check that we have everything we should
#prevgenecn = tc.get(name='depmap-a0ab', file='CCLE_gene_cn')
prevgenecn = tc.get(name='internal-20q3-00d0', file='CCLE_gene_cn')

prevsegments = tc.get(name='depmap-a0ab', file='CCLE_segment_cn')
prev = set(prevgenecn.index.tolist())
prevgenecn.max().max()

### Comparison to replicates

### Finding mismatches

In [None]:
closest = findClosestMatching(genecn, CCLE_gene_cn, True)

In [None]:
closest

In [None]:
# for each replicats, if it is not what it is supposed to be, will print other replicates that exist for this cell lines, and print what it seems to be vs what it is supposed to be
issues = []
for k,v in closest.items():
    if ccle_refsamples.loc[k,'arxspan_id'] != v:
        print(k)
        print(ccle_refsamples[(ccle_refsamples.index!=k) & (ccle_refsamples.arxspan_id==v) & (ccle_refsamples.datatype=='wes')].index)
        print(v,ccle_refsamples.loc[k,'arxspan_id'])
        issues.append(k)

In [None]:
issues

In [None]:
match, corr =findClosestMatching(genecn, prevgenecn.loc[['ACH-000123',]], True, returncorr=True)

In [None]:
# Re-assign the listed WES samples to the given arxspan IDs.
# The mapping previously listed "CDS-b5ElTm" twice (same value); a dict keeps
# only one entry per key, so the duplicate line is dropped.
# NOTE(review): "CDS-Ckptje" is mapped to "ACH-002291" here, whereas a later
# `renaming.update(...)` cell maps it to "ACH-001672" -- confirm which is right.
ccle_refsamples = changeCellLineName(ccle_refsamples, datatype = "wes", dupdict={
"CDS-b5ElTm": "ACH-000157",
"CDS-up4Vo5": "ACH-000662",
"CDS-CWA37D": "ACH-000825",
"CDS-CCAK2f": "ACH-001328",
"CDS-2jBQ8n": "ACH-000757",
"CDS-T8W6P4": "ACH-000398",
"CDS-9TDVpH": "ACH-000685",
"CDS-dQKiht": "ACH-000375",
"CDS-Ckptje": "ACH-002291",
"CDS-ljFuDX": "ACH-001339",
"CDS-5x4qLj": "ACH-000608",
"CDS-UxJcOY": "ACH-000561",
"CDS-TUYedU": "ACH-000261",
"CDS-RLVrVE": "ACH-001523",
"CDS-6liik0": "ACH-000561",
"CDS-u9hZ60": "ACH-000077",
"CDS-NUlX3d": "ACH-000458",
"CDS-2HO10g": "ACH-000278"})

In [None]:
ccle_refsamples = cleanVersions(ccle_refsamples)

In [None]:
# Walk the rows of `corr` (one row per sample) and rebuild `issues` from them.
iss=[]
for k,v in corr.iterrows():
    print(k, v.mean())
    try:
        # value of this row in the column named after the sample's annotated arxspan_id
        # NOTE(review): rows below 0.75 are printed and then skipped (`continue`),
        # so they are NOT appended to `iss`; `issues` ends up holding the rows that
        # did not take this branch -- confirm this polarity is intended.
        if v[ccle_refsamples.loc[k,'arxspan_id']] < 0.75:
            print(v[[closest[k],ccle_refsamples.loc[k,'arxspan_id']]])
            continue
    except:
        # NOTE(review): bare except -- presumably meant for a missing label
        # (KeyError), but it also hides any other error raised above.
        # positions of the five largest values of the row, in ascending order
        a = np.argsort(v.values)[-5:]   
        # the row's best value is above 0.8: print the top five and skip the sample
        if  v.values[a[-1]]>0.8:
            print(ccle_refsamples.loc[k,'arxspan_id'], corr.columns[a], v.values[a])
            continue
    iss.append(k)
issues = iss

In [None]:
# For every flagged sample that is a key of `renaming`, hand its entry over to
# another WES sample with the same arxspan_id that is not flagged itself.
# Flagged samples without such a replacement are kept in `issues`.
iss=[]
for val in issues:
    if val in renaming:
        v = ccle_refsamples.loc[val,'arxspan_id']
        a = ccle_refsamples[(ccle_refsamples.index!=val) & (ccle_refsamples.arxspan_id==v) & (ccle_refsamples.datatype=='wes')].index.tolist()
        a = [e for e in a if e not in issues]
        if len(a)>0:
            # re-key the entry onto the replacement sample; assigning the popped
            # value to `a[0]` would only overwrite the scratch list and drop the
            # cell line from `renaming` altogether
            renaming[a[0]] = renaming.pop(val)
        else:
            iss.append(val)
issues = iss

In [None]:
%store issues
issues

In [None]:
prevgenecn

### removing duplicates

In [None]:
unmatched

In [None]:
# Map every sample ID that has a row in `genecn` to its arxspan_id.
# (`Series.iteritems()` no longer exists in pandas 2; `to_dict()` builds the same mapping.)
lis = ccle_refsamples.loc[ccle_refsamples.index.isin(set(genecn.index)), 'arxspan_id'].to_dict()

In [None]:
ge = np.log2(1+genecn[cols])
ce = CCLE_gene_cn[cols]
prev = ce.index.tolist()
corr={val: {} for val in set(prev) & set(CCLE_gene_cn.index)}
for k,v in lis.items():
    if v in prev:
        corr[v][k] = spearmanr(ge.loc[k],ce.loc[v])[0]

In [None]:
# For each entry of `corr`, gather the keys whose value is one of the values
# reported by `h.dups`; groups with more than one key are kept in `tomerge`.
tomerge = []
for k, v in corr.items():
    a = [l for i in h.dups(v.values()) for l, w in v.items() if w == i]
    if len(a) > 1:
        tomerge.append(a)
len(tomerge)

In [None]:
segments[segments.DepMap_ID=="CDS-A7rsOJ"]

In [None]:
rerenaming['ACH-002359']

In [None]:
corr

In [None]:
for k,v in corr.items():
    if v<0.7:
        print(k)
        print(set(priosegments[priosegments.DepMap_ID==k].Source), set(prevsegments[prevsegments.DepMap_ID==k].Source))

In [None]:
tomerge

In [None]:
wesdup= np.array(tomerge)
%store wesdup

In [None]:
gcp.rmFiles(ccle_refsamples[ccle_refsamples.index.isin(np.array(tomerge)[:,1])][['internal_bam_filepath', 'internal_bai_filepath', 'legacy_bam_filepath', 'legacy_bai_filepath']].values.ravel())

In [None]:
# TODO: replace in the renaming

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [None]:
ccle_refsamples = ccle_refsamples.drop(np.array(tomerge)[:,1])

In [None]:
ccle_refsamples = cleanVersions(ccle_refsamples)

In [None]:
#dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)
copyToWorkspace("broad-firecloud-ccle/DepMap_WES_CN_hg38", ccle_refsamples, deleteUnmatched=True)
copyToWorkspace("broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline", ccle_refsamples, deleteUnmatched=True)

### saving replicates

In [None]:
wesfailed

In [None]:
renaming

In [None]:
segments = segments[~segments.DepMap_ID.isin(set(wesfailed) | set(wesdup[:,1]))]
genecn = genecn[~genecn.index.isin(set(wesfailed) | set(wesdup[:,1]))]

In [None]:
for v in set(segments.DepMap_ID):
    segments.loc[segments[segments.DepMap_ID==v].index,'Source']= ccle_refsamples[ccle_refsamples.index==v].source.values[0]

In [None]:
segments.Source = segments.Source.replace({'CCLF':'Broad WES', 'CHORDOMA':'Chordoma WES', 'SANGER':'Sanger WES', 'IBM':'Broad WES', np.nan:'Broad WES', 'DEPMAP':'Broad WES', 'IBM WES': "Broad WES", 'Broad CCLF':"Broad WES"})

In [None]:
segments.to_csv('temp/segments_allWES_withreplicates_'+samplesetname+'.csv', index=False)
genecn.to_csv('temp/gene_cn_allWES_withreplicates_'+samplesetname+".csv")

In [None]:
(set(renaming.keys()) - set(segments.DepMap_ID)) - wesfailed

In [None]:
segments = pd.read_csv('temp/segments_allWES_withreplicates_'+samplesetname+'.csv')
genecn = pd.read_csv('temp/gene_cn_allWES_withreplicates_'+samplesetname+".csv", index_col=0)

### Comparison to prioritized

In [None]:
# getting the other version if necessary, because the other on needs to be removed
for val in wesdup:
    if val[1] in renaming:
        renaming[val[0]] = renaming.pop(val[1])
        
# getting another version that did not fail
for val in wesfailed:
    v = ccle_refsamples[(ccle_refsamples.arxspan_id == ccle_refsamples.loc[val].arxspan_id[0]) & (ccle_refsamples.datatype=='wes')].index
    if len(v)>1:
        for k in v:
            if k != val:
                renaming[k] = renaming.pop(val)
    else:
        try:
            renaming.pop(val)
        except:
            print('already removed')
%store renaming

In [None]:
set(["ACH-000274",
"ACH-002446",
"ACH-000833",
"ACH-001151",
"ACH-001955",
"ACH-000757",
"ACH-000511",
"ACH-001321",
"ACH-000473",
"ACH-001605",
"ACH-001957"]) - set(priosegments.DepMap_ID)

In [None]:
renaming.update({"CDS-Ckptje": "ACH-001672",
"CDS-pgDmZb": "ACH-002291"})

In [None]:
priosegments = segments[segments.DepMap_ID.isin(set(renaming.keys()))].replace(renaming)
priogenecn = genecn[genecn.index.isin(set(renaming.keys()))].rename(index=renaming)

In [None]:
# Per cell line, correlate log2(1 + gene CN) of the new prioritized matrix with
# CCLE_gene_cn over their shared genes, and print the lines correlating below 0.3
# together with the segment sources on both sides.
# The shared labels are kept as lists: pandas 2 rejects sets as indexers.
cols = list(set(priogenecn.columns) & set(CCLE_gene_cn.columns))
ge = np.log2(1+priogenecn[cols])
ce = CCLE_gene_cn[cols]
ind = list(set(priogenecn.index) & set(CCLE_gene_cn.index))
corr={}
for val in ind:
    corr[val] = pearsonr(ge.loc[val],ce.loc[val])[0]
for k,v in corr.items():
    if v<0.3:
        print(k)
        print(set(priosegments[priosegments.DepMap_ID==k].Source), set(prevsegments[prevsegments.DepMap_ID==k].Source))

In [None]:
for k,v in corr.items():
    if v<0.3:
        print(k)
        print(set(priosegments[priosegments.DepMap_ID==k].Source), set(CCLE_segment_cn[CCLE_segment_cn.DepMap_ID==k].Source))

In [None]:
a = np.array(list(corr.values()))
sns.kdeplot(a)

In [None]:
sns.scatterplot(x=ge.loc[ind].values.ravel()[:100000],y=ce.loc[ind].values.ravel()[:100000],)

In [None]:
sns.kdeplot(data=np.array([ge.loc[ind].values.ravel()[:100000], ce.loc[ind].values.ravel()[:100000]]).T, fill=True)

### Saving prioritized

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
normals = ccle_refsamples[ccle_refsamples['primary_disease']=='normal'].index.tolist()
#priosegments = priosegments[~priosegments.DepMap_ID.isin(normals)]
#priogenecn = priogenecn.drop(index=normals)

In [None]:
#h.compareDfs(priosegments, tc.get(name='depmap-a0ab', file='CCLE_segment_cn'))
h.compareDfs(priogenecn, tc.get(name='depmap-a0ab', file='CCLE_gene_cn'))

In [None]:
priosegments.to_csv("temp/segments_allWES_latest_"+samplesetname+".csv", index=False)
priogenecn.to_csv('temp/gene_cn_allWES_latest_'+samplesetname+".csv")

In [None]:
priosegments= pd.read_csv("temp/segments_allWES_latest_"+samplesetname+".csv")
priogenecn = pd.read_csv('temp/gene_cn_allWES_latest_'+samplesetname+".csv", index_col=0)

looking for correlation issues to previous releases

In [None]:
a = set(priogenecn.columns) & set(prevgenecn.columns)

In [None]:
corr = np.corrcoef(priogenecn)

In [None]:
ind = priogenecn.index.tolist()

In [None]:
a = [(ind[val[0]], ind[val[1]]) for val in np.argwhere(corr>0.96) if val[0]!=val[1]]

In [None]:
len(a)

In [None]:
['ACH-000131','ACH-000125','ACH-001093','ACH-000284','ACH-000340', 'ACH-000214', 'ACH-001767','ACH-000240','ACH-000154','ACH-000063','ACH-000165']

In [None]:
[val for val in a if val[1] not in ['ACH-000131','ACH-000125','ACH-001093','ACH-000284','ACH-000340', 'ACH-000214', 'ACH-001767','ACH-000240','ACH-000154','ACH-000063','ACH-000165']]

### saving samples version used for the release

In [None]:
# Reload the sample tracker, reset flags for the samples processed in this run,
# mark the samples used for this release and the low-quality ones, and write
# the table back to the Google sheet.
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
# list, not set: pandas 2 rejects set indexers in `.loc`
loc = list(set(wes_res.index) & set(ccle_refsamples.index))
# NOTE(review): columns[-14:-1] are presumably the per-release flag columns -- confirm the slice still matches the sheet layout
for i in ccle_refsamples.columns[-14:-1]:
    ccle_refsamples.loc[loc,i] = 0
ccle_refsamples[samplesetname]=0
ccle_refsamples.loc[list(renaming.keys()),samplesetname]=1
ccle_refsamples.loc[failed,'low_quality']=1
dfToSheet(ccle_refsamples,'ccle sample tracker', secret=creds)

# Upload to taiga

- we load the blacklisted/embargoed sample ids
- we log2 transform and create a file for each release (and one containing everything)
- we upload the files using taigapy in a corresponding taiga dataset with the corresponding description and also upload it to its virtual dataset

## we push full dataset version in depmap taiga CN

In [None]:
# Upload the WES/WGS segment and gene-level files of this release as a new
# version of the "cn-latest-d8d4" taiga dataset.
# The description is a raw string: it contains backslashes (`/!\ `, `\_`) that
# are meant literally and are invalid escape sequences in a normal literal.
tc.update_dataset(dataset_permaname="cn-latest-d8d4", 
                  upload_file_path_dict={
    'temp/segments_allWES_latest_'+samplesetname+'.csv': 'TableCSV',
    'temp/gene_cn_allWES_latest_'+samplesetname+".csv": 'NumericMatrixCSV',
    'temp/segments_allWES_withreplicates_'+samplesetname+'.csv': 'TableCSV',
    'temp/gene_cn_allWES_withreplicates_'+samplesetname+".csv": 'NumericMatrixCSV',
    'temp/gene_cn_all_merged_'+samplesetname+".csv":"NumericMatrixCSV",
    'temp/segments_allWGS_withreplicates_'+samplesetname+".csv":"TableCSV",
    'temp/gene_cn_allWGS_withreplicates_'+samplesetname+".csv":"NumericMatrixCSV"},
                  changes_description=
"""
""",
#"adding:"+len(new)+"lines and removed"+str(removed)+" and adding WGS data! with 29 new lines",
                  dataset_description=r"""
# Copy Number

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE CN data.

# Notations:

all: everything

allWES: all data comes from the WExomeS samples we posses

allWGS: all data comes from the WGenomeS samples we posses

withreplicates: if we have two different sequencing from a sample, we kept both, see the depmap sample tracker for annotations [https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE](https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE). this dataset is more geared toward QC or in-depth analysis of a particular cell line.

merged: everything from both WGS and WES

latest: only the latest sequencing versions of the samples were kept


Gene level CN data:

__Rows__: cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Segment\_Mean, Num\_Probes, Calls""")