# preprocessing achilles

In [5]:
from __future__ import print_function
import pandas as pd
import numpy as np
import sys

sys.path.insert(0, '..')

from JKBio import Helper as h
from taigapy import TaigaClient
from JKBio.helper.google_sheet import GSheet
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import pearsonr,spearmanr

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()

my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
sheets = Sheets.from_files(my_id, mystorage_id)
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## boot up

we are instanciating all the parameters needed for this pipeline to run

In [6]:
samplesetname = "20Q4"
virtual_public='public-20q3-3d35'
virtual_dmc='dmc-20q3-033d'
virtual_internal='internal-20q3-00d0'

prevname="20Q2"
prevversion=46
release=samplesetname

## Do some checks and manual rescuing

In [7]:
legacy_segments = tc.get(name='depmap-wes-cn-data--08f3', file='legacy_segments')
legacy_segments = legacy_segments.drop(columns='Unnamed: 0')
legacy_segments['Status']='U'
prevgenecn = tc.get(name='segmented-cn-wes-prioritzed-7fe1', file='wes.'+prevname+'.gene', version=prevversion) 
prev = prevgenecn.index.tolist()
priosegments = pd.read_csv("temp/segments_allWES_latest_"+samplesetname+".csv")
priogenecn = pd.read_csv('temp/gene_cn_allWES_latest_'+samplesetname+".csv", index_col=0)

In [9]:
onlyinleg = set(legacy_segments.DepMap_ID) - set(priosegments.DepMap_ID)
samegenes = set(prevgenecn.columns) & set(priogenecn.columns)
onlyinleg

{'ACH-000003',
 'ACH-000014',
 'ACH-000016',
 'ACH-000034',
 'ACH-000044',
 'ACH-000049',
 'ACH-000057',
 'ACH-000064',
 'ACH-000068',
 'ACH-000071',
 'ACH-000077',
 'ACH-000088',
 'ACH-000164',
 'ACH-000170',
 'ACH-000179',
 'ACH-000185',
 'ACH-000194',
 'ACH-000195',
 'ACH-000216',
 'ACH-000229',
 'ACH-000230',
 'ACH-000299',
 'ACH-000300',
 'ACH-000306',
 'ACH-000333',
 'ACH-000398',
 'ACH-000413',
 'ACH-000454',
 'ACH-000479',
 'ACH-000494',
 'ACH-000526',
 'ACH-000539',
 'ACH-000550',
 'ACH-000561',
 'ACH-000575',
 'ACH-000578',
 'ACH-000600',
 'ACH-000612',
 'ACH-000632',
 'ACH-000635',
 'ACH-000642',
 'ACH-000659',
 'ACH-000685',
 'ACH-000690',
 'ACH-000705',
 'ACH-000710',
 'ACH-000731',
 'ACH-000737',
 'ACH-000739',
 'ACH-000742',
 'ACH-000842',
 'ACH-000850',
 'ACH-000854',
 'ACH-000870',
 'ACH-000904',
 'ACH-000923',
 'ACH-000928',
 'ACH-000931',
 'ACH-000982',
 'ACH-001000',
 'ACH-001015',
 'ACH-001017',
 'ACH-001018',
 'ACH-001036',
 'ACH-001037',
 'ACH-001042',
 'ACH-0010

In [10]:
priogenecn.shape

(1697, 27562)

In [None]:
## only gettinng good correlation samples

## Achilles repriorization

In [22]:
toreplace = []
corr = {}
ge = priogenecn[samegenes]
pre = (2**prevgenecn[samegenes])-1
for sample in set(prevgenecn.index) & set(priogenecn.index):
        c = ge.loc[sample].values
        b = pre.loc[sample].values
        corr[sample] = pearsonr(c,b)[0]   
for k, val in corr.items():
    if val<0.85:
        toreplace.append(k)

In [23]:
len(toreplace)/len(corr)

0.6015625

In [67]:
mergedsegments = pd.concat([mergedsegments, legacy_segments[legacy_segments.DepMap_ID=='ACH-002335']])

In [68]:
mergedgenecn.loc['ACH-002335'] = prevgenecn.loc['ACH-002335']

In [28]:
mergedsegments = priosegments[~priosegments.DepMap_ID.isin(toreplace)].append(legacy_segments[legacy_segments.DepMap_ID.isin(list(onlyinleg)+toreplace)]).reset_index(drop=True)
mergedgenecn = priogenecn[~priogenecn.index.isin(toreplace)].append(prevgenecn.loc[list(onlyinleg)+toreplace][samegenes])

In [29]:
new1 = set(mergedgenecn.index.values.tolist())
new2 = set(mergedsegments['DepMap_ID'].values.tolist())
print(len(prev), len(set(prev) & new1), len(new1), len(new1 & new2))
new = new1-set(prev)
removed = set(prev)-new1
new,removed

1767 1761 1794 1794


({'ACH-000010',
  'ACH-001210',
  'ACH-001227',
  'ACH-001293',
  'ACH-001349',
  'ACH-001434',
  'ACH-001437',
  'ACH-001438',
  'ACH-001449',
  'ACH-001493',
  'ACH-001502',
  'ACH-001512',
  'ACH-001537',
  'ACH-001662',
  'ACH-001669',
  'ACH-001672',
  'ACH-001676',
  'ACH-001686',
  'ACH-001693',
  'ACH-001696',
  'ACH-001707',
  'ACH-001708',
  'ACH-001756',
  'ACH-001758',
  'ACH-001759',
  'ACH-001760',
  'ACH-001854',
  'ACH-001855',
  'ACH-001971',
  'ACH-002010',
  'ACH-002055',
  'ACH-002138',
  'ACH-002392'},
 {'ACH-001189',
  'ACH-002303',
  'ACH-002315',
  'ACH-002335',
  'ACH-002341',
  'ACH-002359'})

### Adding WGS

In [33]:
wgssegments = pd.read_csv("temp/segments_allWGS_latest_"+samplesetname+".csv")
wgsgenecn = pd.read_csv('temp/gene_cn_allWGS_latest_'+samplesetname+".csv",index_col=0)

In [31]:
wgssegments.Segment_Mean.max()

1481.0390861476

In [34]:
wgsgenecn.max().max()

300.4713399641696

In [35]:
mergedgenecn = wgsgenecn.append(mergedgenecn[~mergedgenecn.index.isin(set(wgsgenecn.index))])
mergedsegments = wgssegments.append(mergedsegments[~mergedsegments.DepMap_ID.isin(set(wgssegments.DepMap_ID))])

In [36]:
mergedgenecn.loc["ACH-002875"] = mergedgenecn.loc["ACH-000614"]
mergedgenecn.loc["ACH-002874"] = mergedgenecn.loc["ACH-000219"]

In [37]:
additional = mergedsegments[mergedsegments.DepMap_ID.isin(["ACH-000614","ACH-000219"])]
additional.loc[additional[additional.DepMap_ID == "ACH-000614"].index,'DepMap_ID'] = "ACH-002875"
additional.loc[additional[additional.DepMap_ID == "ACH-000219"].index, 'DepMap_ID'] = 'ACH-002874'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [41]:
mergedsegments = pd.concat([mergedsegments,additional])

In [42]:
mergedgenecn = mergedgenecn.apply(lambda x: np.log2(1+x))
mergedsegments = mergedsegments[['DepMap_ID', 'Chromosome', 'Start', 'End', 'Segment_Mean', 'Num_Probes', 'Status', 'Source']]

In [74]:
if(mergedgenecn.values.max() > 100):
    print("\n\n\nTOO HIGH, not LOG2 transformed!")
if(len(mergedgenecn.index) > len(set(mergedgenecn.index))):
    print("Duplicate CL, not reprioritized well!")

In [71]:
len(mergedgenecn)

1806

In [69]:
mergedgenecn.to_csv('temp/all_'+release+'_gene_cn.csv')
mergedsegments.to_csv('temp/all_'+release+'_segment.csv', index=False)

# uploading on taiga

## CN

In [45]:
tc.update_dataset(dataset_permaname="cn-wes-achilles-4dcd",
                 upload_file_path_dict={
            'temp/all_'+release+'_segment.csv': 'TableCSV',
            'temp/all_'+release+'_gene_cn.csv': "NumericMatrixCSV"}, 
                 changes_description=
"""
adding new lines:
"""+str(new)+"""

removed a couple of lines that were wrong:
"""+str(removed),
                  dataset_description="""
# Copy Number

Combined segment and gene-level CN calls from Broad WES, Sanger WES, and Broad SNP. Relative CN, log2(x+1) transformed.

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal. Must use subsetted dataset instead. These data will not make it on the portal starting 19Q1. With the DMC portal, there is new cell line release prioritization as to which lines can be included, so a new taiga dataset will be created containing CN for the portal.

These data are generated for Achilles to pull from to run CERES.

Versions to use:

v45 for 20Q2 (For Achilles QC, use v44 of segmentcn)
v40 for 20Q1
v38 for 19Q4
v33 for 19Q4
v25 for 19Q2 (hg38 aligned, Broad WES and Sanger WES based calls were generated from bam realignment. SNP based calls are still from liftover). The gene mapping script was updated to improve the gene level matrix (to remove NAs). The segment level matrix is untransformed relative CN. Gene level matrix is log2(CN + 1).
v20 for 19Q1 (version 21 is hg19)
v18: for 18Q4
v15: for 18Q3
v11: for 18Q2
Gene-level matrix in versions below 10 were using hg38 and not hg19. Version 11 is corrected and should be used instead

Calls on X, Y chromosome for profiles should not be used.

Prioritization is as follows:

Broad WES kept over everything
Sanger WES kept if:
This cell line did not fail fingerprinting
This cell line has no other CN data
This cell lines does not have CRISPR LFC data from the Achilles screen
This CN profile correlates better with Achilles CRISPR LFC data than Broad SNP CN OR the % gene-level difference between this cell lines CN profile from Sanger WES and Broad SNP < 2.5%
Broad SNP used for remaining lines with no Broad WES or with Sanger WES that does not pass the criteria above
The 'Source' column indicates which CN profile was used for that cell line.

version 6: renamed Sample column to CCLE_name for consistency for the Achilles pipeline

version 7: missing chordoma lines

version 8: fixed to names of two chordoma lines (changed suffix from CHORDOMA -> BONE) and removed renamed 
cell lines that were duplicated (with different names). Reran comparison using 18q2 LFC results. Gene level matrix will be generated for version 9

version 11: corrected error in gene-level matrix calculation (previously had been aligned to hg38 however alignment should be hg19). Segment level calls are unaffected.

versions 12-14: Sanger WES were multiplied by 2 so should not be used

version 15: internal segments and gene level matrices for 18q3 release including public version (removed black list lines and Broad WES < 6 months old). Gene level matrices are indexed using Broad IDs.

version 16: internal segments and gene level matrices for 18q4 release including public version (uses all SNP and only WES if those lines are present in the 18Q4 public Achilles dataset)

version 17: same as version 16 but with two additional line in the internal version

version 18: same as version 17 but switched one line in public to use SNP instead of WES because not in public Avana

version 20: two major changes occurred (1) we are using a FireCloud based pipeline for CN calling now for Broad WES data (2) we have moved to use hg38. This is accomplished by lifting over coordinates from hg19 to hg38 after processed by the CN pipeline.
v21 same as version 20, but we are using the original hg19 coordinates, not hg38

version 25: Broad WES and Sanger WES were realigned to hg38. SNP still uses liftover from hg19 to hg38

version 35: Seeing what went wrong with the upload.

version 36: problem with not log2 transforming the data

version 37: resolving the problem with log2 transforming the segment data

version 38: resolving the problem with log2 transforming the segment data

version 39: 20Q1. Samples ACH-002511 (M140325) and ACH-001370 (OCIP5X) appear to have too many segments looking at the CN profile.

version 40: unlog2 transforming segmentcn

version 41: 20Q2 (segmentcn is just relative copy number, whereas the genecn is log2(x+1) transformed). Added 7 new samples.
These CN plots subjectively appear to have too many segments in new 20Q2 samples: ACH-002399 (CDS-sukIAT, 21NT\_1), ACH-002401 (CDS-tVy3GF, 21MT2\_1), ACH-002400 (CDS-VUHMHG, 21MT1\_1)

version 42: **note: version 42 is missing some of the cell lines. Do not use** 

version 43: Resolving issue of no DepMap ID index in the genecn file. Duplicating the CN data in genecn and segmentcn for ACH-000219 so we have CN data for ACH-002874, the same cell line grown in different media. This step is required for Achilles / CERES.

version 44: Removing duplication of ACH-000219 from genecn file. The Achilles QC only needs the duplication in the segmentcn file. This change results in 1767 unique DepMap IDs in the genecn file, and 1768 unique DepMap IDs in the segmentcn file.

version 45: Removing duplication of ACH-000219 from segmentcn file to prevent issues in future releases, which use this Taiga dataset in the process of determining which lines should be released to Public. Now both the genecn file and the segmentcn file have 1767 unique DepMap IDs.

version 46: removing two weird undefined lines 

version 47: 20Q4 data with WGS now!

version 48: solving issues with log transform

Gene level CN data:

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean""")

Uploading all_20Q4_segment...
hitting https://cds.team/taiga/api/datafile/ff78dac9bec14a5fa1831147be4f2ae4
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3

'09439ce24ed841ef84afd1cfea7ad936'