# preprocessing achilles

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import sys

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import terra
from JKBio.utils import helper as h
from JKBio.google import gcp
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm
from JKBio.google.google_sheet import dfToSheet

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import pearsonr,spearmanr

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display
from matplotlib import pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
# Taiga client; used below through tc.get(...) to fetch earlier datasets and
# through tc.update_dataset(...) to upload the results.
tc = TaigaClient()
# NOTE(review): output_notebook arrives through one of the star imports above
# (presumably bokeh.plotting) — confirm.
output_notebook()

# Paths of the client-secret and storage files handed to Sheets.from_files below.
my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
# do the first steps of https://medium.com/craftsmenltd/from-csv-to-google-sheet-using-python-ef097cb014f9
creds = '../.credentials.json'

# Google Sheets accessor used to load the reference sample sheet.
sheets = Sheets.from_files(my_id, mystorage_id)
# Not referenced in the visible part of this notebook.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

# URL of the reference sheet that is loaded into `ccle_refsamples`.
refsheet_url = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"

## boot up

We instantiate all the parameters needed for this pipeline to run.

In [None]:
# Sample-set name; it is embedded in the names of the temp/ files read below.
samplesetname = "21Q1"
# Release label; it is embedded in the names of the output files written at the end.
release=samplesetname

## Do some checks and manual rescuing

In [None]:
# Load the first worksheet of the reference sheet into a DataFrame
# (index_col=0 is forwarded to to_frame).
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)

In [None]:
# Legacy segment table: drop its stray 'Unnamed: 0' column and mark every
# legacy segment with Status 'U'.
legacy_segments = tc.get(name='depmap-wes-cn-data--08f3', file='legacy_segments')
legacy_segments = legacy_segments.drop(columns='Unnamed: 0')
legacy_segments['Status'] = 'U'

# Previously released data, used as the comparison baseline further down.
# NOTE(review): the segments come from 'depmap-a0ab' while the gene-level matrix
# comes from 'internal-20q3-00d0' — confirm that this mix is intended.
prevsegments = tc.get(name='depmap-a0ab', file='CCLE_segment_cn')
# 2**x - 1 is the inverse of log2(1 + x); presumably the stored matrix is
# log2(1 + x) transformed — confirm.
# An earlier fetch of CCLE_gene_cn from 'depmap-a0ab' was removed here: its
# result was overwritten by this assignment before ever being used.
prevgenecn = (2**tc.get(name='internal-20q3-00d0', file='CCLE_gene_cn')) - 1
prev = prevgenecn.index.tolist()

In [None]:
# Segment-level table for this sample set (file name: segments_allWES_latest_<sampleset>).
priosegments = pd.read_csv("temp/segments_allWES_latest_"+samplesetname+".csv")
# The matching gene-level matrix has to be loaded as well: `priogenecn` is used
# by the following cells (shared genes, shape check, correlation with the
# previous release) and is defined nowhere else in the notebook.
priogenecn = pd.read_csv('temp/gene_cn_allWES_latest_'+samplesetname+".csv", index_col=0)

In [None]:
# Cytoband table: tab-separated, column names supplied here.
cyto = pd.read_csv(
    'data/hg38_cytoband.gz',
    sep='\t',
    names=['chrom', 'start', 'end', 'loc', 'stains'],
)
# Drop the first three characters of every chromosome name
# (presumably a "chr" prefix — confirm against the file).
cyto['chrom'] = cyto['chrom'].map(lambda chrom_name: chrom_name[3:])
gene_mapping = pd.read_csv('data/genemapping_19Q1.csv')
# DepMap IDs that have legacy segments but no segments in `priosegments`.
onlyinleg = set(legacy_segments.DepMap_ID).difference(priosegments.DepMap_ID)
# Columns shared by the previous and the new gene-level matrices.
samegenes = set(prevgenecn.columns).intersection(priogenecn.columns)
onlyinleg

In [None]:
# Hand-maintained list of DepMap IDs flagged as bad.
# It is not referenced in the visible part of this notebook.
bad = [
    "ACH-001011",
    "ACH-001108",
    "ACH-001187",
    "ACH-001189",
    "ACH-002303",
    "ACH-002315",
    "ACH-002341",
]

In [None]:
# Manually add one more line to the legacy-only set.
# `onlyinleg` is a set, so the element is added with .add().
onlyinleg.add("ACH-002291")

In [None]:
# Print, for every legacy-only line, the Source of its first legacy segment.
for depmap_id in onlyinleg:
    sources = legacy_segments.loc[legacy_segments.DepMap_ID == depmap_id, 'Source']
    print(depmap_id, sources.values[0])

In [None]:
# Display the (rows, columns) shape of the new gene-level matrix.
priogenecn.shape

In [None]:
## only getting good correlation samples

## Achilles reprioritization

In [None]:
# Per-sample Pearson correlation between the new and the previous gene-level
# copy number, computed on log2(1 + x) values over the shared genes.
# pandas no longer accepts a set as an indexer, so the shared genes are turned
# into one list that fixes the same column order for both frames.
shared_genes = list(samegenes)
ge = np.log2(1 + priogenecn[shared_genes])
# NOTE(review): only the previous matrix has its NaNs filled with 0; a NaN in
# `ge` gives a NaN correlation, which never satisfies `< 0.85` — confirm intended.
pre = np.log2(1 + prevgenecn[shared_genes]).fillna(0)
corr = {
    sample: pearsonr(ge.loc[sample], pre.loc[sample])[0]
    for sample in set(prevgenecn.index) & set(priogenecn.index)
}
# Samples whose correlation with the previous release is below 0.85.
toreplace = [sample for sample, r in corr.items() if r < 0.85]
# Fraction of the shared samples that fall below the threshold.
len(toreplace)/len(corr)

In [None]:
# Density plot of the per-sample correlations.
corr_values = list(corr.values())
a = np.array(corr_values)
sns.kdeplot(a)

In [None]:
# Scatter the first 100000 flattened values of the new matrix against the
# previous one, restricted to the samples present in both.
ind = set(prevgenecn.index) & set(priogenecn.index)
# .loc does not accept a set in recent pandas; use one list for both frames so
# the rows line up.
shared_rows = list(ind)
sns.scatterplot(
    x=ge.loc[shared_rows].values.ravel()[:100000],
    y=pre.loc[shared_rows].values.ravel()[:100000],
)

In [None]:
# 2-D density of new vs. previous values (first 100000 flattened values each).
# `ind` is a set; .loc needs a list in recent pandas, and one shared list keeps
# the rows of both frames aligned.
shared_rows = list(ind)
new_vs_prev = np.array([
    ge.loc[shared_rows].values.ravel()[:100000],
    pre.loc[shared_rows].values.ravel()[:100000],
]).T
sns.kdeplot(data=new_vs_prev, fill=True)

In [None]:
# Disabled: swapping the low-correlation samples (`toreplace`) for their legacy segments.
#mergedsegments = priosegments[~priosegments.DepMap_ID.isin(toreplace)].append(legacy_segments[legacy_segments.DepMap_ID.isin(toreplace)]).reset_index(drop=True)
# Instead, start from all of `priosegments` (same object, not a copy).
mergedsegments = priosegments

In [None]:
# Add the legacy segments of the lines that only exist in the legacy data.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
legacy_only_segments = legacy_segments[legacy_segments.DepMap_ID.isin(list(onlyinleg))]
mergedsegments = pd.concat([mergedsegments, legacy_only_segments]).reset_index(drop=True)

### Adding WGS

In [None]:
# Segment-level table of the WGS data for this sample set.
wgssegments = pd.read_csv("temp/segments_allWGS_latest_"+samplesetname+".csv")
# Display the largest segment mean as a quick sanity check.
wgssegments.Segment_Mean.max()

In [None]:
# WGS takes precedence: keep every WGS segment and add only the lines from the
# merged table that have no WGS data.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
not_in_wgs = ~mergedsegments.DepMap_ID.isin(set(wgssegments.DepMap_ID))
mergedsegments = pd.concat([wgssegments, mergedsegments[not_in_wgs]])

In [None]:
# DepMap IDs recorded as lost; excluded from the legacy-only listing below.
lost = {
    "ACH-002217",
    "ACH-002335",
    "ACH-002378",
}

In [None]:
# only in snp: legacy-only lines that are neither lost nor covered by WGS
onlyinleg.difference(lost, wgssegments.DepMap_ID)

### removing normal lines

In [None]:
# Index values of the reference samples whose primary disease is 'normal'.
is_normal = ccle_refsamples['primary_disease'] == 'normal'
normals = ccle_refsamples.loc[is_normal].index.tolist()
normals

In [None]:
#mergedsegments = mergedsegments[~mergedsegments.DepMap_ID.isin(normals)]

### removing duplicate lines because they are engineered:


In [None]:
# Drop every segment that belongs to one of these three lines.
engineered_duplicates = ["ACH-003000", "ACH-002875", "ACH-002874"]
is_engineered = mergedsegments.DepMap_ID.isin(engineered_duplicates)
mergedsegments = mergedsegments[~is_engineered]

### adding duplicate lines for Achilles

In [None]:
# removing any possible instance of "chr" (currently disabled)
#mergedsegments.Chromosome = [i[3:] if 'chr' in i else i for i in mergedsegments.Chromosome]
# Display the distinct chromosome names so they can be checked by eye.
set(mergedsegments.Chromosome)

In [None]:
# Keep the output columns in a fixed order, sort the segments and renumber the rows.
segment_columns = ['DepMap_ID', 'Chromosome', 'Start', 'End', 'Segment_Mean', 'Num_Probes', 'Status', 'Source']
sort_keys = ['DepMap_ID', 'Chromosome', 'Start', 'End']
mergedsegments = mergedsegments[segment_columns]
mergedsegments = mergedsegments.sort_values(by=sort_keys)
mergedsegments = mergedsegments.reset_index(drop=True)
# Set the amplification status to U on chromosome X, as it is artificially
# amplified in female samples.
on_chr_x = mergedsegments.Chromosome == "X"
mergedsegments.loc[on_chr_x, 'Status'] = 'U'

In [None]:
# Run the gap-handling helper with the cytoband table, then build the
# gene-level matrix and store it as log2(1 + x).
mergedsegments = manageGapsInSegments(mergedsegments, cyto=cyto)
relative_gene_cn = toGeneMatrix(mergedsegments, gene_mapping)
mergedgenecn = np.log2(1 + relative_gene_cn)
# Display the distinct data sources that ended up in the merged segments.
set(mergedsegments.Source)

In [None]:
# Rows (a) and columns (b) that the new and the previous gene-level matrices share.
a = set(mergedgenecn.index).intersection(prevgenecn.index)
b = set(mergedgenecn.columns).intersection(prevgenecn.columns)

In [None]:
# NOTE: this rebinds `corr`, replacing the per-sample correlation dict built earlier.
# NOTE(review): mergedgenecn is log2(1+x) transformed while prevgenecn was
# converted back with 2**x - 1 — confirm findClosestMatching expects that.
match, corr= findClosestMatching(mergedgenecn, prevgenecn, closest=True, returncorr=True)

In [None]:
# Plot the changes between releases; the previous gene-level matrix is put on
# the same log2(1 + x) scale as the new one first.
prev_log2 = prevgenecn.apply(lambda column: np.log2(1+column))
plotCNchanges(mergedgenecn, prev_log2, mergedsegments, prevsegments)

In [None]:
# Sanity checks on the gene-level matrix before it is saved.
largest_value = mergedgenecn.values.max()
if largest_value > 100:
    # A value this large means the matrix is not on the log2 scale.
    print("\n\n\nTOO HIGH, not LOG2 transformed!")
row_count = len(mergedgenecn.index)
distinct_row_count = len(set(mergedgenecn.index))
if row_count > distinct_row_count:
    # The same cell line appears in more than one row.
    print("Duplicate CL, not reprioritized well!")

In [None]:
# Display the total number of missing values in the gene-level matrix.
mergedgenecn.isna().sum().sum()

In [None]:
# Expression table fetched from 'depmap-a0ab'; passed to rnaseqcorrelation below.
rna = tc.get(name='depmap-a0ab', file='CCLE_expression_proteincoding_genes_expected_count')

In [None]:
# Draw the RNA-seq correlation of the new and the previous gene-level copy
# number on the same axes.
_, ax = plt.subplots()
# The new matrix is labelled with the current release instead of a stale
# hard-coded quarter.
rnaseqcorrelation(mergedgenecn.fillna(0), rna.fillna(0), ax, name=release)
# NOTE(review): prevgenecn is passed without the log2(1+x) transform that
# mergedgenecn carries — confirm the two curves are meant to be comparable.
rnaseqcorrelation(prevgenecn[prevgenecn.index.isin(mergedgenecn.index.tolist())], rna.fillna(0), ax, name="20Q3")

In [None]:
# Compare the new gene-level matrix with version 18 of 'internal-20q2-7f46' CCLE_gene_cn.
h.compareDfs(mergedgenecn, tc.get(name='internal-20q2-7f46', version=18, file='CCLE_gene_cn'))

In [None]:
# Compare the new gene-level matrix with CCLE_gene_cn from 'depmap-a0ab'.
h.compareDfs(mergedgenecn, tc.get(name='depmap-a0ab', file='CCLE_gene_cn'))
# Disabled: the same comparison for the segment-level table.
#h.compareDfs(mergedsegments, tc.get(name='depmap-a0ab', file='CCLE_segment_cn'))

In [None]:
# Write both tables to temp/, named after the release.
gene_cn_path = 'temp/all_'+release+'_gene_cn.csv'
segment_path = 'temp/all_'+release+'_segment.csv'
# The gene-level matrix keeps its index (the cell lines) as the first column.
mergedgenecn.to_csv(gene_cn_path)
# The segment table is written without its row index.
mergedsegments.to_csv(segment_path, index=False)

# uploading on taiga

## CN

In [None]:
# Upload the two files written above as a new version of the Achilles CN dataset.
# The description is a raw string: it contains markdown `\_` escapes, which are
# invalid escape sequences in a normal Python string literal (the text sent to
# Taiga is unchanged).
# NOTE(review): changes_description is left empty — consider stating the release.
tc.update_dataset(
    dataset_permaname="cn-wes-achilles-4dcd",
    upload_file_path_dict={
        'temp/all_'+release+'_segment.csv': 'TableCSV',
        'temp/all_'+release+'_gene_cn.csv': "NumericMatrixCSV",
    },
    changes_description="",
    dataset_description=r"""
# Copy Number

Combined segment and gene-level CN calls from Broad WES, Sanger WES, and Broad SNP. Relative CN, log2(x+1) transformed.

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal. Must use subsetted dataset instead. These data will not make it on the portal starting 19Q1. With the DMC portal, there is new cell line release prioritization as to which lines can be included, so a new taiga dataset will be created containing CN for the portal.

These data are generated for Achilles to pull from to run CERES.


Gene level CN data:

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean""")