In [None]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display



%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

## boot up

We instantiate all the parameters needed for this pipeline to run.

In [None]:
# Name of the release being processed; copied into `release` at the end of this cell.
samplesetname = "20Q3"
# Name and version number recorded for the previous release.
prevname="20Q2"
# NOTE(review): the %%R configuration cell sets prevversion to 13 — confirm which value is correct.
prevversion = 22
# Name and version number recorded for the release before the previous one.
prevprevname ='20Q1'
prevprevversion= 20
# presumably the identifier of the internal virtual dataset for this release — TODO confirm
virtual_internal='internal-20q3-00d0'

# Workspace name handed to dm.WorkspaceManager further down.
refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"



# Google Sheets URLs; they are not referenced again in the visible part of this notebook.
refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

# `release` is the name used to build the temp/*.csv file names read back in Python.
release = samplesetname

In [None]:
%%R
# R-side release configuration. These values are typed in independently of the
# Python configuration cell, so they must be kept in sync by hand.
release <- '20Q3'
prevname <- '20Q2'
genome_version <- 'hg19'
taiga_version <- 10
# NOTE(review): the Python configuration cell sets prevversion to 22 — confirm which value is correct.
prevversion <-13

In [None]:
refwm = dm.WorkspaceManager(refworkspace)

In [None]:
res = refwm.get_sample_sets().loc["all"]
res

In [None]:
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

In [None]:
# Load the aggregated MAF copied from the workspace, and show its first column names.
file = pd.read_csv('temp/mutation_filtered_terra_merged.txt', sep='\t')
print(file.columns[:10])
# Mapping of sample barcodes to keep; only its keys and its use as a replacement
# mapping are relied on below.
renaming = removeOlderVersions(
    names=set(file['Tumor_Sample_Barcode']),
    refsamples=refwm.get_samples(),
    arxspan_id="arxspan_id",
    version="version",
)
# Sanity check: rows whose chromosome is the string '0'.
print(file[file['Chromosome']=='0'])
# Keep only the barcodes present in `renaming`, rename them, and write the result
# back over the input file.
# NOTE(review): because the input is overwritten, re-running this cell reads the
# already-renamed file rather than the original download.
(
    file[file['Tumor_Sample_Barcode'].isin(renaming.keys())]
    .replace({'Tumor_Sample_Barcode': renaming})
    .reset_index(drop=True)
    .to_csv('temp/mutation_filtered_terra_merged.txt', sep='\t', index=None)
)

In [None]:
ls ../JKBio/

# Analysis

In [None]:
%%R
#source('src/load_libraries_and_annotations.R')
load('src/Annotations.rdata') 
# There are some cell lines the celllinemapr does not know how to map so we need to load this data object for now (from old datasets)
source('src/CCLE_postp_function.R')
library('cdsomics')
library(tidyverse)
library(data.table)
library(magrittr)
library(taigr)
library(cdsomics)
library(celllinemapr) # To pull out DepMap_IDs from CCLE_names where needed

In [None]:
%%R
newly_merged_maf <- readMutations('temp/mutation_filtered_terra_merged.txt')
new_release <- createSNPs(newly_merged_maf)
names(new_release)

In [None]:
%%R
new_release <- renameAsInMainMutation(new_release)

In [None]:
%%R
filtered <- filterAllelicFraction(new_release)

In [None]:
%%R
write.table(
  filtered$merged, 
  paste0('temp/newmutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)

In [None]:
%%R
filtered <- filterMinCoverage(filtered$merged, filtered$removed_from_maf)

In [None]:
%%R
head(merged)

In [None]:
%%R
clean_annotations <- mergeAnnotations(merged,previous.release.maf)

In [None]:
%%R

# Allie's version
# NOTE(review): this annotates `new_release`, not the `filtered` or
# `clean_annotations` objects produced by the preceding cells, and the export
# cells below also use `new_release`. As written, the allelic-fraction and
# coverage filters above do not affect the exported files — confirm this is intended.
new_release <- addAnnotation(new_release)

In [None]:
%%R
# Binary matrix files that are used internally and might be useful:
# one matrix each for the damaging, other and hotspot selections of `new_release`.
damaging_mutation <- mutation_maf_to_binary_matrix(new_release, damaging =  TRUE)
other_mutation <- mutation_maf_to_binary_matrix(new_release, other = TRUE)
hotspot_mutation <- mutation_maf_to_binary_matrix(new_release, hotspot = TRUE)

In [None]:
%%R
# Save the full annotated MAF (no row names), ready to upload to taiga
write.table(
  new_release, 
  paste0('temp/mutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)
# Save the damaging-mutation matrix, ready to upload to taiga
write.table(
  damaging_mutation, 
  paste0('temp/damaging_mutation.', release, '.all.csv'), sep = ',', quote = F)
# Save the other-mutation matrix, ready to upload to taiga
write.table(
  other_mutation, 
  paste0('temp/other_mutation.', release, '.all.csv'), sep = ',', quote = F)
# Save the hotspot-mutation matrix, ready to upload to taiga
write.table(
  hotspot_mutation, 
  paste0('temp/hotspot_mutation.', release, '.all.csv'), sep = ',', quote = F)

In [None]:
newmutations = pd.read_csv('temp/mutations.'+release+'.all.csv')

In [None]:
mutations_20Q2_all = tc.get(name='depmap-mutations-maf-35fe', version=14, file='mutations.20Q2.all')

# Comparing

## initial, simple

In [None]:
# Keep only the 20Q2 rows whose DepMap_ID is also a sample of the new release
# (this is what removes the lines that only have WGS / SNP-array data).
# Direct membership test instead of negating a set difference of the column with itself.
mutations_20Q2_all = mutations_20Q2_all[mutations_20Q2_all.DepMap_ID.isin(set(newmutations.Tumor_Sample_Barcode))]

### strange new cell lines

In [None]:
#Too recent? removed by what means? why?
set(newmutations.Tumor_Sample_Barcode) - set(mutations_20Q2_all.DepMap_ID)

In [None]:
# Keep only the new-release rows whose sample also exists in the 20Q2 release,
# so that both tables cover the same set of samples.
newmutations = newmutations[newmutations.Tumor_Sample_Barcode.isin(set(mutations_20Q2_all.DepMap_ID))]

In [None]:
newmutations = newmutations.sort_values(by=['Tumor_Sample_Barcode','Chromosome','Start_position','End_position'])
mutations_20Q2_all = mutations_20Q2_all.sort_values(by=['DepMap_ID','Chromosome','Start_position','End_position'])

In [None]:
newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']] = newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']].astype(str)
newmutationsset = newmutations['Tumor_Sample_Barcode']+'_'+newmutations['Chromosome']+':'+newmutations['Start_position']+'-'+newmutations['End_position']
newmutations['grouped'] = newmutationsset

In [None]:
len(newmutationsset)

In [None]:
dups = h.dups(newmutationsset)

In [None]:
len(dups)

In [None]:
newmutationsset = set(newmutationsset)

In [None]:
mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']] = mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']].astype(str)
mutations_20Q2_allset = mutations_20Q2_all['DepMap_ID']+'_'+mutations_20Q2_all['Chromosome']+':'+mutations_20Q2_all['Start_position']+'-'+mutations_20Q2_all['End_position']
mutations_20Q2_all['grouped'] = mutations_20Q2_allset

In [None]:
len(mutations_20Q2_allset)

In [None]:
len(set(mutations_20Q2_allset))

In [None]:
dups = h.dups(mutations_20Q2_allset)

### issues with duplicates

In [None]:
## How come?? the merging did not work well
len(dups)

In [None]:
mutations_20Q2_all[mutations_20Q2_all['grouped']==dups[0]]

In [None]:
submutations_20Q2_all = mutations_20Q2_all[~mutations_20Q2_all.CGA_WES_AC.isna()]

In [None]:
len(submutations_20Q2_all) 

In [None]:
submutations_20Q2_allset = submutations_20Q2_all.grouped

In [None]:
mutations_20Q2_allset = mutations_20Q2_all.grouped

### similarity

In [None]:
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(submutations_20Q2_allset)

In [None]:
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(newmutationsset)

### similarity with everything from all datasets

In [None]:
len(set(mutations_20Q2_allset) & set(newmutationsset))/ len(newmutationsset)

In [None]:
len(set(mutations_20Q2_allset) & set(newmutationsset))/ len(mutations_20Q2_allset)

In [None]:
submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
notfound = submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
len(set(notfound.DepMap_ID))

In [None]:
notfound[~(notfound.SangerRecalibWES_AC.isna() & notfound.SangerWES_AC.isna() & notfound.HC_AC.isna() & notfound.WGS_AC.isna() & notfound.RNAseq_AC.isna() &notfound.RD_AC.isna())]

### Only one third of the not-found mutations were backed by any other analysis, suggesting a false-positive rate of about 2/3

In [None]:
len(set(newmutationsset) - set(mutations_20Q2_allset))

In [None]:
newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]

In [None]:
len(set(newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))].Tumor_Sample_Barcode))

In [None]:
alternate = np.array(newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]['CGA_WES_AC'].str.split(':').tolist()).astype(int)[:,0]

In [None]:
alternate.mean()

In [None]:
alternate.min()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Distribution of the alternate allele count in the mutations newly found by the new pipeline

In [None]:
# Bar plot of how often each alternate-allele count occurs (first 40 distinct
# values only), with a log-scaled y axis.
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
ax=sns.barplot(x=a[:40],y=b[:40], ci=None, ax=ax)
ax.set_yscale('log')
# Label the figure so it can be read on its own; the trailing ';' hides the repr.
ax.set(
    xlabel='alternate allele count',
    ylabel='number of mutations (log scale)',
    title='Alternate allele count of mutations found only by the new pipeline',
);

In [None]:
len(alternate[alternate>2]) / len(alternate) 

In [None]:
len(alternate[alternate>3]) / len(alternate) 

In [None]:
onlyinnew = newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]
highconf = onlyinnew[(onlyinnew['CGA_WES_AC'].str.split(':').str[0].values.astype(int)>3) & (onlyinnew['PERC_CGA_WES_AC'].values.astype(float)>0.3)]

### We can see that about 60% of the additional mutations seem to be of good quality; we should filter out the others, as is done in Guillaume's pipelines

In [None]:
len(highconf)/len(onlyinnew)

In [None]:
len(newmutations[newmutations['Variant_Classification']=='Silent'])

In [None]:
# For every sample, measure how many gene symbols are reported as duplicates
# among its non-silent mutations.
maf = newmutations[newmutations['Variant_Classification']!='Silent']
samplesCol = "Tumor_Sample_Barcode"
mutNameCol="Hugo_Symbol"
col= "CGA_WES_AC"
maf = maf.sort_values(by = mutNameCol)
samples = set(maf[samplesCol])
# NOTE(review): `mut` is only referenced by the commented-out join at the end of the loop.
mut = pd.DataFrame(data = np.zeros((len(set(maf[mutNameCol])), 1)), columns=['fake'], index=set(maf[mutNameCol])).astype(float)
# a: sample -> len(h.dups(...)) divided by the number of rows of that sample
# du: everything returned by h.dups, accumulated over all samples
a = {}
du = []
for i,val in enumerate(samples):
    h.showcount(i,len(samples))
    e = maf[maf[samplesCol]==val]
    # presumably the gene symbols occurring more than once in this sample — TODO confirm against JKBio.Helper.dups
    dups = h.dups(e[mutNameCol])
    a[val] = len(dups)/len(e)
    du.extend(dups)
    #mut = mut.join(maf[maf[samplesCol]==val].drop_duplicates(mutNameCol).set_index(mutNameCol)[col].rename(val))
    #break

## Number of mutations on the same gene

In [None]:
a = pd.DataFrame(data=a,index=['dupmut_to_mut'])

In [None]:
a=a.T

In [None]:
a.max(),a.min(),a.mean(),a.var()

In [None]:
a.values[0]

In [None]:
sns.kdeplot(data = a.values[0])

### filtered by silent

In [None]:
a = pd.DataFrame(data=a,index=['dupmut_to_mut'])

In [None]:
a=a.T

In [None]:
a.max(),a.min(),a.mean(),a.var()

In [None]:
sns.kdeplot(data = a.T.values[0])

In [None]:
from collections import Counter
dudf = pd.DataFrame(data=Counter(du).values(), index=Counter(du).keys(),columns=['counts'])

In [None]:
dudf = dudf.sort_values(by='counts')

In [None]:
dudf.iloc[-50:]

This is linked to gene size and association with cancer:
    - TTN: 250 kb
    - MUC16: 132 kb
    - LRP1B: 1900 kb
    - PCLO: 400 kb
    - TP53: 25 kb

In [None]:
def _alleleCounts(maf, loc, sep):
    """Sum the two `sep`-separated counts of every column in `loc`, row by row.

    Missing cells count as '0<sep>0'. When a cell contains the token 'NA', its
    second count is taken as 0 (the first field is kept as is).

    Returns:
        float array of shape (len(maf), 2): summed first and second counts.
    """
    counts = np.zeros((len(maf), 2))
    for val in loc:
        pairs = maf[val].fillna('0' + sep + '0').astype(str).str.split(sep).tolist()
        parsed = np.array([[p[0], 0] if 'NA' in p else p for p in pairs]).astype(int)
        # reshape keeps the addition valid when `maf` has no rows
        counts += parsed.reshape(len(maf), 2)
    return counts


def filterCoverage(maf, loc=['CGA_WES_AC'], sep=':',cov=4):
    """Keep the rows of `maf` whose summed second count is at least `cov`.

    Args:
        maf: DataFrame holding the count columns listed in `loc`.
        loc: names of the columns holding '<count><sep><count>' strings.
        sep: separator between the two counts.
        cov: minimum value required for the summed second count.

    Returns:
        The filtered DataFrame (same columns, subset of rows).

    NOTE(review): elsewhere in this notebook the first field is read as the
    alternate allele count, so this thresholds the other field (presumably the
    reference count) — confirm this is the intended notion of coverage.
    """
    # counts are read from `maf` itself, not from a notebook-level table
    muts = _alleleCounts(maf, loc, sep)
    return maf[muts[:,1]>=cov]

def filterAllelicFraction(maf, loc=['CGA_WES_AC'], sep=':',frac=0.3):
    """Keep the rows of `maf` whose allelic fraction is at least `frac`.

    The fraction is first / (first + second) of the summed counts; the first
    field is the one this notebook reads as the alternate allele count
    (assumes the second field is the reference count — TODO confirm).
    Rows with no reads at all (0/0) are dropped.

    Args:
        maf: DataFrame holding the count columns listed in `loc`.
        loc: names of the columns holding '<count><sep><count>' strings.
        sep: separator between the two counts.
        frac: minimum fraction required.

    Returns:
        The filtered DataFrame (same columns, subset of rows).
    """
    muts = _alleleCounts(maf, loc, sep)
    # 0/0 yields NaN, which compares False below; silence the numpy warning
    with np.errstate(divide='ignore', invalid='ignore'):
        fraction = muts[:,0]/(muts[:,0]+muts[:,1])
    return maf[fraction>=frac]

def mergeAnnotations(newmaf, additionalmaf, additionalonmerge=[]):
    """Left-merge the columns of `additionalmaf` onto `newmaf`, matching on the variant.

    Rows are matched on Chromosome, Start_position, End_position,
    Reference_Allele and Tumor_Seq_Allele1, plus any column names given in
    `additionalonmerge`. Every row of `newmaf` is kept; rows without a match
    get missing values in the added columns. Non-key columns present in both
    tables come back with pandas' default `_x` / `_y` suffixes.

    Args:
        newmaf: DataFrame to annotate.
        additionalmaf: DataFrame providing the extra columns.
        additionalonmerge: extra column names to match on.

    Returns:
        The merged DataFrame.
    """
    on = ['Chromosome', 'Start_position', 'End_position', 'Reference_Allele', 'Tumor_Seq_Allele1']
    on.extend(additionalonmerge)

    # merge matches on columns of both frames; DataFrame.join(on=...) would
    # match against the index of `additionalmaf` instead
    newmaf = newmaf.merge(additionalmaf, on=on, how='left')
    # TODO: solve issues with Hugo_Symbol, Entrez_Gene_Id

    return newmaf
    
def mergeXY():
    # Placeholder, not implemented: the body only evaluates the two names below,
    # and `dbSNP_RS` is not defined in this cell, so calling this is expected to fail.
    # NOTE(review): presumably meant to reconcile the `.x` / `.y` copies of
    # dbSNP_RS left behind by a merge — confirm and implement.
    dbSNP_RS.x, dbSNP_RS.y


def addAnnotation(maf):
    """Placeholder for the Python annotation step; not implemented yet.

    The definition previously had no body, which made the whole cell a syntax
    error. It now fails explicitly when called.

    Args:
        maf: the mutation table to annotate (unused for now).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("addAnnotation is not implemented in Python yet")

def mafToMat(maf, col, boolify = False, samplesCol = "DepMap_ID", mutNameCol="Hugo_Symbol"):
    """Turn a MAF-like table into a (mutation name x sample) matrix of `col` values.

    When a sample has several rows for the same mutation name, the first one
    in row order is used. Cells without a row are filled with 0.

    Args:
        maf: DataFrame with at least `samplesCol`, `mutNameCol` and `col`.
        col: name of the column whose values fill the matrix.
        boolify: if True the matrix is cast to bool, otherwise to float.
        samplesCol: column holding the sample identifier (matrix columns).
        mutNameCol: column holding the mutation / gene name (matrix rows).

    Returns:
        DataFrame indexed by mutation name with one column per sample.
    """
    # one row per (sample, name) pair so that pivot has a single value per cell
    unique_rows = maf.drop_duplicates(subset=[samplesCol, mutNameCol])
    # a single pivot replaces the join-per-sample loop and its placeholder column
    mut = unique_rows.pivot(index=mutNameCol, columns=samplesCol, values=col)
    mut = mut.rename_axis(index=None, columns=None)
    # DataFrame has no nan_to_num method; fillna is the pandas equivalent
    return mut.fillna(0).astype(bool if boolify else float)
    

In [None]:
# DRAFT — this cell is an outline and cannot run as written:
#   * `CCLE2othermutations =` has no right-hand side, so the cell does not parse;
#   * `mutations` is read on the first line but only assigned further down in this cell;
#   * every mafToMat call omits the required `col` argument;
#   * the first three to_csv calls all write to the same path, '.csv'.
filtered_mutations = filterCoverage(mutations)
filtered_mutations = filterAllelicFraction(filtered_mutations)

merged_mutations = addAnnotation(mutations)

mafToMat(filtered_mutations[filtered_mutations.damaging]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.other]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.hotspot]).to_csv('.csv')


CCLE2othermutations = 

mutations = mergeAnnotations(filtered_mutations, CCLE2othermutations)

# Per mutation type: one matrix each for the damaging, other and hotspot selections.
# The type list and output file prefix are still empty placeholders.
for muttype in ['']:
    mafToMat(CCLE2othermutations[CCLE2othermutations.damaging & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.other & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.hotspot & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")

# Looking at the new unfiltered MAF

In [None]:
unfiltered = res['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/mutation_unfiltered_terra_merged.txt"

In [None]:
! wc -l temp/mutation_unfiltered_terra_merged.txt

In [None]:
! wc -l temp/sub_unfiltered.tsv

In [None]:
! head -100000 temp/mutation_unfiltered_terra_merged.txt > temp/sub_unfiltered.tsv

In [None]:
! tail -1 temp/sub_unfiltered.tsv > temp/fail.tsv

In [None]:
unfiltered = pd.read_csv('temp/mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__"])

In [None]:
na = unfiltered.isna().sum(0)

In [None]:
# `na` is a Series of per-column missing-value counts indexed by column name,
# so the names of the (almost) entirely missing columns come from its index.
toremove = na[na>len(unfiltered)*0.999].index
toremove

In [None]:
unfiltered = unfiltered.drop(columns=["UniProt_Site","alt_allele_seen","CCLE_ONCOMAP_overlapping_mutations","failure_reasons","ESP_CA","SVTYPE","id","gnomADg_GT","ESP_GWAS_PUBMED"])

In [None]:
# "alt_allele_seen" is already part of the previous drop; ignore it if it is gone
# so that running the cells in order does not raise a KeyError.
unfiltered = unfiltered.drop(columns=["alt_allele_seen"], errors='ignore')

In [None]:
# Collect the columns holding the literal '__UNKNOWN__' in more than 3,160,000 rows.
# NOTE(review): the file is read with na_values=["__UNKNOWN__"], so this literal may
# no longer be present by the time this runs — confirm.
toremove = []
for val in unfiltered.columns:
    if (unfiltered[val]=='__UNKNOWN__').sum() > 3160000:
        toremove.append(val)

In [None]:
unfiltered=unfiltered.drop(columns=toremove)

In [None]:
unfiltered = unfiltered.astype(str)

In [None]:
# Map the '.' and '__UNKNOWN__' placeholders to the string 'nan', the missing-value
# marker the following cells test for after the astype(str) conversion.
# Passing None as the replacement together with a list makes pandas fall back to
# its fill `method` (forward fill) instead of nulling the values.
unfiltered = unfiltered.replace(['.',"__UNKNOWN__"],'nan')

In [None]:
unfiltered

In [None]:
# From the 41st column onwards, collect the columns that are 'nan' in more than
# 3,160,000 rows, or that hold a single distinct value once 'nan' is set aside.
toremove = []
for val in unfiltered.columns[40:]:
    if (unfiltered[val]=='nan').sum() > 3160000 or len(set(unfiltered[val]) - {'nan'}) == 1:
        toremove.append(val)

In [None]:
unfiltered['somatic'] = unfiltered['somatic'].replace('nan','False')

In [None]:
unfiltered['HGNC_Status'] = unfiltered['HGNC_Status'].replace('nan','Unapproved')
unfiltered['judgement'] = unfiltered['judgement'].replace('nan','REMOVE')

In [None]:
drop = ['dbSNP_Val_Status', 'qual', 'iHpol', 'QSI_ref', 'BCNoise', 'score', 'Familial_Cancer_Genes_Reference', 'NT']

In [None]:
unfiltered = unfiltered.drop(columns=drop)

In [None]:
toremove = []
for val in unfiltered.columns:
    a = unfiltered[val]
    print(val, a[~a.isna()][:10])

In [None]:
toint =  ["Start_position", "End_position", "dbNSFP_1000Gp1_AFR_AC", "dbNSFP_1000Gp1_ASN_AC", "ESP_DP", "Transcript_Position", "gnomADg_AN", "dbNSFP_Reliability_index", "dbNSFP_codonpos", "dbNSFP_1000Gp1_AMR_AC", "dbNSFP_1000Gp1_EUR_AC", "dbNSFP_1000Gp1_AC"]

In [None]:
val

In [None]:
# Print every value of column `val` that cannot be converted to an integer.
for i in unfiltered[val]:
    try:
        int(i)
    # only conversion failures are expected here; a bare except would also
    # swallow KeyboardInterrupt and hide unrelated errors
    except (TypeError, ValueError, OverflowError):
        print(i)

In [None]:
for val in toint:
    unfiltered[val]  = unfiltered[val].astype(int)

In [None]:
unfiltered[toint].sum(1)

In [None]:
tofloat = [ "n_ref_count", "t_q20_count", "t_ins_count", "t_lod_fstar", "ESP_AvgAAsampleReadDepth", "dbNSFP_phastCons46way_placental_rankscore", "dbNSFP_LRT_converted_rankscore", "dbNSFP_SIFT_converted_rankscore", "TQSI_NT", "normal_f", "FDP50", "t_lod_fstar_forward", "TQSI", "dbNSFP_SLR_test_statistic", "dbNSFP_LR_rankscore", "dbNSFP_FATHMM_rankscore", "tumor_alt_rpir_median", "ESP_CP", "DP2", "ESP_Position", "dbNSFP_phyloP100way_vertebrate_rankscore", "dbNSFP_SiPhy_29way_logOdds", "RC", "observed_in_normals_count", "dbNSFP_phyloP100way_vertebrate", "dbNSFP_ESP6500_AA_AF", "Transcript_Exon", "contaminant_lod", "normal_power_wsp", "dbNSFP_SIFT_score", "ESP_TotalAAsamplesCovered", "n_alt_count", "dbNSFP_phyloP46way_primate_rankscore", "map_Q0_reads", "t_alt_max_mapq", "ESP_CG", "n_q20_count", "QSI_NT", "ESP_TotalSamplesCovered", "contaminant_fraction", "tumor_alt_rpir_mad", "tumor_f", "power_to_detect_negative_strand_artifact", "total_reads", "t_ref_max_mapq", "QSI", "dbNSFP_MutationTaster_converted_rankscore", "ESP_AvgEAsampleReadDepth", "dbNSFP_Polyphen2_HVAR_rankscore", "dbNSFP_1000Gp1_AMR_AF", "IHP", "init_n_lod", "dbNSFP_RadialSVM_rankscore", "ORegAnno_bin", "dbNSFP_phyloP46way_primate", "t_alt_sum", "SUBDP50", "dbNSFP_CADD_raw_rankscore", "t_lod_fstar_reverse", "read_depth", "dbNSFP_Polyphen2_HDIV_rankscore", "t_ref_sum", "dbNSFP_LR_score", "n_ref_sum", "gc_content", "dbNSFP_LRT_Omega", "dbNSFP_phyloP46way_placental", "dbNSFP_LRT_score", "t_alt_count", "dbNSFP_RadialSVM_score", "power_to_detect_positive_strand_artifact", "dbNSFP_ESP6500_EA_AF", "IC", "normal_power", "UniProt_AApos", "dbNSFP_phastCons46way_primate_rankscore", "ESP_TotalEAsamplesCovered", "COSMIC_total_alterations_in_gene", "normal_power_nsp", "CCLE_ONCOMAP_total_mutations_in_gene", "init_t_lod", "power", "tumor_alt_fpir_median", "ESP_AvgSampleReadDepth", "dbNSFP_MutationAssessor_score", "tumor_power", "dbNSFP_CADD_raw", "tumor_alt_fpir_mad", "dbNSFP_MutationAssessor_rankscore", 
"dbNSFP_CADD_phred", "dbNSFP_SiPhy_29way_logOdds_rankscore", "t_del_count", "n_alt_sum", "COSMIC_n_overlapping_mutations", "t_ref_count", "dbNSFP_1000Gp1_EUR_AF", "dbNSFP_1000Gp1_AF", "DP50"]

In [None]:
val

In [None]:
# Cast every column listed in `tofloat`. The loop used to start at index 5, which
# leaves the first five columns unconverted when the notebook is run from a fresh
# kernel; casting an already-float column again is harmless.
for val in tofloat:
    unfiltered[val]  = unfiltered[val].astype(float)

In [None]:
unfiltered = unfiltered.reset_index()

In [None]:
# Rows whose dbSNP_RS contains a literal '|'. str.contains treats the pattern as a
# regular expression by default, where a lone '|' matches every string, so the
# match is made literal; missing values count as "no match".
unfiltered[unfiltered.dbSNP_RS.str.contains('|', regex=False, na=False)].index

In [None]:
unfiltered[unfiltered.columns[5:20]]

In [None]:
unfiltered.to_hdf('temp/mutation_unfiltered_terra_merged.hdf5', 'table')

In [None]:
a

In [None]:
unfiltered.to_parquet('temp/mutation_unfiltered_terra_merged.parquet', engine='fastparquet')