# Mutation Pipeline

In [None]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display



# Notebook-wide setup: IPython extensions plus the shared client objects used by later cells.
%load_ext autoreload
%autoreload 2
# rpy2 extension: provides the %%R cell magic used by the R cells of this notebook.
%load_ext rpy2.ipython
# Taiga client; later cells call tc.get(...) and tc.update_dataset(...) on it.
tc = TaigaClient()
# Presumably bokeh's output_notebook (pulled in by one of the star imports) — TODO confirm.
output_notebook()
# Google Sheets client built from two local files; `sheets` is not referenced again in the visible cells.
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
# Code -> label lookup, presumably for sample types; not referenced in the visible cells — TODO confirm it is still needed.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

## boot up

We instantiate here all the parameters needed for this pipeline to run.

In [None]:
# Name of the release being built and of the earlier releases it is compared against.
samplesetname = "20Q3"
prevname="20Q2"
# NOTE(review): the %%R config cell sets prevversion to 13 — confirm which value is meant for which dataset.
prevversion = 22
prevprevname ='20Q1'
prevprevversion= 20
# Not referenced in the visible cells (the upload cells use virtual_dmc / virtual_public) — TODO confirm.
virtual_internal='internal-20q3-00d0'

# Terra workspace handed to dm.WorkspaceManager in the next Python cell.
refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"



# Google Sheets URLs; neither is referenced in the visible cells.
refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

# `release` is the name used to build output file paths in the cells below.
release = samplesetname

In [None]:
%%R
# R-side settings, typed by hand: release / prevname repeat the values of the Python config cell,
# so both cells have to be edited together when the release changes.
release <- '20Q3'
prevname <- '20Q2'
# Only `release` is read by the visible R cells; the three values below are not used there.
genome_version <- 'hg19'
taiga_version <- 10
# NOTE(review): the Python config cell sets prevversion to 22 — confirm which value is intended.
prevversion <-13

In [None]:
refwm = dm.WorkspaceManager(refworkspace)

In [None]:
res = refwm.get_sample_sets().loc["all"]
res

In [None]:
# Location of the aggregated, filtered MAF recorded on the "all" sample set
# (presumably a gs:// path, since it is handed to `gsutil cp` — TODO confirm).
filtered = res['filtered_CGA_MAF_aggregated']
# Copy it into temp/ ; IPython substitutes $filtered with the Python value.
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

In [None]:
# Load the merged MAF downloaded by the previous cell.
file = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(file.columns[:10])
# removeOlderVersions is a project helper; its result is used below as a mapping keyed by Tumor_Sample_Barcode.
# Judging by its name and arguments it keeps one (latest) version per arxspan_id — TODO confirm.
renaming = removeOlderVersions(names = set(file['Tumor_Sample_Barcode']), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")
# Debug output: rows whose Chromosome is the string '0'.
print(file[file['Chromosome']=='0'])
# Keep only barcodes that are keys of `renaming`, replace them with the mapped value and overwrite the input file.
# Because the file is rewritten in place, re-running this cell starts from the already-renamed file.
file[file['Tumor_Sample_Barcode'].isin(renaming.keys())].replace({'Tumor_Sample_Barcode':renaming}).reset_index(drop=True).to_csv('temp/mutation_filtered_terra_merged.txt',sep='\t',index=None)

In [None]:
ls ../JKBio/

In [None]:
%%R
#source('src/load_libraries_and_annotations.R')
# There are some cell lines that celllinemapr does not know how to map, so for now we
# still load this data object (built from old datasets).
load('src/Annotations.rdata')
# Project post-processing helpers used by the R cells below.
source('src/CCLE_postp_function.R')
# cdsomics was attached twice in the previous version of this cell; once is enough.
library('cdsomics')
library(tidyverse)
library(data.table)
library(magrittr)
library(taigr)
library(celllinemapr) # To pull out DepMap_IDs from CCLE_names where needed

In [None]:
%%R
# Read the renamed merged MAF written by the Python cell above and build the release object from it
# (readMutations / createSNPs are project helpers, not shown here).
newly_merged_maf <- readMutations('temp/mutation_filtered_terra_merged.txt')
new_release <- createSNPs(newly_merged_maf)
# Display the names of the returned object.
names(new_release)

In [None]:
%%R
# Project helper, not shown here; presumably aligns the column names with the main mutation file — TODO confirm.
new_release <- renameAsInMainMutation(new_release)

In [None]:
%%R
# The result is used below as a list with (at least) $merged and $removed_from_maf.
filtered <- filterAllelicFraction(new_release)

In [None]:
%%R
# Write the $merged part of the allelic-fraction filter result as a comma-separated file
# (unquoted, without row names) named after the release.
write.table(
  filtered$merged, 
  paste0('temp/newmutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)

In [None]:
%%R
# Pass both parts of the previous result to the coverage filter; `filtered` is overwritten with its return value.
# NOTE(review): the following cells annotate and export `new_release`, and `filtered` is not read again in the
# visible cells — confirm whether the filtered calls are supposed to reach the exported files.
filtered <- filterMinCoverage(filtered$merged, filtered$removed_from_maf)

In [None]:
%%R
# NOTE(review): `merged` is not assigned in any visible cell (the filter output lives in `filtered`);
# unless the sourced script or Annotations.rdata defines it, this cell fails — confirm.
head(merged)

In [None]:
%%R
# NOTE(review): neither `merged` nor `previous.release.maf` is assigned in the visible cells (they may come from
# Annotations.rdata or the sourced script), and `clean_annotations` is not read afterwards — confirm this cell is needed.
clean_annotations <- mergeAnnotations(merged,previous.release.maf)

In [None]:
%%R

# Allie's version of the annotation step (addAnnotation is a project helper, not shown here).
# It is applied to `new_release`, i.e. not to the `filtered` object produced by the filter cells above.
new_release <- addAnnotation(new_release)

In [None]:
%%R
# Some matrix files that are used internally and might be useful to others.
# One call per flag (damaging / other / hotspot), each on the annotated `new_release`.
damaging_mutation <- mutation_maf_to_binary_matrix(new_release, damaging =  TRUE)
other_mutation <- mutation_maf_to_binary_matrix(new_release, other = TRUE)
hotspot_mutation <- mutation_maf_to_binary_matrix(new_release, hotspot = TRUE)

In [None]:
%%R
# Save the files that will be uploaded to taiga, all comma-separated and unquoted.
# 1. The annotated MAF, without row names.
write.table(
  new_release, 
  paste0('temp/mutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)
# 2. The damaging-mutation matrix (row names are kept for the three matrices).
write.table(
  damaging_mutation, 
  paste0('temp/damaging_mutation.', release, '.all.csv'), sep = ',', quote = F)
# 3. The "other" mutation matrix.
write.table(
  other_mutation, 
  paste0('temp/other_mutation.', release, '.all.csv'), sep = ',', quote = F)
# 4. The hotspot-mutation matrix.
write.table(
  hotspot_mutation, 
  paste0('temp/hotspot_mutation.', release, '.all.csv'), sep = ',', quote = F)

# Validation

In [None]:
# Load the annotated MAF that the R cells wrote to temp/ for this release.
newmutations = pd.read_csv('temp/mutations.'+release+'.all.csv')

In [None]:
# MAF of the previous release, fetched from taiga.
# NOTE(review): dataset version and file name are hard-coded here instead of being derived from prevname / prevversion.
mutations_20Q2_all = tc.get(name='depmap-mutations-maf-35fe', version=14, file='mutations.20Q2.all')

In [None]:
# Keep only the previous-release rows whose line also appears in the new MAF
# (original note: this removes all of the WGS and SNP-array lines).
mutations_20Q2_all = mutations_20Q2_all.loc[
    ~mutations_20Q2_all['DepMap_ID'].isin(
        set(mutations_20Q2_all['DepMap_ID']).difference(set(newmutations['Tumor_Sample_Barcode']))
    )
]

In [None]:
# Lines present in the new MAF but absent from the previous release.
# Open question: are they simply too recent, or were they removed on purpose — and if so, how and why?
set(newmutations.Tumor_Sample_Barcode) - set(mutations_20Q2_all.DepMap_ID)

In [None]:
# Restrict the new MAF to the lines that also exist in the previous release,
# so that both tables describe the same set of lines.
newmutations = newmutations.loc[
    ~newmutations['Tumor_Sample_Barcode'].isin(
        set(newmutations['Tumor_Sample_Barcode']).difference(set(mutations_20Q2_all['DepMap_ID']))
    )
]

In [None]:
newmutations = newmutations.sort_values(by=['Tumor_Sample_Barcode','Chromosome','Start_position','End_position'])
mutations_20Q2_all = mutations_20Q2_all.sort_values(by=['DepMap_ID','Chromosome','Start_position','End_position'])

In [None]:
# Cast the four key columns to str so that they can be concatenated.
newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']] = newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']].astype(str)
# One key per row, of the form "<sample>_<chromosome>:<start>-<end>".
newmutationsset = newmutations['Tumor_Sample_Barcode']+'_'+newmutations['Chromosome']+':'+newmutations['Start_position']+'-'+newmutations['End_position']
# Store the key on the frame as well; later cells select rows through `grouped`.
newmutations['grouped'] = newmutationsset

In [None]:
len(newmutationsset)

In [None]:
dups = h.dups(newmutationsset)

In [None]:
len(dups)

In [None]:
newmutationsset = set(newmutationsset)

In [None]:
# Same key construction as for the new MAF, with DepMap_ID as the sample column:
# cast the key columns to str, build "<sample>_<chromosome>:<start>-<end>", store it as `grouped`.
mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']] = mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']].astype(str)
mutations_20Q2_allset = mutations_20Q2_all['DepMap_ID']+'_'+mutations_20Q2_all['Chromosome']+':'+mutations_20Q2_all['Start_position']+'-'+mutations_20Q2_all['End_position']
mutations_20Q2_all['grouped'] = mutations_20Q2_allset

In [None]:
len(mutations_20Q2_allset)

In [None]:
len(set(mutations_20Q2_allset))

In [None]:
dups = h.dups(mutations_20Q2_allset)

In [None]:
## How come?? the merging did not work well
len(dups)

In [None]:
mutations_20Q2_all[mutations_20Q2_all['grouped']==dups[0]]

In [None]:
submutations_20Q2_all = mutations_20Q2_all[~mutations_20Q2_all.CGA_WES_AC.isna()]

In [None]:
len(submutations_20Q2_all) 

In [None]:
submutations_20Q2_allset = submutations_20Q2_all.grouped

In [None]:
mutations_20Q2_allset = mutations_20Q2_all.grouped

In [None]:
# Share of the distinct previous-release CGA WES mutation keys that are also present in the new MAF.
# submutations_20Q2_allset is a Series whose keys may repeat, so the denominator is de-duplicated too:
# dividing a distinct-key count by a row count would understate the overlap.
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(set(submutations_20Q2_allset))

In [None]:
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(newmutationsset)

The same comparison, now against every mutation of the previous release (all pipelines, not only the rows with a CGA WES call):

In [None]:
len(set(mutations_20Q2_allset) & set(newmutationsset))/ len(newmutationsset)

In [None]:
# Share of the distinct previous-release mutation keys (all pipelines) that are also present in the new MAF.
# mutations_20Q2_allset is a Series that contains repeated keys (see the duplicate check above), so the
# denominator is de-duplicated to match the distinct-key numerator.
len(set(mutations_20Q2_allset) & set(newmutationsset))/ len(set(mutations_20Q2_allset))

In [None]:
submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
notfound = submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
len(set(notfound.DepMap_ID))

In [None]:
# Not-found mutations that carry a value in at least one of the other allele-count columns.
notfound[
    notfound[
        ['SangerRecalibWES_AC', 'SangerWES_AC', 'HC_AC', 'WGS_AC', 'RNAseq_AC', 'RD_AC']
    ].notna().any(axis=1)
]

Only one third of the not-found mutations were backed by any other analysis, which suggests a false-positive rate of about 2/3 among them.

In [None]:
len(set(newmutationsset) - set(mutations_20Q2_allset))

In [None]:
newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]

In [None]:
len(set(newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))].Tumor_Sample_Barcode))

In [None]:
# First ':'-separated field of CGA_WES_AC, as integers, for the mutations whose key
# does not exist in the previous release.
alternate = np.array(
    newmutations.loc[
        newmutations['grouped'].isin(set(newmutationsset).difference(set(mutations_20Q2_allset))),
        'CGA_WES_AC',
    ].str.split(':').tolist()
).astype(int)[:, 0]

In [None]:
alternate.mean()

In [None]:
alternate.min()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Distribution of the `alternate` values (first field of CGA_WES_AC) among the mutations found only in the new MAF.
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
# np.unique returns sorted values, so this draws the 40 smallest distinct counts.
ax=sns.barplot(x=a[:40],y=b[:40], ci=None, ax=ax)
ax.set_yscale('log')
# Label the figure so that it can be read on its own.
ax.set_xlabel('first field of CGA_WES_AC (alternate count)')
ax.set_ylabel('number of mutations (log scale)')
ax.set_title('Mutations found only in the new release');

In [None]:
len(alternate[alternate>2]) / len(alternate) 

In [None]:
len(alternate[alternate>3]) / len(alternate) 

In [None]:
# Rows of the new MAF whose key is absent from the previous release.
onlyinnew = newmutations.loc[
    newmutations['grouped'].isin(set(newmutationsset).difference(set(mutations_20Q2_allset)))
]
# Among them, keep those with a first CGA_WES_AC field above 3 and PERC_CGA_WES_AC above 0.3.
highconf = onlyinnew.loc[
    (onlyinnew['CGA_WES_AC'].str.split(':').str[0].values.astype(int) > 3)
    & (onlyinnew['PERC_CGA_WES_AC'].values.astype(float) > 0.3)
]

In [None]:
len(highconf)/len(onlyinnew)

In [None]:
newmutations

In [None]:
def filterCoverage(maf, loc=['CGA_WES_AC'], sep=':',cov=4):
    """Keep the rows of `maf` whose summed second count field is at least `cov`.

    Every column named in `loc` is expected to hold strings made of two
    `sep`-separated integer counts (e.g. "12:30"). Missing values count as
    "0<sep>0"; when one of the fields is the literal 'NA', the second count
    is taken to be 0.

    Args:
        maf: pandas DataFrame containing the columns listed in `loc`.
        loc: names of the count columns whose values are summed.
        sep: separator between the two counts.
        cov: minimum summed value of the second field for a row to be kept.

    Returns:
        The rows of `maf` passing the threshold (a copy when `maf` is empty).
    """
    # An empty table has nothing to parse (and an empty array would not broadcast below).
    if len(maf) == 0:
        return maf.copy()
    muts = np.zeros((len(maf), 2))
    for val in loc:
        # Read the counts from `maf` itself so that the mask always lines up with its rows.
        pairs = maf[val].fillna('0'+sep+'0').astype(str).str.split(sep).tolist()
        muts += np.array([[v[0], 0] if 'NA' in v else v for v in pairs]).astype(int)
    # NOTE(review): the threshold is applied to the second field, while the notebook's
    # high-confidence cell thresholds the first field of CGA_WES_AC — confirm which one is meant.
    return maf[muts[:, 1] >= cov]

def filterAllelicFraction(maf, loc=['CGA_WES_AC'], sep=':',frac=0.3):
    """Keep the rows of `maf` whose allelic fraction is at least `frac`.

    Every column named in `loc` is expected to hold strings made of two
    `sep`-separated integer counts. Missing values count as "0<sep>0"; when
    one of the fields is the literal 'NA', the second count is taken to be 0.
    The fraction is first / (first + second), summed over the columns in `loc`.

    NOTE(review): this assumes the two fields are alternate and reference read
    counts, in that order — confirm against the MAF specification.

    Args:
        maf: pandas DataFrame containing the columns listed in `loc`.
        loc: names of the count columns whose values are summed.
        sep: separator between the two counts.
        frac: minimum allelic fraction for a row to be kept.

    Returns:
        The rows of `maf` passing the threshold (a copy when `maf` is empty).
        Rows with no reads at all have an undefined fraction and are dropped.
    """
    # An empty table has nothing to parse (and an empty array would not broadcast below).
    if len(maf) == 0:
        return maf.copy()
    muts = np.zeros((len(maf), 2))
    for val in loc:
        # Read the counts from `maf` itself so that the mask always lines up with its rows.
        pairs = maf[val].fillna('0'+sep+'0').astype(str).str.split(sep).tolist()
        muts += np.array([[v[0], 0] if 'NA' in v else v for v in pairs]).astype(int)
    depth = muts[:, 0] + muts[:, 1]
    # Divide only where there are reads; rows without reads stay NaN and fail the comparison.
    fraction = np.full(len(maf), np.nan)
    np.divide(muts[:, 0], depth, out=fraction, where=depth > 0)
    with np.errstate(invalid='ignore'):
        keep = fraction >= frac
    return maf[keep]

def mergeAnnotations(newmaf, additionalmaf):
    """Placeholder: the Python version of this step has not been written yet.

    The %%R cells of this notebook call an R function of the same name; this
    stub only reserves the Python name so that the cell defining it parses.

    Args:
        newmaf: first MAF table (unused for now).
        additionalmaf: second MAF table (unused for now).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("mergeAnnotations is not implemented in Python yet")

def addAnnotation(maf):
    """Placeholder: the Python version of this step has not been written yet.

    The %%R cells of this notebook call an R function of the same name; this
    stub only reserves the Python name so that the cell defining it parses.

    Args:
        maf: MAF table (unused for now).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("addAnnotation is not implemented in Python yet")

def mafToMat(maf, col, boolify = False, samplesCol = "DepMap_ID", mutNameCol="Hugo_Symbol"):
    """Turn a MAF-style table into a (mutation name x sample) matrix.

    The draft this replaces never used `col`; it is read here as the column
    whose values fill the matrix — NOTE(review): confirm that this is the intent.

    Args:
        maf: pandas DataFrame with the columns `col`, `samplesCol` and `mutNameCol`.
        col: name of the column providing the matrix values.
        boolify: if True return booleans (value != 0), otherwise floats.
        samplesCol: column holding the sample identifiers (matrix columns).
        mutNameCol: column holding the mutation / gene names (matrix rows).

    Returns:
        DataFrame indexed by the sorted unique values of `mutNameCol`, with one
        column per unique value of `samplesCol`; absent pairs are 0 (False).
        When a (name, sample) pair occurs several times, its first row is used.
    """
    # pivot() refuses duplicated (index, column) pairs, so keep one row per pair.
    unique_calls = maf.drop_duplicates(subset=[mutNameCol, samplesCol])
    mat = unique_calls.pivot(index=mutNameCol, columns=samplesCol, values=col)
    return mat.fillna(0).astype(bool if boolify else float)
    

In [None]:
# Unfinished draft of a Python version of the R post-processing above. It does not run as written:
# see the FIXME / NOTE(review) markers below.
# NOTE(review): `mutations` is read here but only assigned further down in this same cell.
filtered_mutations = filterCoverage(mutations)
filtered_mutations = filterAllelicFraction(filtered_mutations)

# NOTE(review): annotation is applied to the unfiltered `mutations`, and `merged_mutations` is not read afterwards.
merged_mutations = addAnnotation(mutations)

# NOTE(review): mafToMat is called without its required `col` argument, and the three matrices
# are all written to the same placeholder path '.csv'.
mafToMat(filtered_mutations[filtered_mutations.damaging]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.other]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.hotspot]).to_csv('.csv')


# FIXME: the right-hand side is missing, which is a SyntaxError for the whole cell.
CCLE2othermutations = 

mutations = mergeAnnotations(filtered_mutations, CCLE2othermutations)

# Making one matrix per mutation type; the list of types and the output file prefix are still empty placeholders.
for muttype in ['']:
    mafToMat(CCLE2othermutations[CCLE2othermutations.damaging & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.other & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.hotspot & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")

## DMC

In [None]:
# Update the release-readme checkout and move this release's DMC readme to temp/README.
# Uses `release` from the config cell: the former spelling `releAse` is not assigned in any
# visible cell and would raise a NameError.
os.system('cd ../depmap-release-readmes && git pull && mv release-'+release+'/dmc-'+release+'.txt ../ccle_processing/temp/README && cd -')

In [None]:
# DMC release: remove the lines listed in `wes_embargo` from the MAF and from each matrix, print the row counts
# before and after, and write the files that the update_dataset cell below uploads.
# NOTE(review): `wes_embargo` and the Python-side `damaging_mutation` / `other_mutation` / `hotspot_mutation`
# are not assigned in any visible Python cell (the matrices are only built inside the %%R cells), and `mutations`
# is only assigned in the unfinished draft cell above — confirm where these come from.
print(len(mutations))
mutations = mutations[~mutations.DepMap_ID.isin(wes_embargo)]
print(len(mutations))
mutations.to_csv('temp/depmap_'+release+'_mutation_calls.all', index=False)
# The matrices are filtered on their index, i.e. they are expected to be indexed by line ID.
print(len(damaging_mutation))
damaging_mutation = damaging_mutation[~damaging_mutation.index.isin(wes_embargo)]
print(len(damaging_mutation))
damaging_mutation.to_csv('temp/damaging_mutation.all')
print(len(other_mutation))
other_mutation = other_mutation[~other_mutation.index.isin(wes_embargo)]
print(len(other_mutation))
other_mutation.to_csv('temp/other_mutation.all',)
print(len(hotspot_mutation))
hotspot_mutation = hotspot_mutation[~hotspot_mutation.index.isin(wes_embargo)]
print(len(hotspot_mutation))
hotspot_mutation.to_csv('temp/hotspot_mutation.all',)

In [None]:
# Previous DMC MAF from taiga, used to compare the sets of lines between releases.
prevmut = tc.get(name='depmap-mutation-calls-dfce', version=15, file='depmap_'+prevname+'_mutation_calls')
# Lines of the previous release that are missing now; the expected output is an empty set.
print('shoud be None')
print(set(prevmut.DepMap_ID) - set(mutations.DepMap_ID))
print("new lines")
# Lines that are new in this release (a set); it is appended to the dataset description in the upload cell.
newlines = set(mutations.DepMap_ID) - set(prevmut.DepMap_ID) 
newlines

In [None]:
# Upload the DMC MAF and the three matrices as a new version of the taiga dataset.
# `newlines` is a set, so it is converted with str() before being appended to the description
# (concatenating a str with a set raises a TypeError); the public upload cell does the same.
tc.update_dataset(dataset_permaname="depmap-mutation-calls-dfce",
                 upload_file_path_dict={'temp/depmap_'+release+'_mutation_calls.all': 'TableCSV',
                                        'temp/damaging_mutation.all': 'NumericMatrixCSV',
                                        'temp/other_mutation.all': 'NumericMatrixCSV',
                                        'temp/hotspot_mutation.all': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# DMC Mutations

* Version 1-5 DMC 19Q1*

version 5 is a one-off portal thing because dmc wanted to be able to plot if a gene has any mutation as one-hot encoded value in the x/y axes of the data explorer It adds the any_mutation matrix, but does not change the others. Code used to generate:

```
from taigapy import TaigaClient

c = TaigaClient()

dmc_19q1_mutation_taiga_root = "depmap-mutation-calls-dfce.3/"
other_matrix = c.get(dmc_19q1_mutation_taiga_root + "other_mutation")
damaging_matrix = c.get(dmc_19q1_mutation_taiga_root + "damaging_mutation")
hotspot_matrix = c.get(dmc_19q1_mutation_taiga_root + "hotspot_mutation")

df = other_matrix.append(damaging_matrix)
df = df.groupby(level=0).sum()

df = df.append(hotspot_matrix)
df = df.groupby(level=0).sum()

df[df > 1] = 1

df.to_csv('any_mutation.csv')
```
The code uses version 3 because the dmc portal was using version 3

version 4 updates the column name from VA_WES_AC to CCLE_WES_AC

version 3 has an updated definition for hotspot mutations

version 2+ contains the correct data for 19Q1

* Version 6 DMC 19Q2*

* Version 7-8 DMC 19Q3*
version 8 fixed entrez ids

* Version 9 DMC 19Q4*
adding 52 new cell lines.

* Version 10 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 11 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 12 Internal 20Q1*
uploading 8 new lines

* Version 13 Internal 20Q1*
removing unauthorized cl

* Version 14 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 15 Internal 20Q2*
removing 2 lines

* Version 15 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 15 Internal 20Q3*
updating the blacklists


MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:
- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines))

In [None]:
# To add to a virtual dataset
# NOTE(review): `virtual_dmc` is not assigned in any visible cell (the config cell only sets `virtual_internal`) — confirm.
AddToVirtual(virtual_dmc, 'depmap-mutation-calls-dfce', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls'),])#('README','README')])

## Public

In [None]:
# Update the release-readme checkout and rename this release's public readme to README.
# Uses `release` from the config cell: the former spelling `releAse` is not assigned in any
# visible cell and would raise a NameError.
# NOTE(review): the destination is a bare `README` inside the readme checkout, whereas the DMC
# cell moves its file to ../ccle_processing/temp/README — confirm which location is intended.
os.system('cd ../depmap-release-readmes && git pull && mv release-'+release+'/public-'+release+'.txt README && cd -')

In [None]:

#damaging_mutation
# NOTE(review): `depmap_20Q3_mutation_calls` is not assigned in any visible cell and hard-codes the release name — confirm.
mutations=depmap_20Q3_mutation_calls
#hotspot_mutation
#other_mutation

In [None]:
# Public release: keep only the lines listed in `prevprev`, then remove those in `wes_dmc_embargo`, for the MAF
# and for each matrix; row counts are printed before and after.
# NOTE(review): `prevprev` and `wes_dmc_embargo` are not assigned in any visible cell — confirm where they come from.
# NOTE(review): the outputs use the same temp/ file names as the DMC cell, so the DMC files are overwritten here.
print(len(mutations))
mutations = mutations[mutations.DepMap_ID.isin(prevprev)]
mutations = mutations[~mutations.DepMap_ID.isin(wes_dmc_embargo)]
print(len(mutations))
mutations.to_csv('temp/depmap_'+release+'_mutation_calls.all', index=False)
# The matrices are filtered on their index, i.e. they are expected to be indexed by line ID.
print(len(damaging_mutation))
damaging_mutation = damaging_mutation[damaging_mutation.index.isin(prevprev)]
damaging_mutation = damaging_mutation[~damaging_mutation.index.isin(wes_dmc_embargo)]
print(len(damaging_mutation))
damaging_mutation.to_csv('temp/damaging_mutation.all')
print(len(other_mutation))
other_mutation = other_mutation[other_mutation.index.isin(prevprev)]
other_mutation = other_mutation[~other_mutation.index.isin(wes_dmc_embargo)]
print(len(other_mutation))
other_mutation.to_csv('temp/other_mutation.all')
print(len(hotspot_mutation))
hotspot_mutation = hotspot_mutation[hotspot_mutation.index.isin(prevprev)]
hotspot_mutation = hotspot_mutation[~hotspot_mutation.index.isin(wes_dmc_embargo)]
print(len(hotspot_mutation))
hotspot_mutation.to_csv('temp/hotspot_mutation.all')

In [None]:
# Previous public MAF from taiga, used to compare the sets of lines between releases.
prevmut = tc.get(name='depmap-mutation-calls-9a1a', version=18, file='depmap_'+prevname+'_mutation_calls')
print('shoud be None')
# Lines of the previous release that are missing now (expected: empty); reported in the description if not empty.
ermgency_removed = set(prevmut.DepMap_ID) - set(mutations.DepMap_ID)
print(ermgency_removed) 
print("new lines")
# Lines that are new in this release (a set); appended to the dataset description in the next cell.
newlines = set(mutations.DepMap_ID) - set(prevmut.DepMap_ID) 
newlines

In [None]:
description="""
# Public Mutations

Mutation calls for Public DepMap data

* Version 1 Public 18Q1*

original source: CCLE data portal
* Version 2 Public 18Q2*

merged mutations and indels file (1,549 cell lines total, including data for 63 newly released cell lines)
original source: `/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_maf_20180502.txt`
* Version 3-4 Public 18Q3*

version 3 deprecated

original source: `/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_maf_20180718.txt`

Binary matrices:
damaging: if isDeleterious is true
missense: if isDeleterious is false
hotspot: if missense and either TCGA or COSMIC hotspot
Rows: cell line, Broad (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

MAF file

* Version 5 Public 18Q4*

original source: `/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_maf_20181029.txt`

* Version 6-9 Public 19Q1*

version 9 updates the column name from VA_WES_AC to CCLE_WES_AC

version 8 uses an updated definition for hotspot mutations

version 9 contains the correct data for 19Q1

* Version 10 Public 19Q2*

* Version 11-12 Public 19Q3*

version 12 fixed entrez ids

* Version 13 Public 19Q4*

adding 52 new cell lines

* Version 14 Public 19Q4*
removing unauthorized lines and setting matrices

* Version 15 Public 20Q1*
adding 8 new lines 

* Version 16 Public 20Q1*
removing an unauthorized line

* Version 17 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 18 Internal 20Q2*
removing 2 lines

* Version 19 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 20 Internal 20Q3*
updating the blacklists

* Version 21 Internal 20Q3*
updating the dmc

* Version 22 Internal 20Q3*
readding two already released samples to the public list

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:

- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines)

# When lines of the previous release are missing from this one, say so at the end of the description.
if len(ermgency_removed):
    description+="""
    
    !! WE REMOVED!!:
    """+str(ermgency_removed)

# Upload the public MAF and the three matrices as a new version of the taiga dataset
# (the README entry of the upload dict is commented out).
tc.update_dataset(dataset_permaname="depmap-mutation-calls-9a1a",
                 upload_file_path_dict={'temp/depmap_'+release+'_mutation_calls.all': 'TableCSV',
                                        'temp/damaging_mutation.all': 'NumericMatrixCSV',
                                        'temp/other_mutation.all': 'NumericMatrixCSV',
                                        'temp/hotspot_mutation.all': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description=description)

In [None]:
# To add to a virtual dataset
# NOTE(review): `virtual_public` is not assigned in any visible cell (the config cell only sets `virtual_internal`) — confirm.
AddToVirtual(virtual_public, 'depmap-mutation-calls-9a1a', [('CCLE_mutations', 'depmap_'+release+'_mutation_calls'),])#('README','README')])