In [None]:
from __future__ import print_function

import gzip
import os.path
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from IPython.display import Image,display

import matplotlib.pyplot as plt
import seaborn as sns



%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

# boot up

We instantiate all the parameters this pipeline needs in order to run.

In [None]:
samplesetname = "20Q3"
prevname="20Q2"
prevversion = 22
prevprevname ='20Q1'
prevprevversion= 20
virtual_internal='internal-20q3-00d0'

refworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline"



refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

release = samplesetname

In [None]:
%%R
release <- '20Q3'
prevname <- '20Q2'
genome_version <- 'hg19'
taiga_version <- 10
prevversion <-13

In [None]:
refwm = dm.WorkspaceManager(refworkspace)

In [None]:
res = refwm.get_sample_sets().loc["all"]
res

In [None]:
filtered = res['filtered_CGA_MAF_aggregated']
! gsutil cp $filtered "temp/mutation_filtered_terra_merged.txt"

In [None]:
# Load the filtered, aggregated MAF copied to temp/ by the previous cell.
file = pd.read_csv('temp/mutation_filtered_terra_merged.txt',sep='\t') 
print(file.columns[:10])
# `removeOlderVersions` is not defined in this notebook (presumably it comes from
# one of the star imports). Its result is used below as a mapping: its keys select
# the rows to keep and its values replace Tumor_Sample_Barcode.
renaming = removeOlderVersions(names = set(file['Tumor_Sample_Barcode']), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")
# Show any rows whose Chromosome is the string '0'.
print(file[file['Chromosome']=='0'])
# Keep only the barcodes present in `renaming`, rename them, and write the result
# back over the input file.
# NOTE(review): because the input file is overwritten with the renamed barcodes,
# re-running this cell reads the already-renamed file -- confirm that is safe, or
# write to a separate output path.
file[file['Tumor_Sample_Barcode'].isin(renaming.keys())].replace({'Tumor_Sample_Barcode':renaming}).reset_index(drop=True).to_csv('temp/mutation_filtered_terra_merged.txt',sep='\t',index=None)

In [None]:
ls ../JKBio/

# Analysis

In [None]:
%%R
#source('src/load_libraries_and_annotations.R')
load('src/Annotations.rdata') 
# There are some cell lines the celllinemapr does not know how to map so we need to load this data object for now (from old datasets)
source('src/CCLE_postp_function.R')
library('cdsomics')
library(tidyverse)
library(data.table)
library(magrittr)
library(taigr)
library(cdsomics)
library(celllinemapr) # To pull out DepMap_IDs from CCLE_names where needed

In [None]:
%%R
newly_merged_maf <- readMutations('temp/mutation_filtered_terra_merged.txt')
new_release <- createSNPs(newly_merged_maf)
names(new_release)

In [None]:
%%R
new_release <- renameAsInMainMutation(new_release)

In [None]:
%%R
filtered <- filterAllelicFraction(new_release)

In [None]:
%%R
write.table(
  filtered$merged, 
  paste0('temp/newmutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)

In [None]:
%%R
filtered <- filterMinCoverage(filtered$merged, filtered$removed_from_maf)

In [None]:
%%R
# NOTE(review): `merged` is not assigned in any visible cell (the filter steps
# store their results in `filtered`); it may come from the loaded .rdata file or
# a sourced script -- confirm.
head(merged)

In [None]:
%%R
# NOTE(review): `previous.release.maf` is not assigned in the visible cells
# either, and `clean_annotations` is not read again further down -- confirm
# whether this step still feeds the release.
clean_annotations <- mergeAnnotations(merged,previous.release.maf)

In [None]:
%%R

# Allie's version
new_release <- addAnnotation(new_release)

In [None]:
%%R
# Binary mutation matrices (damaging / other / hotspot) that are used internally
# and may be useful to others.
damaging_mutation <- mutation_maf_to_binary_matrix(new_release, damaging =  TRUE)
other_mutation <- mutation_maf_to_binary_matrix(new_release, other = TRUE)
hotspot_mutation <- mutation_maf_to_binary_matrix(new_release, hotspot = TRUE)

In [None]:
%%R
# Save the ready to upload file to upload to taiga
write.table(
  new_release, 
  paste0('temp/mutations.', release, '.all.csv'), sep = ',', quote = F, row.names = F)
# Save the ready to upload file to upload to taiga
write.table(
  damaging_mutation, 
  paste0('temp/damaging_mutation.', release, '.all.csv'), sep = ',', quote = F)
# Save the ready to upload file to upload to taiga
write.table(
  other_mutation, 
  paste0('temp/other_mutation.', release, '.all.csv'), sep = ',', quote = F)
# Save the ready to upload file to upload to taiga
write.table(
  hotspot_mutation, 
  paste0('temp/hotspot_mutation.', release, '.all.csv'), sep = ',', quote = F)

In [None]:
newmutations = pd.read_csv('temp/mutations.'+release+'.all.csv')

In [None]:
mutations_20Q2_all = tc.get(name='depmap-mutations-maf-35fe', version=14, file='mutations.20Q2.all')

# Comparing

## initial, simple

In [None]:
# Drop the 20Q2 rows whose cell line does not appear in the new release
# (original note: this removes all of the WGS and SNP-array lines).
only_in_20q2 = set(mutations_20Q2_all.DepMap_ID).difference(newmutations.Tumor_Sample_Barcode)
keep_rows = ~mutations_20Q2_all.DepMap_ID.isin(only_in_20q2)
mutations_20Q2_all = mutations_20Q2_all[keep_rows]

### strange new cell lines

In [None]:
#Too recent? removed by what means? why?
set(newmutations.Tumor_Sample_Barcode) - set(mutations_20Q2_all.DepMap_ID)

In [None]:
# Drop the new-release rows whose cell line is absent from mutations_20Q2_all.
only_in_new = set(newmutations.Tumor_Sample_Barcode).difference(mutations_20Q2_all.DepMap_ID)
newmutations = newmutations[~newmutations.Tumor_Sample_Barcode.isin(only_in_new)]

In [None]:
newmutations = newmutations.sort_values(by=['Tumor_Sample_Barcode','Chromosome','Start_position','End_position'])
mutations_20Q2_all = mutations_20Q2_all.sort_values(by=['DepMap_ID','Chromosome','Start_position','End_position'])

In [None]:
newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']] = newmutations[['Tumor_Sample_Barcode','Chromosome','Start_position','End_position']].astype(str)
newmutationsset = newmutations['Tumor_Sample_Barcode']+'_'+newmutations['Chromosome']+':'+newmutations['Start_position']+'-'+newmutations['End_position']
newmutations['grouped'] = newmutationsset

In [None]:
len(newmutationsset)

In [None]:
dups = h.dups(newmutationsset)

In [None]:
len(dups)

In [None]:
newmutationsset = set(newmutationsset)

In [None]:
mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']] = mutations_20Q2_all[['DepMap_ID','Chromosome','Start_position','End_position']].astype(str)
mutations_20Q2_allset = mutations_20Q2_all['DepMap_ID']+'_'+mutations_20Q2_all['Chromosome']+':'+mutations_20Q2_all['Start_position']+'-'+mutations_20Q2_all['End_position']
mutations_20Q2_all['grouped'] = mutations_20Q2_allset

In [None]:
len(mutations_20Q2_allset)

In [None]:
dups = h.dups(mutations_20Q2_allset)

### issues with duplicates

In [None]:
# Why are there duplicated keys? It looks like the merge did not work well.
len(dups)

In [None]:
set(mutations_20Q2_all.Variant_Classification)

In [None]:
mutations_20Q2_all[mutations_20Q2_all['grouped']==dups[0]]

In [None]:
mutations_20Q2_all[mutations_20Q2_all['grouped']==dups[0]][mutations_20Q2_all.columns[:20]]

In [None]:
submutations_20Q2_all = mutations_20Q2_all[~(mutations_20Q2_all.CGA_WES_AC.isna())] #& mutations_20Q2_all.SangerRecalibWES_AC.isna())]

In [None]:
len(submutations_20Q2_all) 

In [None]:
len(set(submutations_20Q2_all.DepMap_ID))

In [None]:
submutations_20Q2_allset = submutations_20Q2_all.grouped

In [None]:
mutations_20Q2_allset = mutations_20Q2_all.grouped

### similarity

In [None]:
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(submutations_20Q2_allset)

In [None]:
[i for i in submutations_20Q2_allset if type(i)!= str]

In [None]:
len(set(submutations_20Q2_allset) - set(newmutationsset))

In [None]:
len(set(submutations_20Q2_allset) & set(newmutationsset))/ len(newmutationsset)

In [None]:
subnewmutationsset = set(newmutations[newmutations.Tumor_Sample_Barcode.isin(submutations_20Q2_all.DepMap_ID)].grouped)
len(set(submutations_20Q2_allset) & set(subnewmutationsset))/ len(subnewmutationsset)

In [None]:
len(set(subnewmutationsset)- set(submutations_20Q2_allset))

In [None]:
# Rows of the new release whose mutation key is shared with the CGA-backed 20Q2 subset.
# Both sets hold `grouped` keys (barcode_chrom:start-end), so they have to be matched
# against the `grouped` column; matching them against Tumor_Sample_Barcode (a bare
# sample id) can never succeed and always produced an empty frame.
on = newmutations[newmutations.grouped.isin(set(submutations_20Q2_allset) & set(subnewmutationsset))]

### similarity with everything from all datasets

In [None]:
len(set(newmutationsset) - set(mutations_20Q2_allset))

In [None]:
len(set(mutations_20Q2_allset) & set(newmutationsset))/ len(mutations_20Q2_allset)

In [None]:
len(mutations_20Q2_all[mutations_20Q2_all.CGA_WES_AC.isna() & mutations_20Q2_all.SangerRecalibWES_AC.isna()])/ len(mutations_20Q2_all)

# INVESTIGATE ??

## correct for number of cell lines and look at it for each columns ??

In [None]:
len(set(mutations_20Q2_all[mutations_20Q2_all.DepMap_ID.isin(newmutations.Tumor_Sample_Barcode)].grouped) - set(newmutations[newmutations.Tumor_Sample_Barcode.isin(mutations_20Q2_all.DepMap_ID)].grouped))

In [None]:
submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
notfound = submutations_20Q2_all[submutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(newmutationsset))]

In [None]:
len(notfound)

In [None]:
# how many genes
len(set(notfound.Hugo_Symbol))

In [None]:
# NOTE(review): placeholder -- `set()` is empty, so this always evaluates to 0;
# the intended argument is missing.
len(set())

In [None]:
newmutations

In [None]:
newmutations[newmutations.Tumor_Sample_Barcode.isin(set(notfound.DepMap_ID))]

In [None]:
notfound[~(notfound.SangerRecalibWES_AC.isna() & notfound.SangerWES_AC.isna() & notfound.HC_AC.isna() & notfound.WGS_AC.isna() & notfound.RNAseq_AC.isna() &notfound.RD_AC.isna())]

### only one half of the notfound mutations were backed by any other analysis, showing an FP rate of 50%

In [None]:
len(set(newmutationsset) - set(mutations_20Q2_allset))

### what about the other way around? need to merge

In [None]:
len(set(subnewmutationsset) - set(submutations_20Q2_allset))

In [None]:
len(set(subnewmutationsset) - set(submutations_20Q2_allset))

In [None]:
len(set(mutations_20Q2_allset) & (set(subnewmutationsset) - set(submutations_20Q2_allset)))

In [None]:
len(set(newmutations[newmutations.grouped.isin(set(subnewmutationsset) - set(submutations_20Q2_allset))].Tumor_Sample_Barcode))

In [None]:
len(set(submutations_20Q2_allset)-set(subnewmutationsset))

In [None]:
len(set(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(subnewmutationsset))].Tumor_Sample_Barcode))

In [None]:
len(set(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset))].Tumor_Sample_Barcode))

In [None]:
newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]

In [None]:
len(set(newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))].Tumor_Sample_Barcode))

In [None]:
len(set(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(mutations_20Q2_allset) - set(newmutationsset))].Tumor_Sample_Barcode))

In [None]:
a = np.array(mutations_20Q2_all[~mutations_20Q2_all['CGA_WES_AC'].isna()]['CGA_WES_AC'].str.split(':').tolist())[:,0].astype(int)

In [None]:
mutations_20Q2_all[~mutations_20Q2_all['CGA_WES_AC'].isna()].iloc[np.where(a==0)]

In [None]:
a = np.array(newmutations['CGA_WES_AC'].str.split(':').tolist()).astype(int)

In [None]:
a = np.array(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(subnewmutationsset))]['CGA_WES_AC'].str.split(':').tolist())

In [None]:
np.where(a=='NA')

In [None]:
a[1148]

In [None]:
mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(subnewmutationsset))]['CGA_WES_AC'].iloc[1148]

In [None]:
# Distribution of the alternate-allele count (first field of CGA_WES_AC) for the
# CGA-backed 20Q2 mutations that are absent from the new release.
# `plt` and `sns` are expected from the notebook's top import cell so this cell
# also works on a fresh top-to-bottom run.
alternate = np.array(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(set(submutations_20Q2_allset) - set(subnewmutationsset))]['CGA_WES_AC'].str.split(':').tolist())[:,0].astype(int)
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
# Only the 90 smallest distinct counts are shown.
ax=sns.barplot(x=a[:90],y=b[:90], ci=None, ax=ax)
ax.set_yscale('log')
ax.set(xlabel='alternate allele count', ylabel='number of mutations (log scale)',
       title='20Q2 CGA mutations not found in the new release');

In [None]:
alternate = np.array(newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]['CGA_WES_AC'].str.split(':').tolist()).astype(int)[:,0]

In [None]:
alternate.mean()

In [None]:
alternate.min()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### distribution of alternate allele count in the mutation newly found by the new pipeline

In [None]:
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
ax=sns.barplot(x=a[:40],y=b[:40], ci=None, ax=ax)
ax.set_yscale('log')

In [None]:
len(alternate[alternate>2]) / len(alternate) 

In [None]:
len(alternate[alternate>3]) / len(alternate) 

In [None]:
onlyinnew = newmutations[newmutations.grouped.isin(set(newmutationsset) - set(mutations_20Q2_allset))]
highconf = onlyinnew[(onlyinnew['CGA_WES_AC'].str.split(':').str[0].values.astype(int)>2) & (onlyinnew['PERC_CGA_WES_AC'].values.astype(float)>0.1)]

### We can see that 97% seems to come from good quality additional mutations, we should filter the other ones as is done in guillaume's pipelines

In [None]:
len(highconf)/len(onlyinnew)

### 5-10% are really mutations no one know what to do with

In [None]:
whatarethey = onlyinnew[(onlyinnew['CGA_WES_AC'].str.split(':').str[0].values.astype(int)>20) & (onlyinnew['PERC_CGA_WES_AC'].values.astype(float)<0.2)]

In [None]:
len(whatarethey)/len(onlyinnew)

In [None]:
len(newmutations[newmutations['Variant_Classification']=='Silent'])

## Number of mutation on the same gene

In [None]:
# For every sample, find genes hit by more than one non-silent mutation.
#   a      : sample -> (number of duplicated genes) / (number of mutations in the sample)
#   du     : flat list of the duplicated gene names over all samples
#   counts : value arrays of the duplicated-gene rows whose mean `col` exceeds 0.75
maf = newmutations[newmutations['Variant_Classification']!='Silent']
samplesCol = "Tumor_Sample_Barcode"
mutNameCol = "Hugo_Symbol"
col = "PERC_CGA_WES_AC"
maf = maf.sort_values(by=mutNameCol)
samples = set(maf[samplesCol])
a = {}
du = []
counts = []
# groupby visits each sample's rows once instead of re-filtering the whole frame
# per sample; it also never yields an empty group, so len(e) is always > 0.
for i, (val, e) in enumerate(maf.groupby(samplesCol)):
    h.showcount(i, len(samples))
    dups = h.dups(e[mutNameCol])
    a[val] = len(dups)/len(e)
    du.extend(dups)
    for v in dups:
        m = e[e[mutNameCol]==v]
        if m[col].mean() > 0.75:
            counts.append(m[[samplesCol, col, mutNameCol, 'Variant_Classification']].values)

In [None]:
a = pd.DataFrame(data=a,index=['dupmut_to_mut'])

In [None]:
a=a.T

In [None]:
a.max(),a.min(),a.mean(),a.var()

### 5% of damaging mutations occur on the same gene.

In [None]:
a.values[0]

In [None]:
# `a` is (n_samples x 1) after the transpose above, so `a.values[0]` is just the
# first sample's single value; transpose back to plot the ratio of every sample.
sns.kdeplot(data = a.T.values[0])

### filtered by silent

In [None]:
# NOTE(review): on a top-to-bottom run `a` is already the transposed DataFrame
# built in the previous section, not the per-sample dict. Re-wrapping it with
# index=['dupmut_to_mut'] then reindexes to a row label that is presumably absent,
# which would leave only NaN. This section looks like it expects the
# duplicate-counting loop to be re-run first -- confirm.
a = pd.DataFrame(data=a,index=['dupmut_to_mut'])

In [None]:
a=a.T

In [None]:
a.max(),a.min(),a.mean(),a.var()

In [None]:
sns.kdeplot(data = a.T.values[0])

In [None]:
from collections import Counter
dudf = pd.DataFrame(data=Counter(du).values(), index=Counter(du).keys(),columns=['counts'])

In [None]:
dudf = dudf.sort_values(by='counts')

In [None]:
dudf = dudf[dudf.counts>20]

In [None]:
dudf.iloc[-50:]

In [None]:
from pybiomart import Server
server = Server(host='http://www.ensembl.org')
dataset = (server.marts['ENSEMBL_MART_ENSEMBL']
                 .datasets['hsapiens_gene_ensembl'])

In [None]:
table = dataset.query(attributes=["hgnc_symbol", "start_position","end_position"])


In [None]:
table = table[~table['HGNC symbol'].isna()]

In [None]:
table['size'] = table['Gene end (bp)'] - table['Gene start (bp)']

In [None]:
size=[]
for i in dudf.index:
    a = sum(table[table['HGNC symbol']==i]['size'])
    if a==0:
        dudf = dudf.drop(i)
        continue
    size.append(a)
dudf['size']=size

In [None]:
dudf['normalized'] = dudf['counts']/dudf['size']

In [None]:
dudf['normalized'] = dudf['normalized'] /max(dudf['normalized'])

In [None]:
dudf.sort_values(by='normalized').iloc[-50:]

In [None]:
# normalize the list to gene size

This is linked to gene size and association to cancer:
    - TTN: 250k
    - MUC16: 132 kb
    - LRP1B: 1900kb
    - PCLO: 400kb
    - TP53: 25kb
    
But why would cancer need 2 specific damaging mutations in one very important gene?

- still passenger? (would be a helpful info to estimate what is passenger and what is not)
- one of the mutation was not being effective?
- is there a new phenotype from that?

Why do we get a statistically significant number?
-> There may be a very large number of mutations, but only the few that CGA deems "reportable" fall in the best-known genes, so we are heavily skewed towards these mutations.

### what about allele frequency

In [None]:
counts = np.concatenate(counts, axis=0 )

In [None]:
len(set(counts[:,0]))

In [None]:
len(set(counts[:,2]))

In [None]:
len(counts) / (1200000*0.047 )

In [None]:
c = pd.DataFrame(data=Counter(counts[:,2]).values(), index=Counter(counts[:,2]).keys(),columns=['counts']).sort_values(by='counts')/2

In [None]:
c

In [None]:
size=[]
for i in c.index:
    a = sum(table[table['HGNC symbol']==i]['size'])
    if a==0:
        c = c.drop(i)
        continue
    size.append(a)
c['size']=size

In [None]:
c['normalized'] = c['counts']/c['size']
c['normalized'] = c['normalized'] /max(c['normalized'])

In [None]:
c[c.counts>8].sort_values(by='normalized').iloc[-50:]

1/3 of cell lines have on average 2 genes with a double hit homozygous deleterious mutation. 

95% of the double hit damaging mutation could pertain to the known double hit model 


## Comparing Sanger vs new data
- Sanger-processed vs. non-Sanger-processed: mutations in Sanger but not in the new data, and mutations in the new data but not in Sanger.
- We have MAF files from the two pipelines for the same samples.

In [None]:
sangmutations_20Q2_all = mutations_20Q2_all[~mutations_20Q2_all['SangerWES_AC'].isna()]

In [None]:
len(sangmutations_20Q2_all) 

In [None]:
sangmutations_20Q2_allset = set(sangmutations_20Q2_all.grouped)

In [None]:
subnewmutationsset = set(newmutations[newmutations.Tumor_Sample_Barcode.isin(sangmutations_20Q2_all.DepMap_ID)].grouped)

### similarity

In [None]:
len(sangmutations_20Q2_allset - subnewmutationsset)

In [None]:
len(subnewmutationsset - sangmutations_20Q2_allset)

In [None]:
alternate = np.array(sangmutations_20Q2_all['SangerWES_AC'].str.split(':').tolist()).astype(int)[:,0]

In [None]:
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
ax=sns.barplot(x=a[:40],y=b[:40], ci=None, ax=ax)
ax.set_yscale('log')

In [None]:
alternate = np.array(sangmutations_20Q2_all[sangmutations_20Q2_all.grouped.isin(sangmutations_20Q2_allset - subnewmutationsset)]['SangerWES_AC'].str.split(':').tolist()).astype(int)[:,0]
a,b = np.unique(alternate, return_counts=True)
fig, ax = plt.subplots(figsize=(10,10))
ax=sns.barplot(x=a[:40],y=b[:40], ci=None, ax=ax)
ax.set_yscale('log')

In [None]:
mutations_20Q2_all[(mutations_20Q2_all.grouped.isin(sangmutations_20Q2_allset - subnewmutationsset)) & ~(mutations_20Q2_all.HC_AC.isna() & mutations_20Q2_all.WGS_AC.isna() & mutations_20Q2_all.RNAseq_AC.isna() & mutations_20Q2_all.RD_AC.isna())]

In [None]:
## You haven't removed cell lines?? make it a bit more clear!

In [None]:
len(mutations_20Q2_all[(mutations_20Q2_all.grouped.isin(sangmutations_20Q2_allset - subnewmutationsset)) & ~((mutations_20Q2_all.HC_AC.isna() & mutations_20Q2_all.WGS_AC.isna() & mutations_20Q2_all.RNAseq_AC.isna() & mutations_20Q2_all.RD_AC.isna()))]) / len(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(sangmutations_20Q2_allset - subnewmutationsset)])

In [None]:
len(mutations_20Q2_all[(mutations_20Q2_all.grouped.isin(subnewmutationsset- sangmutations_20Q2_allset)) & ~((mutations_20Q2_all.HC_AC.isna() & mutations_20Q2_all.WGS_AC.isna() & mutations_20Q2_all.RNAseq_AC.isna() & mutations_20Q2_all.RD_AC.isna()))]) / len(mutations_20Q2_all[mutations_20Q2_all.grouped.isin(subnewmutationsset- sangmutations_20Q2_allset)])

Hmm, it is interesting that we get more "good quality peaks" with the Sanger pipeline than with the CGA pipeline.

## working on the new all python pipeline

In [None]:
old_dataset = mutations_20Q2_all.drop(columns='CGA_WES_AC')

In [None]:
hc_mutations = mutations_20Q2_all[~mutations_20Q2_all.HC_AC.isna()].drop(columns=['CGA_WES_AC',"SangerRecalibWES_AC","SangerWES_AC","WGS_AC","RNAseq_AC","RD_AC"])
wgs_mutations = mutations_20Q2_all[~mutations_20Q2_all.WGS_AC.isna()].drop(columns=['CGA_WES_AC',"SangerRecalibWES_AC","SangerWES_AC","HC_AC","RNAseq_AC","RD_AC"])
rna_mutations = mutations_20Q2_all[~mutations_20Q2_all.RNAseq_AC.isna()].drop(columns=['CGA_WES_AC',"SangerRecalibWES_AC","SangerWES_AC","HC_AC","WGS_AC","RD_AC"])
rd_mutations = mutations_20Q2_all[~mutations_20Q2_all.RD_AC.isna()].drop(columns=['CGA_WES_AC',"SangerRecalibWES_AC","SangerWES_AC","HC_AC","RNAseq_AC","WGS_AC"])
sanger_mutations = mutations_20Q2_all[~mutations_20Q2_all.SangerWES_AC.isna()].drop(columns=['CGA_WES_AC',"SangerRecalibWES_AC","RD_AC","HC_AC","RNAseq_AC","WGS_AC"])

In [None]:
def filterCoverage(maf, loc=['CGA_WES_AC'], sep=':',cov=4):
    """Keep the rows of `maf` whose summed second count field is at least `cov`.

    Each column named in `loc` holds strings of two counts joined by `sep`
    (e.g. "3:10"). Missing values count as "0:0"; a pair containing the literal
    'NA' keeps its first field and gets 0 as its second. The pairs are summed
    across all `loc` columns before thresholding.

    Args:
        maf: DataFrame holding the columns named in `loc`.
        loc: names of the count columns to add up.
        sep: separator between the two counts.
        cov: minimum value of the summed second field for a row to be kept.

    Returns:
        The subset of `maf` rows passing the threshold.

    NOTE(review): the first field is treated as the alternate count elsewhere in
    this notebook, so the second field is presumably the reference count and
    total depth would be first + second -- confirm the intended definition.
    """
    muts = np.zeros((len(maf), 2))
    for val in loc:
        # Read from the frame that was passed in, not from a notebook global.
        pairs = maf[val].fillna('0' + sep + '0').astype(str).str.split(sep).tolist()
        counts = np.array([[v[0], 0] if 'NA' in v else v for v in pairs])
        # reshape keeps the (n, 2) shape even when `maf` is empty.
        muts += counts.astype(int).reshape(-1, 2)
    return maf[muts[:, 1] >= cov]

def filterAllelicFraction(maf, loc=['CGA_WES_AC'], sep=':',frac=0.3):
    """Keep the rows of `maf` whose first/second count ratio is at least `frac`.

    Each column named in `loc` holds strings of two counts joined by `sep`
    (e.g. "3:10"). Missing values count as "0:0"; a pair containing the literal
    'NA' keeps its first field and gets 0 as its second. The pairs are summed
    across all `loc` columns, then the ratio first / second is thresholded.

    Args:
        maf: DataFrame holding the columns named in `loc`.
        loc: names of the count columns to add up.
        sep: separator between the two counts.
        frac: minimum ratio for a row to be kept.

    Returns:
        The subset of `maf` rows passing the threshold. Rows with 0/0 (NaN) are
        dropped; rows with a non-zero first count and a zero second count (inf)
        are kept.

    NOTE(review): the ratio is first / second, not first / (first + second);
    confirm this is the intended definition of allelic fraction.
    """
    muts = np.zeros((len(maf), 2))
    for val in loc:
        # Read from the frame that was passed in, not from a notebook global.
        pairs = maf[val].fillna('0' + sep + '0').astype(str).str.split(sep).tolist()
        counts = np.array([[v[0], 0] if 'NA' in v else v for v in pairs])
        # reshape keeps the (n, 2) shape even when `maf` is empty.
        muts += counts.astype(int).reshape(-1, 2)
    # Zero denominators are expected (missing data); silence the numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        keep = (muts[:, 0] / muts[:, 1]) >= frac
    return maf[keep]

def mergeAnnotations(newmaf, additionalmaf, additionalonmerge=[]):
    """Left-merge the columns of `additionalmaf` onto `newmaf` by variant position.

    Rows are matched on Chromosome, Start_position, End_position,
    Reference_Allele and Tumor_Seq_Allele1, plus any extra key columns listed in
    `additionalonmerge`. Every row of `newmaf` is kept; rows without a match get
    missing values in the added columns. Non-key columns present in both frames
    come back with pandas' default `_x` / `_y` suffixes.

    Args:
        newmaf: DataFrame of the new mutations (left side).
        additionalmaf: DataFrame providing the extra annotation columns.
        additionalonmerge: extra column names to add to the merge key.

    Returns:
        The merged DataFrame.
    """
    on = ['Chromosome', 'Start_position', 'End_position', 'Reference_Allele', 'Tumor_Seq_Allele1']
    on.extend(additionalonmerge)

    # DataFrame.join(on=...) matches against the *index* of the other frame, so a
    # column-to-column match has to go through merge().
    merged = newmaf.merge(additionalmaf, on=on, how='left')
    # TODO: solve issues with Hugo_Symbol, Entrez_Gene_Id (they end up duplicated
    # as _x/_y columns when both frames carry them).

    return merged
    
def mergeXY():
    # Placeholder, not implemented: the body only evaluates attributes of
    # `dbSNP_RS`, a name that is not defined in the visible notebook, so calling
    # this raises NameError. Presumably meant to reconcile the two copies of a
    # column left after a merge -- TODO confirm and implement.
    dbSNP_RS.x, dbSNP_RS.y


def addAnnotation(maf, NCBI_Build='37', Strand="+"):
    """Stamp build/strand on `maf` and return its per-source allele-count columns.

    Args:
        maf: DataFrame; it is modified in place (the NCBI_Build and Strand
            columns are set on it).
        NCBI_Build: value written to every row of the NCBI_Build column.
        Strand: value written to every row of the Strand column.

    Returns:
        A DataFrame restricted to the columns 'current', 'SangerWES_AC',
        'SangerRecalibWES_AC', 'RNAseq_AC', 'HC_AC', 'RD_AC' and 'WGS_AC'.

    Raises:
        KeyError: if any of those columns is missing from `maf`.

    NOTE(review): the returned subset does not include the NCBI_Build / Strand
    columns that were just set; confirm that this selection is the intended
    output.
    """
    maf['NCBI_Build'] = NCBI_Build
    maf['Strand'] = Strand
    # The original statement was missing its closing bracket and never returned.
    maf = maf[['current', 'SangerWES_AC', 'SangerRecalibWES_AC', 'RNAseq_AC', 'HC_AC', 'RD_AC', 'WGS_AC']]
    return maf

def mafToMat(maf, col, boolify = False, samplesCol = "DepMap_ID", mutNameCol="Hugo_Symbol"):
    """Pivot a MAF-style table into a (mutation name x sample) matrix.

    Args:
        maf: DataFrame with one row per mutation call.
        col: name of the column whose values fill the matrix.
        boolify: if True the matrix is cast to bool, otherwise to float.
        samplesCol: column holding the sample identifier (matrix columns).
        mutNameCol: column holding the mutation/gene name (matrix rows).

    Returns:
        DataFrame indexed by the distinct `mutNameCol` values with one column per
        distinct sample. When a sample has several rows for the same name, the
        first one in `maf` is used; (name, sample) pairs without a row are 0.

    The matrix is built with one pivot instead of one join per sample, so there
    is no per-sample progress output any more. The result is filled with
    DataFrame.fillna (DataFrame has no `nan_to_num` method).
    """
    deduped = maf.drop_duplicates(subset=[samplesCol, mutNameCol])
    mat = deduped.pivot(index=mutNameCol, columns=samplesCol, values=col)
    # Drop the axis names pivot() adds so the frame looks like a plain matrix.
    mat.index.name = None
    mat.columns.name = None
    return mat.fillna(0).astype(bool if boolify else float)

In [None]:
# NOTE(review): this cell is a draft of the intended Python pipeline and cannot
# run as written:
#   - `mutations` is read here before anything in the visible notebook assigns it;
#   - every mafToMat(...) call omits the required `col` argument;
#   - the three to_csv('.csv') calls all write to the same path, so each
#     overwrites the previous one;
#   - `CCLE2othermutations = ` has no right-hand side (SyntaxError).
filtered_mutations = filterCoverage(mutations)
filtered_mutations = filterAllelicFraction(filtered_mutations)

merged_mutations = addAnnotation(mutations)

mafToMat(filtered_mutations[filtered_mutations.damaging]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.other]).to_csv('.csv')
mafToMat(filtered_mutations[filtered_mutations.hotspot]).to_csv('.csv')


CCLE2othermutations = 

mutations = mergeAnnotations(filtered_mutations, CCLE2othermutations)

# Per-source matrices: the list of mutation types and the output file prefix are
# still empty placeholders.
for muttype in ['']:
    mafToMat(CCLE2othermutations[CCLE2othermutations.damaging & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.other & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")
    mafToMat(CCLE2othermutations[CCLE2othermutations.hotspot & CCLE2othermutations[muttype]]).to_csv(''+muttype+".csv")

# Looking at the new unfiltered MAF

In [None]:
unfiltered = res['unfiltered_CGA_MAF_aggregated']
! gsutil cp $unfiltered "temp/mutation_unfiltered_terra_merged.txt"

In [None]:
unfiltered = pd.read_csv('temp/mutation_unfiltered_terra_merged.txt', sep='\t', encoding='L6',na_values=["__UNKNOWN__"])

In [None]:
# After astype(str) real missing values are the string 'nan', and the cells below
# test for exactly that string. Map the other missing-value markers to it as well.
# Passing value=None with a list is not a safe way to do this: depending on the
# pandas version it either forward-fills from the previous row or inserts None,
# and neither result is seen by the `== 'nan'` checks that follow.
unfiltered = unfiltered.astype(str).replace(['.',"__UNKNOWN__"],'nan')

In [None]:
toremove = []
for val in unfiltered.columns:
    if len(unfiltered[unfiltered[val]=='__UNKNOWN__'])>3160000:
        toremove.append(val)

In [None]:
unfiltered=unfiltered.drop(columns=toremove)

In [None]:
# From the 41st column onwards, collect the columns that are either almost
# entirely 'nan' (more than 3,160,000 rows) or hold exactly one distinct value
# besides 'nan'.
toremove = []
for val in unfiltered.columns[40:]:
    column = unfiltered[val]
    if int((column == 'nan').sum()) > 3160000 or len(set(column) - {'nan'}) == 1:
        toremove.append(val)

In [None]:
unfiltered = unfiltered.drop(columns=["UniProt_Site","alt_allele_seen","CCLE_ONCOMAP_overlapping_mutations","failure_reasons","ESP_CA","SVTYPE","id","gnomADg_GT","ESP_GWAS_PUBMED", 'dbSNP_Val_Status', 'qual', 'iHpol', 'QSI_ref', 'BCNoise', 'score', 'Familial_Cancer_Genes_Reference', 'NT']+toremove)

In [None]:
unfiltered['somatic'] = unfiltered['somatic'].replace('nan','False')
unfiltered['HGNC_Status'] = unfiltered['HGNC_Status'].replace('nan','Unapproved')
unfiltered['judgement'] = unfiltered['judgement'].replace('nan','REMOVE')

In [None]:
for val in unfiltered.columns:
    a = unfiltered[val]
    print(val, a[~a.isna()][:10])

In [None]:
toint =  ["Start_position", "End_position"]

In [None]:
for val in toint:
    unfiltered[val]  = unfiltered[val].astype('Int64')

In [None]:
tofloat = [ "n_ref_count", "t_q20_count", "t_ins_count", "t_lod_fstar", "ESP_AvgAAsampleReadDepth", "TQSI_NT", "normal_f", "FDP50", "t_lod_fstar_forward", "TQSI", "tumor_alt_rpir_median", "ESP_CP", "DP2", "ESP_Position", "RC", "observed_in_normals_count", "Transcript_Exon", "contaminant_lod", "normal_power_wsp", "ESP_TotalAAsamplesCovered", "n_alt_count", "map_Q0_reads", "t_alt_max_mapq", "ESP_CG", "n_q20_count", "QSI_NT", "ESP_TotalSamplesCovered", "contaminant_fraction", "tumor_alt_rpir_mad", "tumor_f", "power_to_detect_negative_strand_artifact", "total_reads", "t_ref_max_mapq", "QSI", "ESP_AvgEAsampleReadDepth", "IHP", "init_n_lod", "ORegAnno_bin", "t_alt_sum", "SUBDP50", "t_lod_fstar_reverse", "read_depth", "t_ref_sum", "n_ref_sum", "gc_content", "t_alt_count", "power_to_detect_positive_strand_artifact", "IC", "normal_power", "UniProt_AApos", "ESP_TotalEAsamplesCovered", "COSMIC_total_alterations_in_gene", "normal_power_nsp", "CCLE_ONCOMAP_total_mutations_in_gene", "init_t_lod", "power", "tumor_alt_fpir_median", "ESP_AvgSampleReadDepth", "tumor_power", "tumor_alt_fpir_mad", "t_del_count", "n_alt_sum", "COSMIC_n_overlapping_mutations", "t_ref_count", "DP50"]

In [None]:
# Cast every column listed in `tofloat` to float and report the ones that cannot
# be converted (missing column or non-numeric content).
# The loop used to start at tofloat[5:], which on a fresh run leaves the first
# five listed columns as strings; casting an already-float column is a no-op, so
# iterating over the whole list is safe to re-run.
# NOTE(review): confirm the first five columns were not skipped on purpose.
for val in tofloat:
    try:
        unfiltered[val]  = unfiltered[val].astype(float)
    except (KeyError, ValueError, TypeError) as err:
        # Only conversion/lookup failures are reported; anything else (including
        # a keyboard interrupt) now propagates instead of being swallowed.
        print(val, err)

In [None]:
unfiltered = unfiltered.reset_index()

In [None]:
unfiltered.to_csv('temp/mutation_unfiltered_terra_merged.csv.gz')

In [None]:
unfiltered = pd.read_csv('temp/mutation_unfiltered_terra_merged.csv.gz').drop(columns=['Unnamed: 0','index'])

In [None]:
unfiltered = unfiltered.rename(columns={'Tumor_Sample_Barcode':'DepMap_ID'})

In [None]:
unfiltered.dbSNP_RS.str.contains('|',regex=False).sum() / len(unfiltered)

In [None]:
for val in unfiltered.columns:
    try:
        a = unfiltered[val].str.contains('|',regex=False).sum()
    except:
        continue
    if  a >3:
        print(val)

In [None]:
renaming = removeOlderVersions(names = set(unfiltered.DepMap_ID), refsamples = refwm.get_samples(), arxspan_id = "arxspan_id", version="version")

In [None]:
unfiltered.to_csv('temp/mutation_unfiltered_terra_merged.csv.gz')

## Separating dbSNP status from the rest

In [None]:
[i for i in unfiltered.columns if 'db' in i]

# reprocessing WGS

reprocessing CNV /SV / mutations for WGS

In [None]:
workspace = "broad-firecloud-ccle/DepMap_WGS_CN"
wm = dm.WorkspaceManager(workspace)

In [None]:
sub1 = 'e00f369a-ba2f-4a2f-a8e4-3b6ddc80d3bb'
sub2 = '1ac9bc3f-b22b-417e-9998-5ba07fe776bf'
sub3 = "990e353d-4461-4129-8f23-96bcd8869e75"
sub4 = "b85dec97-262b-454e-b2e5-9452f1048215"

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame()

In [None]:
# Wait on the first submission; reuse `sub1` (same id) instead of repeating the literal.
a = terra.waitForSubmission(workspace, sub1)

In [None]:
failed = ["CDS-1YwLXW","CDS-7UJrWv","CDS-AHsEp4","CDS-AtxiIO","CDS-IFlU2c","CDS-K9VWSo","CDS-L2WS8G","CDS-LJGHlm","CDS-NDaM5V","CDS-WiJdsP","CDS-aN8PNg","CDS-dIijHP","CDS-fzqrxa","CDS-gILaI6","CDS-hKsmPi","CDS-kU1WWq","CDS-knM0TU","CDS-mYOC4j","CDS-njO2PJ","CDS-uQv8yw",
]

In [None]:
wm.update_sample_set("failed",failed)

cleaning and copying back all hg38_WES samples for CNV pipeline

In [None]:
cnworkspace = "broad-firecloud-ccle/DepMap_WES_CN_hg38"
cnwm = dm.WorkspaceManager(cnworkspace)

In [None]:
ind= ccle_refsamples[(ccle_refsamples.datatype=='wes')].index
ccle_refsamples.loc[ind,'legacy_bam_filepath'] = ccle_refsamples.loc[ind,'internal_bam_filepath']

In [None]:
ind= ccle_refsamples[(ccle_refsamples.datatype=='wes')].index
ccle_refsamples.loc[ind,'legacy_bai_filepath'] = ccle_refsamples.loc[ind,'internal_bai_filepath']

In [None]:
ccle_refsamples = ccle_refsamples.set_index('cds_sample_id')

In [None]:
cnsam = cnwm.get_samples()

In [None]:
cnsam = cnsam.drop('nan')

In [None]:
ccle_refsamples.loc[cnsam.index,'internal_bai_filepath'] = cnsam.hg38_analysis_ready_bam_index

In [None]:
ccle_refsamples.loc[cnsam.index,'internal_bam_filepath'] = cnsam.hg38_analysis_ready_bam

In [None]:
ccle_refsamples['md5_hash'] = None

In [None]:
ccle_refsamples['legacy_size'] = None
ccle_refsamples['legacy_crc32c_hash'] = None

In [None]:
ind= ccle_refsamples[(ccle_refsamples.datatype=='wes')].index
ccle_refsamples.loc[ind,'legacy_size'] = ccle_refsamples.loc[ind,'size']
ccle_refsamples.loc[ind,'legacy_crc32c_hash'] = ccle_refsamples.loc[ind,'crc32c_hash']

In [None]:
# NOTE(review): `gcp` is never imported in the visible notebook (only `da`,
# `terra` and `h` are imported from JKBio). Unless one of the star imports
# provides it, this raises NameError on a fresh kernel -- confirm and add the
# import to the top cell.
ccle_refsamples.loc[cnsam.index,'md5_hash'] = gcp.catFiles(cnsam.hg38_analysis_ready_bam_md5, cut=50)

In [None]:
ccle_refsamples.loc[cnsam.index,'size']

In [None]:
ccle_refsamples.loc[cnsam.index,'size'] = [gcp.extractSize(i)[1] for i in gcp.lsFiles(cnsam.hg38_analysis_ready_bam.tolist(), add='-l')]

In [None]:
ccle_refsamples = ccle_refsamples.drop(index=ccle_refsamples[ccle_refsamples.datatype=='hg38_wes'].index)

removing cached data for WGS and copying back hg38 to bucket

In [None]:
ind = ccle_refsamples[ccle_refsamples.datatype=="wgs"].index

In [None]:
ccle_refsamples.loc[ind,'legacy_bam_filepath'] = ccle_refsamples.loc[ind,'internal_bam_filepath']
ccle_refsamples.loc[ind,'legacy_bai_filepath'] = ccle_refsamples.loc[ind,'internal_bai_filepath']
ccle_refsamples.loc[ind,'legacy_size'] = ccle_refsamples.loc[ind,'size']
ccle_refsamples.loc[ind,'legacy_crc32c_hash'] = ccle_refsamples.loc[ind,'crc32c_hash']

In [None]:
wgsworkspace = "broad-firecloud-ccle/DepMap_WGS_CN"
wgswm = dm.WorkspaceManager(wgsworkspace)

In [None]:
sam = wgswm.get_samples()

In [None]:
ccle_refsamples.loc[sam.index,'internal_bai_filepath'] = sam.analysis_ready_bam_index
ccle_refsamples.loc[sam.index,'internal_bam_filepath'] = sam.analysis_ready_bam

In [None]:
ccle_refsamples.loc[sam.index,'md5_hash'] = gcp.catFiles(sam.analysis_ready_bam_md5, cut=32)

In [None]:
ccle_refsamples.loc[sam.index,'size'] = [gcp.extractSize(i)[1] for i in gcp.lsFiles(sam.analysis_ready_bam.tolist(), add='-l')]

In [None]:
sam.analysis_ready_bam[1]

In [None]:
# NOTE(review): `hou` is not defined anywhere visible, so this cell raises
# NameError and halts a top-to-bottom run here -- presumably a deliberate stop
# before the cells below that relocate and delete bucket data. Confirm, and
# consider an explicit `raise` with a message instead.
hou

In [None]:
terra.changeGSlocation(wgsworkspace, "gs://cclebams/wgs_hg38/", entity='sample', onlycol=['analysis_ready_bam','analysis_ready_bam_index',"analysis_ready_bam_md5"],keeppath=False, dry_run=False)

In [None]:
terra.cleanWorkspace(wgsworkspace, only=[sub1, sub2, sub3, sub4])

In [None]:
a = gcp.get_all_sizes('gs://fc-secure-bd7b8bc9-f665-4269-997e-5a402088a369/', suffix='bam')

In [None]:
! gsutil -m rm gs://fc-secure-bd7b8bc9-f665-4269-997e-5a402088a369/*/PreProcessingForVariantDiscovery_GATK4/**.bam

In [None]:
ccle_refsamples.to_csv('temp/newsamples.csv')