# Identification of shared mutations between dome and trigone

In [1]:
import pandas as pd
import os
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns
import matplotlib.pyplot as plt
import sys

sys.path.append('../..')
from consensus_variables import *

In [2]:
# general variables

muttypes = ["allsnvs", "missense", "synonymous", "indels", "nonprotaffecting", "truncating"]

## Load mutations and processing

In [3]:
maf_df = pd.read_csv(somatic_maf_file, sep = "\t")
maf_df

  maf_df = pd.read_csv(somatic_maf_file, sep = "\t")


Unnamed: 0,CHROM,POS,REF,ALT,FILTER,INFO,FORMAT,SAMPLE,DEPTH,ALT_DEPTH,...,FILTER.not_searched_COMPLEX,FILTER.MSI12,FILTER.AM_not_searched_SV,FILTER.not_searched_SV,FILTER.SN1.5,FILTER.q22.5,FILTER.v1,FILTER.low_mappability,FILTER.Bias,FILTER.d3
0,chr1,26729450,T,C,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0024_BTR_01;TYPE=SNV;DP=930;VD=1;AF...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:930:1:929,1:0.0011:929,0:1,0:930:929,1:3:1...",930,1,...,False,False,False,False,False,False,False,False,False,False
1,chr1,26729474,TAG,T,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0014_BDO_01;TYPE=Deletion;DP=1470;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1470:1:1469,1:0.0007:1469,0:1,0:1470:1469,...",1470,1,...,False,False,False,False,False,False,False,False,False,False
2,chr1,26729479,TTC,T,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0023_BDO_01;TYPE=Deletion;DP=1984;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1984:1:1983,1:0.0005:1983,0:1,0:1961:1960,...",1961,1,...,False,False,False,False,False,False,False,False,False,False
3,chr1,26729563,G,C,not_in_exons;pSTD,SAMPLE=P19_0023_BDO_01;TYPE=SNV;DP=3490;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:3490:1:3489,1:0.0003:2657,832:1,0:3490:348...",3490,1,...,False,False,False,False,False,False,False,False,False,False
4,chr1,26729566,TA,T,not_in_exons;p10;pSTD,SAMPLE=P19_0040_BTR_01;TYPE=Deletion;DP=2168;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2168:1:2167,1:0.0005:1845,322:0,1:2114:211...",2114,1,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64273,chrX,124100705,AAT,A,not_in_exons;p10;pSTD,SAMPLE=P19_0025_BDO_01;TYPE=Deletion;DP=2650;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2650:1:2649,1:0.0004:405,2244:0,1:2720:271...",2720,1,...,False,False,False,False,False,False,False,False,False,False
64274,chrX,124100722,C,T,not_in_exons;pSTD,SAMPLE=P19_0003_BDO_01;TYPE=SNV;DP=2046;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2046:1:2045,1:0.0005:135,1910:0,1:2049:204...",2049,1,...,False,False,False,False,False,False,False,False,False,False
64275,chrX,124100743,G,C,not_in_exons;pSTD,SAMPLE=P19_0052_BTR_01;TYPE=SNV;DP=1441;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1441:1:1440,1:0.0007:53,1387:0,1:1441:1440...",1441,1,...,False,False,False,False,False,False,False,False,False,False
64276,chrX,124100757,T,G,not_in_exons;pSTD,SAMPLE=P19_0038_BDO_01;TYPE=SNV;DP=1168;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1168:1:1167,1:0.0009:8,1159:0,1:1168:1167,...",1168,1,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# change sample names for paper style

maf_df["SAMPLE_ID_short"] = maf_df["SAMPLE_ID"].map(old2new_sample_names)
maf_df

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,INFO,FORMAT,SAMPLE,DEPTH,ALT_DEPTH,...,FILTER.MSI12,FILTER.AM_not_searched_SV,FILTER.not_searched_SV,FILTER.SN1.5,FILTER.q22.5,FILTER.v1,FILTER.low_mappability,FILTER.Bias,FILTER.d3,SAMPLE_ID_short
0,chr1,26729450,T,C,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0024_BTR_01;TYPE=SNV;DP=930;VD=1;AF...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:930:1:929,1:0.0011:929,0:1,0:930:929,1:3:1...",930,1,...,False,False,False,False,False,False,False,False,False,24_TR
1,chr1,26729474,TAG,T,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0014_BDO_01;TYPE=Deletion;DP=1470;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1470:1:1469,1:0.0007:1469,0:1,0:1470:1469,...",1470,1,...,False,False,False,False,False,False,False,False,False,14_DO
2,chr1,26729479,TTC,T,low_complex_repetitive;not_in_exons;pSTD,SAMPLE=P19_0023_BDO_01;TYPE=Deletion;DP=1984;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1984:1:1983,1:0.0005:1983,0:1,0:1961:1960,...",1961,1,...,False,False,False,False,False,False,False,False,False,23_DO
3,chr1,26729563,G,C,not_in_exons;pSTD,SAMPLE=P19_0023_BDO_01;TYPE=SNV;DP=3490;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:3490:1:3489,1:0.0003:2657,832:1,0:3490:348...",3490,1,...,False,False,False,False,False,False,False,False,False,23_DO
4,chr1,26729566,TA,T,not_in_exons;p10;pSTD,SAMPLE=P19_0040_BTR_01;TYPE=Deletion;DP=2168;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2168:1:2167,1:0.0005:1845,322:0,1:2114:211...",2114,1,...,False,False,False,False,False,False,False,False,False,40_TR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64273,chrX,124100705,AAT,A,not_in_exons;p10;pSTD,SAMPLE=P19_0025_BDO_01;TYPE=Deletion;DP=2650;V...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2650:1:2649,1:0.0004:405,2244:0,1:2720:271...",2720,1,...,False,False,False,False,False,False,False,False,False,25_DO
64274,chrX,124100722,C,T,not_in_exons;pSTD,SAMPLE=P19_0003_BDO_01;TYPE=SNV;DP=2046;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:2046:1:2045,1:0.0005:135,1910:0,1:2049:204...",2049,1,...,False,False,False,False,False,False,False,False,False,03_DO
64275,chrX,124100743,G,C,not_in_exons;pSTD,SAMPLE=P19_0052_BTR_01;TYPE=SNV;DP=1441;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1441:1:1440,1:0.0007:53,1387:0,1:1441:1440...",1441,1,...,False,False,False,False,False,False,False,False,False,52_TR
64276,chrX,124100757,T,G,not_in_exons;pSTD,SAMPLE=P19_0038_BDO_01;TYPE=SNV;DP=1168;VD=1;A...,GT:DP:VD:AD:AF:RD:ALD:CDP:CAD:NDP:CDPAM:CADAM:...,"0/1:1168:1:1167,1:0.0009:8,1159:0,1:1168:1167,...",1168,1,...,False,False,False,False,False,False,False,False,False,38_DO


In [5]:
print(maf_df["SAMPLE_ID_short"].loc[maf_df["SAMPLE_ID_short"].isna()])
samples = maf_df["SAMPLE_ID_short"].unique()
len(samples)
# 79 samples correctly changed

Series([], Name: SAMPLE_ID_short, dtype: object)


79

## Shared mutations between all possible pairs of samples

In [6]:
# count number of shared muts between every pair of samples and for each mutation type

shared_muts_dfs_dict = {}

for muttype in muttypes:

    print(muttype)

    # filter maf by mutation type category
    if muttype == "allsnvs":
        maf_df_f = maf_df.loc[maf_df["TYPE"] == "SNV"].copy()
    elif muttype == "indels":
        maf_df_f = maf_df.loc[maf_df["TYPE"].isin(["INSERTION", "DELETION"])].copy()
    elif muttype == "nonprotaffecting":
        maf_df_f = maf_df.loc[maf_df["Protein_affecting"] == "non_protein_affecting"].copy()
    elif muttype == "truncating":
        maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") & (maf_df["canonical_Consequence_broader"].isin(["nonsense", "essential_splice"]))].copy()
    else:
        maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") & (maf_df["canonical_Consequence_broader"] == muttype)].copy()
    
    shared_muts_df = pd.DataFrame(index = maf_df["SAMPLE_ID_short"].unique().tolist(),
                                columns = maf_df["SAMPLE_ID_short"].unique().tolist())
    
    # iterate through all possible pairs of samples
    for sample1 in tqdm(samples):
        sample1_muts = set(maf_df_f.loc[maf_df_f["SAMPLE_ID_short"] == sample1]["MUT_ID"].values)
        
        for sample2 in samples:
            sample2_muts = set(maf_df_f.loc[maf_df_f["SAMPLE_ID_short"] == sample2]["MUT_ID"].values)
    
            shared_muts_df.loc[sample1, sample2] = len(sample1_muts.intersection(sample2_muts))

    shared_muts_dfs_dict[muttype] = shared_muts_df.reset_index(names = "sample1").melt(
        id_vars = "sample1", var_name = "sample2", value_name = f"numb_shared_muts_{muttype}")

allsnvs


100%|██████████| 79/79 [00:19<00:00,  4.14it/s]


missense


100%|██████████| 79/79 [00:06<00:00, 11.74it/s]


synonymous


100%|██████████| 79/79 [00:02<00:00, 37.51it/s]


indels


100%|██████████| 79/79 [00:09<00:00,  8.48it/s]


nonprotaffecting


100%|██████████| 79/79 [00:06<00:00, 13.15it/s]


truncating


100%|██████████| 79/79 [00:06<00:00, 12.91it/s]


In [7]:
# merge: 79² comparisons (6241)
shared_muts_df = shared_muts_dfs_dict["allsnvs"].merge(shared_muts_dfs_dict["missense"], on = ["sample1", "sample2"], how = "inner").merge(
    shared_muts_dfs_dict["synonymous"], on = ["sample1", "sample2"], how = "inner"
).merge(
    shared_muts_dfs_dict["indels"], on = ["sample1", "sample2"], how = "inner"
).merge(
    shared_muts_dfs_dict["nonprotaffecting"], on = ["sample1", "sample2"], how = "inner"
).merge(
    shared_muts_dfs_dict["truncating"], on = ["sample1", "sample2"], how = "inner"
)
shared_muts_df

Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating
0,24_TR,24_TR,320,116,25,92,170,42
1,14_DO,24_TR,8,2,0,6,4,3
2,23_DO,24_TR,16,1,1,6,8,8
3,40_TR,24_TR,10,1,1,5,8,2
4,02_TR,24_TR,8,2,1,2,7,1
...,...,...,...,...,...,...,...,...
6236,13_TR,36_TR,3,0,2,3,4,0
6237,06_DO,36_TR,1,0,1,2,2,0
6238,41_TR,36_TR,2,0,1,5,2,1
6239,27_TR,36_TR,2,1,1,3,1,0


In [8]:
# count total number of mutations per sample and per muttype
for muttype in muttypes:

    print(muttype)

    # filter maf by mutation type category
    if muttype == "allsnvs":
        maf_df_f = maf_df.loc[maf_df["TYPE"] == "SNV"].copy()
    elif muttype == "indels":
        maf_df_f = maf_df.loc[maf_df["TYPE"].isin(["INSERTION", "DELETION"])].copy()
    elif muttype == "nonprotaffecting":
        maf_df_f = maf_df.loc[maf_df["Protein_affecting"] == "non_protein_affecting"].copy()
    elif muttype == "truncating":
        maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") & (maf_df["canonical_Consequence_broader"].isin(["nonsense", "essential_splice"]))].copy()
    else:
        maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") & (maf_df["canonical_Consequence_broader"] == muttype)].copy()

    # count number of mutations and merge with shared mutations table (1st sample1, then sample2)
    nmuts_df = maf_df_f.groupby("SAMPLE_ID_short").size().to_frame(f"sample1_numb_muts_{muttype}").reset_index(names = "sample1")
    shared_muts_df = shared_muts_df.merge(nmuts_df, on = "sample1", how = "inner")
    nmuts_df = nmuts_df.rename({f"sample1_numb_muts_{muttype}": f"sample2_numb_muts_{muttype}", "sample1": "sample2"}, axis = 1)
    shared_muts_df = shared_muts_df.merge(nmuts_df, on = "sample2", how = "inner")

shared_muts_df

allsnvs
missense
synonymous
indels
nonprotaffecting
truncating


Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,sample2_numb_muts_allsnvs,sample1_numb_muts_missense,sample2_numb_muts_missense,sample1_numb_muts_synonymous,sample2_numb_muts_synonymous,sample1_numb_muts_indels,sample2_numb_muts_indels,sample1_numb_muts_nonprotaffecting,sample2_numb_muts_nonprotaffecting,sample1_numb_muts_truncating,sample2_numb_muts_truncating
0,24_TR,24_TR,320,116,25,92,170,42,320,320,116,116,25,25,92,92,170,170,42,42
1,14_DO,24_TR,8,2,0,6,4,3,608,320,218,116,50,25,297,92,234,170,182,42
2,23_DO,24_TR,16,1,1,6,8,8,1198,320,423,116,70,25,537,92,414,170,358,42
3,40_TR,24_TR,10,1,1,5,8,2,159,320,52,116,17,25,128,92,97,170,35,42
4,02_TR,24_TR,8,2,1,2,7,1,988,320,426,116,101,25,260,92,382,170,170,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6236,13_TR,36_TR,3,0,2,3,4,0,128,31,58,6,17,8,35,23,82,27,3,1
6237,06_DO,36_TR,1,0,1,2,2,0,310,31,116,6,19,8,167,23,77,27,119,1
6238,41_TR,36_TR,2,0,1,5,2,1,289,31,113,6,24,8,191,23,74,27,105,1
6239,27_TR,36_TR,2,1,1,3,1,0,123,31,42,6,27,8,54,23,93,27,5,1


In [9]:
# calculate jaccard index: shared / (sample1+sample2-shared)
for muttype in tqdm(muttypes):
    shared_muts_df[f"jaccard_index_{muttype}"] = shared_muts_df[f"numb_shared_muts_{muttype}"] / ( shared_muts_df[f"sample1_numb_muts_{muttype}"] + shared_muts_df[f"sample2_numb_muts_{muttype}"] - shared_muts_df[f"numb_shared_muts_{muttype}"] )

shared_muts_df

100%|██████████| 6/6 [00:00<00:00, 1491.84it/s]


Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,sample2_numb_muts_allsnvs,...,sample1_numb_muts_nonprotaffecting,sample2_numb_muts_nonprotaffecting,sample1_numb_muts_truncating,sample2_numb_muts_truncating,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting,jaccard_index_truncating
0,24_TR,24_TR,320,116,25,92,170,42,320,320,...,170,170,42,42,1.0,1.0,1.0,1.0,1.0,1.0
1,14_DO,24_TR,8,2,0,6,4,3,608,320,...,234,170,182,42,0.008696,0.006024,0.0,0.015666,0.01,0.013575
2,23_DO,24_TR,16,1,1,6,8,8,1198,320,...,414,170,358,42,0.010652,0.001859,0.010638,0.009631,0.013889,0.020408
3,40_TR,24_TR,10,1,1,5,8,2,159,320,...,97,170,35,42,0.021322,0.005988,0.02439,0.023256,0.030888,0.026667
4,02_TR,24_TR,8,2,1,2,7,1,988,320,...,382,170,170,42,0.006154,0.003704,0.008,0.005714,0.012844,0.004739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6236,13_TR,36_TR,3,0,2,3,4,0,128,31,...,82,27,3,1,0.019231,0.0,0.086957,0.054545,0.038095,0.0
6237,06_DO,36_TR,1,0,1,2,2,0,310,31,...,77,27,119,1,0.002941,0.0,0.038462,0.010638,0.019608,0.0
6238,41_TR,36_TR,2,0,1,5,2,1,289,31,...,74,27,105,1,0.006289,0.0,0.032258,0.023923,0.020202,0.009524
6239,27_TR,36_TR,2,1,1,3,1,0,123,31,...,93,27,5,1,0.013158,0.021277,0.029412,0.040541,0.008403,0.0


In [11]:
# add donor ID and bladder location
shared_muts_df["sample1_donorID"] = shared_muts_df.apply(lambda row: row["sample1"].split("_")[0], axis = 1)
shared_muts_df["sample2_donorID"] = shared_muts_df.apply(lambda row: row["sample2"].split("_")[0], axis = 1)
shared_muts_df["sample1_bladder_location"] = shared_muts_df.apply(lambda row: row["sample1"].split("_")[1], axis = 1)
shared_muts_df["sample2_bladder_location"] = shared_muts_df.apply(lambda row: row["sample2"].split("_")[1], axis = 1)
shared_muts_df = shared_muts_df.replace("TR", "trigone")
shared_muts_df = shared_muts_df.replace("DO", "dome")
shared_muts_df

Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,sample2_numb_muts_allsnvs,...,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting,jaccard_index_truncating,sample1_donorID,sample2_donorID,sample1_bladder_location,sample2_bladder_location
0,24_TR,24_TR,320,116,25,92,170,42,320,320,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,24,24,trigone,trigone
1,14_DO,24_TR,8,2,0,6,4,3,608,320,...,0.008696,0.006024,0.000000,0.015666,0.010000,0.013575,14,24,dome,trigone
2,23_DO,24_TR,16,1,1,6,8,8,1198,320,...,0.010652,0.001859,0.010638,0.009631,0.013889,0.020408,23,24,dome,trigone
3,40_TR,24_TR,10,1,1,5,8,2,159,320,...,0.021322,0.005988,0.024390,0.023256,0.030888,0.026667,40,24,trigone,trigone
4,02_TR,24_TR,8,2,1,2,7,1,988,320,...,0.006154,0.003704,0.008000,0.005714,0.012844,0.004739,02,24,trigone,trigone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6236,13_TR,36_TR,3,0,2,3,4,0,128,31,...,0.019231,0.000000,0.086957,0.054545,0.038095,0.000000,13,36,trigone,trigone
6237,06_DO,36_TR,1,0,1,2,2,0,310,31,...,0.002941,0.000000,0.038462,0.010638,0.019608,0.000000,06,36,dome,trigone
6238,41_TR,36_TR,2,0,1,5,2,1,289,31,...,0.006289,0.000000,0.032258,0.023923,0.020202,0.009524,41,36,trigone,trigone
6239,27_TR,36_TR,2,1,1,3,1,0,123,31,...,0.013158,0.021277,0.029412,0.040541,0.008403,0.000000,27,36,trigone,trigone


In [12]:
# reorder columns
shared_muts_df = shared_muts_df[[
    'sample1', 'sample1_donorID', 'sample1_bladder_location',
    'sample2', 'sample2_donorID', 'sample2_bladder_location',
    'sample1_numb_muts_allsnvs', 'sample2_numb_muts_allsnvs',
    'sample1_numb_muts_missense', 'sample2_numb_muts_missense',
    'sample1_numb_muts_truncating', 'sample2_numb_muts_truncating',
    'sample1_numb_muts_synonymous', 'sample2_numb_muts_synonymous',
    'sample1_numb_muts_indels', 'sample2_numb_muts_indels',
    'sample1_numb_muts_nonprotaffecting', 'sample2_numb_muts_nonprotaffecting',
    'numb_shared_muts_allsnvs', 'numb_shared_muts_missense',
    'numb_shared_muts_truncating', 'numb_shared_muts_synonymous', 
    'numb_shared_muts_indels', 'numb_shared_muts_nonprotaffecting',
    'jaccard_index_allsnvs', 'jaccard_index_missense',
    'jaccard_index_truncating', 'jaccard_index_synonymous',
    'jaccard_index_indels', 'jaccard_index_nonprotaffecting']]
shared_muts_df

Unnamed: 0,sample1,sample1_donorID,sample1_bladder_location,sample2,sample2_donorID,sample2_bladder_location,sample1_numb_muts_allsnvs,sample2_numb_muts_allsnvs,sample1_numb_muts_missense,sample2_numb_muts_missense,...,numb_shared_muts_truncating,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_truncating,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting
0,24_TR,24,trigone,24_TR,24,trigone,320,320,116,116,...,42,25,92,170,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,14_DO,14,dome,24_TR,24,trigone,608,320,218,116,...,3,0,6,4,0.008696,0.006024,0.013575,0.000000,0.015666,0.010000
2,23_DO,23,dome,24_TR,24,trigone,1198,320,423,116,...,8,1,6,8,0.010652,0.001859,0.020408,0.010638,0.009631,0.013889
3,40_TR,40,trigone,24_TR,24,trigone,159,320,52,116,...,2,1,5,8,0.021322,0.005988,0.026667,0.024390,0.023256,0.030888
4,02_TR,02,trigone,24_TR,24,trigone,988,320,426,116,...,1,1,2,7,0.006154,0.003704,0.004739,0.008000,0.005714,0.012844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6236,13_TR,13,trigone,36_TR,36,trigone,128,31,58,6,...,0,2,3,4,0.019231,0.000000,0.000000,0.086957,0.054545,0.038095
6237,06_DO,06,dome,36_TR,36,trigone,310,31,116,6,...,0,1,2,2,0.002941,0.000000,0.000000,0.038462,0.010638,0.019608
6238,41_TR,41,trigone,36_TR,36,trigone,289,31,113,6,...,1,1,5,2,0.006289,0.000000,0.009524,0.032258,0.023923,0.020202
6239,27_TR,27,trigone,36_TR,36,trigone,123,31,42,6,...,0,1,3,1,0.013158,0.021277,0.000000,0.029412,0.040541,0.008403


In [13]:
shared_muts_allgenes_df = shared_muts_df.copy()

## Shared gene specific mutations between all possible pairs of samples

In [29]:
# count number of shared muts between every pair of samples, for each mutation type and for each gene separately

shared_muts_dfs_dict = {}

for muttype in muttypes:

    print(muttype)

    shared_muts_dfs_gene_dict = {}
    
    for gene in panel_all_genes:

        # filter maf by mutation type category and gene
        if muttype == "allsnvs":
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "indels":
            maf_df_f = maf_df.loc[(maf_df["TYPE"].isin(["INSERTION", "DELETION"])) 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "nonprotaffecting":
            maf_df_f = maf_df.loc[(maf_df["Protein_affecting"] == "non_protein_affecting") 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "truncating":
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV")
                                & (maf_df["canonical_Consequence_broader"].isin(["nonsense", "essential_splice"]))
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        else:
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") 
                                & (maf_df["canonical_Consequence_broader"] == muttype)
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        shared_muts_df = pd.DataFrame(index = maf_df["SAMPLE_ID_short"].unique().tolist(),
                                    columns = maf_df["SAMPLE_ID_short"].unique().tolist())
        
        # iterate through all possible pairs of samples for the specific gene
        for sample1 in tqdm(samples):
            sample1_muts = set(maf_df_f.loc[maf_df_f["SAMPLE_ID_short"] == sample1]["MUT_ID"].values)
            
            for sample2 in samples:
                sample2_muts = set(maf_df_f.loc[maf_df_f["SAMPLE_ID_short"] == sample2]["MUT_ID"].values)
        
                shared_muts_df.loc[sample1, sample2] = len(sample1_muts.intersection(sample2_muts))

        shared_muts_dfs_gene_dict[gene] = shared_muts_df.reset_index(names = "sample1").melt(
            id_vars = "sample1", var_name = "sample2", value_name = f"numb_shared_muts_{muttype}")
        
        # add gene information to the table
        shared_muts_dfs_gene_dict[gene]["gene"] = gene

    shared_muts_dfs_dict[muttype] = pd.concat(shared_muts_dfs_gene_dict.values())

allsnvs


  0%|          | 0/79 [00:00<?, ?it/s]

100%|██████████| 79/79 [00:05<00:00, 13.84it/s]
100%|██████████| 79/79 [00:02<00:00, 33.97it/s]
100%|██████████| 79/79 [00:02<00:00, 31.76it/s]
100%|██████████| 79/79 [00:02<00:00, 35.57it/s]
100%|██████████| 79/79 [00:01<00:00, 40.54it/s]
100%|██████████| 79/79 [00:01<00:00, 52.95it/s]
100%|██████████| 79/79 [00:02<00:00, 39.14it/s]
100%|██████████| 79/79 [00:01<00:00, 58.54it/s]
100%|██████████| 79/79 [00:02<00:00, 32.99it/s]
100%|██████████| 79/79 [00:02<00:00, 36.17it/s]
100%|██████████| 79/79 [00:01<00:00, 56.89it/s]
100%|██████████| 79/79 [00:01<00:00, 69.16it/s]
100%|██████████| 79/79 [00:01<00:00, 63.16it/s]
100%|██████████| 79/79 [00:01<00:00, 59.14it/s]
100%|██████████| 79/79 [00:01<00:00, 76.89it/s]
100%|██████████| 79/79 [00:01<00:00, 77.48it/s]


missense


100%|██████████| 79/79 [00:02<00:00, 32.35it/s]
100%|██████████| 79/79 [00:01<00:00, 46.64it/s]
100%|██████████| 79/79 [00:01<00:00, 53.93it/s]
100%|██████████| 79/79 [00:01<00:00, 50.15it/s]
100%|██████████| 79/79 [00:01<00:00, 51.23it/s]
100%|██████████| 79/79 [00:01<00:00, 60.90it/s]
100%|██████████| 79/79 [00:01<00:00, 63.19it/s]
100%|██████████| 79/79 [00:01<00:00, 73.11it/s]
100%|██████████| 79/79 [00:01<00:00, 58.21it/s]
100%|██████████| 79/79 [00:01<00:00, 44.95it/s]
100%|██████████| 79/79 [00:01<00:00, 53.49it/s]
100%|██████████| 79/79 [00:01<00:00, 70.60it/s]
100%|██████████| 79/79 [00:01<00:00, 72.83it/s]
100%|██████████| 79/79 [00:01<00:00, 66.33it/s]
100%|██████████| 79/79 [00:01<00:00, 73.03it/s]
100%|██████████| 79/79 [00:01<00:00, 67.99it/s]


synonymous


100%|██████████| 79/79 [00:01<00:00, 44.58it/s]
100%|██████████| 79/79 [00:01<00:00, 58.16it/s]
100%|██████████| 79/79 [00:01<00:00, 70.72it/s]
100%|██████████| 79/79 [00:01<00:00, 72.61it/s]
100%|██████████| 79/79 [00:01<00:00, 73.44it/s]
100%|██████████| 79/79 [00:01<00:00, 76.25it/s]
100%|██████████| 79/79 [00:01<00:00, 65.79it/s]
100%|██████████| 79/79 [00:01<00:00, 78.66it/s]
100%|██████████| 79/79 [00:01<00:00, 75.57it/s]
100%|██████████| 79/79 [00:01<00:00, 76.91it/s]
100%|██████████| 79/79 [00:00<00:00, 80.12it/s]
100%|██████████| 79/79 [00:01<00:00, 77.96it/s]
100%|██████████| 79/79 [00:01<00:00, 78.90it/s]
100%|██████████| 79/79 [00:01<00:00, 76.36it/s]
100%|██████████| 79/79 [00:01<00:00, 74.18it/s]
100%|██████████| 79/79 [00:01<00:00, 72.40it/s]


indels


100%|██████████| 79/79 [00:03<00:00, 22.52it/s]
100%|██████████| 79/79 [00:02<00:00, 27.42it/s]
100%|██████████| 79/79 [00:02<00:00, 38.05it/s]
100%|██████████| 79/79 [00:01<00:00, 59.40it/s]
100%|██████████| 79/79 [00:01<00:00, 60.73it/s]
100%|██████████| 79/79 [00:01<00:00, 66.83it/s]
100%|██████████| 79/79 [00:01<00:00, 56.45it/s]
100%|██████████| 79/79 [00:01<00:00, 64.87it/s]
100%|██████████| 79/79 [00:01<00:00, 47.27it/s]
100%|██████████| 79/79 [00:02<00:00, 38.38it/s]
100%|██████████| 79/79 [00:01<00:00, 75.09it/s]
100%|██████████| 79/79 [00:01<00:00, 74.85it/s]
100%|██████████| 79/79 [00:01<00:00, 54.75it/s]
100%|██████████| 79/79 [00:01<00:00, 56.34it/s]
100%|██████████| 79/79 [00:00<00:00, 80.82it/s]
100%|██████████| 79/79 [00:01<00:00, 76.06it/s]


nonprotaffecting


100%|██████████| 79/79 [00:01<00:00, 40.00it/s]
100%|██████████| 79/79 [00:01<00:00, 54.50it/s]
100%|██████████| 79/79 [00:01<00:00, 55.97it/s]
100%|██████████| 79/79 [00:01<00:00, 47.28it/s]
100%|██████████| 79/79 [00:01<00:00, 57.79it/s]
100%|██████████| 79/79 [00:01<00:00, 68.06it/s]
100%|██████████| 79/79 [00:02<00:00, 39.37it/s]
100%|██████████| 79/79 [00:01<00:00, 59.93it/s]
100%|██████████| 79/79 [00:01<00:00, 57.12it/s]
100%|██████████| 79/79 [00:01<00:00, 61.43it/s]
100%|██████████| 79/79 [00:01<00:00, 67.56it/s]
100%|██████████| 79/79 [00:01<00:00, 43.78it/s]
100%|██████████| 79/79 [00:02<00:00, 33.99it/s]
100%|██████████| 79/79 [00:01<00:00, 69.06it/s]
100%|██████████| 79/79 [00:01<00:00, 78.74it/s]
100%|██████████| 79/79 [00:01<00:00, 52.73it/s]


truncating


100%|██████████| 79/79 [00:05<00:00, 13.17it/s]
100%|██████████| 79/79 [00:01<00:00, 61.47it/s]
100%|██████████| 79/79 [00:01<00:00, 46.86it/s]
100%|██████████| 79/79 [00:01<00:00, 69.26it/s]
100%|██████████| 79/79 [00:01<00:00, 68.19it/s]
100%|██████████| 79/79 [00:01<00:00, 71.91it/s]
100%|██████████| 79/79 [00:01<00:00, 57.69it/s]
100%|██████████| 79/79 [00:00<00:00, 79.06it/s]
100%|██████████| 79/79 [00:01<00:00, 46.93it/s]
100%|██████████| 79/79 [00:01<00:00, 49.30it/s]
100%|██████████| 79/79 [00:01<00:00, 77.50it/s]
100%|██████████| 79/79 [00:01<00:00, 75.75it/s]
100%|██████████| 79/79 [00:01<00:00, 67.18it/s]
100%|██████████| 79/79 [00:01<00:00, 72.31it/s]
100%|██████████| 79/79 [00:01<00:00, 65.38it/s]
100%|██████████| 79/79 [00:01<00:00, 75.25it/s]


In [30]:
# merge: 79²*16 comparisons (99856)
shared_muts_gene_df = shared_muts_dfs_dict["allsnvs"].merge(shared_muts_dfs_dict["missense"], on = ["sample1", "sample2", "gene"], how = "inner").merge(
    shared_muts_dfs_dict["synonymous"], on = ["sample1", "sample2", "gene"], how = "inner"
).merge(
    shared_muts_dfs_dict["indels"], on = ["sample1", "sample2", "gene"], how = "inner"
).merge(
    shared_muts_dfs_dict["nonprotaffecting"], on = ["sample1", "sample2", "gene"], how = "inner"
).merge(
    shared_muts_dfs_dict["truncating"], on = ["sample1", "sample2", "gene"], how = "inner"
)
shared_muts_gene_df

Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,gene,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating
0,24_TR,24_TR,80,KMT2D,38,9,14,28,15
1,14_DO,24_TR,1,KMT2D,1,0,0,0,0
2,23_DO,24_TR,3,KMT2D,1,0,0,0,2
3,40_TR,24_TR,1,KMT2D,0,0,1,0,1
4,02_TR,24_TR,1,KMT2D,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
99851,13_TR,36_TR,0,TERTpromoter,0,0,0,0,0
99852,06_DO,36_TR,0,TERTpromoter,0,0,0,0,0
99853,41_TR,36_TR,0,TERTpromoter,0,0,0,0,0
99854,27_TR,36_TR,0,TERTpromoter,0,0,0,0,0


In [31]:
# count total number of mutations per sample, per muttype and per gene
for muttype in muttypes:

    print(muttype)

    nmuts_dfs_gene_dict = {}

    # filter maf by mutation type category and gene
    for gene in panel_all_genes:
        
        if muttype == "allsnvs":
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "indels":
            maf_df_f = maf_df.loc[(maf_df["TYPE"].isin(["INSERTION", "DELETION"])) 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "nonprotaffecting":
            maf_df_f = maf_df.loc[(maf_df["Protein_affecting"] == "non_protein_affecting") 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        elif muttype == "truncating":
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") 
                                & (maf_df["canonical_Consequence_broader"].isin(["nonsense", "essential_splice"]))
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()
        else:
            maf_df_f = maf_df.loc[(maf_df["TYPE"] == "SNV") 
                                & (maf_df["canonical_Consequence_broader"] == muttype) 
                                & (maf_df["canonical_SYMBOL"] == gene)].copy()

        # count number of mutations
        nmuts_dfs_gene_dict[gene] = maf_df_f.groupby(["SAMPLE_ID_short", "canonical_SYMBOL"]).size().to_frame(f"sample1_numb_muts_{muttype}").reset_index(names = ["sample1", "gene"])
    
    nmuts_df = pd.concat(nmuts_dfs_gene_dict.values())
    shared_muts_gene_df = shared_muts_gene_df.merge(nmuts_df, on = ["sample1", "gene"], how = "outer")
    nmuts_df = nmuts_df.rename({f"sample1_numb_muts_{muttype}": f"sample2_numb_muts_{muttype}", "sample1": "sample2"}, axis = 1)
    shared_muts_gene_df = shared_muts_gene_df.merge(nmuts_df, on = ["sample2", "gene"], how = "outer")

# fill with zeros those samples in which the number of mutations is zero
shared_muts_gene_df = shared_muts_gene_df.fillna(0)
shared_muts_gene_df

allsnvs
missense
synonymous
indels
nonprotaffecting
truncating


Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,gene,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,...,sample1_numb_muts_missense,sample2_numb_muts_missense,sample1_numb_muts_synonymous,sample2_numb_muts_synonymous,sample1_numb_muts_indels,sample2_numb_muts_indels,sample1_numb_muts_nonprotaffecting,sample2_numb_muts_nonprotaffecting,sample1_numb_muts_truncating,sample2_numb_muts_truncating
0,24_TR,24_TR,80,KMT2D,38,9,14,28,15,80.0,...,38.0,38.0,9.0,9.0,14.0,14.0,28.0,28.0,15.0,15.0
1,14_DO,24_TR,1,KMT2D,1,0,0,0,0,123.0,...,49.0,38.0,10.0,9.0,44.0,14.0,43.0,28.0,37.0,15.0
2,23_DO,24_TR,3,KMT2D,1,0,0,0,2,347.0,...,117.0,38.0,21.0,9.0,175.0,14.0,80.0,28.0,138.0,15.0
3,40_TR,24_TR,1,KMT2D,0,0,1,0,1,28.0,...,12.0,38.0,4.0,9.0,14.0,14.0,11.0,28.0,5.0,15.0
4,02_TR,24_TR,1,KMT2D,0,0,0,1,0,225.0,...,113.0,38.0,20.0,9.0,50.0,14.0,58.0,28.0,49.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99851,13_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99852,06_DO,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99853,41_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99854,27_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# calculate jaccard index: shared / (sample1+sample2-shared)
for muttype in tqdm(muttypes):
    shared_muts_gene_df[f"jaccard_index_{muttype}"] = shared_muts_gene_df[f"numb_shared_muts_{muttype}"] / ( shared_muts_gene_df[f"sample1_numb_muts_{muttype}"] + shared_muts_gene_df[f"sample2_numb_muts_{muttype}"] - shared_muts_gene_df[f"numb_shared_muts_{muttype}"] )

shared_muts_gene_df

100%|██████████| 6/6 [00:00<00:00, 1416.28it/s]


Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,gene,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,...,sample1_numb_muts_nonprotaffecting,sample2_numb_muts_nonprotaffecting,sample1_numb_muts_truncating,sample2_numb_muts_truncating,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting,jaccard_index_truncating
0,24_TR,24_TR,80,KMT2D,38,9,14,28,15,80.0,...,28.0,28.0,15.0,15.0,1.000000,1.000000,1.0,1.000000,1.000000,1.000000
1,14_DO,24_TR,1,KMT2D,1,0,0,0,0,123.0,...,43.0,28.0,37.0,15.0,0.004950,0.011628,0.0,0.000000,0.000000,0.000000
2,23_DO,24_TR,3,KMT2D,1,0,0,0,2,347.0,...,80.0,28.0,138.0,15.0,0.007075,0.006494,0.0,0.000000,0.000000,0.013245
3,40_TR,24_TR,1,KMT2D,0,0,1,0,1,28.0,...,11.0,28.0,5.0,15.0,0.009346,0.000000,0.0,0.037037,0.000000,0.052632
4,02_TR,24_TR,1,KMT2D,0,0,0,1,0,225.0,...,58.0,28.0,49.0,15.0,0.003289,0.000000,0.0,0.000000,0.011765,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99851,13_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,,,,,,
99852,06_DO,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,,,,,,
99853,41_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,,,,,,
99854,27_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,,,,,,


In [33]:
# when the number of total mutations is zero for the two samples the jaccard index is NA
for muttype in muttypes:
    print(shared_muts_gene_df.loc[(shared_muts_gene_df[f"sample1_numb_muts_{muttype}"] == 0)
                                & (shared_muts_gene_df[f"sample2_numb_muts_{muttype}"] == 0)]
                                [f"jaccard_index_{muttype}"].unique()
                                )

[nan]
[nan]
[nan]
[nan]
[nan]
[nan]


In [34]:
# add donor ID and bladder location
shared_muts_gene_df["sample1_donorID"] = shared_muts_gene_df.apply(lambda row: row["sample1"].split("_")[0], axis = 1)
shared_muts_gene_df["sample2_donorID"] = shared_muts_gene_df.apply(lambda row: row["sample2"].split("_")[0], axis = 1)
shared_muts_gene_df["sample1_bladder_location"] = shared_muts_gene_df.apply(lambda row: row["sample1"].split("_")[1], axis = 1)
shared_muts_gene_df["sample2_bladder_location"] = shared_muts_gene_df.apply(lambda row: row["sample2"].split("_")[1], axis = 1)
shared_muts_gene_df = shared_muts_gene_df.replace("TR", "trigone")
shared_muts_gene_df = shared_muts_gene_df.replace("DO", "dome")
shared_muts_gene_df

Unnamed: 0,sample1,sample2,numb_shared_muts_allsnvs,gene,numb_shared_muts_missense,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,numb_shared_muts_truncating,sample1_numb_muts_allsnvs,...,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting,jaccard_index_truncating,sample1_donorID,sample2_donorID,sample1_bladder_location,sample2_bladder_location
0,24_TR,24_TR,80,KMT2D,38,9,14,28,15,80.0,...,1.000000,1.000000,1.0,1.000000,1.000000,1.000000,24,24,trigone,trigone
1,14_DO,24_TR,1,KMT2D,1,0,0,0,0,123.0,...,0.004950,0.011628,0.0,0.000000,0.000000,0.000000,14,24,dome,trigone
2,23_DO,24_TR,3,KMT2D,1,0,0,0,2,347.0,...,0.007075,0.006494,0.0,0.000000,0.000000,0.013245,23,24,dome,trigone
3,40_TR,24_TR,1,KMT2D,0,0,1,0,1,28.0,...,0.009346,0.000000,0.0,0.037037,0.000000,0.052632,40,24,trigone,trigone
4,02_TR,24_TR,1,KMT2D,0,0,0,1,0,225.0,...,0.003289,0.000000,0.0,0.000000,0.011765,0.000000,02,24,trigone,trigone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99851,13_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,,,,,,,13,36,trigone,trigone
99852,06_DO,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,,,,,,,06,36,dome,trigone
99853,41_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,,,,,,,41,36,trigone,trigone
99854,27_TR,36_TR,0,TERTpromoter,0,0,0,0,0,0.0,...,,,,,,,27,36,trigone,trigone


In [35]:
# concat with all_genes df
shared_muts_allgenes_df["gene"] = "ALL_GENES"
shared_muts_df = pd.concat([shared_muts_allgenes_df, shared_muts_gene_df])
shared_muts_df

Unnamed: 0,sample1,sample1_donorID,sample1_bladder_location,sample2,sample2_donorID,sample2_bladder_location,sample1_numb_muts_allsnvs,sample2_numb_muts_allsnvs,sample1_numb_muts_missense,sample2_numb_muts_missense,...,numb_shared_muts_synonymous,numb_shared_muts_indels,numb_shared_muts_nonprotaffecting,jaccard_index_allsnvs,jaccard_index_missense,jaccard_index_truncating,jaccard_index_synonymous,jaccard_index_indels,jaccard_index_nonprotaffecting,gene
0,24_TR,24,trigone,24_TR,24,trigone,320.0,320.0,116.0,116.0,...,25,92,170,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,ALL_GENES
1,14_DO,14,dome,24_TR,24,trigone,608.0,320.0,218.0,116.0,...,0,6,4,0.008696,0.006024,0.013575,0.000000,0.015666,0.010000,ALL_GENES
2,23_DO,23,dome,24_TR,24,trigone,1198.0,320.0,423.0,116.0,...,1,6,8,0.010652,0.001859,0.020408,0.010638,0.009631,0.013889,ALL_GENES
3,40_TR,40,trigone,24_TR,24,trigone,159.0,320.0,52.0,116.0,...,1,5,8,0.021322,0.005988,0.026667,0.024390,0.023256,0.030888,ALL_GENES
4,02_TR,02,trigone,24_TR,24,trigone,988.0,320.0,426.0,116.0,...,1,2,7,0.006154,0.003704,0.004739,0.008000,0.005714,0.012844,ALL_GENES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99851,13_TR,13,trigone,36_TR,36,trigone,0.0,0.0,0.0,0.0,...,0,0,0,,,,,,,TERTpromoter
99852,06_DO,06,dome,36_TR,36,trigone,0.0,0.0,0.0,0.0,...,0,0,0,,,,,,,TERTpromoter
99853,41_TR,41,trigone,36_TR,36,trigone,0.0,0.0,0.0,0.0,...,0,0,0,,,,,,,TERTpromoter
99854,27_TR,27,trigone,36_TR,36,trigone,0.0,0.0,0.0,0.0,...,0,0,0,,,,,,,TERTpromoter


In [36]:
# save for figure
shared_muts_df.to_csv("../../data/sharedmutations/bladder_shared_mutations_allpairs_separatedByGenes.tsv",
                     sep = "\t", index = False)