# All coding alterations per sample:
- Germline (HC)
- Somatic:
    - SNV (intersect mutect, strelka,sage; with "rescued" mutations)
    - CNV (purple)
    - Fusions (GRIDDS)

In [1]:
import pandas as pd
import json
pd.set_option('display.max_rows', 250)

In [2]:
chroms = list(range(1,23))
chroms = ['chr'+str(chrom) for chrom in chroms]
sex_chroms = ['chrX','chrY']
chroms = chroms + sex_chroms
pts = ['pt1','pt2','pt3','pt4','pt5','pt6','pt7','pt8','pt10','pt11']
samples = json.load(open('/workspace/projects/sjd_pediatric_tumors/code/sample_ids.json','rb'))
ccf_thresholds = json.load( open( "/workspace/projects/sjd_pediatric_tumors/code/ccf_thresholds.json", "rb" ) )

In [3]:
samples

{'pt1': {'normal': 'AQ5174',
  'tumor1': 'AQ5180',
  'tumor2': 'AQ5186',
  'sex': 'female',
  'kidney': 'AX4954',
  'liver': 'AX4955',
  'pancreas': 'AX4956',
  'heart': 'AX4957',
  'clone1': 'AX4958',
  'clone2': 'AX4961',
  'mother': 'AW8063',
  'father': 'AW8064',
  'lung': 'AX4962',
  'medulla': 'AX4963',
  'spleen': 'AX4964',
  'brain': 'AX4965',
  'bma': 'AX4966'},
 'pt2': {'normal': 'AQ5175',
  'tumor1': 'AQ5181',
  'tumor2': 'AQ5187',
  'sex': 'female'},
 'pt3': {'normal': 'AQ5176',
  'tumor1': 'AQ5182',
  'tumor2': 'AQ5188',
  'sex': 'male'},
 'pt4': {'normal': 'AQ5177',
  'tumor1': 'AQ5183',
  'tumor2': 'AQ5189',
  'sex': 'female'},
 'pt5': {'normal': 'AQ5178',
  'tumor1': 'AQ5184',
  'tumor2': 'AQ5190',
  'sex': 'male'},
 'pt6': {'normal': 'AQ5179',
  'tumor1': 'AQ5185',
  'tumor2': 'AQ5191',
  'sex': 'female'},
 'pt7': {'normal': 'AW8058',
  'tumor1': 'AW8048',
  'tumor2': 'AW8049',
  'sex': 'female'},
 'pt8': {'normal': 'AW8061',
  'tumor1': 'AW8050',
  'tumor2': 'AW8051',

## Functions to arange all data

In [4]:
def ranked_table (df,gnomad=0.01):
#    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','n_alt_reads','n_ref_reads','gnomADg','gnomADg_AF','Damaging']][(df['gnomADg_AF']<gnomad)&((df['intogen']==True)|(df['germline']==True)|(df['germline_mskcc']==True))&(df['Damaging']==True)]
    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','n_alt_reads','n_ref_reads','gnomADg','gnomADg_AF','Damaging','STRAND']][(df['gnomADg_AF']<gnomad)&(df['Damaging']==True)]
    roles = ['LoF','ambiguous','Act']
    variants = ['truncating','miss_inframe','other']
    df['role'] = df['role'].astype("category")
    df['role'].cat.set_categories(roles, inplace=True)
    df['variant_type'] = df['variant_type'].astype("category")
    df['variant_type'].cat.set_categories(variants, inplace=True)
    df = df[~df['SYMBOL'].str.contains('HLA')].sort_values(['germline','germline_mskcc','germline_akh','role','variant_type'],ascending=[False,False,False,roles,variants])
    return df

In [5]:
def ranked_table_snvs (df,gnomad=0.001):
    #Filter by gnomad_AF and damaging
    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','t_AF','n_alt_reads','n_ref_reads','t_alt_reads','t_ref_reads','t_CCF','clonal','gnomADg','gnomADg_AF','Damaging','STRAND','SAMPLE']][(df['gnomADg_AF']<gnomad)&(df['Damaging']==True)]
    #Rank mutations by: tumor, role and variant_type
    tumors = ['both','tumor1','tumor2']
    roles = ['LoF','ambiguous','Act']
    variants = ['truncating','miss_inframe','other']
    df['role'] = df['role'].astype("category")
    df['role'].cat.set_categories(roles, inplace=True)
    df['variant_type'] = df['variant_type'].astype("category")
    df['variant_type'].cat.set_categories(variants, inplace=True)
    df = df[~df['SYMBOL'].str.contains('HLA')].sort_values(['germline','germline_mskcc','germline_akh','role','variant_type'],ascending=[False,False,False,roles,variants])
    df = df[(df['clonal']==True)]
    return df

In [6]:
def concat_all_mutations (germ_df,snv_df,sv_df,cnv_df):
    
    #Merge snv and cnv
    sample = snv_df['SAMPLE'][0:1].to_list()[0]
    somatic_df = pd.merge(snv_df,cnv_df,how='outer')
    somatic_df = somatic_df[(~somatic_df['mut'].isnull())|((somatic_df['mut'].isnull())&(somatic_df['CNA']!='-')&(~somatic_df['role'].isnull()))]
    somatic_cnv_df = somatic_df[somatic_df['mut'].isnull()]
    somatic_cnv_df = somatic_cnv_df[~((somatic_cnv_df['role']=='Act')&(somatic_cnv_df['CNA']=='del')|(somatic_cnv_df['role']=='LoF')&(somatic_cnv_df['CNA']=='amp'))]
    role = ['Act','LoF','ambinguous']
    somatic_cnv_df.role = somatic_cnv_df.role.astype("category")
    somatic_cnv_df.role.cat.set_categories(role, inplace=True)
    somatic_cnv_df = somatic_cnv_df.sort_values(by=['role','intogen'],ascending=[role,False])
    somatic_df = somatic_df[~somatic_df['mut'].isnull()]
    somatic_df = pd.concat([somatic_df,somatic_cnv_df],ignore_index=True)
    
    #Merge sv
    sv_df = sv_df[~sv_df['SYMBOL'].isnull()]
    sv_df = sv_df.rename(columns={'mut':'mut_sv'})
    sv_df['role'] = sv_df['role'].astype(object)
    sv_type = ['fusion','del','ins','inv','other']
    sv_df.sv_type = sv_df.sv_type.astype("category")
    sv_df.sv_type.cat.set_categories(sv_type, inplace=True)
    sv_df = sv_df.sort_values(by=['cgc_transl','sv_type'],ascending=[False,sv_type])
    
    somatic_df = pd.merge(somatic_df,sv_df,how='outer')
    
    #Concat germline
    somatic_df['origin'] = 'somatic'
    germ_df['origin'] = 'germline'
    germ_som_df = pd.concat([germ_df,somatic_df],ignore_index=True)
    germ_som_df['germline'] = germ_som_df.apply(lambda row: True if (row['germline']==True or row['germline_mskcc']==True or row['germline_akh']==True) else False,axis=1)
    
    #Annotate altered genes in somatic and germline    
    germline = germ_df['SYMBOL'].tolist()
    somatic = somatic_df['SYMBOL'].tolist()    
    germline_somatic = list(set(germline) & set(somatic))
    germ_som_df['germ_som'] = germ_som_df['SYMBOL'].apply(lambda x: True if x in germline_somatic else False)
    germ_som_df['SAMPLE'] = sample
    return germ_som_df

import matplotlib.pyplot as plt
import numpy as np
import six

def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    data.drop_duplicates(inplace=True)
    data['n_AF_real'] = round(data['n_AF_real'],3)
    data['t_AF'] = round(data['t_AF'],3)
    data['gnomADg_AF'] = round(data['gnomADg_AF'],6)
    data['CN'] = round(data['CN'],1)
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    for k, cell in six.iteritems(mpl_table._cells):
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
    return ax

In [7]:
#Functions to print alterations in a CGI format

nt_dict = {'A':'T','T':'A','C':'G','G':'C'}

def fix_to_pos_strand(mut, strand):
    #print(mut,strand)
    ref = mut.split(':')[2].split('>')[0]
    alt = mut.split(':')[2].split('>')[1]
    if strand == '-1':
        if ref != '-':
            ref2 = ''
            for n in range(0,len(ref)):
                n2 = nt_dict[ref[n]]
                ref2 += n2
        else:
            ref2 = '-'
        if alt != '-':
            alt2 = ''
            for n in range(0,len(alt)):
                n2 = nt_dict[alt[n]]
                alt2 += n2
        else:
            alt2 = '-'
                
        chrom = mut.split(':')[0]
        pos = mut.split(':')[1]
        mut2 = chrom + ':' + pos + ':' + ref2 + '>' + alt2
        #print('negative strand',ref,alt,'positive strand',ref2,alt2)
    else:
        mut2 = mut
        
    return mut2



def print_muts(df):
    df = df[~df['mut'].isnull()]
    df['mut2'] = df.apply(lambda row: fix_to_pos_strand(row['mut'],row['STRAND']),axis=1)
    list1 = df['mut'].tolist()
    list1 = [ l.replace(':',':g.',1) for l in list1]
    list1 = [ l.replace(':','',2) for l in list1]
    list1 = [ l.replace('g.',':g.',2) for l in list1]
    [print(l) for l in list1]

def print_cna(df):
    
    df = df[~(df['CNA'].isnull())]
    df = df[df['CNA']!='-']
    df['cna_annot'] = df['SYMBOL']+':'+df['CNA']
    list1 = df['cna_annot'].tolist()
    list1 = list(set(list1))
    [print(l) for l in list1]
    
def print_transl(df):
    df = df[~df['fusion'].isnull()]
    list1 = df['fusion'].tolist()
    list1 = [l for l in list1 if '-' not in l]
    list1 = [l.replace('/','__') for l in list1]
    list1 = list(set(list1))
    [print(l) for l in list1]


In [8]:
#Functions to print alterations in a CGI format

def table_muts(df):
    df = df[~df['mut'].isnull()]
    #df['mut2'] = df.apply(lambda row: fix_to_pos_strand(row['mut'],row['STRAND']),axis=1)
    df['gdna'] = df['mut'].str.replace(':',':g.',1)
    df['gdna'] = df['gdna'].str.replace(':','',2)
    df['gdna'] = df['gdna'].str.replace('g.',':g.',2)
    df2 = df[['gdna','SAMPLE']]
    df2.drop_duplicates(inplace=True)
    df2 = df2.rename(columns={'SAMPLE':'sample'})
    return df2

def table_cna(df):
    
    df = df[~(df['CNA'].isnull())]
    df = df[df['CNA']!='-']
    df2 = df[['SYMBOL','CNA','SAMPLE']]
    df2.drop_duplicates(inplace=True)
    df2 = df2.rename(columns={'CNA':'cna','SYMBOL':'gene','SAMPLE':'sample'})
    return df2
    
def table_transl(df):
    df = df[~df['fusion'].isnull()]
    df = df[~df['fusion'].str.contains('-')]
    df['fus'] = df['fusion'].str.replace('/','__')   
    df2 = df[['fus','SAMPLE']]
    df2 = df2.rename(columns={'SAMPLE':'sample'})
    return df2


FILTERINGS

**Germline:**
- gnomADg_AF<0.01
- No HLA
- Intogen driver (only LoF), germline
- Damaging (affecting protein sequence)  

**Somatic:**
- Snvs and indels:
    - gnomADg_AF<0.01
    - No HLA
    - Damaging (affecting protein sequence)
    - Clonals
- CNA:
    - intogen driver, germline
    - CNA-del in Act genes and CNA-amp in LoF genes
- SV:
    - Breakpoint inside a gene
    - Annotate if it is inframe or out of frame
    - CGC list: fusion genes


 # Tables with germline and somatic variants for the paper

## Patient 1: Neuroblastoma and Rhabdoid tumor (+9 years)
Case 3 in paper

In [283]:
pt = 'pt1'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [284]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']

t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&((t1_df['germline']==True))].sort_values('gnomADg_AF',ascending=True)


Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
5,RAF1,germline,True,Act,other,H389,0.484,6.3e-05,chr3:12599692:G>A
4,RAD50,germline,True,,miss_inframe,Q426R,0.582,0.000147,chr5:132589662:A>G
0,CDH1,germline,True,LoF,miss_inframe,P30T,0.412,0.001278,chr16:68738336:C>A
1,COL7A1,germline,True,,miss_inframe,G636V,0.464,0.004155,chr3:48590356:C>A
3,RTEL1,germline,True,,miss_inframe,GE770-771G,0.496,0.004962,chr20:63690338:AGA>-
2,GBA,germline,True,,miss_inframe,T408M,0.477,0.006204,chr1:155236246:G>A
6,SLX4,germline,True,,miss_inframe,R237Q,0.542,0.008306,chr16:3606524:C>T


In [285]:
t1_snv_df.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'NORMAL', 'TUMOR', 't_AF', 'n_AF', 'DP_tumor', 't_alt_reads',
       't_ref_reads', 'DP_normal', 'n_alt_reads', 'n_ref_reads', 'mut_type',
       'GT_normal', 'GT_tumor', 'Gene', 'Feature', 'Feature_type',
       'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
       'STRAND', 'FLAGS', 'SYMBOL', 'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL',
       'ENSP', 'SOURCE', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'CLIN_SIG', 'SOMATIC', 'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE',
       'subset_origin', 'SAMPLE', 'Damaging', 'mut', 'aa_change', 'n_AF_real',
       'intogen', 'germline', 'germline_mskcc', 'germline_akh', 'role',
       'variant_type', 'CN', 't_CCF', 'n_CCF', 'clonal'],
      dtype='object')

In [286]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
1632,chr9,32633113,G,A,TAF1L,,True,truncating,stop_gained,R823*,True,1
718,chr2,29239684,G,C,ALK,Act,True,miss_inframe,missense_variant,P784R,True,1
2,chr1,11266002,T,-,MTOR,Act,True,,upstream_gene_variant,-,True,1
15,chr1,51382417,T,-,EPS15,Act,True,,intron_variant,-,True,1
176,chr10,113066641,G,T,TCF7L2,Act,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1980,chrX,97514527,G,T,DIAPH2,,False,,intron_variant,-,False,2
2000,chrX,124743666,C,A,TENM1,,False,,intron_variant,-,False,2
2001,chrX,124925596,G,T,TENM1,,False,,intron_variant,-,False,2
2031,chrX,150160006,T,G,LINC00894,,False,,"intron_variant,non_coding_transcript_variant",-,False,2


In [291]:
snv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case3_snv_indels.tsv',sep='\t',index=None)

In [292]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
31,MYCN,True,Act,chr2:15942475:C>C[chr2:15942572[,MYCN-del,False,chr2,del,97,1
54,FOXP1,True,Act,chr3:71040531:C>C[chr3:71099048[,FOXP1-del,True,chr3,del,58517,1
70,GALNT1,False,,chr18:35589777:T>]chr18:35589952]T,GALNT1-del,False,chr18,del,175,1
0,ALG14,False,,chr1:95023313:A>[chr17:58991027[A,ALG14/TRIM37,False,chr1/chr17,fusion,-,1
67,TRIM37,False,,chr17:58991027:G>[chr1:95023313[G,TRIM37/ALG14,False,chr17/chr1,fusion,-,1
2,NBAS,False,,chr2:15454924:G>GTTTTTTTTTTTTTTTTTTA.,NBAS-ins,False,chr2,ins,0,1
6,NBAS,False,,chr2:15468148:A>AATAAGTGTCAGAGATCGGAAGAGCGTCGT...,NBAS-ins,False,chr2,ins,0,1
7,NBAS,False,,chr2:15468148:A>AATAAGTGTCAGAGATCGGAAGAGCGTCGT...,NBAS-ins,False,chr2,ins,0,1
8,NBAS,False,,chr2:15474578:T>TCTTTGAGAGATCGGAAGAGCACACGTCTG...,NBAS-ins,False,chr2,ins,0,1
1,NBAS,False,,chr2:15407882:T>]chr2:16627671]CAT,NBAS/FAM49A,False,chr2,inv,1219789,1


In [293]:
sv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case3_sv.tsv',sep='\t',index=None)

In [294]:
#somatic CNV variants
pt = 'pt1'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]


def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])

    return cn_final_df
t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr1,1,95023312,1.0253,1
0,chr2,"[15407882,16499225]","[15454924,16627671]","[291.6401,307.3457]",1
59,chr3,71040532,71099047,1.0782,1
0,chr17,"[58991027,69378970]","[69378834,83257441]","[3.0028,3.0119]",1
0,chr9,"[9229153,9249415]","[9249414,9672000]","[1.3871,1.4776]",2
78,chr21,21395644,21510344,1.3654,2
82,chr22,17229803,34540743,0.9947,2


In [230]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [289]:
intogen_df = pd.read_csv('/workspace/datasets/intogen/runs/v2023/20230224_release2023/run/intogen_analysis/unique_drivers.tsv',sep='\t')
intogen_drivers = intogen_df['SYMBOL'].tolist()

In [295]:
pt = 'pt1'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,df1),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr1,1,95023312,1.0253,1,"[MIB2, SLC35E2A, TNFRSF14, PRDM16, RPL22, ZBTB..."
0,chr2,"[15407882,16499225]","[15454924,16627671]","[291.6401,307.3457]",1,[MYCN]
59,chr3,71040532,71099047,1.0782,1,[]
0,chr17,"[58991027,69378970]","[69378834,83257441]","[3.0028,3.0119]",1,"[CLTC, PPM1D, CD79B, SMURF2, GNA13, AXIN2, PRK..."
0,chr9,"[9229153,9249415]","[9249414,9672000]","[1.3871,1.4776]",2,[]
78,chr21,21395644,21510344,1.3654,2,[]
82,chr22,17229803,34540743,0.9947,2,"[CLTCL1, DGCR8, LZTR1, MAPK1, BCR, SMARCB1, SU..."


In [296]:
segments_df2.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case3_cnv.tsv',sep='\t',index=None)

## Patient 2: ARMS + tAML
Case 1 in paper

In [297]:
pt = 'pt2'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [298]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
6,chr1,10680552,C,G,CASZ1,LoF,True,,intron_variant,-,True,1
247,chr1,240824706,C,A,RGS7,LoF,True,,intron_variant,-,True,1
507,chr11,118785224,A,G,DDX6,ambiguous,True,,intron_variant,-,True,1
705,chr12,132624666,G,C,POLE,LoF,True,,3_prime_UTR_variant,-,True,1
711,chr13,20012708,C,G,ZMYM2,,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2686,chrX,88843874,C,A,-,,False,,intergenic_variant,-,False,2
2706,chrX,97075625,G,T,DIAPH2,,False,,intron_variant,-,False,2
2714,chrX,102743951,C,T,BHLHB9,,False,,intron_variant,-,False,2
2775,chrX,144084558,T,A,-,,False,,intergenic_variant,-,False,2


In [299]:
snv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case1_snv_indels.tsv',sep='\t',index=None)

In [300]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
26,TCF7L2,True,Act,chr10:112988994:A>A[chr10:113046386[,TCF7L2-del,True,chr10,del,57392,1
27,FOXO1,True,Act,chr13:40596631:T>T[chr2:222218414[,FOXO1/PAX3,True,chr13/chr2,fusion,-,1
28,FOXO1,True,Act,chr13:40596668:C>[chr2:222217523[C,FOXO1/PAX3,True,chr13/chr2,fusion,-,1
34,TCF12,True,,chr15:57054537:G>G]chr15:57561950],TCF12/-,True,chr15,inv,507413,1
2,PATJ,False,,chr1:61928598:C>C[chr1:61933425[,PATJ-del,False,chr1,del,4827,1
11,TTC7A,False,,chr2:47041786:T>T[chr2:47041925[,TTC7A-del,False,chr2,del,139,1
15,PAX3,False,,chr2:222217724:C>C]chr2:222218643],PAX3-del,True,chr2,del,919,1
19,MEGF10,False,,chr5:127450737:A>]chr5:127457508]A,MEGF10-del,False,chr5,del,6771,1
20,PHACTR2,False,,chr6:143616201:A>A]chr6:143627824],PHACTR2-del,False,chr6,del,11623,1
30,KLHL1,False,,chr13:69933996:G>GAT[chr13:69974703[,KLHL1-del,False,chr13,del,40707,1


In [301]:
sv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case1_sv.tsv',sep='\t',index=None)

In [302]:
#somatic CNV variants
pt = 'pt2'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']
cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]
t1_segments_df1['Tumor'] = 1
t2_segments_df1['Tumor'] = 2
segments_df1 = pd.concat([t1_segments_df1[cols],t2_segments_df1[cols]])
segments_df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,chromosome,start,end,copyNumber,Tumor
7,chr1,123605523,176819977,3.2341,1
8,chr1,176819978,176942486,3.0032,1
9,chr1,176942487,183303835,3.2517,1
10,chr1,183303836,201308985,2.7705,1
11,chr1,201308986,229572858,3.3421,1
12,chr1,229572859,229724401,3.2304,1
13,chr1,229724402,248956422,3.2641,1
14,chr2,1,47041786,3.1302,1
16,chr2,47041925,93139350,2.9796,1
17,chr2,93139351,209402111,2.9341,1


In [303]:
#somatic CNV variants
pt = 'pt2'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]

def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])
    return cn_final_df

t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr1,"[123605523,229724402]","[176819977,248956422]","[2.7705,3.3421]",1
0,chr2,"[1,240472473]","[47041786,242193529]","[2.9111,4.1022]",1
38,chr6,143627825,147242546,1.0051,1
42,chr7,122369916,159345973,1.0042,1
0,chr12,"[1,35977330]","[35977329,133275309]","[2.9339,2.9552]",1
80,chr19,963743,1004704,3.0055,1
97,chrX,120750523,120762285,0.8943,1
0,chr7,"[1,59498944]","[59498943,159345973]","[1.0096,1.0187]",2


In [304]:
pt = 'pt2'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,df1),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr1,"[123605523,229724402]","[176819977,248956422]","[2.7705,3.3421]",1,"[BCL9, PDE4DIP, ARNT, SETDB1, S100A7, ZBTB7B, ..."
0,chr2,"[1,240472473]","[47041786,242193529]","[2.9111,4.1022]",1,"[MYCN, DNMT3A, ASXL2, ALK, BIRC6, SOS1, EPAS1,..."
38,chr6,143627825,147242546,1.0051,1,[]
42,chr7,122369916,159345973,1.0042,1,"[POT1, SMO, TRIM24, BRAF, EZH2, KMT2C]"
0,chr12,"[1,35977330]","[35977329,133275309]","[2.9339,2.9552]",1,"[CCND2, CHD4, PTPN6, ETV6, DUSP16, CDKN1B, ATF..."
80,chr19,963743,1004704,3.0055,1,[]
97,chrX,120750523,120762285,0.8943,1,[]
0,chr7,"[1,59498944]","[59498943,159345973]","[1.0096,1.0187]",2,"[CARD11, PMS2, RAC1, ETV1, MACC1, HNRNPA2B1, N..."


In [305]:
segments_df2.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case1_cnv.tsv',sep='\t',index=None)

##  Patient 3: Ependymoma + HGG HK27M +9y

In [306]:
pt = 'pt3'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [307]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,DICER1,germline,True,LoF,miss_inframe,Y1385C,0.456,0.0,chr14:95099832:T>C
1,WRN,germline,True,LoF,miss_inframe,T1262R,0.429,0.002702,chr8:31154721:C>G
2,SERPINA1,germline,True,,truncating,E347X,0.251,0.000253,chr14:94379488:CT>-
3,CFTR,germline,True,,miss_inframe,G576A,0.474,0.005081,chr7:117590400:G>C
4,CFTR,germline,True,,miss_inframe,R668C,0.414,0.006121,chr7:117592169:C>T


In [310]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
194,chr6,26031978,T,A,H3C2,,True,miss_inframe,missense_variant,K28M,True,1
12,chr1,179176486,-,A,ABL2,Act,True,,intron_variant,-,True,1
75,chr15,75440238,-,T,SIN3A,Act,True,,intron_variant,-,True,1
77,chr16,7232034,-,T,RBFOX1,Act,True,,intron_variant,-,True,1
142,chr3,85595120,C,A,CADM2,,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1401,chrY,15780365,C,T,-,,False,,intergenic_variant,-,False,2
1402,chrY,16450054,G,A,-,,False,,intergenic_variant,-,False,2
1403,chrY,16763764,G,A,-,,False,,intergenic_variant,-,False,2
1404,chrY,19129676,A,G,-,,False,,intergenic_variant,-,False,2


In [309]:
snv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case2_snv_indels.tsv',sep='\t',index=None)

In [311]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
0,PJA2,False,,chr5:109342397:G>G[chr5:109342435[,PJA2-del,False,chr5,del,38,1
11,FGL1,False,,chr8:17891100:T>T[chr8:17894283[,FGL1-del,False,chr8,del,3183,2
15,NALCN,False,,chr13:101132878:T>T[chr13:101132917[,NALCN-del,False,chr13,del,39,2
29,AFF2,False,,chrX:148655820:G>G[chrX:148655857[,AFF2-del,False,chrX,del,37,2
4,MAP3K20,False,,chr2:173249260:A>A[chr21:31254906[,MAP3K20/TIAM1,False,chr2/chr21,fusion,-,2
18,TIAM1,False,,chr21:31254906:A>]chr2:173249260]A,TIAM1/MAP3K20,False,chr21/chr2,fusion,-,2
0,,False,,chr1:118365082:C>[chr1:166745104[C,-/-,False,chr1,inv,48380022,2
1,,False,,chr1:166745104:G>[chr1:118365082[G,-/-,False,chr1,inv,-48380022,2
2,,False,,chr2:161078542:A>]chr2:161079084]A,-/-,False,chr2,inv,542,2
3,,False,,chr2:161079084:T>T[chr2:161078542[,-/-,False,chr2,inv,-542,2


In [312]:
sv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case2_sv.tsv',sep='\t',index=None)

In [271]:
#somatic CNV variants
pt = 'pt3'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']
cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]
t1_segments_df1['Tumor'] = 1
t2_segments_df1['Tumor'] = 2
segments_df1 = pd.concat([t1_segments_df1[cols],t2_segments_df1[cols]])
segments_df1

Unnamed: 0,chromosome,start,end,copyNumber,Tumor


##  Patient 8: BL + THC 
Case 4 in paper

In [313]:
pt = 'pt8'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [314]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,PTCH1,germline,True,LoF,miss_inframe,D436N,0.508,0.000824,chr9:95478096:C>T
1,SDHD,germline,True,LoF,miss_inframe,G12S,0.493,0.006979,chr11:112086941:G>A
2,SDHB,germline,True,LoF,miss_inframe,G53E,0.496,0.000405,chr1:17044803:C>T
3,MET,germline,True,Act,miss_inframe,T1010I,0.514,0.00897,chr7:116771936:C>T
4,PDGFRA,germline,True,Act,miss_inframe,G79D,0.579,0.008872,chr4:54261281:G>A
5,WAS,germline,True,Act,miss_inframe,V332A,0.486,0.004856,chrX:48688723:T>C
6,SERPINA1,germline,True,,truncating,E347X,0.116,0.000253,chr14:94379488:CT>-
7,AR,germline,True,Act,miss_inframe,GGGGG457-461-,1.0,0.009286,chrX:67546515:GGCGGCGGCGGCGGC>-
8,JMJD1C,germline,True,,miss_inframe,F130Y,0.508,4.9e-05,chr10:63264709:A>T
9,SHOC2,germline,True,,miss_inframe,E25G,0.511,7.7e-05,chr10:110964432:A>G


In [315]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
1084,chr13,40666152,G,A,FOXO1,Act,True,miss_inframe,missense_variant,R21C,True,1
1700,chr17,7674229,C,T,TP53,LoF,True,miss_inframe,missense_variant,G245D,True,1
1998,chr19,46919816,G,C,ARHGAP35,LoF,True,miss_inframe,missense_variant,V381L,True,1
3083,chr3,187725578,G,T,BCL6,Act,True,miss_inframe,missense_variant,A587D,True,1
4761,chr8,144512919,T,C,RECQL4,LoF,True,miss_inframe,missense_variant,R895G,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
429,chrX,118499583,C,T,DOCK11,,False,,intron_variant,-,False,2
430,chrX,119447007,G,A,SLC25A43,,False,,intron_variant,-,False,2
432,chrX,143091310,C,-,RN7SKP81,,False,,downstream_gene_variant,-,False,2
434,chrX,144321775,C,T,-,,False,,intergenic_variant,-,False,2


In [316]:
snv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case4_snv_indels.tsv',sep='\t',index=None)

In [317]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
8,FHIT,True,ambiguous,chr3:60334050:T>T[chr3:60343507[,FHIT-del,True,chr3,del,9457,1
15,DYDC2,False,,chr10:80356996:C>C[chr10:80357274[,DYDC2-del,False,chr10,del,278,1
20,RNF215,False,,nan:30387399:G>G[chr22:30387438[,RNF215-del,False,chr22,del,39,1
0,,False,,chr1:194481532:C>C[chr1:194481569[,-/-,False,chr1,inv,37,1
1,,False,,chr1:194481569:T>]chr1:194481532]T,-/-,False,chr1,inv,-37,1
2,,False,,chr2:88861251:A>AGGGGC[chr2:89196082[,-/-,False,chr2,inv,334831,1
3,,False,,chr2:88861257:A>A]chr2:88886153],-/-,False,chr2,inv,24896,1
4,,False,,chr2:88861924:C>[chr2:88897787[C,-/-,False,chr2,inv,35863,1
5,,False,,chr2:88886153:T>T]chr2:88861257],-/-,False,chr2,inv,-24896,1
6,,False,,chr2:88897787:C>[chr2:88861924[C,-/-,False,chr2,inv,-35863,1


In [318]:
sv_df1.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case4_sv.tsv',sep='\t',index=None)

In [319]:
#somatic CNV variants
pt = 'pt8'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']
cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]
t1_segments_df1['Tumor'] = 1
t2_segments_df1['Tumor'] = 2
segments_df1 = pd.concat([t1_segments_df1[cols],t2_segments_df1[cols]])
segments_df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,chromosome,start,end,copyNumber,Tumor
49,chr14,105864255,106373660,-0.0334,1
50,chr14,106373661,107043718,2.6802,1
1,chr1,43950914,119287566,1.0489,2


In [320]:
#somatic CNV variants
pt = 'pt8'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t2-allsamples-t2/purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]

def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])

    return cn_final_df

t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr14,"[105864255,106373661]","[106373660,107043718]","[-0.0334,2.6802]",1
1,chr1,43950914,119287566,1.0489,2


In [321]:
pt = 'pt8'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv('/workspace/datasets/sjd_seq/platinum_results/20220809/'+pt+'-t1-allsamples-t1/purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,df1),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr14,"[105864255,106373661]","[106373660,107043718]","[-0.0334,2.6802]",1,[]
1,chr1,43950914,119287566,1.0489,2,"[TAL1, STIL, CDKN2C, EPS15, JUN, JAK1, FUBP1, ..."


In [322]:
segments_df2.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_paper/case4_cnv.tsv',sep='\t',index=None)

_____________________________________________________________

## Analysis Patient 3

In [15]:
samples['pt3']

{'normal': 'AQ5176', 'tumor1': 'AQ5182', 'tumor2': 'AQ5188', 'sex': 'male'}

In [16]:
#all germline variants with gnomadg_AF<0.0001
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
df = t1_df[cols]

df[(df['origin']=='germline')&(df['gnomADg_AF']<0.0001)].sort_values(['variant_type','gnomADg_AF'],ascending=True)


Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
17,CSF3R,germline,False,Act,truncating,-,0.557,0.0,chr1:36481538:C>G
42,TEX15,germline,False,,truncating,RV2302-2303SX,0.507,0.0,chr8:30843260:CT>-
44,MROH1,germline,False,,truncating,R612*,0.497,0.0,chr8:144240576:C>T
45,ACMSD,germline,False,,truncating,W83*,0.475,0.0,chr2:134862017:G>A
46,ZNF679,germline,False,,truncating,G9X,0.572,0.0,chr7:64249142:GG>-
54,TLR5,germline,False,,truncating,R29*,0.525,7e-06,chr1:223112947:G>A
35,VWA8,germline,False,,truncating,R1464*,0.492,1.4e-05,chr13:41675234:G>A
28,IGHD3OR15-3A,germline,False,,truncating,L8X,0.154,2.8e-05,chr15:20005913:AA>-
40,PRSS55,germline,False,,truncating,D249X,0.494,2.8e-05,chr8:10538480:A>-
52,ZBBX,germline,False,,truncating,R424*,0.424,2.8e-05,chr3:167315754:G>A


### Pt3: germline

In [18]:
germ_df = pd.read_excel('/workspace/projects/sjd_pediatric_tumors/data/germline_muts_data/41467_2020_16067_MOESM8_ESM.xlsx',skiprows=2)
germ_df

Unnamed: 0,Patient_ID,Tumor_types,Project,IVA_Prediction,Hugo_Symbol,Cancer_Predisposition,Variant_Classification,Variant_Type,NCBI_Build,Chromosome,...,Allele_Fraction,Call_Quality,Read_Depth,Inferred_Activity,Sample_Genotype_Quality,CLINVAR_ID,CLINVAR_Description,Mode_of_Inheritance,Mode_of_Inheritance_abb,Gene_Classification
0,SJNBL135,NBL,SJ_CAYA,Pathogenic,ALK,Known Cancer Predisposing Gene (KCPG),Missense_Mutation,SNP,38,2,...,25.0,121.77,24,gain,99,RCV000019709.5; RCV000432041.1; RCV000423720.1...,"Pathogenic, risk factor; Pathogenic; Likely pa...",Autosomal-Dominant,AD,Oncogene
1,SJCNS019186,CNS,SJ_CAYA,Pathogenic,ATM,Known Cancer Predisposing Gene (KCPG),Splice_Site,SNP,38,11,...,20.0,63.77,40,loss,92,,none,Autosomal-Recessive ; Autosomal-Dominant,AR/AD,Tumor Suppressor Gene
2,SJOS018803,OS,SJ_CAYA,Pathogenic,ATM,Known Cancer Predisposing Gene (KCPG),In_Frame_Del,DEL,38,11,...,25.86,437.73,58,loss,99,RCV000212075.3; RCV000206671.6; RCV000003163.5...,Pathogenic; Pathogenic; Pathogenic; Pathogenic...,Autosomal-Recessive ; Autosomal-Dominant,AR/AD,Tumor Suppressor Gene
3,SJRB041657,RB,SJ_CAYA,Pathogenic,ATM,Known Cancer Predisposing Gene (KCPG),Splice_Site,SNP,38,11,...,20.0,91.77,65,loss,99,,none,Autosomal-Recessive ; Autosomal-Dominant,AR/AD,Tumor Suppressor Gene
4,SJRHB042125,RHB,SJ_CAYA,Pathogenic,ATM,Known Cancer Predisposing Gene (KCPG),Missense_Mutation,SNP,38,11,...,50.0,2377.77,164,loss,99,RCV000003159.5; RCV000115244.9; RCV000515429.1...,Pathogenic; Pathogenic; Pathogenic; Pathogenic...,Autosomal-Recessive ; Autosomal-Dominant,AR/AD,Tumor Suppressor Gene
5,SJRHB019594,RHB,SJ_CAYA,Pathogenic,ATR,Known Cancer Predisposing Gene (KCPG),Nonsense_Mutation,SNP,38,3,...,46.15,2537.77,182,loss,99,,none,Autosomal-Dominant,AD,Tumor Suppressor Gene
6,SJRB018827,RB,SJ_CAYA,Pathogenic,BAP1,Known Cancer Predisposing Gene (KCPG),Nonsense_Mutation,SNP,38,3,...,67.77,2195.77,121,loss,99,,none,Autosomal-Dominant,AD,Tumor Suppressor Gene
7,SJCNS018594,CNS,SJ_CAYA,Pathogenic,BARD1,Known Cancer Predisposing Gene (KCPG),Splice_Site,SNP,38,2,...,44.05,1011.77,84,loss,99,,none,Autosomal-Dominant,AD,Tumor Suppressor Gene
8,SJCNS018579,CNS,SJ_CAYA,Pathogenic,BRCA1,Known Cancer Predisposing Gene (KCPG),Frame_Shift_Del,DEL,38,17,...,56.1,1434.73,82,loss,99,RCV000047845.7; RCV000167767.8; RCV000031052.1...,Pathogenic; Pathogenic; Pathogenic; Pathogenic...,Autosomal-Dominant,AD,Tumor Suppressor Gene
9,SJCNS041476,CNS,SJ_CAYA,Pathogenic,BRCA1,Known Cancer Predisposing Gene (KCPG),Frame_Shift_Del,DEL,38,17,...,45.73,4018.73,234,loss,99,RCV000048287.7; RCV000031123.8; RCV000238864.1...,Pathogenic; Pathogenic; Pathogenic; Pathogenic...,Autosomal-Dominant,AD,Tumor Suppressor Gene


In [19]:
t1_df[(t1_df['origin']=='germline')&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
0,DICER1,True,True,True,True,LoF,miss_inframe,missense_variant,Y1385C,chr14:95099832:T>C,...,,,,,,,,,,False
1,WRN,True,False,True,False,LoF,miss_inframe,missense_variant,T1262R,chr8:31154721:C>G,...,,,,,,,,,,False
2,SERPINA1,True,False,True,False,,truncating,frameshift_variant,E347X,chr14:94379488:CT>-,...,,,,,,,,,,False
3,CFTR,True,False,True,False,,miss_inframe,missense_variant,G576A,chr7:117590400:G>C,...,,,,,,,,,,False
4,CFTR,True,False,True,False,,miss_inframe,missense_variant,R668C,chr7:117592169:C>T,...,,,,,,,,,,False


In [20]:
genes = t1_df[(t1_df['origin']=='germline')&(t1_df['germline']==True)]['SYMBOL'].tolist()
germ_df[['Tumor_types', 'Project', 'Hugo_Symbol','Variant_Classification', 'Variant_Type',
        'Chromosome', 'Start_Position', 'End_position','Cytoband', 'Reference_Allele', 'Sample_Allele1', 'Sample_Allele2',
        'Nucleotide_Change', 'Protein_Change','Mode_of_Inheritance','IVA_Prediction']][germ_df['Hugo_Symbol'].isin(genes)]

Unnamed: 0,Tumor_types,Project,Hugo_Symbol,Variant_Classification,Variant_Type,Chromosome,Start_Position,End_position,Cytoband,Reference_Allele,Sample_Allele1,Sample_Allele2,Nucleotide_Change,Protein_Change,Mode_of_Inheritance,IVA_Prediction
47,CNS,SJ_CAYA,DICER1,Frame_Shift_Del,DEL,14,95117714,95117714,q32.13,A,A,,c.1417delT,p.Y473fs*7,Autosomal-Dominant,Pathogenic
48,RHB,SJ_CAYA,DICER1,Frame_Shift_Ins,INS,14,95103391,95103392,q32.13,,,T,c.4004dupA; c.698dupA,p.Y233*; p.Y1335*,Autosomal-Dominant,Pathogenic
49,OST,SJ_CAYA,DICER1,Frame_Shift_Ins,INS,14,95124246,95124247,q32.13,,,A,c.1325dupT,p.L442fs*35,Autosomal-Dominant,Pathogenic
172,WLM,SJ_CAYA,DICER1,Missense_Mutation,SNP,14,95108353,95108353,q32.13,C,C,T,c.2407G>A,p.G803R,Autosomal-Dominant,Likely Pathogenic


In [21]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a known predisposing cancer gene to NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
100101,PHOX2B-AS1,False,False,,,-,0.479,0.006546,chr4:41802775:A>T,"intron_variant,non_coding_transcript_variant"
100102,PHOX2B-AS1,False,False,,,-,0.407,0.007551,chr4:41819209:G>A,"intron_variant,non_coding_transcript_variant"


In [22]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL']=='NBAS']

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
68769,NBAS,False,False,,,-,0.49,0.004258,chr2:15186085:T>C,intron_variant
68770,NBAS,False,False,,miss_inframe,K2034T,0.435,0.0,chr2:15234590:T>G,missense_variant
68771,NBAS,False,False,,,-,0.481,0.006055,chr2:15266415:C>A,intron_variant
68772,NBAS,False,False,,,-,0.456,0.006958,chr2:15291921:C>T,intron_variant
68773,NBAS,False,False,,,-,0.538,0.008877,chr2:15408609:T>C,intron_variant
75809,NBAS,False,False,,,-,1.0,0.0,chr2:15533682:GTGTGTGTGTGTGTGT>-,intron_variant


In [23]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('MACROD2')]

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
37506,MACROD2,False,False,,,-,0.486,7.7e-05,chr20:14091174:A>T,intron_variant
37507,MACROD2,False,False,,,-,0.452,7.7e-05,chr20:14113813:G>A,intron_variant
37508,MACROD2,False,False,,,-,0.455,7.7e-05,chr20:14113814:A>T,intron_variant
37509,MACROD2,False,False,,,-,0.493,0.004809,chr20:14158026:T>G,intron_variant
37510,MACROD2,False,False,,,-,0.503,0.009863,chr20:14241761:A>G,intron_variant
37511,MACROD2,False,False,,,-,0.527,6.3e-05,chr20:14257961:C>A,intron_variant
37512,MACROD2,False,False,,,-,0.38,7.2e-05,chr20:14389337:G>A,intron_variant
37513,MACROD2,False,False,,,-,0.434,0.002236,chr20:14422608:A>G,intron_variant
37514,MACROD2,False,False,,,-,0.521,0.002239,chr20:14447824:G>A,intron_variant
37515,MACROD2,False,False,,,-,0.512,0.002289,chr20:14526447:A>G,intron_variant


In [24]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       ]
df = t1_df[cols]

df[(df['origin']=='germline')&(df['gnomADg_AF']<.01)&(df['germline']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,DICER1,False,germline,True,True,LoF,miss_inframe,Y1385C,0.456,0.0,chr14:95099832:T>C
1,WRN,False,germline,True,False,LoF,miss_inframe,T1262R,0.429,0.002702,chr8:31154721:C>G
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.251,0.000253,chr14:94379488:CT>-
3,CFTR,False,germline,True,False,,miss_inframe,G576A,0.474,0.005081,chr7:117590400:G>C
4,CFTR,False,germline,True,False,,miss_inframe,R668C,0.414,0.006121,chr7:117592169:C>T


In [25]:

cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       ]
df = t1_df[cols]

#All truncating

df[(df['origin']=='germline')&(df['gnomADg_AF']<.01)&(df['variant_type']=='truncating')]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.251,0.000253,chr14:94379488:CT>-
16,MUC4,False,germline,False,False,Act,truncating,D3685X,0.924,0.005217,chr3:195780526:TC>-
17,CSF3R,False,germline,False,True,Act,truncating,-,0.557,0.0,chr1:36481538:C>G
28,IGHD3OR15-3A,False,germline,False,False,,truncating,L8X,0.154,2.8e-05,chr15:20005913:AA>-
29,TYRO3,False,germline,False,False,,truncating,-,0.142,0.00226,chr15:41570156:GGTAAGGGGATGGGGATGTGGAGGGAGAGGC...
30,AC126755.1,False,germline,False,False,,truncating,-,0.529,0.004195,chr16:18344806:CAGGGAGGCGCACACGCTCACAGAGGG>-
31,PRELID3A,False,germline,False,False,,truncating,M1R,0.416,3.5e-05,chr18:12407977:T>G
32,LINC01694,False,germline,False,False,,truncating,-,0.576,0.006585,chr21:45596390:G>A
33,PELP1,False,germline,False,False,,truncating,HAPSP25-29X,0.44,0.001724,chr17:4704175:TGGGGAGGGGGCG>-
34,ZNF417,False,germline,False,False,,truncating,E28*,0.441,0.002499,chr19:57912141:C>A


In [26]:

cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       ]
df = t1_df[cols]

#All missense, inframe
df[(df['origin']=='germline')&(df['gnomADg_AF']<.01)&(df['variant_type']=='miss_inframe')]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,DICER1,False,germline,True,True,LoF,miss_inframe,Y1385C,0.456,0.000000,chr14:95099832:T>C
1,WRN,False,germline,True,False,LoF,miss_inframe,T1262R,0.429,0.002702,chr8:31154721:C>G
3,CFTR,False,germline,True,False,,miss_inframe,G576A,0.474,0.005081,chr7:117590400:G>C
4,CFTR,False,germline,True,False,,miss_inframe,R668C,0.414,0.006121,chr7:117592169:C>T
5,MGA,False,germline,False,True,LoF,miss_inframe,N1982S,0.408,0.008404,chr15:41749552:A>G
...,...,...,...,...,...,...,...,...,...,...,...
503,INTS7,False,germline,False,False,,miss_inframe,I370T,0.572,0.003447,chr1:211982699:A>G
504,USH2A,False,germline,False,False,,miss_inframe,K2080N,0.481,0.007549,chr1:216046516:C>A
505,OBSCN,False,germline,False,False,,miss_inframe,L2539V,0.488,0.006148,chr1:228276932:C>G
506,NUP133,False,germline,False,False,,miss_inframe,G854D,0.479,0.004696,chr1:229463667:C>T


### Germline & Somatic Tumor 1

In [27]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']
df = t1_df[cols]

df[(df['germ_som']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance


### Germline & Somatic Tumor 2

In [28]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']
df = t2_df[cols]

df[(df['germ_som']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance


### PT3: somatic

In [29]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       ]
df = t1_df[cols]

df[(df['germ_som']==True)&(df['origin']=='germline')&(df['gnomADg_AF']<.01)&((df['variant_type']=='miss_inframe')|(df['variant_type']=='truncating'))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut


In [30]:
t1_df[t1_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
609,H3C2,False,False,False,True,,miss_inframe,missense_variant,K28M,chr6:26031978:T>A,...,,,,,,,,,,False
610,ZRSR2,False,False,,True,LoF,,,,,...,0.0,chrX:p22.2,,,,,,,,False
611,EIF1AX,False,False,,True,LoF,,,,,...,0.0,chrX:p22.12,,,,,,,,False
612,RPS6KA3,False,False,,True,LoF,,,,,...,0.0,chrX:p22.12,,,,,,,,False
613,ZFX,False,False,,True,LoF,,,,,...,0.0,chrX:p22.11,,,,,,,,False
614,BCOR,False,False,,True,LoF,,,,,...,0.0,chrX:p11.4,,,,,,,,False
615,DDX3X,False,False,,True,LoF,,,,,...,0.0,chrX:p11.4,,,,,,,,False
616,KDM6A,False,False,,True,LoF,,,,,...,0.0,chrX:p11.3,,,,,,,,False
617,RBM10,False,False,,True,LoF,,,,,...,0.0,chrX:p11.3,,,,,,,,False
618,WDR45,False,False,,True,LoF,,,,,...,0.0,chrX:p11.23,,,,,,,,False


In [31]:
t2_df.columns

Index(['SYMBOL', 'germline', 'germline_mskcc', 'germline_akh', 'intogen',
       'role', 'variant_type', 'Consequence', 'aa_change', 'mut', 'IMPACT',
       'n_AF', 'n_AF_real', 'n_alt_reads', 'n_ref_reads', 'gnomADg',
       'gnomADg_AF', 'Damaging', 'STRAND', 'origin', 't_AF', 't_alt_reads',
       't_ref_reads', 't_CCF', 'clonal', 'SAMPLE', 'CNA', 'CN',
       'CN_min_allele', 'cytoband', 'mut_sv', 'fusion', 'cgc_transl',
       'chr/chr', 'sv_type', 'distance', 'distance_rel', 'germ_som'],
      dtype='object')

In [32]:
cols = ['SYMBOL', 'origin', 'intogen', 'role',
       'variant_type', 'aa_change','gnomADg_AF', 'mut',
       ]
t2_df[cols+['CNA', 'CN','cytoband','fusion','chr/chr','sv_type']][t2_df['origin']=='somatic']

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,gnomADg_AF,mut,CNA,CN,cytoband,fusion,chr/chr,sv_type
609,H3C2,somatic,True,,miss_inframe,K28M,0.0,chr6:26031978:T>A,,,,,,
610,LYPLA1,somatic,False,,miss_inframe,A98V,0.0,chr8:54055127:G>A,-,1.9788,chr8:q11.23,,,
611,PDGFRA,somatic,True,Act,,,,,amp,2.2202,chr4:q12,,,
612,KIT,somatic,True,Act,,,,,amp,2.2202,chr4:q12,,,
613,KDR,somatic,True,Act,,,,,amp,2.2202,chr4:q12,,,
614,ZRSR2,somatic,True,LoF,,,,,del,0.9923,chrX:p22.2,,,
615,EIF1AX,somatic,True,LoF,,,,,del,0.9919,chrX:p22.12,,,
616,RPS6KA3,somatic,True,LoF,,,,,del,0.9919,chrX:p22.12,,,
617,ZFX,somatic,True,LoF,,,,,del,0.9919,chrX:p22.11,,,
618,BCOR,somatic,True,LoF,,,,,del,0.9919,chrX:p11.4,,,


In [34]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2','CNA_t1', 'CN_t1', 'CNA_t2', 'CN_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,CNA_t1,CN_t1,CNA_t2,CN_t2
609,H3C2,False,False,True,,,,,,,
610,ZRSR2,False,False,True,LoF,chrX:p22.2,chrX:p22.2,del,0.9986,del,0.9923
611,EIF1AX,False,False,True,LoF,chrX:p22.12,chrX:p22.12,del,0.9986,del,0.9919
612,RPS6KA3,False,False,True,LoF,chrX:p22.12,chrX:p22.12,del,0.9986,del,0.9919
613,ZFX,False,False,True,LoF,chrX:p22.11,chrX:p22.11,del,0.9986,del,0.9919
614,BCOR,False,False,True,LoF,chrX:p11.4,chrX:p11.4,del,0.9986,del,0.9919
615,DDX3X,False,False,True,LoF,chrX:p11.4,chrX:p11.4,del,0.9986,del,0.9919
616,KDM6A,False,False,True,LoF,chrX:p11.3,chrX:p11.3,del,0.9986,del,0.9919
617,RBM10,False,False,True,LoF,chrX:p11.3,chrX:p11.3,del,0.9986,del,0.9919
618,WDR45,False,False,True,LoF,chrX:p11.23,chrX:p11.23,del,0.9986,del,0.9919


In [35]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
5,Z82190.2,intron_variant,0.0,0.458,True,0.316,True
1,PDS5B,intron_variant,0.002089,0.42,True,0.296,True
7,H3C2,missense_variant,0.0,0.468,True,0.329,True
6,COL23A1,intron_variant,0.03096,0.133,False,0.167,False
3,CLEC4G,upstream_gene_variant,1.4e-05,0.493,True,0.396,True
0,CACUL1,3_prime_UTR_variant,0.0,0.52,True,0.448,True
2,AL512358.2,"intron_variant,non_coding_transcript_variant",0.0,0.412,True,0.341,True
4,-,intergenic_variant,2.1e-05,0.439,True,0.386,True


In [36]:
df = pd.merge(t1_cnv_df,t2_cnv_df,how='inner',on=['SYMBOL','CNA'],suffixes=['_t1','_t2'])
df = df[(df['CNA']!='-')]
df.sort_values(by='cytoband_t1',ascending=False)

Unnamed: 0,SYMBOL,germline_t1,germline_mskcc_t1,intogen_t1,role_t1,CNA,CN_t1,CN_min_allele_t1,cytoband_t1,germline_t2,germline_mskcc_t2,intogen_t2,role_t2,CN_t2,CN_min_allele_t2,cytoband_t2
1956,CTBP2P1,False,False,False,,del,0.9922,0.0,chrY:q12,False,False,False,,0.9955,0.0,chrY:q12
1955,CCNQP2,False,False,False,,del,0.9922,0.0,chrY:q12,False,False,False,,0.9955,0.0,chrY:q12
1881,GOLGA2P2Y,False,False,False,,del,0.9922,0.0,chrY:q11.23,False,False,False,,0.9955,0.0,chrY:q11.23
1895,TRIM60P11Y,False,False,False,,del,0.9922,0.0,chrY:q11.23,False,False,False,,0.9955,0.0,chrY:q11.23
1885,CICP2,False,False,False,,del,0.9922,0.0,chrY:q11.23,False,False,False,,0.9955,0.0,chrY:q11.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,MIR4536-2,False,False,False,,del,0.9986,0.0,chrX:p11.21,False,False,False,,0.9990,0.0,chrX:p11.21
5876,MIR4536-1,False,False,False,,del,0.9986,0.0,chrX:p11.21,False,False,False,,0.9990,0.0,chrX:p11.21
5877,MAGEH1,False,False,False,,del,0.9986,0.0,chrX:p11.21,False,False,False,,0.9990,0.0,chrX:p11.21
5878,USP51,False,False,False,,del,0.9986,0.0,chrX:p11.21,False,False,False,,0.9990,0.0,chrX:p11.21


In [37]:
df = pd.merge(t1_sv_df,t2_sv_df,how='inner',on=['SYMBOL','sv_type','fusion','chr/chr','germline','germline_mskcc','intogen','role','cgc_transl'],suffixes=['_t1','_t2'])
# df = df[(df['CNA']!='-')]
df[['SYMBOL','fusion','cgc_transl','chr/chr','sv_type','distance_t1','distance_t2','distance_rel_t1','distance_rel_t2','mut_t1','mut_t2']].sort_values(by='SYMBOL',ascending=False)

Unnamed: 0,SYMBOL,fusion,cgc_transl,chr/chr,sv_type,distance_t1,distance_t2,distance_rel_t1,distance_rel_t2,mut_t1,mut_t2


In [38]:
def order_muts(df):
    roles = ['LoF','ambiguous','Act']
    variants = ['truncating','miss_inframe','other']
    df['role'] = df['role'].astype("category")
    df['role'].cat.set_categories(roles, inplace=True)
    df['variant_type'] = df['variant_type'].astype("category")
    df['variant_type'].cat.set_categories(variants, inplace=True)
    df = df[~df['SYMBOL'].str.contains('HLA')].sort_values(['germline','germline_mskcc','germline_akh','role','variant_type'],ascending=[False,False,False,roles,variants])
    return df

#pt1_g_df[['SYMBOL','role','germline','intogen','gnomADg_AF','Consequence']][(pt1_g_df['SYMBOL']!='-')&(pt1_g_df['germline']==True)&(pt1_g_df['Consequence']!='intron_variant')].sort_values(by)
df = order_muts(g_df)
df[['SYMBOL','role','germline','intogen','gnomADg_AF','Consequence','mut','n_AF_real']][(g_df['SYMBOL']!='-')&(g_df['germline']==True)&(g_df['Consequence']=='3_prime_UTR_variant')]


  del sys.path[0]


Unnamed: 0,SYMBOL,role,germline,intogen,gnomADg_AF,Consequence,mut,n_AF_real
2998,PTEN,LoF,True,True,2.1e-05,3_prime_UTR_variant,chr10:87969525:G>A,0.52
19645,SMAD4,LoF,True,True,0.0,3_prime_UTR_variant,chr18:51084008:GCGCGCACAC>-,0.73
46605,FANCC,LoF,True,True,0.002255,3_prime_UTR_variant,chr9:95101257:C>T,0.481
23075,RUNX1,ambiguous,True,True,0.003087,3_prime_UTR_variant,chr21:34791522:T>A,0.421
104610,PDGFRA,Act,True,True,0.0,3_prime_UTR_variant,chr4:54296369:TT>-,0.961
26736,SMARCE1,LoF,True,False,0.006633,3_prime_UTR_variant,chr17:40626609:C>T,0.49


In [39]:
#germline non-coding variants
df[['SYMBOL','role','germline','intogen','gnomADg_AF','Consequence','mut','n_AF_real']][(g_df['germline']==True)&(g_df['Consequence']!='intron_variant')&(g_df['Consequence']!='intergenic region')]


  


Unnamed: 0,SYMBOL,role,germline,intogen,gnomADg_AF,Consequence,mut,n_AF_real
15209,DICER1,LoF,True,True,0.0,missense_variant,chr14:95099832:T>C,0.456
2998,PTEN,LoF,True,True,2.1e-05,3_prime_UTR_variant,chr10:87969525:G>A,0.52
11264,CDH1,LoF,True,True,0.000628,synonymous_variant,chr16:68833370:C>T,0.441
19645,SMAD4,LoF,True,True,0.0,3_prime_UTR_variant,chr18:51084008:GCGCGCACAC>-,0.73
25566,TP53,LoF,True,True,0.00188,downstream_gene_variant,chr17:7667061:C>T,0.574
27294,BRIP1,LoF,True,False,0.005919,upstream_gene_variant,chr17:61863725:A>G,0.477
31018,SMARCA4,LoF,True,True,0.0,upstream_gene_variant,chr19:10959656:AAAC>-,1.0
46605,FANCC,LoF,True,True,0.002255,3_prime_UTR_variant,chr9:95101257:C>T,0.481
46610,FANCC,LoF,True,True,7e-06,"intron_variant,non_coding_transcript_variant",chr9:95338981:T>C,0.472
46611,FANCC,LoF,True,True,0.001423,"intron_variant,non_coding_transcript_variant",chr9:95364683:A>T,0.478


In [40]:
df1 = g_df[g_df['SYMBOL']!='-']
df2 = t1_snv_df[t1_snv_df['SYMBOL']!='-']

df = pd.merge(df1,df2,on=['SYMBOL','germline','intogen'],how='inner',suffixes=['_germ','_som'])

variants = ['truncating','miss_inframe','other']
df[['SYMBOL','germline','intogen','Consequence_germ','Consequence_som','aa_change_germ','aa_change_som','mut_germ','mut_som']][df['variant_type_germ'].isin(variants)|df['variant_type_som'].isin(variants)]

Unnamed: 0,SYMBOL,germline,intogen,Consequence_germ,Consequence_som,aa_change_germ,aa_change_som,mut_germ,mut_som
468,PHKB,False,False,missense_variant,intron_variant,N632D,-,chr16:47650844:A>G,chr16:47537021:AT>-
699,KLHL1,False,False,missense_variant,intron_variant,S80P,-,chr13:70107462:A>G,chr13:69787150:->A
1620,ROBO1,False,False,"splice_region_variant,intron_variant",intron_variant,-,-,chr3:78661986:A>C,chr3:79512975:A>G


In [41]:
df1 = g_df[g_df['SYMBOL']!='-']
df2 = t2_snv_df[t2_snv_df['SYMBOL']!='-']

df = pd.merge(df1,df2,on=['SYMBOL','germline','intogen'],how='inner',suffixes=['_germ','_som'])

variants = ['truncating','miss_inframe','other']
df[['SYMBOL','germline','intogen','Consequence_germ','Consequence_som','aa_change_germ','aa_change_som','mut_germ','mut_som']][(df['variant_type_germ'].isin(variants))&(df['variant_type_som'].isin(variants))]

Unnamed: 0,SYMBOL,germline,intogen,Consequence_germ,Consequence_som,aa_change_germ,aa_change_som,mut_germ,mut_som
6717,SAMD9L,False,False,missense_variant,inframe_deletion,Q1010E,PLMEAL1057-1062L,chr7:93132944:G>C,chr7:93132788:AAGCTTCCATTAATG>-


## Patient 2: ARMS and AML +4y

In [49]:
pt = 'pt2'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [43]:
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut',
       'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']
df = t1_df[cols]
df[(df['origin']=='germline')&(df['gnomADg_AF']<.001)&(df['gnomADg_AF']>.0001)&((df['germline']==True)|(df['intogen']==True))|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
1,ATM,False,germline,True,True,LoF,miss_inframe,R2854C,0.53,0.000126,chr11:108345884:C>T,,,,,,,,
2,EGFR,False,germline,True,True,Act,miss_inframe,A1210V,0.485,0.000328,chr7:55205613:C>T,,,,,,,,
4,CBL,False,germline,True,True,Act,miss_inframe,P687L,0.488,0.000119,chr11:119296941:C>T,,,,,,,,
7,FAT1,False,germline,True,True,LoF,miss_inframe,F2549L,0.483,0.000223,chr4:186618941:A>G,,,,,,,,
8,FANCM,False,germline,True,False,,miss_inframe,D556G,0.486,0.000105,chr14:45164444:A>G,,,,,,,,
9,JMJD1C,False,germline,True,False,,miss_inframe,S1429L,0.383,0.000545,chr10:63207383:G>A,,,,,,,,
11,KMT2C,True,germline,False,True,LoF,miss_inframe,G315S,0.177,0.000467,chr7:152273774:C>T,,,,,,,,
14,TFAP4,False,germline,False,True,LoF,miss_inframe,S245F,0.557,0.00021,chr16:4260178:G>A,,,,,,,,
15,AMER1,False,germline,False,True,LoF,miss_inframe,A264T,0.518,0.000197,chrX:64192497:C>T,,,,,,,,
21,FAT4,False,germline,False,True,LoF,miss_inframe,S3752G,0.515,0.000593,chr4:125452270:A>G,,,,,,,,


In [44]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,ATM,False,germline,True,True,LoF,miss_inframe,V182L,0.65,0.007418,chr11:108244000:G>C,,,,,,,,
1,ATM,False,germline,True,True,LoF,miss_inframe,R2854C,0.53,0.000126,chr11:108345884:C>T,,,,,,,,
2,EGFR,False,germline,True,True,Act,miss_inframe,A1210V,0.485,0.000328,chr7:55205613:C>T,,,,,,,,
3,MET,False,germline,True,True,Act,other,-,0.553,0.001314,chr7:116769637:C>-,,,,,,,,
4,CBL,False,germline,True,True,Act,miss_inframe,P687L,0.488,0.000119,chr11:119296941:C>T,,,,,,,,
6,EP300,False,germline,True,True,LoF,miss_inframe,G211S,0.529,0.006329,chr22:41117723:G>A,,,,,,,,
7,FAT1,False,germline,True,True,LoF,miss_inframe,F2549L,0.483,0.000223,chr4:186618941:A>G,,,,,,,,
11,KMT2C,True,germline,False,True,LoF,miss_inframe,G315S,0.177,0.000467,chr7:152273774:C>T,,,,,,,,
12,PRDM2,False,germline,False,True,LoF,miss_inframe,KE1257-1258K,0.491,0.004047,chr1:13781565:AAG>-,,,,,,,,
13,PTPRC,False,germline,False,True,LoF,miss_inframe,N351S,0.528,0.009358,chr1:198709705:A>G,,,,,,,,


In [45]:
g_df.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'NORMAL', 'n_AF', 'n_alt_reads', 'n_ref_reads', 'DP_normal', 'mut_type',
       'GT_normal', 'Gene', 'Feature', 'Feature_type', 'Consequence',
       'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
       'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS',
       'SYMBOL', 'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'ENSP', 'SOURCE',
       'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'CLIN_SIG', 'SOMATIC',
       'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE', 'subset_origin',
       'SAMPLE', 'maf_vs_vep', 'Damaging', 'mut', 'aa_change', 'n_AF_real',
       'intogen', 'germline', 'germline_mskcc', 'germline_akh', 'role',
       'variant_type'],
      dtype='object')

In [46]:
#table germline for paper
cols = ['SYMBOL','role','Consequence','aa_change','gnomADg_AF','Protein_position','germline','germline','germline_mskcc','germline_akh']
g_df[cols][(g_df['germline']==True)|(g_df['germline_mskcc']==True)|(g_df['germline_akh']==True)].drop_duplicates()

Unnamed: 0,SYMBOL,role,Consequence,aa_change,gnomADg_AF,Protein_position,germline,germline.1,germline_mskcc,germline_akh
393,PMS2,LoF,intron_variant,-,0.005007,-,True,True,True,True
2237,GLI3,,intron_variant,-,0.000405,-,False,False,False,True
2238,GLI3,,intron_variant,-,0.003811,-,False,False,False,True
2239,GLI3,,intron_variant,-,0.008488,-,False,False,False,True
2240,GLI3,,intron_variant,-,0.008655,-,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
183731,DIS3L2,,intron_variant,-,0.001133,-,True,True,False,True
184089,SOS1,Act,intron_variant,-,0.000000,-,True,True,False,True
185173,ABCB11,,intron_variant,-,0.000000,-,True,True,False,True
185556,ERBB4,LoF,upstream_gene_variant,-,0.000000,-,False,False,False,True


In [47]:
#table germline for paper
cols = ['SYMBOL','role','Consequence','aa_change','gnomADg_AF','origin','germline']
t1_df[cols][(t1_df['origin']=='germline')&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,role,Consequence,aa_change,gnomADg_AF,origin,germline
0,ATM,LoF,missense_variant,V182L,0.007418,germline,True
1,ATM,LoF,missense_variant,R2854C,0.000126,germline,True
2,EGFR,Act,missense_variant,A1210V,0.000328,germline,True
3,MET,Act,"splice_region_variant,intron_variant",-,0.001314,germline,True
4,CBL,Act,missense_variant,P687L,0.000119,germline,True
5,ITK,,missense_variant,R193Q,0.003875,germline,True
6,EP300,LoF,missense_variant,G211S,0.006329,germline,True
7,FAT1,LoF,missense_variant,F2549L,0.000223,germline,True
8,FANCM,,missense_variant,D556G,0.000105,germline,True
9,JMJD1C,,missense_variant,S1429L,0.000545,germline,True


In [58]:
t2_df[(t2_df['origin']=='somatic')&(t2_df['intogen']==True)]

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
859,WT1,True,True,True,True,LoF,truncating,frameshift_variant,R158RDX,chr11:32396397:->CGTCG,...,0.9825,chr11:p13,,,,,,,,False
860,KRAS,True,True,True,True,Act,miss_inframe,missense_variant,G12A,chr12:25245350:C>G,...,0.9893,chr12:p12.1,,,,,,,,False
861,CSF3R,False,False,False,True,Act,other,"splice_region_variant,synonymous_variant",E492,chr1:36469256:C>T,...,0.991,chr1:p34.3,,,,,,,,False
862,H3-3A,False,False,False,True,,miss_inframe,missense_variant,R50P,chr1:226065676:G>C,...,,,,,,,,,,False
873,PMS2,True,True,,True,LoF,,,,,...,0.012,chr7:p22.1,,,,,,,,False
874,NT5C3A,False,False,,True,LoF,,,,,...,0.012,chr7:p14.3,,,,,,,,False
875,IKZF1,False,False,,True,LoF,,,,,...,0.012,chr7:p12.2,,,,,,,,False
876,POT1,False,False,,True,LoF,,,,,...,0.0092,chr7:q31.33,,,,,,,,False
877,EZH2,False,False,,True,LoF,,,,,...,0.0092,chr7:q36.1,,,,,,,,False
878,KMT2C,False,False,,True,LoF,,,,,...,0.0092,chr7:q36.1,,,,,,,,True


In [50]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

KeyError: 'intogen'

In [189]:
muts = t2_df[t2_df['origin']=='somatic']['mut'].tolist()
t2_snv_df[['mut','STRAND','SYMBOL','Consequence']][t2_snv_df['mut'].isin(muts)]

Unnamed: 0,mut,STRAND,SYMBOL,Consequence
26,chr1:36469256:C>T,-1,CSF3R,"splice_region_variant,synonymous_variant"
343,chr11:32396397:->CGTCG,-1,WT1,frameshift_variant
468,chr12:6522966:G>A,1,NCAPD2,missense_variant
515,chr12:49269604:C>G,1,TUBA1C,missense_variant
1106,chr19:12754970:C>A,1,BEST2,missense_variant
1371,chr20:968059:C>A,-1,RSPO4,missense_variant
2381,chr8:56305625:A>T,-1,SDR16C5,missense_variant
2395,chr8:72875201:A>T,-1,AC090735.1,"splice_region_variant,non_coding_transcript_ex..."


In [199]:
cols = ['SYMBOL','germline', 'intogen', 'role','t_CCF','clonal',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
t2_snv_df[cols][t2_snv_df['intogen']==True]

Unnamed: 0,SYMBOL,germline,intogen,role,t_CCF,clonal,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
26,CSF3R,False,True,Act,0.986938,True,other,E492,0.0,0.0,chr1:36469256:C>T
197,NRP1,False,True,LoF,1.056561,True,,-,0.0,0.0,chr10:33211862:G>A
294,TCF7L2,False,True,Act,0.9447,True,,-,0.0,0.0,chr10:113134863:C>A
343,WT1,True,True,LoF,1.10023,True,truncating,R158RDX,0.0,0.0,chr11:32396397:->CGTCG
492,KRAS,False,True,Act,0.766388,False,miss_inframe,G12A,0.0,0.0,chr12:25245350:C>G
534,PTPRB,False,True,LoF,1.02733,True,,-,0.0,0.0,chr12:70611043:G>T
585,PTPN11,True,True,Act,0.109769,False,miss_inframe,E76G,0.0,0.0,chr12:112450407:A>G
597,NCOR2,False,True,LoF,1.151172,True,,-,0.0,0.0,chr12:124338966:C>T
786,TRIP11,False,True,Act,1.03754,True,,-,0.0,0.0,chr14:91985520:C>T
859,NTRK3,False,True,Act,1.181603,True,,-,0.0,0.0,chr15:88247379:T>A


In [47]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
161921,PHOX2B,True,True,LoF,,-,0.495,0.006526,chr4:41740208:T>C,downstream_gene_variant
161922,PHOX2B-AS1,False,False,,,-,0.439,0.006453,chr4:41755323:G>A,"intron_variant,non_coding_transcript_variant"
161923,PHOX2B-AS1,False,False,,,-,0.466,0.006539,chr4:41764266:T>C,"intron_variant,non_coding_transcript_variant"
161924,PHOX2B-AS1,False,False,,,-,0.539,0.006337,chr4:41788119:C>G,"intron_variant,non_coding_transcript_variant"
170536,PHOX2B-AS1,False,False,,,-,1.0,0.0,chr4:41776013:GTGT>-,"intron_variant,non_coding_transcript_variant"


In [48]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
859,POT1,False,False,True,LoF,chr7:q31.33,chr7:q31.33,,,,,,,,,,,,
860,EZH2,False,False,True,LoF,chr7:q36.1,chr7:q36.1,,,,,,,,,,,,
861,KMT2C,True,False,True,LoF,chr7:q36.1,chr7:q36.1,,,,,,,,,,,,
862,CNTNAP2,False,False,False,LoF,chr7:q35,chr7:q35,,,,,,,,,,,,


In [49]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
0,DGKI,intron_variant,0.48211,0.636,True,0.527,True


In [50]:
df[(df['origin']=='germline')&(df['intogen']==True)]

KeyError: 'origin'

In [51]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

KeyError: 'origin'

In [52]:
muts = t2_df[t2_df['origin']=='somatic']['mut'].tolist()
t2_snv_df[['mut','STRAND','SYMBOL','Consequence']][t2_snv_df['mut'].isin(muts)]

Unnamed: 0,mut,STRAND,SYMBOL,Consequence
26,chr1:36469256:C>T,-1,CSF3R,"splice_region_variant,synonymous_variant"
343,chr11:32396397:->CGTCG,-1,WT1,frameshift_variant
468,chr12:6522966:G>A,1,NCAPD2,missense_variant
515,chr12:49269604:C>G,1,TUBA1C,missense_variant
1106,chr19:12754970:C>A,1,BEST2,missense_variant
1371,chr20:968059:C>A,-1,RSPO4,missense_variant
2381,chr8:56305625:A>T,-1,SDR16C5,missense_variant
2395,chr8:72875201:A>T,-1,AC090735.1,"splice_region_variant,non_coding_transcript_ex..."


In [53]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
161921,PHOX2B,True,True,LoF,,-,0.495,0.006526,chr4:41740208:T>C,downstream_gene_variant
161922,PHOX2B-AS1,False,False,,,-,0.439,0.006453,chr4:41755323:G>A,"intron_variant,non_coding_transcript_variant"
161923,PHOX2B-AS1,False,False,,,-,0.466,0.006539,chr4:41764266:T>C,"intron_variant,non_coding_transcript_variant"
161924,PHOX2B-AS1,False,False,,,-,0.539,0.006337,chr4:41788119:C>G,"intron_variant,non_coding_transcript_variant"
170536,PHOX2B-AS1,False,False,,,-,1.0,0.0,chr4:41776013:GTGT>-,"intron_variant,non_coding_transcript_variant"


In [54]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
859,POT1,False,False,True,LoF,chr7:q31.33,chr7:q31.33,,,,,,,,,,,,
860,EZH2,False,False,True,LoF,chr7:q36.1,chr7:q36.1,,,,,,,,,,,,
861,KMT2C,True,False,True,LoF,chr7:q36.1,chr7:q36.1,,,,,,,,,,,,
862,CNTNAP2,False,False,False,LoF,chr7:q35,chr7:q35,,,,,,,,,,,,


In [55]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
0,DGKI,intron_variant,0.48211,0.636,True,0.527,True


## Patient 4: Synovial sarcoma and AML +1y

In [56]:
pt = 'pt4'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(pt1_g_ranked_df,pt1_t1_snv_ranked_df,pt1_t1_sv_df,pt1_t1_cnv_df)
t2_df = concat_all_mutations(pt1_g_ranked_df,pt1_t2_snv_ranked_df,pt1_t2_sv_df,pt1_t2_cnv_df)

NameError: name 'pt1_g_ranked_df' is not defined

In [57]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,CDH1,False,germline,True,True,LoF,miss_inframe,A592T,0.515,,0.003129,chr16:68822063:G>A,,,,,,,,
1,SDHB,False,germline,True,False,LoF,miss_inframe,R11H,0.476,,0.001807,chr1:17053988:C>T,,,,,,,,
2,APC,False,germline,True,True,LoF,miss_inframe,R2801Q,0.488,,3.5e-05,chr5:112843996:G>A,,,,,,,,
3,MSH6,False,germline,True,False,LoF,miss_inframe,M1156T,0.586,,0.0,chr2:47804938:T>C,,,,,,,,
4,DDB2,False,germline,True,True,LoF,miss_inframe,V374M,0.505,,0.000272,chr11:47237933:G>A,,,,,,,,
5,AXIN2,False,germline,True,False,LoF,other,-,0.475,,9.1e-05,chr17:65530110:G>C,,,,,,,,
6,WAS,False,germline,True,True,Act,miss_inframe,H180N,0.47,,0.001072,chrX:48686113:C>A,,,,,,,,
7,GJB2,False,germline,True,False,,miss_inframe,G4D,0.496,,0.000579,chr13:20189571:C>T,,,,,,,,
8,ETV6,False,germline,True,True,Act,miss_inframe,V166M,0.5,,0.000398,chr12:11869456:G>A,,,,,,,,
9,MSH3,False,germline,True,False,,miss_inframe,AAA51-53-,0.477,,0.001823,chr5:80654878:GCTGCAGCG>-,,,,,,,,


In [58]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,CDH1,False,germline,True,True,LoF,miss_inframe,A592T,0.515,,0.003129,chr16:68822063:G>A,,,,,,,,
2,APC,False,germline,True,True,LoF,miss_inframe,R2801Q,0.488,,3.5e-05,chr5:112843996:G>A,,,,,,,,
4,DDB2,False,germline,True,True,LoF,miss_inframe,V374M,0.505,,0.000272,chr11:47237933:G>A,,,,,,,,
6,WAS,False,germline,True,True,Act,miss_inframe,H180N,0.47,,0.001072,chrX:48686113:C>A,,,,,,,,
8,ETV6,False,germline,True,True,Act,miss_inframe,V166M,0.5,,0.000398,chr12:11869456:G>A,,,,,,,,
11,CREBBP,False,germline,True,True,LoF,miss_inframe,TPGS1990-1993S,0.442,,6.3e-05,chr16:3729070:TCCCCGGGG>-,,,,,,,,
12,FANCD2,False,germline,True,True,LoF,other,-,0.147,,0.002208,chr3:10064724:C>T,,,,,,,,
13,NTRK1,False,germline,True,True,Act,other,-,0.477,,0.009287,chr1:156864720:C>T,,,,,,,,
18,MGA,False,germline,False,True,LoF,miss_inframe,G2634R,0.515,,3.5e-05,chr15:41765041:G>C,,,,,,,,
19,MGA,False,germline,False,True,LoF,miss_inframe,D2677N,0.482,,0.0,chr15:41766111:G>A,,,,,,,,


In [59]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,CDH1,False,germline,True,True,LoF,miss_inframe,A592T,0.515,,0.003129,chr16:68822063:G>A,,,,,,,,
1,SDHB,False,germline,True,False,LoF,miss_inframe,R11H,0.476,,0.001807,chr1:17053988:C>T,,,,,,,,
2,APC,False,germline,True,True,LoF,miss_inframe,R2801Q,0.488,,3.5e-05,chr5:112843996:G>A,,,,,,,,
3,MSH6,False,germline,True,False,LoF,miss_inframe,M1156T,0.586,,0.0,chr2:47804938:T>C,,,,,,,,
4,DDB2,False,germline,True,True,LoF,miss_inframe,V374M,0.505,,0.000272,chr11:47237933:G>A,,,,,,,,
5,AXIN2,False,germline,True,False,LoF,other,-,0.475,,9.1e-05,chr17:65530110:G>C,,,,,,,,
6,WAS,False,germline,True,True,Act,miss_inframe,H180N,0.47,,0.001072,chrX:48686113:C>A,,,,,,,,
7,GJB2,False,germline,True,False,,miss_inframe,G4D,0.496,,0.000579,chr13:20189571:C>T,,,,,,,,
8,ETV6,True,germline,True,True,Act,miss_inframe,V166M,0.5,,0.000398,chr12:11869456:G>A,,,,,,,,
9,MSH3,False,germline,True,False,,miss_inframe,AAA51-53-,0.477,,0.001823,chr5:80654878:GCTGCAGCG>-,,,,,,,,


In [60]:
muts = t2_df[t2_df['origin']=='somatic']['mut'].tolist()
t2_snv_df[['mut','STRAND','SYMBOL','Consequence']][t2_snv_df['mut'].isin(muts)]

Unnamed: 0,mut,STRAND,SYMBOL,Consequence
16,chr1:77863954:G>A,1,MIGA1,missense_variant
139,chr12:25245350:C>A,-1,KRAS,missense_variant
362,chr2:189455343:A>G,1,WDR75,missense_variant
409,chr22:50287720:C>T,-1,PLXNB2,missense_variant
683,chr7:107952097:G>A,-1,LAMB1,missense_variant
790,chrX:27821570:C>G,1,MAGEB10,missense_variant


In [61]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
217076,PHOX2B-AS1,False,False,,,-,0.548,0.004383,chr4:41823114:A>T,"intron_variant,non_coding_transcript_variant"


In [62]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
1184,EPHA7,False,False,True,,chr6:q16.1,chr6:q16.1,,,,,,,,,,,,
1185,FOXO3,False,False,False,,chr6:q21,chr6:q21,,,,,,,,,,,,


In [63]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2


## Patient 5: Ewing sarcoma and AML +1.5y

In [65]:
pt = 'pt5'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [66]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,BLM,False,germline,True,False,LoF,miss_inframe,A1043D,0.567,,0.004123,chr15:90794275:C>A,,,,,,,,
1,PHOX2B,False,germline,True,True,LoF,miss_inframe,A223D,0.601,,0.0,chr4:41746084:G>T,,,,,,,,
2,SDHB,False,germline,True,False,LoF,miss_inframe,S163P,0.552,,0.009316,chr1:17027802:A>G,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,I316M,0.492,,0.006273,chr7:116700032:A>G,,,,,,,,
4,XPC,False,germline,True,True,LoF,miss_inframe,P334H,0.526,,0.007884,chr3:14158882:G>T,,,,,,,,
5,ERCC4,False,germline,True,False,LoF,other,-,0.423,,0.0021,chr16:13937761:T>C,,,,,,,,
6,KDR,False,germline,True,True,Act,miss_inframe,T771M,0.496,,0.001941,chr4:55098758:G>A,,,,,,,,
7,NTRK1,False,germline,True,True,Act,miss_inframe,R444Q,0.523,,0.009883,chr1:156874985:G>A,,,,,,,,
8,CDKN1C,False,germline,True,False,,miss_inframe,PAP208-210P,0.389,,0.006312,chr11:2884861:GGGGCC>-,,,,,,,,
9,CTR9,False,germline,True,False,,miss_inframe,Q1101K,0.52,,4.9e-05,chr11:10778884:C>A,,,,,,,,


In [67]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
1,PHOX2B,False,germline,True,True,LoF,miss_inframe,A223D,0.601,,0.0,chr4:41746084:G>T,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,I316M,0.492,,0.006273,chr7:116700032:A>G,,,,,,,,
4,XPC,False,germline,True,True,LoF,miss_inframe,P334H,0.526,,0.007884,chr3:14158882:G>T,,,,,,,,
6,KDR,False,germline,True,True,Act,miss_inframe,T771M,0.496,,0.001941,chr4:55098758:G>A,,,,,,,,
7,NTRK1,False,germline,True,True,Act,miss_inframe,R444Q,0.523,,0.009883,chr1:156874985:G>A,,,,,,,,
10,TRAF3,False,germline,False,True,LoF,miss_inframe,V367M,0.554,,0.000133,chr14:102903393:G>A,,,,,,,,
11,FOXA2,False,germline,False,True,LoF,miss_inframe,A297V,0.575,,0.00187,chr20:22582352:G>A,,,,,,,,
12,TOP1,False,germline,False,True,LoF,miss_inframe,R662Q,0.403,,7e-06,chr20:41121730:G>A,,,,,,,,
13,PRR14,False,germline,False,True,LoF,miss_inframe,M280V,0.505,,0.003135,chr16:30654808:A>G,,,,,,,,
14,DCC,False,germline,False,True,LoF,miss_inframe,N176S,0.523,,0.000112,chr18:52906158:A>G,,,,,,,,


In [68]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,BLM,False,germline,True,False,LoF,miss_inframe,A1043D,0.567,,0.004123,chr15:90794275:C>A,,,,,,,,
1,PHOX2B,False,germline,True,True,LoF,miss_inframe,A223D,0.601,,0.0,chr4:41746084:G>T,,,,,,,,
2,SDHB,False,germline,True,False,LoF,miss_inframe,S163P,0.552,,0.009316,chr1:17027802:A>G,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,I316M,0.492,,0.006273,chr7:116700032:A>G,,,,,,,,
4,XPC,False,germline,True,True,LoF,miss_inframe,P334H,0.526,,0.007884,chr3:14158882:G>T,,,,,,,,
5,ERCC4,False,germline,True,False,LoF,other,-,0.423,,0.0021,chr16:13937761:T>C,,,,,,,,
6,KDR,False,germline,True,True,Act,miss_inframe,T771M,0.496,,0.001941,chr4:55098758:G>A,,,,,,,,
7,NTRK1,False,germline,True,True,Act,miss_inframe,R444Q,0.523,,0.009883,chr1:156874985:G>A,,,,,,,,
8,CDKN1C,False,germline,True,False,,miss_inframe,PAP208-210P,0.389,,0.006312,chr11:2884861:GGGGCC>-,,,,,,,,
9,CTR9,False,germline,True,False,,miss_inframe,Q1101K,0.52,,4.9e-05,chr11:10778884:C>A,,,,,,,,


In [69]:
muts = t2_df[t2_df['origin']=='somatic']['mut'].tolist()
t2_snv_df[['mut','STRAND','SYMBOL','Consequence']][t2_snv_df['mut'].isin(muts)]

Unnamed: 0,mut,STRAND,SYMBOL,Consequence
115,chr1:163168294:T>C,-1,RGS5,missense_variant
406,chr11:72235192:A>T,1,INPPL1,missense_variant
458,chr11:125620076:->CTC,1,STT3A,inframe_insertion
557,chr12:88055708:T>G,-1,CEP290,missense_variant
998,chr17:73414729:G>A,-1,SDK2,missense_variant
1112,chr19:35725033:G>C,1,KMT2B,missense_variant
1438,chr21:44886866:C>T,-1,ITGB2,missense_variant
1441,chr21:46260862:C>T,-1,MCM3AP,missense_variant
2566,chr8:123082795:->T,1,TBC1D31,frameshift_variant
2584,chr8:144517160:G>C,-1,RECQL4,missense_variant


In [70]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
140138,PHOX2B,True,True,LoF,miss_inframe,A223D,0.601,0.0,chr4:41746084:G>T,missense_variant
140139,PHOX2B-AS1,False,False,,,-,0.511,0.004,chr4:41777400:C>T,"intron_variant,non_coding_transcript_variant"
140140,PHOX2B-AS1,False,False,,,-,0.537,0.003462,chr4:41815906:A>G,"intron_variant,non_coding_transcript_variant"
147681,PHOX2B-AS1,False,False,,,-,0.978,0.0,chr4:41776013:GT>-,"intron_variant,non_coding_transcript_variant"


In [71]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
857,NRG1,False,False,True,Act,chr8:p12,chr8:p12,,,,,,,,,,,,
858,FGFR1,False,False,True,Act,chr8:p11.23,chr8:p11.23,,,,,,,,,,,,
859,KAT6A,False,False,True,Act,chr8:p11.21,chr8:p11.21,,,,,,,,,,,,
860,IKBKB,False,False,True,Act,chr8:p11.21,chr8:p11.21,,,,,,,,,,,,
861,SOX17,False,False,True,Act,chr8:q11.23,chr8:q11.23,,,,,,,,,,,,
862,PLAG1,False,False,True,Act,chr8:q12.1,chr8:q12.1,,,,,,,,,,,,
863,PREX2,False,False,True,Act,chr8:q13.2,chr8:q13.2,,,,,,,,,,,,
864,NCOA2,True,False,True,Act,chr8:q13.3,chr8:q13.3,,,,,,,,,,,,
865,UBR5,True,False,True,Act,chr8:q22.3,chr8:q22.3,,,,,,,,,,,,
866,EIF3E,False,False,True,Act,chr8:q23.1,chr8:q23.1,,,,,,,,,,,,


In [72]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
1,Z93403.1,"intron_variant,non_coding_transcript_variant",1e-05,0.145,False,0.147,True
0,-,intergenic_variant,0.000316,0.136,False,0.122,True


## Patient 6: Neuroblastoma and AML +7.5y

In [73]:
pt = 'pt6'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [74]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PALB2,False,germline,True,True,LoF,miss_inframe,E105D,0.583,,7e-06,chr16:23636231:C>G,,,,,,,,
1,STK11,False,germline,True,True,LoF,miss_inframe,F354L,0.472,,0.003367,chr19:1223126:C>G,,,,,,,,
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.262,,0.000253,chr14:94379488:CT>-,,,,,,,,
3,DOCK8,False,germline,True,False,,truncating,-,0.493,,0.000356,chr9:271626:G>T,,,,,,,,
4,SERPINA1,False,germline,True,False,,miss_inframe,R247C,0.402,,0.002068,chr14:94381049:G>A,,,,,,,,
5,RAD50,False,germline,True,False,,miss_inframe,V315L,0.504,,0.001382,chr5:132587981:G>T,,,,,,,,
6,ATR,False,germline,True,True,LoF,other,-,0.528,,0.001022,chr3:142465089:C>T,,,,,,,,
7,EP300,True,germline,True,True,LoF,other,-,0.514,,0.007025,chr22:41127755:G>C,,,,,,,,
8,SMARCA2,False,germline,True,True,,miss_inframe,Q230P,0.457,,0.000328,chr9:2039799:A>C,,,,,,,,
596,HSPG2,False,somatic,False,True,ambiguous,miss_inframe,T118M,0.0,0.298,1.4e-05,chr1:21890586:G>A,-,2.0316,chr1:p36.12,,,,,


In [75]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PALB2,False,germline,True,True,LoF,miss_inframe,E105D,0.583,,7e-06,chr16:23636231:C>G,,,,,,,,
1,STK11,False,germline,True,True,LoF,miss_inframe,F354L,0.472,,0.003367,chr19:1223126:C>G,,,,,,,,
6,ATR,False,germline,True,True,LoF,other,-,0.528,,0.001022,chr3:142465089:C>T,,,,,,,,
7,EP300,True,germline,True,True,LoF,other,-,0.514,,0.007025,chr22:41127755:G>C,,,,,,,,
8,SMARCA2,False,germline,True,True,,miss_inframe,Q230P,0.457,,0.000328,chr9:2039799:A>C,,,,,,,,
9,ZFHX3,False,germline,False,True,LoF,miss_inframe,G585S,0.481,,0.005855,chr16:72958393:C>T,,,,,,,,
10,ASXL2,False,germline,False,True,LoF,miss_inframe,D487G,0.551,,0.0,chr2:25750096:T>C,,,,,,,,
12,MAP3K1,False,germline,False,True,LoF,miss_inframe,Q237R,0.5,,0.000858,chr5:56859791:A>G,,,,,,,,
13,ARID2,False,germline,False,True,LoF,miss_inframe,G1529R,0.514,,0.000167,chr12:45852708:G>A,,,,,,,,
14,PTPRB,False,germline,False,True,LoF,miss_inframe,P423L,0.489,,0.0,chr12:70594715:G>A,,,,,,,,


In [76]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PALB2,False,germline,True,True,LoF,miss_inframe,E105D,0.583,,7e-06,chr16:23636231:C>G,,,,,,,,
1,STK11,False,germline,True,True,LoF,miss_inframe,F354L,0.472,,0.003367,chr19:1223126:C>G,,,,,,,,
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.262,,0.000253,chr14:94379488:CT>-,,,,,,,,
3,DOCK8,False,germline,True,False,,truncating,-,0.493,,0.000356,chr9:271626:G>T,,,,,,,,
4,SERPINA1,False,germline,True,False,,miss_inframe,R247C,0.402,,0.002068,chr14:94381049:G>A,,,,,,,,
5,RAD50,False,germline,True,False,,miss_inframe,V315L,0.504,,0.001382,chr5:132587981:G>T,,,,,,,,
6,ATR,False,germline,True,True,LoF,other,-,0.528,,0.001022,chr3:142465089:C>T,,,,,,,,
7,EP300,False,germline,True,True,LoF,other,-,0.514,,0.007025,chr22:41127755:G>C,,,,,,,,
8,SMARCA2,False,germline,True,True,,miss_inframe,Q230P,0.457,,0.000328,chr9:2039799:A>C,,,,,,,,
596,WT1,False,somatic,True,True,LoF,truncating,R168X,0.0,0.187,0.0,chr11:32396368:G>-,-,2.0039,chr11:p13,,,,,


In [77]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
110848,PHOX2B-AS1,False,False,,,-,0.548,9.8e-05,chr4:41791209:G>T,"intron_variant,non_coding_transcript_variant"
115970,PHOX2B-AS1,False,False,,,-,1.0,0.0,chr4:41768932:A>-,"intron_variant,non_coding_transcript_variant"
115971,PHOX2B-AS1,False,False,,,-,1.0,0.0,chr4:41776013:GTGT>-,"intron_variant,non_coding_transcript_variant"


In [78]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
596,MUC4,True,False,False,Act,chr3:q29,,,chr3:195767831:A>]chr3:195767896]A,,MUC4-del,,False,,del,,chr3,,65.0


In [79]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2


## Patient 7: Neuroblastoma and AML +2y

In [110]:
pt = 'pt7'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [111]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PHOX2B,False,germline,True,True,LoF,miss_inframe,AAAAAA255-260A,0.483,,0.001359,,,,,,,,
1,BLM,False,germline,True,False,LoF,miss_inframe,T298M,0.506,,0.00354,,,,,,,,
2,MUTYH,False,germline,True,False,LoF,miss_inframe,T253I,0.485,,1.4e-05,,,,,,,,
3,CDH1,False,germline,True,True,LoF,miss_inframe,T506A,0.45,,1.4e-05,,,,,,,,
4,MSH6,False,germline,True,False,LoF,miss_inframe,L72F,0.504,,0.0,,,,,,,,
5,BRCA1,False,germline,True,True,LoF,miss_inframe,P346S,0.51,,6.3e-05,,,,,,,,
6,JAK2,False,germline,True,True,Act,miss_inframe,G127D,0.522,,0.000509,,,,,,,,
7,FAT1,False,germline,True,True,LoF,miss_inframe,H4146Q,0.504,,5.6e-05,,,,,,,,
8,LZTR1,False,germline,True,True,LoF,miss_inframe,I205T,0.481,,0.0,,,,,,,,
9,CASP10,False,germline,True,False,,miss_inframe,I185R,0.486,,2.1e-05,,,,,,,,


In [112]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PHOX2B,False,germline,True,True,LoF,miss_inframe,AAAAAA255-260A,0.483,,0.001359,,,,,,,,
3,CDH1,False,germline,True,True,LoF,miss_inframe,T506A,0.45,,1.4e-05,,,,,,,,
5,BRCA1,False,germline,True,True,LoF,miss_inframe,P346S,0.51,,6.3e-05,,,,,,,,
6,JAK2,False,germline,True,True,Act,miss_inframe,G127D,0.522,,0.000509,,,,,,,,
7,FAT1,False,germline,True,True,LoF,miss_inframe,H4146Q,0.504,,5.6e-05,,,,,,,,
8,LZTR1,False,germline,True,True,LoF,miss_inframe,I205T,0.481,,0.0,,,,,,,,
10,ATG7,True,germline,False,True,LoF,miss_inframe,A293T,0.509,,4.9e-05,,,,,,,,
11,TFG,False,germline,False,True,LoF,miss_inframe,N134S,0.587,,2.8e-05,,,,,,,,
12,GMPS,False,germline,False,True,LoF,miss_inframe,I93V,0.489,,4.2e-05,,,,,,,,
13,NFKBIE,False,germline,False,True,LoF,miss_inframe,S92W,0.542,,0.000321,,,,,,,,


In [113]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PHOX2B,False,germline,True,True,LoF,miss_inframe,AAAAAA255-260A,0.483,,0.001359,,,,,,,,
1,BLM,False,germline,True,False,LoF,miss_inframe,T298M,0.506,,0.00354,,,,,,,,
2,MUTYH,False,germline,True,False,LoF,miss_inframe,T253I,0.485,,1.4e-05,,,,,,,,
3,CDH1,False,germline,True,True,LoF,miss_inframe,T506A,0.45,,1.4e-05,,,,,,,,
4,MSH6,False,germline,True,False,LoF,miss_inframe,L72F,0.504,,0.0,,,,,,,,
5,BRCA1,False,germline,True,True,LoF,miss_inframe,P346S,0.51,,6.3e-05,,,,,,,,
6,JAK2,False,germline,True,True,Act,miss_inframe,G127D,0.522,,0.000509,,,,,,,,
7,FAT1,False,germline,True,True,LoF,miss_inframe,H4146Q,0.504,,5.6e-05,,,,,,,,
8,LZTR1,False,germline,True,True,LoF,miss_inframe,I205T,0.481,,0.0,,,,,,,,
9,CASP10,False,germline,True,False,,miss_inframe,I185R,0.486,,2.1e-05,,,,,,,,


In [117]:
t2_sv_df[t2_sv_df['chr/chr']=='chr11']

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel
19,DNAJC24,False,False,False,,chr11:31382627:T>T]chr11:35404978],DNAJC24/SLC1A2,False,chr11,inv,4022351,0.03
20,SLC1A2,False,False,False,,chr11:35404833:G>]chr11:112533063]G,SLC1A2/-,False,chr11,inv,77128230,0.57
21,SLC1A2,False,False,False,,chr11:35404978:A>A]chr11:31382627],SLC1A2/DNAJC24,False,chr11,inv,-4022351,-0.03
22,SLC1A2,False,False,False,,chr11:35405646:G>[chr11:120499493[G,SLC1A2/-,False,chr11,inv,85093847,0.63
23,SLC1A2,False,False,False,,chr11:35405717:A>A]chr11:112007939],SLC1A2/DIXDC1,False,chr11,inv,76602222,0.57
24,DIXDC1,False,False,False,,chr11:112007939:A>A]chr11:35405717],DIXDC1/SLC1A2,False,chr11,inv,-76602222,-0.57
25,,False,False,False,,chr11:112533063:G>G[chr11:35404833[,-/SLC1A2,False,chr11,inv,-77128230,-0.57
26,,False,False,False,,chr11:120499493:T>[chr11:35405646[T,-/SLC1A2,False,chr11,inv,-85093847,-0.63
27,,False,False,False,,chr11:120500201:A>A]chr11:123291997],-/-,False,chr11,inv,2791796,0.02
28,,False,False,False,,chr11:123291997:C>C]chr11:120500201],-/-,False,chr11,inv,-2791796,-0.02


In [84]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
40321,PHOX2B,True,True,LoF,miss_inframe,AAAAAA255-260A,0.483,0.001359,chr4:41745973:GCTGCCGCCGCTGCC>-,inframe_deletion
40322,PHOX2B-AS1,False,False,,,-,0.523,0.001047,chr4:41753468:G>A,"intron_variant,non_coding_transcript_variant"
40323,PHOX2B-AS1,False,False,,,-,0.476,0.001242,chr4:41769461:G>A,"intron_variant,non_coding_transcript_variant"
40324,PHOX2B-AS1,False,False,,,-,0.521,2.1e-05,chr4:41790832:C>A,"intron_variant,non_coding_transcript_variant"
40325,PHOX2B-AS1,False,False,,,-,0.518,0.002163,chr4:41816735:A>C,"intron_variant,non_coding_transcript_variant"
40326,PHOX2B-AS1,False,False,,,-,0.461,0.002576,chr4:41816985:C>G,"intron_variant,non_coding_transcript_variant"
40327,PHOX2B-AS1,False,False,,,-,0.6,0.000838,chr4:41818433:C>T,"intron_variant,non_coding_transcript_variant"


In [128]:
df1 = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/pt7/AW8048_vs_AW8058/filter_and_annot/AW8048_vs_AW8058_filt.maf.gz',sep='\t')
df1 = df1[['mut','mut_type','clonal']][(df1['mut_type']=='snv')&(df1['clonal']==True)]
df2 = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/pt7/AW8049_vs_AW8058/filter_and_annot/AW8049_vs_AW8058_filt.maf.gz',sep='\t')
df2 = df2[['mut','mut_type','clonal']][(df2['mut_type']=='snv')&(df2['clonal']==True)]

m_df = pd.merge(df1,df2,how='outer',indicator=True)
common_muts = m_df['mut'][m_df['_merge']=='both'].tolist()
m_df.groupby('_merge').count()

Unnamed: 0_level_0,mut,mut_type,clonal
_merge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
left_only,8166,8166,8166
right_only,5004,5004,5004
both,1416,1416,1416


In [150]:
df1 = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/pt7/AW8048_vs_AW8058/filter_and_annot/AW8048_vs_AW8058_filt.maf.gz',sep='\t')

df1[['SYMBOL','mut','#CHROM', 'Consequence', 'intogen', 'role']][df1['mut'].isin(common_muts)].groupby('intogen').count().sort_values('mut',ascending=False)

Unnamed: 0_level_0,SYMBOL,mut,#CHROM,Consequence,role
intogen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,1395,1395,1395,1395,10
True,21,21,21,21,21


In [153]:
df1 = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/pt7/AW8048_vs_AW8058/filter_and_annot/AW8048_vs_AW8058_filt.maf.gz',sep='\t')

df1[['SYMBOL','mut','#CHROM', 'Consequence','t_AF','t_CCF', 'intogen', 'role']][df1['mut'].isin(common_muts)&(df1['intogen']==True)]

Unnamed: 0,SYMBOL,mut,#CHROM,Consequence,t_AF,t_CCF,intogen,role
584,ABL2,chr1:179101629:C>T,chr1,3_prime_UTR_variant,0.105,0.518875,True,Act
598,DHX9,chr1:182888300:G>T,chr1,downstream_gene_variant,0.119,0.588059,True,Act
716,SLC45A3,chr1:205671122:C>A,chr1,intron_variant,0.107,0.52786,True,LoF
1168,CPEB3,chr10:92159186:C>A,chr10,intron_variant,0.143,0.560556,True,LoF
1241,TCF7L2,chr10:113096417:C>A,chr10,intron_variant,0.209,0.819273,True,Act
2538,GPC5,chr13:92567632:G>A,chr13,intron_variant,0.196,0.76263,True,LoF
3211,RBFOX1,chr16:5873249:T>G,chr16,"intron_variant,non_coding_transcript_variant",0.148,0.576248,True,Act
4053,DCC,chr18:53262003:C>T,chr18,intron_variant,0.248,0.970838,True,LoF
4478,ALK,chr2:29393303:C>T,chr2,intron_variant,0.185,0.728746,True,Act
5395,PTPRT,chr20:42742845:G>A,chr20,intron_variant,0.214,0.841099,True,LoF


In [154]:
df2 = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/pt7/AW8049_vs_AW8058/filter_and_annot/AW8049_vs_AW8058_filt.maf.gz',sep='\t')

df2[['SYMBOL','mut','#CHROM', 'Consequence','t_AF','t_CCF', 'intogen', 'role']][df2['mut'].isin(common_muts)&(df2['intogen']==True)]

Unnamed: 0,SYMBOL,mut,#CHROM,Consequence,t_AF,t_CCF,intogen,role
319,ABL2,chr1:179101629:C>T,chr1,3_prime_UTR_variant,0.083,0.521041,True,Act
327,DHX9,chr1:182888300:G>T,chr1,downstream_gene_variant,0.148,0.929085,True,Act
385,SLC45A3,chr1:205671122:C>A,chr1,intron_variant,0.144,0.903974,True,LoF
696,CPEB3,chr10:92159186:C>A,chr10,intron_variant,0.142,0.889332,True,LoF
735,TCF7L2,chr10:113096417:C>A,chr10,intron_variant,0.11,0.7975,True,Act
1565,GPC5,chr13:92567632:G>A,chr13,intron_variant,0.084,0.524395,True,LoF
1950,RBFOX1,chr16:5873249:T>G,chr16,"intron_variant,non_coding_transcript_variant",0.083,0.51787,True,Act
2377,DCC,chr18:53262003:C>T,chr18,intron_variant,0.076,0.475327,True,LoF
2640,ALK,chr2:29393303:C>T,chr2,intron_variant,0.115,0.720532,True,Act
3208,PTPRT,chr20:42742845:G>A,chr20,intron_variant,0.108,0.676544,True,LoF


In [148]:
t1_snv_df

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,...,intogen,germline,germline_mskcc,germline_akh,role,variant_type,CN,t_CCF,n_CCF,clonal
0,chr1,599120,.,G,T,.,PASS,DP=284;MQ=30.88;MQ0=37;NT=ref;QSS=93;QSS_NT=93...,DP:FDP:SDP:SUBDP:AU:CU:GU:TU,"85:0:0:0:0,0:0,0:85,130:0,0",...,False,False,False,False,,,2.0089,0.569918,0.000000,True
1,chr1,898803,.,C,G,546,PASS,RC=AGGGGT;RC_IDX=2;RC_LF=ATATGTACAC;RC_NM=1;RC...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:136,0:0:136:5119,0:141,0:0,0,0,0,0,136,136...",...,False,False,False,False,,,2.0089,0.833259,0.000000,True
2,chr1,1142085,.,G,T,861,PASS,RC=TCTGGA;RC_IDX=2;RC_LF=CCCTCTGTGC;RC_NM=1;RC...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:157,1:0.006329:158:6017,11:165,1:1,0,0,0,0...",...,False,False,False,False,,,2.0089,0.982617,0.023583,True
3,chr1,1328667,.,G,T,998,PASS,RC=CTTGA;RC_IDX=2;RC_LF=ACAGGAGGGT;RC_NM=1;RC_...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:158,0:0:158:6044,0:164,0:0,0,0,0,0,158,158...",...,False,False,False,False,,,2.0089,1.061227,0.000000,True
4,chr1,1386502,.,T,C,689,PASS,RC=AGCGG;RC_IDX=2;RC_LF=GGATGTCCTG;RC_NM=1;RC_...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:143,0:0:144:5205,0:147,0:0,0,0,0,0,143,144...",...,False,False,False,False,,,2.0089,0.962965,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11075,chrX,154132245,.,G,T,995,PASS,RC=CTTGA;RC_IDX=2;RC_LF=CTCTTTGGTA;RC_NM=1;RC_...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:132,1:0.007519:133:4994,11:138,1:1,0,0,0,0...",...,False,False,False,False,,,1.9225,1.103248,0.030753,True
11076,chrX,154722636,.,A,G,695,PASS,RC=TGGGTT;RC_IDX=3;RC_LF=GAGATATCTA;RC_NM=1;RC...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:130,0:0:130:4919,0:135,0:0,0,0,0,0,130,130...",...,False,False,False,False,,,1.9225,0.968705,0.000000,True
11077,chrX,155039199,.,A,G,803,PASS,RC=TAAGTC;RC_IDX=3;RC_LF=TGCCCATTTT;RC_NM=1;RC...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:155,0:0:155:5831,0:161,0:0,0,0,0,0,155,155...",...,False,False,False,False,,,1.9225,0.922576,0.000000,True
11078,chrX,155090359,.,T,C,573,PASS,RC=TCCGT;RC_IDX=2;RC_LF=ATTCTCAGAA;RC_NM=1;RC_...,GT:AD:AF:DP:RABQ:RAD:RC_CNT:RC_IPC:RC_JIT:RC_Q...,"./.:131,0:0:132:4895,0:135,0:0,0,0,0,0,131,132...",...,False,False,False,False,,,1.9225,0.711153,0.000000,True


In [122]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
1144,SEMA4F,False,False,False,,chr2:p13.1,chr2:p13.1,,,,,,,,,,,,
1145,NAE1,False,False,False,,chr16:q22.1,chr16:q22.1,,,,,,,,,,,,
1146,ADNP,False,False,False,,chr20:q13.13,chr20:q13.13,,,,,,,,,,,,
1147,OSMR,False,False,False,,chr5:p13.1,chr5:p13.1,,,,,,,,,,,,
1148,SPTAN1,False,False,False,,chr9:q34.11,chr9:q34.11,,,,,,,,,,,,
1149,USP11,False,False,False,,chrX:p11.3,chrX:p11.3,,,,,,,,,,,,
1150,SLC12A3,False,False,False,,chr16:q13,chr16:q13,,,,,,,,,,,,
1151,NF1P3,False,False,False,,chr21:q11.2,chr21:q11.2,,,,,,,,,,,,
1152,MLXIPL,False,False,False,,chr7:q11.23,chr7:q11.23,,,,,,,,,,,,
1153,U2AF1,False,False,True,Act,chr21:q22.3,chr21:q22.3,,,,,,,,,,,,


In [86]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
42,ZRANB2-AS2,"intron_variant,non_coding_transcript_variant",0.003545,0.205,True,0.071,True
1308,ZNRF2P2,"intron_variant,non_coding_transcript_variant",0.000000,0.125,True,0.085,True
1331,ZNF736,intron_variant,0.000000,0.167,True,0.095,True
612,ZNF556,intron_variant,0.000000,0.128,True,0.082,True
629,ZNF536,intron_variant,0.000000,0.228,True,0.093,True
...,...,...,...,...,...,...,...
363,-,intergenic_variant,0.000000,0.150,True,0.110,True
1096,-,intergenic_variant,0.000000,0.137,True,0.115,True
362,-,intergenic_variant,0.000000,0.182,True,0.134,True
361,-,intergenic_variant,0.000000,0.209,True,0.121,True


In [87]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2','intogen_t1']]
df[(~df['Consequence_t1'].isin(['intergenic_variant','intron_variant']))&(df['intogen_t1']==True)].sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2,intogen_t1
474,RBFOX1,"intron_variant,non_coding_transcript_variant",7e-06,0.148,True,0.083,True,True
1225,NFKBIE,upstream_gene_variant,0.0,0.213,True,0.058,True,True
1224,HSP90AB1,upstream_gene_variant,0.0,0.216,True,0.109,True,True
77,DHX9,downstream_gene_variant,0.0,0.119,True,0.148,True,True
76,ABL2,3_prime_UTR_variant,0.0,0.105,True,0.083,True,True


## Patient 8: Burkit Lymphoma and Thyroid cancer +11y

In [61]:
pt = 'pt8'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [62]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,D436N,0.508,,0.000824,chr9:95478096:C>T,,,,,,,,
1,SDHD,False,germline,True,False,LoF,miss_inframe,G12S,0.493,,0.006979,chr11:112086941:G>A,,,,,,,,
2,SDHB,False,germline,True,False,LoF,miss_inframe,G53E,0.496,,0.000405,chr1:17044803:C>T,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,T1010I,0.514,,0.00897,chr7:116771936:C>T,,,,,,,,
4,PDGFRA,False,germline,True,True,Act,miss_inframe,G79D,0.579,,0.008872,chr4:54261281:G>A,,,,,,,,
5,WAS,False,germline,True,True,Act,miss_inframe,V332A,0.486,,0.004856,chrX:48688723:T>C,,,,,,,,
6,SERPINA1,False,germline,True,False,,truncating,E347X,0.116,,0.000253,chr14:94379488:CT>-,,,,,,,,
7,AR,False,germline,True,True,Act,miss_inframe,GGGGG457-461-,1.0,,0.009286,chrX:67546515:GGCGGCGGCGGCGGC>-,,,,,,,,
8,JMJD1C,False,germline,True,False,,miss_inframe,F130Y,0.508,,4.9e-05,chr10:63264709:A>T,,,,,,,,
9,SHOC2,False,germline,True,False,,miss_inframe,E25G,0.511,,7.7e-05,chr10:110964432:A>G,,,,,,,,


In [63]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,D436N,0.508,,0.000824,chr9:95478096:C>T,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,T1010I,0.514,,0.00897,chr7:116771936:C>T,,,,,,,,
4,PDGFRA,False,germline,True,True,Act,miss_inframe,G79D,0.579,,0.008872,chr4:54261281:G>A,,,,,,,,
5,WAS,False,germline,True,True,Act,miss_inframe,V332A,0.486,,0.004856,chrX:48688723:T>C,,,,,,,,
7,AR,False,germline,True,True,Act,miss_inframe,GGGGG457-461-,1.0,,0.009286,chrX:67546515:GGCGGCGGCGGCGGC>-,,,,,,,,
11,TET2,False,germline,False,True,LoF,miss_inframe,P174H,0.446,,0.001411,chr4:105234463:C>A,,,,,,,,
12,ZFHX3,False,germline,False,True,LoF,miss_inframe,GGGGGGGG3519-3526G,0.463,,0.000696,chr16:72787699:CCGCCGCCGCCGCCGCCGCCA>-,,,,,,,,
13,ATG7,False,germline,False,True,LoF,miss_inframe,A530T,0.529,,0.002164,chr3:11360689:G>A,,,,,,,,
14,ATG7,False,germline,False,True,LoF,miss_inframe,R632W,0.542,,4.9e-05,chr3:11379990:C>T,,,,,,,,
15,RBM15,False,germline,False,True,LoF,miss_inframe,A668V,0.446,,0.004704,chr1:110341408:C>T,,,,,,,,


In [69]:
t1_df[t1_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
599,TP53,True,True,True,True,LoF,miss_inframe,missense_variant,G245D,chr17:7674229:C>T,...,0.9434,chr17:p13.1,,,,,,,,False
600,RECQL4,True,True,True,True,LoF,miss_inframe,missense_variant,R895G,chr8:144512919:T>C,...,0.9536,chr8:q24.3,,,,,,,,False
601,ARHGAP35,False,False,False,True,LoF,miss_inframe,missense_variant,V381L,chr19:46919816:G>C,...,0.9003,chr19:q13.32,,,,,,,,False
602,FOXO1,False,False,False,True,Act,miss_inframe,missense_variant,R21C,chr13:40666152:G>A,...,0.9644,chr13:q14.11,,,,,,,,False
603,BCL6,False,False,False,True,Act,miss_inframe,missense_variant,A587D,chr3:187725578:G>T,...,0.9748,chr3:q27.3,,,,,,,,False
604,IGHV4-34,False,False,False,False,,truncating,stop_gained,Y58*,chr14:106373858:G>C,...,1.2366,chr14:q32.33,,,,,,,,False
605,IGHV4-34,False,False,False,False,,miss_inframe,missense_variant,S61T,chr14:106373850:C>G,...,1.2366,chr14:q32.33,,,,,,,,False
606,IGHV4-34,False,False,False,False,,miss_inframe,missense_variant,Y59S,chr14:106373856:T>G,...,1.2366,chr14:q32.33,,,,,,,,False
607,ATAD2B,False,False,False,False,,truncating,frameshift_variant,GL535-536X,chr2:23834041:ATCC>-,...,0.9768,chr2:p23.3,,,,,,,,False
608,OGT,False,False,False,False,,truncating,stop_gained,E544*,chrX:71559294:G>T,...,0.8408,chrX:q13.1,,,,,,,,False


In [68]:
t1_sv_df[~t1_sv_df['fusion'].isnull()]

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel
0,,False,False,False,,chr1:194481532:C>C[chr1:194481569[,-/-,False,chr1,inv,37,0.0
1,,False,False,False,,chr1:194481569:T>]chr1:194481532]T,-/-,False,chr1,inv,-37,-0.0
2,,False,False,False,,chr2:88861251:A>AGGGGC[chr2:89196082[,-/-,False,chr2,inv,334831,0.0
3,,False,False,False,,chr2:88861257:A>A]chr2:88886153],-/-,False,chr2,inv,24896,0.0
4,,False,False,False,,chr2:88861924:C>[chr2:88897787[C,-/-,False,chr2,inv,35863,0.0
5,,False,False,False,,chr2:88886153:T>T]chr2:88861257],-/-,False,chr2,inv,-24896,-0.0
6,,False,False,False,,chr2:88897787:C>[chr2:88861924[C,-/-,False,chr2,inv,-35863,-0.0
7,,False,False,False,,chr2:89196082:T>]chr2:88861251]GGGGCT,-/-,False,chr2,inv,-334831,-0.0
8,FHIT,False,False,True,ambiguous,chr3:60334050:T>T[chr3:60343507[,FHIT-del,True,chr3,del,9457,0.0
9,,False,False,False,,chr5:111521459:A>A[chr5:111521511[,-/-,False,chr5,inv,52,0.0


In [281]:
#table germline for paper
cols = ['SYMBOL','role','Consequence','aa_change','gnomADg_AF','origin','germline']
t1_df[cols][(t1_df['origin']=='germline')&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,role,Consequence,aa_change,gnomADg_AF,origin,germline
0,PTCH1,LoF,missense_variant,D436N,0.000824,germline,True
1,SDHD,LoF,missense_variant,G12S,0.006979,germline,True
2,SDHB,LoF,missense_variant,G53E,0.000405,germline,True
3,MET,Act,missense_variant,T1010I,0.00897,germline,True
4,PDGFRA,Act,missense_variant,G79D,0.008872,germline,True
5,WAS,Act,missense_variant,V332A,0.004856,germline,True
6,SERPINA1,,frameshift_variant,E347X,0.000253,germline,True
7,AR,Act,inframe_deletion,GGGGG457-461-,0.009286,germline,True
8,JMJD1C,,missense_variant,F130Y,4.9e-05,germline,True
9,SHOC2,,missense_variant,E25G,7.7e-05,germline,True


In [202]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,D436N,0.508,,0.000824,chr9:95478096:C>T,,,,,,,,
1,SDHD,False,germline,True,False,LoF,miss_inframe,G12S,0.493,,0.006979,chr11:112086941:G>A,,,,,,,,
2,SDHB,False,germline,True,False,LoF,miss_inframe,G53E,0.496,,0.000405,chr1:17044803:C>T,,,,,,,,
3,MET,False,germline,True,True,Act,miss_inframe,T1010I,0.514,,0.00897,chr7:116771936:C>T,,,,,,,,
4,PDGFRA,False,germline,True,True,Act,miss_inframe,G79D,0.579,,0.008872,chr4:54261281:G>A,,,,,,,,
5,WAS,False,germline,True,True,Act,miss_inframe,V332A,0.486,,0.004856,chrX:48688723:T>C,,,,,,,,
6,SERPINA1,False,germline,True,False,,truncating,E347X,0.116,,0.000253,chr14:94379488:CT>-,,,,,,,,
7,AR,False,germline,True,True,Act,miss_inframe,GGGGG457-461-,1.0,,0.009286,chrX:67546515:GGCGGCGGCGGCGGC>-,,,,,,,,
8,JMJD1C,False,germline,True,False,,miss_inframe,F130Y,0.508,,4.9e-05,chr10:63264709:A>T,,,,,,,,
9,SHOC2,False,germline,True,False,,miss_inframe,E25G,0.511,,7.7e-05,chr10:110964432:A>G,,,,,,,,


In [92]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
62365,PHOX2B-AS1,False,False,,,-,0.594,7e-06,chr4:41753334:G>C,"intron_variant,non_coding_transcript_variant"
67149,PHOX2B-AS1,False,False,,,-,0.978,0.0,chr4:41776013:GTGT>-,"intron_variant,non_coding_transcript_variant"


In [93]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2


In [94]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
0,MALRD1,intron_variant,0.041356,0.399,True,0.346,True
1,-,intergenic_variant,0.01715,0.098,False,0.098,False


## Patient 10: Medulloblastoma and Meningioma +1.5y

In [96]:
pt = 'pt10'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [97]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,ATM,True,germline,True,True,LoF,miss_inframe,F763L,0.403,,0.000935,chr11:108257519:T>A,,,,,,,,
1,KIT,False,germline,True,True,Act,other,I935,0.489,,0.001752,chr4:54738431:T>A,,,,,,,,
2,AXIN2,False,germline,True,False,LoF,miss_inframe,A758T,0.471,,0.001751,chr17:65534045:C>T,,,,,,,,
3,ERCC4,False,germline,True,False,LoF,miss_inframe,P379S,0.471,,0.004633,chr16:13934224:C>T,,,,,,,,
4,COL7A1,False,germline,True,False,,miss_inframe,E1297K,0.512,,0.001395,chr3:48585562:C>T,,,,,,,,
5,ERBB2,True,germline,True,True,Act,miss_inframe,A386D,0.507,,0.003218,chr17:39715294:C>A,,,,,,,,
6,ATR,False,germline,True,True,LoF,miss_inframe,Y2132D,0.503,,0.003447,chr3:142469495:A>C,,,,,,,,
7,NSD1,False,germline,True,True,Act,miss_inframe,S1241T,0.562,,0.000657,chr5:177212121:G>C,,,,,,,,
8,AR,False,germline,True,True,Act,miss_inframe,QQ60-61Q,1.0,,0.0,chrX:67545326:GCA>-,,,,,,,,
9,MAP2K1,False,germline,True,True,Act,other,-,0.537,,0.002212,chr15:66489771:TATT>-,,,,,,,,


In [98]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,ATM,True,germline,True,True,LoF,miss_inframe,F763L,0.403,,0.000935,chr11:108257519:T>A,,,,,,,,
1,KIT,False,germline,True,True,Act,other,I935,0.489,,0.001752,chr4:54738431:T>A,,,,,,,,
5,ERBB2,True,germline,True,True,Act,miss_inframe,A386D,0.507,,0.003218,chr17:39715294:C>A,,,,,,,,
6,ATR,False,germline,True,True,LoF,miss_inframe,Y2132D,0.503,,0.003447,chr3:142469495:A>C,,,,,,,,
7,NSD1,False,germline,True,True,Act,miss_inframe,S1241T,0.562,,0.000657,chr5:177212121:G>C,,,,,,,,
8,AR,False,germline,True,True,Act,miss_inframe,QQ60-61Q,1.0,,0.0,chrX:67545326:GCA>-,,,,,,,,
9,MAP2K1,False,germline,True,True,Act,other,-,0.537,,0.002212,chr15:66489771:TATT>-,,,,,,,,
11,RANBP2,False,germline,False,True,LoF,miss_inframe,G1587V,0.465,,0.001996,chr2:108765299:G>T,,,,,,,,
12,CDH10,False,germline,False,True,LoF,miss_inframe,T177S,0.504,,0.0,chr5:24535820:T>A,,,,,,,,
13,SETD2,False,germline,False,True,LoF,miss_inframe,K280E,0.486,,7e-06,chr3:47123798:T>C,,,,,,,,


In [99]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,ATM,False,germline,True,True,LoF,miss_inframe,F763L,0.403,,0.000935,chr11:108257519:T>A,,,,,,,,
1,KIT,False,germline,True,True,Act,other,I935,0.489,,0.001752,chr4:54738431:T>A,,,,,,,,
2,AXIN2,False,germline,True,False,LoF,miss_inframe,A758T,0.471,,0.001751,chr17:65534045:C>T,,,,,,,,
3,ERCC4,False,germline,True,False,LoF,miss_inframe,P379S,0.471,,0.004633,chr16:13934224:C>T,,,,,,,,
4,COL7A1,False,germline,True,False,,miss_inframe,E1297K,0.512,,0.001395,chr3:48585562:C>T,,,,,,,,
5,ERBB2,True,germline,True,True,Act,miss_inframe,A386D,0.507,,0.003218,chr17:39715294:C>A,,,,,,,,
6,ATR,False,germline,True,True,LoF,miss_inframe,Y2132D,0.503,,0.003447,chr3:142469495:A>C,,,,,,,,
7,NSD1,False,germline,True,True,Act,miss_inframe,S1241T,0.562,,0.000657,chr5:177212121:G>C,,,,,,,,
8,AR,False,germline,True,True,Act,miss_inframe,QQ60-61Q,1.0,,0.0,chrX:67545326:GCA>-,,,,,,,,
9,MAP2K1,False,germline,True,True,Act,other,-,0.537,,0.002212,chr15:66489771:TATT>-,,,,,,,,


In [100]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
111532,PHOX2B,True,True,LoF,,-,0.556,0.007348,chr4:41739278:A>G,downstream_gene_variant
111533,PHOX2B-AS1,False,False,,,-,0.548,0.007405,chr4:41764850:G>A,"intron_variant,non_coding_transcript_variant"
111534,PHOX2B-AS1,False,False,,,-,0.51,0.000614,chr4:41826665:T>C,downstream_gene_variant
116605,PHOX2B-AS1,False,False,,,-,1.0,0.0,chr4:41776013:GT>-,"intron_variant,non_coding_transcript_variant"


In [165]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role','CNA_t1','CNA_t2', 'CN_t1','CN_t2',
        'cytoband_t1','cytoband_t2']][(common_df['origin']=='somatic')]

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,CNA_t1,CNA_t2,CN_t1,CN_t2,cytoband_t1,cytoband_t2
1144,SEMA4F,False,False,False,,-,-,2.0098,2.0155,chr2:p13.1,chr2:p13.1
1145,NAE1,False,False,False,,-,-,1.9952,1.9988,chr16:q22.1,chr16:q22.1
1146,ADNP,False,False,False,,-,-,2.0088,2.0143,chr20:q13.13,chr20:q13.13
1147,OSMR,False,False,False,,-,-,1.9242,1.9902,chr5:p13.1,chr5:p13.1
1148,SPTAN1,False,False,False,,-,-,2.0099,1.9889,chr9:q34.11,chr9:q34.11
1149,USP11,False,False,False,,-,-,1.8998,1.9675,chrX:p11.3,chrX:p11.3
1150,SLC12A3,False,False,False,,-,-,1.9937,1.9988,chr16:q13,chr16:q13
1151,NF1P3,False,False,False,,amp,amp,3.9168,2.2179,chr21:q11.2,chr21:q11.2
1152,MLXIPL,False,False,False,,-,-,1.9041,1.9917,chr7:q11.23,chr7:q11.23
1153,U2AF1,False,False,True,Act,amp,amp,4.1266,2.2179,chr21:q22.3,chr21:q22.3


In [102]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
0,EMCN,intron_variant,0.001574,0.082,False,0.074,False


## Patient 11: Burkit Lymphoma and AML +6m

In [169]:
pt = 'pt11'
normal = samples[pt]['normal']
g_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv('/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [170]:
cols = ['SYMBOL', 'germ_som','origin','germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 't_AF','gnomADg_AF', 'mut',
        'CNA', 'CN','cytoband', 'fusion','cgc_transl','sv_type', 'chr/chr','distance']

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
df = t1_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True)|(df['cgc_transl']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,P725S,0.508,,0.00097,chr9:95468828:G>A,,,,,,,,
1,RUNX1,False,germline,True,True,ambiguous,miss_inframe,G217E,0.495,,0.0,chr21:34834565:C>T,,,,,,,,
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.153,,0.000253,chr14:94379488:CT>-,,,,,,,,
3,RTEL1,False,germline,True,False,,miss_inframe,A112T,0.59,,0.000189,chr20:63661882:G>A,,,,,,,,
4,RTEL1,False,germline,True,False,,miss_inframe,Q1014E,0.504,,0.0,chr20:63694419:C>G,,,,,,,,
5,NTRK1,True,germline,True,True,Act,miss_inframe,R780Q,0.563,,0.004262,chr1:156881590:G>A,,,,,,,,
691,TP53,False,somatic,True,True,LoF,miss_inframe,R273H,0.0,0.407,1.4e-05,chr17:7673802:C>T,-,1.9962,chr17:p13.1,,,,,
692,ARID1A,False,somatic,False,True,LoF,miss_inframe,M2001T,0.0,0.468,0.0,chr1:26779900:T>C,-,2.1246,chr1:p36.11,,,,,
693,CCND3,False,somatic,False,True,Act,truncating,-291-292X,0.0,0.426,0.0,chr6:41935946:->GT,-,2.0773,chr6:p21.1,,,,,
694,FOXO1,False,somatic,False,True,Act,miss_inframe,T24I,0.0,0.383,0.0,chr13:40666142:G>A,-,2.1305,chr13:q14.11,,,,,


In [171]:
df[(df['origin']=='germline')&(df['intogen']==True)]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,P725S,0.508,,0.00097,chr9:95468828:G>A,,,,,,,,
1,RUNX1,False,germline,True,True,ambiguous,miss_inframe,G217E,0.495,,0.0,chr21:34834565:C>T,,,,,,,,
5,NTRK1,True,germline,True,True,Act,miss_inframe,R780Q,0.563,,0.004262,chr1:156881590:G>A,,,,,,,,
6,PTPRC,False,germline,False,True,LoF,miss_inframe,D603G,0.458,,7e-06,chr1:198728427:A>G,,,,,,,,
7,PTPN14,False,germline,False,True,LoF,miss_inframe,R668C,0.462,,0.002317,chr1:214383853:G>A,,,,,,,,
8,ARHGEF10,False,germline,False,True,LoF,miss_inframe,D653N,0.477,,1.4e-05,chr8:1905634:G>A,,,,,,,,
10,KEAP1,False,germline,False,True,LoF,miss_inframe,V440M,0.559,,4.9e-05,chr19:10491584:C>T,,,,,,,,
11,ZFHX3,False,germline,False,True,LoF,miss_inframe,EEEEED483-488D,0.512,,0.000777,chr16:72958683:TCTTCCTCCTCCTCT>-,,,,,,,,
12,LRP1B,False,germline,False,True,LoF,miss_inframe,P4480S,0.467,,0.000684,chr2:140238274:G>A,,,,,,,,
14,HSPG2,False,germline,False,True,ambiguous,miss_inframe,T3898I,0.564,,0.0,chr1:21830070:G>A,,,,,,,,


In [172]:
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
df = t2_df[cols]
df[(df['origin']=='germline')&(df['germline']==True)|(df['origin']=='somatic')&((df['intogen']==True))]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
0,PTCH1,False,germline,True,True,LoF,miss_inframe,P725S,0.508,,0.00097,chr9:95468828:G>A,,,,,,,,
1,RUNX1,True,germline,True,True,ambiguous,miss_inframe,G217E,0.495,,0.0,chr21:34834565:C>T,,,,,,,,
2,SERPINA1,False,germline,True,False,,truncating,E347X,0.153,,0.000253,chr14:94379488:CT>-,,,,,,,,
3,RTEL1,False,germline,True,False,,miss_inframe,A112T,0.59,,0.000189,chr20:63661882:G>A,,,,,,,,
4,RTEL1,False,germline,True,False,,miss_inframe,Q1014E,0.504,,0.0,chr20:63694419:C>G,,,,,,,,
5,NTRK1,True,germline,True,True,Act,miss_inframe,R780Q,0.563,,0.004262,chr1:156881590:G>A,,,,,,,,
692,MYC,False,somatic,False,True,Act,,,,,,,amp,2.2688,chr8:q24.21,MYC-ins,True,ins,chr8,0
693,MYC,False,somatic,False,True,Act,,,,,,,amp,2.2688,chr8:q24.21,-,True,other,chr8/-,-
694,DDX6,False,somatic,False,True,,,,,,,,del,0.944,chr11:q23.3,,,,,
695,CAMTA1,False,somatic,False,True,LoF,,,,,,,,,,CAMTA1-ins,True,ins,chr1,0


In [173]:
cols = ['SYMBOL', 'germline', 'intogen', 'role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut','Consequence'
       ]
g_df[cols][g_df['SYMBOL'].str.contains('PHOX')] #PHOX2B is a predisposing cancer gene for NB

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
31113,PHOX2A,False,False,,,-,0.492,0.000419,chr11:72247580:T>C,upstream_gene_variant
111667,PHOX2B,True,True,LoF,,-,0.435,0.00014,chr4:41740027:G>A,downstream_gene_variant
111668,PHOX2B,True,True,LoF,,-,0.471,0.006526,chr4:41740208:T>C,downstream_gene_variant
111669,PHOX2B,True,True,LoF,,-,0.511,0.009751,chr4:41740919:C>T,downstream_gene_variant
111670,PHOX2B-AS1,False,False,,,-,0.493,0.006508,chr4:41749793:C>T,"intron_variant,non_coding_transcript_variant"
111671,PHOX2B-AS1,False,False,,,-,0.527,0.006539,chr4:41764266:T>C,"intron_variant,non_coding_transcript_variant"
111672,PHOX2B-AS1,False,False,,,-,0.5,0.006337,chr4:41788119:C>G,"intron_variant,non_coding_transcript_variant"
111673,PHOX2B-AS1,False,False,,,-,0.467,0.0,chr4:41820449:C>G,"intron_variant,non_coding_transcript_variant"
111674,PHOX2B-AS1,False,False,,,-,0.5,0.004297,chr4:41821748:A>G,"intron_variant,non_coding_transcript_variant"


In [179]:
common_df = pd.merge(t1_df,t2_df,how='inner',on=['SYMBOL','mut','gnomADg_AF','germline','germline_mskcc','germline_akh','intogen','role','origin'],suffixes=['_t1','_t2'])
common_df[['SYMBOL', 'germ_som_t1','germline', 'intogen', 'role','mut',
        'cytoband_t1','cytoband_t2', 'mut_sv_t1','mut_sv_t2',
               'fusion_t1','fusion_t2','cgc_transl_t1','cgc_transl_t2',
               'sv_type_t1','sv_type_t2', 'chr/chr_t1','chr/chr_t2','distance_t1','distance_t2']][common_df['origin']=='somatic']

Unnamed: 0,SYMBOL,germ_som_t1,germline,intogen,role,mut,cytoband_t1,cytoband_t2,mut_sv_t1,mut_sv_t2,fusion_t1,fusion_t2,cgc_transl_t1,cgc_transl_t2,sv_type_t1,sv_type_t2,chr/chr_t1,chr/chr_t2,distance_t1,distance_t2
691,NTRK1,True,False,True,Act,,chr1:q23.1,,,chr1:156879529:C>CGCTGGTGGTGATGGCGCTGTGGTACTTC...,,NTRK1-ins,,True,,ins,,chr1,,0


In [109]:
common_df = pd.merge(t1_snv_df,t2_snv_df,how='inner',on=['#CHROM','POS','REF','ALT'],suffixes=['_t1','_t2'])
df = common_df[['SYMBOL_t1','Consequence_t1','gnomADg_AF_t1','t_AF_t1','clonal_t1','t_AF_t2','clonal_t2']]
df.sort_values(by='SYMBOL_t1',ascending=False)

Unnamed: 0,SYMBOL_t1,Consequence_t1,gnomADg_AF_t1,t_AF_t1,clonal_t1,t_AF_t2,clonal_t2
1,FRMPD4,intron_variant,0.19964,0.379,True,0.347,False
0,-,intergenic_variant,0.165172,0.592,True,0.755,True


## Double hits germline and somatic

### Patient 1

In [96]:
pt1_t1_df[cols][pt1_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germline,intogen,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut,Consequence
13,CASZ1,False,True,LoF,miss_inframe,A211S,0.546,0.0,chr1:10660411:C>A,missense_variant
20,HSPG2,False,True,ambiguous,miss_inframe,N4030S,0.527,0.0,chr1:21828983:T>C,missense_variant
21,HSPG2,False,True,ambiguous,miss_inframe,P1019L,0.543,0.006146,chr1:21875990:G>A,missense_variant
30,MUC4,False,False,Act,miss_inframe,VPVTSTSSASTGDTTP4033-4048-,1.0,0.000268,chr3:195779436:AGGGGTGGTGTCACCTGTGGATGCTGAGGAA...,inframe_deletion
31,MUC4,False,False,Act,miss_inframe,S2858P,0.207,0.00408,chr3:195783008:A>G,missense_variant
32,MUC4,False,False,Act,miss_inframe,H2845D,0.133,0.000782,chr3:195783047:G>C,missense_variant
33,MUC4,False,False,Act,miss_inframe,ASTGDTTPLPVTDASSV1545-1561V,0.238,0.00647,chr3:195786899:CTGAGGAAGCGTCGGTGACAGGAAGAGGGGT...,inframe_deletion
34,MUC4,False,False,Act,miss_inframe,G1148A,0.429,0.003984,chr3:195788137:C>G,missense_variant
593,MUC4,False,False,Act,miss_inframe,F966L,0.0,2.3e-05,chr3:195788682:G>T,missense_variant
613,CASZ1,False,True,LoF,,,,,,


In [30]:
pt1_t2_df[cols][pt1_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
203,TDRD6,True,germline,False,False,,miss_inframe,Q1952E,0.727,,0.000244,chr6:46693982:C>G,,,,,,,,
214,PRUNE2,True,germline,False,True,,miss_inframe,S1281F,0.567,,0.009763,chr9:76708432:G>A,,,,,,,,
589,TDRD6,True,somatic,False,False,,truncating,E908*,0.0,0.306,0.0,chr6:46690850:G>T,-,1.9846,chr6:p12.3,,,,,
602,PRUNE2,True,somatic,False,True,,miss_inframe,P1894R,0.0,0.307,0.0,chr9:76706593:G>C,-,2.0193,chr9:q21.2,,,,,


### Patient 2

In [31]:
pt2_t1_df[cols][pt2_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
17,KMT2C,True,germline,False,True,LoF,miss_inframe,G315S,0.173,,0.000467,chr7:152273774:C>T,,,,,,,,
25,EML4,True,germline,False,False,ambiguous,miss_inframe,A403T,0.391,,0.000901,chr2:42288311:G>A,,,,,,,,
28,FAM186A,True,germline,False,True,ambiguous,miss_inframe,E1999G,0.519,,0.0,chr12:50350836:T>C,,,,,,,,
34,MDM4,True,germline,False,False,Act,miss_inframe,K374Q,0.529,,0.003259,chr1:204549329:A>C,,,,,,,,
263,EFHD1,True,germline,False,False,,miss_inframe,S176A,0.69,,0.000461,chr2:232672384:T>G,,,,,,,,
327,PRSS3,True,germline,False,False,,miss_inframe,T124S,0.189,,0.000709,chr9:33796801:A>T,,,,,,,,
394,OTUD7A,True,germline,False,False,,miss_inframe,G741S,0.567,,0.001692,chr15:31483854:C>T,,,,,,,,
778,PRSS3,True,germline,False,False,,other,-,0.194,,0.000122,chr9:33796805:AAGTG>-,,,,,,,,
865,EFHD1,True,somatic,False,False,,miss_inframe,N79S,0.0,0.3,2.8e-05,chr2:232633940:A>G,amp,2.9665,chr2:q37.1,,,,,
866,PRSS3,True,somatic,False,False,,miss_inframe,R152H,0.0,0.441,5.6e-05,chr9:33797912:G>A,-,2.0052,chr9:p13.3,,,,,


In [32]:
pt2_t1_df[pt2_t1_df['SYMBOL']=='ATM']

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
0,ATM,True,True,True,True,LoF,miss_inframe,missense_variant,V182L,chr11:108244000:G>C,...,,,,,,,,,,False
1,ATM,True,True,True,True,LoF,miss_inframe,missense_variant,R2854C,chr11:108345884:C>T,...,,,,,,,,,,False


In [33]:
pt2_t1_df[pt2_t1_df['SYMBOL']=='KMT2C']

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
17,KMT2C,False,False,False,True,LoF,miss_inframe,missense_variant,G315S,chr7:152273774:C>T,...,,,,,,,,,,True
917,KMT2C,False,False,,True,LoF,,,,,...,0.0076,chr7:q36.1,,,,,,,,True


In [34]:
pt2_t2_df[cols][pt2_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
17,KMT2C,True,germline,False,True,LoF,miss_inframe,G315S,0.173,,0.000467,chr7:152273774:C>T,,,,,,,,
271,JARID2,True,germline,False,False,,miss_inframe,R492C,0.387,,0.009843,chr6:15496699:C>T,,,,,,,,
570,FASN,True,germline,False,False,,miss_inframe,S279L,0.615,,4.2e-05,chr17:82092755:G>A,,,,,,,,
866,FASN,True,somatic,False,False,,miss_inframe,G2270R,0.0,0.307,0.0,chr17:82080710:C>G,-,2.0185,chr17:q25.3,,,,,
878,KMT2C,True,somatic,False,True,LoF,,,,,,,del,1.0112,chr7:q36.1,,,,,
891,JARID2,True,somatic,False,False,,,,,,,,,,,JARID2/EDEM3,False,fusion,chr6/chr1,-


### Patient 3

In [35]:
pt3_t1_df[cols][pt3_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance


In [36]:
pt3_t2_df[cols][pt3_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance


### Patient 4

In [37]:
pt4_t1_df[cols][pt4_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
20,PRDM1,True,germline,False,True,LoF,miss_inframe,S367F,0.483,,0.000216,chr6:106105260:C>T,,,,,,,,
52,MUC4,True,germline,False,False,Act,miss_inframe,IPSSSSSGHTTPLPVTS3701-3717S,0.25,,9.9e-05,chr3:195780431:TGGTGACAGGAAGAGGGGTGGTGTGACCTGA...,,,,,,,,
53,MUC4,True,germline,False,False,Act,miss_inframe,A3113S,0.72,,0.000134,chr3:195782243:C>A,,,,,,,,
54,MUC4,True,germline,False,False,Act,miss_inframe,H2845D,0.353,,0.000782,chr3:195783047:G>C,,,,,,,,
221,PHF3,True,germline,False,False,,miss_inframe,A854T,0.5,,0.00308,chr6:63694644:G>A,,,,,,,,
1121,ANKRD30B,True,germline,False,False,,other,-,0.714,,0.0,chr18:14822685:T>G,,,,,,,,
1177,PRDM1,True,somatic,False,True,LoF,,,,,,,del,1.0172,chr6:q21,,,,,
1191,MUC4,True,somatic,False,False,Act,,,,,,,,,,MUC4-del,False,del,chr3,77
1196,ANKRD30B,True,somatic,False,False,,,,,,,,,,,ANKRD30B/LDLRAD4,False,inv,chr18,-1398409
1198,PHF3,True,somatic,False,False,,,,,,,,,,,-,False,other,chr6/-,-


In [38]:
pt4_t2_df[cols][pt4_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
8,ETV6,True,germline,True,True,Act,miss_inframe,V166M,0.536,,0.000398,chr12:11869456:G>A,,,,,,,,
47,IKBKB,True,germline,False,True,Act,miss_inframe,R526Q,0.571,,0.003193,chr8:42319645:G>A,,,,,,,,
50,UGT2B17,True,germline,False,True,Act,miss_inframe,V181I,0.469,,0.008175,chr4:68567944:C>T,,,,,,,,
51,SGK1,True,germline,False,True,Act,miss_inframe,M32V,0.438,,0.008989,chr6:134262124:T>C,,,,,,,,
52,MUC4,True,germline,False,False,Act,miss_inframe,IPSSSSSGHTTPLPVTS3701-3717S,0.25,,9.9e-05,chr3:195780431:TGGTGACAGGAAGAGGGGTGGTGTGACCTGA...,,,,,,,,
53,MUC4,True,germline,False,False,Act,miss_inframe,A3113S,0.72,,0.000134,chr3:195782243:C>A,,,,,,,,
54,MUC4,True,germline,False,False,Act,miss_inframe,H2845D,0.353,,0.000782,chr3:195783047:G>C,,,,,,,,
61,MUC16,True,germline,False,False,Act,miss_inframe,M2688I,0.294,,0.005682,chr19:8973075:C>T,,,,,,,,
64,BCLAF1,True,germline,False,True,Act,other,-,0.533,,0.000566,chr6:136261119:AAAG>-,,,,,,,,
759,PLXNB2,True,germline,False,False,,miss_inframe,P226L,0.545,,0.000649,chr22:50289908:G>A,,,,,,,,


### Patient 5

In [39]:
pt5_t1_df[cols][pt5_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
30,NCOA2,True,germline,False,True,Act,miss_inframe,V1132I,0.484,,0.002778,chr8:70128911:C>T,,,,,,,,
41,USP6,True,germline,False,True,Act,miss_inframe,T1199I,0.605,,0.003441,chr17:5170557:C>T,,,,,,,,
46,UBR5,True,germline,False,True,Act,other,-,0.688,,0.007017,chr8:102275623:A>-,,,,,,,,
335,DYSF,True,germline,False,False,,miss_inframe,A202E,0.536,,0.009847,chr2:71513767:C>A,,,,,,,,
570,NLRP2,True,germline,False,False,,miss_inframe,A561T,0.395,,0.006777,chr19:54983379:G>A,,,,,,,,
571,NLRP2,True,germline,False,False,,miss_inframe,E1032V,0.667,,0.009484,chr19:55000804:A>T,,,,,,,,
861,NCOA2,True,somatic,False,True,Act,,,,,,,amp,4.0026,chr8:q13.3,,,,,
862,UBR5,True,somatic,False,True,Act,,,,,,,amp,4.0026,chr8:q22.3,,,,,
866,USP6,True,somatic,False,True,Act,,,,,,,amp,2.7743,chr17:p13.2,,,,,
952,NLRP2,True,somatic,False,False,,,,,,,,,,,NLRP2/NLRP7,False,inv,chr19,37


In [40]:
pt5_t2_df[cols][pt5_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
20,NOTCH1,True,germline,False,True,ambiguous,miss_inframe,A1343V,0.552,,0.001675,chr9:136505868:G>A,,,,,,,,
26,MCM3AP,True,germline,False,False,ambiguous,other,A1680,0.438,,0.0,chr21:46243721:G>C,,,,,,,,
30,NCOA2,True,germline,False,True,Act,miss_inframe,V1132I,0.484,,0.002778,chr8:70128911:C>T,,,,,,,,
35,ABL1,True,germline,False,True,Act,miss_inframe,Y339C,0.625,,1.4e-05,chr9:130872911:A>G,,,,,,,,
42,MYH11,True,germline,False,True,Act,miss_inframe,E1899D,0.519,,0.005046,chr16:15715019:C>G,,,,,,,,
43,CLTCL1,True,germline,False,True,Act,miss_inframe,E1172A,0.611,,0.0,chr22:19208239:T>G,,,,,,,,
46,UBR5,True,germline,False,True,Act,other,-,0.688,,0.007017,chr8:102275623:A>-,,,,,,,,
396,RGS5,True,germline,False,False,,miss_inframe,R176H,0.5,,0.000454,chr1:163147373:C>T,,,,,,,,
570,NLRP2,True,germline,False,False,,miss_inframe,A561T,0.395,,0.006777,chr19:54983379:G>A,,,,,,,,
571,NLRP2,True,germline,False,False,,miss_inframe,E1032V,0.667,,0.009484,chr19:55000804:A>T,,,,,,,,


### Patient 6

In [41]:
pt6_t1_df[cols][pt6_t1_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
6,EP300,True,germline,True,True,LoF,other,-,0.464,,0.007025,chr22:41127755:G>C,,,,,,,,
17,ZBTB16,True,germline,False,True,ambiguous,miss_inframe,T189A,0.241,,0.000342,chr11:114063865:A>G,,,,,,,,
19,MACC1,True,germline,False,False,Act,miss_inframe,V616L,0.364,,0.00014,chr7:20158515:C>A,,,,,,,,
26,RNF213,True,germline,False,True,Act,miss_inframe,L806W,0.571,,6.3e-05,chr17:80306458:T>G,,,,,,,,
28,KDM5A,True,germline,False,True,Act,other,-,0.667,,0.008245,chr12:323211:AAAAAAAAAAAAAAAA>-,,,,,,,,
70,C7orf50,True,germline,False,False,,miss_inframe,T124M,0.29,,5.6e-05,chr7:1000504:G>A,,,,,,,,
119,ROS1,True,germline,False,True,,miss_inframe,T1865S,0.469,,2.1e-05,chr6:117324379:G>C,,,,,,,,
120,ROS1,True,germline,False,True,,miss_inframe,G1027D,0.6,,0.000328,chr6:117365098:C>T,,,,,,,,
203,FRYL,True,germline,False,False,,miss_inframe,V707I,0.52,,0.009858,chr4:48581473:C>T,,,,,,,,
349,CACNA1A,True,germline,False,False,,miss_inframe,GSG2428-2430G,0.3,,0.000358,chr19:13207563:CCGCTG>-,,,,,,,,


In [42]:
pt6_t2_df[cols][pt6_t2_df['germ_som']==True]

Unnamed: 0,SYMBOL,germ_som,origin,germline,intogen,role,variant_type,aa_change,n_AF_real,t_AF,gnomADg_AF,mut,CNA,CN,cytoband,fusion,cgc_transl,sv_type,chr/chr,distance
307,FSIP2,True,germline,False,False,,miss_inframe,L6554R,0.733,,0.000119,chr2:185808967:T>G,,,,,,,,
512,ITIH6,True,germline,False,False,,miss_inframe,T754M,0.452,,0.004525,chrX:54757813:G>A,,,,,,,,
595,ITIH6,True,somatic,False,False,,truncating,P975X,0.0,0.312,0.0,chrX:54757150:G>-,-,1.9817,chrX:p11.22,,,,,
602,FSIP2,True,somatic,False,False,,miss_inframe,F2895L,0.0,0.267,0.0,chr2:185795821:T>G,-,2.0057,chr2:q32.1,,,,,


## Check common somatic mutations

### Snvs

In [43]:
df1 = pt1_t1_snv_df[(pt1_t1_snv_df['Damaging']==True)&(pt1_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt1_t2_snv_df[(pt1_t2_snv_df['Damaging']==True)&(pt1_t2_snv_df['gnomADg_AF']<0.01)]
pt1_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Damaging','mut','gnomADg_AF','Consequence'],how='inner')
pt1_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt1_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF


In [44]:
df1 = pt2_t1_snv_df[(pt2_t1_snv_df['Damaging']==True)&(pt2_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt2_t2_snv_df[(pt2_t2_snv_df['Damaging']==True)&(pt2_t2_snv_df['gnomADg_AF']<0.01)]
pt2_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Consequence','Damaging','mut','gnomADg_AF'],how='inner')
pt2_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt2_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF
0,GOLGA8S,False,False,-,"splice_region_variant,intron_variant",chr15:23363671:CCC>-,0.000864
1,CR1,True,False,A1345T,missense_variant,chr1:207567904:G>A,0.006198


In [45]:
df1 = pt3_t1_snv_df[(pt3_t1_snv_df['Damaging']==True)&(pt3_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt3_t2_snv_df[(pt3_t2_snv_df['Damaging']==True)&(pt3_t2_snv_df['gnomADg_AF']<0.01)]
pt3_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Consequence','Damaging','mut','gnomADg_AF'],how='inner')
pt3_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt3_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF
0,H3C2,False,False,K28M,missense_variant,chr6:26031978:T>A,0.0


In [46]:
df1 = pt4_t1_snv_df[(pt4_t1_snv_df['Damaging']==True)&(pt4_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt4_t2_snv_df[(pt4_t2_snv_df['Damaging']==True)&(pt4_t2_snv_df['gnomADg_AF']<0.01)]
pt4_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Consequence','Damaging','mut','gnomADg_AF'],how='inner')
pt4_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt4_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF
0,MUC12,False,False,P1956T,missense_variant,chr7:100996429:C>A,2.8e-05


In [47]:
df1 = pt5_t1_snv_df[(pt5_t1_snv_df['Damaging']==True)&(pt5_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt5_t2_snv_df[(pt5_t2_snv_df['Damaging']==True)&(pt5_t2_snv_df['gnomADg_AF']<0.01)]
pt5_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Consequence','Damaging','mut','gnomADg_AF'],how='inner')
pt5_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt5_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF


In [49]:
df1 = pt6_t1_snv_df[(pt6_t1_snv_df['Damaging']==True)&(pt6_t1_snv_df['gnomADg_AF']<0.01)]
df2 = pt6_t2_snv_df[(pt6_t2_snv_df['Damaging']==True)&(pt6_t2_snv_df['gnomADg_AF']<0.01)]
pt6_snv_df = pd.merge(df1,df2,on=['SYMBOL','intogen','germline','aa_change','Consequence','Damaging','mut','gnomADg_AF'],how='inner')
pt6_snv_df[['SYMBOL','intogen','germline','aa_change','Consequence','mut','gnomADg_AF']][~pt6_snv_df['SYMBOL'].str.contains('HLA')]

Unnamed: 0,SYMBOL,intogen,germline,aa_change,Consequence,mut,gnomADg_AF
0,CYP4F2,False,False,A483G,missense_variant,chr19:15878886:G>C,0.005948
1,DND1,False,False,E334K,missense_variant,chr5:140671355:C>T,0.000136


### CNV

In [50]:
df1 = pt1_t1_cnv_df[pt1_t1_cnv_df['CNA']!='-']
df2 = pt1_t2_cnv_df[pt1_t2_cnv_df['CNA']!='-']
pt1_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt1_cnv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,cytoband,germline_mskcc_y,CN_y,CN_min_allele_y


In [51]:
df1 = pt2_t1_cnv_df[pt2_t1_cnv_df['CNA']!='-']
df2 = pt2_t2_cnv_df[pt2_t2_cnv_df['CNA']!='-']
pt2_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt2_cnv_df.groupby('cytoband').count()

Unnamed: 0_level_0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,germline_mskcc_y,CN_y,CN_min_allele_y
cytoband,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
chr7:q31.32,14,14,14,14,0,14,14,14,14,14,14
chr7:q31.33,12,12,12,12,1,12,12,12,12,12,12
chr7:q32.1,33,33,33,33,1,33,33,33,33,33,33
chr7:q32.2,19,19,19,19,0,19,19,19,19,19,19
chr7:q32.3,10,10,10,10,0,10,10,10,10,10,10
chr7:q32.3-q33,1,1,1,1,0,1,1,1,1,1,1
chr7:q33,31,31,31,31,0,31,31,31,31,31,31
chr7:q33-q34,1,1,1,1,1,1,1,1,1,1,1
chr7:q34,64,64,64,64,2,64,64,64,64,64,64
chr7:q34-q35,1,1,1,1,0,1,1,1,1,1,1


In [52]:
pt2_cnv_df[pt2_cnv_df['SYMBOL']=='KMT2C']

Unnamed: 0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,cytoband,germline_mskcc_y,CN_y,CN_min_allele_y
286,KMT2C,False,False,True,LoF,del,1.0035,0.0076,chr7:q36.1,False,1.0112,0.0112


In [53]:
df1 = pt3_t1_cnv_df[pt3_t1_cnv_df['CNA']!='-']
df2 = pt3_t2_cnv_df[pt3_t2_cnv_df['CNA']!='-']
pt3_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt3_cnv_df[~(pt3_cnv_df['cytoband'].str.contains('chrX')|pt3_cnv_df['cytoband'].str.contains('chrY'))].groupby('cytoband').count()

Unnamed: 0_level_0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,germline_mskcc_y,CN_y,CN_min_allele_y
cytoband,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [54]:
df1 = pt4_t1_cnv_df[pt4_t1_cnv_df['CNA']!='-']
df2 = pt4_t2_cnv_df[pt4_t2_cnv_df['CNA']!='-']
pt4_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt4_cnv_df[~(pt4_cnv_df['cytoband'].str.contains('chrX')|pt4_cnv_df['cytoband'].str.contains('chrY'))].groupby('cytoband').count()

Unnamed: 0_level_0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,germline_mskcc_y,CN_y,CN_min_allele_y
cytoband,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [55]:
df1 = pt5_t1_cnv_df[pt5_t1_cnv_df['CNA']!='-']
df2 = pt5_t2_cnv_df[pt5_t2_cnv_df['CNA']!='-']
pt5_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt5_cnv_df[~(pt5_cnv_df['cytoband'].str.contains('chrX')|pt5_cnv_df['cytoband'].str.contains('chrY'))].groupby('cytoband').count()

Unnamed: 0_level_0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,germline_mskcc_y,CN_y,CN_min_allele_y
cytoband,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
chr8:p11.1,1,1,1,1,0,1,1,1,1,1,1
chr8:p11.1-p11.21,1,1,1,1,0,1,1,1,1,1,1
chr8:p11.21,33,33,33,33,2,33,33,33,33,33,33
chr8:p11.22,11,11,11,11,0,11,11,11,11,11,11
chr8:p11.23,20,20,20,20,2,20,20,20,20,20,20
chr8:p11.23-p12,1,1,1,1,0,1,1,1,1,1,1
chr8:p12,29,29,29,29,3,29,29,29,29,29,29
chr8:p12-p21.1,1,1,1,1,0,1,1,1,1,1,1
chr8:p21.1,20,20,20,20,0,20,20,20,20,20,20
chr8:p21.1-p21.2,1,1,1,1,0,1,1,1,1,1,1


In [56]:
df1 = pt6_t1_cnv_df[pt6_t1_cnv_df['CNA']!='-']
df2 = pt6_t2_cnv_df[pt6_t2_cnv_df['CNA']!='-']
pt6_cnv_df = pd.merge(df1,df2,how='inner',on=['SYMBOL','germline','intogen','role','CNA','cytoband'])
pt6_cnv_df[~(pt6_cnv_df['cytoband'].str.contains('chrX')|pt6_cnv_df['cytoband'].str.contains('chrY'))].groupby('cytoband').count()

Unnamed: 0_level_0,SYMBOL,germline,germline_mskcc_x,intogen,role,CNA,CN_x,CN_min_allele_x,germline_mskcc_y,CN_y,CN_min_allele_y
cytoband,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


### SV

In [57]:
pt1_t1_sv_df.columns

Index(['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role', 'mut',
       'fusion', 'cgc_transl', 'chr/chr', 'sv_type', 'distance',
       'distance_rel'],
      dtype='object')

In [58]:
pt1_sv_df = pd.merge(pt1_t1_sv_df[~pt1_t1_sv_df['SYMBOL'].isnull()],pt1_t2_sv_df[~pt1_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt1_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2
0,TYW1,False,False,False,,chr7:67184629:T>T[chr7:67184675[,TYW1-del,False,chr7,del,46,0.0,chr7:67184629:T>T[chr7:67184670[,TYW1-del,chr7,del,41,0.0


In [59]:
pt2_sv_df = pd.merge(pt2_t1_sv_df[~pt2_t1_sv_df['SYMBOL'].isnull()],pt2_t2_sv_df[~pt2_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt2_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2
0,RAB11FIP4,False,False,False,,chr17:31450810:T>TGTTCCCTGTGGCGCACCCTGTGGCGCAC...,RAB11FIP4-del,False,chr17,del,9,0.0,chr17:31450810:T>TGTTCCCTGTGGCGCACCCTGTGGCGCAC...,RAB11FIP4-del,chr17,del,9,0.0


In [60]:
pt3_sv_df = pd.merge(pt3_t1_sv_df[~pt3_t1_sv_df['SYMBOL'].isnull()],pt3_t2_sv_df[~pt3_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt3_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2


In [61]:
pt4_sv_df = pd.merge(pt4_t1_sv_df[~pt4_t1_sv_df['SYMBOL'].isnull()],pt4_t2_sv_df[~pt4_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt4_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2
0,MUC4,False,False,False,Act,chr3:195762714:G>]chr3:195762791]ACCCGGCCCTG,MUC4-del,False,chr3,del,77,0.0,chr3:195762714:G>]chr3:195762791]ACCCGGCCCTG,MUC4-del,chr3,del,77,0.0


In [62]:
pt5_sv_df = pd.merge(pt5_t1_sv_df[~pt5_t1_sv_df['SYMBOL'].isnull()],pt5_t2_sv_df[~pt5_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt5_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2
0,DMGDH,False,False,False,,chr5:79136989:T>TGAATATATATGAATATATATGAATATATA...,DMGDH-del,False,chr5,del,1,0.0,chr5:79136989:T>TGAATATATATGAATATATATGAATATATA...,DMGDH-del,chr5,del,1,0.0
1,MGAM,False,False,False,,,-,False,chr7/-,other,-,-,,-,chr7/-,other,-,-
2,RIMS2,False,False,False,,,-,False,chr8/-,other,-,-,,-,chr8/-,other,-,-
3,RIMS2,False,False,False,,,-,False,chr8/-,other,-,-,chr8:104101005:T>TCATATTATATATCAATTTATTTTGTGTT...,RIMS2-ins,chr8,ins,0,0.0
4,RIMS2,False,False,False,,chr8:104101005:T>TCATATTATATATCAATTTATTTTGTGTT...,RIMS2-ins,False,chr8,ins,0,0.0,,-,chr8/-,other,-,-
5,RIMS2,False,False,False,,chr8:104101005:T>TCATATTATATATCAATTTATTTTGTGTT...,RIMS2-ins,False,chr8,ins,0,0.0,chr8:104101005:T>TCATATTATATATCAATTTATTTTGTGTT...,RIMS2-ins,chr8,ins,0,0.0
6,KPNB1,False,False,False,,,-,False,chr17/-,other,-,-,,-,chr17/-,other,-,-
7,NLRP2,False,False,False,,chr19:54953537:T>T[chr19:54953574[,NLRP2/NLRP7,False,chr19,inv,37,0.0,chr19:54953537:T>T[chr19:54953574[,NLRP2/NLRP7,chr19,inv,37,0.0
8,NLRP2,False,False,False,,chr19:54953537:T>T[chr19:54953574[,NLRP2/NLRP7,False,chr19,inv,37,0.0,chr19:54953574:G>]chr19:54953537]G,NLRP2/NLRP7,chr19,inv,-37,-0.0
9,NLRP2,False,False,False,,chr19:54953574:G>]chr19:54953537]G,NLRP2/NLRP7,False,chr19,inv,-37,-0.0,chr19:54953537:T>T[chr19:54953574[,NLRP2/NLRP7,chr19,inv,37,0.0


In [63]:
pt6_sv_df = pd.merge(pt6_t1_sv_df[~pt6_t1_sv_df['SYMBOL'].isnull()],pt6_t2_sv_df[~pt6_t2_sv_df['SYMBOL'].isnull()],how='inner',on=['SYMBOL', 'germline', 'germline_mskcc', 'intogen', 'role','cgc_transl'],suffixes=['_t1','_t2'])
pt6_sv_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,intogen,role,mut_t1,fusion_t1,cgc_transl,chr/chr_t1,sv_type_t1,distance_t1,distance_rel_t1,mut_t2,fusion_t2,chr/chr_t2,sv_type_t2,distance_t2,distance_rel_t2
0,IL5RA,False,False,False,,chr3:3116219:T>T]chrX:30540039],IL5RA/-,False,chr3/chrX,other,-,-,chr3:3116219:T>T]chrX:30540039],IL5RA/-,chr3/chrX,other,-,-
1,DMD,False,False,False,,chrX:33009198:A>]chrX:33009231]A,DMD-del,False,chrX,del,33,0.0,chrX:33009198:A>]chrX:33009231]A,DMD-del,chrX,del,33,0.0


## Data for cgi website

In [41]:
pt1_t1_muts_df = table_muts(pt1_t1_df[pt1_t1_df['origin']=='somatic'])
pt1_t2_muts_df = table_muts(pt1_t2_df[pt1_t2_df['origin']=='somatic'])
pt2_t1_muts_df = table_muts(pt2_t1_df[pt2_t1_df['origin']=='somatic'])
pt2_t2_muts_df = table_muts(pt2_t2_df[pt2_t2_df['origin']=='somatic'])
pt3_t1_muts_df = table_muts(pt3_t1_df[pt3_t1_df['origin']=='somatic'])
pt3_t2_muts_df = table_muts(pt3_t2_df[pt3_t2_df['origin']=='somatic'])
pt4_t1_muts_df = table_muts(pt4_t1_df[pt4_t1_df['origin']=='somatic'])
pt4_t2_muts_df = table_muts(pt4_t2_df[pt4_t2_df['origin']=='somatic'])
pt5_t1_muts_df = table_muts(pt5_t1_df[pt5_t1_df['origin']=='somatic'])
pt5_t2_muts_df = table_muts(pt5_t2_df[pt5_t2_df['origin']=='somatic'])
pt6_t1_muts_df = table_muts(pt6_t1_df[pt6_t1_df['origin']=='somatic'])
pt6_t2_muts_df = table_muts(pt6_t2_df[pt6_t2_df['origin']=='somatic'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [42]:
muts_df = pd.concat([pt1_t1_muts_df,pt1_t2_muts_df,pt2_t1_muts_df,pt2_t2_muts_df,
                   pt3_t1_muts_df,pt3_t2_muts_df,pt4_t1_muts_df,pt4_t2_muts_df,
                   pt5_t1_muts_df,pt5_t2_muts_df,pt6_t1_muts_df,pt6_t2_muts_df])
muts_df.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_test_cgi/muts.tsv',sep='\t',index=None)
muts_df

Unnamed: 0,gdna,sample
585,chr2:g.29239684G>C,AQ5180_vs_AQ5174
586,chr3:g.195788682G>T,AQ5180_vs_AQ5174
587,chr9:g.32633113G>A,AQ5180_vs_AQ5174
588,chr11:g.65864771G>A,AQ5180_vs_AQ5174
589,chr15:g.45150658G>A,AQ5180_vs_AQ5174
590,chr17:g.10401775C>T,AQ5180_vs_AQ5174
591,chr2:g.170392474G>C,AQ5180_vs_AQ5174
592,chr4:g.157341453G>T,AQ5180_vs_AQ5174
593,chr5:g.55815329C>A,AQ5180_vs_AQ5174
594,chr7:g.117315154G>A,AQ5180_vs_AQ5174


In [125]:
pt1_t1_cna_df = table_cna(pt1_t1_df)
pt1_t2_cna_df = table_cna(pt1_t2_df)
pt2_t1_cna_df = table_cna(pt2_t1_df)
pt2_t2_cna_df = table_cna(pt2_t2_df)
pt3_t1_cna_df = table_cna(pt3_t1_df)
pt3_t2_cna_df = table_cna(pt3_t2_df)
pt4_t1_cna_df = table_cna(pt4_t1_df)
pt4_t2_cna_df = table_cna(pt4_t2_df)
pt5_t1_cna_df = table_cna(pt5_t1_df)
pt5_t2_cna_df = table_cna(pt5_t2_df)
pt6_t1_cna_df = table_cna(pt6_t1_df)
pt6_t2_cna_df = table_cna(pt6_t2_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [126]:
cna_df = pd.concat([pt1_t1_cna_df,pt1_t2_cna_df,pt2_t1_cna_df,pt2_t2_cna_df,
                   pt3_t1_cna_df,pt3_t2_cna_df,pt4_t1_cna_df,pt4_t2_cna_df,
                   pt5_t1_cna_df,pt5_t2_cna_df,pt6_t1_cna_df,pt6_t2_cna_df])
cna_df.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_test_cgi/cna.tsv',sep='\t',index=None)
cna_df

Unnamed: 0,gene,cna,sample
598,MYCN,amp,AQ5180_vs_AQ5174
600,CLTC,amp,AQ5180_vs_AQ5174
601,PPM1D,amp,AQ5180_vs_AQ5174
602,CD79B,amp,AQ5180_vs_AQ5174
603,SRSF2,amp,AQ5180_vs_AQ5174
...,...,...,...
726,MSI2,amp,AQ5185_vs_AQ5179
727,NEFH,del,AQ5185_vs_AQ5179
728,MSN,del,AQ5185_vs_AQ5179
729,WNK4,amp,AQ5185_vs_AQ5179


In [127]:
pt1_t1_transl_df = table_transl(pt1_t1_df)
pt1_t2_transl_df = table_transl(pt1_t2_df)
pt2_t1_transl_df = table_transl(pt2_t1_df)
pt2_t2_transl_df = table_transl(pt2_t2_df)
pt3_t1_transl_df = table_transl(pt3_t1_df)
pt3_t2_transl_df = table_transl(pt3_t2_df)
pt4_t1_transl_df = table_transl(pt4_t1_df)
pt4_t2_transl_df = table_transl(pt4_t2_df)
pt5_t1_transl_df = table_transl(pt5_t1_df)
pt5_t2_transl_df = table_transl(pt5_t2_df)
pt6_t1_transl_df = table_transl(pt6_t1_df)
pt6_t2_transl_df = table_transl(pt6_t2_df)

In [128]:
transl_df = pd.concat([pt1_t1_transl_df,pt1_t2_transl_df,pt2_t1_transl_df,pt2_t2_transl_df,
                   pt3_t1_transl_df,pt3_t2_transl_df,pt4_t1_transl_df,pt4_t2_transl_df,
                   pt5_t1_transl_df,pt5_t2_transl_df,pt6_t1_transl_df,pt6_t2_transl_df])
transl_df.to_csv('/workspace/projects/sjd_pediatric_tumors/tables_test_cgi/transl.tsv',sep='\t',index=None)
transl_df

Unnamed: 0,fus,sample
627,ALG14__TRIM37,AQ5180_vs_AQ5174
628,TRIM37__ALG14,AQ5180_vs_AQ5174
634,NBAS__FAM49A,AQ5180_vs_AQ5174
635,FAM49A__NBAS,AQ5180_vs_AQ5174
931,PAX3__FOXO1,AQ5181_vs_AQ5175
932,PAX3__FOXO1,AQ5181_vs_AQ5175
934,FOXO1__PAX3,AQ5181_vs_AQ5175
935,FOXO1__PAX3,AQ5181_vs_AQ5175
938,NME7__GTF2H5,AQ5181_vs_AQ5175
939,NMNAT2__KLHL1,AQ5181_vs_AQ5175


In [39]:
print_muts(pt1_t1_df[pt1_t1_df['origin']=='somatic'])

chr2:g.29239684G>C
chr3:g.195788682G>T
chr9:g.32633113G>A
chr11:g.65864771G>A
chr15:g.45150658G>A
chr17:g.10401775C>T
chr2:g.170392474G>C
chr4:g.157341453G>T
chr5:g.55815329C>A
chr7:g.117315154G>A
chr9:g.86016671G>T
chr8:g.143977835A>T
chr8:g.143977838G>C


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [132]:
print_muts(pt1_t1_df[pt1_t1_df['origin']=='somatic'])
print_muts(pt1_t2_df[pt1_t2_df['origin']=='somatic'])
print_muts(pt2_t1_df[pt2_t1_df['origin']=='somatic'])
print_muts(pt2_t2_df[pt2_t2_df['origin']=='somatic'])
print_muts(pt3_t1_df[pt3_t1_df['origin']=='somatic'])
print_muts(pt3_t2_df[pt3_t2_df['origin']=='somatic'])
print_muts(pt4_t1_df[pt4_t1_df['origin']=='somatic'])
print_muts(pt4_t2_df[pt4_t2_df['origin']=='somatic'])
print_muts(pt5_t1_df[pt5_t1_df['origin']=='somatic'])
print_muts(pt5_t2_df[pt5_t2_df['origin']=='somatic'])
print_muts(pt6_t1_df[pt6_t1_df['origin']=='somatic'])
print_muts(pt6_t2_df[pt6_t2_df['origin']=='somatic'])

chr2:g.29239684G>C
chr3:g.195788682G>T
chr9:g.32633113G>A
chr11:g.65864771G>A
chr15:g.45150658G>A
chr17:g.10401775C>T
chr2:g.170392474G>C
chr4:g.157341453G>T
chr5:g.55815329C>A
chr7:g.117315154G>A
chr9:g.86016671G>T
chr8:g.143977835A>T
chr8:g.143977838G>C
chr22:g.23825237GTGGG>-
chr16:g.87417566C>-
chr17:g.11930052ATGCA>-
chr1:g.212345866C>T
chr6:g.46690850G>T
chr10:g.109888587C>G
chr14:g.72993210TC>AA
chr17:g.18889822A>G
chr18:g.70006412G>A
chr19:g.8334767C>T
chr1:g.45620956C>T
chr2:g.151554025G>A
chr2:g.166443527C>G
chr2:g.178072254C>G
chr4:g.37439346C>A
chr5:g.172053015G>T
chr6:g.99412673C>T
chr9:g.76706593G>C
chr10:g.100261974C>G
chr10:g.70142858C>T
chr11:g.10560487CC>AT
chr1:g.67777454T>C
chr14:g.70732719G>A
chr10:g.100479946G>T
chr12:g.122990631C>T
chr18:g.33245423C>T
chr1:g.156137697A>G
chr2:g.185796924A>G
chr2:g.232633940A>G
chr9:g.33797912G>A
chr10:g.94358222->CA
chr4:g.11625811A>C
chrX:g.153830239C>T
chr11:g.32396397->CGTCG
chr12:g.25245350C>G
chr1:g.36469256C>T
chr10:g.94032

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [43]:
[print (m) for m in muts_df['gdna'].tolist()]

chr2:g.29239684G>C
chr3:g.195788682G>T
chr9:g.32633113G>A
chr11:g.65864771G>A
chr15:g.45150658G>A
chr17:g.10401775C>T
chr2:g.170392474G>C
chr4:g.157341453G>T
chr5:g.55815329C>A
chr7:g.117315154G>A
chr9:g.86016671G>T
chr8:g.143977835A>T
chr8:g.143977838G>C
chr22:g.23825237GTGGG>-
chr16:g.87417566C>-
chr17:g.11930052ATGCA>-
chr1:g.212345866C>T
chr6:g.46690850G>T
chr10:g.109888587C>G
chr14:g.72993210TC>AA
chr17:g.18889822A>G
chr18:g.70006412G>A
chr19:g.8334767C>T
chr1:g.45620956C>T
chr2:g.151554025G>A
chr2:g.166443527C>G
chr2:g.178072254C>G
chr4:g.37439346C>A
chr5:g.172053015G>T
chr6:g.99412673C>T
chr9:g.76706593G>C
chr10:g.100261974C>G
chr10:g.70142858C>T
chr11:g.10560487CC>AT
chr1:g.67777454T>C
chr14:g.70732719G>A
chr10:g.100479946G>T
chr12:g.122990631C>T
chr18:g.33245423C>T
chr1:g.156137697A>G
chr2:g.185796924A>G
chr2:g.232633940A>G
chr9:g.33797912G>A
chr10:g.94358222->CA
chr4:g.11625811A>C
chrX:g.153830239C>T
chr11:g.32396397->CGTCG
chr12:g.25245350C>G
chr1:g.36469256C>T
chr10:g.94032

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [133]:
print_muts(pt1_t1_df)
print_muts(pt1_t2_df)
print_muts(pt2_t1_df)
print_muts(pt2_t2_df)
print_muts(pt3_t1_df)
print_muts(pt3_t2_df)
print_muts(pt4_t1_df)
print_muts(pt4_t2_df)
print_muts(pt5_t1_df)
print_muts(pt5_t2_df)
print_muts(pt6_t1_df)
print_muts(pt6_t2_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


chr16:g.68738336C>A
chr14:g.94379488CT>-
chr1:g.155236246G>A
chr3:g.48590356C>A
chr5:g.132589662A>G
chr20:g.63690338AGA>-
chr3:g.12599692G>A
chr16:g.3606524C>T
chr1:g.10660411C>A
chr4:g.48167810A>T
chr4:g.125317150C>A
chr13:g.57633577G>T
chr15:g.89326947C>A
chr19:g.35721680C>T
chr20:g.1922504C>G
chr16:g.50320780T>G
chr1:g.21828983T>C
chr1:g.21875990G>A
chr3:g.121488622C>G
chr7:g.64522133C>A
chr13:g.35070794C>G
chr11:g.92834983A>G
chr11:g.92882868G>A
chr19:g.32999660T>C
chr3:g.195778922CTGTGGATGCTGAGGAAGTGTCGGTGACAGGAAGAGGGGTGGCG>-
chr5:g.151542408C>T
chr5:g.151567229T>C
chr5:g.151568190G>A
chr3:g.195778964CGTGACCTGTGGATGCTGAGGAAGTGTCGGTGACAGGAAGAGGGGTGGTGTCACCTGTGGATGCTGAGGAAGTGCTGGTGACAGGAAGAGGGGTGC>-
chr3:g.195779436AGGGGTGGTGTCACCTGTGGATGCTGAGGAAGTGCTGGTGACAGGAAC>-
chr3:g.195783008A>G
chr3:g.195783047G>C
chr3:g.195786899CTGAGGAAGCGTCGGTGACAGGAAGAGGGGTGGTGTCACCTGTGGATG>-
chr3:g.195787579G>A
chr6:g.136278351C>T
chr11:g.102331119G>A
chr17:g.5170557C>T
chr19:g.8960491C>T
chr20:g.3740246

chr11:g.108244000G>C
chr11:g.108345884C>T
chr7:g.55205613C>T
chr7:g.116769637C>-
chr11:g.119296941C>T
chr5:g.157222945G>A
chr4:g.186618941A>G
chr22:g.41117723G>A
chr14:g.45164444A>G
chr10:g.63207383G>A
chr16:g.58509151C>T
chr4:g.40120945A>G
chr4:g.125320630G>A
chr4:g.125452270A>G
chr1:g.13781565AAG>-
chr1:g.198709705A>G
chr2:g.108782803A>G
chr7:g.152273774C>T
chr16:g.4260178G>A
chr12:g.22673582A>T
chr12:g.121804760A>C
chr12:g.122333029G>C
chrX:g.64192497C>T
chr2:g.140297972A>G
chr1:g.21875990G>A
chr2:g.42288311G>A
chr17:g.42780714G>A
chr19:g.21973527CACATTCTTCACATTTGTAGGGTTTCTCTCCAGCATGAGTTGCCTTAT>-
chr12:g.50350836T>C
chr15:g.28275009C>A
chr11:g.63898288C>G
chr19:g.8888822GCTT>-
chr19:g.8888860CT>-
chr19:g.8902222G>-
chr1:g.204549329A>C
chr6:g.33405063G>C
chr7:g.20158698C>A
chr9:g.131198511G>C
chr8:g.127740499C>A
chr14:g.32154655G>A
chr10:g.7172622G>C
chr11:g.64307271C>T
chr11:g.118902443G>T
chr16:g.10918486G>A
chr22:g.19183443C>G
chr22:g.24183251G>A
chr4:g.73417525T>C
chr4:g.73419646

chr11:g.108244000G>C
chr11:g.108345884C>T
chr7:g.55205613C>T
chr7:g.116769637C>-
chr11:g.119296941C>T
chr5:g.157222945G>A
chr4:g.186618941A>G
chr22:g.41117723G>A
chr14:g.45164444A>G
chr10:g.63207383G>A
chr16:g.58509151C>T
chr4:g.40120945A>G
chr4:g.125320630G>A
chr4:g.125452270A>G
chr1:g.13781565AAG>-
chr1:g.198709705A>G
chr2:g.108782803A>G
chr7:g.152273774C>T
chr16:g.4260178G>A
chr12:g.22673582A>T
chr12:g.121804760A>C
chr12:g.122333029G>C
chrX:g.64192497C>T
chr2:g.140297972A>G
chr1:g.21875990G>A
chr2:g.42288311G>A
chr17:g.42780714G>A
chr19:g.21973527CACATTCTTCACATTTGTAGGGTTTCTCTCCAGCATGAGTTGCCTTAT>-
chr12:g.50350836T>C
chr15:g.28275009C>A
chr11:g.63898288C>G
chr19:g.8888822GCTT>-
chr19:g.8888860CT>-
chr19:g.8902222G>-
chr1:g.204549329A>C
chr6:g.33405063G>C
chr7:g.20158698C>A
chr9:g.131198511G>C
chr8:g.127740499C>A
chr14:g.32154655G>A
chr10:g.7172622G>C
chr11:g.64307271C>T
chr11:g.118902443G>T
chr16:g.10918486G>A
chr22:g.19183443C>G
chr22:g.24183251G>A
chr4:g.73417525T>C
chr4:g.73419646

chr14:g.95099832T>C
chr8:g.31154721C>G
chr14:g.94379488CT>-
chr7:g.117590400G>C
chr7:g.117592169C>T
chr3:g.77565064T>C
chr6:g.137879151G>A
chr6:g.149684514G>A
chr13:g.26214257G>T
chr12:g.49050626C>G
chr15:g.41749552A>G
chr1:g.10644911C>T
chr10:g.68572598C>G
chr15:g.28229540G>C
chr2:g.106423913C>T
chr1:g.36481538C>G
chr5:g.180603326G>A
chr8:g.70128796A>G
chr3:g.195779022GGATGCTGAGGAAGTGCTGGTGACAGGAAGAGGGGTGCCGTGACCTGTGGACACTGAGGAAGCGTCGGTGACAGGAAGAGAGGTGGTGTGACCTGA>-
chr1:g.58782892A>G
chr11:g.65658293C>T
chr17:g.40408549C>T
chr22:g.24187801G>A
chr19:g.8939543C>T
chr5:g.146515831G>A
chr8:g.10538480A>-
chr8:g.19142731T>A
chr8:g.30843260CT>-
chr8:g.99012868C>A
chr8:g.144240576C>T
chr3:g.167315754G>A
chr7:g.64249142GG>-
chr6:g.170318477C>T
chr4:g.5973658C>T
chr4:g.163472760AT>-
chr1:g.89263782TC>-
chr1:g.223112947G>A
chr1:g.227509556G>A
chr1:g.248145805C>T
chr13:g.41675234G>A
chr11:g.113363854T>A
chr12:g.7310538G>A
chr12:g.54162784C>G
chr16:g.14951779TC>-
chr16:g.18344806CAGGGAGGCGCACACGCT

chr5:g.74740028A>G
chr5:g.75030028A>G
chr5:g.77433149A>G
chr5:g.83520170T>C
chr5:g.83943430C>G
chr5:g.90675288C>T
chr5:g.90684040A>G
chr5:g.90690934A>G
chr5:g.90791053G>A
chr5:g.90802826C>T
chr5:g.94695160G>T
chr5:g.98901263T>G
chr5:g.111121044A>G
chr5:g.116475552G>T
chr5:g.124701214G>A
chr5:g.134308142C>T
chr5:g.140830448C>T
chr5:g.140870550G>T
chr5:g.141376061C>G
chr5:g.141573997GGAGGA>-
chr5:g.141869201T>C
chr5:g.144206896C>T
chr5:g.147383442C>T
chr5:g.154409589A>G
chr5:g.160092770T>C
chr5:g.172105724G>A
chr5:g.177409046C>T
chr5:g.178932352A>G
chr5:g.178981703G>A
chr5:g.80320264G>A
chr4:g.2305498G>A
chr4:g.2829594G>T
chr4:g.5990289T>C
chr4:g.10097770C>T
chr4:g.17633777T>C
chr4:g.24799792G>A
chr4:g.25333296A>G
chr4:g.25333328C>A
chr4:g.26481832C>T
chr4:g.67539851G>C
chr4:g.78255299G>A
chr4:g.78448166C>T
chr4:g.78540940T>C
chr4:g.83456130A>T
chr4:g.99420661T>C
chr4:g.102307494A>G
chr4:g.109913381A>C
chr4:g.113455777G>A
chr4:g.128077797G>A
chr4:g.134200028C>T
chr4:g.145141530T>C
chr4:g

chr3:g.130092130G>T
chr3:g.136152223C>T
chr3:g.138162242C>G
chr3:g.138500832A>T
chr3:g.138572893C>T
chr3:g.146085230C>G
chr3:g.146199886C>T
chr3:g.165189762G>C
chr3:g.183715218C>T
chr3:g.186804667A>G
chr3:g.187199789G>A
chr3:g.197017213C>T
chr3:g.197022958G>A
chr10:g.11747356G>A
chr10:g.17617200C>T
chr10:g.22318964T>C
chr10:g.23119314A>C
chr10:g.43206023C>T
chr10:g.46462395C>T
chr10:g.60073926C>T
chr10:g.69506125G>A
chr10:g.70255348G>A
chr10:g.77806783G>A
chr10:g.79612324C>T
chr10:g.89383617G>T
chr10:g.92024967G>A
chr10:g.92974147C>T
chr10:g.95225300A>C
chr10:g.95711979C>G
chr10:g.96379034G>C
chr10:g.96982331T>C
chr10:g.98429879C>T
chr10:g.100291335T>A
chr10:g.102161081G>A
chr10:g.104915853C>T
chr10:g.114290260C>T
chr10:g.117209737C>G
chr10:g.122461711C>T
chr10:g.122591497C>T
chr10:g.122593584C>T
chr10:g.122621160C>A
chr10:g.122678872G>A
chr10:g.122698453C>G
chr10:g.128108608G>A
chr10:g.133308974C>T
chr10:g.133073114G>T
chr13:g.23324368C>T
chr13:g.24321454G>A
chr13:g.41575802TGT>-
chr1

chr1:g.148587407A>T
chr1:g.156914099T>C
chr1:g.200693750T>A
chr1:g.201052550C>T
chr1:g.204997162C>T
chr1:g.159927893A>-
chr15:g.36881667C>A
chr15:g.41818585G>A
chr15:g.42085483T>-
chr15:g.60074796G>A
chr15:g.91915988C>T
chr15:g.81345017GAGA>-
chr11:g.17077484A>G
chr11:g.66367874G>A
chr11:g.101133123C>T
chr13:g.81283219G>-
chr13:g.108270336A>G
chr13:g.114281851G>A
chr21:g.5243698C>T
chr10:g.286267A>G
chr10:g.13127907A>G
chr10:g.30518377T>C
chr10:g.116627879G>T
chr19:g.618768G>A
chr19:g.8589243G>C
chr19:g.9818542T>C
chr19:g.17783177T>C
chr19:g.17880961C>T
chr19:g.51334311G>A
chr19:g.54805941CTC>-
chr19:g.54808890G>-
chrY:g.15549939T>C
chr17:g.1013098G>A
chr17:g.12958713G>A
chr17:g.39908221C>T
chr17:g.47829132T>C
chr17:g.50521230G>C
chr17:g.76348587G>A
chr17:g.77142641A>C
chr17:g.77560603G>A
chr17:g.78454466A>G
chr17:g.34178980A>-
chr16:g.2979920A>G
chr16:g.8766273G>A
chr16:g.19411176G>A
chr16:g.21127816C>A
chr16:g.58776702C>T
chr16:g.87327562G>A
chr16:g.88526806C>A
chr20:g.2339849C>G
chr

chr8:g.94866159A>C
chr8:g.143739922C>T
chr7:g.1000504G>A
chr7:g.5370961C>G
chr7:g.6149652G>A
chr7:g.6653529G>A
chr7:g.12370103G>A
chr7:g.20402089C>G
chr7:g.24852602T>C
chr7:g.47836946A>G
chr7:g.73578433G>T
chr7:g.128884094C>T
chr7:g.134199257G>A
chr7:g.135166847C>T
chr7:g.140673740C>T
chr7:g.141973287A>C
chr7:g.142052860C>G
chr7:g.142054778T>C
chr7:g.149071552C>T
chr7:g.149250771G>A
chr7:g.151374099A>T
chr5:g.6651930G>A
chr5:g.14681484G>T
chr5:g.17605178C>T
chr5:g.38967349C>T
chr5:g.70951971T>A
chr5:g.74685474C>T
chr5:g.77077343G>C
chr5:g.90637912C>T
chr5:g.110756712G>A
chr5:g.134578893G>A
chr5:g.141179448G>C
chr5:g.141187613A>G
chr5:g.141187881G>T
chr5:g.141409157G>A
chr5:g.146062586G>A
chr5:g.150548356G>A
chr5:g.156344500G>C
chr5:g.176575579C>T
chr5:g.179836445C>T
chr6:g.17850345G>A
chr6:g.21595689CTC>-
chr6:g.24596298G>T
chr6:g.26156450GAA>-
chr6:g.26234828C>T
chr6:g.26508695A>C
chr6:g.36302412C>A
chr6:g.43038278G>T
chr6:g.43071569G>A
chr6:g.43076902G>A
chr6:g.43666471T>G
chr6:g.117

In [28]:
pt1_t1_df

Unnamed: 0,SYMBOL,germline,germline_mskcc,germline_akh,intogen,role,variant_type,Consequence,aa_change,mut,...,CN_min_allele,cytoband,mut_sv,fusion,cgc_transl,chr/chr,sv_type,distance,distance_rel,germ_som
0,CDH1,True,True,True,True,LoF,miss_inframe,missense_variant,P30T,chr16:68738336:C>A,...,,,,,,,,,,False
1,SERPINA1,True,False,True,False,,truncating,frameshift_variant,E347X,chr14:94379488:CT>-,...,,,,,,,,,,False
2,GBA,True,False,True,False,,miss_inframe,"missense_variant,splice_region_variant",T408M,chr1:155236246:G>A,...,,,,,,,,,,False
3,COL7A1,True,False,True,False,,miss_inframe,"missense_variant,splice_region_variant",G636V,chr3:48590356:C>A,...,,,,,,,,,,False
4,RAD50,True,True,False,False,,miss_inframe,missense_variant,Q426R,chr5:132589662:A>G,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,NBAS,False,False,,False,,,,,,...,,,chr2:15407882:T>]chr2:16627671]CAT,NBAS/FAM49A,False,chr2,inv,1219789,0.01,False
635,FAM49A,False,False,,False,,,,,,...,,,chr2:16627671:A>ACA[chr2:15407882[,FAM49A/NBAS,False,chr2,inv,-1219789,-0.01,False
636,ACMSD,False,False,,False,,,,,,...,,,chr2:134868444:T>]chr13:61730199]T,ACMSD/-,False,chr2/chr13,other,-,-,False
637,GAP43,False,False,,False,,,,,,...,,,chr3:115717328:T>TTTTTTTTTTTTTTT]chr2:129370338],GAP43/-,False,chr3/chr2,other,-,-,False


In [49]:
print_muts(pt1_t1_df[pt1_t1_df['origin']=='somatic'])
print_cna(pt1_t1_df)

chr2:g.29239684G>C
chr3:g.195788682G>T
chr9:g.32633113G>A
chr11:g.65864771G>A
chr15:g.45150658G>A
chr17:g.10401775C>T
chr2:g.170392474G>C
chr4:g.157341453G>T
chr5:g.55815329C>A
chr7:g.117315154G>A
chr9:g.86016671G>T
chr8:g.143977835A>T
chr8:g.143977838G>C
CD79B:amp
CAMTA1:del
CDKN2C:del
SEPT9:amp
BCL10:del
RPL5:del
CASP9:del
TNFRSF14:del
EPHA2:del
SRSF2:amp
ARHGEF10L:del
SDHB:del
ARID1A:del
HSPG2:del
MUTYH:del
PPM1D:amp
PRDM2:del
STIL:del
MYCN:amp
ID3:del
JAK1:del
H3F3B:amp
CASZ1:del
SPEN:del
RNF213:amp
CLTC:amp


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
print_cna(pt1_t1_df)
print_cna(pt1_t2_df)
print_cna(pt2_t1_df)
print_cna(pt2_t2_df)
print_cna(pt3_t1_df)
print_cna(pt3_t2_df)
print_cna(pt4_t1_df)
print_cna(pt4_t2_df)
print_cna(pt5_t1_df)
print_cna(pt5_t2_df)
print_cna(pt6_t1_df)
print_cna(pt6_t2_df)

CD79B:amp
CAMTA1:del
CDKN2C:del
SEPT9:amp
BCL10:del
RPL5:del
CASP9:del
TNFRSF14:del
EPHA2:del
SRSF2:amp
ARHGEF10L:del
SDHB:del
ARID1A:del
HSPG2:del
MUTYH:del
PPM1D:amp
PRDM2:del
STIL:del
MYCN:amp
ID3:del
JAK1:del
H3F3B:amp
CASZ1:del
SPEN:del
RNF213:amp
CLTC:amp
CHEK2:del
NF2:del
SMARCB1:del
ZNRF3:del
LZTR1:del
NEFH:del
KDM5A:amp
ACKR3:amp
BCL7A:amp
BTG1:amp
ALK:amp
REL:amp
ACVR1:amp
NCOA1:amp
IDH1:amp
MDM2:amp
CNTNAP2:del
ERBB3:amp
EZH2:del
FSIP2:amp
SF3B1:amp
CD28:amp
CNOT9:amp
XPO1:amp
HOXC13:amp
STRN:amp
CXCR4:amp
LMNA:amp
ZBTB7B:amp
STAT6:amp
SOS1:amp
BCL11A:amp
CDK4:amp
MYCN:amp
BIRC6:amp
KRAS:amp
PCBP1:amp
NTRK1:amp
ABL2:amp
SETDB1:amp
PRRX1:amp
KMT2C:del
POT1:del
SIX2:amp
AFF3:amp
SP140:amp
RGPD3:amp
DTX1:amp
LY75-CD302:amp
PDE4DIP:amp
LRIG3:amp
NFE2L2:amp
CCND2:amp
BCL9:amp
DDR2:amp
MDM4:amp
PITPNM2:amp
CTNNA2:amp
CHD4:amp
FAM186A:amp
ETV6:amp
EFHD1:amp
EML4:amp
CR1:amp
EPAS1:amp
ACSL3:amp
DHX9:amp
GLI1:amp
AKT3:amp
PTPN11:amp
H3F3A:amp
EZH2:del
CUX1:del
ELN:del
PMS2:del
NT5C3A

In [33]:
kmt2c_df = pt2_g_df[['#CHROM', 'POS', 'ID', 'REF', 'ALT','SYMBOL','Gene','Feature', 'Feature_type',
       'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation']][(pt2_g_df['SYMBOL']=='KMT2C')&(pt2_g_df['Consequence']=='missense_variant')]
kmt2c_df['Patient'] = 'Patient 2 (ARMS,AML)'
smarcb1_df = pt1_t2_snv_df[['#CHROM', 'POS', 'ID', 'REF', 'ALT','SYMBOL','Gene','Feature', 'Feature_type',
       'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation']][(pt1_t2_snv_df['SYMBOL']=='SMARCB1')]
smarcb1_df['Patient'] = 'Patient 1 (NB,RT)'

genes_to_test_df = pd.concat([smarcb1_df,kmt2c_df],ignore_index=True)
genes_to_test_df.to_csv('/workspace/projects/sjd_pediatric_tumors/mutations_to_check.tsv',sep='\t')
genes_to_test_df

Unnamed: 0,#CHROM,POS,ID,REF,ALT,SYMBOL,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,Patient
0,chr22,23825237,.,GTGGG,-,SMARCB1,ENSG00000099956,ENST00000344921,Transcript,frameshift_variant,1042-1046,835-839,279-280,VG/X,GTGGGa/a,-,"Patient 1 (NB,RT)"
1,chr7,152273774,.,C,T,KMT2C,ENSG00000055609,ENST00000262189,Transcript,missense_variant,1160,943,315,G/S,Ggc/Agc,"rs149992209,COSV51275293,COSV51338045","Patient 2 (ARMS,AML)"
