# All coding alterations per sample:
- Germline (HC)
- Somatic:
    - SNV (intersect mutect, strelka,sage; with "rescued" mutations)
    - CNV (purple)
    - Fusions (GRIDDS)

In [1]:
import pandas as pd
import json
from tqdm.notebook import tqdm
tqdm.pandas()
pd.set_option('display.max_rows', 250)

In [2]:
chroms = list(range(1,23))
chroms = ['chr'+str(chrom) for chrom in chroms]
sex_chroms = ['chrX','chrY']
chroms = chroms + sex_chroms
samples = json.load(open('../../../cases_ids.json','rb'))
ccf_thresholds = json.load( open( "ccf_thresholds.json", "rb" ) )

In [4]:
intogen_df = pd.read_csv('./data/unique_drivers.tsv',sep='\t')
intogen_drivers = intogen_df['SYMBOL'].tolist()

In [5]:
samples

{'case1': {'normal': 'AQ5175',
  'tumor1': 'AQ5181',
  'tumor2': 'AQ5187',
  'sex': 'female'},
 'case2': {'normal': 'AQ5176',
  'tumor1': 'AQ5182',
  'tumor2': 'AQ5188',
  'sex': 'male'},
 'case3': {'normal': 'AQ5174',
  'tumor1': 'AQ5180',
  'tumor2': 'AQ5186',
  'sex': 'female',
  'kidney': 'AX4954',
  'liver': 'AX4955',
  'pancreas': 'AX4956',
  'heart': 'AX4957',
  'clone1': 'AX4958',
  'clone2': 'AX4961',
  'mother': 'AW8063',
  'father': 'AW8064',
  'lung': 'AX4962',
  'medulla': 'AX4963',
  'spleen': 'AX4964',
  'brain': 'AX4965',
  'bma': 'AX4966'},
 'case4': {'normal': 'AW8061',
  'tumor1': 'AW8050',
  'tumor2': 'AW8051',
  'sex': 'female'}}

## Functions to arange all data

In [6]:
def ranked_table (df,gnomad=0.01):
#    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','n_alt_reads','n_ref_reads','gnomADg','gnomADg_AF','Damaging']][(df['gnomADg_AF']<gnomad)&((df['intogen']==True)|(df['germline']==True)|(df['germline_mskcc']==True))&(df['Damaging']==True)]
    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','n_alt_reads','n_ref_reads','gnomADg','gnomADg_AF','Damaging','STRAND']][(df['gnomADg_AF']<gnomad)&(df['Damaging']==True)]
    roles = ['LoF','ambiguous','Act']
    variants = ['truncating','miss_inframe','other']
    df['role'] = df['role'].astype("category")
    df['role'].cat.set_categories(roles, inplace=True)
    df['variant_type'] = df['variant_type'].astype("category")
    df['variant_type'].cat.set_categories(variants, inplace=True)
    df = df[~df['SYMBOL'].str.contains('HLA')].sort_values(['germline','germline_mskcc','germline_akh','role','variant_type'],ascending=[False,False,False,roles,variants])
    return df

In [7]:
def ranked_table_snvs (df,gnomad=0.001):
    #Filter by gnomad_AF and damaging
    df = df[['SYMBOL','germline','germline_mskcc','germline_akh','intogen','role','variant_type','Consequence','aa_change','mut','IMPACT','n_AF','n_AF_real','t_AF','n_alt_reads','n_ref_reads','t_alt_reads','t_ref_reads','t_CCF','clonal','gnomADg','gnomADg_AF','Damaging','STRAND','SAMPLE']][(df['gnomADg_AF']<gnomad)&(df['Damaging']==True)]
    #Rank mutations by: tumor, role and variant_type
    tumors = ['both','tumor1','tumor2']
    roles = ['LoF','ambiguous','Act']
    variants = ['truncating','miss_inframe','other']
    df['role'] = df['role'].astype("category")
    df['role'].cat.set_categories(roles, inplace=True)
    df['variant_type'] = df['variant_type'].astype("category")
    df['variant_type'].cat.set_categories(variants, inplace=True)
    df = df[~df['SYMBOL'].str.contains('HLA')].sort_values(['germline','germline_mskcc','germline_akh','role','variant_type'],ascending=[False,False,False,roles,variants])
    df = df[(df['clonal']==True)]
    return df

In [8]:
def concat_all_mutations (germ_df,snv_df,sv_df,cnv_df):
    
    #Merge snv and cnv
    sample = snv_df['SAMPLE'][0:1].to_list()[0]
    somatic_df = pd.merge(snv_df,cnv_df,how='outer')
    somatic_df = somatic_df[(~somatic_df['mut'].isnull())|((somatic_df['mut'].isnull())&(somatic_df['CNA']!='-')&(~somatic_df['role'].isnull()))]
    somatic_cnv_df = somatic_df[somatic_df['mut'].isnull()]
    somatic_cnv_df = somatic_cnv_df[~((somatic_cnv_df['role']=='Act')&(somatic_cnv_df['CNA']=='del')|(somatic_cnv_df['role']=='LoF')&(somatic_cnv_df['CNA']=='amp'))]
    role = ['Act','LoF','ambinguous']
    somatic_cnv_df.role = somatic_cnv_df.role.astype("category")
    somatic_cnv_df.role.cat.set_categories(role, inplace=True)
    somatic_cnv_df = somatic_cnv_df.sort_values(by=['role','intogen'],ascending=[role,False])
    somatic_df = somatic_df[~somatic_df['mut'].isnull()]
    somatic_df = pd.concat([somatic_df,somatic_cnv_df],ignore_index=True)
    
    #Merge sv
    sv_df = sv_df[~sv_df['SYMBOL'].isnull()]
    sv_df = sv_df.rename(columns={'mut':'mut_sv'})
    sv_df['role'] = sv_df['role'].astype(object)
    sv_type = ['fusion','del','ins','inv','other']
    sv_df.sv_type = sv_df.sv_type.astype("category")
    sv_df.sv_type.cat.set_categories(sv_type, inplace=True)
    sv_df = sv_df.sort_values(by=['cgc_transl','sv_type'],ascending=[False,sv_type])
    
    somatic_df = pd.merge(somatic_df,sv_df,how='outer')
    
    #Concat germline
    somatic_df['origin'] = 'somatic'
    germ_df['origin'] = 'germline'
    germ_som_df = pd.concat([germ_df,somatic_df],ignore_index=True)
    germ_som_df['germline'] = germ_som_df.apply(lambda row: True if (row['germline']==True or row['germline_mskcc']==True or row['germline_akh']==True) else False,axis=1)
    
    #Annotate altered genes in somatic and germline    
    germline = germ_df['SYMBOL'].tolist()
    somatic = somatic_df['SYMBOL'].tolist()    
    germline_somatic = list(set(germline) & set(somatic))
    germ_som_df['germ_som'] = germ_som_df['SYMBOL'].apply(lambda x: True if x in germline_somatic else False)
    germ_som_df['SAMPLE'] = sample
    return germ_som_df

ANNOTATIONS:

**Germline:**
- gnomADg_AF<0.01
- No HLA
- Intogen driver (only LoF), germline
- Damaging (affecting protein sequence)  

**Somatic:**
- Snvs and indels:
    - gnomADg_AF<0.01
    - No HLA
    - Damaging (affecting protein sequence)
    - Clonals
- CNA:
    - intogen driver, germline
    - CNA-del in Act genes and CNA-amp in LoF genes
- SV:
    - Breakpoint inside a gene
    - Annotate if it is inframe or out of frame
    - CGC list: fusion genes


 # Tables with germline and somatic variants for the paper

## Case 1: Alveolar Rhabdomyosarcoma (ARMS) + treatment related Acute Myeloid Leukemia (tAML)

In [9]:
# This commented code points to the original data
# pt = 'pt2'
# root_out = '/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt
root_out = './output/case1'

In [10]:
pt = 'case1'
normal = samples[pt]['normal']
g_df = pd.read_csv(root_out+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [11]:
t1_df.columns

Index(['SYMBOL', 'germline', 'germline_mskcc', 'germline_akh', 'intogen',
       'role', 'variant_type', 'Consequence', 'aa_change', 'mut', 'IMPACT',
       'n_AF', 'n_AF_real', 'n_alt_reads', 'n_ref_reads', 'gnomADg',
       'gnomADg_AF', 'Damaging', 'STRAND', 'origin', 't_AF', 't_alt_reads',
       't_ref_reads', 't_CCF', 'clonal', 'SAMPLE', 'CNA', 'CN',
       'CN_min_allele', 'cytoband', 'mut_sv', 'fusion', 'cgc_transl',
       'chr/chr', 'sv_type', 'distance', 'distance_rel', 'germ_som'],
      dtype='object')

In [12]:
#somatic variants tumor1 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'CNA','CN','mut_sv','fusion','cgc_transl']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='somatic')&((t1_df['clonal']==True)|(t1_df['cgc_transl']==True)|(t1_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,CNA,CN,mut_sv,fusion,cgc_transl
859,MAP3K9,somatic,False,,truncating,R898*,0.409,7e-06,chr14:70732719:G>A,True,-,1.9987,,,
860,LMNA,somatic,False,,miss_inframe,E551G,0.27,0.0,chr1:156137697:A>G,True,amp,3.2341,,,
861,WNT8B,somatic,False,,miss_inframe,A59S,0.331,0.0,chr10:100479946:G>T,True,-,2.0068,,,
862,PITPNM2,somatic,False,,miss_inframe,R828H,0.25,0.0,chr12:122990631:C>T,True,amp,2.9552,,,
863,CCDC178,somatic,False,,miss_inframe,R472Q,0.508,7e-06,chr18:33245423:C>T,True,-,1.8326,,,
864,FSIP2,somatic,False,,miss_inframe,Y3263C,0.295,0.0,chr2:185796924:A>G,True,amp,2.9341,,,
865,EFHD1,somatic,False,,miss_inframe,N79S,0.3,2.8e-05,chr2:232633940:A>G,True,amp,2.9709,,,
866,PRSS3,somatic,False,,miss_inframe,R152H,0.441,5.6e-05,chr9:33797912:G>A,True,-,2.0095,,,
867,NOC3L,somatic,False,,other,-,0.359,0.001606,chr10:94358222:->CA,True,-,2.0068,,,
868,AC005699.1,somatic,False,,other,-,0.413,0.0,chr4:11625811:A>C,True,,,,,


In [13]:
#somatic variants tumor2 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'CNA','CN','mut_sv','fusion','cgc_transl']
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
t2_df[cols][(t2_df['origin']=='somatic')&((t2_df['clonal']==True)|(t2_df['cgc_transl']==True)|(t2_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,CNA,CN,mut_sv,fusion,cgc_transl
859,WT1,somatic,True,LoF,truncating,R158RDX,0.39,0.0,chr11:32396397:->CGTCG,True,-,2.0042,,,
860,KRAS,somatic,True,Act,miss_inframe,G12A,0.272,0.0,chr12:25245350:C>G,True,-,2.0007,,,
861,CSF3R,somatic,True,Act,other,E492,0.349,0.0,chr1:36469256:C>T,True,-,2.011,,,
862,H3-3A,somatic,True,,miss_inframe,R50P,0.319,0.0,chr1:226065676:G>C,True,,,,,
863,PLCE1,somatic,False,,miss_inframe,R394H,0.272,4.2e-05,chr10:94032227:G>A,True,-,2.0031,,,
864,NCAPD2,somatic,False,,miss_inframe,R698H,0.358,4.9e-05,chr12:6522966:G>A,True,-,2.0333,,,
865,TUBA1C,somatic,False,,miss_inframe,S118C,0.322,1.4e-05,chr12:49269604:C>G,True,-,2.0149,,,
866,ACD,somatic,False,,miss_inframe,A528V,0.308,0.0,chr16:67657658:G>A,True,-,1.9931,,,
867,FASN,somatic,False,,miss_inframe,G2270R,0.307,0.0,chr17:82080710:C>G,True,-,2.0214,,,
868,BEST2,somatic,False,,miss_inframe,A192E,0.324,7e-06,chr19:12754970:C>A,True,-,1.986,,,


In [14]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
germline_df = t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]
germline_df

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,ATM,germline,True,LoF,miss_inframe,V182L,0.65,0.007418,chr11:108244000:G>C
1,ATM,germline,True,LoF,miss_inframe,R2854C,0.53,0.000126,chr11:108345884:C>T
2,EGFR,germline,True,Act,miss_inframe,A1210V,0.485,0.000328,chr7:55205613:C>T
3,MET,germline,True,Act,other,-,0.553,0.001314,chr7:116769637:C>-
4,CBL,germline,True,Act,miss_inframe,P687L,0.488,0.000119,chr11:119296941:C>T
5,ITK,germline,True,,miss_inframe,R193Q,0.526,0.003875,chr5:157222945:G>A
6,EP300,germline,True,LoF,miss_inframe,G211S,0.529,0.006329,chr22:41117723:G>A
7,FAT1,germline,True,LoF,miss_inframe,F2549L,0.483,0.000223,chr4:186618941:A>G
8,FANCM,germline,True,,miss_inframe,D556G,0.486,0.000105,chr14:45164444:A>G
9,JMJD1C,germline,True,,miss_inframe,S1429L,0.383,0.000545,chr10:63207383:G>A


In [15]:
germline_df.to_csv('./table1_paper/case1_germline.tsv',sep='\t',index=None)

In [16]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
6,chr1,10680552,C,G,CASZ1,LoF,True,,intron_variant,-,True,1
247,chr1,240824706,C,A,RGS7,LoF,True,,intron_variant,-,True,1
507,chr11,118785224,A,G,DDX6,ambiguous,True,,intron_variant,-,True,1
705,chr12,132624666,G,C,POLE,LoF,True,,3_prime_UTR_variant,-,True,1
711,chr13,20012708,C,G,ZMYM2,,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2686,chrX,88843874,C,A,-,,False,,intergenic_variant,-,False,2
2706,chrX,97075625,G,T,DIAPH2,,False,,intron_variant,-,False,2
2714,chrX,102743951,C,T,BHLHB9,,False,,intron_variant,-,False,2
2775,chrX,144084558,T,A,-,,False,,intergenic_variant,-,False,2


In [17]:
snv_df1.to_csv('./table2_paper/case1_snv_indels.tsv',sep='\t',index=None)

In [18]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
26,TCF7L2,True,Act,chr10:112988994:A>A[chr10:113046386[,TCF7L2-del,True,chr10,del,57392,1
27,FOXO1,True,Act,chr13:40596631:T>T[chr2:222218414[,FOXO1/PAX3,True,chr13/chr2,fusion,-,1
28,FOXO1,True,Act,chr13:40596668:C>[chr2:222217523[C,FOXO1/PAX3,True,chr13/chr2,fusion,-,1
34,TCF12,True,,chr15:57054537:G>G]chr15:57561950],TCF12/-,True,chr15,inv,507413,1
2,PATJ,False,,chr1:61928598:C>C[chr1:61933425[,PATJ-del,False,chr1,del,4827,1
11,TTC7A,False,,chr2:47041786:T>T[chr2:47041925[,TTC7A-del,False,chr2,del,139,1
15,PAX3,False,,chr2:222217724:C>C]chr2:222218643],PAX3-del,True,chr2,del,919,1
19,MEGF10,False,,chr5:127450737:A>]chr5:127457508]A,MEGF10-del,False,chr5,del,6771,1
20,PHACTR2,False,,chr6:143616201:A>A]chr6:143627824],PHACTR2-del,False,chr6,del,11623,1
30,KLHL1,False,,chr13:69933996:G>GAT[chr13:69974703[,KLHL1-del,False,chr13,del,40707,1


In [19]:
sv_df1.to_csv('./table2_paper/case1_sv.tsv',sep='\t',index=None)

In [20]:
# This commented code points to the original data
# root_in_hmf_t1 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt2-t1-allsamples-t1/'
# root_in_hmf_t2 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt2-t2-allsamples-t2/'

#Change accordingly the paths to hmf output
root_in_hmf_t1 = '/path/to/hmf_pipeline/output/'
root_in_hmf_t2 = '/path/to/hmf_pipeline/output/'

In [21]:
#somatic CNV variants
pt = 'case1'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv(root_in_hmf_t2+'purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]

def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])
    return cn_final_df

t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr1,"[123605523,229724402]","[176819977,248956422]","[2.7705,3.3421]",1
0,chr2,"[1,240472473]","[47041786,242193529]","[2.9111,4.1022]",1
38,chr6,143627825,147242546,1.0051,1
42,chr7,122369916,159345973,1.0042,1
0,chr12,"[1,35977330]","[35977329,133275309]","[2.9339,2.9552]",1
80,chr19,963743,1004704,3.0055,1
97,chrX,120750523,120762285,0.8943,1
0,chr7,"[1,59498944]","[59498943,159345973]","[1.0096,1.0187]",2


In [22]:
pt = 'case1'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,t1_cnv_df),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr1,"[123605523,229724402]","[176819977,248956422]","[2.7705,3.3421]",1,"[BCL9, PDE4DIP, ARNT, SETDB1, S100A7, ZBTB7B, ..."
0,chr2,"[1,240472473]","[47041786,242193529]","[2.9111,4.1022]",1,"[MYCN, DNMT3A, ASXL2, ALK, BIRC6, SOS1, EPAS1,..."
38,chr6,143627825,147242546,1.0051,1,[]
42,chr7,122369916,159345973,1.0042,1,"[POT1, SMO, TRIM24, BRAF, EZH2, KMT2C]"
0,chr12,"[1,35977330]","[35977329,133275309]","[2.9339,2.9552]",1,"[CCND2, CHD4, PTPN6, ETV6, DUSP16, CDKN1B, ATF..."
80,chr19,963743,1004704,3.0055,1,[]
97,chrX,120750523,120762285,0.8943,1,[]
0,chr7,"[1,59498944]","[59498943,159345973]","[1.0096,1.0187]",2,"[CARD11, PMS2, RAC1, ETV1, MACC1, HNRNPA2B1, N..."


In [23]:
segments_df2.to_csv('./table2_paper/case1_cnv.tsv',sep='\t',index=None)

##  Case 2: Ependymoma (EPN) + High Grade Glioma (HGG) +9y

In [24]:
#This commented code points to the original data
# pt = 'pt3'
# root_out = '/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt
root_out = './output/case2'

In [25]:
pt = 'case2'
normal = samples[pt]['normal']
g_df = pd.read_csv(root_out+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [26]:
#somatic variants tumor1 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'cytoband','CNA','CN','mut_sv','fusion','cgc_transl']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='somatic')&((t1_df['clonal']==True)|(t1_df['cgc_transl']==True)|(t1_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,cytoband,CNA,CN,mut_sv,fusion,cgc_transl
609,H3C2,somatic,True,,miss_inframe,K28M,0.468,0.0,chr6:26031978:T>A,True,,,,,,
610,ZRSR2,somatic,True,LoF,,,,,,,chrX:p22.2,del,0.9986,,,
611,EIF1AX,somatic,True,LoF,,,,,,,chrX:p22.12,del,0.9986,,,
612,RPS6KA3,somatic,True,LoF,,,,,,,chrX:p22.12,del,0.9986,,,
613,ZFX,somatic,True,LoF,,,,,,,chrX:p22.11,del,0.9986,,,
614,BCOR,somatic,True,LoF,,,,,,,chrX:p11.4,del,0.9986,,,
615,DDX3X,somatic,True,LoF,,,,,,,chrX:p11.4,del,0.9986,,,
616,KDM6A,somatic,True,LoF,,,,,,,chrX:p11.3,del,0.9986,,,
617,RBM10,somatic,True,LoF,,,,,,,chrX:p11.3,del,0.9986,,,
618,WDR45,somatic,True,LoF,,,,,,,chrX:p11.23,del,0.9986,,,


In [27]:
#somatic variants tumor2 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'cytoband','CNA','CN','mut_sv','fusion','cgc_transl']
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
t2_df[cols][(t2_df['origin']=='somatic')&((t2_df['clonal']==True)|(t2_df['cgc_transl']==True)|(t2_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,cytoband,CNA,CN,mut_sv,fusion,cgc_transl
609,H3C2,somatic,True,,miss_inframe,K28M,0.329,0.0,chr6:26031978:T>A,True,,,,,,
610,LYPLA1,somatic,False,,miss_inframe,A98V,0.374,0.0,chr8:54055127:G>A,True,chr8:q11.23,-,1.9788,,,
611,PDGFRA,somatic,True,Act,,,,,,,chr4:q12,amp,2.2202,,,
612,KIT,somatic,True,Act,,,,,,,chr4:q12,amp,2.2202,,,
613,KDR,somatic,True,Act,,,,,,,chr4:q12,amp,2.2202,,,
614,ZRSR2,somatic,True,LoF,,,,,,,chrX:p22.2,del,0.9923,,,
615,EIF1AX,somatic,True,LoF,,,,,,,chrX:p22.12,del,0.9919,,,
616,RPS6KA3,somatic,True,LoF,,,,,,,chrX:p22.12,del,0.9919,,,
617,ZFX,somatic,True,LoF,,,,,,,chrX:p22.11,del,0.9919,,,
618,BCOR,somatic,True,LoF,,,,,,,chrX:p11.4,del,0.9919,,,


In [28]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
germline_df = t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]
germline_df

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,DICER1,germline,True,LoF,miss_inframe,Y1385C,0.456,0.0,chr14:95099832:T>C
1,WRN,germline,True,LoF,miss_inframe,T1262R,0.429,0.002702,chr8:31154721:C>G
2,SERPINA1,germline,True,,truncating,E347X,0.251,0.000253,chr14:94379488:CT>-
3,CFTR,germline,True,,miss_inframe,G576A,0.474,0.005081,chr7:117590400:G>C
4,CFTR,germline,True,,miss_inframe,R668C,0.414,0.006121,chr7:117592169:C>T


In [29]:
germline_df.to_csv('./table1_paper/case2_germline.tsv',sep='\t',index=None)

In [30]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
194,chr6,26031978,T,A,H3C2,,True,miss_inframe,missense_variant,K28M,True,1
12,chr1,179176486,-,A,ABL2,Act,True,,intron_variant,-,True,1
75,chr15,75440238,-,T,SIN3A,Act,True,,intron_variant,-,True,1
77,chr16,7232034,-,T,RBFOX1,Act,True,,intron_variant,-,True,1
142,chr3,85595120,C,A,CADM2,,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1401,chrY,15780365,C,T,-,,False,,intergenic_variant,-,False,2
1402,chrY,16450054,G,A,-,,False,,intergenic_variant,-,False,2
1403,chrY,16763764,G,A,-,,False,,intergenic_variant,-,False,2
1404,chrY,19129676,A,G,-,,False,,intergenic_variant,-,False,2


In [31]:
snv_df1.to_csv('./table2_paper/case2_snv_indels.tsv',sep='\t',index=None)

In [32]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
0,PJA2,False,,chr5:109342397:G>G[chr5:109342435[,PJA2-del,False,chr5,del,38,1
11,FGL1,False,,chr8:17891100:T>T[chr8:17894283[,FGL1-del,False,chr8,del,3183,2
15,NALCN,False,,chr13:101132878:T>T[chr13:101132917[,NALCN-del,False,chr13,del,39,2
29,AFF2,False,,chrX:148655820:G>G[chrX:148655857[,AFF2-del,False,chrX,del,37,2
4,MAP3K20,False,,chr2:173249260:A>A[chr21:31254906[,MAP3K20/TIAM1,False,chr2/chr21,fusion,-,2
18,TIAM1,False,,chr21:31254906:A>]chr2:173249260]A,TIAM1/MAP3K20,False,chr21/chr2,fusion,-,2
0,,False,,chr1:118365082:C>[chr1:166745104[C,-/-,False,chr1,inv,48380022,2
1,,False,,chr1:166745104:G>[chr1:118365082[G,-/-,False,chr1,inv,-48380022,2
2,,False,,chr2:161078542:A>]chr2:161079084]A,-/-,False,chr2,inv,542,2
3,,False,,chr2:161079084:T>T[chr2:161078542[,-/-,False,chr2,inv,-542,2


In [33]:
sv_df1.to_csv('./table2_paper/case2_sv.tsv',sep='\t',index=None)

In [34]:
#This commented code points to the original data
# root_in_hmf_t1 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt3-t1-allsamples-t1/'
# root_in_hmf_t2 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt3-t2-allsamples-t2/'

#Change accordingly the paths to hmf output
root_in_hmf_t1 = '/path/to/hmf_pipeline/output/'
root_in_hmf_t2 = '/path/to/hmf_pipeline/output/'

In [35]:
#somatic CNV variants
pt = 'case2'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']
cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv(root_in_hmf_t2+'purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]
t1_segments_df1['Tumor'] = 1
t2_segments_df1['Tumor'] = 2
segments_df1 = pd.concat([t1_segments_df1[cols],t2_segments_df1[cols]])
segments_df1

Unnamed: 0,chromosome,start,end,copyNumber,Tumor


## Case 3: Neuroblastoma and Rhabdoid tumor (+9 years)

In [36]:
#This commented code points to the original data
# pt = 'pt1'
# root_out = '/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt
root_out = './output/case3'

In [37]:
pt = 'case3'
normal = samples[pt]['normal']

g_df = pd.read_csv(root_out+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [38]:
#somatic variants tumor1 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'cytoband','CNA','CN','mut_sv','fusion','cgc_transl']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='somatic')&((t1_df['clonal']==True)|(t1_df['cgc_transl']==True)|(t1_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,cytoband,CNA,CN,mut_sv,fusion,cgc_transl
592,ALK,somatic,True,Act,miss_inframe,P784R,0.406,0.0,chr2:29239684:G>C,True,chr2:p23.1,-,2.0181,,,
593,MUC4,somatic,False,Act,miss_inframe,F966L,0.427,2.3e-05,chr3:195788682:G>T,True,chr3:q29,-,1.9922,,,
594,TAF1L,somatic,True,,truncating,R823*,0.416,7e-06,chr9:32633113:G>A,True,chr9:p21.1,-,1.999,,,
595,MUS81,somatic,False,,miss_inframe,A410T,0.44,0.000279,chr11:65864771:G>A,True,chr11:q13.1,-,2.0087,,,
596,DUOX1,somatic,False,,miss_inframe,D949N,0.355,0.0,chr15:45150658:G>A,True,chr15:q21.1,-,2.0132,,,
597,MYH8,somatic,False,,miss_inframe,S900N,0.504,0.0,chr17:10401775:C>T,True,chr17:p13.1,-,1.9987,,,
598,MYO3B,somatic,False,,miss_inframe,R590S,0.445,0.0,chr2:170392474:G>C,True,chr2:q31.1,-,2.0,,,
599,GRIA2,somatic,False,,miss_inframe,E678D,0.423,0.0,chr4:157341453:G>T,True,chr4:q32.1,-,1.9872,,,
600,DDX4,somatic,False,,miss_inframe,P668H,0.365,0.0,chr5:55815329:C>A,True,chr5:q11.2,-,2.0057,,,
601,WNT2,somatic,False,,miss_inframe,R169C,0.427,2.1e-05,chr7:117315154:G>A,True,chr7:q31.2,-,2.001,,,


In [39]:
#somatic variants tumor2 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'cytoband','CNA','CN','mut_sv','fusion','cgc_transl']
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
t2_df[cols][(t2_df['origin']=='somatic')&((t2_df['clonal']==True)|(t2_df['cgc_transl']==True)|(t2_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,cytoband,CNA,CN,mut_sv,fusion,cgc_transl
592,SMARCB1,somatic,True,LoF,truncating,VG279-280X,0.407,0.0,chr22:23825237:GTGGG>-,True,chr22:q11.23,del,0.9947,,,
593,PPP2R5A,somatic,False,,truncating,R213*,0.256,0.0,chr1:212345866:C>T,True,chr1:q32.3,-,2.017,,,
594,ZCCHC14,somatic,False,,truncating,S289X,0.244,0.0,chr16:87417566:C>-,True,chr16:q24.2,-,2.0198,,,
595,DNAH9,somatic,False,,truncating,MH4022-4023X,0.271,0.0,chr17:11930052:ATGCA>-,True,chr17:p12,-,2.0253,,,
596,TDRD6,somatic,False,,truncating,E908*,0.306,0.0,chr6:46690850:G>T,True,chr6:p12.3,-,2.007,,,
597,CCDC17,somatic,False,,miss_inframe,A516T,0.345,0.0,chr1:45620956:C>T,True,chr1:p34.1,-,2.0308,,,
598,XPNPEP1,somatic,False,,miss_inframe,D142H,0.229,0.0,chr10:109888587:C>G,True,chr10:q25.1,-,2.0127,,,
599,ZFYVE1,somatic,False,,miss_inframe,E379L,0.333,0.0,chr14:72993210:TC>AA,True,chr14:q24.2,-,2.0246,,,
600,PRPSAP2,somatic,False,,miss_inframe,I177V,0.326,0.0,chr17:18889822:A>G,True,chr17:p11.2,-,2.0253,,,
601,RTTN,somatic,False,,miss_inframe,A2165V,0.25,0.0,chr18:70006412:G>A,True,chr18:q22.2,-,1.9946,,,


In [40]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
germline_df = t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]
germline_df

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,CDH1,germline,True,LoF,miss_inframe,P30T,0.412,0.001278,chr16:68738336:C>A
1,COL7A1,germline,True,,miss_inframe,G636V,0.464,0.004155,chr3:48590356:C>A
2,GBA,germline,True,,miss_inframe,T408M,0.477,0.006204,chr1:155236246:G>A
3,RTEL1,germline,True,,miss_inframe,GE770-771G,0.496,0.004962,chr20:63690338:AGA>-
4,RAD50,germline,True,,miss_inframe,Q426R,0.582,0.000147,chr5:132589662:A>G
5,RAF1,germline,True,Act,other,H389,0.484,6.3e-05,chr3:12599692:G>A
6,SLX4,germline,True,,miss_inframe,R237Q,0.542,0.008306,chr16:3606524:C>T


In [41]:
germline_df.to_csv('./table1_paper/case3_germline.tsv',sep='\t',index=None)

In [42]:
t1_snv_df.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'NORMAL', 'TUMOR', 't_AF', 'n_AF', 'DP_tumor', 't_alt_reads',
       't_ref_reads', 'DP_normal', 'n_alt_reads', 'n_ref_reads', 'mut_type',
       'GT_normal', 'GT_tumor', 'Gene', 'Feature', 'Feature_type',
       'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
       'STRAND', 'FLAGS', 'SYMBOL', 'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL',
       'ENSP', 'SOURCE', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'CLIN_SIG', 'SOMATIC', 'PHENO', 'gnomADg', 'gnomADg_AF', 'gnomADg_NFE',
       'subset_origin', 'SAMPLE', 'Damaging', 'mut', 'aa_change', 'n_AF_real',
       'intogen', 'germline', 'germline_mskcc', 'germline_akh', 'role',
       'variant_type', 'CN', 't_CCF', 'n_CCF', 'clonal'],
      dtype='object')

In [43]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
1632,chr9,32633113,G,A,TAF1L,,True,truncating,stop_gained,R823*,True,1
718,chr2,29239684,G,C,ALK,Act,True,miss_inframe,missense_variant,P784R,True,1
2,chr1,11266002,T,-,MTOR,Act,True,,upstream_gene_variant,-,True,1
15,chr1,51382417,T,-,EPS15,Act,True,,intron_variant,-,True,1
176,chr10,113066641,G,T,TCF7L2,Act,True,,intron_variant,-,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1980,chrX,97514527,G,T,DIAPH2,,False,,intron_variant,-,False,2
2000,chrX,124743666,C,A,TENM1,,False,,intron_variant,-,False,2
2001,chrX,124925596,G,T,TENM1,,False,,intron_variant,-,False,2
2031,chrX,150160006,T,G,LINC00894,,False,,"intron_variant,non_coding_transcript_variant",-,False,2


In [44]:
snv_df1.to_csv('./table2_paper/case3_snv_indels.tsv',sep='\t',index=None)

In [45]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
31,MYCN,True,Act,chr2:15942475:C>C[chr2:15942572[,MYCN-del,False,chr2,del,97,1
54,FOXP1,True,Act,chr3:71040531:C>C[chr3:71099048[,FOXP1-del,True,chr3,del,58517,1
70,GALNT1,False,,chr18:35589777:T>]chr18:35589952]T,GALNT1-del,False,chr18,del,175,1
0,ALG14,False,,chr1:95023313:A>[chr17:58991027[A,ALG14/TRIM37,False,chr1/chr17,fusion,-,1
67,TRIM37,False,,chr17:58991027:G>[chr1:95023313[G,TRIM37/ALG14,False,chr17/chr1,fusion,-,1
2,NBAS,False,,chr2:15454924:G>GTTTTTTTTTTTTTTTTTTA.,NBAS-ins,False,chr2,ins,0,1
6,NBAS,False,,chr2:15468148:A>AATAAGTGTCAGAGATCGGAAGAGCGTCGT...,NBAS-ins,False,chr2,ins,0,1
7,NBAS,False,,chr2:15468148:A>AATAAGTGTCAGAGATCGGAAGAGCGTCGT...,NBAS-ins,False,chr2,ins,0,1
8,NBAS,False,,chr2:15474578:T>TCTTTGAGAGATCGGAAGAGCACACGTCTG...,NBAS-ins,False,chr2,ins,0,1
1,NBAS,False,,chr2:15407882:T>]chr2:16627671]CAT,NBAS/FAM49A,False,chr2,inv,1219789,1


In [46]:
sv_df1.to_csv('./table2_paper/case3_sv.tsv',sep='\t',index=None)

In [47]:
#This commented code points to the original data
# root_in_hmf_t1 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt1-t1-allsamples-t1/'
# root_in_hmf_t2 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt1-t2-allsamples-t2/'

#Change accordingly the paths to hmf output
root_in_hmf_t1 = '/path/to/hmf_pipeline/output/'
root_in_hmf_t2 = '/path/to/hmf_pipeline/output/'

In [48]:
# somatic CNV variants
pt = 'case3'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv(root_in_hmf_t2+'purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]


def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])

    return cn_final_df
t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr1,1,95023312,1.0253,1
0,chr2,"[15407882,16499225]","[15454924,16627671]","[291.6401,307.3457]",1
59,chr3,71040532,71099047,1.0782,1
0,chr17,"[58991027,69378970]","[69378834,83257441]","[3.0028,3.0119]",1
0,chr9,"[9229153,9249415]","[9249414,9672000]","[1.3871,1.4776]",2
78,chr21,21395644,21510344,1.3654,2
82,chr22,17229803,34540743,0.9947,2


In [49]:
pt = 'case3'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,t1_cnv_df),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr1,1,95023312,1.0253,1,"[MIB2, SLC35E2A, TNFRSF14, PRDM16, RPL22, ZBTB..."
0,chr2,"[15407882,16499225]","[15454924,16627671]","[291.6401,307.3457]",1,[MYCN]
59,chr3,71040532,71099047,1.0782,1,[]
0,chr17,"[58991027,69378970]","[69378834,83257441]","[3.0028,3.0119]",1,"[CLTC, PPM1D, CD79B, SMURF2, GNA13, AXIN2, PRK..."
0,chr9,"[9229153,9249415]","[9249414,9672000]","[1.3871,1.4776]",2,[]
78,chr21,21395644,21510344,1.3654,2,[]
82,chr22,17229803,34540743,0.9947,2,"[CLTCL1, DGCR8, LZTR1, MAPK1, BCR, SMARCB1, SU..."


In [50]:
segments_df2.to_csv('./table2_paper/case3_cnv.tsv',sep='\t',index=None)

##  Case 4: Burkitt Lymphoma (BL) + Thytoid Carcinoma (THC) 

In [51]:
#This commented code points to the original data
# pt = 'pt8'
# root_out = '/workspace/projects/sjd_pediatric_tumors/mafs_platinum/20220809/'+pt
root_out = './output/case4'

In [52]:
pt = 'case4'
normal = samples[pt]['normal']
g_df = pd.read_csv(root_out+'/'+normal+'/filter_and_annot/haplotype_caller/'+normal+'_filt.maf.gz',sep='\t')
g_ranked_df = ranked_table(g_df)

tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

t1_snv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/filter_and_annot/'+tumor1+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t2_snv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/filter_and_annot/'+tumor2+'_vs_'+normal+'_filt.maf.gz',sep='\t')
t1_snv_ranked_df = ranked_table_snvs(t1_snv_df,0.01)
t2_snv_ranked_df = ranked_table_snvs(t2_snv_df,0.01)

t1_sv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_sv/gridds/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_sv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_sv/gridds/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_cnv_df = pd.read_csv(root_out+'/'+tumor1+'_vs_'+normal+'/process_cnv/purple/'+tumor1+'_vs_'+normal+'.maf.gz',sep='\t')
t2_cnv_df = pd.read_csv(root_out+'/'+tumor2+'_vs_'+normal+'/process_cnv/purple/'+tumor2+'_vs_'+normal+'.maf.gz',sep='\t')

t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)

In [53]:
#somatic variants tumor1 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'cytoband','CNA','CN','mut_sv','fusion','cgc_transl']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='somatic')&((t1_df['clonal']==True)|(t1_df['cgc_transl']==True)|(t1_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,cytoband,CNA,CN,mut_sv,fusion,cgc_transl
599,TP53,somatic,True,LoF,miss_inframe,G245D,0.473,7e-06,chr17:7674229:C>T,True,chr17:p13.1,-,1.9631,,,
600,RECQL4,somatic,True,LoF,miss_inframe,R895G,0.297,0.0,chr8:144512919:T>C,True,chr8:q24.3,-,2.0006,,,
601,ARHGAP35,somatic,True,LoF,miss_inframe,V381L,0.475,0.0,chr19:46919816:G>C,True,chr19:q13.32,-,1.9017,,,
602,FOXO1,somatic,True,Act,miss_inframe,R21C,0.415,0.0,chr13:40666152:G>A,True,chr13:q14.11,-,2.0086,,,
603,BCL6,somatic,True,Act,miss_inframe,A587D,0.467,0.0,chr3:187725578:G>T,True,chr3:q27.3,-,2.0305,,,
604,IGHV4-34,somatic,False,,truncating,Y58*,0.442,1.4e-05,chr14:106373858:G>C,True,chr14:q32.33,amp,2.6802,,,
605,IGHV4-34,somatic,False,,miss_inframe,S61T,0.382,7e-05,chr14:106373850:C>G,True,chr14:q32.33,amp,2.6802,,,
606,IGHV4-34,somatic,False,,miss_inframe,Y59S,0.386,2.8e-05,chr14:106373856:T>G,True,chr14:q32.33,amp,2.6802,,,
607,ATAD2B,somatic,False,,truncating,GL535-536X,0.467,0.0,chr2:23834041:ATCC>-,True,chr2:p23.3,-,2.0372,,,
608,OGT,somatic,False,,truncating,E544*,0.405,0.0,chrX:71559294:G>T,True,chrX:q13.1,-,1.8116,,,


In [54]:
#somatic variants tumor2 (snv, indels, cnv and sv)
cols = ['SYMBOL', 'origin','intogen','role','variant_type', 'aa_change','t_AF', 'gnomADg_AF', 'mut','clonal',
       'CNA','CN','mut_sv','fusion']
t2_df = concat_all_mutations(g_ranked_df,t2_snv_ranked_df,t2_sv_df,t2_cnv_df)
t2_df[cols][(t2_df['origin']=='somatic')&((t2_df['clonal']==True)|(t2_df['cgc_transl']==True)|(t2_df['CNA']!='-'))]

Unnamed: 0,SYMBOL,origin,intogen,role,variant_type,aa_change,t_AF,gnomADg_AF,mut,clonal,CNA,CN,mut_sv,fusion
599,BRAF,somatic,True,Act,miss_inframe,V600E,0.311,0.0,chr7:140753336:A>T,True,-,2.0013,,
600,NOTCH1,somatic,True,ambiguous,miss_inframe,N320S,0.338,0.0,chr9:136518731:T>C,True,-,2.0161,,
601,APLP2,somatic,False,,truncating,-,0.328,0.0,chr11:130120820:T>C,True,-,2.035,,
602,LRBA,somatic,False,,miss_inframe,A1729G,0.354,0.0,chr4:150817243:G>C,True,-,2.0184,,
603,ADAM18,somatic,False,,miss_inframe,P289S,0.158,0.0,chr8:39638502:C>T,True,-,2.033,,
604,CDKN2C,somatic,True,LoF,,,,,,,del,1.0489,,
605,RPL5,somatic,True,LoF,,,,,,,del,1.0489,,
606,RBM15,somatic,True,LoF,,,,,,,del,1.0489,,
607,CD58,somatic,True,LoF,,,,,,,del,1.0489,,
608,MUTYH,somatic,False,LoF,,,,,,,del,1.0489,,


In [55]:
#germline variants
cols = ['SYMBOL', 'origin','germline','role',
       'variant_type', 'aa_change','n_AF_real', 'gnomADg_AF', 'mut']
t1_df = concat_all_mutations(g_ranked_df,t1_snv_ranked_df,t1_sv_df,t1_cnv_df)
t1_df[cols][(t1_df['origin']=='germline')&(t1_df['gnomADg_AF']<.01)&(t1_df['germline']==True)]

Unnamed: 0,SYMBOL,origin,germline,role,variant_type,aa_change,n_AF_real,gnomADg_AF,mut
0,PTCH1,germline,True,LoF,miss_inframe,D436N,0.508,0.000824,chr9:95478096:C>T
1,SDHD,germline,True,LoF,miss_inframe,G12S,0.493,0.006979,chr11:112086941:G>A
2,SDHB,germline,True,LoF,miss_inframe,G53E,0.496,0.000405,chr1:17044803:C>T
3,MET,germline,True,Act,miss_inframe,T1010I,0.514,0.00897,chr7:116771936:C>T
4,PDGFRA,germline,True,Act,miss_inframe,G79D,0.579,0.008872,chr4:54261281:G>A
5,WAS,germline,True,Act,miss_inframe,V332A,0.486,0.004856,chrX:48688723:T>C
6,SERPINA1,germline,True,,truncating,E347X,0.116,0.000253,chr14:94379488:CT>-
7,AR,germline,True,Act,miss_inframe,GGGGG457-461-,1.0,0.009286,chrX:67546515:GGCGGCGGCGGCGGC>-
8,JMJD1C,germline,True,,miss_inframe,F130Y,0.508,4.9e-05,chr10:63264709:A>T
9,SHOC2,germline,True,,miss_inframe,E25G,0.511,7.7e-05,chr10:110964432:A>G


In [56]:
germline_df.to_csv('./table1_paper/case4_germline.tsv',sep='\t',index=None)

In [57]:
#somatic SNV and indels variants
cols = ['#CHROM','POS','REF','ALT','SYMBOL','role','intogen',
       'variant_type','Consequence', 'aa_change','clonal']

t1_snv_df1 = t1_snv_df[cols]
t1_snv_df1['Tumor'] = 1
t2_snv_df1 = t2_snv_df[cols]
t2_snv_df1['Tumor'] = 2
snv_df1 = pd.concat([t1_snv_df1,t2_snv_df1])
snv_df1.sort_values(['Tumor','clonal','intogen','variant_type'],ascending=[True,False,False,False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,#CHROM,POS,REF,ALT,SYMBOL,role,intogen,variant_type,Consequence,aa_change,clonal,Tumor
1084,chr13,40666152,G,A,FOXO1,Act,True,miss_inframe,missense_variant,R21C,True,1
1700,chr17,7674229,C,T,TP53,LoF,True,miss_inframe,missense_variant,G245D,True,1
1998,chr19,46919816,G,C,ARHGAP35,LoF,True,miss_inframe,missense_variant,V381L,True,1
3083,chr3,187725578,G,T,BCL6,Act,True,miss_inframe,missense_variant,A587D,True,1
4761,chr8,144512919,T,C,RECQL4,LoF,True,miss_inframe,missense_variant,R895G,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
429,chrX,118499583,C,T,DOCK11,,False,,intron_variant,-,False,2
430,chrX,119447007,G,A,SLC25A43,,False,,intron_variant,-,False,2
432,chrX,143091310,C,-,RN7SKP81,,False,,downstream_gene_variant,-,False,2
434,chrX,144321775,C,T,-,,False,,intergenic_variant,-,False,2


In [58]:
snv_df1.to_csv('./table2_paper/case4_snv_indels.tsv',sep='\t',index=None)

In [59]:
#somatic SV variants
cols = ['SYMBOL','intogen','role','mut','fusion','cgc_transl','chr/chr','sv_type','distance']
t1_sv_df1 = t1_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t2_sv_df1 = t2_sv_df[cols].sort_values(['intogen','sv_type'],ascending=[False,True])
t1_sv_df1['Tumor'] = 1
t2_sv_df1['Tumor'] = 2
sv_df1 = pd.concat([t1_sv_df1,t2_sv_df1])
sv_df1

Unnamed: 0,SYMBOL,intogen,role,mut,fusion,cgc_transl,chr/chr,sv_type,distance,Tumor
8,FHIT,True,ambiguous,chr3:60334050:T>T[chr3:60343507[,FHIT-del,True,chr3,del,9457,1
15,DYDC2,False,,chr10:80356996:C>C[chr10:80357274[,DYDC2-del,False,chr10,del,278,1
20,RNF215,False,,nan:30387399:G>G[chr22:30387438[,RNF215-del,False,chr22,del,39,1
0,,False,,chr1:194481532:C>C[chr1:194481569[,-/-,False,chr1,inv,37,1
1,,False,,chr1:194481569:T>]chr1:194481532]T,-/-,False,chr1,inv,-37,1
2,,False,,chr2:88861251:A>AGGGGC[chr2:89196082[,-/-,False,chr2,inv,334831,1
3,,False,,chr2:88861257:A>A]chr2:88886153],-/-,False,chr2,inv,24896,1
4,,False,,chr2:88861924:C>[chr2:88897787[C,-/-,False,chr2,inv,35863,1
5,,False,,chr2:88886153:T>T]chr2:88861257],-/-,False,chr2,inv,-24896,1
6,,False,,chr2:88897787:C>[chr2:88861924[C,-/-,False,chr2,inv,-35863,1


In [60]:
sv_df1.to_csv('./table2_paper/case4_sv.tsv',sep='\t',index=None)

In [61]:
#This commented code points to the original data
# root_in_hmf_t1 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt8-t1-allsamples-t1/'
# root_in_hmf_t2 = '/workspace/datasets/sjd_seq/platinum_results/20220809/pt8-t2-allsamples-t2/'

#Change accordingly the paths to hmf output
root_in_hmf_t1 = '/path/to/hmf_pipeline/output/'
root_in_hmf_t2 = '/path/to/hmf_pipeline/output/'

In [62]:
#somatic CNV variants
pt = 'case4'
tumor1 = samples[pt]['tumor1']
tumor2 = samples[pt]['tumor2']
normal = samples[pt]['normal']

cols = ['chromosome','start','end','copyNumber','Tumor']
t1_segments_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor1+'.purple.cnv.somatic.tsv',sep='\t')
t2_segments_df = pd.read_csv(root_in_hmf_t2+'purple/'+tumor2+'.purple.cnv.somatic.tsv',sep='\t')

t1_segments_df1 = t1_segments_df[((t1_segments_df['copyNumber']>2.5)|(t1_segments_df['copyNumber']<1.5))&(t1_segments_df['bafCount']>5)]
t2_segments_df1 = t2_segments_df[((t2_segments_df['copyNumber']>2.5)|(t2_segments_df['copyNumber']<1.5))&(t2_segments_df['bafCount']>5)]

def cnv_final_table(df1):
    chr_list = df1['chromosome'].unique()
    cols = ['chromosome','start','end','copyNumber']
    cn_final_df = pd.DataFrame(columns=cols)
    for chrom in chr_list:
        if len(df1[df1['chromosome']==chrom]) == 1:
            df2 = df1[df1['chromosome']==chrom]
        else:
            min_start = df1[df1['chromosome']==chrom]['start'].min()
            max_start = df1[df1['chromosome']==chrom]['start'].max()
            min_end = df1[df1['chromosome']==chrom]['end'].min()
            max_end = df1[df1['chromosome']==chrom]['end'].max()
            min_cn = df1[df1['chromosome']==chrom]['copyNumber'].min()
            max_cn = df1[df1['chromosome']==chrom]['copyNumber'].max()
            dict1 = {}
            dict1['chromosome'] = chrom
            dict1['start'] = '['+str(min_start)+','+str(max_start)+']'
            dict1['end'] = '['+str(min_end)+','+str(max_end)+']'
            dict1['copyNumber'] = '['+str(min_cn)+','+str(max_cn)+']'
            df2 = pd.DataFrame.from_dict({0:dict1},orient='index')
        cn_final_df = pd.concat([cn_final_df,df2])

    return cn_final_df

t1_segments_df2 = cnv_final_table(t1_segments_df1)
t2_segments_df2 = cnv_final_table(t2_segments_df1)
t1_segments_df2['Tumor'] = 1
t2_segments_df2['Tumor'] = 2
segments_df2 = pd.concat([t1_segments_df2[cols],t2_segments_df2[cols]])
segments_df2

Unnamed: 0,chromosome,start,end,copyNumber,Tumor
0,chr14,"[105864255,106373661]","[106373660,107043718]","[-0.0334,2.6802]",1
1,chr1,43950914,119287566,1.0489,2


In [63]:
pt = 'case4'
tumor = samples[pt]['tumor1']
t1_cnv_df = pd.read_csv(root_in_hmf_t1+'purple/'+tumor+'.purple.cnv.gene.tsv',sep='\t')

def add_genes (row,df1):
    chrom = row['chromosome']
    start_segment = row['start']
    end_segment = row['end']
    df1 = t1_cnv_df[t1_cnv_df['chromosome']==chrom]
    gene_list = df1['gene'].unique()
    driver_list = [gene for gene in gene_list if gene in intogen_drivers]
    segment_gene_list = []
    if type(start_segment) == str:
        start_segment = start_segment.split('[')[1].split(',')[0]
        end_segment = end_segment.split(']')[0].split(',')[1]
    for gene in driver_list:
        start_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['start']
        end_gene = df1[(df1['gene']==gene)&(df1['isCanonical']==True)]['end']
        if int(start_gene) > int(start_segment) and int(end_gene) < int(end_segment):
            segment_gene_list.append(gene)
    return segment_gene_list

segments_df2['driver genes'] = segments_df2.progress_apply(lambda row: add_genes(row,t1_cnv_df),axis=1)
segments_df2

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,chromosome,start,end,copyNumber,Tumor,driver genes
0,chr14,"[105864255,106373661]","[106373660,107043718]","[-0.0334,2.6802]",1,[]
1,chr1,43950914,119287566,1.0489,2,"[TAL1, STIL, CDKN2C, EPS15, JUN, JAK1, FUBP1, ..."


In [64]:
segments_df2.to_csv('./table2_paper/case4_cnv.tsv',sep='\t',index=None)

_____________________________________________________________