This notebook builds the list of candidate coding driver mutations in the genes of the cancer gene list obtained from the IntOGen run. It also performs a statistical test to compare the frequency of mutated genes between BALL and TALL. The list of mutations is used in Figure 1d and Additional file 1 Figure S2.

This piece of code relies on a workspace directory structure such as 
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
 patientID, DxTumorID etc can be found in ../ext_files/all_cohort_clinical_groups.tsv
 
Be aware that the filtered MAFs with clonal classification and joined mutations, produced by running the scripts in ```filter/```, have the following file name: ```TumorID_vs_normalID + _strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf```.
This file name is used in the following code.

In [None]:
import pandas as pd
import os
import numpy as np
import glob
import seaborn
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from aux_data_in_pyvar import pat_dirs
from decimal import Decimal

# Show complete data frames in cell outputs: no cap on the number of columns
# or rows, and no truncation of long cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None is the documented "unlimited" value for max_colwidth; the former -1
# has been deprecated since pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

#### Read clinical data

In [None]:
## WGS from ST JUDE and ADULT FROM TALL

# Cohorts whose patient identifiers get the 'oshima_pat_' prefix.
oshima_cohorts = ('PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)',
                  'PEDIATRIC BALL WXS (Oshima et al., 2016; PNAS)')

df_info = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')

# Subtype names are used with underscores instead of spaces from here on.
df_info['SUBTYPE'] = df_info['SUBTYPE'].str.replace(" ", "_")

# A row belongs to at most one cohort, so a single pass covers both Oshima cohorts.
df_info['PATIENT'] = df_info.apply(
    lambda row: 'oshima_pat_' + str(row['PATIENT']) if row['COHORT'] in oshima_cohorts else row['PATIENT'],
    axis=1,
)

# Discard samples whose subtype is 'Other'.
df_info = df_info.loc[df_info['SUBTYPE'].ne('Other')]

#### Read cancer genes of interest

In [None]:
## cancer genes detected in mutations

# Tab-separated table of the cancer genes of interest
# (results in Additional file 2 Table S3).
drivers_file = "cancer_genes_ALL.csv"
drivers = pd.read_csv(drivers_file, sep='\t')

# Preview the first rows of the table.
drivers.head(5)

#### Read maf files with mutations and get the protein affecting mutations of the list of cancer genes

In [None]:
## protein altering consequence types from VEP

# Consequence terms that are kept when filtering the MAF rows of each sample.
damaging_conseq = [
    'transcript_ablation',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'stop_gained',
    'frameshift_variant',
    'stop_lost',
    'start_lost',
    'transcript_amplification',
    'inframe_insertion',
    'inframe_deletion',
    'missense_variant',
    'protein_altering_variant',
    'splice_region_variant',
    'incomplete_terminal_codon_variant',
    'start_retained_variant',
    'stop_retained_variant',
]

In [None]:
#ADULT MUTATIONS
# For every adult comparison: read its filtered MAF, keep the protein-affecting
# mutations that fall in the cancer genes of interest, and annotate them with
# the clinical information of the sample and the ROLE/PATHWAY of the gene.
# This cell (re)creates `df_all`, the table the following cohort cells extend.

adult_maf_suffix = '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf'
adult_mutation_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'AA_change', 'SYMBOL', 'COMPARISON']
adult_output_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'SYMBOL', 'Consequence', 'AA_change', 'COMPARISON',
                     'PATIENT', 'ROLE', 'STAGE', 'AGE_RANGE', 'TYPE', 'SUBTYPE', 'SUBTYPE_LABEL', 'PATHWAY', 'COHORT']

# Loop invariants: gene symbols of interest and their annotation.
adult_driver_symbols = drivers['SYMBOL'].unique()
adult_driver_annot = drivers[['SYMBOL', 'ROLE', 'PATHWAY']].drop_duplicates()

# Per-sample tables are collected and concatenated once; growing a frame with
# DataFrame.append inside the loop copies it every time and the method no
# longer exists in pandas >= 2.0.
adult_frames = []

for com in df_info.loc[df_info['SUBTYPE'].str.contains("Adult"), 'COMPARISON']:
    pat_info = df_info[df_info['COMPARISON'] == com].reset_index(drop=True)
    patient = pat_info.loc[0, 'PATIENT']
    df = pd.read_csv(os.path.join(pat_dirs[patient], patient, com, com + adult_maf_suffix), sep='\t')

    keep = df['Consequence'].isin(damaging_conseq) & df['SYMBOL'].isin(adult_driver_symbols)
    # .copy() so the column assignments below act on an independent frame.
    df = df[keep].copy()
    if df.empty:
        # Same guard as in the other cohort cells: a row-wise apply on an empty
        # frame does not return a Series, so 'AA_change' could not be assigned.
        continue

    df['COMPARISON'] = com
    # Amino_acids "ref/alt" -> ref + position + alt; without "/" -> position + Amino_acids.
    df['AA_change'] = df.apply(lambda x: x['Amino_acids'].split('/')[0]+x['Protein_position']+x['Amino_acids'].split('/')[1] if "/" in x['Amino_acids'] else x['Protein_position']+x['Amino_acids'], axis=1)
    df = df[adult_mutation_cols].merge(pat_info, how='left', on='COMPARISON')
    df = df.merge(adult_driver_annot, how='left', on='SYMBOL')
    df = df[adult_output_cols].sort_values('AGE_RANGE', ascending=False)
    adult_frames.append(df)

df_all = pd.concat(adult_frames, ignore_index=True, sort=False) if adult_frames else pd.DataFrame()

In [None]:
# Subtypes selected for the pediatric cohorts read in the next cell.
order_subtypes = [
    'TALL_Pediatric_pry',
    'DUX4-ERG',
    'Hypodiploid',
    'Hyperdiploid',
    'PHALL',
    'Ph-like',
    'iAMP21',
    'Infant_MLL-R',
]

# Clinical rows whose subtype is one of the selected subtypes.
is_selected_subtype = df_info['SUBTYPE'].isin(order_subtypes)
info_pedia = df_info[is_selected_subtype]

In [None]:
## PEDIATRIC COHORTS STJUDE
# For every pediatric sample in `info_pedia`: locate its filtered MAF, keep the
# protein-affecting mutations in the cancer genes of interest, annotate them
# and add them to `df_all`. Samples without a MAF file are reported and skipped.

input_files_path = "" # filtered MAF files path

# NOTE(review): this pattern has no '_snps' in the file name, unlike the MAF
# names used for the other cohorts -- confirm it matches the files on disk.
stjude_maf_pattern = '*_strelka_uniq_all_anno_vep92_categories_filt_cluster.maf'
stjude_mutation_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'AA_change', 'SYMBOL', 'COMPARISON']
stjude_output_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'SYMBOL', 'Consequence', 'AA_change', 'COMPARISON',
                      'PATIENT', 'ROLE', 'STAGE', 'AGE_RANGE', 'TYPE', 'SUBTYPE', 'SUBTYPE_LABEL', 'PATHWAY', 'COHORT']

# Loop invariants: gene symbols of interest and their annotation.
stjude_driver_symbols = drivers['SYMBOL'].unique()
stjude_driver_annot = drivers[['SYMBOL', 'ROLE', 'PATHWAY']].drop_duplicates()

# Per-sample tables, concatenated to df_all once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
stjude_frames = []

for i, rw in info_pedia.iterrows():
    maf_matches = glob.glob(os.path.join(input_files_path, '*',
                                         'maf_calls', rw['PATIENT'],
                                         rw['COMPARISON'], stjude_maf_pattern))
    if not maf_matches:
        # Explicit "no file" check instead of catching IndexError around the
        # whole body, which could hide unrelated indexing errors.
        print("{} {}".format(rw['PATIENT'], rw['COMPARISON']))
        continue

    df = pd.read_csv(maf_matches[0], sep='\t')
    pat_info = info_pedia[info_pedia['COMPARISON'] == rw['COMPARISON']].reset_index(drop=True)

    keep = df['Consequence'].isin(damaging_conseq) & df['SYMBOL'].isin(stjude_driver_symbols)
    # .copy() so the column assignments below act on an independent frame.
    df = df[keep].copy()
    if df.empty:
        continue

    df['COMPARISON'] = rw['COMPARISON']
    # Amino_acids "ref/alt" -> ref + position + alt; without "/" -> position + Amino_acids.
    df['AA_change'] = df.apply(lambda x: x['Amino_acids'].split('/')[0]+x['Protein_position']+x['Amino_acids'].split('/')[1] if "/" in x['Amino_acids'] else x['Protein_position']+x['Amino_acids'], axis=1)
    df = df[stjude_mutation_cols].merge(pat_info, how='left', on='COMPARISON')
    df = df.merge(stjude_driver_annot, how='left', on='SYMBOL')
    df = df[stjude_output_cols].sort_values('AGE_RANGE', ascending=False)
    stjude_frames.append(df)

if stjude_frames:
    df_all = pd.concat([df_all] + stjude_frames, ignore_index=True, sort=False)
    

In [None]:
## OSHIMA 2015 PEDIATRIC WXS 
# For every "Pediatric_WXS" comparison: read its filtered MAF, keep the
# protein-affecting mutations in the cancer genes of interest, annotate them
# and add them to `df_all`. Missing files and samples without any matching
# mutation are reported and skipped.

dire_mafs = "" # filtered MAF files path

oshima_maf_suffix = '_strelka_uniq_all_anno_vep92_categories_filt_snps_cluster.maf'
oshima_mutation_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'AA_change', 'SYMBOL', 'COMPARISON']
oshima_output_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'SYMBOL', 'Consequence', 'AA_change', 'COMPARISON',
                      'PATIENT', 'ROLE', 'STAGE', 'AGE_RANGE', 'TYPE', 'SUBTYPE', 'SUBTYPE_LABEL', 'PATHWAY', 'COHORT']

# Loop invariants: gene symbols of interest and their annotation.
oshima_driver_symbols = drivers['SYMBOL'].unique()
oshima_driver_annot = drivers[['SYMBOL', 'ROLE', 'PATHWAY']].drop_duplicates()

# Per-sample tables, concatenated to df_all once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
oshima_frames = []

for com in df_info.loc[df_info['SUBTYPE'].str.contains("Pediatric_WXS"), 'COMPARISON']:
    pat_info = df_info[df_info['COMPARISON'] == com].reset_index(drop=True)
    patient = pat_info.loc[0, 'PATIENT']

    # Only the file read can raise FileNotFoundError, so only it is guarded.
    try:
        df = pd.read_csv(os.path.join(dire_mafs, patient, com, com + oshima_maf_suffix), sep='\t')
    except FileNotFoundError:
        print("No file {} {}".format(patient, com))
        continue

    keep = df['Consequence'].isin(damaging_conseq) & df['SYMBOL'].isin(oshima_driver_symbols)
    # .copy() so the column assignments below act on an independent frame.
    df = df[keep].copy()
    if df.empty:
        print("Empty df {} {}".format(patient, com))
        continue

    df['COMPARISON'] = com
    # Amino_acids "ref/alt" -> ref + position + alt; without "/" -> position + Amino_acids.
    df['AA_change'] = df.apply(lambda x: x['Amino_acids'].split('/')[0]+x['Protein_position']+x['Amino_acids'].split('/')[1] if "/" in x['Amino_acids'] else x['Protein_position']+x['Amino_acids'], axis=1)
    df = df[oshima_mutation_cols].merge(pat_info, how='left', on='COMPARISON')
    df = df.merge(oshima_driver_annot, how='left', on='SYMBOL')
    df = df[oshima_output_cols].sort_values('AGE_RANGE', ascending=False)
    oshima_frames.append(df)

if oshima_frames:
    df_all = pd.concat([df_all] + oshima_frames, ignore_index=True, sort=False)

In [None]:
## BLOOD PAPER CHINESE COHORT LI ET AL 2020 
# Go to the supplementary materials of Li et al., 2020 PMID: 31697823 and make a dataframe with the clinical information
# like the one in Additional file 2 Table S2 of the paper.
# The path below is a placeholder: set it to that clinical table before running.
df_info_chinese = pd.read_csv("", sep='\t')  
df_info_chinese_TALL = df_info_chinese[df_info_chinese['TYPE'] == 'TALL']
print(len(df_info_chinese_TALL['PATIENT'].unique()))

# Values of the 'Class' column that are kept for this cohort.
damaging_conseq_chinese = ['missense', 'proteinIns', 'frameshift', 
'nonsense', 'splice_region', 'splice','proteinDel','insertion']

dire_mafs = ""  # filtered MAF files path

chinese_mutation_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'AA_change', 'SYMBOL', 'COMPARISON']
# No 'SUBTYPE_LABEL' here, unlike the other cohorts; the column is filled
# with NaN for these rows when they are concatenated to df_all.
chinese_output_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'SYMBOL', 'Consequence', 'AA_change', 'COMPARISON',
                       'PATIENT', 'ROLE', 'STAGE', 'AGE_RANGE', 'TYPE', 'SUBTYPE', 'PATHWAY', 'COHORT']

# Loop invariants: gene symbols of interest and their annotation.
chinese_driver_symbols = drivers['SYMBOL'].unique()
chinese_driver_annot = drivers[['SYMBOL', 'ROLE', 'PATHWAY']].drop_duplicates()

# Per-sample tables, concatenated to df_all once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
chinese_frames = []

for com in df_info_chinese_TALL['COMPARISON']:
    pat_info = df_info_chinese_TALL[df_info_chinese_TALL['COMPARISON'] == com].reset_index(drop=True)
    patient = pat_info.loc[0, 'PATIENT']

    try:
        df = pd.read_csv(os.path.join(dire_mafs, patient, com, 'mutations_supp.maf'), sep='\t')
    except FileNotFoundError:
        # Report the directory of the current sample. The patient is taken from
        # pat_info (index reset above); the filtered cohort table keeps its
        # original index, so label 0 is not guaranteed to exist there.
        print(os.path.join(dire_mafs, patient, com))
        continue

    keep = df['Class'].isin(damaging_conseq_chinese) & df['Gene'].isin(chinese_driver_symbols)
    df = df[keep]
    if df.empty:
        continue

    # Expose this cohort's columns under the names used for the other cohorts.
    df = df.assign(COMPARISON=com,
                   AA_change=df['AAChange'],
                   SYMBOL=df['Gene'],
                   Consequence=df['Class'])
    df = df[chinese_mutation_cols].merge(pat_info, how='left', on='COMPARISON')
    df = df.merge(chinese_driver_annot, how='left', on='SYMBOL')
    df = df[chinese_output_cols].sort_values('AGE_RANGE', ascending=False)
    chinese_frames.append(df)

if chinese_frames:
    df_all = pd.concat([df_all] + chinese_frames, ignore_index=True, sort=False)

In [None]:
out_path = "" # path to write the results
# Write the combined mutation table as a tab-separated file without the index.
# `sep` and `index` are arguments of to_csv, not of os.path.join.
df_all.to_csv(os.path.join(out_path, "candidate_driver_muts.tsv"), sep='\t', index=False)

#### Test for differences between the BALL and TALL cohorts

In [None]:
# One row per (PATIENT, SUBTYPE_LABEL, TYPE) among the primary-stage mutations,
# leaving out the 'PEDIATRIC ALL (Li et al., 2019, Blood)' cohort.
# NOTE(review): this rebinds `df_info`, replacing the clinical table read at
# the top of the notebook; re-running earlier cells afterwards needs a reload.
# NOTE(review): the patients come from df_all, so a patient with no mutation
# in df_all does not appear here -- confirm this is intended for the test.
is_primary = df_all['STAGE'] == 'primary'
not_li_cohort = df_all['COHORT'] != 'PEDIATRIC ALL (Li et al., 2019, Blood)'
df_info = df_all.loc[is_primary & not_li_cohort, ['PATIENT', 'SUBTYPE_LABEL', 'TYPE']].drop_duplicates()
df_info

In [None]:
# Define cancer gene of interest
cancer_gene = 'NOTCH1'

# Patients that carry at least one mutation in the gene of interest
# (one row per SYMBOL/PATIENT pair).
mutated_patients = df_all.loc[df_all['SYMBOL'] == cancer_gene, ['SYMBOL', 'PATIENT']].drop_duplicates()

# Left merge keeps every patient of df_info; SYMBOL stays missing for the
# patients without a mutation in the gene.
test_gene = df_info.merge(mutated_patients, how='left', on='PATIENT')

# A patient is flagged as mutated when the merge found a SYMBOL for it.
test_gene['Mutated'] = test_gene['SYMBOL'].notna()

In [None]:
# create cont table
# Rows: mutated status (False/True); columns: values of TYPE; cells: patient counts.
mutated_status = test_gene['Mutated']
all_type = test_gene['TYPE']
contingency_table = pd.crosstab(mutated_status, all_type)
contingency_table

In [None]:
# Chi-square test of independence on the contingency table, with the
# continuity correction switched off.
stat_chi2, p, dof, expected = chi2_contingency(contingency_table, correction=False)

# p-value in scientific notation with two decimals ...
print('%.2E' % Decimal(p))

# ... and in fixed-point notation with three decimals.
output = format(p, '.3f')
print(output)

In [None]:
# interpret test-statistic
# Compare the statistic with the chi-square quantile at probability 0.90
# for `dof` degrees of freedom.
prob = 0.90
critical = chi2.ppf(prob, dof)
verdict = 'Dependent (reject H0)' if abs(stat_chi2) >= critical else 'Independent (fail to reject H0)'
print(verdict)

In [None]:
# interpret test-statistic
# Same comparison as for 0.90, now with the chi-square quantile at
# probability 0.95 for `dof` degrees of freedom.
prob = 0.95
critical = chi2.ppf(prob, dof)
verdict = 'Dependent (reject H0)' if abs(stat_chi2) >= critical else 'Independent (fail to reject H0)'
print(verdict)