This notebook provides a list of the candidate driver mutations in coding regions in TALL, separated into three subsets: private to the primary, private to the relapse, and shared. It takes the list of mutations output by driver_mutations_primary_ALL.ipynb. The list is used in Figure 2b of the paper, in Additional file 1 Figure S4, and in Additional file 2 Table S5.

In [None]:
import os
import pandas as pd
import numpy as np
from aux_functions import get_three_subsets

In [None]:
# FUNCTIONS

def convert_set_2_df(set_obj):
    """Turn a collection of variant identifiers into a DataFrame.

    Each identifier is a string of the form ``CHROM_POS_REF_ALT``.

    Parameters
    ----------
    set_obj : iterable of str
        Variant identifiers. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per identifier, in iteration order, with the columns
        ``#CHROM`` (str), ``POS`` (int), ``REF`` (str) and ``ALT`` (str).
        An empty input yields an empty frame that still has these columns.

    Raises
    ------
    ValueError
        If an identifier has fewer than four ``_``-separated fields or its
        position field is not an integer.
    """
    columns = ["#CHROM", "POS", "REF", "ALT"]

    records = []
    for variant in set_obj:
        # Split from the right so that a chromosome/contig name that itself
        # contains underscores stays in one piece.
        chr_, pos, ref, alt = variant.rsplit("_", 3)
        records.append((str(chr_), int(pos), ref, alt))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and appending row by row is quadratic. Passing ``columns`` keeps the
    # schema even when there are no records.
    df = pd.DataFrame(records, columns=columns)
    df["POS"] = df["POS"].astype(int)
    df[["#CHROM", "REF", "ALT"]] = df[["#CHROM", "REF", "ALT"]].astype(str)
    return df

In [None]:
# Cohorts that contain T-ALL samples
tall_cohorts = [
    'PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)',
    'ADULT TALL AECC PROJECT',
    'PEDIATRIC ALL (Li et al., 2019, Blood)',
]

# Input: candidate_driver_muts.tsv, available as Additional file 1 Table S5.
# NOTE(review): the notebook header refers to "Additional file 2 Table S5";
# confirm which file number is correct.
# The additional tables do not include the mutations from Li et al., 2020
# Blood; add those from the supplementary material of their paper
# (PMID: 31697823).
driver_muts = pd.read_csv("candidate_driver_muts.tsv", sep='\t')

# Keep only rows from the T-ALL cohorts that are also typed as TALL.
in_tall_cohort = driver_muts['COHORT'].isin(tall_cohorts)
is_tall_type = driver_muts['TYPE'] == 'TALL'
driver_muts = driver_muts[in_tall_cohort & is_tall_type]

In [None]:
# Sanity check: show the distinct cohorts left after filtering.
driver_muts['COHORT'].unique()

In [None]:
# DEFINE SUBSETS
#
# For every patient, split the candidate driver mutations into three subsets
# (private to the primary, private to the relapse, shared by both) and collect
# them in ``candidate_drivers_mutations`` with a 'subset' label column.

VARIANT_KEYS = ["#CHROM", "POS", "REF", "ALT"]


def _with_variant_id(df):
    """Return a copy of ``df`` with a 'Variant' column (CHROM_POS_REF_ALT).

    An empty frame gets an empty, NaN-filled 'Variant' column so that the
    column always exists.
    """
    # Copy first: ``df`` is a filtered slice of the patient table, and adding
    # a column to such a slice in place triggers SettingWithCopyWarning.
    df = df.copy()
    if df.empty:
        df['Variant'] = np.nan
    else:
        df['Variant'] = (
            df['#CHROM'] + '_' + df['POS'].astype(str) + '_' + df['REF'] + '_' + df['ALT']
        )
    return df


grps_patient = driver_muts.groupby("PATIENT")
subset_frames = []

for patient, df_pat in grps_patient:

    df_pry = _with_variant_id(df_pat[df_pat['STAGE'] == 'primary'])
    df_rel = _with_variant_id(df_pat[df_pat['STAGE'] == 'relapse'])

    # make subsets
    shared, private_pry, private_rel = get_three_subsets(df_pry, df_rel)

    # (variants, table that supplies the annotation columns, subset label).
    # Shared (trunk) mutations take their annotations from the primary sample.
    subsets = (
        (private_pry, df_pry, 'private_primary'),
        (private_rel, df_rel, 'private_relapse'),
        (shared, df_pry, 'shared'),
    )

    for variants, annotation_df, label in subsets:
        if len(variants) == 0:
            continue
        subset_df = convert_set_2_df(variants)
        subset_df = subset_df.merge(annotation_df, on=VARIANT_KEYS, how='left')
        subset_df['subset'] = label
        subset_frames.append(subset_df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# growing a frame inside the loop copies it on every iteration.
if subset_frames:
    candidate_drivers_mutations = pd.concat(subset_frames, ignore_index=True, sort=False)
else:
    # pd.concat raises on an empty list; keep the original empty-frame result.
    candidate_drivers_mutations = pd.DataFrame()


In [None]:
# Number of rows in the combined table of private and shared mutations.
len(candidate_drivers_mutations)

In [None]:
# Directory to write the results to. Defaults to the current working
# directory; set it to the desired output location before running.
out_path = "."
# Write the subsets table as a tab-separated file without the row index.
candidate_drivers_mutations.to_csv(os.path.join(out_path, "driver_muts_TALL_subsets.tsv"), sep='\t', index=False)