Joint list of the driver copy number variants of all T-ALL cohorts. The simplified list is provided in Additional file 2: Table S6a, in Figure 2a, and in Additional file 1: Figure S4.

In [None]:
import sys, os
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
import pandas as pd
import numpy as np
import glob
from collections import defaultdict
import pybedtools

# Show complete frames (every column, every row, untruncated cells) when a
# table is inspected in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
try:
    # pandas >= 1.0 spells "no limit" as None
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # older pandas only accepts the legacy -1 sentinel
    pd.set_option('display.max_colwidth', -1)

In [None]:
## FUNCTION

def cnv_make_simple(df_cnv, drivers_lite):
    """Expand candidate CNVs to one record per gene and label them by band.

    Each row of ``df_cnv`` carries a comma-separated gene list in
    ``driver_cnv``. The rows are split into one record per gene, every record
    receives the pathway of its gene, and the records of each
    (patient, gene, alteration, subset, level_confidence) group get a common
    ``SYMBOL`` label made of the gene name and the text returned by
    ``fix_band_anntotation`` for that group.

    Parameters
    ----------
    df_cnv : pandas.DataFrame
        Needs the columns ``patient``, ``driver_cnv``, ``alteration``,
        ``band_name``, ``subset`` and ``level_confidence``.
    drivers_lite : pandas.DataFrame
        Needs the columns ``SYMBOL`` and ``PATHWAY``. When a symbol appears
        more than once, its first row is used.

    Returns
    -------
    pandas.DataFrame
        The labelled records with duplicates removed; an empty frame when
        ``df_cnv`` yields no record. A gene whose pathway is not a string
        (e.g. NaN) or that is absent from ``drivers_lite`` gets the pathway
        ``'other'``.
    """
    # first pathway listed for every symbol
    pathway_by_gene = {}
    for symbol, pathway in zip(drivers_lite['SYMBOL'], drivers_lite['PATHWAY']):
        pathway_by_gene.setdefault(symbol, pathway)

    records = []
    for _, rw in df_cnv.iterrows():
        for gene in rw['driver_cnv'].split(","):
            pathway = pathway_by_gene.get(gene)
            records.append({
                'patient': rw['patient'],
                'driver_cnv': gene,
                'alteration': rw['alteration'],
                'band_name': rw['band_name'],
                'subset': rw['subset'],
                'level_confidence': rw['level_confidence'],
                'PATHWAY': pathway if isinstance(pathway, str) else 'other',
            })

    if not records:
        return pd.DataFrame()

    df_undo = pd.DataFrame(records)
    grps = df_undo.groupby(['patient', 'driver_cnv', 'alteration', 'subset', 'level_confidence'])

    labelled = []
    for _, group in grps:
        # work on a private copy so nothing done while labelling touches df_undo
        group = group.copy()
        band = fix_band_anntotation(group)
        group['SYMBOL'] = group['driver_cnv'] + ' ' + band
        labelled.append(group.drop_duplicates(keep='first'))

    if not labelled:
        return pd.DataFrame()
    # a single concat replaces the row-by-row DataFrame.append (removed in pandas 2.0)
    return pd.concat(labelled, ignore_index=True)

def fix_band_anntotation(df):
    """Build one cytoband label for a set of CNV rows.

    Every ``band_name`` is split at the dots into a band (the text before the
    first dot) and an integer sub-band (the text between the first and the
    second dot; 0 when the name has no dot). The label is the band of the row
    with the lowest sub-band, followed by the distinct non-zero sub-bands in
    ascending order joined with ``-``, and the alteration of that same row in
    parentheses, e.g. ``'p21.1-3 (del)'``. When no row has a sub-band the
    label is just ``'<band> (<alteration>)'`` with no trailing dot.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the string columns ``band_name`` and ``alteration``. The frame
        is only read, never modified.

    Returns
    -------
    str
        The band label.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    parsed = []
    for band_name, alteration in zip(df['band_name'], df['alteration']):
        pieces = band_name.split('.')
        suband = int(pieces[1]) if len(pieces) > 1 else 0
        parsed.append((suband, pieces[0], alteration))

    # the row with the lowest sub-band supplies the band and the alteration
    _, band, alteration = min(parsed, key=lambda item: item[0])
    subands = sorted({suband for suband, _, _ in parsed if suband != 0})

    label = band
    if subands:
        label += '.' + '-'.join(str(suband) for suband in subands)
    return label + ' (' + alteration + ')'

def check_candidates(rw, drivers_lite):
    """Fill ``rw['driver_cnv']`` with the genes retained for one CNV row.

    For rows whose ``level_confidence`` is ``'cnv_in_interesting_gene'`` a
    gene is retained only when its role does not contradict the alteration:
    a ``'del'`` keeps genes whose ``ROLE`` is not ``'act'`` and an ``'amp'``
    keeps genes whose ``ROLE`` is not ``'lof'``; any other alteration keeps
    nothing. A gene absent from ``drivers_lite`` is not retained and the row
    is printed so that it can be inspected. For every other confidence level
    all listed genes are retained.

    Repeated genes are reported once, in their order of first appearance, so
    the result is the same on every run.

    Parameters
    ----------
    rw : pandas.Series
        One CNV row with ``level_confidence``, ``genes_of_interest``
        (comma-separated symbols) and ``alteration``.
    drivers_lite : pandas.DataFrame
        Needs the columns ``SYMBOL`` and ``ROLE``. When a symbol appears more
        than once, its first row is used.

    Returns
    -------
    pandas.Series
        ``rw`` itself, with ``driver_cnv`` set to the comma-joined retained
        genes (an empty string when none is retained).
    """
    genes = rw['genes_of_interest'].split(",")

    if rw['level_confidence'] == 'cnv_in_interesting_gene':
        # role that contradicts each kind of alteration
        conflicting_role = {'del': 'act', 'amp': 'lof'}.get(rw['alteration'])
        compatible = []
        if conflicting_role is not None:
            for gen in genes:
                roles = drivers_lite.loc[drivers_lite['SYMBOL'] == gen, 'ROLE']
                if roles.empty:
                    # gene without annotation: report the row and skip the gene
                    print(rw)
                    continue
                if roles.iloc[0] != conflicting_role:
                    compatible.append(gen)
        genes = compatible

    # order-preserving de-duplication (a plain set would shuffle the genes)
    seen = set()
    unique_genes = []
    for gen in genes:
        if gen not in seen:
            seen.add(gen)
            unique_genes.append(gen)

    rw['driver_cnv'] = ','.join(unique_genes)
    return rw

In [None]:
# folder where the results are written
out_path = ""

# literature-curated CNVs known in ALL
drivers_cnv = pd.read_csv("../ext_files/literature/cnv_lite.tsv", sep='\t')

# literature-curated cancer genes (mutations) in ALL
drivers_muts = pd.read_csv("../ext_files/literature/mutations_lite.tsv", sep='\t')

# clinical data; patients of the Oshima cohort get the 'oshima_pat_' prefix
df_info = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')
df_info['PATIENT'] = [
    'oshima_pat_' + str(patient)
    if cohort == 'PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)'
    else patient
    for patient, cohort in zip(df_info['PATIENT'], df_info['COHORT'])
]

In [None]:
# Folder holding the processed CNV files of each cohort, keyed by cohort name.
# Fill in the three paths before running the cells below.
dicc_paths_cnv = {
    'ADULT TALL AECC PROJECT': "",
    'PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)': "",
    'PEDIATRIC ALL (Li et al., 2019, Blood)': "",
}

In [None]:
# stack the mutation drivers and the CNV drivers into a single table
# (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
drivers = pd.concat([drivers_muts, drivers_cnv], ignore_index=True, sort=False)

In [None]:
# number of distinct SYMBOL/ROLE pairs per gene, largest first
(
    drivers[['SYMBOL', 'ROLE']]
    .drop_duplicates()
    .groupby('SYMBOL')['ROLE']
    .count()
    .to_frame()
    .sort_values('ROLE', ascending=False)
)

In [None]:
# show the driver entries recorded for NOTCH1
drivers.loc[drivers['SYMBOL'].eq('NOTCH1')]

#### Adult TALL AECC

In [None]:
# read the candidate CNVs of the adult cohort: known drivers and recurrent
# CNVs in interesting genes, tagged with their confidence level
candidate_drivers_cnv = pd.read_csv(os.path.join(dicc_paths_cnv['ADULT TALL AECC PROJECT'],"known_driver_cnv.tsv"), sep='\t')
candidate_drivers_cnv_2 = pd.read_csv(os.path.join(dicc_paths_cnv['ADULT TALL AECC PROJECT'],"recurrent_interesting_cnv.tsv"), sep='\t')
candidate_drivers_cnv['level_confidence'] = 'known_driver'
candidate_drivers_cnv_2['level_confidence'] = 'cnv_in_interesting_gene'
candidate_drivers_cnv.rename(columns={'driver_cnv':'genes_of_interest'}, inplace=True)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed
candidate_drivers_cnv = pd.concat([candidate_drivers_cnv, candidate_drivers_cnv_2], ignore_index=True)
# strip stray whitespace around the alteration labels
candidate_drivers_cnv['alteration'] = candidate_drivers_cnv['alteration'].str.strip()

In [None]:
# keep every call except the balanced LOH ones
candidate_drivers_cnv = candidate_drivers_cnv.loc[candidate_drivers_cnv['alteration'].ne('loh')]

In [None]:
# preview the first five candidate rows
candidate_drivers_cnv.head(n=5)

In [None]:
# fill 'driver_cnv' of every candidate row through check_candidates
candidate_drivers_cnv = candidate_drivers_cnv.apply(check_candidates, axis=1, args=(drivers,))

# drop the candidates left with an empty 'driver_cnv'
candidate_drivers_cnv = candidate_drivers_cnv.loc[candidate_drivers_cnv['driver_cnv'].ne('')]

In [None]:
# one summary row per patient / subset / labelled gene for the adult cohort
adults_cnv = (
    cnv_make_simple(candidate_drivers_cnv, drivers)
    [['patient', 'subset', 'SYMBOL', 'level_confidence', 'PATHWAY']]
    .drop_duplicates()
    .rename(columns={'patient': 'PATIENT'})
)

# attach cohort and age range from the clinical table
adults_cnv = pd.merge(adults_cnv, df_info[['PATIENT', 'COHORT', 'AGE_RANGE']], how='left', on='PATIENT')

#### Pediatric WXS Oshima 2016

In [None]:
# read the candidate CNVs of the Oshima cohort: known drivers and recurrent
# CNVs in interesting genes, tagged with their confidence level

candidate_drivers_cnv = pd.read_csv(os.path.join(dicc_paths_cnv['PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)'],"known_driver_cnv.tsv"), sep='\t')
candidate_drivers_cnv_2 = pd.read_csv(os.path.join(dicc_paths_cnv['PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)'],"recurrent_interesting_cnv.tsv"), sep='\t')
candidate_drivers_cnv['level_confidence'] = 'known_driver'
candidate_drivers_cnv_2['level_confidence'] = 'cnv_in_interesting_gene'
candidate_drivers_cnv.rename(columns={'driver_cnv':'genes_of_interest'}, inplace=True)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed
candidate_drivers_cnv = pd.concat([candidate_drivers_cnv, candidate_drivers_cnv_2], ignore_index=True)
# strip stray whitespace around the alteration labels
candidate_drivers_cnv['alteration'] = candidate_drivers_cnv['alteration'].str.strip()

In [None]:
# keep every call except the balanced LOH ones
candidate_drivers_cnv = candidate_drivers_cnv.loc[candidate_drivers_cnv['alteration'].ne('loh')]

In [None]:
# fill 'driver_cnv' of every candidate row through check_candidates
candidate_drivers_cnv = candidate_drivers_cnv.apply(check_candidates, axis=1, args=(drivers,))

# drop the candidates left with an empty 'driver_cnv'
candidate_drivers_cnv = candidate_drivers_cnv.loc[candidate_drivers_cnv['driver_cnv'].ne('')]

# drop every call whose band name contains "X": for male patients the output
# reports copy number imbalances on chromosome X that do not exist
candidate_drivers_cnv = candidate_drivers_cnv.loc[~candidate_drivers_cnv['band_name'].str.contains("X")]

In [None]:
# one summary row per patient / subset / labelled gene for the Oshima cohort
pediatrics_cnv = (
    cnv_make_simple(candidate_drivers_cnv, drivers)
    [['patient', 'subset', 'SYMBOL', 'level_confidence', 'PATHWAY']]
    .drop_duplicates()
    .rename(columns={'patient': 'PATIENT'})
)

In [None]:
# attach cohort and age range from the clinical table
pediatrics_cnv = pd.merge(pediatrics_cnv, df_info[['PATIENT', 'COHORT', 'AGE_RANGE']], how='left', on='PATIENT')

#### Pediatric WGS TALL Li 2020 Blood

In [None]:
# Build a clinical table from the supplementary materials of Li et al., 2020
# (PMID: 31697823), shaped like Additional file 2 Table S2 of the paper, and
# put its path in the read_csv call below.
df_info_chinese = pd.read_csv("", sep='\t')
df_info_chinese_TALL = df_info_chinese.loc[df_info_chinese['TYPE'].eq('TALL')]
# number of distinct T-ALL patients
print(df_info_chinese_TALL['PATIENT'].nunique(dropna=False))
df_info_chinese_TALL.head(n=5)

In [None]:
# list the cohort names that key dicc_paths_cnv
dicc_paths_cnv.keys()

In [None]:
# read the candidate CNVs of the Li cohort: known drivers and recurrent
# CNVs in interesting genes, tagged with their confidence level

candidate_drivers_cnv = pd.read_csv(os.path.join(dicc_paths_cnv['PEDIATRIC ALL (Li et al., 2019, Blood)'], "known_driver_cnv.tsv"), sep='\t')
candidate_drivers_cnv_2 = pd.read_csv(os.path.join(dicc_paths_cnv['PEDIATRIC ALL (Li et al., 2019, Blood)'], "recurrent_interesting_cnv.tsv"), sep='\t')
candidate_drivers_cnv['level_confidence'] = 'known_driver'
candidate_drivers_cnv_2['level_confidence'] = 'cnv_in_interesting_gene'
candidate_drivers_cnv.rename(columns={'driver_cnv':'genes_of_interest'}, inplace=True)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed
candidate_drivers_cnv = pd.concat([candidate_drivers_cnv, candidate_drivers_cnv_2], ignore_index=True)
# strip stray whitespace around the alteration labels
candidate_drivers_cnv['alteration'] = candidate_drivers_cnv['alteration'].str.strip()

In [None]:
# fill 'driver_cnv' of every candidate row through check_candidates
candidate_drivers_cnv = candidate_drivers_cnv.apply(check_candidates, axis=1, args=(drivers,))

# drop the candidates left with an empty 'driver_cnv'
candidate_drivers_cnv = candidate_drivers_cnv.loc[candidate_drivers_cnv['driver_cnv'].ne('')]

In [None]:
# one summary row per patient / subset / labelled gene for the Li cohort
pediatrics_cnv_2 = (
    cnv_make_simple(candidate_drivers_cnv, drivers)
    [['patient', 'subset', 'SYMBOL', 'level_confidence', 'PATHWAY']]
    .drop_duplicates()
    .rename(columns={'patient': 'PATIENT'})
)

In [None]:
# attach cohort and age range from the Li et al. clinical table
pediatrics_cnv_2 = pd.merge(pediatrics_cnv_2, df_info_chinese[['PATIENT', 'COHORT', 'AGE_RANGE']], how='left', on='PATIENT')

#### join them 

In [None]:
# stack the three cohorts into one table
# (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
candidate_cnv_drivers = pd.concat([adults_cnv, pediatrics_cnv, pediatrics_cnv_2], ignore_index=True)

candidate_cnv_drivers.drop_duplicates(inplace=True, keep='first')
# the subset is used later to build file names, so spell it with underscores;
# regex=False keeps the replacement literal on every pandas version
candidate_cnv_drivers['subset'] = candidate_cnv_drivers['subset'].str.replace("private primary", "private_primary", regex=False)
candidate_cnv_drivers['subset'] = candidate_cnv_drivers['subset'].str.replace("private relapse", "private_relapse", regex=False)

In [None]:
# show the candidates whose label mentions JAK3
candidate_cnv_drivers.loc[candidate_cnv_drivers['SYMBOL'].str.contains('JAK3')]

#### last check

Check if the gene is truly affected by the cnv and not only the band 

Check if there are full amplifications of the activating genes

In [None]:
# gene coordinates from the mart export, with short column names
gene_length = pd.read_csv("../ext_files/mart_export_grch37.txt", sep='\t').rename(
    columns={
        'Chromosome/scaffold name': 'chrom_gen',
        'Gene start (bp)': 'start_gen',
        'Gene end (bp)': 'end_gen',
        'Gene name': 'SYMBOL',
    }
)

# keep only the genes on chromosomes 1-22, X and Y
chroms = list(map(str, range(1, 23))) + ['X', 'Y']
gene_length = gene_length.loc[gene_length['chrom_gen'].isin(chroms)]

In [None]:
# Keep only the candidates whose gene really overlaps a CNV segment of the
# patient: any overlap is enough for a deletion, while every other alteration
# must cover the whole gene.
confirmed_rows = []
# processed CNV table of each workbook, read once instead of once per candidate
processed_cnv = {}

for i,rw in candidate_cnv_drivers.iterrows():
    cnv_file = os.path.join(dicc_paths_cnv[rw['COHORT']],rw['subset']+"_processed.xlsx")
    if cnv_file not in processed_cnv:
        cnv_table = pd.read_excel(cnv_file, sheet_name="processed")
        cnv_table['chrom'] = cnv_table['chrom'].astype(str)
        processed_cnv[cnv_file] = cnv_table
    cnv = processed_cnv[cnv_file]

    # SYMBOL reads "<gene> <band> (<alteration>)"
    symbol = rw['SYMBOL'].split(' ')[0]
    alt = rw['SYMBOL'].split(' ')[-1].replace("(", "").replace(")", "")
    gene_len = gene_length[gene_length['SYMBOL'] == symbol].reset_index(drop=True)
    gene_len['len_gene'] = gene_len['end_gen'] - gene_len['start_gen']

    # segments of this patient with this alteration on the gene's chromosome
    # NOTE(review): a gene missing from gene_length raises KeyError here
    cnv = cnv[(cnv['chrom'] == str(gene_len.loc[0,'chrom_gen'])) & (cnv['alteration'] == alt) & (cnv['patient'] == rw['PATIENT'])]

    cnv_bed = pybedtools.BedTool.from_dataframe(cnv[['chrom', 'start', 'end']])
    gene_len_bed = pybedtools.BedTool.from_dataframe(gene_len[['chrom_gen', 'start_gen', 'end_gen']])

    # one line per segment/gene overlap, with the overlap length as last column
    result = cnv_bed.intersect(gene_len_bed, wo = True)
    merged = pd.read_table(result.fn, names=['chrom', 'start', 'end',
                                              'chrom_gen', 'start_gen', 'end_gen', 'len_overlap'])

    if merged.empty:
        # these are genes in cytobands hit by a cnv that does not actually reach the gene
        print("--------------------------------------------------------------------------------------")
        print(rw)
        print(cnv)
        continue

    if alt == 'del':
        confirmed_rows.append(rw)
        continue

    merged[['start', 'end',
         'start_gen', 'end_gen']] = merged[['start', 'end', 'start_gen', 'end_gen']].astype(int)
    merged[['chrom', 'chrom_gen']] = merged[['chrom', 'chrom_gen']].astype(str)

    merged = merged.merge(cnv, how='left', on=['chrom', 'start', 'end'])
    merged = merged.merge(gene_len, how='left', on=['chrom_gen', 'start_gen', 'end_gen'])

    # the gene counts as fully covered when the overlap equals its length
    merged['keep'] = merged['len_gene'] == merged['len_overlap']
    merged_filt = merged[merged['keep']]
    if not merged_filt.empty:
        confirmed_rows.append(rw)

# a single construction replaces the row-by-row DataFrame.append (removed in pandas 2.0)
cnv_check = pd.DataFrame(confirmed_rows).reset_index(drop=True)

In [None]:
# display the candidates collected in cnv_check
cnv_check

In [None]:
# write the checked candidates as a tab-separated file in out_path
# (sep and index belong to to_csv; they were being passed to os.path.join)
cnv_check.to_csv(os.path.join(out_path, "driver_cnv_TALL.tsv"), sep='\t', index=False)