This notebook processes the results from Delly.
This piece of code relies on a workspace directory structure such as
```
cohort/
	patientID/
		DxTumorID_vs_normalID/
		ReTumorID_vs_normalID/ (sometimes)

```
patientID, DxTumorID, etc. can be found in ../ext_files/all_cohort_clinical_groups.tsv

In [None]:
import os
import sys

# Put the directory that holds the running Python interpreter first on PATH.
# NOTE(review): presumably so that command-line tools installed next to the
# interpreter (e.g. the bedtools binary used by pybedtools) are found - confirm.
os.environ["PATH"] = os.pathsep.join([os.path.dirname(sys.executable), os.environ["PATH"]])

In [None]:
import pandas as pd
import numpy as np
import pybedtools
from io import StringIO
from aux_functions import stage_mapping, read_vcf

# Show every column and every row when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Never truncate cell contents. `None` is the supported "unlimited" value;
# the former -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
def basic_filter(df):
    """Keep only the calls whose FILTER column equals 'PASS'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of structural-variant calls with a ``FILTER`` column. A
        completely empty frame is accepted and replaced by an empty frame
        carrying the expected BEDPE-style columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``FILTER == 'PASS'`` (original index kept).
    """
    if df.empty:
        bedpe_columns = ['CHROM_A', 'START_A', 'END_A',
                         'CHROM_B', 'START_B', 'END_B', 'ID',
                         'STRAND_A', 'STRAND_B', 'FILTER', 'TYPE']
        df = pd.DataFrame(columns=bedpe_columns)
    # Note: an extra filter dropping calls whose INFO_A / INFO_B contain
    # "IMPRECISE" existed here as commented-out code and is not applied.
    passed = df.FILTER == 'PASS'
    return df[passed]

In [None]:
def intersect_transloc_bands(df_sv, df_bands, letter):
    """Annotate one breakpoint side of each SV with the cytoband(s) it overlaps.

    Parameters
    ----------
    df_sv : pandas.DataFrame
        SV calls with columns CHROM_<X>, START_<X>, END_<X>, STRAND_<X>
        (where <X> is ``letter`` upper-cased) plus FILTER, TYPE and ID.
    df_bands : pandas.DataFrame
        Cytoband table with columns chrom, start, end and band.
    letter : str
        Breakpoint side to process ('A' or 'B'; case-insensitive).

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with the SV columns of the requested side and an
        additional ``alt_band_<X>`` column holding the intersected band.
    """
    side = letter.upper()
    chrom_col = 'CHROM_' + side
    start_col = 'START_' + side
    end_col = 'END_' + side
    strand_col = 'STRAND_' + side
    band_col = 'alt_band_' + side

    sv_cols = [chrom_col, start_col, end_col, 'FILTER', 'TYPE', strand_col, 'ID']
    band_cols = ['chrom', 'start', 'end', 'band']

    sv_bed = pybedtools.BedTool.from_dataframe(df_sv[sv_cols])
    band_bed = pybedtools.BedTool.from_dataframe(df_bands[band_cols])
    # wao=True is forwarded to `bedtools intersect`; the output carries the SV
    # fields, the band fields and a trailing overlap column.
    overlaps = sv_bed.intersect(band_bed, wao = True)

    result = pd.read_table(overlaps.fn, names=sv_cols + band_cols + ['overlap'])

    name_cols = [chrom_col, 'chrom']
    result[name_cols] = result[name_cols].astype(str)
    coord_cols = [start_col, end_col, 'start', 'end']
    result[coord_cols] = result[coord_cols].astype(int)

    result[band_col] = result['band']
    return result[sv_cols + [band_col]].drop_duplicates()

In [None]:
def sort_alterations(dfA, dfB):
    """Pair the A and B breakpoints of each SV and order them by chromosome.

    For every ID found in ``dfA`` the first matching row of ``dfA`` and of
    ``dfB`` is taken. The breakpoint on the chromosome that ranks first in
    the order X, Y, 1, 2, ..., 22 becomes the "left" (``*_L``) side and the
    other one the "right" (``*_R``) side. Events with both breakpoints on the
    same chromosome are reported on stdout and skipped.

    Parameters
    ----------
    dfA : pandas.DataFrame
        Rows with columns ID, CHROM_A, START_A, END_A, STRAND_A, alt_band_A.
    dfB : pandas.DataFrame
        Rows with columns ID, CHROM_B, START_B, END_B, STRAND_B, alt_band_B.

    Returns
    -------
    pandas.DataFrame
        One row per inter-chromosomal event with columns BAND_L, BAND_R,
        CHROM_L, CHROM_R, END_L, END_R, ID, START_L, START_R, STRAND_L,
        STRAND_R (empty, with the same columns, when nothing qualifies).

    Raises
    ------
    ValueError
        If a chromosome name is not one of X, Y, 1..22.
    """
    # Sex chromosomes rank before the autosomes (e.g. X;2 is stored as L=X, R=2).
    chroms = ['X', 'Y'] + [str(x) for x in range(1, 23)]
    columns = ['BAND_L', 'BAND_R', 'CHROM_L', 'CHROM_R', 'END_L', 'END_R', 'ID',
               'START_L', 'START_R', 'STRAND_L', 'STRAND_R']

    # Collect plain records and build the frame once instead of appending
    # row by row (DataFrame.append is quadratic and gone in pandas 2.0).
    records = []
    for bnd in dfA['ID'].unique():
        Arw = dfA[dfA['ID'] == bnd].iloc[0]
        Brw = dfB[dfB['ID'] == bnd].iloc[0]

        i_A = chroms.index(Arw['CHROM_A'])
        i_B = chroms.index(Brw['CHROM_B'])

        if i_A == i_B:
            print("same chromosome in ID:{}".format(bnd))
            continue

        if i_A < i_B:
            left, l_tag, right, r_tag = Arw, 'A', Brw, 'B'
        else:
            left, l_tag, right, r_tag = Brw, 'B', Arw, 'A'

        records.append({
            'CHROM_L': left['CHROM_' + l_tag],
            'START_L': left['START_' + l_tag],
            'END_L': left['END_' + l_tag],
            'STRAND_L': left['STRAND_' + l_tag],
            'BAND_L': left['alt_band_' + l_tag],
            'CHROM_R': right['CHROM_' + r_tag],
            'START_R': right['START_' + r_tag],
            'END_R': right['END_' + r_tag],
            'STRAND_R': right['STRAND_' + r_tag],
            'BAND_R': right['alt_band_' + r_tag],
            'ID': bnd,
        })

    df = pd.DataFrame(records, columns=columns)
    if not df.empty:
        coord_cols = ['START_L', 'START_R', 'END_L', 'END_R']
        df[coord_cols] = df[coord_cols].astype(int)
    return df

In [None]:
# FUNCTIONS

def get_three_branches(all_pry, all_rel):
    """Split the variants of a primary/relapse pair into three sets.

    Parameters
    ----------
    all_pry, all_rel : pandas.DataFrame
        Variant tables of the primary and of the relapse sample; both must
        have a ``Variant`` column.

    Returns
    -------
    tuple of set
        ``(trunk, private_primary, private_relapse)``: variants present in
        both tables, only in ``all_pry``, and only in ``all_rel``.
    """
    primary_set = set(all_pry['Variant'].unique())
    relapse_set = set(all_rel['Variant'].unique())

    trunk_variants = primary_set & relapse_set

    return (trunk_variants,
            primary_set - trunk_variants,
            relapse_set - trunk_variants)


In [None]:
def check_known_del(rw, del_known):
    """Flag a row as a driver when its variant matches any known deletion.

    The previous loop overwrote ``rw['driver']`` on every iteration, so only
    the last entry of ``del_known`` decided the flag (and the flag was never
    set for an empty ``del_known``). The flag is now True as soon as any
    known variant is contained in the row's variant.

    Parameters
    ----------
    rw : mapping (e.g. pandas.Series or dict)
        Row with a ``Variant`` entry; it is modified in place.
    del_known : pandas.DataFrame
        Table of known deletions with a ``Variant`` column.

    Returns
    -------
    The same ``rw`` object with a boolean ``driver`` entry.
    """
    variant = rw['Variant']
    # `in` keeps the original containment semantics (substring match for strings).
    rw['driver'] = any(driver in variant for driver in del_known['Variant'])
    return rw

In [None]:
def _report_band_mismatches(pat, type_sv, df_A, df_B):
    """Print every SV whose A and B breakpoints were assigned different bands."""
    for sv_id, rows_A in df_A.groupby('ID'):
        rows_B = df_B[df_B['ID'] == sv_id]
        band_A = rows_A['alt_band_A'].iloc[0]
        band_B = rows_B['alt_band_B'].iloc[0]
        if band_A != band_B:
            print("{} with diff band in {}: {} {} {}".format(pat, type_sv,
                                                             rows_A['CHROM_A'].iloc[0],
                                                             band_A, band_B))


def process_other_sv(type_sv, dire_in, exclude_patients=('PAT3', 'PAT4')):
    """Collect Delly calls of one SV type for every patient and label them.

    For each patient the primary and relapse BEDPE files are read, filtered
    with ``basic_filter``, annotated with cytobands on both breakpoints and
    labelled as 'shared', 'private_primary' or 'private_relapse'.

    Fixes with respect to the previous version:
    * the clinical rows are sorted by STAGE *before* the index is reset, so
      ``.loc[0]`` / ``.loc[1]`` really follow the sorted order (previously the
      sort had no effect on which row was picked);
    * shared/private labels are computed per patient instead of on frames
      accumulated over all patients processed so far;
    * relapse rows of earlier patients are no longer relabelled with the
      current patient's ID;
    * frames are concatenated once instead of re-appended in every iteration.

    Reads the module-level ``clinic`` and ``bands`` tables.

    Parameters
    ----------
    type_sv : str
        SV type used in the file name ``<COMPARISON>_<type_sv>_delly.bedpe``.
    dire_in : str
        Root directory containing ``<patient>/<COMPARISON>/`` folders.
    exclude_patients : iterable of str, optional
        Patient IDs to skip (defaults to the previously hard-coded pair).

    Returns
    -------
    pandas.DataFrame
        De-duplicated calls of all patients with PATIENT, Variant and subset
        columns; an empty frame when no patient was processed.
    """
    bedpe_columns = ['CHROM_A', 'START_A', 'END_A',
                     'CHROM_B', 'START_B', 'END_B', 'ID', 'INFO_A', 'INFO_B',
                     'STRAND_A', 'STRAND_B', 'FILTER', 'TYPE']
    frames = []

    for pat in clinic.PATIENT.unique():
        if pat in exclude_patients:
            continue

        # Sort first, then renumber: row 0 / row 1 are the first / second STAGE.
        # NOTE(review): assumes STAGE sorts the primary before the relapse -
        # confirm against stage_mapping.
        pat_clinic = clinic[clinic['PATIENT'] == pat].sort_values('STAGE').reset_index(drop=True)

        # read data
        raw = []
        for row in (0, 1):
            comparison = pat_clinic.loc[row, 'COMPARISON']
            df = read_vcf(os.path.join(dire_in, pat, comparison,
                                       comparison + "_" + type_sv + "_delly.bedpe"))
            if df.empty:
                df = pd.DataFrame(columns=bedpe_columns)
            else:
                df = df.rename(columns={'#CHROM_A': 'CHROM_A'})
            raw.append(df)

        print(pat)
        print("Before filter pry:{} rel:{}".format(len(raw[0]), len(raw[1])))

        # get reliable calls
        filtered = [basic_filter(df) for df in raw]

        print("After filter pry:{} rel:{}".format(len(filtered[0]), len(filtered[1])))

        annotated = []
        for df in filtered:
            # map cytobands on both breakpoints
            df_A = intersect_transloc_bands(df, bands, 'A')
            df_B = intersect_transloc_bands(df, bands, 'B')
            _report_band_mismatches(pat, type_sv, df_A, df_B)

            merged = df_A.merge(df_B, on=['ID', 'TYPE', 'FILTER'], how='outer')
            merged['PATIENT'] = pat
            # Vectorised form of '({})({};{})'.format(...); also safe on empty frames.
            merged['Variant'] = ('(' + merged['CHROM_A'].astype(str)
                                 + ')(' + merged['alt_band_A'].astype(str)
                                 + ';' + merged['alt_band_B'].astype(str) + ')')
            annotated.append(merged)

        df_pry, df_rel = annotated
        shared, _, _ = get_three_branches(df_pry, df_rel)

        df_pry['subset'] = np.where(df_pry['Variant'].isin(shared), 'shared', 'private_primary')
        # Shared variants are reported once (with the primary); copy to avoid
        # assigning into a slice.
        df_rel = df_rel[~df_rel['Variant'].isin(shared)].copy()
        df_rel['subset'] = 'private_relapse'

        frames.extend([df_pry, df_rel])

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False).drop_duplicates()

In [None]:
# Placeholders: directory for the exported tables and directory read by the
# per-patient loaders below.
dire_out = ""
dire_in = ""

# read clinical data (Table S1 from Additional File 2; path left blank)
clinic = stage_mapping(pd.read_csv("", sep='\t'))

# read cytobands of chromosomes: the first row of the file is skipped and the
# columns are named explicitly
band_columns = ['chrom', 'start', 'end', 'band', 'giestain']
bands = pd.read_table("../ext_files/chromosome.band.hg19.txt", sep='\t',
                      header=None, names=band_columns, skiprows=[0])
# drop the "chr" prefix from the chromosome names
bands['chrom'] = bands['chrom'].str.replace("chr", "")

### BND/TRANSLOC Variants

In [None]:
# read SV drivers
def _translocation_name(row):
    """Build the 't(CHROM_L;CHROM_R)(BAND_L;BAND_R)' label of one row."""
    return "t({};{})({};{})".format(row['CHROM_L'], row['CHROM_R'],
                                    row['BAND_L'], row['BAND_R'])


known_transloc = pd.read_csv("../ext_files/literature/sv_transloc_lite.tsv", sep='\t')
known_transloc['Variant'] = known_transloc.apply(_translocation_name, axis=1)

In [None]:
# DELLY
def _split_known_translocations(df, known_variants):
    """Name each translocation and split the table into known / other events.

    Adds G_BAND_L / G_BAND_R (band name up to the first '.') and a Variant
    label 't(CHROM_L;CHROM_R)(G_BAND_L;G_BAND_R)', then returns two copies:
    the rows whose Variant is in ``known_variants`` and the remaining rows.
    Works on empty frames too, so both outputs always exist.
    """
    df = df.copy()
    # get general name of band
    for side in ('L', 'R'):
        df['G_BAND_' + side] = df['BAND_' + side].apply(lambda x: x.split('.')[0] if '.' in x else x)

    if df.empty:
        # Row-wise apply on an empty frame does not yield a usable column.
        df['Variant'] = pd.Series(dtype=object)
    else:
        df['Variant'] = df.apply(lambda x: "t({};{})({};{})".format(x['CHROM_L'],
                                                                   x['CHROM_R'],
                                                                   x['G_BAND_L'],
                                                                   x['G_BAND_R']), axis=1)
    is_known = df['Variant'].isin(known_variants)
    return df[is_known].copy(), df[~is_known].copy()


def _label_branches(df_pry, df_rel, pat):
    """Tag primary rows as shared/private_primary and keep private relapse rows."""
    shared, _, _ = get_three_branches(df_pry, df_rel)

    df_pry = df_pry.copy()
    df_pry['subset'] = np.where(df_pry['Variant'].isin(shared), 'shared', 'private_primary')
    # Shared events are reported once, with the primary.
    df_rel = df_rel[~df_rel['Variant'].isin(shared)].copy()
    df_rel['subset'] = 'private_relapse'
    df_pry['PATIENT'] = pat
    df_rel['PATIENT'] = pat
    return df_pry, df_rel


known_frames = []
other_frames = []

for pat in clinic.PATIENT.unique():

    # read data
    # Sort first, then renumber, so .loc[0] / .loc[1] follow the STAGE order
    # (resetting the index before sorting made the sort ineffective).
    # NOTE(review): assumes STAGE sorts the primary before the relapse -
    # confirm against stage_mapping.
    pat_clinic = clinic[clinic['PATIENT'] == pat].sort_values('STAGE').reset_index(drop=True)

    df_pry = read_vcf(os.path.join(dire_in, pat, pat_clinic.loc[0, 'COMPARISON'],
                                   pat_clinic.loc[0, 'COMPARISON'] + "_bnd_delly.bedpe"))
    df_rel = read_vcf(os.path.join(dire_in, pat, pat_clinic.loc[1, 'COMPARISON'],
                                   pat_clinic.loc[1, 'COMPARISON'] + "_bnd_delly.bedpe"))
    df_pry = df_pry.rename(columns={'#CHROM_A': 'CHROM_A'})
    df_rel = df_rel.rename(columns={'#CHROM_A': 'CHROM_A'})

    print(pat)
    print("Before filter pry:{} rel:{}".format(len(df_pry), len(df_rel)))

    # get reliable calls
    df_pry = basic_filter(df_pry)
    df_rel = basic_filter(df_rel)

    print("After filter pry:{} rel:{}".format(len(df_pry), len(df_rel)))

    # map cytobands
    df_pry_A = intersect_transloc_bands(df_pry, bands, 'A')
    df_pry_B = intersect_transloc_bands(df_pry, bands, 'B')

    df_rel_A = intersect_transloc_bands(df_rel, bands, 'A')
    df_rel_B = intersect_transloc_bands(df_rel, bands, 'B')

    # order the two breakpoints of each event by chromosome (see sort_alterations)
    df_pry = sort_alterations(df_pry_A, df_pry_B)
    df_rel = sort_alterations(df_rel_A, df_rel_B)

    # Split into literature-known and other translocations. Both outputs are
    # always defined, so an empty patient can no longer reuse the "others"
    # table left over from the previous patient.
    df_pry_known, df_pry_others = _split_known_translocations(df_pry, known_transloc['Variant'])
    df_rel_known, df_rel_others = _split_known_translocations(df_rel, known_transloc['Variant'])

    # known alterations
    df_pry_known, df_rel_known = _label_branches(df_pry_known, df_rel_known, pat)
    df_pry_known = df_pry_known.merge(known_transloc[['Variant', 'SYMBOL']], how='left', on='Variant')
    df_rel_known = df_rel_known.merge(known_transloc[['Variant', 'SYMBOL']], how='left', on='Variant')
    known_frames.extend([df_pry_known, df_rel_known])

    # other transloc found
    df_pry_others, df_rel_others = _label_branches(df_pry_others, df_rel_others, pat)
    other_frames.extend([df_pry_others, df_rel_others])

# Build the cohort tables once instead of appending inside the loop.
dff_known = pd.concat(known_frames, ignore_index=True, sort=False) if known_frames else pd.DataFrame()
dff_others = pd.concat(other_frames, ignore_index=True, sort=False) if other_frames else pd.DataFrame()

We manually checked all the top recurrent BNDs in the BAM files. The mapping information reveals no clear translocation. We ran BLAT (UCSC) on some of the BND regions, and some of them are Alu regions or map to many parts of the genome.

In [None]:
# Count the Variant/subset/PATIENT rows before and after removing duplicates.
variant_calls = dff_others[['Variant', 'subset', 'PATIENT']]
print(len(variant_calls))
test = variant_calls.drop_duplicates()
print(len(test))

In [None]:
# Number of de-duplicated rows per variant, most frequent first.
(
    test[['Variant', 'PATIENT']]
    .groupby('Variant')
    .count()
    .sort_values('PATIENT', ascending=False)
)

In [None]:
# All calls labelled t(X;Y)(p22;q11) among the non-literature translocations.
dff_others.loc[dff_others['Variant'] == 't(X;Y)(p22;q11)']

In [None]:
# De-duplicated variant/subset/patient rows for t(4;8)(q32;p11).
test.loc[test['Variant'] == 't(4;8)(q32;p11)']

In [None]:
# All calls labelled t(4;8)(q32;p11) among the non-literature translocations.
dff_others.loc[dff_others['Variant'] == 't(4;8)(q32;p11)']

In [None]:
# De-duplicated variant/subset/patient rows for t(X;2)(q28;p21).
test.loc[test['Variant'] == 't(X;2)(q28;p21)']

In [None]:
# De-duplicated variant/subset/patient rows for t(6;14)(p22;q32).
test.loc[test['Variant'] == 't(6;14)(p22;q32)']

In [None]:
# De-duplicated variant/subset/patient rows for t(6;16)(q16;p13).
test.loc[test['Variant'] == 't(6;16)(q16;p13)']

In [None]:
# De-duplicated variant/subset/patient rows for t(6;18)(q14;q22).
test.loc[test['Variant'] == 't(6;18)(q14;q22)']

In [None]:
# De-duplicated variant/subset/patient rows for t(1;11)(q42;q12).
test.loc[test['Variant'] == 't(1;11)(q42;q12)']

In [None]:
# All calls labelled t(6;16)(q16;p13) among the non-literature translocations.
dff_others.loc[dff_others['Variant'] == 't(6;16)(q16;p13)']

In [None]:
# Display the table of literature-known translocations built by the BND loop.
dff_known

In [None]:
# Write the known translocations as a tab-separated file in dire_out.
# (The closing parenthesis of os.path.join was missing, which made the cell
# a SyntaxError and passed sep/index to the wrong call.)
dff_known.to_csv(os.path.join(dire_out, "bnd_known.tsv"), sep='\t', index=False)

In [None]:
# Write the remaining (non-literature) translocations as a tab-separated file.
bnd_recurrent_path = os.path.join(dire_out, "bnd_recurrent.tsv")
dff_others.to_csv(bnd_recurrent_path, sep='\t', index=False)

### INVERSIONS

All inversions reported by Delly are within the same cytoband and do not match the known driver inversions from the literature.

In [None]:
def _inversion_name(row):
    """Build the '(CHROM);(BAND_L+BAND_R)' label of one known inversion."""
    band_pair = row['BAND_L'] + row['BAND_R']
    return "({});({})".format(row['CHROM'], band_pair)


# Literature inversions, labelled in the same style as the other SV tables.
known_inv = pd.read_csv("../ext_files/literature/sv_inv_lite.tsv", sep='\t')
known_inv['Variant'] = known_inv.apply(_inversion_name, axis=1)
known_inv.head()

In [None]:
# delly results (placeholder path, left blank)
dire_in = ""
dff_invs = process_other_sv(type_sv='inv', dire_in=dire_in)

NO KNOWN INVERSION FOUND

### DUPLICATIONS

Known duplication in MYB (6q23), doi: 10.1182/blood-2016-10-706465

The coordinates do not map close to any known gene either.

In [None]:
# delly results (placeholder path, left blank)
dire_in = ""
dff_dups = process_other_sv(type_sv='dup', dire_in=dire_in)

In [None]:
# Duplications on chromosome 6 whose A breakpoint band contains 'q23'.
on_chr6 = dff_dups['CHROM_A'] == '6'
band_a_is_q23 = dff_dups['alt_band_A'].str.contains('q23')
dff_dups[on_chr6 & band_a_is_q23] # no MYB duplication

In [None]:
# Duplications on chromosome 6 whose B breakpoint band contains 'q23'.
on_chr6 = dff_dups['CHROM_A'] == '6'
band_b_is_q23 = dff_dups['alt_band_B'].str.contains('q23')
dff_dups[on_chr6 & band_b_is_q23] # MYB duplication