This notebook builds a list of candidate driver CNVs for each cohort. It first annotates the variants with cytobands and then separates known drivers from candidate drivers.

In [None]:
import sys, os
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
import pandas as pd
import numpy as np
import glob
from collections import defaultdict
import pybedtools

# Show every column and every row, and never truncate cell contents, when
# DataFrames are displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit"; the former -1 sentinel was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
## FUNCTIONS

def get_info(df_subset, df_bands, dictionary, df_final):
    """Annotate each CNV of ``df_subset`` with overlapping cytobands and genes.

    Every row of ``df_subset`` is intersected on its own with the cytoband
    intervals (bedtools ``intersect`` with ``loj=True``). The intersection is
    merged back with the CNV row and with ``df_bands``, and a ``genes`` column
    is filled by looking up ``band_name`` in ``dictionary``.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        CNV table; the ``chrom``, ``start`` and ``end`` columns are required.
    df_bands : pandas.DataFrame
        Cytoband table with ``chrom_band``, ``start_band``, ``end_band`` and
        ``band_name`` columns.
    dictionary : mapping
        Indexed as ``dictionary[band_name]`` to obtain the value stored in
        the ``genes`` column.
    df_final : pandas.DataFrame
        Rows accumulated so far; the new rows are placed after them.

    Returns
    -------
    pandas.DataFrame
        ``df_final`` followed by the annotated rows, with a fresh integer
        index. ``df_final`` is returned as-is when ``df_subset`` has no rows.
    """
    band_cols = ['chrom_band', 'start_band', 'end_band']
    cnv_cols = ['chrom', 'start', 'end']

    # The band intervals are the same for every CNV, so build the BedTool
    # (and its temporary file) once instead of once per row.
    bands = pybedtools.BedTool.from_dataframe(df_bands[band_cols])

    annotated = []
    for _, rw in df_subset.iterrows():
        df_rw = pd.DataFrame(rw).T

        df_rw['chrom'] = df_rw['chrom'].astype(str)
        df_rw[['start', 'end']] = df_rw[['start', 'end']].astype(int)

        cnv = pybedtools.BedTool.from_dataframe(df_rw[cnv_cols])
        intersection = cnv.intersect(bands, loj=True)
        cnv_annotated = pd.read_table(intersection.fn, names=cnv_cols + band_cols)

        # Normalise dtypes so the merge keys compare equal on both sides.
        cnv_annotated[['chrom', 'chrom_band']] = cnv_annotated[['chrom', 'chrom_band']].astype(str)
        coord_cols = ['start', 'end', 'start_band', 'end_band']
        cnv_annotated[coord_cols] = cnv_annotated[coord_cols].astype(int)
        cnv_annotated = cnv_annotated.merge(df_rw, how='left', on=cnv_cols)
        cnv_annotated = cnv_annotated.merge(df_bands, how='left', on=band_cols)

        cnv_annotated['genes'] = cnv_annotated['band_name'].map(lambda band: dictionary[band])
        annotated.append(cnv_annotated)

    if not annotated:
        return df_final

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # A completely empty accumulator (no rows, no columns) is left out so it
    # cannot influence the dtypes of the result.
    frames = annotated if df_final.shape == (0, 0) else [df_final] + annotated
    return pd.concat(frames, sort=False, ignore_index=True)

def check_lite_knowleadge(rw, lite, other_drivers):
    """Flag a CNV row as a literature-known driver CNV.

    A literature entry matches when its ``CNV`` equals the row's
    ``alteration`` and its ``BAND`` equals either the row's ``band_name`` or
    the part of ``band_name`` before the first ``'.'``.

    Parameters
    ----------
    rw : pandas.Series
        Annotated CNV row with ``band_name``, ``alteration`` and ``genes``.
    lite : pandas.DataFrame
        Literature CNVs with ``BAND``, ``CNV`` and ``SYMBOL`` columns.
    other_drivers : pandas.DataFrame
        Table whose ``SYMBOL`` column lists the genes reported in
        ``other_known_genes_in_band``.

    Returns
    -------
    pandas.Series
        A copy of ``rw`` with three extra entries:
        ``specific_lite_cnv`` (``'known'`` on a match, NaN otherwise),
        ``other_known_genes_in_band`` (genes of the row found in
        ``other_drivers['SYMBOL']`` on a match, NaN otherwise) and
        ``driver_cnv`` (comma-joined ``SYMBOL`` of all matching entries,
        ``''`` when there is none).
    """
    # Work on a copy: mutating the row handed over by DataFrame.apply is not
    # supported by pandas.
    rw = rw.copy()

    band = rw['band_name']
    # A missing band name (non-string) simply cannot match any entry.
    parent_band = band.split('.')[0] if isinstance(band, str) else band
    alteration = rw['alteration']

    list_known = [
        symbol
        for lite_band, lite_cnv, symbol in zip(lite['BAND'], lite['CNV'], lite['SYMBOL'])
        if lite_cnv == alteration and (lite_band == band or lite_band == parent_band)
    ]

    if list_known:
        other_symbols = set(other_drivers['SYMBOL'].tolist())
        rw['specific_lite_cnv'] = 'known'
        rw['other_known_genes_in_band'] = [gene for gene in rw['genes'] if gene in other_symbols]
    else:
        # Always emit the keys so the resulting frame has these columns even
        # when no row at all matches the literature.
        rw['specific_lite_cnv'] = float('nan')
        rw['other_known_genes_in_band'] = float('nan')

    rw['driver_cnv'] = ','.join(list_known)
    return rw

def add_gene_of_interest(rw, drivers):
    """Add a ``genes_of_interest`` entry listing the row's driver genes.

    Parameters
    ----------
    rw : pandas.Series
        Row whose ``genes`` entry is an iterable of gene symbols.
    drivers : pandas.DataFrame
        Table whose ``SYMBOL`` column holds the genes of interest.

    Returns
    -------
    pandas.Series
        A copy of ``rw`` with ``genes_of_interest`` set to the comma-joined
        genes of ``rw['genes']`` present in ``drivers['SYMBOL']`` (original
        order kept; ``''`` when none is present).
    """
    # Work on a copy: mutating the row handed over by DataFrame.apply is not
    # supported by pandas.
    rw = rw.copy()
    # Build the lookup once instead of rebuilding a list for every gene.
    driver_symbols = set(drivers['SYMBOL'].tolist())
    rw['genes_of_interest'] = ','.join(gene for gene in rw['genes'] if gene in driver_symbols)
    return rw

#### Read the manually checked, processed results and annotate the affected bands and genes

In [None]:
# Set both paths before running the notebook. An empty string makes
# os.path.join() resolve file names against the current working directory.
dire_in = ""  # directory with the inputs
dire_out = ""  # directory to put the results

In [None]:
# chromosome names kept in the analysis: "1" .. "22" plus "X"
chroms = list(map(str, range(1, 23)))
chroms.append("X")

In [None]:
# read the three processed excel workbooks found in dire_in
cnv_private_pry, cnv_private_rel, cnv_shared = [
    pd.read_excel(os.path.join(dire_in, workbook))
    for workbook in (
        "private_primary_processed.xlsx",
        "private_relapse_processed.xlsx",
        "shared_processed.xlsx",
    )
]

In [None]:
# read cytobands
df_bands = pd.read_csv("../ext_files/chromosome.band.hg19.txt",  sep='\t')
df_bands = df_bands.rename(
    columns={'#chrom': 'chrom_band', 'chromStart': 'start_band', 'chromEnd': 'end_band'}
)
# Drop the "chr" prefix (literal match, not a regex) and build the band label
# as <chromosome><band>, using vectorised string operations instead of a
# Python-level apply per row.
df_bands['chrom_band'] = df_bands['chrom_band'].str.replace('chr', '', regex=False)
df_bands['band_name'] = df_bands['chrom_band'] + df_bands['name']

In [None]:
# read genes hg19 of biomart
# The chromosome column is forced to str: left to type inference it can end up
# holding a mix of ints and strings, and the isin(chroms) filter below (chroms
# are strings) would then silently drop the integer-typed rows.
df_genes = pd.read_csv(
    "../ext_files/mart_export_grch37.txt",
    sep='\t',
    dtype={'Chromosome/scaffold name': str},
)
df_genes.rename(columns={'Chromosome/scaffold name':'chrom', 'Gene start (bp)':'start_gene', 
                         'Gene end (bp)':'end_gene', 'Gene name':'symbol', 'Gene stable ID':'ensembl_id'}, 
                inplace=True)
# keep only the chromosomes listed in chroms
df_genes = df_genes[df_genes['chrom'].isin(chroms)]

In [None]:
# intersect every cytoband with the gene intervals (left outer join on bands)
# and bring back the remaining band and gene columns
band_cols = ['chrom_band', 'start_band', 'end_band']
gene_cols = ['chrom', 'start_gene', 'end_gene']

bands = pybedtools.BedTool.from_dataframe(df_bands[band_cols])
genes = pybedtools.BedTool.from_dataframe(df_genes[gene_cols])
result = bands.intersect(genes, loj=True)
results = pd.read_table(result.fn, names=band_cols + gene_cols)

# cast the merge keys so they compare equal with df_genes / df_bands
int_cols = ['start_gene', 'end_gene', 'start_band', 'end_band']
results[int_cols] = results[int_cols].astype(int)
chrom_cols = ['chrom', 'chrom_band']
results[chrom_cols] = results[chrom_cols].astype(str)

results = (
    results
    .merge(df_genes, how='left', on=gene_cols)
    .merge(df_bands, how='left', on=band_cols)
)

In [None]:
# create dictionary with band name and genes
# One pass over the rows is enough: the key is built from the row itself, so
# grouping by band and re-fetching every group added nothing but cost. The
# order of the symbols inside each list still follows the row order.
dict_band_gene = defaultdict(list)

band_labels = results['chrom_band'] + results['name']
for band_label, symbol in zip(band_labels, results['symbol']):
    dict_band_gene[band_label].append(symbol)

In [None]:
# cast the coordinate columns of the three CNV tables:
# chrom -> str, start/end -> int
for cnv_table in (cnv_private_pry, cnv_private_rel, cnv_shared):
    cnv_table['chrom'] = cnv_table['chrom'].astype(str)
    cnv_table[['start', 'end']] = cnv_table[['start', 'end']].astype(int)

In [None]:
# annotate each subset (shared, private primary, private relapse, in that
# order) with band and gene information, accumulating into one table
df_cnv_info = pd.DataFrame()

for cnv_subset in (cnv_shared, cnv_private_pry, cnv_private_rel):
    df_cnv_info = get_info(cnv_subset, df_bands, dict_band_gene, df_cnv_info)

In [None]:
# literature tables (tab-separated): known CNVs in ALL ...
df_lite = pd.read_table("../ext_files/literature/cnv_lite.tsv")

# ... and the general list of cancer genes
drivers_muts = pd.read_table("../ext_files/literature/mutations_lite.tsv")

In [None]:
# preview the first rows of the literature CNV table
df_lite.head()

In [None]:
# Remove surrounding whitespace so both alteration labels compare equal.
# .str.strip() leaves missing values as NaN instead of raising on them.
df_lite['CNV'] = df_lite['CNV'].str.strip()
df_cnv_info['alteration'] = df_cnv_info['alteration'].str.strip()

In [None]:
# flag every CNV against the literature, then keep the ones marked as known
df_cnv_info = df_cnv_info.apply(check_lite_knowleadge, axis=1, args=(df_lite, drivers_muts))
df_cnv_info_known = df_cnv_info.loc[df_cnv_info['specific_lite_cnv'].eq('known')]

In [None]:
# display the CNVs flagged as known drivers
df_cnv_info_known

In [None]:
# recurrent CNVs among the not-known ones: bands seen in more than one patient
df_cnv_info = df_cnv_info.loc[df_cnv_info['specific_lite_cnv'].ne('known')]

count = (
    df_cnv_info[['band_name', 'patient']]
    .drop_duplicates()
    .groupby('band_name')
    .count()
    .sort_values(by='patient', ascending=False)
    .rename(columns={'patient': 'count'})
)
count = count[count['count'] > 1].reset_index()
list_bands_recurrent = count['band_name'].tolist()
df_cnv_info_recurrent = df_cnv_info[df_cnv_info['band_name'].isin(list_bands_recurrent)]
print(len(df_cnv_info_recurrent))

# add the genes of interest and keep the rows that contain at least one
df_cnv_info_recurrent = df_cnv_info_recurrent.apply(add_gene_of_interest, axis=1, args=(drivers_muts,))
df_cnv_info_recurrent_annotated = df_cnv_info_recurrent[df_cnv_info_recurrent['genes_of_interest'] != '']

In [None]:
# recurrent CNVs without any gene of interest, and their per-band patient counts
df_cnv_info_recurrent_only = df_cnv_info_recurrent[df_cnv_info_recurrent['genes_of_interest'] == '']

count = (
    df_cnv_info_recurrent_only[['band_name', 'patient']]
    .drop_duplicates()
    .groupby('band_name')
    .count()
    .sort_values(by='patient', ascending=False)
    .rename(columns={'patient': 'count'})
)
count = count[count['count'] > 1].reset_index()

In [None]:
# display the unique known-driver CNV records, ordered by patient
df_cnv_info_known[['patient', 'band_name', 'subset', 'driver_cnv', 'alteration']].drop_duplicates().sort_values('patient')

In [None]:
# write results
# drop_duplicates() returns a new frame, which avoids the in-place call on a
# column slice (SettingWithCopyWarning); all selected columns take part in the
# comparison, so no subset argument is needed.
known_cols = ['patient', 'band_name', 'subset', 'driver_cnv', 'alteration']
df_cnv_info_known_write = df_cnv_info_known[known_cols].drop_duplicates()
df_cnv_info_known_write.to_csv(os.path.join(dire_out, "known_driver_cnv.tsv"), sep='\t', index=False)

df_cnv_info_recurrent_annotated[['patient', 'band_name', 'subset', 'genes_of_interest', 'alteration']].to_csv(os.path.join(dire_out,"recurrent_interesting_cnv.tsv"), sep='\t', index=False)

In [None]:
# display the recurrent CNVs that contain at least one gene of interest
df_cnv_info_recurrent_annotated[['patient', 'band_name', 'subset', 'genes_of_interest', 'alteration']]