Combine individual analyses in each subtype

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

## Read files

In [13]:
genes = pd.read_csv('../../data/genes.tsv', sep='\t')
genes.columns = ['name', 'band', 'chr', 'start', 'end', 'symbol']

In [3]:
def file2record(f):
    df = pd.read_csv(f, sep='\t')
    df = df[['gene_id', 'cnv_scaled']].transpose()
    df.columns = df.iloc[0,:]
    df.columns.name = None
    df = df.iloc[[1]]
    df.index = ['-'.join([organ.name, subtype.name])]
    return df

In [5]:
path = '../../data/'
amps = []
dels = []

amps_hi = []
dels_hi = []

min_samples = 50
for organ in os.scandir(path):
    if organ.is_dir():
        print(organ.name)
        
        with open(os.path.join(organ, '_'.join([organ.name.lower(), 'counts.tsv']))) as fc:
            counts = pd.read_csv(fc, sep='\t')
            print(counts)
        
        for subtype in os.scandir(os.path.abspath(organ)):
            if subtype.is_dir():
                print('\t{}'.format(subtype.name))
                    
                if counts.loc[counts['name'] == subtype.name, 'count'].values[0] < min_samples:
                    print('\t\t Not enough samples: {}'.format(subtype.name))
                else:
                    
                    for f in os.scandir(os.path.abspath(subtype)):
                        if len(f.name.split('_')) == 4:
                            if 'amp_genes' in f.name:
                                print('\t\t{}'.format(f.name))
                                amps.append(file2record(f))

                            if 'del_genes' in f.name:
                                print('\t\t{}'.format(f.name))
                                dels.append(file2record(f))

                                
                        if 'high_amp_genes' in f.name:
                            print('\t\t{}'.format(f.name))
                            amps_hi.append(file2record(f))
                        if 'high_del_genes' in f.name:
                            print('\t\t{}'.format(f.name))
                            dels_hi.append(file2record(f))                                

Skin
                   name  count
0          Bednar tumor     11
1  Epidermoid carcinoma     18
2          Keratinizing     11
3              Melanoma   1068
4         Not specified     50
5  Pagetoid reticulosis     32
	Bednar tumor
		 Not enough samples: Bednar tumor
	Melanoma
		skin_Melanoma_high_del_genes.tsv
		skin_Melanoma_amp_genes.tsv
		skin_Melanoma_del_genes.tsv
		skin_Melanoma_high_amp_genes.tsv
	Keratinizing
		 Not enough samples: Keratinizing
	Epidermoid carcinoma
		 Not enough samples: Epidermoid carcinoma
	Pagetoid reticulosis
		 Not enough samples: Pagetoid reticulosis
Prostate
             name  count
0  Adenocarcinoma    916
1       Carcinoma     16
2   Not specified    253
	Carcinoma
		 Not enough samples: Carcinoma
	Adenocarcinoma
		prostate_Adenocarcinoma_del_genes.tsv
		prostate_Adenocarcinoma_high_amp_genes.tsv
		prostate_Adenocarcinoma_amp_genes.tsv
		prostate_Adenocarcinoma_high_del_genes.tsv
Lung
                                                 name  count
0

	Primitive neuroectodermal tumor
		brain_Primitive neuroectodermal tumor_amp_genes.tsv
		brain_Primitive neuroectodermal tumor_high_del_genes.tsv
		brain_Primitive neuroectodermal tumor_del_genes.tsv
		brain_Primitive neuroectodermal tumor_high_amp_genes.tsv
	Oligodendroglioma
		brain_Oligodendroglioma_del_genes.tsv
		brain_Oligodendroglioma_high_del_genes.tsv
		brain_Oligodendroglioma_amp_genes.tsv
		brain_Oligodendroglioma_high_amp_genes.tsv
	Mixed glioma
		brain_Mixed glioma_del_genes.tsv
		brain_Mixed glioma_high_amp_genes.tsv
		brain_Mixed glioma_high_del_genes.tsv
		brain_Mixed glioma_amp_genes.tsv
Liver
                       name  count
0  Hepatocellular carcinoma    371
1             Not specified    280
	Hepatocellular carcinoma
		liver_Hepatocellular carcinoma_high_amp_genes.tsv
		liver_Hepatocellular carcinoma_amp_genes.tsv
		liver_Hepatocellular carcinoma_del_genes.tsv
		liver_Hepatocellular carcinoma_high_del_genes.tsv
Ovary
                   name  count
0        Adenoca

## Accumulate all muations 

In [16]:
all_amps = pd.concat(amps, sort=False)
all_dels = pd.concat(dels, sort=False)

# all amps
all_amps = all_amps.fillna(0)
sum_amps = np.sum(all_amps, axis=0)
gene_amps = pd.DataFrame({'name': all_amps.columns, 'sum':sum_amps})
gene_amps = pd.merge(gene_amps, genes, how='left', on='name')

# all dels
all_dels = all_dels.fillna(0)
sum_dels = np.sum(all_dels, axis=0)
gene_dels = pd.DataFrame({'name': all_dels.columns, 'sum':sum_dels})
gene_dels = pd.merge(gene_dels, genes, how='left', on='name')

# scaling 
sum_all = np.concatenate((sum_amps.values, sum_dels.values))
sum_all_scaled = preprocessing.minmax_scale(np.abs(sum_all))
gene_amps['cnv_scaled'] = sum_all_scaled[:sum_amps.shape[0]]
gene_dels['cnv_scaled'] = -sum_all_scaled[sum_amps.shape[0]:]

# output
gene_amps.to_csv(path + '/amp_genes.tsv', sep='\t', index=False)
gene_dels.to_csv(path + '/del_genes.tsv', sep='\t', index=False)

# overlapping drivers
censusfile = '../data/Census_allThu Jan 16 08_07_54 2020.tsv'
census_genes = pd.read_csv(censusfile, sep='\t')

census_amps = gene_amps[gene_amps['symbol'].isin(census_genes['Gene Symbol'].values)]
census_dels = gene_dels[gene_dels['symbol'].isin(census_genes['Gene Symbol'].values)]

census_amps.to_csv(path + '/amp_census.tsv', sep='\t', index=False)
census_dels.to_csv(path + '/del_census.tsv', sep='\t', index=False)

## Accumulate high mutations

In [17]:
all_amps_hi = pd.concat(amps_hi, sort=False)
all_dels_hi = pd.concat(dels_hi, sort=False)

# all amps_hi
all_amps_hi = all_amps_hi.fillna(0)
sum_amps_hi = np.sum(all_amps_hi, axis=0)
gene_amps_hi = pd.DataFrame({'name': all_amps_hi.columns, 'sum':sum_amps_hi})
gene_amps_hi = pd.merge(gene_amps_hi, genes, how='left', on='name')

# all dels_hi
all_dels_hi = all_dels_hi.fillna(0)
sum_dels_hi = np.sum(all_dels_hi, axis=0)
gene_dels_hi = pd.DataFrame({'name': all_dels_hi.columns, 'sum':sum_dels_hi})
gene_dels_hi = pd.merge(gene_dels_hi, genes, how='left', on='name')

# scaling 
sum_all = np.concatenate((sum_amps_hi.values, sum_dels_hi.values))
sum_all_scaled = preprocessing.minmax_scale(np.abs(sum_all))
gene_amps_hi['cnv_scaled'] = sum_all_scaled[:sum_amps_hi.shape[0]]
gene_dels_hi['cnv_scaled'] = -sum_all_scaled[sum_amps_hi.shape[0]:]

# output
gene_amps_hi.to_csv(path + '/high_amp_genes.tsv', sep='\t', index=False)
gene_dels_hi.to_csv(path + '/high_del_genes.tsv', sep='\t', index=False)

# overlapping drivers
census_amps_hi = gene_amps_hi[gene_amps_hi['symbol'].isin(census_genes['Gene Symbol'].values)]
census_dels_hi = gene_dels_hi[gene_dels_hi['symbol'].isin(census_genes['Gene Symbol'].values)]

census_amps_hi.to_csv(path + '/high_amp_census.tsv', sep='\t', index=False)
census_dels_hi.to_csv(path + '/high_del_census.tsv', sep='\t', index=False)

In [6]:
data = all_feat.loc[:, all_feat.columns != 'label']
data = data.fillna(0).values

In [10]:
col_sum = np.sum(data, axis=0)

In [19]:
gene_names = all_feat.columns [all_feat.columns != 'label'].values

In [20]:
gene_names

array(['ENSG00000143207', 'ENSG00000121454', 'ENSG00000116260', ...,
       'ENSG00000141371', 'ENSG00000170836', 'ENSG00000062725'],
      dtype=object)

In [21]:
gene_sums = pd.DataFrame({'name':gene_names, 'sum':col_sum})

In [23]:
gene_sums.head(10)

Unnamed: 0,name,sum
0,ENSG00000143207,8.681541
1,ENSG00000121454,8.760399
2,ENSG00000116260,8.756662
3,ENSG00000143337,8.704378
4,ENSG00000169905,8.71176
5,ENSG00000143340,8.743454
6,ENSG00000162782,8.732982
7,ENSG00000116218,8.721839
8,ENSG00000162779,8.745573
9,ENSG00000057252,8.733895


In [26]:
genes.columns = ['name', 'band', 'chro', 'start', 'end', 'symbol']

In [30]:
gene_sums = pd.merge(gene_sums, genes, how='left', on='name')

In [44]:
gene_sums = gene_sums.sort_values(['chro', 'band'])

In [282]:
gene_dels.loc[(gene_dels['chro'] == 16) & (gene_dels['scaled'] < -0.1), :].sort_values(['chro', 'start'])

Unnamed: 0,name,sum,band,chro,start,end,symbol,scaled
357,ENSG00000213853,-1.845472,p13.13,16,10528422,10580632,EMP2,-0.108856
362,ENSG00000153060,-1.831165,p13.13,16,10627501,10694930,TEKT5,-0.107982
361,ENSG00000103274,-1.81949,p13.13,16,10743786,10769351,NUBP1,-0.10727
360,ENSG00000166676,-1.818313,p13.13,16,10760919,10818794,TVP23A,-0.107198
359,ENSG00000179583,-1.823028,p13.13,16,10866222,10943021,CIITA,-0.107486
358,ENSG00000182108,-1.821702,p13.13,16,10928891,10942468,DEXI,-0.107405
355,ENSG00000038532,-1.826033,p13.13,16,10944564,11182186,CLEC16A,-0.107669
354,ENSG00000175643,-1.80545,p13.13,16,11249619,11381662,RMI2,-0.106412
353,ENSG00000185338,-1.794677,p13.13,16,11254405,11256200,SOCS1,-0.105755
352,ENSG00000178279,-1.794677,p13.13,16,11267748,11269533,TNP2,-0.105755


In [283]:
gene_dels.loc[(gene_dels['chro'] == 16) & (gene_dels['band'] == 'p13.13'), :].sort_values(['chro', 'start'])['symbol']

357        EMP2
362       TEKT5
361       NUBP1
360      TVP23A
359       CIITA
358        DEXI
355     CLEC16A
354        RMI2
353       SOCS1
352        TNP2
356        PRM2
350        PRM1
349         NaN
348       LITAF
347         SNN
351     TXNDC11
346      ZC3H7A
345      RSL1D1
344       GSPT1
343      NPIPB2
342    TNFRSF17
341       SNX29
Name: symbol, dtype: object

In [9]:
genes

Unnamed: 0,Gene stable ID,Karyotype band,Chromosome/scaffold name,Gene start (bp),Gene end (bp),HGNC symbol
0,ENSG00000117036,q23.1,1,157121191,157138474,ETV3
1,ENSG00000253831,q23.1,1,157092043,157112412,ETV3L
2,ENSG00000132694,q23.1,1,156934840,157045370,ARHGEF11
3,ENSG00000160838,q23.1,1,156920632,156933094,LRRC71
4,ENSG00000187800,q23.1,1,156893698,156916434,PEAR1
...,...,...,...,...,...,...
18745,ENSG00000148053,q21.33,9,84668551,85027070,NTRK2
18746,ENSG00000125484,q34.13,9,132670035,132694953,GTF3C4
18747,ENSG00000125485,q34.13,9,132592997,132670401,DDX31
18748,ENSG00000165698,q34.13,9,132878027,132890201,SPACA9
