In [4]:
##Preparing data as per https://doi.org/10.1186/s12920-018-0460-9 
##Predicting drug response of tumors from integrated genomic profiles by deep neural networks
##Yu-Chiao Chiu, Hung-I Harry Chen, Tinghe Zhang, Songyao Zhang, Aparna Gorthi, Li-Ju Wang, Yufei Huang

In [469]:
import pandas as pd
import numpy as np
import h5py

In [478]:
#TCGA_exp = pd.read_csv('counts_gene.tsv', delimiter='\t')  --> TCGA exp downloaded from GDC Portal is not used , it contains 58.000 genes, most of them from intron, 
#while those downloaded from https://amp.pharm.mssm.edu/archs4/download.html  contains a selection of 25.150 genes

In [480]:
#print(TCGA_exp.shape)
#TCGA_exp.columns[-1]

(58037, 11285)


'gene_id'

In [15]:
#TCGA_exp.loc[TCGA_exp['3DFF72D2-F292-497E-ACE3-6FAA9C884205'] == 439935,'gene_id']

50432    ENSG00000268895.5
Name: gene_id, dtype: object

In [20]:
import mygene
mg = mygene.MyGeneInfo()
mg.getgene('ENSG00000268895', 'name,symbol,refseq.rna') 

{'_id': '503538',
 '_score': 13.213065,
 'name': 'A1BG antisense RNA 1',
 'refseq': {'rna': 'NR_015380.2'},
 'symbol': 'A1BG-AS1'}

In [98]:
mg.getgene(150465, 'name,symbol,refseq.rna')

{'_id': '150465',
 '_score': 14.277951,
 'name': 'tubulin tyrosine ligase',
 'refseq': {'rna': ['NM_153712.5', 'XM_005263599.3', 'XM_011510665.2']},
 'symbol': 'TTL'}

In [473]:
#Importing TCGA RNA expression data downloaded from  https://amp.pharm.mssm.edu/archs4/download.html
filename = "../tcga_matrix.h5"

h5 = h5py.File(filename,'r')
for k in h5.keys():
    print(k)


meta = h5['meta']  
dat = h5['data']  
info = h5['info']  



#Showing which type of data it contains    
for s in meta.keys():
    print('meta')
    print(s)
    print(meta[s].shape)
    print(meta[s].value)
    
for s in dat.keys():
    print('data')
    print(s)
    print(dat[s].shape)
    print(dat[s].value)    
    
for s in info.keys():
    print(s)
    print(info[s].shape)
    print(info[s].value)    

#h5.close()

meta
data
info
meta
auc
(11284,)
[b'3116748351' b'5069849036' b'4882401623' ... b'5742234142' b'6773612787'
 b'7613399024']
meta
bigwig_file
(11284,)
[b'3DFF72D2-F292-497E-ACE3-6FAA9C884205.bw'
 b'B1E54366-42B9-463C-8615-B34D52BD14DC.bw'
 b'473713F7-EB41-4F20-A37F-ACD209E3CB75.bw' ...
 b'76987B28-B56B-4A1F-B77C-1E08B8EFEF90.bw'
 b'33737781-8638-4FA2-AD4C-E888BB9343D8.bw'
 b'0BB05CA7-C6FF-42A1-919C-D801F471CBBD.bw']
meta
cancertype
(11284,)
[b'Liver Hepatocellular Carcinoma' b'Prostate Adenocarcinoma'
 b'Rectum Adenocarcinoma' ... b'Brain Lower Grade Glioma'
 b'Acute Myeloid Leukemia' b'Thyroid Carcinoma']
meta
gdc_annotations
(11284,)
[b'NULL' b'NULL' b'NULL' ... b'NULL'
 b'list(category = "Alternate sample pipeline", status = "Approved", entity_id = "fff35c80-88cd-4923-80c1-0273ba5bed0f", classification = "Notification", updated_datetime = "2016-05-01T15:00:21.638875-05:00", created_datetime = "2012-11-13T12:37:22-05:00", entity_submitter_id = "TCGA-AB-2881", notes = "Biospecimens fro

 [  40426   40053      48 ...  793314  481256  313210]]
author
(1,)
[b'Alexander Lachmann']
contact
(1,)
[b'alexander.lachmann@mssm.edu']
countsource
(1,)
[b'recount2: https://jhubiostatistics.shinyapps.io/recount/']
creationdate
(1,)
[b'10/9/2017']
lab
(1,)
[b"Ma'ayan Lab, Icahn School of Medicine"]


In [472]:
#if not exists it saves a file 'log2expression.npy' with the log2 transformation of RNA expression 
import os.path
def f(x):
    #return log2(TPM+0.001)
    return np.log2(x +0.001 ) 

l = None
if not os.path.exists('../log2expression.npy'):
    #log2(TPM+0.001) expression values computed per gene
    l = dat['expression'].value
    l = np.vectorize(f)(l)
    np.save('log2expression.npy', l)
else: 
    l = np.load('../log2expression.npy')

from scipy import stats
stats.describe(l)

DescribeResult(nobs=11284, minmax=(array([ 4.90693868,  4.39238612, -9.96578428, ..., 14.11845455,
       12.98815228, 11.60733078]), array([25.74434345, 23.45781508, 21.78148379, ..., 24.41667745,
       22.1506869 , 21.65159744])), mean=array([14.51133518, 14.65828565,  7.81866386, ..., 19.87816157,
       18.0625713 , 17.54948585]), variance=array([ 5.12112323,  3.17381026, 58.31272768, ...,  1.31941998,
        1.13436237,  0.97464454]), skewness=array([ 1.34799737,  1.15626794, -0.97460423, ..., -0.09400296,
       -0.34315363, -0.48017077]), kurtosis=array([4.18004613, 4.2873423 , 0.95231487, ..., 0.81602377, 0.62193682,
       0.94856403]))

In [476]:
#associate gene names and case_id with RNA expression 
def f(x):
    return x.decode("utf-8")
genes = np.vectorize(f)(meta['genes'].value)
cases_id  = np.vectorize(f)(meta['gdc_cases.case_id'].value)
df = pd.DataFrame(l, columns = genes, index= cases_id)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0004d251-3f70-4395-b175-c94c2f5b1b81,20.472352,18.746931,18.272466,19.290201,15.257498,-9.965784,9.29921,5.643885,13.595724,-9.965784,...,13.189979,14.272849,11.805341,13.0,15.517669,9.278452,15.699383,18.206232,14.866072,15.448536
000d566c-96c7-4f1c-b36e-fa2222467983,14.818083,14.545749,7.58497,21.170981,17.082887,13.392988,10.586841,3.322072,17.862692,-9.965784,...,14.685789,14.368779,13.460072,15.752668,16.611414,12.744834,16.470659,19.945104,17.252194,16.725619
0011a67b-1ba9-4a32-a6b8-7850759a38cf,12.077484,13.246592,17.234948,20.949892,14.822969,6.700454,10.472692,7.321937,16.977783,7.044405,...,17.196554,17.947871,15.637785,17.493347,16.861135,8.839207,17.63632,20.331971,18.412549,17.949416
001887aa-36d0-463f-8bca-dec7043b4f2e,24.457885,22.684448,19.907039,23.860669,19.778088,7.918869,13.952559,1.000721,16.081317,-9.965784,...,14.631916,14.412041,12.74987,15.258603,16.945592,12.561527,18.267911,16.341345,17.340346,17.422426
001944e5-af34-4061-9c09-bb9ea346f6fd,11.676398,12.256504,8.076821,19.457004,14.529736,17.318905,8.209458,-9.965784,17.581149,-9.965784,...,17.75447,19.272135,13.757911,15.446534,17.096242,12.186734,17.651276,20.299427,17.006589,17.689343


In [483]:
print(df.shape)
TCGA_exp_matrix = df.transpose()

(11284, 25150)


In [82]:
#GDSC (not used) -> sanger1018_brainarray_ensemblgene_rma.txt, other gen_drug ML predictor: FORESEE https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz145/5367831 
#CCLE exp -> v21.data.gex_avg_log2.txt, v21.meta.gex_features.txt is downloaded from ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/ 
CCLE_read_exp = pd.read_csv('CTRPv2.1_2016_pub_NatChemBiol_12_109/v21.data.gex_avg_log2.txt', delimiter='\t')

In [83]:
CCLE_read_exp.columns

Index(['idx_gene_feature', 'master_ccl_id', 'mrna_expression_avg_log2'], dtype='object')

In [135]:
CCLE_meta = pd.read_csv('CTRPv2.1_2016_pub_NatChemBiol_12_109/v21.meta.gex_features.txt', delimiter='\t')
CCLE_meta_cells = pd.read_csv('CTRPv2.1_2016_pub_NatChemBiol_12_109/v21.meta.per_cell_line.txt', delimiter='\t')
print(CCLE_meta.columns)
#master_ccl_id   ccl_name
print(CCLE_meta_cells.columns)

Index(['idx_gene_feature', 'affy_probeset_id', 'entrez_gene_id',
       'gene_primary_name', 'gene_symbol'],
      dtype='object')
Index(['master_ccl_id', 'ccl_name', 'ccle_primary_site', 'ccle_primary_hist',
       'ccle_hist_subtype_1'],
      dtype='object')


In [102]:
CCLE_meta.shape

(18543, 5)

In [205]:
CCLE_gen_exp = pd.merge(CCLE_meta[['idx_gene_feature','gene_primary_name']], CCLE_read_exp, on=['idx_gene_feature'])
CCLE_gen_exp = pd.merge(CCLE_meta_cells[['master_ccl_id','ccl_name']], CCLE_gen_exp, on=['master_ccl_id'])

CCLE_gen_exp.drop(columns = ['idx_gene_feature', 'master_ccl_id'], inplace=True)


In [206]:
#making compatible CCLE identifiers between sources
CCLE_gen_exp['CELL_LINE_NAME']= CCLE_gen_exp['ccl_name'].str.replace('-','')

In [207]:
CCLE_gen_exp.drop(columns = ['ccl_name'], inplace=True)

In [208]:
CCLE_gen_exp= CCLE_gen_exp.set_index(['CELL_LINE_NAME','gene_primary_name'])
CCLE_gen_exp.shape

(15260889, 1)

In [209]:
#There are duplicated combinations of genes and cell lines, due to multiple probes that need to be averaged 
#CCLE_gen_exp[(CCLE_gen_exp['master_ccl_id']==1 ) & (CCLE_gen_exp['gene_primary_name']=='TTL' )]

CCLE = CCLE_gen_exp.groupby(['CELL_LINE_NAME','gene_primary_name']).mean()

In [210]:
CCLE.shape


(15259243, 1)

In [211]:
CCLE_matrix = CCLE.unstack(level=0)

In [212]:
CCLE_matrix

Unnamed: 0_level_0,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2,mrna_expression_avg_log2
CELL_LINE_NAME,22RV1,2313287,253J,253JBV,42MGBA,5637,639V,647V,697,769P,...,WSUDLCL2,YAPC,YD10B,YD15,YD38,YD8,YH13,YKG1,ZR751,ZR7530
gene_primary_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A1BG,4.980354,4.460359,4.783156,4.875861,8.037875,4.453434,7.144767,5.047940,7.652055,4.833232,...,8.303150,4.765994,4.971349,4.127073,4.736552,4.679927,7.242135,5.962800,7.731496,6.945533
A1BG-AS1,4.434383,4.496125,4.156233,4.115867,4.921515,4.380188,4.640216,3.852418,4.574058,4.226488,...,5.199862,3.954704,4.871769,3.933285,4.175528,4.314160,4.588566,4.303141,4.531014,4.480704
A1CF,7.531097,6.034458,3.698593,3.576494,3.605577,3.744421,3.821686,3.788699,3.600796,4.834175,...,3.802235,3.742811,3.928875,3.643380,3.696221,3.530718,3.684264,3.494644,3.649618,3.604962
A2M,4.732491,3.804899,4.503684,4.604683,4.022221,3.697799,3.679516,3.810548,4.025773,3.743379,...,4.019058,3.572556,4.218679,3.828684,3.913574,4.375905,4.588954,3.880388,3.767514,4.582317
A2M-AS1,5.487521,4.381788,4.358112,4.278722,3.414838,4.010439,4.221470,3.862556,4.361775,4.247613,...,5.472485,3.967758,4.090893,3.826735,3.810675,6.149712,3.568877,3.631777,5.773852,4.220501
A2ML1,3.374869,3.440269,3.552498,3.665464,3.891536,4.136166,3.644683,3.438546,3.654369,3.609130,...,3.332570,3.311664,3.697214,3.460398,3.772701,3.440113,3.696379,3.399162,3.775446,3.737868
A4GALT,3.704898,4.359996,4.114879,4.030298,3.714476,5.410250,3.992105,3.672118,3.781249,4.365838,...,4.394405,4.382130,4.850040,4.445679,4.453280,4.316323,3.942271,3.803461,4.217749,3.956384
A4GNT,3.965432,3.990351,4.148009,4.152585,3.784946,4.208724,4.333106,4.017701,3.946322,3.989604,...,4.236613,4.230899,3.936097,4.144899,4.217055,4.200849,3.696395,3.986703,4.388632,4.142876
AAAS,9.142425,8.057531,7.679034,7.670979,8.158512,8.513520,8.525572,8.096706,8.778584,8.724904,...,8.845578,7.277831,7.957358,8.184515,7.893897,7.714148,8.264009,7.672069,7.250468,7.935206
AACS,10.232410,8.807101,8.416268,9.025579,8.457420,9.300541,8.821612,9.641869,6.710451,8.945153,...,10.207470,9.424270,9.065984,9.576229,8.353688,9.087157,8.575915,8.370639,9.366894,9.054392


In [273]:
#IC50 for CCLE is obtained from supplement material in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4967469/#mmc5
IC50 = pd.read_excel('mmc5.xlsx', sheet_name='TableS4A-IC50s',  skiprows = range(1, 5), header=1, index_col=1)


In [274]:

IC50.drop(columns= 'Unnamed: 0', inplace = True)
IC50.index = IC50.index.str.replace('-','')


In [275]:
IC50 = IC50.transpose()
IC50

Unnamed: 0,ALLPO,AMO1,COLO668,CORL95,DG75,ECGI10,ES5,GT3TKB,JJN3,KINGS1,...,MM1S,EW12,H2810,NCIH128,CORL32,ECC12,KPNRTBM1,CP67MEL,NCIH378,NCIH250
TL-2-105,1.412465,3.363134,3.398414,3.312450,3.650059,5.001921,1.448888,4.490765,2.208866,5.381778,...,,,,,,,,,,
TAK-715,2.113150,3.051047,3.103017,6.096484,3.107754,4.003636,1.929536,3.736883,3.011577,6.112720,...,,,,,,,,,,
CP466722,-0.756692,1.718606,3.067645,2.950619,2.175176,5.448208,1.367814,3.789823,1.366097,6.000134,...,,,,,,,,,,
BMS-345541,1.386112,2.191936,1.228204,2.848541,1.587807,3.371173,1.536514,4.036876,2.032780,4.328236,...,,,,,,,,,,
Genentech Cpd 10,-1.398149,1.541583,0.313308,3.740211,2.438582,5.000838,-0.783992,2.562712,1.361857,2.415087,...,,,,,,,,,,
GSK429286A,3.681923,3.544126,3.572447,5.216102,5.301156,5.671351,3.937856,4.799862,3.797401,6.282322,...,,,,,,,,,,
Ruxolitinib,3.081088,2.989808,3.731025,3.719220,4.153156,4.331940,3.574216,3.651463,3.421299,4.987484,...,,,,,,,,,,
SB-715992,-3.230122,-3.888980,-2.275653,-0.931930,-2.527416,0.385862,-4.086624,-4.855325,-3.716937,-1.357139,...,,,,,,,,,,
ZSTK474,-1.691070,0.275043,0.577291,2.301070,2.322550,4.308773,-1.944276,3.063025,0.262038,3.227872,...,,,,,,,,,,
KIN001-102,-0.722249,1.626300,0.940925,2.685076,2.099412,5.001921,-0.463507,4.492776,1.595150,4.407665,...,,,,,,,,,,


In [276]:
#cell lines without IC50 for specific drugs are inputed with kNN (k = 5) using KNN from VIM package of R (other jupyter to be uploaded)


In [279]:
print(IC50.shape) #They downloaded drug response data of 990 CCLE cell lines to 265 anti-cancer
#drugs measured by the half maximal inhibitory concentration (IC50) from the GDSC Project.

(265, 990)


In [178]:
#MAF - Fields:entity_id, case_id, Hugo_Symbol
#four types of nonsynonymous mutations, including missense and nonsense mutations, frameshift insertions and deletions


In [185]:
# CCLE MAF downloaded from https://data.broadinstitute.org/ccle/CCLE_DepMap_18q3_maf_20180718.txt
#Note: TODO to use the newest version in 2019 from https://depmap.org/portal/download/all/?release=DepMap+Public+19Q1&file=DepMap-2019q1-celllines_v2.csv
CCLE_maf_read = pd.read_csv('CCLE_DepMap_18q3_maf_20180718.txt', delimiter='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [186]:
CCLE_maf_read

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,...,COSMIChsCnt,ExAC_AF,WES_AC,SangerWES_AC,SangerRecalibWES_AC,RNAseq_AC,HC_AC,RD_AC,WGS_AC,Broad_ID
0,DVL1,1855,37,1,1277461,1277461,+,Silent,SNP,C,...,0,,87:39,,,,,,,ACH-001270
1,AL590822.1,0,37,1,2144416,2144416,+,Missense_Mutation,SNP,G,...,0,,23:43,,,,,,,ACH-001270
2,PLCH2,9651,37,1,2435359,2435359,+,Splice_Site,SNP,A,...,0,,9:25,,,,,,,ACH-001270
3,UBE4B,10277,37,1,10177641,10177641,+,Missense_Mutation,SNP,G,...,0,,13:95,,,,,,,ACH-001270
4,SRM,6723,37,1,11119363,11119363,+,Silent,SNP,G,...,0,,113:57,,,,,,,ACH-001270
5,BAI2,576,37,1,32205607,32205607,+,Missense_Mutation,SNP,C,...,0,,101:84,,,,,,,ACH-001270
6,S100PBP,64766,37,1,33321630,33321630,+,Silent,SNP,G,...,0,0.000008,74:109,,,,,,,ACH-001270
7,ZSCAN20,7579,37,1,33958951,33958951,+,Missense_Mutation,SNP,C,...,0,,110:210,,,,,,,ACH-001270
8,SZT2,23334,37,1,43909122,43909122,+,Nonsense_Mutation,SNP,G,...,0,,86:144,,,,,,,ACH-001270
9,MOB3C,148932,37,1,47078936,47078936,+,Missense_Mutation,SNP,G,...,0,0.000025,131:58,,,,,,,ACH-001270


In [184]:
#CCLE_ID only match CELL_LINE_NAME if "_" is removed from CELL_LINE_NAME and then mapped to  CCLE_ID removing _*
CCLE_annot = pd.read_csv('Cell_lines_annotations_20181226.txt', delimiter='\t')
CCLE_annot.columns

Index(['CCLE_ID', 'depMapID', 'Name', 'Pathology', 'Site_Primary',
       'Site_Subtype1', 'Site_Subtype2', 'Site_Subtype3', 'Histology',
       'Hist_Subtype1', 'Hist_Subtype2', 'Hist_Subtype3', 'Gender',
       'Life_Stage', 'Age', 'Race', 'Geo_Loc', 'inferred_ethnicity',
       'Site_Of_Finding', 'Disease', 'Annotation_Source',
       'Original.Source.of.Cell.Line', 'Characteristics', 'Growth.Medium',
       'Supplements', 'Freezing.Medium', 'Doubling.Time.from.Vendor',
       'Doubling.Time.Calculated.hrs', 'type', 'type_refined',
       'PATHOLOGIST_ANNOTATION', 'mutRate', 'tcga_code'],
      dtype='object')

In [362]:
CCLE_maf = pd.merge(CCLE_annot[['CCLE_ID','depMapID']],CCLE_maf_read, left_on=['depMapID'], right_on=['Broad_ID'] )

In [363]:
CCLE_maf.drop(columns=['Broad_ID','depMapID'], inplace = True)

In [364]:
CCLE_maf['CELL_LINE_NAME'] = CCLE_maf['CCLE_ID'].str.replace(r'_.*','')

In [366]:
#select only 4 types of mutations: four types of nonsynonymous mutations, including missense and nonsense mutations, 
#frameshift insertions and deletions --> only those are tagged as mutated genes in the binary matrices
print(CCLE_maf.columns)
set(CCLE_maf['Variant_Classification'].values)
print(CCLE_maf.groupby(['Variant_Classification','Variant_Type']).count())

Index(['CCLE_ID', 'Hugo_Symbol', 'Entrez_Gene_Id', 'NCBI_Build', 'Chromosome',
       'Start_position', 'End_position', 'Strand', 'Variant_Classification',
       'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Genome_Change', 'Annotation_Transcript',
       'Tumor_Sample_Barcode', 'cDNA_Change', 'Codon_Change', 'Protein_Change',
       'isDeleterious', 'isTCGAhotspot', 'TCGAhsCnt', 'isCOSMIChotspot',
       'COSMIChsCnt', 'ExAC_AF', 'WES_AC', 'SangerWES_AC',
       'SangerRecalibWES_AC', 'RNAseq_AC', 'HC_AC', 'RD_AC', 'WGS_AC',
       'CELL_LINE_NAME'],
      dtype='object')
                                       CCLE_ID  Hugo_Symbol  Entrez_Gene_Id  \
Variant_Classification   Variant_Type                                         
3'UTR                    DEL                50           50              50   
                         INS                12           12              12   
5'Flank                  DEL                64      

In [367]:
CCLE_maf = CCLE_maf[['Variant_Classification', 'Hugo_Symbol', 'CELL_LINE_NAME']]

In [368]:
mutated = ['Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins', 'Missense_Mutation','Nonsense_Mutation']
CCLE_maf.loc[:,'Mutated']   = 0
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mutated),'Mutated'] = 1

In [392]:
CCLE_maf.Mutated.value_counts()



1    738013
0    419289
Name: Mutated, dtype: int64

In [2]:
CCLE_maf.drop(columns=['Variant_Classification'], inplace = True)

In [378]:
CCLE_maf_matrix = CCLE_maf.groupby([ 'CELL_LINE_NAME','Hugo_Symbol']).count()

In [379]:
CCLE_maf_matrix = CCLE_maf_matrix.unstack(level=0)

In [406]:
CCLE_maf_matrix.shape

(19278, 1414)

In [401]:
CCLE_maf_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file is assumed to be wildtype 

Unnamed: 0_level_0,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated
CELL_LINE_NAME,201T,22RV1,2313287,253J,253JBV,42MGBA,451LU,5637,59M,639V,...,YD10B,YD15,YD38,YD8,YH13,YKG1,YMB1E,YT,ZR751,ZR7530
Hugo_Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AADAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [446]:
import os
import glob

#TCGA MAF data is obtained from GDC Portal using a manifest to download this query MAF files + open: 
#https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.access%22%2C%22value%22%3A%5B%22open%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_format%22%2C%22value%22%3A%5B%22MAF%22%5D%7D%7D%5D%7D
#long process 
if not os.path.exists('maf_TCGA.csv'):
    paths = glob.glob('maf_TCGA/**/*.somatic.maf.gz' , recursive=True)

    df = pd.DataFrame()
    i = 0
    for maf in paths:
        i+=1
        print(i)
        d = pd.read_csv(maf, compression='gzip', delimiter='\t', skiprows= 5)
        if d.columns[0] == 'Hugo_Symbol' :
            df = df.append(d)
        else:
            print(d.columns[0])
    df.to_csv('maf_TCGA.csv')
            
else:    
    TCGA_maf_read = pd.read_csv('maf_TCGA.csv', usecols=['Variant_Classification', 'Variant_Type','Hugo_Symbol', 'case_id'])


    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


  interactivity=interactivity, compiler=compiler, result=result)


20


  interactivity=interactivity, compiler=compiler, result=result)


21
22
23


  interactivity=interactivity, compiler=compiler, result=result)


24
25
26
27
28
29
30
31
32
33


  interactivity=interactivity, compiler=compiler, result=result)


34
35


  interactivity=interactivity, compiler=compiler, result=result)


36
37
38
39
40
41


  interactivity=interactivity, compiler=compiler, result=result)


42
43
44


  interactivity=interactivity, compiler=compiler, result=result)


45
46
47
48
49


  interactivity=interactivity, compiler=compiler, result=result)


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72


  interactivity=interactivity, compiler=compiler, result=result)


73
74
75


  interactivity=interactivity, compiler=compiler, result=result)


76
77


  interactivity=interactivity, compiler=compiler, result=result)


78
79
80
81
82
83
84


  interactivity=interactivity, compiler=compiler, result=result)


85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121


  interactivity=interactivity, compiler=compiler, result=result)


122
123


  interactivity=interactivity, compiler=compiler, result=result)


124
125
126
127
128
129
130
131


In [449]:
#select only 4 types of mutations: four types of nonsynonymous mutations, including missense and nonsense mutations, 
#frameshift insertions and deletions --> only those are tagged as mutated genes in the binary matrices
print(TCGA_maf_read.columns)
set(TCGA_maf_read['Variant_Classification'].values)
print(TCGA_maf_read.groupby(['Variant_Classification','Variant_Type']).count())

Index(['Hugo_Symbol', 'Variant_Classification', 'Variant_Type', 'case_id'], dtype='object')
                                     Hugo_Symbol  case_id
Variant_Classification Variant_Type                      
3'Flank                DEL                  6202     6202
                       INS                  1779     1779
                       SNP                 78791    78791
3'UTR                  DEL                 73092    73092
                       INS                 21134    21134
                       SNP                737635   737635
5'Flank                DEL                  2183     2183
                       INS                   732      732
                       SNP                 62431    62431
5'UTR                  DEL                  8361     8361
                       INS                  2933     2933
                       SNP                205110   205110
Frame_Shift_Del        DEL                166957   166957
Frame_Shift_Ins        INS            

In [450]:

mutated = ['Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins', 'Missense_Mutation','Nonsense_Mutation']
TCGA_maf_read.loc[:,'Mutated']   = 0
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mutated),'Mutated'] = 1

In [451]:
TCGA_maf_read.Mutated.value_counts()

1    6525124
0    4391042
Name: Mutated, dtype: int64

In [452]:
TCGA_maf_read.drop(columns=['Variant_Classification','Variant_Type'], inplace = True)

In [454]:
TCGA_maf_matrix = TCGA_maf_read.groupby([ 'case_id','Hugo_Symbol']).count()

In [455]:
TCGA_maf_matrix = TCGA_maf_matrix.unstack(level=0)

In [456]:
TCGA_maf_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file are assumed to be wildtype 

Unnamed: 0_level_0,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated,Mutated
case_id,0004d251-3f70-4395-b175-c94c2f5b1b81,000d566c-96c7-4f1c-b36e-fa2222467983,0011a67b-1ba9-4a32-a6b8-7850759a38cf,001887aa-36d0-463f-8bca-dec7043b4f2e,001944e5-af34-4061-9c09-bb9ea346f6fd,001ad307-4ad3-4f1d-b2fc-efc032871c7e,001cef41-ff86-4d3f-a140-a647ac4b10a1,001e0309-9c50-42b0-9e38-347883ee2cd3,0022478c-4dfd-4cbe-a05e-fb20310844e3,0024ab57-4036-4b0f-b7a1-040f97787022,...,ffc73551-55e9-4bbb-bd15-76088551964b,ffc915b8-cacd-4974-a040-ee496f0efc0e,ffcec8e5-9fd3-4b42-a7cb-74761f713cf4,ffcf851d-7fa1-4b45-911a-a3fbd74c253a,ffcfa005-a04f-458e-9d1d-86143dd823e5,ffd8d31f-bc4b-4e19-bbaf-0e26e9f3a107,ffedc8be-1056-4205-b9d9-99b5bdb872db,fff304a2-113f-499d-a88c-9d3660c348d9,fff35c80-88cd-4923-80c1-0273ba5bed0f,fffdb1d9-58d1-425c-ac12-1e1e5f443bf7
Hugo_Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2MP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [458]:
#Genes with no mutations in CCLE and TCGA samples were eliminated.
genes_mut_TCGA_CCLE = set(TCGA_maf_matrix.index.values)and set(CCLE_maf_matrix.index.values)
print(len(genes_mut_TCGA_CCLE))

19278


In [467]:
TCGA_maf_matrix_r = TCGA_maf_matrix.loc[list(genes_mut_TCGA_CCLE),:]
TCGA_maf_matrix_r.shape

(19278, 10189)

In [466]:
CCLE_maf_matrix_r = CCLE_maf_matrix.loc[list(genes_mut_TCGA_CCLE),:]
CCLE_maf_matrix_r.shape

(19278, 1414)

In [487]:
#Summary of data
print(len(set(CCLE_maf['CELL_LINE_NAME'].values)))
print(len(IC50.columns)) #cell lines
print(len(set(CCLE_matrix.columns)))
print(len(set(CCLE_maf['Hugo_Symbol'].values)))
print("N Cell lines with MAF + expression + IC50 information")
#Counts of cell lines with MAF + expression + IC50 information
print(len( set(IC50.columns) and set(CCLE_maf['CELL_LINE_NAME'].values) and set(CCLE_matrix.columns.values)))
#Couts of genes included in expression and MAF files
print("N genes with MAF and expression in cell lines")
print(len(set(CCLE_maf['Hugo_Symbol'].values) and set(CCLE_matrix.index.values)))
print('N of drugs tested on the cell lines')
print(len(IC50.index))
#Counts of tumors with TCGA MAF and gene expression
print("N tumors with TCGA MAF and gene expression")
tumors_TCGA = set(TCGA_maf_matrix.columns.values)and set(TCGA_exp_matrix.columns.values)
print(len(tumors_TCGA))


1414
990
823
19278
N Cell lines with MAF + expression + IC50 information
823
N genes with MAF and expression in cell lines
18541
N of drugs tested on the cell lines
265
N tumors with TCGA MAF and gene expression
10340


In [496]:
TCGA_exp_matrix.head() 
TCGA_exp_matrix.to_csv('E_TCGA.csv')

In [497]:
CCLE_matrix.head() 
CCLE_matrix.to_csv('E_CCLE.csv')

In [498]:
TCGA_maf_matrix.head()
TCGA_maf_matrix.to_csv('M_TCGA.csv')

In [499]:
CCLE_maf_matrix.head()
CCLE_maf_matrix.to_csv('M_CCLE.csv')

In [3]:
IC50.head()
IC50.to_csv('IC50_CCLE.csv')