In [1]:
##Preparing data as per https://doi.org/10.1186/s12920-018-0460-9 
##Predicting drug response of tumors from integrated genomic profiles by deep neural networks
##Yu-Chiao Chiu, Hung-I Harry Chen, Tinghe Zhang, Songyao Zhang, Aparna Gorthi, Li-Ju Wang, Yufei Huang

In [2]:
import pandas as pd
import numpy as np
import h5py

In [3]:
#TCGA_exp = pd.read_csv('counts_gene.tsv', delimiter='\t')  --> TCGA exp downloaded from GDC Portal is not used , it contains FPKM instead of TPM, 
#while those downloaded from https://amp.pharm.mssm.edu/archs4/download.html  contains a selection of 25.150 genes

In [4]:
#print(TCGA_exp.shape)
#TCGA_exp.columns[-1]

In [5]:
#TCGA_exp.loc[TCGA_exp['3DFF72D2-F292-497E-ACE3-6FAA9C884205'] == 439935,'gene_id']

In [6]:
import mygene
mg = mygene.MyGeneInfo()
mg.getgene('ENSGR0000223511', 'name,symbol,refseq.rna') 

In [7]:
mg.getgene(150465, 'name,symbol,refseq.rna')

{'_id': '150465',
 '_score': 13.8292675,
 'name': 'tubulin tyrosine ligase',
 'refseq': {'rna': ['NM_153712.5', 'XM_005263599.3', 'XM_011510665.2']},
 'symbol': 'TTL'}

In [8]:
#sample_submitter_id from /maf_TCGA/sample.tsv
TCGA_meta = pd.read_csv('./maf_TCGA/sample.tsv', delimiter='\t')
print(TCGA_meta.shape)
TCGA_meta['sample_submitter'] = TCGA_meta['sample_submitter_id'].str[:-1]
TCGA_meta = TCGA_meta.drop_duplicates(subset='sample_submitter', keep='first')
TCGA_meta.shape

(31049, 36)


  interactivity=interactivity, compiler=compiler, result=result)


(21800, 37)

In [9]:
TCGA_exp = pd.read_csv('TreehousePEDv5_unique_hugo_log2_tpm_plus_1.2018-05-09.tsv', delimiter='\t' , index_col=0)

In [10]:
TCGA_transposed = TCGA_exp.transpose()
print(TCGA_transposed.shape)

print(len(set(TCGA_meta['sample_submitter'].values) & set(TCGA_transposed.index.values)))
TCGA_meta.loc[TCGA_meta['sample_submitter'].isin(['TCGA-02-0047-01']),:]

(11258, 58581)
9327


Unnamed: 0,sample_id,sample_submitter_id,case_id,case_submitter_id,project_id,sample_type_id,time_between_excision_and_freezing,oct_embedded,tumor_code_id,intermediate_dimension,...,biospecimen_laterality,days_to_sample_procurement,freezing_method,preservation_method,growth_rate,days_to_collection,catalog_reference,initial_weight,longest_dimension,sample_submitter
20156,9e5f2355-b872-4b5a-94f1-6eef49bb73de,TCGA-02-0047-01A,3caf009f-d9e0-4c57-b1d9-8eb59fc833bd,TCGA-02-0047,TCGA-GBM,1,--,--,--,--,...,--,--,--,--,--,--,--,--,--,TCGA-02-0047-01


In [11]:
TCGA_exp_matrix = pd.merge(TCGA_meta[['sample_submitter','case_id']], TCGA_transposed, left_on = 'sample_submitter', right_on = TCGA_transposed.index)


In [12]:
TCGA_exp_matrix.drop(columns='sample_submitter',inplace=True)

In [13]:
TCGA_exp_matrix.index = TCGA_exp_matrix.case_id

In [14]:
TCGA_exp_matrix = TCGA_exp_matrix.transpose()


In [15]:
TCGA_exp_matrix.drop(index = 'case_id', inplace = True)

In [16]:
TCGA_exp_matrix.columns.name = None

In [17]:
TCGA_exp_matrix.head()

Unnamed: 0,d3b7ada6-5395-4212-a921-8e97c92b605d,065f7b7a-05ee-4728-bcfc-6095357512a6,065f7b7a-05ee-4728-bcfc-6095357512a6.1,c328f661-c109-4e01-b819-299a75bd4348,ebe927e6-0a7e-4c97-ad73-f3c302cca7bc,c739fd61-22b2-412d-bcf3-89bda45a2c0f,a7fd6522-8953-461f-9e5f-fdeb7d180d46,75fb484d-cd12-4cf0-a99d-faf27aead118,86d1e388-b30f-48fd-9647-3848eb6134f1,2b715eb2-055e-4bc5-bc98-128e8bf6954f,...,4dc5cffe-ed8c-47ab-b5fc-f9615b5414f7,40551d53-388b-4067-9341-76e94d023703,0775583e-c0a0-4f18-9ca2-8f89cedce3d6,e76cf9a0-061e-4a7d-ba28-5a40b8da54b8,5166d82f-7b22-4101-bea9-6056e5a74c48,5cf3911a-f3ea-4c0e-851d-5bb57ca7190f,aaacf22c-2362-4613-b024-88fb7dd1cca1,95b69517-4971-450e-9ed0-43bc8b2c0534,b4f7f9e4-5fc5-4461-8e0b-0b6c384863c6,180a2b9c-9cac-4d96-86e8-cd6b3b9386b5
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224475,0.0,...,0.583365,0.0,0.0,0.0,0.0,0.0,0.201713,0.0,0.04733,0.0
A1BG,1.96719,6.04677,6.47885,1.42767,5.45973,4.06349,1.60884,3.91263,1.98914,3.80017,...,2.85405,0.669106,1.18278,3.71367,5.40092,6.50061,3.53732,6.06716,3.02502,8.42211
A1BG-AS1,0.748536,3.04443,2.92221,0.495795,2.74851,1.69601,0.722542,1.85204,0.65087,2.00361,...,1.64619,0.298782,0.660012,0.790837,3.27802,0.978248,1.42227,0.443721,1.48548,0.623024


In [18]:
#CCLE_NAME only match CELL_LINE_NAME if "_" is removed from CELL_LINE_NAME and then mapped to  CCLE_ID removing _*
CCLE_annot = pd.read_csv('DepMap-2019q1-celllines_v2.csv')
CCLE_annot.columns

Index(['DepMap_ID', 'CCLE_Name', 'Aliases', 'COSMIC_ID', 'Sanger ID',
       'Primary Disease', 'Subtype Disease', 'Gender', 'Source'],
      dtype='object')

In [19]:
#GDSC (not used) -> sanger1018_brainarray_ensemblgene_rma.txt, other gen_drug ML predictor: FORESEE https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz145/5367831 
#CCLE exp -> v21.data.gex_avg_log2.txt, v21.meta.gex_features.txt is downloaded from ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/ 
#CCLE_read_exp = pd.read_csv('CTRPv2.1_2016_pub_NatChemBiol_12_109/v21.data.gex_avg_log2.txt', delimiter='\t')
CCLE_read_exp = pd.read_csv('CCLE_depMap_19Q1_TPM.csv', index_col=0)


In [20]:
CCLE_exp = pd.merge(CCLE_annot[['CCLE_Name','DepMap_ID']],CCLE_read_exp, left_on=['DepMap_ID'], right_on=CCLE_read_exp.index )


In [21]:
CCLE_exp.drop(columns=['DepMap_ID'], inplace = True)

In [22]:
CCLE_exp.head()


Unnamed: 0,CCLE_Name,TSPAN6 (ENSG00000000003),TNMD (ENSG00000000005),DPM1 (ENSG00000000419),SCYL3 (ENSG00000000457),C1orf112 (ENSG00000000460),FGR (ENSG00000000938),CFH (ENSG00000000971),FUCA2 (ENSG00000001036),GCLC (ENSG00000001084),...,RP11-309M23.1 (ENSGR0000237531),AMDP1 (ENSGR0000237801),BX649553.1 (ENSGR0000263835),BX649553.2 (ENSGR0000263980),BX649553.3 (ENSGR0000264510),BX649553.4 (ENSGR0000264819),RN7SL355P (ENSGR0000265350),MIR3690 (ENSGR0000265658),AL732314.1 (ENSGR0000266731),AJ271736.10 (ENSGR0000270726)
0,NIHOVCAR3_OVARY,5.851999,0.0,7.390083,2.097611,4.253233,0.042644,0.879706,5.666188,4.542258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.163499,0.0,5.659639,1.238787,3.040892,4.250962,0.163499,4.168321,4.0268,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CACO2_LARGE_INTESTINE,6.054631,0.084064,7.760487,1.851999,3.87578,0.0,0.056584,6.594399,4.57107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,2.615887,0.0,5.32337,2.405992,3.902074,0.925999,4.888987,3.896272,4.835924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,3.06695,0.0,5.762615,2.992768,5.359662,0.238787,5.700162,4.14323,5.380591,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
CCLE_exp['CELL_LINE_NAME'] = CCLE_exp['CCLE_Name'].str.replace(r'_.*','')

In [24]:
CCLE_exp.index = CCLE_exp['CELL_LINE_NAME']
CCLE_exp.drop(columns=['CELL_LINE_NAME'], inplace = True)


In [25]:
CCLE_exp.drop(columns=['CCLE_Name'], inplace = True)


In [26]:
CCLE_matrix = CCLE_exp.transpose()


In [27]:
CCLE_matrix.columns.name = None


In [28]:
CCLE_matrix['t'] = CCLE_matrix.index

In [29]:
CCLE_matrix['new_index'] = CCLE_matrix['t'].str.replace(r'(\.)*\s*(\(.*)*','')


In [30]:
CCLE_matrix.index = CCLE_matrix['new_index']
CCLE_matrix.drop(columns=['new_index','t'], inplace=True)


In [31]:
del CCLE_matrix.index.name 

In [32]:
#the same genes need to be on both CCLE_exp and TCGA_exp
exp_genes = list(set(TCGA_exp_matrix.index) & set(CCLE_matrix.index))
CCLE_matrix_filtered = CCLE_matrix.loc[exp_genes, :]
CCLE_matrix_f = CCLE_matrix_filtered[~CCLE_matrix_filtered.index.duplicated(keep='first')]
TCGA_exp_matrix_filtered = TCGA_exp_matrix.loc[exp_genes, :]
TCGA_exp_matrix_f = TCGA_exp_matrix_filtered[~TCGA_exp_matrix_filtered.index.duplicated(keep='first')]
print(CCLE_matrix_filtered.shape)
print(TCGA_exp_matrix_filtered.shape)
print(CCLE_matrix_f.shape)
print(TCGA_exp_matrix_f.shape)

(35620, 1165)
(33645, 9327)
(33645, 1165)
(33645, 9327)


In [33]:
CCLE_matrix_f.loc['A1BG','22RV1']

1.72246602447

In [34]:
#IC50 for CCLE as provided by authors after imputation
IC50_CCLE_imputed = pd.read_csv('ic50_data_imputed.txt', delimiter='\t', index_col=0, header=0)

In [35]:
del IC50_CCLE_imputed.index.name

In [36]:
#IC50 for CCLE obtained from supplement material in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4967469/#mmc5 
#Caveat of this source: even if it has a larger number of cells (990), many of them are NaN, so imputation of values for 
#those cell lines and use them as ground truth seems too inacurate. 
IC50 = pd.read_excel('mmc5.xlsx', sheet_name='TableS4A-IC50s',  skiprows = range(1, 5), header=1, index_col=1)


In [37]:

IC50.drop(columns= 'Unnamed: 0', inplace = True)
IC50.index = IC50.index.str.replace('-','')


In [38]:
IC50 = IC50.transpose()
IC50.head()

Unnamed: 0,ALLPO,AMO1,COLO668,CORL95,DG75,ECGI10,ES5,GT3TKB,JJN3,KINGS1,...,MM1S,EW12,H2810,NCIH128,CORL32,ECC12,KPNRTBM1,CP67MEL,NCIH378,NCIH250
TL-2-105,1.412465,3.363134,3.398414,3.31245,3.650059,5.001921,1.448888,4.490765,2.208866,5.381778,...,,,,,,,,,,
TAK-715,2.11315,3.051047,3.103017,6.096484,3.107754,4.003636,1.929536,3.736883,3.011577,6.11272,...,,,,,,,,,,
CP466722,-0.756692,1.718606,3.067645,2.950619,2.175176,5.448208,1.367814,3.789823,1.366097,6.000134,...,,,,,,,,,,
BMS-345541,1.386112,2.191936,1.228204,2.848541,1.587807,3.371173,1.536514,4.036876,2.03278,4.328236,...,,,,,,,,,,
Genentech Cpd 10,-1.398149,1.541583,0.313308,3.740211,2.438582,5.000838,-0.783992,2.562712,1.361857,2.415087,...,,,,,,,,,,


In [39]:
#cell lines without IC50 for specific drugs are inputed with kNN (k = 5) using KNN from VIM package of R (other jupyter to be uploaded)


In [40]:
print(IC50.shape) #drug response data of 990 CCLE cell lines to 265 anti-cancer
#drugs measured by the half maximal inhibitory concentration (IC50) from the GDSC Project.

(265, 990)


In [41]:
#MAF - Fields:entity_id, case_id, Hugo_Symbol
#four types of nonsynonymous mutations, including missense and nonsense mutations, frameshift insertions and deletions


In [109]:
#CCLE MAF uses the newest version in 2019 from https://depmap.org/portal/download/all/?release=DepMap+Public+19Q1&file=DepMap-2019q1-celllines_v2.csv
#CCLE MAF downloaded from https://data.broadinstitute.org/ccle/CCLE_DepMap_18q3_maf_20180718.txt is not longer used
CCLE_maf_read = pd.read_csv('depmap_19Q1_mutation_calls.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [110]:
CCLE_maf = pd.merge(CCLE_annot[['CCLE_Name','DepMap_ID']],CCLE_maf_read, left_on=['DepMap_ID'], right_on=['DepMap_ID'] )

In [111]:
CCLE_maf.drop(columns=['DepMap_ID'], inplace = True)

In [112]:
CCLE_maf['CELL_LINE_NAME'] = CCLE_maf['CCLE_Name'].str.replace(r'_.*','')

In [113]:
#select only 4 types of mutations: four types of nonsynonymous mutations, including missense and nonsense mutations, 
#frameshift insertions and deletions --> only those are tagged as mutated genes in the binary matrices
print(CCLE_maf.columns)
print(set(CCLE_maf['Variant_Classification'].values))
print(CCLE_maf.groupby(['Variant_Classification','Variant_Type']).count())

Index(['CCLE_Name', 'Unnamed: 0', 'Hugo_Symbol', 'Entrez_Gene_Id',
       'NCBI_Build', 'Chromosome', 'Start_position', 'End_position', 'Strand',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'dbSNP_RS', 'dbSNP_Val_Status', 'Genome_Change',
       'Annotation_Transcript', 'Tumor_Sample_Barcode', 'cDNA_Change',
       'Codon_Change', 'Protein_Change', 'isDeleterious', 'isTCGAhotspot',
       'TCGAhsCnt', 'isCOSMIChotspot', 'COSMIChsCnt', 'ExAC_AF', 'VA_WES_AC',
       'CGA_WES_AC', 'SangerWES_AC', 'SangerRecalibWES_AC', 'RNAseq_AC',
       'HC_AC', 'RD_AC', 'WGS_AC', 'Variant_annotation', 'CELL_LINE_NAME'],
      dtype='object')
{nan, "3'UTR", "5'Flank", 'De_novo_Start_OutOfFrame', 'Start_Codon_SNP', 'Frame_Shift_Del', 'Start_Codon_Del', 'Intron', 'In_Frame_Ins', 'IGR', 'Stop_Codon_Ins', 'Stop_Codon_Del', 'Silent', 'Splice_Site', 'Nonsense_Mutation', 'In_Frame_Del', 'Missense_Mutation', 'Start_Codon_Ins', 'Nonstop_Mutation', "5'UTR", 'F

In [114]:
CCLE_maf = CCLE_maf[['Variant_Classification', 'Hugo_Symbol', 'CELL_LINE_NAME']]

In [115]:
#Severity Score by type of mutation: https://ucsc-xena.gitbook.io/project/faq/basic-xena-browser
#4 --> Nonsense_Mutation, frameshift_variant, stop_gained, splice_acceptor_variant, 
#splice_acceptor_variant&intron_variant, splice_donor_variant, splice_donor_variant&intron_variant, 
#Splice_Site, Frame_Shift_Del, Frame_Shift_Ins
mut_4 = ['Splice_Site','Frame_Shift_Ins','Nonsense_Mutation','Frame_Shift_Del','Stop_Codon_Ins']
#3 --> splice_region_variant, splice_region_variant&intron_variant, missense, 
#non_coding_exon_variant, missense_variant, Missense_Mutation, exon_variant, 
#RNA, Indel, start_lost, start_gained, De_novo_Start_OutOfFrame, Translation_Start_Site, 
#De_novo_Start_InFrame, stop_lost, Nonstop_Mutation, initiator_codon_variant, 
#5_prime_UTR_premature_start_codon_gain_variant, disruptive_inframe_deletion, 
#inframe_deletion, inframe_insertion, In_Frame_Del, In_Frame_Ins
mut_3 = ['Nonstop_Mutation','Start_Codon_SNP','Missense_Mutation','In_Frame_Del',
         'Stop_Codon_Del','In_Frame_Ins', 'Start_Codon_Del', 'Start_Codon_Ins', 
         'De_novo_Start_OutOfFrame', 'Translation_Start_Site', 'RNA']
#2 --> synonymous_variant, 5_prime_UTR_variant, 3_prime_UTR_variant, 5'Flank, 3'Flank, 3'UTR, 5'UTR, 
#Silent, stop_retained_variant
mut_2 = ['Silent',"5'Flank","3'UTR","5'UTR","3'Flank"]
#1 --> others, SV, upstreamgenevariant, downstream_gene_variant, intron_variant, intergenic_region
mut_1 = [np.nan, 'IGR','Intron']
#0 --> Wild type

In [116]:

mutated = ['Splice_Site','Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins', 'Missense_Mutation','Nonsense_Mutation']
CCLE_maf.loc[:,'Mutated']   = 0
CCLE_maf.loc[:,'Mutated_score']   = 0
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mutated),'Mutated'] = 1
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mut_4),'Mutated_score'] = 4
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mut_3),'Mutated_score'] = 3
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mut_2),'Mutated_score'] = 2
CCLE_maf.loc[CCLE_maf['Variant_Classification'].isin(mut_1),'Mutated_score'] = 1

In [117]:
CCLE_maf.Mutated_score.value_counts(dropna=False)

3    699185
2    394957
4    148476
1       527
Name: Mutated_score, dtype: int64

In [118]:
CCLE_maf_score_matrix = CCLE_maf.drop(columns=['Variant_Classification','Mutated'])
CCLE_maf_score_matrix = CCLE_maf_score_matrix.groupby([ 'CELL_LINE_NAME','Hugo_Symbol']).max()
CCLE_maf_score_matrix.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mutated_score
CELL_LINE_NAME,Hugo_Symbol,Unnamed: 2_level_1
127399,ABCA10,3
127399,ABCA2,4
127399,ABCA5,3
127399,ABCA9,3
127399,ABCC10,4


In [119]:
CCLE_maf_score_matrix = CCLE_maf_score_matrix.unstack(level=0)

In [120]:
CCLE_maf_score_matrix.shape

(19350, 1585)

In [121]:
del CCLE_maf_score_matrix.index.name
CCLE_maf_score_matrix.columns = CCLE_maf_score_matrix.columns.droplevel() 
CCLE_maf_score_matrix.columns.name = None

In [122]:
CCLE_maf_score_matrix = CCLE_maf_score_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file is assumed to be wildtype 
CCLE_maf_score_matrix.head()

Unnamed: 0,127399,201T,22RV1,2313287,253J,253JBV,42MGBA,451LU,5637,59M,...,YD15,YD38,YD8,YH13,YKG1,YMB1E,YT,ZR751,ZR7530,[MERGED
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
A2ML1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
CCLE_maf_score_matrix.apply(pd.value_counts)

Unnamed: 0,127399,201T,22RV1,2313287,253J,253JBV,42MGBA,451LU,5637,59M,...,YD15,YD38,YD8,YH13,YKG1,YMB1E,YT,ZR751,ZR7530,[MERGED
0.0,19088.0,18944.0,16800,16730,18953.0,18914.0,19084.0,18732.0,18726,19022,...,19124.0,18744.0,18908.0,19017.0,18873.0,19002.0,18629.0,19003.0,18807.0,17040.0
1.0,,,9,5,,,,,2,1,...,,,,,,,,,,
2.0,75.0,119.0,495,558,89.0,102.0,61.0,177.0,167,74,...,62.0,153.0,124.0,105.0,127.0,101.0,194.0,98.0,172.0,493.0
3.0,150.0,256.0,1342,1381,256.0,275.0,166.0,385.0,397,206,...,142.0,386.0,267.0,192.0,303.0,220.0,467.0,203.0,322.0,1536.0
4.0,37.0,31.0,704,676,52.0,59.0,39.0,56.0,58,47,...,22.0,67.0,51.0,36.0,47.0,27.0,60.0,46.0,49.0,281.0


In [124]:
CCLE_maf_score_matrix.isna().sum().sum()

0

In [125]:
CCLE_maf_score_matrix['127399'].value_counts()

0.0    19088
3.0      150
2.0       75
4.0       37
Name: 127399, dtype: int64

In [126]:
CCLE_maf.Mutated.value_counts()



1    840301
0    402844
Name: Mutated, dtype: int64

In [129]:
CCLE_maf.drop(columns=['Mutated_score'], inplace = True)

In [130]:
CCLE_maf.columns

Index(['Hugo_Symbol', 'CELL_LINE_NAME', 'Mutated'], dtype='object')

In [131]:
CCLE_maf_matrix = CCLE_maf.groupby([ 'CELL_LINE_NAME','Hugo_Symbol']).count()

In [132]:
CCLE_maf_matrix = CCLE_maf_matrix.unstack(level=0)

In [133]:
CCLE_maf_matrix.shape

(19350, 1585)

In [134]:
del CCLE_maf_matrix.index.name
CCLE_maf_matrix.columns = CCLE_maf_matrix.columns.droplevel() 
CCLE_maf_matrix.columns.name = None

In [135]:
CCLE_maf_matrix = CCLE_maf_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file is assumed to be wildtype 
CCLE_maf_matrix.head()

Unnamed: 0,127399,201T,22RV1,2313287,253J,253JBV,42MGBA,451LU,5637,59M,...,YD15,YD38,YD8,YH13,YKG1,YMB1E,YT,ZR751,ZR7530,[MERGED
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A2ML1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
import os
import glob

#TCGA MAF data is obtained from GDC Portal using a manifest to download this query MAF files + open: 
#https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.access%22%2C%22value%22%3A%5B%22open%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_format%22%2C%22value%22%3A%5B%22MAF%22%5D%7D%7D%5D%7D
#long process 
if not os.path.exists('maf_TCGA.csv'):
    paths = glob.glob('maf_TCGA/**/*.somatic.maf.gz' , recursive=True)

    df = pd.DataFrame()
    i = 0
    for maf in paths:
        i+=1
        print(i)
        d = pd.read_csv(maf, compression='gzip', delimiter='\t', skiprows= 5)
        if d.columns[0] == 'Hugo_Symbol' :
            df = df.append(d)
        else:
            print(d.columns[0])
    df.to_csv('maf_TCGA.csv')
            
else:    
    TCGA_maf_read = pd.read_csv('maf_TCGA.csv', usecols=['Variant_Classification', 'Variant_Type','Hugo_Symbol', 'case_id'])


    

In [73]:
#select only 4 types of mutations: four types of nonsynonymous mutations, including missense and nonsense mutations, 
#frameshift insertions and deletions --> only those are tagged as mutated genes in the binary matrices
#Red --> Nonsense_Mutation, frameshift_variant, stop_gained, splice_acceptor_variant, splice_acceptor_variant&intron_variant, splice_donor_variant, splice_donor_variant&intron_variant, Splice_Site, Frame_Shift_Del, Frame_Shift_Ins
print(TCGA_maf_read.columns)
set(TCGA_maf_read['Variant_Classification'].values)
print(TCGA_maf_read.groupby(['Variant_Classification','Variant_Type']).count())

Index(['Hugo_Symbol', 'Variant_Classification', 'Variant_Type', 'case_id'], dtype='object')
                                     Hugo_Symbol  case_id
Variant_Classification Variant_Type                      
3'Flank                DEL                  6202     6202
                       INS                  1779     1779
                       SNP                 78791    78791
3'UTR                  DEL                 73092    73092
                       INS                 21134    21134
                       SNP                737635   737635
5'Flank                DEL                  2183     2183
                       INS                   732      732
                       SNP                 62431    62431
5'UTR                  DEL                  8361     8361
                       INS                  2933     2933
                       SNP                205110   205110
Frame_Shift_Del        DEL                166957   166957
Frame_Shift_Ins        INS            

In [74]:
print(set(TCGA_maf_read['Variant_Classification'].values))

{"3'Flank", 'Intron', "3'UTR", 'Nonstop_Mutation', "5'Flank", 'Splice_Region', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation', 'IGR', 'Translation_Start_Site', "5'UTR", 'Nonsense_Mutation', 'Frame_Shift_Ins', 'RNA', 'Silent', 'Splice_Site', 'Frame_Shift_Del'}


In [76]:
mutated = ['Splice_Site','Frame_Shift_Del','Frame_Shift_Ins','In_Frame_Del','In_Frame_Ins', 'Missense_Mutation','Nonsense_Mutation']
TCGA_maf_read.loc[:,'Mutated']   = 0
TCGA_maf_read.loc[:,'Mutated_score']   = 0
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mutated),'Mutated'] = 1
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mut_4),'Mutated_score'] = 4
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mut_3),'Mutated_score'] = 3
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mut_2),'Mutated_score'] = 2
TCGA_maf_read.loc[TCGA_maf_read['Variant_Classification'].isin(mut_1),'Mutated_score'] = 1

In [78]:
TCGA_maf_read.Mutated_score.value_counts(dropna=False)

3    6029513
2    3488750
4     837246
1     450623
0     110034
Name: Mutated_score, dtype: int64

In [138]:
TCGA_maf_read.columns

Index(['Hugo_Symbol', 'Variant_Classification', 'Variant_Type', 'case_id',
       'Mutated', 'Mutated_score'],
      dtype='object')

In [82]:
TCGA_maf_score_matrix = TCGA_maf_read.drop(columns=['Variant_Classification','Mutated', 'Variant_Type'])
TCGA_maf_score_matrix = TCGA_maf_score_matrix.groupby([ 'case_id','Hugo_Symbol']).max()
TCGA_maf_score_matrix.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mutated_score
case_id,Hugo_Symbol,Unnamed: 2_level_1
0004d251-3f70-4395-b175-c94c2f5b1b81,ADGRA2,3
0004d251-3f70-4395-b175-c94c2f5b1b81,ADGRD2,3
0004d251-3f70-4395-b175-c94c2f5b1b81,ADGRF1,2
0004d251-3f70-4395-b175-c94c2f5b1b81,APBA2,2
0004d251-3f70-4395-b175-c94c2f5b1b81,ARFIP2,3


In [83]:
TCGA_maf_score_matrix = TCGA_maf_score_matrix.unstack(level=0)

In [84]:
TCGA_maf_score_matrix.shape

(22109, 10189)

In [85]:
del TCGA_maf_score_matrix.index.name
TCGA_maf_score_matrix.columns = TCGA_maf_score_matrix.columns.droplevel() 
TCGA_maf_score_matrix.columns.name = None

In [86]:
TCGA_maf_score_matrix = TCGA_maf_score_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file is assumed to be wildtype 
TCGA_maf_score_matrix.head()

Unnamed: 0,0004d251-3f70-4395-b175-c94c2f5b1b81,000d566c-96c7-4f1c-b36e-fa2222467983,0011a67b-1ba9-4a32-a6b8-7850759a38cf,001887aa-36d0-463f-8bca-dec7043b4f2e,001944e5-af34-4061-9c09-bb9ea346f6fd,001ad307-4ad3-4f1d-b2fc-efc032871c7e,001cef41-ff86-4d3f-a140-a647ac4b10a1,001e0309-9c50-42b0-9e38-347883ee2cd3,0022478c-4dfd-4cbe-a05e-fb20310844e3,0024ab57-4036-4b0f-b7a1-040f97787022,...,ffc73551-55e9-4bbb-bd15-76088551964b,ffc915b8-cacd-4974-a040-ee496f0efc0e,ffcec8e5-9fd3-4b42-a7cb-74761f713cf4,ffcf851d-7fa1-4b45-911a-a3fbd74c253a,ffcfa005-a04f-458e-9d1d-86143dd823e5,ffd8d31f-bc4b-4e19-bbaf-0e26e9f3a107,ffedc8be-1056-4205-b9d9-99b5bdb872db,fff304a2-113f-499d-a88c-9d3660c348d9,fff35c80-88cd-4923-80c1-0273ba5bed0f,fffdb1d9-58d1-425c-ac12-1e1e5f443bf7
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2MP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
TCGA_maf_score_matrix.apply(pd.value_counts)

Unnamed: 0,0004d251-3f70-4395-b175-c94c2f5b1b81,000d566c-96c7-4f1c-b36e-fa2222467983,0011a67b-1ba9-4a32-a6b8-7850759a38cf,001887aa-36d0-463f-8bca-dec7043b4f2e,001944e5-af34-4061-9c09-bb9ea346f6fd,001ad307-4ad3-4f1d-b2fc-efc032871c7e,001cef41-ff86-4d3f-a140-a647ac4b10a1,001e0309-9c50-42b0-9e38-347883ee2cd3,0022478c-4dfd-4cbe-a05e-fb20310844e3,0024ab57-4036-4b0f-b7a1-040f97787022,...,ffc73551-55e9-4bbb-bd15-76088551964b,ffc915b8-cacd-4974-a040-ee496f0efc0e,ffcec8e5-9fd3-4b42-a7cb-74761f713cf4,ffcf851d-7fa1-4b45-911a-a3fbd74c253a,ffcfa005-a04f-458e-9d1d-86143dd823e5,ffd8d31f-bc4b-4e19-bbaf-0e26e9f3a107,ffedc8be-1056-4205-b9d9-99b5bdb872db,fff304a2-113f-499d-a88c-9d3660c348d9,fff35c80-88cd-4923-80c1-0273ba5bed0f,fffdb1d9-58d1-425c-ac12-1e1e5f443bf7
0.0,21985,22088,21995,22079.0,21987,22062,22073,22071,22020,22098,...,22100.0,22095.0,22069.0,21962,21952,21943,22046,22088.0,22095.0,22105.0
1.0,8,1,7,,1,3,1,1,6,1,...,,,,5,8,8,1,,,
2.0,46,4,38,7.0,38,13,11,9,22,3,...,2.0,1.0,16.0,44,55,66,22,5.0,3.0,
3.0,59,15,57,19.0,68,25,20,22,51,6,...,7.0,13.0,22.0,91,80,76,34,13.0,10.0,4.0
4.0,11,1,12,4.0,15,6,4,6,10,1,...,,,2.0,7,14,16,6,3.0,1.0,


In [88]:
TCGA_maf_score_matrix.isna().sum().sum()

0

In [139]:
TCGA_maf_read.Mutated.value_counts()

1    6658212
0    4257954
Name: Mutated, dtype: int64

In [140]:
TCGA_maf_read.columns

Index(['Hugo_Symbol', 'Variant_Classification', 'Variant_Type', 'case_id',
       'Mutated', 'Mutated_score'],
      dtype='object')

In [141]:
TCGA_maf_read.drop(columns=['Variant_Classification', 'Variant_Type','Mutated_score'], inplace=True)

In [142]:
TCGA_maf_matrix = TCGA_maf_read.groupby([ 'case_id','Hugo_Symbol']).count()

In [143]:
TCGA_maf_matrix = TCGA_maf_matrix.unstack(level=0)

In [144]:
del TCGA_maf_matrix.index.name
TCGA_maf_matrix.columns = TCGA_maf_matrix.columns.droplevel() 
TCGA_maf_matrix.columns.name = None


In [146]:
TCGA_maf_matrix = TCGA_maf_matrix.fillna(value = 0) #all combinations of gene and cell line not reported in the MAF file are assumed to be wildtype 
TCGA_maf_matrix.head()

(22109, 10189)

In [136]:
#Genes with no mutations in CCLE and TCGA samples were eliminated.
genes_mut_TCGA_CCLE = set(TCGA_maf_matrix.index.values)& set(CCLE_maf_matrix.index.values)
print(len(genes_mut_TCGA_CCLE))

18528


In [151]:
TCGA_maf_matrix_r = TCGA_maf_matrix.filter(list(genes_mut_TCGA_CCLE),axis = 0)
TCGA_maf_matrix_r.shape

(18528, 10189)

In [152]:
TCGA_maf_score_matrix_r = TCGA_maf_score_matrix.filter(list(genes_mut_TCGA_CCLE),axis = 0)
TCGA_maf_score_matrix_r.shape

(18528, 10189)

In [153]:
CCLE_maf_matrix_r = CCLE_maf_matrix.filter(list(genes_mut_TCGA_CCLE),axis = 0)
CCLE_maf_matrix_r.shape

(18528, 1585)

In [154]:
CCLE_maf_score_matrix_r = CCLE_maf_score_matrix.filter(list(genes_mut_TCGA_CCLE),axis = 0)
CCLE_maf_score_matrix_r.shape

(18528, 1585)

In [155]:
TCGA_exp_matrix_r = TCGA_exp_matrix_f.filter(list(genes_mut_TCGA_CCLE), axis=0)
TCGA_exp_matrix_r.shape

(17947, 9327)

In [156]:
CCLE_matrix_r = CCLE_matrix_f.filter(list(genes_mut_TCGA_CCLE), axis=0)
CCLE_matrix_r.shape

(17947, 1165)

In [157]:
#Summary of data
print(CCLE_maf_matrix_r.shape)
print(TCGA_exp_matrix_r.shape)
print(CCLE_matrix_r.shape)
print(TCGA_maf_matrix_r.shape)
print(len(IC50_CCLE_imputed.columns)) #cell lines
print("N Cell lines with MAF + expression + IC50 information")
#Counts of cell lines with MAF + expression + IC50 information
print(len( set(IC50_CCLE_imputed.columns) & set(CCLE_maf['CELL_LINE_NAME'].values) & set(CCLE_matrix.columns.values)))
#Couts of genes included in expression and MAF files
print("N genes with MAF and expression in cell lines")
print(len(set(CCLE_maf['Hugo_Symbol'].values) & set(CCLE_matrix.index.values)))
print("N genes with MAF and expression in cell lines and TCGA")
print(len(set(TCGA_exp_matrix.index) & set(TCGA_maf_matrix.index) & set(CCLE_matrix.index) & set(CCLE_maf_matrix.index)  )) 
print('N of drugs tested on the cell lines')
print(len(IC50_CCLE_imputed.index))
#Counts of tumors with TCGA MAF and gene expression
print("N tumors with TCGA MAF and gene expression")
tumors_TCGA = set(TCGA_maf_matrix.columns.values) & set(TCGA_exp_matrix.columns.values)
print(len(tumors_TCGA))


(18528, 1585)
(17947, 9327)
(17947, 1165)
(18528, 10189)
622
N Cell lines with MAF + expression + IC50 information
610
N genes with MAF and expression in cell lines
18189
N genes with MAF and expression in cell lines and TCGA
17947
N of drugs tested on the cell lines
265
N tumors with TCGA MAF and gene expression
9090


In [159]:
TCGA_exp_matrix_r.head() 
TCGA_exp_matrix_r.to_csv('E_TCGA.csv')

In [163]:
CCLE_matrix_r.head() 
CCLE_matrix_r.to_csv('E_CCLE.csv')

In [173]:
TCGA_maf_matrix_r.head()
TCGA_maf_matrix_r.to_csv('M_TCGA.csv')

In [171]:
CCLE_maf_matrix_r.head()
CCLE_maf_matrix_r.to_csv('M_CCLE.csv')

In [166]:
IC50.head()
IC50.to_csv('IC50_CCLE.csv')

In [167]:
IC50_CCLE_imputed.to_csv('IC50_CCLE_author.csv')

In [168]:
print(len( set(IC50.columns) & set(IC50_CCLE_imputed.columns)))

567


In [158]:
TCGA_maf_score_matrix_r.head()
TCGA_maf_score_matrix_r.to_csv('M_scored_TCGA.csv')

In [159]:
CCLE_maf_score_matrix_r.head()
CCLE_maf_score_matrix_r.to_csv('M_scored_CCLE.csv')