##  Part 3: Compute mRNA-protein correlation for non-tumour studies

**Input:** Transcriptomics and proteomics data of the non-tumour studies listed below.
1. <a href="https://tinyurl.com/54af2rvy">Cancer Cell Line Encyclopedia (2020)</a>
2. <a href="https://tinyurl.com/rsv3efp5">GTEx 32 healthy tissues (2020)</a>
3. <a href="https://tinyurl.com/56439rpr">NCI-60 cancer cell lines (2019)</a>

**Output:** Gene-wise correlation between mRNA and protein abundances 

#### Import Packages

In [1]:
import os, re
import numpy as np
import pandas as pd 

import standardised_pipeline_utils

In [2]:
def get_data_path(folders, fname):
    """Return the normalised path of `fname` inside `folders` under $DATA_PATH.

    `folders` is a sequence of directory names joined with '/'; the root is
    read from the DATA_PATH environment variable (KeyError if it is unset).
    """
    root = os.environ['DATA_PATH']
    subdir = '/'.join(folders)
    return os.path.normpath(root + '/' + subdir + '/' + fname)


def get_local_data_path(folders, fname):
    """Return the normalised path of `fname` inside `folders` under ../local_data/."""
    subdir = '/'.join(folders)
    return os.path.normpath('../local_data/' + subdir + '/' + fname)

In [3]:
# --- Input files (resolved under $DATA_PATH) ---
# CCLE: expression matrix and sample annotation from the depmap/20Q4 folder,
# proteomics from the nusinow_2020 folder.
depmap_folder = ['depmap', '20Q4']
file_ccle_transcriptomics = get_data_path(depmap_folder, 'CCLE_expression.csv')
file_ccle_sample_info = get_data_path(depmap_folder, 'sample_info.csv')
file_ccle_proteomics = get_data_path(['nusinow_2020'], 'Table_S2.xlsx')

# GTEx: both tables come from the jiang_2020 folder.
gtex_folder = ['jiang_2020']
file_gtex_transcriptomics = get_data_path(gtex_folder, 'Table_S3.xlsx')
file_gtex_proteomics = get_data_path(gtex_folder, 'Table_S2.xlsx')

# NCI-60: both tables come from the guo_2019 folder.
nci60_folder = ['guo_2019']
file_nci60_transcriptomics = get_data_path(nci60_folder, 'Table_S6.xlsx')
file_nci60_proteomics = get_data_path(nci60_folder, 'Table_S1.xlsx')

# --- Output files (resolved under ../local_data) ---
correlation_folder = ['processed', 'correlation_mRNA_protein']
file_non_tumour_studies = get_local_data_path(correlation_folder, 'non_tumour_studies.csv')
file_samples_info = get_local_data_path(correlation_folder, 'samples_info.csv')

In [4]:
# Per-study counts collected for Supplemental Table S1B (keyed by study name).
transcriptomic_samples = dict()   # number of samples in each transcriptomic matrix
proteomic_samples = dict()        # number of samples in each proteomic matrix
common_samples = dict()           # number of samples after matching
common_genes = dict()             # number of genes/proteins after matching

### CCLE

#### Transcriptomics Data

In [5]:
# Cell-line annotation table; used below to translate DepMap IDs to CCLE names.
ccle_sample_info = pd.read_csv(file_ccle_sample_info)
print("Dimensions: ", ccle_sample_info.shape)
ccle_sample_info.head(2)

Dimensions:  (1812, 26)


Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,Achilles_n_replicates,cell_line_NNMD,...,primary_or_metastasis,primary_disease,Subtype,age,Sanger_Model_ID,depmap_public_comments,lineage,lineage_subtype,lineage_sub_subtype,lineage_molecular_subtype
0,ACH-000001,NIH:OVCAR-3,NIHOVCAR3,NIHOVCAR3_OVARY,OVCAR3,905933.0,Female,ATCC,,,...,Metastasis,Ovarian Cancer,"Adenocarcinoma, high grade serous",60.0,SIDM00105,,ovary,ovary_adenocarcinoma,high_grade_serous,
1,ACH-000002,HL-60,HL60,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,905938.0,Female,ATCC,,,...,Primary,Leukemia,"Acute Myelogenous Leukemia (AML), M3 (Promyelo...",35.0,SIDM00829,,blood,AML,M3,


In [6]:
# Expression matrix as shipped: first CSV column becomes the row index.
ccle_transcriptomics = pd.read_csv(
    file_ccle_transcriptomics,
    index_col=0,
)
print("Dimensions: ", ccle_transcriptomics.shape)
ccle_transcriptomics.head(2)

Dimensions:  (1376, 19182)


Unnamed: 0,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),NFYA (4800),...,AC090517.4 (54816),AL160269.1 (11046),ABCF2-H2BE1 (114483834),FO393400.1 (149951),POLR2J3 (548644),H2BE1 (114483833),AL445238.1 (647264),GET1-SH3BGR (106865373),AC024598.1 (22891),AC113348.1 (102724657)
ACH-001113,4.990501,0.0,7.273702,2.765535,4.480265,0.028569,1.269033,3.058316,6.483171,5.05398,...,0.111031,0.15056,1.427606,0.042644,5.781884,0.0,0.0,0.799087,0.014355,0.0
ACH-001289,5.209843,0.545968,7.070604,2.538538,3.510962,0.0,0.176323,3.836934,4.20085,3.83289,...,0.31034,0.0,0.807355,0.0,4.704319,0.0,0.0,0.464668,0.0,0.070389


In [7]:
# Row labels: DepMap ID -> CCLE_Name. Column labels: keep only the text before
# the first space, i.e. the gene symbol without its parenthesised numeric suffix.
depmap_id_to_ccle_name = dict(zip(ccle_sample_info['DepMap_ID'],
                                  ccle_sample_info['CCLE_Name']))
ccle_transcriptomics = ccle_transcriptomics.rename(
    index=depmap_id_to_ccle_name,
    columns=lambda label: str(label).split(' ')[0],
)
# Transpose so that rows are genes and columns are cell lines, as in the other studies.
ccle_transcriptomics = ccle_transcriptomics.T
ccle_transcriptomics.head(2)

Unnamed: 0,LC1SQSF_LUNG,COGAR359_SOFT_TISSUE,COLO794_SKIN,KKU213_BILIARY_TRACT,RT4_URINARY_TRACT,SNU283_LARGE_INTESTINE,NCIH1395_LUNG,DEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SNU1196_BILIARY_TRACT,LC1F_LUNG,...,BL70_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U343_CENTRAL_NERVOUS_SYSTEM,639V_URINARY_TRACT,MON_SOFT_TISSUE,LOXIMVI_SKIN,TOLEDO_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SSP25_BILIARY_TRACT,ECC2_BILIARY_TRACT,A673_BONE,TT_OESOPHAGUS
TSPAN6,4.990501,5.209843,3.77926,5.726831,7.465648,4.914086,4.032982,0.097611,4.712596,5.101398,...,0.070389,4.69265,5.0268,6.699052,4.173127,0.097611,5.045268,5.805292,4.870858,5.117695
TNMD,0.0,0.545968,0.0,0.0,0.0,0.176323,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Sanity check: every cell-line label must occur exactly once after the renaming.
assert not ccle_transcriptomics.columns.duplicated().any(), "columns contain duplicates"

In [9]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
ccle_transcriptomics_processed = standardised_pipeline_utils.process(
    ccle_transcriptomics
)
ccle_transcriptomics_processed.head(2)

Dimensions:  (19180, 1376)


Unnamed: 0,LC1SQSF_LUNG,COGAR359_SOFT_TISSUE,COLO794_SKIN,KKU213_BILIARY_TRACT,RT4_URINARY_TRACT,SNU283_LARGE_INTESTINE,NCIH1395_LUNG,DEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SNU1196_BILIARY_TRACT,LC1F_LUNG,...,BL70_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U343_CENTRAL_NERVOUS_SYSTEM,639V_URINARY_TRACT,MON_SOFT_TISSUE,LOXIMVI_SKIN,TOLEDO_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SSP25_BILIARY_TRACT,ECC2_BILIARY_TRACT,A673_BONE,TT_OESOPHAGUS
A1BG,3.976364,2.134221,2.650765,0.0,0.111031,0.226509,0.097611,5.276869,0.111031,2.871844,...,3.854993,2.570463,4.990955,4.296457,0.0,4.874797,0.189034,0.0,4.878235,2.650765
A1CF,0.014355,0.0,0.0,0.0,0.028569,1.673556,0.757023,0.028569,0.056584,0.014355,...,0.0,0.097611,0.0,0.0,0.028569,0.014355,0.056584,0.0,0.0,0.042644


#### Proteomic Data

In [10]:
# Normalised protein expression sheet of the CCLE proteomics workbook.
ccle_proteomics = pd.read_excel(
    io=file_ccle_proteomics,
    sheet_name="Normalized Protein Expression",
    engine='openpyxl',
)
print("Dimensions: ", ccle_proteomics.shape)
ccle_proteomics.head(2)

Dimensions:  (12755, 16384)


Unnamed: 0,Protein_Id,Gene_Symbol,Description,Group_ID,Uniprot,Uniprot_Acc,TenPx01_Peptides,TenPx02_Peptides,TenPx03_Peptides,TenPx04_Peptides,...,Column15949,Column15950,Column15951,Column15952,Column15953,Column15954,Column15955,Column15956,Column15957,Column15958
0,sp|P55011|S12A2_HUMAN,SLC12A2,S12A2_HUMAN Solute carrier family 12 member 2,0,S12A2_HUMAN,P55011,41,16,23,59,...,,,,,,,,,,
1,sp|P35453|HXD13_HUMAN,HOXD13,HXD13_HUMAN Homeobox protein Hox-D13,1,HXD13_HUMAN,P35453,0,0,0,0,...,,,,,,,,,,


In [11]:
# Index by gene symbol, then keep only columns whose label contains '_TenPx'
# (the per-cell-line quantification columns; annotation, '*_Peptides' and
# placeholder 'Column…' columns are dropped by this filter).
ccle_proteomics = ccle_proteomics.set_index('Gene_Symbol')
is_tenplex_sample = ccle_proteomics.columns.str.contains('_TenPx')
ccle_proteomics = ccle_proteomics.loc[:, is_tenplex_sample]

In [12]:
# List the columns of the cell lines that were measured in more than one Ten-plex.
repeated_cell_lines = 'SW948_LARGE_INTESTINE|CAL120_BREAST|HCT15_LARGE_INTESTINE'
ccle_proteomics.filter(regex=repeated_cell_lines).columns

Index(['CAL120_BREAST_TenPx02', 'SW948_LARGE_INTESTINE_TenPx11',
       'SW948_LARGE_INTESTINE_TenPx20', 'CAL120_BREAST_TenPx28',
       'HCT15_LARGE_INTESTINE_TenPx30', 'HCT15_LARGE_INTESTINE_TenPx18'],
      dtype='object')

<div class="alert alert-block alert-warning">
<b>Note:</b> The above 3 cell lines were each measured in two different Ten-plexes, and the replicates from different Ten-plexes do not correlate well with each other. Following <a href="https://tinyurl.com/93acy42y">A Guide to the Quantitative Proteomic Profiles of the Cancer Cell Line Encyclopedia</a>, we retain only the replicate of each cell line that correlates well with the transcriptomics data.
</div>

In [13]:
# Discard the replicate of each repeated cell line that does not correlate well
# with the transcriptomics data (as mentioned in the paper), then strip the
# '_TenPx<NN>' suffix so that column labels are plain CCLE names.
replicates_to_discard = ['SW948_LARGE_INTESTINE_TenPx11', 'CAL120_BREAST_TenPx02', 'HCT15_LARGE_INTESTINE_TenPx30']
ccle_proteomics = (
    ccle_proteomics
    .drop(columns=replicates_to_discard)
    .rename(columns=lambda label: str(label).split('_TenPx')[0])
)

In [14]:
# Sanity check: after removing the Ten-plex suffix no cell line may appear twice.
assert not ccle_proteomics.columns.duplicated().any(), "columns contain duplicates"

In [15]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
ccle_proteomics_processed = standardised_pipeline_utils.process(
    ccle_proteomics
)
ccle_proteomics_processed.head(2)

Dimensions:  (7372, 375)


Unnamed: 0_level_0,MDAMB468_BREAST,SH4_SKIN,AU565_BREAST,KMRC1_KIDNEY,CAL51_BREAST,RPMI7951_SKIN,RERFLCMS_LUNG,IGR37_SKIN,VMRCRCW_KIDNEY,HEP3B217_LIVER,...,NCIH2030_LUNG,22RV1_PROSTATE,A172_CENTRAL_NERVOUS_SYSTEM,BT20_BREAST,CALU6_LUNG,FADU_UPPER_AERODIGESTIVE_TRACT,KP4_PANCREAS,MONOMAC6_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,OVCAR8_OVARY,THP1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,-0.345562,1.642186,-1.129114,1.493333,0.274652,-0.615667,-0.957249,-0.174494,-0.987932,1.513325,...,-0.981657,-0.921538,3.905544,-0.649348,-1.055916,-1.116991,1.922976,-1.106344,-0.770372,-1.342713
AAAS,-0.109405,-0.283335,0.024663,-0.588886,0.673324,0.010466,0.474615,0.037159,-0.362833,-0.225824,...,-0.093176,-0.518686,-0.642067,-0.188224,-0.259101,-0.779655,-0.62877,1.205109,-0.584588,1.622359


In [16]:
# Number of samples (columns) in each processed CCLE matrix, recorded before matching.
transcriptomic_samples['CCLE'] = len(ccle_transcriptomics_processed.columns)
proteomic_samples['CCLE'] = len(ccle_proteomics_processed.columns)

In [17]:
# Match the two CCLE matrices; the helper reports the number of common
# proteins and samples and returns the (transcriptomics, proteomics) pair.
matched_ccle = standardised_pipeline_utils.match_proteins_samples(
    ccle_transcriptomics_processed,
    ccle_proteomics_processed,
)
ccle_transcriptomics_processed, ccle_proteomics_processed = matched_ccle

Number of common proteins:  6987
Number of common samples:  369


In [18]:
# Post-matching dimensions of the CCLE transcriptomics matrix: columns = samples, rows = genes.
common_samples['CCLE'] = len(ccle_transcriptomics_processed.columns)
common_genes['CCLE'] = len(ccle_transcriptomics_processed.index)

In [19]:
# Gene-wise mRNA-protein correlation for CCLE with the helper's default method
# (the helper prints the median Spearman correlation).
correlation_ccle = standardised_pipeline_utils.correlate_genewise(
    ccle_transcriptomics_processed,
    ccle_proteomics_processed,
    'CCLE',
)

Median Spearman Correlation:  0.4566


In [20]:
# Same gene-wise correlation for CCLE, this time requesting Pearson.
correlation_ccle_pearson = standardised_pipeline_utils.correlate_genewise(
    ccle_transcriptomics_processed,
    ccle_proteomics_processed,
    'CCLE',
    method='pearson',
)

Median Pearson Correlation:  0.4776


### GTEx

#### Transcriptomic Data

In [21]:
# RNA tissue-specificity scores; the first three rows of the sheet are skipped.
gtex_transcriptomics = pd.read_excel(
    io=file_gtex_transcriptomics,
    sheet_name='D RNA TS score',
    skiprows=3,
)
print("Dimensions: ", gtex_transcriptomics.shape)
gtex_transcriptomics.head(2)

Dimensions:  (12245, 36)


Unnamed: 0,ensembl_id,entrez_id,hgnc_name,hgnc_symbol,Adrenal Gland,Artery Aorta,Artery Coronary,Artery Tibial,Brain Cerebellum,Brain Cortex,...,Prostate,Skin Unexpo,Skin SunExpo,Small Intestine,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina
0,ENSG00000000003,7105.0,tetraspanin 6,TSPAN6,0.28,-0.68,-0.48,-0.88,-1.64,-0.15,...,0.79,-1.26,-0.46,0.46,-0.42,0.57,2.54,0.35,0.54,0.74
1,ENSG00000000419,8813.0,dolichyl-phosphate mannosyltransferase subunit...,DPM1,1.46,0.8,-0.35,0.34,-2.32,-1.9,...,-0.95,0.24,-0.2,-0.8,0.42,-0.4,-0.07,0.18,-0.74,-0.84


In [22]:
# Index by HGNC symbol, drop the other identifier columns, convert the textual
# missing-value markers used in the sheet to np.nan, and coerce every tissue
# column to a numeric dtype.
gtex_transcriptomics = (
    gtex_transcriptomics
    .set_index('hgnc_symbol')
    .drop(columns=['ensembl_id', 'entrez_id', 'hgnc_name'])
    # exact marker string
    .replace('NA;all_tissues_tpm_less_1', np.nan)
    # any cell containing the 'NA_raw_tpm_less_1 ;' marker
    .replace('^.*NA_raw_tpm_less_1 ;.*$', np.nan, regex=True)
    .apply(pd.to_numeric)
)

In [23]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
gtex_transcriptomics_processed = standardised_pipeline_utils.process(
    gtex_transcriptomics
)
gtex_transcriptomics_processed.head(2)

Dimensions:  (12099, 32)


Unnamed: 0_level_0,Adrenal Gland,Artery Aorta,Artery Coronary,Artery Tibial,Brain Cerebellum,Brain Cortex,Breast,Colon Sigmoid,Colon Transverse,GE junction,...,Prostate,Skin Unexpo,Skin SunExpo,Small Intestine,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina
hgnc_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-1.0,0.13,0.86,0.46,0.87,0.64,0.67,-0.22,0.47,-0.27,...,0.43,-1.76,-0.5,0.07,0.54,-0.11,-0.77,0.64,1.0,0.78
A1CF,0.06,-0.62,0.27,-0.26,0.48,0.32,-0.31,-0.16,3.99,-0.31,...,-0.03,0.74,0.03,3.65,-0.77,2.39,0.29,-0.48,-0.5,0.14


#### Proteomic Data

In [24]:
# Protein tissue-specificity scores; the first two rows of the sheet are skipped.
gtex_proteomics = pd.read_excel(
    io=file_gtex_proteomics,
    sheet_name='G protein TS score',
    skiprows=2,
    engine='openpyxl',
)
print("Dimensions: ", gtex_proteomics.shape)
gtex_proteomics.head(2)

Dimensions:  (12627, 36)


Unnamed: 0,ensembl_id,entrez_id,hgnc_name,hgnc_symbol,Adrenal Gland,Artery Aorta,Artery Coronary,Artery Tibial,Brain Cerebellum,Brain Cortex,...,Prostate,Skin Unexpo,Skin SunExpo,Small Intestine,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina
0,ENSG00000000003,7105.0,tetraspanin 6,TSPAN6,0.26,0.81,0.02;NA_one_rep_in_raw,0.26,-1.32,-0.11;NA_one_rep_in_raw,...,-0.07,0.36,0.52,0.23,-0.78,1.18,0.66,-0.03,-0.09,-0.28
1,ENSG00000000419,8813.0,dolichyl-phosphate mannosyltransferase subunit...,DPM1,1.2,-1.3,-0.8,-0.1,-2.15,-0.46,...,0.14,0.3,-0.22,-0.57,-0.08,1.36,1.04,1.41,0.1,0.0


In [25]:
# Index by HGNC symbol, drop the other identifier columns, treat every cell
# flagged with 'NA_one_rep_in_raw' (e.g. '0.02;NA_one_rep_in_raw') as missing,
# and coerce every tissue column to a numeric dtype.
gtex_proteomics = (
    gtex_proteomics
    .set_index('hgnc_symbol')
    .drop(columns=['ensembl_id', 'entrez_id', 'hgnc_name'])
    .replace('^.*NA_one_rep_in_raw.*$', np.nan, regex=True)
    .apply(pd.to_numeric)
)

In [26]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
gtex_proteomics_processed = standardised_pipeline_utils.process(
    gtex_proteomics
)
gtex_proteomics_processed.head(2)

Dimensions:  (7661, 32)


Unnamed: 0_level_0,Adrenal Gland,Artery Aorta,Artery Coronary,Artery Tibial,Brain Cerebellum,Brain Cortex,Breast,Colon Sigmoid,Colon Transverse,GE junction,...,Prostate,Skin Unexpo,Skin SunExpo,Small Intestine,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina
hgnc_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-1.39,0.35,1.03,-0.45,-2.67,-2.14,1.7,0.1,-0.42,0.37,...,-0.33,0.75,0.15,-1.24,-1.65,-0.59,-0.55,-0.25,0.72,0.97
A1CF,-10.0,,0.0,-1.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,-10.0,0.0,0.0,


In [27]:
# Number of samples (columns) in each processed GTEx matrix, recorded before matching.
transcriptomic_samples['GTEx'] = len(gtex_transcriptomics_processed.columns)
proteomic_samples['GTEx'] = len(gtex_proteomics_processed.columns)

In [28]:
# Match the two GTEx matrices; the helper reports the number of common
# proteins and samples and returns the (transcriptomics, proteomics) pair.
matched_gtex = standardised_pipeline_utils.match_proteins_samples(
    gtex_transcriptomics_processed,
    gtex_proteomics_processed,
)
gtex_transcriptomics_processed, gtex_proteomics_processed = matched_gtex

Number of common proteins:  7471
Number of common samples:  32


In [29]:
# Post-matching counts for GTEx: samples from the transcriptomics columns,
# genes from the proteomics rows.
common_samples['GTEx'] = len(gtex_transcriptomics_processed.columns)
common_genes['GTEx'] = len(gtex_proteomics_processed.index)

In [30]:
# Gene-wise mRNA-protein correlation for GTEx with the helper's default method
# (the helper prints the median Spearman correlation).
correlation_gtex = standardised_pipeline_utils.correlate_genewise(
    gtex_transcriptomics_processed,
    gtex_proteomics_processed,
    'GTEx',
)

Median Spearman Correlation:  0.5087


In [31]:
# Same gene-wise correlation for GTEx, this time requesting Pearson.
correlation_gtex_pearson = standardised_pipeline_utils.correlate_genewise(
    gtex_transcriptomics_processed,
    gtex_proteomics_processed,
    'GTEx',
    method='pearson',
)

Median Pearson Correlation:  0.5911


### NCI-60 cancer cell lines

#### Transcriptomic Data

In [32]:
# NCI-60 transcriptomics: sheet 'B' of the workbook.
nci60_transcriptomics = pd.read_excel(
    io=file_nci60_transcriptomics,
    sheet_name='B',
)
print("Dimensions: ", nci60_transcriptomics.shape)
nci60_transcriptomics.head(2)

Dimensions:  (23059, 61)


Unnamed: 0,GENE,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,BR:T-47D,CNS:SF-268,CNS:SF-295,CNS:SF-539,CNS:SNB-19,...,PR:PC-3,PR:DU-145,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31
0,LOC729737,9.412,9.313,8.476,8.175,9.621,8.712,8.422,,8.472,...,8.25,9.371,8.459,9.132,8.946,8.376,8.876,9.173,8.377,8.594
1,CICP3,6.152,6.484,6.477,6.223,6.434,6.282,6.075,,6.453,...,5.966,6.227,6.472,6.322,6.477,6.464,6.623,6.313,6.076,5.933


In [33]:
# Drop rows without a gene symbol and index the matrix by gene.
nci60_transcriptomics = nci60_transcriptomics.dropna(subset=['GENE']).set_index('GENE')

# Harmonise the cell-line labels with the proteomics naming (e.g. 'BR:MDA-MB-231'
# -> 'BR_MDAMB231'):
#   1. remove hyphens and whitespace,
#   2. remove any parenthesised text,
#   3. replace the ':' between tissue prefix and cell line with '_',
#   4. fix the two labels that still contain a '/'.
# The patterns are raw strings: '\s' and '\(' in a normal string literal are
# invalid escape sequences and trigger a warning on recent Python versions.
nci60_transcriptomics = (
    nci60_transcriptomics
    .rename(columns=lambda x: re.sub(r"[-\s]", "", str(x)))
    .rename(columns=lambda x: re.sub(r"\([^)]*\)", "", str(x)))
    .rename(columns=lambda x: str(x).replace(':', '_'))
    .rename(columns={'LC_A549/ATCC': 'LC_A549',
                     'OV_NCI/ADRRES': 'OV_NCIADRRES'})
)

In [34]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
nci60_transcriptomics_processed = standardised_pipeline_utils.process(
    nci60_transcriptomics
)
nci60_transcriptomics_processed.head(2)

Dimensions:  (23059, 60)


Unnamed: 0_level_0,BR_MCF7,BR_MDAMB231,BR_HS578T,BR_BT549,BR_T47D,CNS_SF268,CNS_SF295,CNS_SF539,CNS_SNB19,CNS_SNB75,...,PR_PC3,PR_DU145,RE_7860,RE_A498,RE_ACHN,RE_CAKI1,RE_RXF393,RE_SN12C,RE_TK10,RE_UO31
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,6.963,6.418,7.593,7.444,6.669,6.61,6.432,4.795,6.219,5.694,...,5.653,6.229,5.681,5.684,5.725,6.371,6.698,6.973,5.858,6.024
A1BG-AS1,6.23,5.271,5.994,6.434,6.129,5.536,5.803,,5.23,4.972,...,4.654,5.542,4.819,5.014,5.058,5.554,5.373,5.091,4.886,5.202


#### Proteomic Data

In [35]:
# NCI-60 proteomics: sheet 'E' of the workbook.
nci60_proteomics = pd.read_excel(
    io=file_nci60_proteomics,
    sheet_name='E',
)
print("Dimensions: ", nci60_proteomics.shape)
nci60_proteomics.head(2)

Dimensions:  (3171, 68)


Unnamed: 0,protein accession number,gene symbol,numPep,R2,best_mscore,numNA,transition_group_id,average protein intensity,BR_BT549,BR_HS578T,...,PR_DU145,PR_PC3,RE_7860,RE_A498,RE_ACHN,RE_CAKI1,RE_RXF393,RE_SN12C,RE_TK10,RE_UO31
0,A0AV96,RBM47,2,0.461295,1.3e-05,0,89197_VPEGVAGAPNEAALLALMER_3,321415.0725,252295.5,320967.0,...,231154.8,240414.8,260905.1,342500.2,376886.1,289172.1,306021.4,276934.6,378189.4,313858.5
1,A0AVT1,UBA6,9,0.684149,3.8e-05,22,48306_LETGQFLTFR_2,325395.9267,325644.3,47653.2,...,47653.2,313214.0,301022.1,196917.8,47653.2,378384.5,913480.0,333520.5,262599.8,268524.1


In [36]:
# Remove the per-protein annotation columns, drop rows without a gene symbol,
# and index the remaining cell-line intensity columns by gene symbol.
nci60_annotation_columns = ['protein accession number', 'numPep', 'R2', 'best_mscore',
                            'numNA', 'transition_group_id', 'average protein intensity']
nci60_proteomics = (
    nci60_proteomics
    .drop(columns=nci60_annotation_columns)
    .dropna(subset=['gene symbol'])
    .set_index('gene symbol')
)
nci60_proteomics.head(2)

Unnamed: 0_level_0,BR_BT549,BR_HS578T,BR_MCF7,BR_MDAMB231,BR_MDAMB468,BR_T47D,CNS_SF268,CNS_SF295,CNS_SF539,CNS_SNB19,...,PR_DU145,PR_PC3,RE_7860,RE_A498,RE_ACHN,RE_CAKI1,RE_RXF393,RE_SN12C,RE_TK10,RE_UO31
gene symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RBM47,252295.5,320967.0,386714.5,308863.3,291428.0,296138.2,161876.4,328064.5,259639.0,240479.0,...,231154.8,240414.8,260905.1,342500.2,376886.1,289172.1,306021.4,276934.6,378189.4,313858.5
UBA6,325644.3,47653.2,394142.5,47653.2,47653.2,188434.5,409606.1,275464.7,188715.0,47653.2,...,47653.2,313214.0,301022.1,196917.8,47653.2,378384.5,913480.0,333520.5,262599.8,268524.1


In [37]:
# Run the shared preprocessing helper (it prints the resulting dimensions).
nci60_proteomics_processed = standardised_pipeline_utils.process(
    nci60_proteomics
)
nci60_proteomics_processed.head(2)

Dimensions:  (3157, 60)


Unnamed: 0_level_0,BR_BT549,BR_HS578T,BR_MCF7,BR_MDAMB231,BR_MDAMB468,BR_T47D,CNS_SF268,CNS_SF295,CNS_SF539,CNS_SNB19,...,PR_DU145,PR_PC3,RE_7860,RE_A498,RE_ACHN,RE_CAKI1,RE_RXF393,RE_SN12C,RE_TK10,RE_UO31
gene symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAS,8331.8,64198.0,62750.7,63921.5,77924.6,73633.1,87421.6,76357.9,84516.4,68816.6,...,8331.8,118359.2,54281.8,50204.8,8331.8,70821.3,59718.6,87341.6,92722.8,63334.2
AAMDC,86447.4,54617.7,109416.6,116873.8,155572.5,98071.3,70818.7,83636.2,80560.1,55873.1,...,72638.6,89337.1,109282.4,85325.6,99538.3,81063.0,57490.7,69688.8,78092.4,98849.5


In [38]:
# Number of samples (columns) in each processed NCI-60 matrix, recorded before matching.
transcriptomic_samples['NCI60'] = len(nci60_transcriptomics_processed.columns)
proteomic_samples['NCI60'] = len(nci60_proteomics_processed.columns)

In [39]:
# Match the two NCI-60 matrices; the helper reports the number of common
# proteins and samples and returns the (transcriptomics, proteomics) pair.
matched_nci60 = standardised_pipeline_utils.match_proteins_samples(
    nci60_transcriptomics_processed,
    nci60_proteomics_processed,
)
nci60_transcriptomics_processed, nci60_proteomics_processed = matched_nci60

Number of common proteins:  3022
Number of common samples:  59


In [40]:
# Post-matching counts for NCI-60: samples from the transcriptomics columns,
# genes from the proteomics rows.
common_samples['NCI60'] = len(nci60_transcriptomics_processed.columns)
common_genes['NCI60'] = len(nci60_proteomics_processed.index)

In [41]:
# Gene-wise mRNA-protein correlation for NCI-60 with the helper's default method
# (the helper prints the median Spearman correlation).
correlation_nci60 = standardised_pipeline_utils.correlate_genewise(
    nci60_transcriptomics_processed,
    nci60_proteomics_processed,
    'NCI60',
)

Median Spearman Correlation:  0.3658


In [42]:
# Same gene-wise correlation for NCI-60, this time requesting Pearson.
correlation_nci60_pearson = standardised_pipeline_utils.correlate_genewise(
    nci60_transcriptomics_processed,
    nci60_proteomics_processed,
    'NCI60',
    method='pearson',
)

Median Pearson Correlation:  0.3981


In [43]:
def dataframe_from_dict(*dict_args):
    """Build the per-study sample summary table from dictionaries keyed by study.

    Parameters
    ----------
    *dict_args : dict
        Exactly four dictionaries mapping a study name to a count, in this
        order: samples in the transcriptomic data, samples in the proteomic
        data, common samples, common proteins.

    Returns
    -------
    pandas.DataFrame
        One row per study (index named 'Data') and one column per dictionary.

    Raises
    ------
    IndexError
        If no dictionary is passed.
    ValueError
        If the number of dictionaries does not match the four column labels.
    """
    # One single-column frame per dictionary, indexed by study name.
    frames = [pd.DataFrame.from_dict(counts, orient='index') for counts in dict_args]
    table = frames[0]
    if len(frames) > 1:
        # Join all columns in one call instead of growing the frame inside a loop.
        table = pd.concat(frames, axis=1)
    table = table.reset_index()
    table.columns = ['Data', '# Samples in Trancriptomic Data', '# Samples in Proteomic Data',
                     '# Common Samples', '# Common Proteins']
    return table.set_index('Data')

# Assemble the summary table for the three studies processed in this notebook.
sample_data = dataframe_from_dict(
    transcriptomic_samples,
    proteomic_samples,
    common_samples,
    common_genes,
)
# Rows are appended without a header to the existing samples file, so
# re-running this cell adds the same rows again.
sample_data.to_csv(file_samples_info, mode='a', header=False)
sample_data

Unnamed: 0_level_0,# Samples in Trancriptomic Data,# Samples in Proteomic Data,# Common Samples,# Common Proteins
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CCLE,1376,375,369,6987
GTEx,32,32,32,7471
NCI60,60,60,59,3022


In [44]:
# Put the per-study gene-wise correlations side by side (one column per study)
# and write the combined table to disk.
per_study_correlations = [correlation_nci60, correlation_ccle, correlation_gtex]
correlation_combined = pd.concat(per_study_correlations, axis=1)
correlation_combined.to_csv(file_non_tumour_studies)
correlation_combined.head(2)

Unnamed: 0,NCI60,CCLE,GTEx
AAAS,0.115409,0.05404,0.070586
AAMDC,0.18057,0.293006,0.779245


In [45]:
# Median Pearson correlation per study, rounded to two decimals (display only).
per_study_pearson = [correlation_nci60_pearson, correlation_ccle_pearson, correlation_gtex_pearson]
correlation_combined_pearson = pd.concat(per_study_pearson, axis=1)
correlation_combined_pearson.median().round(2)

NCI60    0.40
CCLE     0.48
GTEx     0.59
dtype: float64