##  Part 1: Compute mRNA-protein correlation for older tumour studies

**Input:** Transcriptomics and Proteomics data of the tumour studies listed below.                  
1. <a href=https://www.nature.com/articles/nature13438>Colorectal Cancer (2014)</a>                   
2. <a href=https://www.nature.com/articles/nature18003>Breast Cancer (2016)</a>      
3. <a href=https://linkinghub.elsevier.com/retrieve/pii/S0092-8674(16)30673-0>Ovarian Cancer (2016)</a>     
4. <a href=https://tinyurl.com/3m8239bk>Colon Cancer (2019)</a>        

**Output:** Gene-wise correlation between mRNA and protein abundances 

<div class="alert alert-block alert-info">
<b>Note:</b> 
<ul>                    
<li> The input data are downloaded from the links specified in data_sources.ipynb </li>
<li> The transcriptomic and the proteomic data are preprocessed to get a matrix of proteins (rows) and samples (columns)</li>
<li> The standardised pipeline is applied before computing the mRNA-protein correlation for each study.</li>
</ul>
</div>

#### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import standardised_pipeline_utils

In [2]:
def get_data_path(folders, fname):
    """Return the normalised path of `fname` inside `folders` under $DATA_PATH.

    `folders` is a sequence of directory names; a KeyError is raised when the
    DATA_PATH environment variable is not set.
    """
    data_root = os.environ['DATA_PATH']
    sub_directory = '/'.join(folders)
    return os.path.normpath('/'.join((data_root, sub_directory, fname)))


def get_local_data_path(folders, fname):
    """Return the normalised path of `fname` inside `folders` under ../local_data."""
    sub_directory = '/'.join(folders)
    return os.path.normpath('../local_data/' + '/'.join((sub_directory, fname)))

#### I/O Files

In [3]:
# Input Files
file_crc_transcriptomics =  get_data_path(['tumour_studies','crc','coadread_tcga_pub'],
                                          'data_RNA_Seq_expression_median.txt')
file_crc_proteomics = get_data_path(['tumour_studies','crc', 'zhang_2014'], 'Supplemental_Table.xlsx')

file_brca_transcriptomics = get_data_path(['tumour_studies','brca', 'brca_tcga_pub2015'], 
                                          'data_RNA_Seq_v2_expression_median.txt')
file_brca_proteomics = get_data_path(['tumour_studies','brca', 'mertins_2016'], 'CPTAC_BC_SupplementaryTable03.xlsx')

file_ovca_transcriptomics = get_data_path(['tumour_studies','ovca', 'firebrowse'], 'OV.medianexp.txt')
file_ovca_proteomics = get_data_path(['tumour_studies','ovca', 'zhang_2016'], 'Table_S2.xlsx')

file_colon_transcriptomics = get_data_path(['tumour_studies','colon', 'vasaikar_2019'], 'RNAseq.cct')
file_colon_proteomics = get_data_path(['tumour_studies','colon', 'vasaikar_2019'], 'Proteome_TMT.cct')
file_colon_proteomics_lf = get_data_path(['tumour_studies','colon', 'vasaikar_2019'], 'Proteome_LF.cct')

                 
# Output File
file_tumour_correlation = get_local_data_path(['processed', 'correlation_mRNA_protein'], 'older_tumour_studies.csv')
file_samples_info = get_local_data_path(['processed', 'correlation_mRNA_protein'], 'samples_info.csv')

In [4]:
# Information collected for the Supplemental Table S1B
common_samples = {}
common_genes = {}
transcriptomic_samples = {}
proteomic_samples = {}

In [5]:
# Pre-processing the data before applying the standardised pipeline
def preprocess(dataframe, index_col, columns_to_drop=None):
    """Turn a raw study table into a numeric matrix indexed by `index_col`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw table as read from file. It is not modified.
    index_col : str
        Column holding the row identifier. Rows where it is null are removed
        and the column becomes the index.
    columns_to_drop : list of str, optional
        Extra columns to discard. Defaults to no columns.

    Returns
    -------
    pandas.DataFrame
        New frame whose column labels are truncated to their first 12
        characters and whose values are converted with `pd.to_numeric`
        (which raises if a value cannot be parsed as a number).

    The shape and the total number of null values of the result are printed.
    """
    # A None sentinel avoids sharing one mutable list between calls.
    if columns_to_drop is None:
        columns_to_drop = []

    cleaned = (
        dataframe
        .dropna(subset=[index_col])
        .set_index(index_col)
        .drop(columns=columns_to_drop)
        # choose the first 12 characters to identify the patient id
        # that is consistent across transcriptomic and proteomic data
        .rename(columns=lambda label: str(label)[:12])
        .apply(pd.to_numeric)
    )
    print("Dimensions: ", cleaned.shape)
    print("Null values count: ", cleaned.isnull().sum().sum())
    return cleaned

<a id="ColoRectal_Cancer"></a>
### 1. ColoRectal Cancer

#### Transcriptomic Data

In [6]:
# colorectal cancer transcriptomics data
crc_transcriptomics = pd.read_csv(file_crc_transcriptomics, sep='\t',  low_memory=False)
print('Dimensions: ', crc_transcriptomics.shape)
crc_transcriptomics[:1]

Dimensions:  (19489, 246)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-A6-2670-01,TCGA-A6-2671-01,TCGA-A6-2672-01,TCGA-A6-2674-01,TCGA-A6-2676-01,TCGA-A6-2677-01,TCGA-A6-2678-01,TCGA-A6-2679-01,...,TCGA-AY-4071-01,TCGA-AZ-4308-01,TCGA-AZ-4313-01,TCGA-AZ-4315-01,TCGA-AZ-4614-01,TCGA-AZ-4615-01,TCGA-AZ-4684-01,TCGA-CA-5256-01,TCGA-CK-4951-01,TCGA-CM-4747-01
0,A1BG,1,0.523167,0.371355,0.925661,0.750148,0.524965,0.045138,0.163163,2.267252,...,0.960731,0.988869,0.285177,0.419773,0.134522,0.337758,0.714598,0.808955,0.308311,0.718461


In [7]:
#Clean data
crc_transcriptomics = preprocess(crc_transcriptomics, index_col='Hugo_Symbol', columns_to_drop=['Entrez_Gene_Id'])
crc_transcriptomics[:2]

Dimensions:  (19489, 244)
Null values count:  0


Unnamed: 0_level_0,TCGA-A6-2670,TCGA-A6-2671,TCGA-A6-2672,TCGA-A6-2674,TCGA-A6-2676,TCGA-A6-2677,TCGA-A6-2678,TCGA-A6-2679,TCGA-A6-2680,TCGA-A6-2681,...,TCGA-AY-4071,TCGA-AZ-4308,TCGA-AZ-4313,TCGA-AZ-4315,TCGA-AZ-4614,TCGA-AZ-4615,TCGA-AZ-4684,TCGA-CA-5256,TCGA-CK-4951,TCGA-CM-4747
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.523167,0.371355,0.925661,0.750148,0.524965,0.045138,0.163163,2.267252,0.556225,0.61989,...,0.960731,0.988869,0.285177,0.419773,0.134522,0.337758,0.714598,0.808955,0.308311,0.718461
A1CF,3.269805,0.24015,0.297705,1.956353,1.96037,4.923709,4.173823,1.151729,4.542303,1.670868,...,4.42313,1.619616,0.201777,0.459658,4.037968,0.412526,2.298243,4.482096,0.433493,1.821577


In [8]:
assert len(crc_transcriptomics.columns[crc_transcriptomics.columns.duplicated()]) == 0, "columns contain duplicates"

<div class="alert alert-block alert-warning">
<b>Note:</b> Older transcriptomic and proteomic datasets such as this one contain 0s instead of null values. In such cases, the standardised pipeline discards every row (gene/protein) in which more than 20% of the samples are 0.
</div>

In [9]:
#eliminate rows with >20% of null/zero values and compute mean for the isoforms 
crc_transcriptomics_processed = standardised_pipeline_utils.process(crc_transcriptomics)
crc_transcriptomics_processed[:2]

Dimensions:  (15961, 244)


Unnamed: 0_level_0,TCGA-A6-2670,TCGA-A6-2671,TCGA-A6-2672,TCGA-A6-2674,TCGA-A6-2676,TCGA-A6-2677,TCGA-A6-2678,TCGA-A6-2679,TCGA-A6-2680,TCGA-A6-2681,...,TCGA-AY-4071,TCGA-AZ-4308,TCGA-AZ-4313,TCGA-AZ-4315,TCGA-AZ-4614,TCGA-AZ-4615,TCGA-AZ-4684,TCGA-CA-5256,TCGA-CK-4951,TCGA-CM-4747
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.523167,0.371355,0.925661,0.750148,0.524965,0.045138,0.163163,2.267252,0.556225,0.61989,...,0.960731,0.988869,0.285177,0.419773,0.134522,0.337758,0.714598,0.808955,0.308311,0.718461
A1CF,3.269805,0.24015,0.297705,1.956353,1.96037,4.923709,4.173823,1.151729,4.542303,1.670868,...,4.42313,1.619616,0.201777,0.459658,4.037968,0.412526,2.298243,4.482096,0.433493,1.821577


In [10]:
# suppress warnings while reading the colorectal proteomics file 
# NOTE: simplefilter() edits the process-wide warning filters, so UserWarning is
# silenced for every cell executed after this one, not only for the read below.
import warnings 
warnings.simplefilter("ignore", category=UserWarning)

#### Proteomic Data

In [11]:
# colorectal cancer proteomics data
crc_proteomics = pd.read_excel(file_crc_proteomics, sheet_name="S4_sample95_count_quantilelog")
print('Dimensions: ', crc_proteomics.shape)
crc_proteomics[:1]

Dimensions:  (7211, 96)


Unnamed: 0,GeneSymbol,TCGA-A6-3807-01A-22,TCGA-A6-3808-01A-22,TCGA-A6-3810-01A-22,TCGA-AA-3518-01A-11,TCGA-AA-3525-01A-12,TCGA-AA-3526-01A-11,TCGA-AA-3529-01A-12,TCGA-AA-3531-01A-22,TCGA-AA-3534-01A-22,...,TCGA-AG-A01L-01A-22,TCGA-AG-A01N-01A-23,TCGA-AG-A01W-01A-23,TCGA-AG-A01Y-01A-43,TCGA-AG-A020-01A-23,TCGA-AG-A026-01A-71,TCGA-AG-A02N-01A-31,TCGA-AG-A02X-01A-32,TCGA-AG-A032-01A-31,TCGA-AG-A036-01A-22
0,A1BG,3.168237,4.53212,4.460122,4.638684,3.161463,3.740757,3.250323,3.463567,4.249525,...,3.598817,2.640816,3.68281,4.406709,3.753199,3.735066,3.471804,4.367886,2.514953,4.111383


In [12]:
# Clean data
crc_proteomics = preprocess(crc_proteomics, index_col='GeneSymbol')
crc_proteomics[:2]

Dimensions:  (7211, 95)
Null values count:  0


Unnamed: 0_level_0,TCGA-A6-3807,TCGA-A6-3808,TCGA-A6-3810,TCGA-AA-3518,TCGA-AA-3525,TCGA-AA-3526,TCGA-AA-3529,TCGA-AA-3531,TCGA-AA-3534,TCGA-AA-3552,...,TCGA-AG-A01L,TCGA-AG-A01N,TCGA-AG-A01W,TCGA-AG-A01Y,TCGA-AG-A020,TCGA-AG-A026,TCGA-AG-A02N,TCGA-AG-A02X,TCGA-AG-A032,TCGA-AG-A036
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.168237,4.53212,4.460122,4.638684,3.161463,3.740757,3.250323,3.463567,4.249525,3.482712,...,3.598817,2.640816,3.68281,4.406709,3.753199,3.735066,3.471804,4.367886,2.514953,4.111383
A1CF,1.549085,0.0,1.178337,0.0,0.0,0.889576,0.0,2.485427,1.864773,0.0,...,0.992387,2.381429,1.796467,0.0,2.297423,2.297423,0.992387,0.0,1.14439,2.10257


In [13]:
# After the column labels are cut to 12 characters, this dataset is expected to
# contain exactly 5 repeated sample IDs; report the real count if that changes.
crc_duplicated_sample_ids = crc_proteomics.columns[crc_proteomics.columns.duplicated()]
assert len(crc_duplicated_sample_ids) == 5, \
       "expected 5 duplicated sample IDs, found {}".format(len(crc_duplicated_sample_ids))

<div class="alert alert-block alert-warning">
<b>Note:</b> The colorectal cancer proteomics data contains 5 duplicated sample IDs. For each duplicated ID, only the first column is kept and the remaining ones are dropped. The same procedure is followed for the other studies in this context.
</div>

In [14]:
crc_proteomics = crc_proteomics.loc[:, ~crc_proteomics.columns.duplicated(keep='first')]

In [15]:
#eliminate rows with >20% of null/zero values
crc_proteomics_processed = standardised_pipeline_utils.process(crc_proteomics)
crc_proteomics_processed[:2]

Dimensions:  (3506, 90)


Unnamed: 0_level_0,TCGA-A6-3807,TCGA-A6-3808,TCGA-A6-3810,TCGA-AA-3518,TCGA-AA-3525,TCGA-AA-3526,TCGA-AA-3529,TCGA-AA-3531,TCGA-AA-3534,TCGA-AA-3552,...,TCGA-AG-A01L,TCGA-AG-A01N,TCGA-AG-A01W,TCGA-AG-A01Y,TCGA-AG-A020,TCGA-AG-A026,TCGA-AG-A02N,TCGA-AG-A02X,TCGA-AG-A032,TCGA-AG-A036
GeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.168237,4.53212,4.460122,4.638684,3.161463,3.740757,3.250323,3.463567,4.249525,3.482712,...,3.598817,2.640816,3.68281,4.406709,3.753199,3.735066,3.471804,4.367886,2.514953,4.111383
A2M,7.194187,7.74494,6.507127,7.973176,6.890216,6.481183,6.177288,6.364757,7.194187,6.616646,...,6.219169,5.67985,6.976798,6.289679,6.81513,5.924375,5.876065,6.704523,6.574803,6.161251


In [16]:
transcriptomic_samples['CRC (2014)'] = crc_transcriptomics_processed.shape[1]
proteomic_samples['CRC (2014)'] = crc_proteomics_processed.shape[1]

In [17]:
crc_transcriptomics_processed, crc_proteomics_processed = standardised_pipeline_utils.match_proteins_samples( \
                                                                            crc_transcriptomics_processed, 
                                                                            crc_proteomics_processed)

Number of common proteins:  3238
Number of common samples:  77


In [18]:
common_samples['CRC (2014)'] = crc_transcriptomics_processed.shape[1]
common_genes['CRC (2014)'] = crc_transcriptomics_processed.shape[0]

In [19]:
correlation_crc = standardised_pipeline_utils.correlate_genewise(crc_transcriptomics_processed, 
                                                                 crc_proteomics_processed, 'CRC (2014)')

Median Spearman Correlation:  0.22


In [20]:
correlation_crc_pearson = standardised_pipeline_utils.correlate_genewise(crc_transcriptomics_processed, 
                                                                 crc_proteomics_processed, 'CRC (2014)', method='pearson')

Median Pearson Correlation:  0.2233


<a id="Breast_Cancer"></a>
### 2. Breast Cancer (2016)

#### Transcriptomic Data

In [21]:
brca_transcriptomics = pd.read_csv(file_brca_transcriptomics, sep='\t')
print('Dimensions: ', brca_transcriptomics.shape)
brca_transcriptomics[:1]

Dimensions:  (20440, 819)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-A1-A0SB-01,TCGA-A1-A0SD-01,TCGA-A1-A0SE-01,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SI-01,TCGA-A1-A0SJ-01,TCGA-A1-A0SK-01,...,TCGA-LL-A5YM-01,TCGA-LL-A5YN-01,TCGA-LL-A5YO-01,TCGA-LL-A5YP-01,TCGA-LQ-A4E4-01,TCGA-MS-A51U-01,TCGA-OL-A66H-01,TCGA-OL-A66I-01,TCGA-OL-A66J-01,TCGA-OL-A66K-01
0,UBE2Q2P2,100134869,14.3935,11.3241,4.4426,10.7401,3.0048,2.9782,4.9419,28.856,...,0.0,2.9624,3.6899,6.302,14.1288,7.9343,2.2519,1.2603,5.0428,4.3892


In [22]:
#Clean data
brca_transcriptomics = preprocess(brca_transcriptomics, index_col='Hugo_Symbol', columns_to_drop=['Entrez_Gene_Id'])
brca_transcriptomics[:2]

Dimensions:  (20440, 817)
Null values count:  0


Unnamed: 0_level_0,TCGA-A1-A0SB,TCGA-A1-A0SD,TCGA-A1-A0SE,TCGA-A1-A0SF,TCGA-A1-A0SH,TCGA-A1-A0SI,TCGA-A1-A0SJ,TCGA-A1-A0SK,TCGA-A1-A0SM,TCGA-A1-A0SN,...,TCGA-LL-A5YM,TCGA-LL-A5YN,TCGA-LL-A5YO,TCGA-LL-A5YP,TCGA-LQ-A4E4,TCGA-MS-A51U,TCGA-OL-A66H,TCGA-OL-A66I,TCGA-OL-A66J,TCGA-OL-A66K
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBE2Q2P2,14.3935,11.3241,4.4426,10.7401,3.0048,2.9782,4.9419,28.856,7.6484,5.6992,...,0.0,2.9624,3.6899,6.302,14.1288,7.9343,2.2519,1.2603,5.0428,4.3892
HMGB1P1,116.387,60.263,153.1452,141.1933,79.8003,63.5491,134.8733,1119.1932,119.476,95.0898,...,101.2865,100.3083,278.5626,206.4376,117.03,150.6834,115.3378,158.3599,124.6327,106.3475


In [23]:
assert len(brca_transcriptomics.columns[brca_transcriptomics.columns.duplicated()]) == 0, \
       "columns contain duplicates"

In [24]:
brca_transcriptomics_processed = standardised_pipeline_utils.process(brca_transcriptomics)
brca_transcriptomics_processed[:2]

Dimensions:  (16558, 817)


Unnamed: 0_level_0,TCGA-A1-A0SB,TCGA-A1-A0SD,TCGA-A1-A0SE,TCGA-A1-A0SF,TCGA-A1-A0SH,TCGA-A1-A0SI,TCGA-A1-A0SJ,TCGA-A1-A0SK,TCGA-A1-A0SM,TCGA-A1-A0SN,...,TCGA-LL-A5YM,TCGA-LL-A5YN,TCGA-LL-A5YO,TCGA-LL-A5YP,TCGA-LQ-A4E4,TCGA-MS-A51U,TCGA-OL-A66H,TCGA-OL-A66I,TCGA-OL-A66J,TCGA-OL-A66K
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,49.1992,142.2976,192.8194,326.0194,180.3235,128.455,226.7801,86.6436,307.4937,87.5786,...,1507.1099,452.4033,295.5528,136.0353,498.7862,362.5874,51.2469,207.2697,267.0482,422.2195
A1BG-AS1,75.0039,143.644,148.617,187.298,157.7045,147.4985,115.2922,65.2821,121.0795,66.0327,...,317.9009,352.0115,159.218,63.2459,250.2883,291.5545,54.4412,167.8665,174.9752,227.4997


#### Proteomic Data

In [25]:
brca_proteomics = pd.read_excel(io=file_brca_proteomics, sheet_name="Global-Proteome-G3")
print('Dimensions: ', brca_proteomics.shape)
brca_proteomics[:2]

Dimensions:  (12553, 95)


Unnamed: 0,accession_number,numITRAQExperimentsProteinObserved,numSpectraProteinObserved,protein_mw,species,accession_numbers,numPepsUnique,scoreUnique,total percent Coverage across all experiments,subgroupNum,...,AO-A12B.34TCGA,A2-A0SW.35TCGA,AO-A0JL.35TCGA,BH-A0BV.35TCGA,A2-A0YM.36TCGA,BH-A0C7.36TCGA,A2-A0SX.36TCGA,263d3f-I.CPTAC,blcdb9-I.CPTAC,c4155b-C.CPTAC
0,NP_958782,37,43425,533778.0,Human,NP_958782,678,11158.42,89.1,1.1,...,-0.963904,-0.487772,-0.10668,-0.065838,0.65585,-0.552212,-0.39856,0.598585,-0.191285,0.566975
1,NP_958785,37,42994,518637.5,Human,NP_958785,670,11042.11,90.0,1.2,...,-0.93821,-0.487772,-0.10668,-0.055893,0.658143,-0.547749,-0.392601,0.606697,-0.183918,0.578702


In [26]:
brca_proteomics[['Name', 'geneName']].head(3)

Unnamed: 0,Name,geneName
0,plectin isoform 1,PLEC
1,plectin isoform 1g,
2,plectin isoform 1a,PLEC


<div class="alert alert-block alert-warning">
<b>Note:</b> Some proteins have NaN in the geneName column even though other isoforms of the same protein exist with the correct geneName. The proteomics data therefore needs to be processed to fill in these NaN values with the appropriate geneName.
</div>

In [27]:
# Split on the protein name to get a label shared by all isoforms of a protein
# (e.g. "plectin isoform 1g" -> "plectin "), then identify the right geneName
# for each protein record. assign() leaves the frame read from file untouched.
brca_annotated = brca_proteomics.assign(
    modifiedName=brca_proteomics['Name'].str.split('isoform').str[0])

# Within each group of isoforms, copy a known geneName onto the records where it is missing.
# group_keys=False keeps the original row index on the result, so the assignment
# aligns row by row on every pandas version.
brca_annotated['geneName'] = brca_annotated.groupby('modifiedName', group_keys=False)['geneName'] \
                                           .apply(lambda names: names.ffill().bfill())

# Even after filling, if there exist NaN values for gene - we drop them,
# then average the records of each gene. numeric_only=True skips the text
# annotation columns, which recent pandas no longer drops silently.
brca_by_gene = brca_annotated.dropna(subset=['geneName']) \
                             .groupby('geneName').mean(numeric_only=True)

# Modifying sample names to suit the samples in transcriptomic data
brca_by_gene.columns = [label.replace('.', '-') for label in brca_by_gene.columns]
brca_tcga = brca_by_gene.loc[:, brca_by_gene.columns.str.contains('TCGA')]
brca_tcga = brca_tcga.rename(columns=lambda label: 'TCGA-' + str(label)[:7])
# Aggregate the duplicated columns present. Grouping the transposed frame by its
# labels replaces the deprecated groupby(..., axis=1) and gives the same means.
brca_proteomics = brca_tcga.T.groupby(level=0).mean().T

print("Null values count: ", brca_proteomics.isnull().sum().sum())
brca_proteomics[:2]

Null values count:  80573


Unnamed: 0_level_0,TCGA-A2-A0CM,TCGA-A2-A0D2,TCGA-A2-A0EQ,TCGA-A2-A0EV,TCGA-A2-A0EX,TCGA-A2-A0EY,TCGA-A2-A0SW,TCGA-A2-A0SX,TCGA-A2-A0T3,TCGA-A2-A0T6,...,TCGA-C8-A12Z,TCGA-C8-A130,TCGA-C8-A131,TCGA-C8-A134,TCGA-C8-A135,TCGA-C8-A138,TCGA-D8-A142,TCGA-E2-A154,TCGA-E2-A158,TCGA-E2-A15A
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.899882,-0.1054,-0.380704,1.699253,-0.390673,1.849433,1.560915,2.369127,-0.00341,2.519514,...,1.775668,0.850687,-0.294419,0.320314,2.185114,-2.018457,1.539879,1.856193,-1.987341,-1.277926
A2BP1,0.911162,0.882545,-0.874399,-2.084358,1.796662,-0.578121,1.50624,-0.520707,0.120195,-1.671573,...,-1.859222,1.00812,0.737591,1.945039,-2.473849,0.329426,0.957906,0.520191,0.291579,-2.101058


In [28]:
assert len(brca_proteomics.columns[brca_proteomics.columns.duplicated()]) == 0, \
       "columns contain duplicates"

In [29]:
brca_proteomics_processed = standardised_pipeline_utils.process(brca_proteomics)
brca_proteomics_processed[:2]

Dimensions:  (8052, 77)


Unnamed: 0_level_0,TCGA-A2-A0CM,TCGA-A2-A0D2,TCGA-A2-A0EQ,TCGA-A2-A0EV,TCGA-A2-A0EX,TCGA-A2-A0EY,TCGA-A2-A0SW,TCGA-A2-A0SX,TCGA-A2-A0T3,TCGA-A2-A0T6,...,TCGA-C8-A12Z,TCGA-C8-A130,TCGA-C8-A131,TCGA-C8-A134,TCGA-C8-A135,TCGA-C8-A138,TCGA-D8-A142,TCGA-E2-A154,TCGA-E2-A158,TCGA-E2-A15A
geneName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.899882,-0.1054,-0.380704,1.699253,-0.390673,1.849433,1.560915,2.369127,-0.00341,2.519514,...,1.775668,0.850687,-0.294419,0.320314,2.185114,-2.018457,1.539879,1.856193,-1.987341,-1.277926
A2BP1,0.911162,0.882545,-0.874399,-2.084358,1.796662,-0.578121,1.50624,-0.520707,0.120195,-1.671573,...,-1.859222,1.00812,0.737591,1.945039,-2.473849,0.329426,0.957906,0.520191,0.291579,-2.101058


In [30]:
transcriptomic_samples['BrCa (2016)'] = brca_transcriptomics_processed.shape[1]
proteomic_samples['BrCa (2016)'] = brca_proteomics_processed.shape[1]

In [31]:
brca_transcriptomics_processed, brca_proteomics_processed =  standardised_pipeline_utils.match_proteins_samples( \
                                                                                brca_transcriptomics_processed, 
                                                                                brca_proteomics_processed)

Number of common proteins:  7283
Number of common samples:  59


In [32]:
common_samples['BrCa (2016)'] = brca_transcriptomics_processed.shape[1]
common_genes['BrCa (2016)'] = brca_transcriptomics_processed.shape[0]

In [33]:
correlation_brca = standardised_pipeline_utils.correlate_genewise(brca_transcriptomics_processed, 
                                                                  brca_proteomics_processed, 'BrCa (2016)')

Median Spearman Correlation:  0.4175


In [34]:
correlation_brca_pearson = standardised_pipeline_utils.correlate_genewise(brca_transcriptomics_processed, 
                                                                  brca_proteomics_processed, 'BrCa (2016)', method='pearson')

Median Pearson Correlation:  0.4222


<a id="Ovarian_Cancer"></a>
### 3. Ovarian Cancer 

#### Transcriptomic Data

In [35]:
ovca_transcriptomics = pd.read_csv(file_ovca_transcriptomics, sep="\t", low_memory=False)
print('Dimensions: ', ovca_transcriptomics.shape)
ovca_transcriptomics[:2]

Dimensions:  (18633, 595)


Unnamed: 0,Hybridization REF,TCGA-01-0628-11A-01R-0361-03,TCGA-01-0630-11A-01R-0361-03,TCGA-01-0631-11A-01R-0361-03,TCGA-01-0633-11A-01R-0361-03,TCGA-01-0636-11A-01R-0361-03,TCGA-01-0637-11A-01R-0361-03,TCGA-01-0639-11A-01R-0361-03,TCGA-01-0642-11A-02R-0361-03,TCGA-04-1331-01A-01R-0435-03,...,TCGA-61-2104-01A-01R-0669-03,TCGA-61-2109-01A-01R-0669-03,TCGA-61-2110-01A-01R-0669-03,TCGA-61-2111-01A-01R-0669-03,TCGA-61-2113-01A-01R-0669-03,TCGA-61-2610-02A-01R-1140-03,TCGA-61-2611-02A-01R-1140-03,TCGA-61-2612-01A-01R-1140-03,TCGA-61-2613-01A-01R-1140-03,TCGA-61-2614-01A-01R-1140-03
0,Composite Element REF,Signal,Signal,Signal,Signal,Signal,Signal,Signal,Signal,Signal,...,Signal,Signal,Signal,Signal,Signal,Signal,Signal,Signal,Signal,Signal
1,C9orf152,5.68406070638714,5.74059738582655,5.37909487049308,5.02811262452816,4.90491745435555,5.91024526125034,6.03056527156875,5.87840655167558,5.1293974840562,...,4.46343743805034,4.53738193913162,4.06865889851126,4.58072953660696,3.64353233073353,4.85965771708381,3.69981049486697,4.68639184341207,3.98064149167257,4.12777538628098


In [36]:
# The first row of the file is a sub-header ("Composite Element REF" / "Signal"),
# not a gene, so it is removed before the shared clean-up step.
ovca_without_subheader = ovca_transcriptomics.drop(index=ovca_transcriptomics.index[0])
ovca_transcriptomics = preprocess(ovca_without_subheader, index_col='Hybridization REF',
                                  columns_to_drop=[])

Dimensions:  (18632, 594)
Null values count:  0


In [37]:
assert len(ovca_transcriptomics.columns[ovca_transcriptomics.columns.duplicated()]) != 0, "columns contain no duplicates"

In [38]:
ovca_transcriptomics = ovca_transcriptomics.loc[:, ~ovca_transcriptomics.columns.duplicated(keep='first')]

In [39]:
ovca_transcriptomics_processed = standardised_pipeline_utils.process(ovca_transcriptomics)
ovca_transcriptomics_processed[:2]

Dimensions:  (18632, 579)


Unnamed: 0_level_0,TCGA-01-0628,TCGA-01-0630,TCGA-01-0631,TCGA-01-0633,TCGA-01-0636,TCGA-01-0637,TCGA-01-0639,TCGA-01-0642,TCGA-04-1331,TCGA-04-1332,...,TCGA-61-2104,TCGA-61-2109,TCGA-61-2110,TCGA-61-2111,TCGA-61-2113,TCGA-61-2610,TCGA-61-2611,TCGA-61-2612,TCGA-61-2613,TCGA-61-2614
Hybridization REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15E1.2,8.518316,7.370906,7.708458,7.786219,8.224809,7.956135,7.197224,7.94985,8.704958,8.88289,...,8.731404,9.531594,8.583059,8.847043,8.976926,8.046281,9.136807,8.983845,9.509617,9.763647
2'-PDE,6.252983,6.297369,6.206035,6.278075,6.339588,6.165869,6.159611,6.257575,6.26453,6.662592,...,6.576362,6.345743,7.163527,6.374608,6.481556,6.845093,6.108022,6.476601,6.654893,6.04205


#### Proteomic Data

In [40]:
ovca_proteomics = pd.read_excel(io=file_ovca_proteomics, sheet_name="FullOvarianProteomeData")
print('Dimensions: ', ovca_proteomics.shape)
ovca_proteomics[:2]

Dimensions:  (9600, 208)


Unnamed: 0,refseq_peptide,hgnc_symbol,PNNL-TCGA-09-1664,PNNL-TCGA-13-1484,PNNL-TCGA-13-1488,PNNL-TCGA-13-1489,PNNL-TCGA-13-1494,PNNL-TCGA-13-1495,PNNL-TCGA-13-1499,PNNL-TCGA-13-2071,...,JHU-TCGA-61-1911,JHU-TCGA-61-1914,JHU-TCGA-61-1915,JHU-TCGA-61-1918,JHU-TCGA-61-1995,JHU-TCGA-61-2008,JHU-TCGA-61-2087,JHU-TCGA-61-2088,JHU-TCGA-61-2094,JHU-TCGA-61-2613
0,NP_000005,A2M,-0.0581,-0.688,0.518,-0.432,0.333,0.109,-1.23,0.172,...,0.26575,-0.68825,-0.07125,-0.76725,-0.82125,-0.36055,-1.02225,-1.17225,-0.13225,1.16575
1,NP_000007,ACADM,-0.104,0.404,0.0714,0.272,-0.369,0.175,-0.295,-0.425,...,0.073225,-0.326775,-1.069775,-0.383775,0.105225,0.365225,-0.277775,0.401225,0.056225,-0.372775


In [41]:
# Clean data: index the rows by gene symbol and discard the RefSeq identifier column
ovca_proteomics = ovca_proteomics.set_index('hgnc_symbol').drop(columns='refseq_peptide')
# Keep fields 2-4 of each label (e.g. "PNNL-TCGA-09-1664" -> "TCGA-09-1664") so the
# sample IDs have the same form as in the transcriptomic data
ovca_proteomics.columns = ['-'.join(parts) for parts in ovca_proteomics.columns.str.split('-').str[1:4]]
# taking mean of the replicates (columns that now share a sample ID); grouping the
# transposed frame by its labels replaces the deprecated groupby(..., axis=1)
ovca_proteomics = ovca_proteomics.T.groupby(level=0).mean().T
# taking mean of the isoforms (rows that share a gene symbol)
ovca_proteomics = ovca_proteomics.groupby(level=0).mean()

In [42]:
assert len(ovca_proteomics.columns[ovca_proteomics.columns.duplicated()]) == 0, "columns contain duplicates"

In [43]:
ovca_proteomics_processed = standardised_pipeline_utils.process(ovca_proteomics)
ovca_proteomics_processed[:2]

Dimensions:  (6627, 174)


Unnamed: 0_level_0,TCGA-09-1664,TCGA-09-2056,TCGA-13-1404,TCGA-13-1409,TCGA-13-1410,TCGA-13-1482,TCGA-13-1483,TCGA-13-1484,TCGA-13-1485,TCGA-13-1487,...,TCGA-61-1919,TCGA-61-1995,TCGA-61-2008,TCGA-61-2087,TCGA-61-2088,TCGA-61-2094,TCGA-61-2095,TCGA-61-2096,TCGA-61-2612,TCGA-61-2613
hgnc_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.31685,-0.6323,-1.0193,-0.6973,-0.429,-0.427,-0.402,-1.06,-0.2143,-0.6443,...,-0.00193,-0.12235,0.63485,-1.1153,-0.8383,-0.0673,0.635,1.91,0.0689,0.9097
A2M,-0.138175,0.00675,-0.86525,-0.98125,-0.00425,-0.46865,-1.12625,-0.688,-0.58425,-1.38325,...,-0.0036,-0.706125,-0.549775,-1.02225,-1.17225,-0.13225,0.91,0.353,0.35,1.16575


In [44]:
transcriptomic_samples['Ovarian'] = ovca_transcriptomics_processed.shape[1]
proteomic_samples['Ovarian'] = ovca_proteomics_processed.shape[1]

In [45]:
ovca_transcriptomics_processed, ovca_proteomics_processed =  standardised_pipeline_utils.match_proteins_samples( \
                                                                                ovca_transcriptomics_processed, 
                                                                                ovca_proteomics_processed)

Number of common proteins:  5841
Number of common samples:  174


In [46]:
common_samples['Ovarian'] = ovca_transcriptomics_processed.shape[1]
common_genes['Ovarian'] = ovca_transcriptomics_processed.shape[0]

In [47]:
correlation_ovca = standardised_pipeline_utils.correlate_genewise(ovca_transcriptomics_processed, 
                                                                   ovca_proteomics_processed, 'Ovarian')

Median Spearman Correlation:  0.4121


In [48]:
correlation_ovca_pearson = standardised_pipeline_utils.correlate_genewise(ovca_transcriptomics_processed, 
                                                                   ovca_proteomics_processed, 'Ovarian', method='pearson')

Median Pearson Correlation:  0.4097


<a id="Colon_Cancer"></a>
### 4. Colon Cancer

#### Transcriptomic Data

In [49]:
colon_transcriptomics = pd.read_csv(file_colon_transcriptomics, sep='\t',  low_memory=False)
print('Dimensions: ', colon_transcriptomics.shape)
colon_transcriptomics[:2]

Dimensions:  (13482, 107)


Unnamed: 0,attrib_name,01CO001,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,...,20CO001,20CO003,20CO004,20CO007,21CO006,21CO007,22CO004,22CO006,24CO005,27CO004
0,A1CF,10.1984,10.1586,9.2119,10.7322,9.716,9.5177,9.6275,9.5736,4.858,...,8.5078,2.585,10.1408,9.0688,9.2336,10.0768,10.9293,9.2432,8.6546,9.8025
1,A2M,13.8117,12.6434,15.004,13.9284,12.9963,11.87,13.0274,12.5637,11.6234,...,14.2033,13.7171,11.3078,12.7292,12.9123,13.4762,12.6131,14.262,13.025,13.1139


In [50]:
colon_transcriptomics = preprocess(colon_transcriptomics, index_col='attrib_name')

Dimensions:  (13482, 106)
Null values count:  0


In [51]:
assert len(colon_transcriptomics.columns[colon_transcriptomics.columns.duplicated()]) == 0, "columns contain duplicates"

In [52]:
colon_transcriptomics_processed = standardised_pipeline_utils.process(colon_transcriptomics)
colon_transcriptomics_processed[:2]

Dimensions:  (13448, 106)


Unnamed: 0_level_0,01CO001,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,...,20CO001,20CO003,20CO004,20CO007,21CO006,21CO007,22CO004,22CO006,24CO005,27CO004
attrib_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,10.1984,10.1586,9.2119,10.7322,9.716,9.5177,9.6275,9.5736,4.858,9.2432,...,8.5078,2.585,10.1408,9.0688,9.2336,10.0768,10.9293,9.2432,8.6546,9.8025
A2M,13.8117,12.6434,15.004,13.9284,12.9963,11.87,13.0274,12.5637,11.6234,15.3414,...,14.2033,13.7171,11.3078,12.7292,12.9123,13.4762,12.6131,14.262,13.025,13.1139


#### Proteomic Data

In [53]:
colon_proteomics = pd.read_csv(file_colon_proteomics, sep='\t')
print('Dimensions: ', colon_proteomics.shape)
colon_proteomics[:2]

Dimensions:  (6422, 97)


Unnamed: 0,attrib_name,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,...,20CO001,20CO003,20CO004,20CO006,20CO007,21CO006,21CO007,22CO004,22CO006,27CO004
0,A1BG,-1.672,-2.15,-1.786,-2.115,-1.793,-2.526,-1.441,1.047,-1.878,...,0.297,0.005,-3.39,-1.648,-1.72,0.038,-1.952,-1.3342,-0.069,-2.602
1,A1CF,-0.34,-0.3476,-0.124,-0.197,-0.73,,-0.103,-1.126,0.507,...,-0.097,-1.693,0.985,0.9395,,-0.877,0.777,0.112,-0.519,0.1017


In [54]:
colon_proteomics = preprocess(colon_proteomics, index_col='attrib_name')

Dimensions:  (6422, 96)
Null values count:  64560


In [55]:
assert len(colon_proteomics.columns[colon_proteomics.columns.duplicated()]) == 0, "columns contain duplicates"

In [56]:
colon_proteomics_processed = standardised_pipeline_utils.process(colon_proteomics)
colon_proteomics_processed[:2]

Dimensions:  (5152, 96)


Unnamed: 0_level_0,01CO005,01CO006,01CO008,01CO013,01CO014,01CO015,01CO019,01CO022,05CO002,05CO003,...,20CO001,20CO003,20CO004,20CO006,20CO007,21CO006,21CO007,22CO004,22CO006,27CO004
attrib_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-1.672,-2.15,-1.786,-2.115,-1.793,-2.526,-1.441,1.047,-1.878,-0.294,...,0.297,0.005,-3.39,-1.648,-1.72,0.038,-1.952,-1.3342,-0.069,-2.602
A1CF,-0.34,-0.3476,-0.124,-0.197,-0.73,,-0.103,-1.126,0.507,-1.657,...,-0.097,-1.693,0.985,0.9395,,-0.877,0.777,0.112,-0.519,0.1017


In [57]:
transcriptomic_samples['Colon'] = colon_transcriptomics_processed.shape[1]
proteomic_samples['Colon'] = colon_proteomics_processed.shape[1]

In [58]:
colon_transcriptomics_processed, colon_proteomics_processed =  standardised_pipeline_utils.match_proteins_samples( \
                                                                                colon_transcriptomics_processed, 
                                                                                colon_proteomics_processed)

Number of common proteins:  4970
Number of common samples:  95


In [59]:
common_samples['Colon'] = colon_transcriptomics_processed.shape[1]
common_genes['Colon'] = colon_transcriptomics_processed.shape[0]

In [60]:
correlation_colon = standardised_pipeline_utils.correlate_genewise(colon_transcriptomics_processed, 
                                                                    colon_proteomics_processed, 'Colon')

Median Spearman Correlation:  0.2702


In [61]:
correlation_colon_pearson = standardised_pipeline_utils.correlate_genewise(colon_transcriptomics_processed, 
                                                                    colon_proteomics_processed, 'Colon', method='pearson')

Median Pearson Correlation:  0.2791


In [62]:
correlation_combined = pd.concat([correlation_crc, correlation_brca, correlation_ovca, correlation_colon], axis=1)
correlation_combined.to_csv(file_tumour_correlation)
correlation_combined[:2]

Unnamed: 0,CRC (2014),BrCa (2016),Ovarian,Colon
A1BG,0.123133,-0.093805,-0.078889,
A2M,-0.037595,0.118235,0.191865,-0.026288


In [63]:
correlation_combined_pearson = pd.concat([correlation_crc_pearson, correlation_brca_pearson, 
                                          correlation_ovca_pearson, correlation_colon_pearson], axis=1)
round(correlation_combined_pearson.median(), 2)

CRC (2014)     0.22
BrCa (2016)    0.42
Ovarian        0.41
Colon          0.28
dtype: float64

In [64]:
def dataframe_from_dict(*dict_args):
    """Combine per-study dictionaries into one summary table indexed by 'Data'.

    Each positional argument maps a study label to a value and becomes one
    column. The five labels assigned below (index plus four columns) are fixed,
    so exactly four dictionaries must be passed.
    """
    frames = [pd.DataFrame.from_dict(mapping, orient='index') for mapping in dict_args]
    summary = frames[0]
    # Append the remaining dictionaries one column at a time, aligned on the study label.
    for extra in frames[1:]:
        summary = pd.concat([summary, extra], axis=1)
    summary = summary.reset_index()
    summary.columns = ['Data', '# Samples in Trancriptomic Data', '# Samples in Proteomic Data',
                       '# Common Samples', '# Common Proteins']
    return summary.set_index('Data')

sample_data = dataframe_from_dict(transcriptomic_samples, proteomic_samples, common_samples, common_genes)
sample_data.to_csv(file_samples_info)
sample_data

Unnamed: 0_level_0,# Samples in Trancriptomic Data,# Samples in Proteomic Data,# Common Samples,# Common Proteins
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CRC (2014),244,90,77,3238
BrCa (2016),817,77,59,7283
Ovarian,579,174,174,5841
Colon,106,96,95,4970
