### Process protein properties

**Input:** Protein abundance (GTEx), protein unique peptides (GTEx), protein half-lives and other biochemical properties

**Output:** Proteins and their properties

<div class="alert alert-block alert-info">
<b>Note:</b>      
Protein abundance is computed as the mean of the protein abundances across the 32 healthy tissues. The standard pipeline is applied before computing the mean abundances.
</div>

In [1]:
import os
import math
import datetime
import numpy as np
import pandas as pd

import standardised_pipeline_utils

In [2]:
def get_data_path(folders, fname):
    """Build the normalised path of a shared input file.

    The path is ``$DATA_PATH/<folders joined by '/'>/<fname>``, passed through
    ``os.path.normpath``. ``folders`` is an iterable of folder-name strings.
    Raises ``KeyError`` when the ``DATA_PATH`` environment variable is unset.
    """
    data_root = os.environ['DATA_PATH']
    sub_dirs = '/'.join(folders)
    return os.path.normpath(data_root + '/' + sub_dirs + '/' + fname)


def get_local_data_path(folders, fname):
    """Build the normalised path of a file under the relative ``../local_data/`` folder.

    The path is ``../local_data/<folders joined by '/'>/<fname>``, passed
    through ``os.path.normpath``.
    """
    sub_dirs = '/'.join(folders)
    return os.path.normpath('../local_data/' + sub_dirs + '/' + fname)

In [3]:
# Input files, resolved below the folder named by the DATA_PATH environment variable
file_gtex_abundance = get_data_path(folders=['jiang_2020'], fname='Table_S2.xlsx')
file_gtex_experiment_info = get_data_path(folders=['jiang_2020'], fname='Table_S4.xlsx')
file_protein_properties = get_data_path(folders=['protein_properties', 'zecha_2018'],
                                        fname='Table_S3.xlsx')

# Output file, resolved below the relative ../local_data folder
file_processed_properties = get_local_data_path(folders=['processed'],
                                                fname='protein_properties.csv')

### GTEx Mean Abundance

In [4]:
# Read the normalised abundance sheet: the first three rows are skipped and the
# 'gene.id' column becomes the row index.
gtex_abundance = pd.read_excel(
    io=file_gtex_abundance,
    sheet_name="C protein normalized abundance",
    skiprows=3,
    index_col='gene.id',
    engine='openpyxl',
)
print("Dimensions: ", gtex_abundance.shape)
gtex_abundance.head(2)

Dimensions:  (12627, 533)


Unnamed: 0_level_0,gene.id.full,reference,GTEX-12WSD-0526-SM-9KNJH,GTEX-11DXX-1226-SM-9KI3K,GTEX-12WSD-1326-SM-9KMWZ,GTEX-13D11-0226-SM-9KI5R,GTEX-1211K-0226-SM-9KMWV,GTEX-YEC4-1426-SM-9KI3B,GTEX-12WSD-1426-SM-9KMX4,GTEX-12WSD-2626-SM-9KMX7,...,reference.110,GTEX-13D11-2626-SM-9KNJO.1,GTEX-11GSP-0226-SM-9KI3T.1,GTEX-1211K-1726-SM-9KI5K.1,GTEX-13FTW-0526-SM-9KNJC.1,GTEX-11GSP-2126-SM-9KI3Y.1,GTEX-13FTW-1526-SM-9KNJB.1,GTEX-11GSP-1126-SM-9KMWC.2,GTEX-13OW8-0226-SM-9KI4E.1,reference.111
gene.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,ENSG00000000003.10,15.0,,,,,,,,,...,15.0,,,,,,,,,15.0
ENSG00000000419,ENSG00000000419.8,15.0,,,,,,,,,...,15.0,,,,,,,,,15.0


In [5]:
# Read the sheet that is used later to translate ensembl ids into HGNC symbols.
# The first two rows are skipped and 'hgnc_symbol' values are passed through str.
gtex_id_mapper = pd.read_excel(
    io=file_gtex_abundance,
    sheet_name='G protein TS score',
    skiprows=2,
    engine='openpyxl',
    converters={'hgnc_symbol': str},
)

In [6]:
# Remove every column whose name matches 'reference' together with the
# versioned gene id column, leaving only the sample columns.
reference_columns = list(gtex_abundance.filter(regex='reference'))
gtex_abundance = gtex_abundance.drop(columns=reference_columns + ['gene.id.full'])

# Relabel the rows: look each ensembl id up in the mapper to get its HGNC symbol.
ensembl_to_symbol = gtex_id_mapper.set_index('ensembl_id')['hgnc_symbol']
gtex_abundance.index = gtex_abundance.index.map(ensembl_to_symbol)

In [7]:
# Apply the shared processing pipeline, then average each row over its columns.
gtex_abundance_processed = standardised_pipeline_utils.process(gtex_abundance)
row_means = gtex_abundance_processed.mean(axis=1)
gtex_abundance_mean = row_means.to_frame(name='GTEx_Mean_Abundance')
gtex_abundance_mean.head(2)

Dimensions:  (4725, 420)


Unnamed: 0_level_0,GTEx_Mean_Abundance
gene.id,Unnamed: 1_level_1
A1BG,1357.680477
A2M,10540.179562


In [8]:
# Row-wise variance of the processed abundances, kept as a one-column frame.
row_variances = gtex_abundance_processed.var(axis=1)
gtex_abundance_variance = row_variances.to_frame(name='GTEx_Abundance_Variance')
gtex_abundance_variance.head(2)

Unnamed: 0_level_0,GTEx_Abundance_Variance
gene.id,Unnamed: 1_level_1
A1BG,2402719.0
A2M,127213300.0


### GTEx Unique Peptides

In [9]:
# Read the experiment information sheet: the first two rows are skipped and
# 'hgnc_symbol' becomes the row index. No converter is applied to that column,
# so index labels keep whatever type the reader produced for each cell.
# The engine is pinned to openpyxl to match the other .xlsx reads in this notebook.
gtex_experiment_info = pd.read_excel(
    io=file_gtex_experiment_info,
    sheet_name="A enrichment comparison",
    index_col='hgnc_symbol',
    skiprows=2,
    engine='openpyxl',
)
print("Dimensions: ", gtex_experiment_info.shape)
gtex_experiment_info[:2]

Dimensions:  (12627, 62)


Unnamed: 0_level_0,ensembl_id,entrez_id,hgnc_name,prt_total_pp_cnt_from_56_runs,prt_unique_pp_cnt,prt_confident_indicator_from_full_search,prt_sample_num_out_of_201_after_imputation,prt_tissue_num_out_of_32_after_imputation,prt_sample_median,prt_sample_MAD,...,HPA:Reliability..IH.,HPA:Reliability..Mouse.Brain.,HPA:Reliability..IF.,HPA:Subcellular.location,HPA:Prognostic.p.value,HPA:RNA.cancer.category,HPA:RNA.tissue.category,HPA:RNA.TS,HPA:RNA.TS.TPM,HPA:TPM.max.in.non.specific
hgnc_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SCYL3,ENSG00000000457,57147.0,SCY1 like pseudokinase 3,8,3,high.conf,55,26,-0.06,0.65,...,Approved,,Uncertain,Nuclear bodies<br>Microtubules,Urothelial cancer:8.85e-4 (favourable),Expressed in all,Expressed in all,,,parathyroid gland: 44.0
CFH,ENSG00000000971,3075.0,complement factor H,2521,76,high.conf,201,32,-0.68,1.45,...,Supported,,Approved,Vesicles,Renal cancer:1.92e-6 (unfavourable),Tissue enriched,Tissue enhanced,,liver: 838.9,gallbladder: 206.0


In [10]:
# Keep only the unique peptide count column of the experiment information table
gtex_unique_peptides = gtex_experiment_info['prt_unique_pp_cnt']
# Count the index labels that repeat an earlier label
n_repeated_labels = int(gtex_unique_peptides.index.duplicated().sum())
print("# of duplicated values:", n_repeated_labels)

# of duplicated values: 6


In [11]:
# Keep the first row for every repeated index label, then turn the Series into
# a one-column frame.
is_repeat = gtex_unique_peptides.index.duplicated(keep='first')
gtex_unique_peptides = gtex_unique_peptides[~is_repeat].to_frame(name='GTEx_Unique_Peptides')

# Remove rows whose index label is a datetime object instead of text
# (presumably gene symbols that the spreadsheet stored as dates - TODO confirm).
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# the previous code discarded it and the rows were never removed.
datetime_labels = [label for label in gtex_unique_peptides.index
                   if isinstance(label, datetime.datetime)]
gtex_unique_peptides = gtex_unique_peptides.drop(index=datetime_labels)
gtex_unique_peptides[:2]

Unnamed: 0_level_0,GTEx_Unique_Peptides
hgnc_symbol,Unnamed: 1_level_1
SCYL3,3
CFH,76


### Biochemical Properties

In [12]:
# Read the sheet with the protein properties and functions.
# The engine is pinned to openpyxl to match the other .xlsx reads in this notebook.
protein_properties = pd.read_excel(
    io=file_protein_properties,
    sheet_name='Properties and functions',
    engine='openpyxl',
)
print("Dimensions: ", protein_properties.shape)
protein_properties[:2]

Dimensions:  (7203, 105)


Unnamed: 0,ProteinGroup.id,UniProt identifier(s),Gene name(s),Protein name(s),Potential contaminant,Number of proteins in group,Unique peptides,Sequence coverage [%],Valid values for K,Cell culture replicate values for K,...,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104
0,3451,Q13685,AAMP,Angio-associated migratory cell protein,,1,13,41.2,8,4,...,,,,,,,,,,
1,4121,Q5JTZ9,AARS2,Alanine--tRNA ligase;mitochondrial,,1,16,22.4,8,4,...,,,,,,,,,,


In [13]:
# Show the column headers, leaving out those whose name contains 'Unnamed:'
named_columns = list(filter(lambda header: 'Unnamed:' not in header, protein_properties.columns))
print('Columns present: \n', named_columns)

Columns present: 
 ['ProteinGroup.id', 'UniProt identifier(s)', 'Gene name(s)', 'Protein name(s)', 'Potential contaminant', 'Number of proteins in group', 'Unique peptides', 'Sequence coverage [%]', 'Valid values for K', 'Cell culture replicate values for K', 'Protein sequence length', 'Molecular weight [kDa]', 'K [h-1]', 'k [h-1]', 'T50% [h]', 'T1/2 [h]', 'Copies per cell', 'Copies per h&cell', 'Copies per h&cell corrected for cell doubling', 'Copies (Zeiler et al.)', 'Copies (Nagaraj et al.)', 'Coil [%]', 'Helix [%]', 'Sheet [%]', 'GRAVY Score', 'A [%]', 'C [%]', 'D [%]', 'E [%]', 'G [%]', 'F [%]', 'H [%]', 'I [%]', 'K [%]', 'L [%]', 'M [%]', 'N [%]', 'P [%]', 'Q [%]', 'R [%]', 'S [%]', 'T [%]', 'V [%]', 'W [%]', 'Y [%]', 'Melting temperature (Leuenberger et al.)', 'Thermal Stability Category (Leuenberger et al.)', 'AA after iMet', 'CORUM', 'GOBP name', 'GOMF name', 'GOCC name', 'KEGG name', 'UniProt Keywords', 'Prosite name', 'Respiratory Chain Member', 'Subcellular location (HPA)',

In [14]:
# Keep the gene name plus the three properties of interest, and swap the
# textual 'inf.' marker for a real floating-point infinity.
property_columns = ['Gene name(s)', 'Protein sequence length', 'Molecular weight [kDa]', 'T1/2 [h]']
biochemical_properties = protein_properties[property_columns].copy()
biochemical_properties = biochemical_properties.replace('inf.', math.inf)

In [15]:
# The 'Gene name(s)' column contains a few rows with multiple gene names separated by ';'.
# Give every gene name its own row:
#   1. move the three property columns into the index, so they are carried along untouched;
#   2. split each remaining column on ';' and explode it, which repeats the index
#      (i.e. the property values) once per gene name;
#   3. turn the properties back into columns and index the frame by 'Gene name(s)'.
# NOTE(review): DataFrame.explode on 'Gene name(s)' looks like a simpler equivalent -
# confirm the resulting dtypes match before swapping.
biochemical_properties = biochemical_properties.set_index(['Protein sequence length', 'Molecular weight [kDa]', 
                                                           'T1/2 [h]', ]).apply(lambda x: x.str.split(';').explode()) \
                                                                       .reset_index().set_index('Gene name(s)')

In [16]:
# Convert every property column to a numeric dtype, one column at a time
biochemical_properties = biochemical_properties.apply(lambda column: pd.to_numeric(column))
biochemical_properties.head(2)

Unnamed: 0_level_0,Protein sequence length,Molecular weight [kDa],T1/2 [h]
Gene name(s),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAMP,434,46.75,10.602299
AARS2,985,107.34,42.752163


In [17]:
# Keep the first row for every repeated gene name.
is_repeat = biochemical_properties.index.duplicated(keep='first')
biochemical_properties = biochemical_properties[~is_repeat]

# Remove rows whose index label is a datetime object instead of text.
datetime_labels = [label for label in biochemical_properties.index
                   if isinstance(label, datetime.datetime)]
biochemical_properties = biochemical_properties.drop(index=datetime_labels)

# Remove rows without a gene name. A boolean mask is used instead of
# drop(index=np.nan) because drop raises KeyError when no NaN label is present.
biochemical_properties = biochemical_properties[biochemical_properties.index.notna()]

In [18]:
# Combine the properties side by side (rows are aligned on the index labels)
all_protein_properties = pd.concat([gtex_abundance_mean, gtex_abundance_variance,
                                    gtex_unique_peptides, biochemical_properties], axis=1)

# to_csv does not create missing folders, so make sure the output folder exists
# before writing; this keeps a run on a fresh checkout from failing at the last step.
output_dir = os.path.dirname(file_processed_properties)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
all_protein_properties.to_csv(file_processed_properties)