# Mapping labels to IRS normalized proteomics results

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

import sys
import os

### Import normalized data

In [2]:
# Load the IRS-normalized intensities; the first (unnamed) CSV column holds the
# FASTA header of each protein, so give it a descriptive name.
ori_df = (
    pd.read_csv('../Intermediate data/IRS_norm_data_no_labels_sorted.csv')
    .rename(columns={'Unnamed: 0': 'fasta_header'})
)
ori_df.head(50)

Unnamed: 0,fasta_header,BL11,BL12,BL21,BL22,CF11,CF12,CF21,CF22,POOL1,...,POOL3,POOL4,S111,S112,S121,S122,S711,S712,S721,S722
0,CONT_011|gi|136425|sp|P00760|TRYP_BOVIN (+1),3834882.0,3910692.0,3873719.0,3901555.0,3731300.0,4510965.0,4588062.0,4236974.0,3904010.0,...,4161065.0,3962332.0,3785124.0,3831308.0,4691136.0,4209506.0,3870142.0,3996458.0,4224602.0,5063368.0
1,CONT_060|gi|542923|pir||S37780 (+2),57688.47,69161.08,44990.92,45472.18,51291.44,54926.03,83719.12,65864.2,58848.43,...,57980.25,59068.61,61245.9,67959.17,65552.73,72750.93,41214.19,47596.26,32997.22,31030.07
2,CONT_068|gi|1082558|pir||S41161 (+1),6199343.0,7064887.0,6312329.0,6931382.0,5935896.0,6075726.0,6754209.0,12182310.0,6828453.0,...,6827672.0,7112692.0,4897239.0,7263064.0,6779768.0,6442336.0,4001778.0,8255296.0,8388862.0,8690249.0
3,CONT_072|gi|547754|sp|P35908|K22E_HUMAN (+1),1580685.0,1583641.0,1860521.0,2163102.0,1613313.0,1448806.0,1885191.0,1723407.0,1812941.0,...,1868804.0,1834231.0,875393.1,2208658.0,1820206.0,1493089.0,1374815.0,2530316.0,2460016.0,2744847.0
4,CONT_089|gi|71536|pir||KRHU2,1919952.0,2442960.0,2154624.0,2331342.0,2301886.0,2177643.0,2417075.0,4278359.0,2414614.0,...,2422754.0,2404175.0,1594131.0,2433767.0,2269867.0,2144572.0,1557455.0,2699832.0,3269444.0,3409155.0
5,CONT_092|gi|71528|pir||KRHU0 (+2),3971203.0,4112515.0,4550645.0,5087362.0,4503944.0,4189552.0,4709996.0,5307843.0,4811199.0,...,4856297.0,4939114.0,2985942.0,5573998.0,5115626.0,4021971.0,4122821.0,6309237.0,6940652.0,6734369.0
6,EXTRA_0117 (+1),14402250.0,19763660.0,17772670.0,13234840.0,10612000.0,14418630.0,15983020.0,12908400.0,36802390.0,...,35947730.0,36897530.0,12719640.0,11921480.0,16374610.0,12480080.0,122818200.0,124455000.0,100721500.0,110005300.0
7,EXTRA_0118,435469700.0,457484800.0,454824700.0,416484200.0,348922100.0,417457500.0,397568500.0,370804200.0,835702100.0,...,852357900.0,845367100.0,1744299000.0,1499142000.0,1547300000.0,1508134000.0,1096133000.0,1057119000.0,1002331000.0,1096688000.0
8,EXTRA_0119,153978900.0,200082000.0,197986500.0,159637100.0,108041200.0,249540700.0,197419700.0,183698200.0,835979800.0,...,856586900.0,841219000.0,2061747000.0,1934594000.0,1918482000.0,1835840000.0,1233054000.0,1280986000.0,1098762000.0,1286722000.0
9,EXTRA_0121,15008440.0,17617010.0,16914160.0,14695690.0,12073330.0,14554800.0,16149000.0,12521560.0,31340590.0,...,30516430.0,31364870.0,11665780.0,11778680.0,15107030.0,12370140.0,89150360.0,99356920.0,82532120.0,88802210.0


### Extract accession numbers from UNIPROT Identifiers
##### The labels come from the FASTA headers included in the FASTA database UP000002032 for Escherichia coli (strain B / BL21-DE3) merged with the possible contamination and exogenously expressed protein sequences. These FASTA files are included in the Initial files directory.

##### Accession numbers are pulled from proteins in the database, and contamination/exogenously expressed proteins are flagged

In [4]:
def pull_acc(accession):
    """Classify a FASTA header and extract its accession number.

    The header is tested for marker substrings in priority order, so a header
    containing several markers is labeled by the first one that matches:

    1. ``'REV'``   -> ``'REV'``
    2. ``'CONT'``  -> ``'cont'``
    3. ``'EXTRA'`` -> ``'extra'``

    Any other header containing ``'|'`` is split on the pipe and its second
    field is returned (e.g. ``'tr|C6EGE8|C6EGE8_ECOBD'`` -> ``'C6EGE8'``).
    Headers with neither a marker nor a pipe yield ``'unknown'``.

    Parameters
    ----------
    accession : str
        FASTA header text.

    Returns
    -------
    str
        ``'REV'``, ``'cont'``, ``'extra'``, the pipe-delimited accession, or
        ``'unknown'``.
    """
    # NOTE: markers are matched anywhere in the header (substring test), not
    # only as a prefix.
    if 'REV' in accession:
        return 'REV'
    if 'CONT' in accession:
        return 'cont'
    if 'EXTRA' in accession:
        return 'extra'
    if '|' in accession:
        return accession.split('|')[1]
    return 'unknown'
    
# Label every row with its accession number or category flag
ori_df['Entry'] = ori_df['fasta_header'].apply(pull_acc)

# Remove the contamination and the incorrectly aligned entries in one pass
ori_df = ori_df[~ori_df['Entry'].isin(['cont', 'REV'])]

ori_df

Unnamed: 0,fasta_header,BL11,BL12,BL21,BL22,CF11,CF12,CF21,CF22,POOL1,...,POOL4,S111,S112,S121,S122,S711,S712,S721,S722,Entry
6,EXTRA_0117 (+1),1.440225e+07,1.976366e+07,1.777267e+07,1.323484e+07,1.061200e+07,1.441863e+07,1.598302e+07,1.290840e+07,3.680239e+07,...,3.689753e+07,1.271964e+07,1.192148e+07,1.637461e+07,1.248008e+07,1.228182e+08,1.244550e+08,1.007215e+08,1.100053e+08,extra
7,EXTRA_0118,4.354697e+08,4.574848e+08,4.548247e+08,4.164842e+08,3.489221e+08,4.174575e+08,3.975685e+08,3.708042e+08,8.357021e+08,...,8.453671e+08,1.744299e+09,1.499142e+09,1.547300e+09,1.508134e+09,1.096133e+09,1.057119e+09,1.002331e+09,1.096688e+09,extra
8,EXTRA_0119,1.539789e+08,2.000820e+08,1.979865e+08,1.596371e+08,1.080412e+08,2.495407e+08,1.974197e+08,1.836982e+08,8.359798e+08,...,8.412190e+08,2.061747e+09,1.934594e+09,1.918482e+09,1.835840e+09,1.233054e+09,1.280986e+09,1.098762e+09,1.286722e+09,extra
9,EXTRA_0121,1.500844e+07,1.761701e+07,1.691416e+07,1.469569e+07,1.207333e+07,1.455480e+07,1.614900e+07,1.252156e+07,3.134059e+07,...,3.136487e+07,1.166578e+07,1.177868e+07,1.510703e+07,1.237014e+07,8.915036e+07,9.935692e+07,8.253212e+07,8.880221e+07,extra
10,EXTRA_0125,3.332058e+07,3.612639e+07,3.704711e+07,3.125256e+07,5.372157e+07,5.407940e+07,5.730539e+07,5.168573e+07,6.714455e+07,...,6.536687e+07,3.140472e+07,3.339494e+07,3.510481e+07,3.100372e+07,1.433261e+08,1.434790e+08,1.489743e+08,1.757447e+08,extra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1907,tr|A0A140SSC5|A0A140SSC5_ECOBD,2.947686e+06,1.996774e+06,2.225339e+06,2.449551e+06,2.298617e+06,2.015096e+06,1.811964e+06,1.979171e+06,1.999134e+06,...,1.950416e+06,1.751082e+06,1.652027e+06,1.641154e+06,1.746556e+06,1.701948e+06,1.677219e+06,1.916319e+06,1.604518e+06,A0A140SSC5
1908,tr|C6EGE8|C6EGE8_ECOBD,3.293396e+07,2.955156e+07,2.838886e+07,2.649919e+07,3.626575e+07,3.445293e+07,3.259728e+07,3.229829e+07,3.380977e+07,...,3.344518e+07,4.331682e+07,3.605674e+07,3.886754e+07,3.496004e+07,3.630116e+07,2.949580e+07,3.714245e+07,3.704132e+07,C6EGE8
1914,tr|A0A140N3T4|A0A140N3T4_ECOBD_family,9.913333e+07,1.185221e+08,1.178427e+08,8.869173e+07,8.172502e+07,9.163873e+07,1.089065e+08,7.856521e+07,2.432978e+08,...,2.408500e+08,7.316567e+07,6.734767e+07,9.699063e+07,7.824137e+07,7.371883e+08,7.150568e+08,7.386923e+08,8.135044e+08,A0A140N3T4
1915,EXTRA_0123_family,9.621162e+07,1.097802e+08,1.083865e+08,9.617083e+07,8.786867e+07,9.855935e+07,1.051171e+08,8.943943e+07,1.581136e+08,...,1.573007e+08,7.988909e+07,8.524818e+07,8.753795e+07,8.202545e+07,3.717960e+08,3.870587e+08,3.546405e+08,3.905648e+08,extra


### Map the names of the expressed proteins to their intensities

In [11]:
def expressed_protein_accession(row):
    """Return the accession to use for ``row``.

    The row's ``'fasta_header'`` is searched for each identifier in
    ``exp_df['Entry']`` (in order). On the first identifier found as a
    substring, the matching value from ``exp_accession`` is returned;
    otherwise the row's existing ``'Entry'`` value is returned unchanged.
    """
    header = row['fasta_header']
    hit = next((tag for tag in exp_df['Entry'] if tag in header), None)
    if hit is None:
        return row['Entry']
    return exp_accession[hit]

def find_expressed(row):
    """Flag a row whose ``'fasta_header'`` contains an ``exp_df['Entry']`` id.

    Prints ``'found'`` and returns ``'exp'`` on a match, ``'not_exp'``
    otherwise.

    NOTE: a later cell defines another ``find_expressed`` that takes a protein
    name; once that cell runs, this row-based version is rebound.
    """
    header = row['fasta_header']
    if any(tag in header for tag in exp_df['Entry']):
        print('found')
        return 'exp'
    return 'not_exp'

# Table of the over-expressed proteins: identifier, display name, accession
exp_df = pd.read_csv('../Initial files/expressed_protein_list.csv')
exp_entry = exp_df['Entry'].tolist()

# Lookups keyed by the identifier found in the FASTA headers
exp_names = {
    entry: name
    for entry, name in zip(exp_df['Entry'], exp_df['Protein name'])
}
exp_accession = {
    entry: acc
    for entry, acc in zip(exp_df['Entry'], exp_df['Accession'])
}

# Replace the generic flag with the real accession wherever a header matches
ori_df['Entry'] = ori_df.apply(expressed_protein_accession, axis=1)

['EXTRA_0117', 'EXTRA_0118', 'EXTRA_0119', 'EXTRA_0120', 'EXTRA_0121', 'EXTRA_0122', 'EXTRA_0123', 'EXTRA_0124', 'EXTRA_0125', 'EXTRA_0126', 'EXTRA_0127', 'EXTRA_0129', 'EXTRA_0130']


Unnamed: 0,fasta_header,BL11,BL12,BL21,BL22,CF11,CF12,CF21,CF22,POOL1,...,POOL4,S111,S112,S121,S122,S711,S712,S721,S722,Entry
1001,tr|A0A140NAI0|A0A140NAI0_ECOBD,3971636.0,4043056.0,3957685.0,3803254.0,4245724.0,4525364.0,4670624.0,4373359.0,5170285.0,...,5167611.0,3441049.0,3612092.0,3553447.0,3438511.0,8250028.0,8614739.0,10163020.0,10441740.0,A0A140NAI0


### Join the protein information with the normalized intensity data

In [12]:
ref_df = pd.read_csv(
    '../Initial files/proteome_UP000002032_info.csv',
    index_col=False,
)

# Join the information from the proteome with the mass spectrometry data;
# both tables are keyed on 'Entry' and every intensity row is kept.
combined_df = (
    ori_df
    .set_index('Entry')
    .join(ref_df.set_index('Entry'))
)


### Identify the proteins that were intentionally over-expressed

In [13]:
def expressed_protein_name(row):
    """Return the protein name to use for ``row``.

    The row's ``'fasta_header'`` is searched for each identifier in
    ``exp_df['Entry']`` (in order). On the first identifier found as a
    substring, the matching value from ``exp_names`` is returned; otherwise
    the row's existing ``'Protein_names'`` value is returned unchanged.
    """
    header = row['fasta_header']
    hit = next((tag for tag in exp_df['Entry'] if tag in header), None)
    if hit is None:
        return row['Protein_names']
    return exp_names[hit]

def find_expressed(name):
    """Return ``'exp'`` if ``name`` is listed in ``exp_prots``, else ``'not_exp'``."""
    return 'exp' if name in exp_prots else 'not_exp'
    
# Names of the over-expressed proteins, used for the membership test below
exp_prots = exp_df['Protein name'].tolist()

# Overwrite names for rows matching an expressed-protein identifier, then
# flag each row by whether its (possibly overwritten) name is in the list.
combined_df['Protein_names'] = combined_df.apply(expressed_protein_name, axis=1)
combined_df['expressed'] = combined_df['Protein_names'].map(find_expressed)



0     EXTRA_0117
1     EXTRA_0118
2     EXTRA_0119
3     EXTRA_0120
4     EXTRA_0121
5     EXTRA_0122
6     EXTRA_0123
7     EXTRA_0124
8     EXTRA_0125
9     EXTRA_0126
10    EXTRA_0127
11    EXTRA_0129
12    EXTRA_0130
Name: Entry, dtype: object
EXTRA_0117 Peptide chain release factor 3 (RF-3)
EXTRA_0118 Elongation factor Tu (EF-Tu)
EXTRA_0119 Elongation factor Ts (EF-Ts)
EXTRA_0121 Elongation factor 4 (EF-4) (EC 3.6.5.n1) (Ribosomal back-translocase LepA)
EXTRA_0125 Translation initiation factor IF-3
EXTRA_0126 Elongation factor G (EF-G)
EXTRA_0127 Translation initiation factor IF-1
EXTRA_0130 Cyan Fluorescent Protein
EXTRA_0123 Alanine--tRNA ligase (EC 6.1.1.7) (Alanyl-tRNA synthetase) (AlaRS)
Peptide chain release factor 3 (RF-3)
Elongation factor Tu (EF-Tu)
Elongation factor Ts (EF-Ts)
Elongation factor 4 (EF-4) (EC 3.6.5.n1) (Ribosomal back-translocase LepA)
Translation initiation factor IF-3
Elongation factor G (EF-G)
Translation initiation factor IF-1
Cyan Fluorescent Protein


Unnamed: 0_level_0,fasta_header,BL11,BL12,BL21,BL22,CF11,CF12,CF21,CF22,POOL1,...,Gene_ontology_cellular_component,Gene_ontology_GO,Gene_ontology_molecular_function,Gene_ontology_IDs,EC_number,Intramembrane,Subcellular_location_CC,Topological_domain,Transmembrane,expressed
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A140N3T4,tr|A0A140N3T4|A0A140N3T4_ECOBD_family,99133330.0,118522100.0,117842700.0,88691730.0,81725020.0,91638730.0,108906500.0,78565210.0,243297800.0,...,cytoplasm [GO:0005737],cytoplasm [GO:0005737]; GTP binding [GO:000552...,GTPase activity [GO:0003924]; GTP binding [GO:...,GO:0003743; GO:0003924; GO:0005525; GO:0005737,,,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000256|H...,,,exp


### Calculate comparisons between replicates and samples
##### These calculations will be used to generate the volcano plots in the R Jupyter Notebook titled "Extract comparison volcano plots"

In [14]:
def mean_calc(row, extracts=None):
    """Return the arithmetic mean of selected entries of ``row``.

    Parameters
    ----------
    row : mapping-like
        Object indexable by key that also provides ``keys()`` (for example a
        ``dict`` or a pandas ``Series``).
    extracts : iterable, optional
        Keys of the values to average. When omitted, every key of ``row`` is
        used (previously the ``None`` default raised ``TypeError``).

    Returns
    -------
    numpy floating scalar
        ``np.mean`` of the selected values.
    """
    if extracts is None:
        # keys() rather than iterating ``row``: iterating a Series yields
        # its values, not its labels.
        extracts = row.keys()
    return np.mean([row[ex] for ex in extracts])

def std_calc(row, extracts=None, ddof=0):
    """Return the standard deviation of selected entries of ``row``.

    Parameters
    ----------
    row : mapping-like
        Object indexable by key that also provides ``keys()`` (for example a
        ``dict`` or a pandas ``Series``).
    extracts : iterable, optional
        Keys of the values to use. When omitted, every key of ``row`` is used
        (previously the ``None`` default raised ``TypeError``).
    ddof : int, optional
        Delta degrees of freedom forwarded to ``np.std``. The default ``0``
        keeps the original behavior (population standard deviation); pass
        ``1`` for the sample standard deviation.

    Returns
    -------
    numpy floating scalar
        ``np.std`` of the selected values.
    """
    if extracts is None:
        # keys() rather than iterating ``row``: iterating a Series yields
        # its values, not its labels.
        extracts = row.keys()
    return np.std([row[ex] for ex in extracts], ddof=ddof)

def ttest_ind(row, extracts=None):
    '''
    Return the p-value of an independent two-sample t-test between two
    extracts of one row.

    ``extracts`` holds two column prefixes; each is combined with every
    suffix in the module-level ``replicates`` list to pick that extract's
    values from ``row``. ``scipy.stats.ttest_ind`` is called with its default
    settings, and the null hypothesis is that the two sample means are equal.
    '''
    samples = []
    for position in (0, 1):
        samples.append([row[extracts[position] + rep] for rep in replicates])

    outcome = stats.ttest_ind(samples[0], samples[1])
    # Element 1 of the result is the p-value (element 0 is the statistic)
    return outcome[1]
    
extracts = ['BL', 'CF', 'S1', 'S7']
replicates = ['11', '12', '21', '22']

# Per-extract summary statistics over that extract's replicate columns
for ex in extracts:
    ex_names = [ex + name for name in replicates]
    for suffix, stat_func in (('_mean', mean_calc), ('_std', std_calc)):
        combined_df[ex + suffix] = combined_df.apply(
            stat_func, axis=1, extracts=ex_names
        )

# Each pair is [numerator extract, denominator extract]
pairs = [['CF', 'BL'], ['S1', 'BL'], ['S7', 'BL']]

for numerator, denominator in pairs:
    label = numerator + '_' + denominator
    # Fold change: ratio of the two extract means
    combined_df[label + '_fc'] = (
        combined_df[numerator + '_mean'] / combined_df[denominator + '_mean']
    )
    # Row-wise p-value comparing the replicates of the two extracts
    combined_df[label + '_p_val'] = combined_df.apply(
        ttest_ind, axis=1, extracts=[numerator, denominator]
    )
combined_df

Unnamed: 0_level_0,fasta_header,BL11,BL12,BL21,BL22,CF11,CF12,CF21,CF22,POOL1,...,S1_mean,S1_std,S7_mean,S7_std,CF_BL_fc,CF_BL_p_val,S1_BL_fc,S1_BL_p_val,S7_BL_fc,S7_BL_p_val
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A140NC65,EXTRA_0117 (+1),1.440225e+07,1.976366e+07,1.777267e+07,1.323484e+07,1.061200e+07,1.441863e+07,1.598302e+07,1.290840e+07,3.680239e+07,...,1.337396e+07,1.756471e+06,1.145000e+08,9.725529e+06,0.827362,0.187226,0.820823,1.587303e-01,7.027404,2.749047e-06
A0A140N6W0,EXTRA_0118,4.354697e+08,4.574848e+08,4.548247e+08,4.164842e+08,3.489221e+08,4.174575e+08,3.975685e+08,3.708042e+08,8.357021e+08,...,1.574719e+09,9.956742e+07,1.063068e+09,3.856144e+07,0.869911,0.018059,3.570258,1.194812e-06,2.410225,2.300600e-07
A0A140NFD7,EXTRA_0119,1.539789e+08,2.000820e+08,1.979865e+08,1.596371e+08,1.080412e+08,2.495407e+08,1.974197e+08,1.836982e+08,8.359798e+08,...,1.937665e+09,8.084256e+07,1.224881e+09,7.573781e+07,1.037960,0.838248,10.890588,2.836386e-08,6.884404,4.364187e-07
A0A140N6C6,EXTRA_0121,1.500844e+07,1.761701e+07,1.691416e+07,1.469569e+07,1.207333e+07,1.455480e+07,1.614900e+07,1.252156e+07,3.134059e+07,...,1.273041e+07,1.397976e+06,8.996040e+07,6.030578e+06,0.860877,0.108104,0.792736,2.143383e-02,5.601930,8.056661e-07
A0A140N9R4,EXTRA_0125,3.332058e+07,3.612639e+07,3.704711e+07,3.125256e+07,5.372157e+07,5.407940e+07,5.730539e+07,5.168573e+07,6.714455e+07,...,3.272705e+07,1.644554e+06,1.528810e+08,1.339503e+07,1.573847,0.000030,0.950355,3.345745e-01,4.439484,5.327811e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A0A140SSC5,tr|A0A140SSC5|A0A140SSC5_ECOBD,2.947686e+06,1.996774e+06,2.225339e+06,2.449551e+06,2.298617e+06,2.015096e+06,1.811964e+06,1.979171e+06,1.999134e+06,...,1.697705e+06,5.128378e+04,1.725001e+06,1.161177e+05,0.842557,0.146237,0.705954,1.373626e-02,0.717304,1.913908e-02
C6EGE8,tr|C6EGE8|C6EGE8_ECOBD,3.293396e+07,2.955156e+07,2.838886e+07,2.649919e+07,3.626575e+07,3.445293e+07,3.259728e+07,3.229829e+07,3.380977e+07,...,3.830028e+07,3.227930e+06,3.499518e+07,3.191639e+06,1.155407,0.031656,1.305244,8.074961e-03,1.192609,4.826855e-02
A0A140N3T4,tr|A0A140N3T4|A0A140N3T4_ECOBD_family,9.913333e+07,1.185221e+08,1.178427e+08,8.869173e+07,8.172502e+07,9.163873e+07,1.089065e+08,7.856521e+07,2.432978e+08,...,7.893634e+07,1.111348e+07,7.511105e+08,3.721860e+07,0.850646,0.164757,0.744349,3.181924e-02,7.082776,1.257925e-07
A0A140N6W9,EXTRA_0123_family,9.621162e+07,1.097802e+08,1.083865e+08,9.617083e+07,8.786867e+07,9.855935e+07,1.051171e+08,8.943943e+07,1.581136e+08,...,8.367517e+07,2.934767e+06,3.760150e+08,1.421559e+07,0.927988,0.228000,0.815251,3.592961e-03,3.663532,8.540082e-08


In [15]:
# Write the annotated table (with its 'Entry' index) to the intermediate data folder
combined_df.to_csv('../Intermediate data/labeled_IRS_proteins_exp.csv')