In [6]:
import numpy as np
import pandas as pd

import mygene
# MyGene.info client; used further down (mg.querymany) to look up gene symbols for UniProt IDs.
mg = mygene.MyGeneInfo()

from gprofiler import GProfiler

# Clean up proteomics gene names

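It's important that we use names that g:Profiler can recognize (official gene symbols). The names provided in the original proteomics files are sometimes not the standard symbols (for example, `SHOT1` in the file corresponds to the official symbol `SHTN1`), so we map each UniProt accession to its official symbol.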

In [29]:
# Load the proteomics matrix; the first CSV column is used as the row index.
# Column labels have the form "<uniprot accession>|<original name>".
proteomics_csv = 'proteomics_norm_regressed.csv'
proteomics = pd.read_csv(proteomics_csv, index_col=0)
proteomics

Unnamed: 0,A0FGR8|ESYT2,A0MZ66|SHOT1,A1L020|MEX3A,A1X283|SPD2B,A2RRP1|NBAS,A2RTX5|SYTC2,A3KMH1|VWA8,A4FU69|EFCB5,A5D8V6|VP37C,A6NDG6|PGP,...,Q9Y6I3|EPN1,Q9Y6K1|DNM3A,Q9Y6M1|IF2B2,Q9Y6M7|S4A7,Q9Y6M9|NDUB9,Q9Y6R0|NUMBL,Q9Y6V0|PCLO,Q9Y6V7|DDX49,Q9Y6X4|F169A,Q9Y6Y0|NS1BP
NEUAB000NKC,22.132582,24.507411,23.586154,22.036223,19.636499,21.235689,21.162085,21.994284,18.572554,19.546428,...,22.794974,20.170942,31.865858,23.810717,25.052458,20.297559,16.757836,19.578949,18.792708,20.698066
NEUAE993EPR,45.221770,49.942097,47.992867,44.671287,38.727244,43.061282,43.126658,44.802182,37.218946,40.198396,...,46.209381,40.664922,64.755164,48.430675,51.955469,41.268515,34.178908,39.763795,38.792258,41.300492
NEUAF553MJ3,90.466199,99.505449,96.181448,89.478639,77.750792,86.307861,86.327695,89.094455,75.449926,79.909911,...,92.383687,81.819790,129.313710,96.408258,103.963852,82.451987,68.535644,80.029897,77.127294,83.259135
NEUAG241NUD,10.385513,11.541707,11.311134,10.565407,8.960294,10.202519,10.252189,9.812732,9.020608,9.151642,...,10.660789,9.570901,15.009836,10.965950,11.991511,9.742833,7.760601,9.464367,9.042568,9.742454
NEUAG603XLK,78.150920,86.235523,83.210501,77.540086,67.211740,74.531838,74.848433,77.274251,65.357872,69.589707,...,80.069674,70.865702,111.982013,83.351246,90.094299,71.236734,58.940076,69.183565,66.994363,71.879154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NEUZV656DD1,76.387701,84.133331,81.412423,75.689870,65.207632,72.501190,73.060174,75.407061,63.950561,67.139115,...,77.704208,69.156955,109.433913,81.453124,88.013561,69.568112,56.986946,67.456198,65.440478,70.341272
NEUZW701NNF,32.626740,35.845411,34.654499,32.161300,27.923917,30.928368,30.995164,32.112988,27.226467,29.216991,...,33.229356,29.621711,46.459469,34.877228,37.360453,29.600791,24.806402,28.695769,27.869300,29.956008
NEUZX521TKK,147.457095,162.963788,157.155429,146.113811,126.647587,140.525819,140.904131,145.915530,123.468999,130.762071,...,150.983318,133.677478,211.518033,158.182674,170.343933,134.648573,111.470762,130.007192,126.732783,135.556106
NEUZX847VWV,94.248238,104.138992,100.329561,93.414726,81.200284,89.159079,90.075744,92.527213,78.877753,83.445202,...,96.237650,85.418751,134.746305,100.668770,108.703929,86.218236,71.387440,83.463221,80.749254,86.265946


In [31]:
# Split every "<uniprot>|<original name>" column label into its two parts
# and collect them in a lookup table (one row per proteomics column).
column_labels = list(proteomics.columns)
id_name_pairs = [label.split('|') for label in column_labels]
orig_prot_df = pd.DataFrame(data=id_name_pairs,
                            columns=['uniprot', 'original_name'])
orig_prot_df.head()

Unnamed: 0,uniprot,original_name
0,A0FGR8,ESYT2
1,A0MZ66,SHOT1
2,A1L020,MEX3A
3,A1X283,SPD2B
4,A2RRP1,NBAS


In [30]:
# Query MyGene for the gene symbol of every UniProt accession.
uniprot_ids = orig_prot_df['uniprot'].values
results = mg.querymany(
    uniprot_ids,
    scopes=['uniprot'],
    fields=["symbol"],
    species="human",
    as_dataframe=True,
    returnall=True,
)

# Unpack the pieces of the response: hits, unmatched IDs, and IDs with several hits.
out = results['out']
missing = results['missing']
duplicated = results['dup']

# The hits are indexed by the queried ID; expose it as a regular column too
# so it can be used as a merge key later.
out['uniprot'] = out.index.values

INFO:biothings.client:querying 1-1000...
INFO:biothings.client:done.
INFO:biothings.client:querying 1001-2000...
INFO:biothings.client:done.
INFO:biothings.client:querying 2001-3000...
INFO:biothings.client:done.
INFO:biothings.client:querying 3001-3496...
INFO:biothings.client:done.
INFO:biothings.client:Finished.


In [32]:
# A UniProt ID can come back with several hits; keep only the first one returned.
is_repeated_hit = out.duplicated(subset=['uniprot'], keep='first')
prot_df = out[~is_repeated_hit]
prot_df.head()

Unnamed: 0_level_0,_id,_score,symbol,notfound,uniprot
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0FGR8,57488,19.639704,ESYT2,,A0FGR8
A0MZ66,57698,20.686556,SHTN1,,A0MZ66
A1L020,92312,20.684448,MEX3A,,A1L020
A1X283,285590,19.646774,SH3PXD2B,,A1X283
A2RRP1,51594,19.641256,NBAS,,A2RRP1


In [41]:
# Merge the MyGene hits with the original UniProt/name table and add the final name column.
# Use the official symbol whenever MyGene returned one; otherwise (UniProt IDs that
# MyGene did not identify, or hits that carry no symbol) fall back to the original
# name from the proteomics team.
merged = prot_df.merge(orig_prot_df, how='outer', on='uniprot')

# Vectorized fallback on the symbol itself rather than on the 'notfound' flag:
# this does not depend on the 'notfound' column being present (it is only useful
# when at least one ID is unmatched) and never leaves a NaN final name when an
# original name is available.
merged['final_name'] = merged['symbol'].fillna(merged['original_name'])
merged

Unnamed: 0,_id,_score,symbol,notfound,uniprot,original_name,final_name
0,57488,19.639704,ESYT2,,A0FGR8,ESYT2,ESYT2
1,57698,20.686556,SHTN1,,A0MZ66,SHOT1,SHTN1
2,92312,20.684448,MEX3A,,A1L020,MEX3A,MEX3A
3,285590,19.646774,SH3PXD2B,,A1X283,SPD2B,SH3PXD2B
4,51594,19.641256,NBAS,,A2RRP1,NBAS,NBAS
...,...,...,...,...,...,...,...
3491,9253,19.685020,NUMBL,,Q9Y6R0,NUMBL,NUMBL
3492,27445,20.070452,PCLO,,Q9Y6V0,PCLO,PCLO
3493,54555,20.074020,DDX49,,Q9Y6V7,DDX49,DDX49
3494,26049,20.074020,FAM169A,,Q9Y6X4,F169A,FAM169A


In [42]:
# Sanity check: True if any protein ended up without a final name.
bool(merged['final_name'].isna().any())

False

# Run gprofiler enrichment with background

In [43]:
# Build a fake list of proteins artificially enriched for translation:
# every final name containing the substring 'RP'. The match is loose, so it
# also picks up names such as ARPC2 or SERPINE2.
rb_prots = list(filter(lambda symbol: 'RP' in symbol, merged.final_name.values))
rb_prots

['U2SURP',
 'PRORP',
 'ARPC1B',
 'ARPC2',
 'ARPC3',
 'ARPC5',
 'RRP8',
 'PRPF3',
 'PRPSAP2',
 'NRP2',
 'MRPL33',
 'PRPF40A',
 'RP2',
 'PRPF6',
 'RPN1',
 'RPN2',
 'RPLP1',
 'RPLP2',
 'RPLP0',
 'SERPINE2',
 'SRPRA',
 'SNRPB2',
 'RPS17',
 'RPSA',
 'MRPL3',
 'SNRPA',
 'SNRPC',
 'SNRPA1',
 'PARP1',
 'RPS2',
 'RPA2',
 'RPL35A',
 'RPL7',
 'RPL17',
 'CSRP1',
 'RPS3',
 'RPS12',
 'RPL13',
 'RPL10',
 'RPA1',
 'ERP29',
 'RPL12',
 'LRPAP1',
 'RPL9',
 'SERPINB6',
 'RPA3',
 'RPL22',
 'RPL4',
 'SRP14',
 'RPS19',
 'RPL3',
 'RPL13A',
 'PRPH',
 'RPS27',
 'LRPPRC',
 'RPL35',
 'RPL27A',
 'RPL5',
 'RPL21',
 'RPL28',
 'RPS9',
 'RPS5',
 'RPS10',
 'RPL29',
 'RPL34',
 'RPIA',
 'MRPL19',
 'SERPINB9',
 'SERPINH1',
 'RPL14',
 'RPS6KA3',
 'MRPL12',
 'RRP1',
 'ARPP19',
 'ARPC4',
 'RPS20',
 'PRPS1',
 'RPS3A',
 'RPL26',
 'RPL15',
 'RPL27',
 'RPL37A',
 'RPS7',
 'RPS8',
 'RPS15A',
 'RPS16',
 'RPS14',
 'RPS23',
 'RPS18',
 'RPS29',
 'RPS13',
 'RPS11',
 'SNRPE',
 'SNRPF',
 'SNRPD1',
 'SNRPD2',
 'SNRPD3',
 'RPL7A',
 'RPS4X'

In [48]:
# Enrichment of the fake list, using every protein in the dataset as background.
sourcelist = ["GO:MF", "GO:BP", "KEGG", "REAC"]
gp = GProfiler(return_dataframe=True)

background_names = merged['final_name'].tolist()
enrichment_results = gp.profile(
    query=rb_prots,
    organism='hsapiens',
    sources=sourcelist,
    background=background_names,
    no_evidences=False,
)

In [49]:
# Show the enrichment table obtained with the dataset-specific background.
enrichment_results

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
0,GO:MF,GO:0003735,structural constituent of ribosome,1.318278e-152,True,"""The action of a molecule that contributes to ...",129,212,122,3536,0.575472,0.945736,query_1,[GO:0005198],"[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[IEA], [IDA, IBA, NAS, IEA], [IDA, NAS, IEA],..."
1,KEGG,KEGG:03010,Ribosome,7.297676e-140,True,Ribosome,109,212,108,3536,0.509434,0.990826,query_1,[KEGG:00000],"[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
2,REAC,REAC:R-HSA-72766,Translation,1.042139e-124,True,Translation,229,212,138,3536,0.650943,0.602620,query_1,[REAC:R-HSA-392499],"[MRPL33, RPN1, RPN2, RPLP1, RPLP2, RPLP0, SRPR...","[[REAC], [REAC], [REAC], [REAC], [REAC], [REAC..."
3,REAC,REAC:R-HSA-2408557,Selenocysteine synthesis,3.353145e-93,True,Selenocysteine synthesis,79,212,77,3536,0.363208,0.974684,query_1,[REAC:R-HSA-2408522],"[RPLP1, RPLP2, RPLP0, RPS17, RPSA, RPS2, RPL35...","[[REAC], [REAC], [REAC], [REAC], [REAC], [REAC..."
4,REAC,REAC:R-HSA-192823,Viral mRNA Translation,8.597019e-92,True,Viral mRNA Translation,80,212,77,3536,0.363208,0.962500,query_1,[REAC:R-HSA-168273],"[RPLP1, RPLP2, RPLP0, RPS17, RPSA, RPS2, RPL35...","[[REAC], [REAC], [REAC], [REAC], [REAC], [REAC..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,GO:MF,GO:1990948,ubiquitin ligase inhibitor activity,2.171487e-03,True,"""Binds to and stops, prevents or reduces the a...",6,212,6,3536,0.028302,1.000000,query_1,[GO:0055105],"[RPL5, RPS20, RPS7, RPL23, RPS15, RPL11]","[[IDA], [IDA], [IDA], [IDA], [IDA], [IMP]]"
97,GO:BP,GO:0140694,non-membrane-bounded organelle assembly,7.028106e-03,True,"""The aggregation, arrangement and bonding toge...",123,212,24,3536,0.113208,0.195122,query_1,[GO:0070925],"[RPLP0, RPSA, CSRP1, RPS3, RPL10, RPS19, RPS27...","[[IBA], [IBA, IEA], [IBA], [IMP], [IBA], [IMP,..."
98,GO:MF,GO:0070181,small ribosomal subunit rRNA binding,1.444008e-02,True,"""Binding to small ribosomal subunit RNA (SSU r...",7,212,6,3536,0.028302,0.857143,query_1,[GO:0019843],"[RPS3, RPS14, RPS13, MRPS11, MRPS6, MRPS18A]","[[IDA], [IBA], [IBA], [IBA], [IBA], [IBA]]"
99,GO:MF,GO:0055105,ubiquitin-protein transferase inhibitor activity,1.444008e-02,True,"""Binds to and stops, prevents or reduces the a...",7,212,6,3536,0.028302,0.857143,query_1,"[GO:0004857, GO:0055106]","[RPL5, RPS20, RPS7, RPL23, RPS15, RPL11]","[[IDA], [IDA], [IDA], [IDA], [IDA], [IMP]]"


In [50]:
# Same query, but without passing our own background.
# Notice the misleading changes in significance when a wider (but less accurate)
# background is used.
enrichment_results2 = gp.profile(
    query=rb_prots,
    organism='hsapiens',
    sources=sourcelist,
    no_evidences=False,
)
enrichment_results2

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents,intersections,evidences
0,REAC,REAC:R-HSA-72766,Translation,1.788156e-182,True,Translation,292,193,138,10790,0.715026,0.472603,query_1,[REAC:R-HSA-392499],"[MRPL33, RPN1, RPN2, RPLP1, RPLP2, RPLP0, SRPR...","[[REAC], [REAC], [REAC], [REAC], [REAC], [REAC..."
1,KEGG,KEGG:03010,Ribosome,3.953646e-167,True,Ribosome,153,156,108,8161,0.692308,0.705882,query_1,[KEGG:00000],"[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[KEGG], [KEGG], [KEGG], [KEGG], [KEGG], [KEGG..."
2,GO:MF,GO:0003735,structural constituent of ribosome,2.055878e-147,True,"""The action of a molecule that contributes to ...",451,209,122,20195,0.583732,0.270510,query_1,[GO:0005198],"[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[IEA], [IDA, IBA, NAS, IEA], [IDA, NAS, IEA],..."
3,GO:BP,GO:0006412,translation,7.991113e-141,True,"""The cellular metabolic process in which a pro...",745,212,135,21110,0.636792,0.181208,query_1,"[GO:0009059, GO:0010467, GO:0019538, GO:0043043]","[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[NAS, IEA], [IBA, NAS, IC, IEA], [NAS, IEA], ..."
4,GO:BP,GO:0043043,peptide biosynthetic process,2.082917e-138,True,"""The chemical reactions and pathways resulting...",774,212,135,21110,0.636792,0.174419,query_1,"[GO:0006518, GO:0043604, GO:1901566]","[MRPL33, RPLP1, RPLP2, RPLP0, RPS17, RPSA, MRP...","[[NAS, IEA], [IBA, NAS, IC, IEA], [NAS, IEA], ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,REAC,REAC:R-HSA-3928662,EPHB-mediated forward signaling,3.215222e-02,True,EPHB-mediated forward signaling,40,193,6,10790,0.031088,0.150000,query_1,[REAC:R-HSA-2682334],"[ARPC1B, ARPC2, ARPC3, ARPC5, ARPC4, ARPC1A]","[[REAC], [REAC], [REAC], [REAC], [REAC], [REAC]]"
132,GO:BP,GO:0006015,5-phosphoribose 1-diphosphate biosynthetic pro...,3.906495e-02,True,"""The chemical reactions and pathways resulting...",6,212,3,21110,0.014151,0.500000,query_1,"[GO:0046390, GO:0046391]","[PRPSAP2, PRPS1, PRPSAP1]","[[IBA], [IBA, IEA], [IBA]]"
133,GO:BP,GO:0046391,5-phosphoribose 1-diphosphate metabolic process,3.906495e-02,True,"""The chemical reactions and pathways involving...",6,212,3,21110,0.014151,0.500000,query_1,[GO:0019693],"[PRPSAP2, PRPS1, PRPSAP1]","[[IBA], [IBA, IEA], [IBA]]"
134,GO:BP,GO:0006417,regulation of translation,3.985431e-02,True,"""Any process that modulates the frequency, rat...",413,212,15,21110,0.070755,0.036320,query_1,"[GO:0006412, GO:0010556, GO:0010608, GO:003132...","[RPS3, RPL10, RPL13A, LRPPRC, RPL5, RPS9, RPS6...","[[IDA, IEA], [IMP, IEA], [IDA, IMP, IBA, IEA],..."
