In [1]:
from subpred.util import load_df
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset
from subpred.go_annotations import get_go_annotations_subset, EVIDENCE_CODE_TO_DESCRIPTION


columns

accession, sequence, reviewed, protein existence, organism id, protein name, go transport annotations, go membrane annotations (descendants of membrane or plasma membrane?), evidence codes, qualifiers, chebi ids, chebi_relations

One dataset with IEA annotations, one without.

Map evidence codes to simpler words.

In [2]:
# Dataset filters; each value is forwarded to get_transmembrane_transporter_dataset
# in the loading cell.

# Organism IDs to keep (passed as organism_ids). 559292 is the value seen in the
# organism_id column of the loaded sequences.
# NOTE(review): presumably the NCBI taxonomy ID of S. cerevisiae S288C — confirm.
ORGANISM_IDS = {559292}
# Passed as swissprot_only.
# NOTE(review): presumably False also admits unreviewed (TrEMBL) entries — confirm.
SWISSPROT_ONLY = False
# Passed as max_sequence_evidence_code.
# NOTE(review): presumably an upper bound on the protein_existence level — confirm.
MAX_SEQUENCE_EVIDENCE_CODE = 2
# Passed as exclude_iea_go_terms; also decides whether the membrane-annotation
# query removes annotations with evidence code "IEA".
EXCLUDE_IEA_GO_TERMS = True

In [3]:
# Load the three tables for the configured filters: protein sequences,
# GO annotations (one row per annotation/ancestor pair) and GO-to-ChEBI links.
(
    df_sequences,
    df_uniprot_goa,
    df_go_chebi,
) = get_transmembrane_transporter_dataset(
    datasets_path="../data/datasets/",
    organism_ids=ORGANISM_IDS,
    swissprot_only=SWISSPROT_ONLY,
    max_sequence_evidence_code=MAX_SEQUENCE_EVIDENCE_CODE,
    exclude_iea_go_terms=EXCLUDE_IEA_GO_TERMS,
)
# Render all three frames, in the same order as they were returned.
display(df_sequences, df_uniprot_goa, df_go_chebi)

Unnamed: 0_level_0,sequence,reviewed,protein_existence,organism_id,protein_names
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P00830,MVLPRLYTATSRAAFKAAKQSAPLLSTSWKRCMASAAQSTPITGKV...,True,1,559292,"ATP synthase subunit beta, mitochondrial (EC 7..."
P04817,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...,True,1,559292,Arginine permease CAN1 (Canavanine resistance ...
P14906,MPTNYEYDEASETWPSFILTGLLMVVGPMTLLQIYQIFFGANAEDG...,True,1,559292,Protein translocation protein SEC63 (Protein N...
P17064,MLEEGNNVYEIQDLEKRSPVIGSSLENEKKVAASETFTATSEDDQQ...,True,1,559292,Purine-cytosine permease FCY2 (PCP FCY2) (Cyto...
P23900,MSNPQKALNDFLSSESVHTHDSSRKQSNKQSSDEGRSSSQPSHHHS...,True,1,559292,Glycerol uptake/efflux facilitator protein
...,...,...,...,...,...
P53134,MPQSTPSQEVQRVPWDNKPALKQITLRATIAGIAIGSLVLTSNFQF...,True,1,559292,Putative oligopeptide transporter YGL114W
P39542,MFQQLSASIRHNAHIIFLCISWYFISSLASQVTKQVLTVCPLPLFL...,True,1,559292,Uncharacterized transporter YJL193W
Q05497,MAGILSKTLSEVHPSLRTNGMGIGNTHRRISLGFLPPNKKNPLVRK...,True,1,559292,Uncharacterized transporter YDR338C
P38318,MEPKRKSGSLAKHDLPQFYLLIMLYLAQGIPVGLAFGTVPFLLKSL...,True,1,559292,Uncharacterized membrane protein YBR220C


Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015605,organophosphate ester transmembrane transporte...
1,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:1901702,salt transmembrane transporter activity
2,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0008514,organic anion transmembrane transporter activity
3,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0022857,transmembrane transporter activity
4,D6W196,enables,GO:0005347,ATP transmembrane transporter activity,IBA,F,GO:0015932,nucleobase-containing compound transmembrane t...
...,...,...,...,...,...,...,...,...
5823,Q99385,enables,GO:0015386,potassium:proton antiporter activity,IMP,F,GO:0015078,proton transmembrane transporter activity
5824,Q99385,enables,GO:0015386,potassium:proton antiporter activity,IMP,F,GO:0140828,metal cation:monoatomic cation antiporter acti...
5825,Q99385,enables,GO:0015386,potassium:proton antiporter activity,IMP,F,GO:0022804,active transmembrane transporter activity
5826,Q99385,enables,GO:0015386,potassium:proton antiporter activity,IMP,F,GO:0015297,antiporter activity


Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,L-ornithine transmembrane transporter activity,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,S-adenosyl-L-methionine transmembrane transpor...,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,sulfur amino acid transmembrane transporter ac...,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000100,S-methylmethionine transmembrane transporter a...,CHEBI:58252,S-methyl-L-methionine zwitterion,has_primary_input
4,GO:0000102,L-methionine secondary active transmembrane tr...,CHEBI:57844,L-methionine zwitterion,has_primary_input
...,...,...,...,...,...
353,GO:1901682,sulfur compound transmembrane transporter acti...,CHEBI:26835,sulfur molecular entity,has_primary_input
354,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input
355,GO:1903089,5-amino-1-ribofuranosylimidazole-4-carboxamide...,CHEBI:28498,acadesine,has_primary_input
356,GO:1903425,fluoride transmembrane transporter activity,CHEBI:17051,fluoride,has_primary_input


In [4]:
# Query GO annotations for the loaded proteins with root term "membrane",
# following only "is_a" relations, keeping the cellular_component namespace
# and the "located_in" qualifier.
df_uniprot_goa_membrane = get_go_annotations_subset(
    proteins_subset=set(df_sequences.index),
    root_go_term="membrane",
    namespaces_keep={"cellular_component"},
    inner_go_relations={"is_a"},
    go_protein_qualifiers_filter_set={"located_in"},
    # Remove IEA-coded annotations only when the notebook-level switch is on.
    annotations_evidence_codes_remove=(
        None if not EXCLUDE_IEA_GO_TERMS else {"IEA"}
    ),
    datasets_path="../data/datasets/",
)

In [21]:
# Accessions that have "plasma membrane" as an ancestor term among their
# membrane annotations.
pm_proteins = df_uniprot_goa_membrane.loc[
    lambda goa: goa["go_term_ancestor"] == "plasma membrane", "Uniprot"
].unique()
# Keep only the sequence rows whose index value is one of those accessions.
df_sequences_pm = df_sequences.loc[df_sequences.index.isin(pm_proteins)]

df_sequences_pm

Unnamed: 0_level_0,sequence,reviewed,protein_existence,organism_id,protein_names
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P04817,MTNSKEDADIEEKHMYNEPVTTLFHDVEASQTHHRRGSIPLKDEKS...,True,1,559292,Arginine permease CAN1 (Canavanine resistance ...
P17064,MLEEGNNVYEIQDLEKRSPVIGSSLENEKKVAASETFTATSEDDQQ...,True,1,559292,Purine-cytosine permease FCY2 (PCP FCY2) (Cyto...
P23900,MSNPQKALNDFLSSESVHTHDSSRKQSNKQSSDEGRSSSQPSHHHS...,True,1,559292,Glycerol uptake/efflux facilitator protein
P30605,MGIHIPYLTSKTSQSNVGDAVGNADSVEFNSEHDSPSKRGKITLES...,True,1,559292,Myo-inositol transporter 1
P32465,MNSTPDLISPQKSNSSNSYELESGRSKAMNTPEGKNESFHDNLSES...,True,1,559292,Low-affinity glucose transporter HXT1
...,...,...,...,...,...
P33335,MKSTLSLTLCVISLLLTLFLAALDIVIVVTLYDTIGIKFHDFGNIG...,True,1,559292,Protein SGE1 (10-N-nonyl acridine orange resis...
P53241,MTISNKSWRSYFPHLRKLPEDDQYLYSDDTNSSIIAEEELHHSVDK...,True,1,559292,Vitamin H transporter (H(+)/biotin symporter)
Q06686,MNMGGSSSTAAKKATCKISMLWNWYTIDTCFIARSWRNDTKGKFAG...,True,1,559292,Copper transport protein CTR3 (Copper transpor...
Q12412,MPLNEKYERPPQPPPAYDPNHRPPSSSENSAAANVNDGQTPYHFRQ...,True,1,559292,Protein PNS1 (pH nine-sensitive protein 1)


In [9]:
# Restrict the GO annotation table to the plasma-membrane accessions.
df_uniprot_goa_pm = df_uniprot_goa.loc[
    lambda goa: goa["Uniprot"].isin(pm_proteins)
]
df_uniprot_goa_pm

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
69,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0015171,amino acid transmembrane transporter activity
70,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0008514,organic anion transmembrane transporter activity
71,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0022857,transmembrane transporter activity
72,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0005342,organic acid transmembrane transporter activity
73,P04817,enables,GO:0015171,amino acid transmembrane transporter activity,IBA,F,GO:0046943,carboxylic acid transmembrane transporter acti...
...,...,...,...,...,...,...,...,...
5730,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IDA,F,GO:0015078,proton transmembrane transporter activity
5731,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IDA,F,GO:0140828,metal cation:monoatomic cation antiporter acti...
5732,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IDA,F,GO:0022804,active transmembrane transporter activity
5733,Q99271,enables,GO:0015385,sodium:proton antiporter activity,IDA,F,GO:0015297,antiporter activity


In [19]:
def _chebi_terms_by_go_id(go_chebi: pd.DataFrame, relation: str) -> pd.Series:
    """Collect the ChEBI term names linked to each GO term through one relation.

    Parameters
    ----------
    go_chebi : pd.DataFrame
        Table with at least the columns ``go_id``, ``chebi_term`` and
        ``chebi_go_relation``.
    relation : str
        Value of ``chebi_go_relation`` to keep, e.g. ``"has_primary_input"``.

    Returns
    -------
    pd.Series
        Unnamed Series indexed by ``go_id``; each value is the list of
        ``chebi_term`` values for that GO term, in original row order.
        An empty Series when no row has the requested relation.
    """
    matching = go_chebi.loc[go_chebi["chebi_go_relation"] == relation]
    # Aggregate the single needed column instead of calling groupby.apply on
    # the whole frame: this avoids the pandas >= 2.2 deprecation warning about
    # operating on grouping columns and yields a Series even for empty input.
    terms_by_go_id = matching.groupby("go_id")["chebi_term"].agg(list)
    # Clear the name so the result stays an unnamed Series, as it was before.
    return terms_by_go_id.rename(None)


go_to_primary_inputs = _chebi_terms_by_go_id(df_go_chebi, "has_primary_input")
go_to_participants = _chebi_terms_by_go_id(df_go_chebi, "has_participant")

go_id
GO:0005313                                    [L-glutamate(1-)]
GO:0005354                                  [D-galactopyranose]
GO:0005363                                            [maltose]
GO:0005366                               [hydron, myo-inositol]
GO:0005375                                         [copper(2+)]
GO:0005384                                      [manganese(2+)]
GO:0005385                                           [zinc(2+)]
GO:0005388    [water, hydron, calcium(2+), ATP(4-), hydrogen...
GO:0008121    [1,4-benzoquinones, hydron, hydroquinones, iro...
GO:0008137    [hydron, ubiquinones, ubiquinol, NAD(1-), NADH...
GO:0008519                                           [ammonium]
GO:0008551    [water, hydron, ATP(4-), hydrogenphosphate, AD...
GO:0008553    [water, hydron, ATP(4-), hydrogenphosphate, AD...
GO:0008554    [water, hydron, sodium(1+), ATP(4-), hydrogenp...
GO:0008556    [water, hydron, potassium(1+), ATP(4-), hydrog...
GO:0009678    [water, hydron, diph