# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [1]:
from subpred.util import load_df
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

# ORGANISM_IDS = {559292}
ORGANISM_IDS = None
SWISSPROT_ONLY = False
MAX_SEQUENCE_EVIDENCE_CODE = 2
EXCLUDE_IEA_GO_TERMS = False
REMOVE_PROTEINS_WITHOUT_GENE_NAMES = False

df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids=ORGANISM_IDS,
    swissprot_only=SWISSPROT_ONLY,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=EXCLUDE_IEA_GO_TERMS,
    max_sequence_evidence_code=MAX_SEQUENCE_EVIDENCE_CODE,
    remove_proteins_without_gene_names=REMOVE_PROTEINS_WITHOUT_GENE_NAMES,
)

In [2]:
df_uniprot_goa

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,A0A014M993,enables,GO:0015288,porin activity,IEA,F,GO:0022803,passive transmembrane transporter activity
1,A0A014M993,enables,GO:0015288,porin activity,IEA,F,GO:0015288,porin activity
2,A0A014M993,enables,GO:0015288,porin activity,IEA,F,GO:0022857,transmembrane transporter activity
3,A0A014M993,enables,GO:0015288,porin activity,IEA,F,GO:0022829,wide pore channel activity
4,A0A014M993,enables,GO:0015288,porin activity,IEA,F,GO:0015267,channel activity
...,...,...,...,...,...,...,...,...
1005409,Z4YKJ7,enables,GO:0015501,glutamate:sodium symporter activity,IEA,F,GO:0015291,secondary active transmembrane transporter act...
1005410,Z4YKJ7,enables,GO:0015501,glutamate:sodium symporter activity,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...
1005411,Z4YKJ7,enables,GO:0015501,glutamate:sodium symporter activity,IEA,F,GO:0005342,organic acid transmembrane transporter activity
1005412,Z4YKJ7,enables,GO:0015501,glutamate:sodium symporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity


In [3]:
go_chebi = load_df("go_chebi")

go_chebi = go_chebi[go_chebi.go_id.isin(df_uniprot_goa.go_id_ancestor)]
go_chebi = go_chebi[go_chebi.relation == "has_primary_input"]
go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,relation
50,GO:0015078,CHEBI:24636,proton,has_primary_input
54,GO:0042958,CHEBI:25140,maltodextrin,has_primary_input
175,GO:0015075,CHEBI:24867,monoatomic ion,has_primary_input
177,GO:0015085,CHEBI:29108,calcium(2+),has_primary_input
321,GO:0005249,CHEBI:29103,potassium(1+),has_primary_input
...,...,...,...,...
21537,GO:0009674,CHEBI:29103,potassium(1+),has_primary_input
21543,GO:0010542,CHEBI:17632,nitrate,has_primary_input
21624,GO:1905131,CHEBI:192797,carcininium,has_primary_input
21698,GO:0015445,CHEBI:60253,silver cation,has_primary_input


In [4]:
graph_chebi = load_df("chebi_obo")

In [5]:
list(graph_chebi.nodes())

['CHEBI:24431',
 'CHEBI:23367',
 'CHEBI:24870',
 'CHEBI:24867',
 'CHEBI:23905',
 'CHEBI:33429',
 'CHEBI:30151',
 'CHEBI:16042',
 'CHEBI:17051',
 'CHEBI:24060',
 'CHEBI:28741',
 'CHEBI:32129',
 'CHEBI:30340',
 'CHEBI:51990',
 'CHEBI:49499',
 'CHEBI:66871',
 'CHEBI:66872',
 'CHEBI:190416',
 'CHEBI:135933',
 'CHEBI:193146',
 'CHEBI:190417',
 'CHEBI:17996',
 'CHEBI:23114',
 'CHEBI:36093',
 'CHEBI:6636',
 'CHEBI:86345',
 'CHEBI:86355',
 'CHEBI:131394',
 'CHEBI:31206',
 'CHEBI:3312',
 'CHEBI:86158',
 'CHEBI:91243',
 'CHEBI:26710',
 'CHEBI:46715',
 'CHEBI:30341',
 'CHEBI:32588',
 'CHEBI:35696',
 'CHEBI:53503',
 'CHEBI:36383',
 'CHEBI:36385',
 'CHEBI:37117',
 'CHEBI:32213',
 'CHEBI:48607',
 'CHEBI:49976',
 'CHEBI:87627',
 'CHEBI:49553',
 'CHEBI:86318',
 'CHEBI:91245',
 'CHEBI:53472',
 'CHEBI:62843',
 'CHEBI:63020',
 'CHEBI:63039',
 'CHEBI:63041',
 'CHEBI:86368',
 'CHEBI:131395',
 'CHEBI:63317',
 'CHEBI:86153',
 'CHEBI:67096',
 'CHEBI:74856',
 'CHEBI:74895',
 'CHEBI:77566',
 'CHEBI:78067',
 'CH

In [6]:
list(graph_chebi.edges(keys=True))

[('CHEBI:23367', 'CHEBI:24431', 'is_a'),
 ('CHEBI:24870', 'CHEBI:23367', 'is_a'),
 ('CHEBI:24867', 'CHEBI:24870', 'is_a'),
 ('CHEBI:24867', 'CHEBI:33238', 'is_a'),
 ('CHEBI:23905', 'CHEBI:24867', 'is_a'),
 ('CHEBI:23905', 'CHEBI:22563', 'is_a'),
 ('CHEBI:33429', 'CHEBI:23905', 'is_a'),
 ('CHEBI:33429', 'CHEBI:36830', 'is_a'),
 ('CHEBI:30151', 'CHEBI:33429', 'is_a'),
 ('CHEBI:30151', 'CHEBI:33627', 'is_a'),
 ('CHEBI:16042', 'CHEBI:33429', 'is_a'),
 ('CHEBI:16042', 'CHEBI:79389', 'is_a'),
 ('CHEBI:17051', 'CHEBI:16042', 'is_a'),
 ('CHEBI:17051', 'CHEBI:36895', 'is_a'),
 ('CHEBI:17051', 'CHEBI:29228', 'is_conjugate_base_of'),
 ('CHEBI:24060', 'CHEBI:33958', 'is_a'),
 ('CHEBI:24060', 'CHEBI:24062', 'is_a'),
 ('CHEBI:24060', 'CHEBI:17051', 'has_part'),
 ('CHEBI:28741', 'CHEBI:24060', 'is_a'),
 ('CHEBI:28741', 'CHEBI:25435', 'has_role'),
 ('CHEBI:32129', 'CHEBI:24060', 'is_a'),
 ('CHEBI:32129', 'CHEBI:33049', 'has_part'),
 ('CHEBI:30340', 'CHEBI:33968', 'is_a'),
 ('CHEBI:30340', 'CHEBI:24060

In [7]:
graph_chebi = graph_chebi.edge_subgraph(
    edges=[
        (source, sink, key)
        for source, sink, key in graph_chebi.edges(keys=True)
        if key == "is_a"
    ]
)

In [8]:
go_chebi["chebi_id_ancestor"] = go_chebi.chebi_id.transform(
    lambda x: set(nx.descendants(graph_chebi, x) | {x})
)
go_chebi = go_chebi.explode("chebi_id_ancestor")
chebi_id_to_term = {k: v for k, v in graph_chebi.nodes(data="name")}
go_chebi["chebi_term_ancestor"] = go_chebi.chebi_id_ancestor.map(chebi_id_to_term)
go_chebi = go_chebi.reset_index(drop=True)

In [9]:
chebi_id_to_properties = {
    chebi_id: properties_list
    for chebi_id, properties_list in graph_chebi.nodes(data="property_value")
    if properties_list
}

In [10]:
go_chebi = go_chebi[go_chebi.chebi_id_ancestor.isin(chebi_id_to_properties.keys())].reset_index(drop=True)

go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,relation,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015078,CHEBI:24636,proton,has_primary_input,CHEBI:33251,monoatomic hydrogen
1,GO:0015078,CHEBI:24636,proton,has_primary_input,CHEBI:15378,hydron
2,GO:0015078,CHEBI:24636,proton,has_primary_input,CHEBI:25414,monoatomic monocation
3,GO:0015078,CHEBI:24636,proton,has_primary_input,CHEBI:24636,proton
4,GO:0042958,CHEBI:25140,maltodextrin,has_primary_input,CHEBI:37163,glucan
...,...,...,...,...,...,...
1290,GO:0009674,CHEBI:29103,potassium(1+),has_primary_input,CHEBI:25414,monoatomic monocation
1291,GO:0010542,CHEBI:17632,nitrate,has_primary_input,CHEBI:17632,nitrate
1292,GO:1905131,CHEBI:192797,carcininium,has_primary_input,CHEBI:65296,primary ammonium ion
1293,GO:1905131,CHEBI:192797,carcininium,has_primary_input,CHEBI:192797,carcininium


In [11]:
graph_go = load_df("go_obo")
go_term_to_name = {k:v for k, v in graph_go.nodes(data="name")}

In [12]:
go_chebi.insert(column= "go_term",value=go_chebi.go_id.map(go_term_to_name), loc=1)

In [13]:
go_chebi.drop_duplicates()

Unnamed: 0,go_id,go_term,chebi_id,chebi_term,relation,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:33251,monoatomic hydrogen
1,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:15378,hydron
2,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:25414,monoatomic monocation
3,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:24636,proton
4,GO:0042958,maltodextrin transmembrane transporter activity,CHEBI:25140,maltodextrin,has_primary_input,CHEBI:37163,glucan
...,...,...,...,...,...,...,...
1290,GO:0009674,potassium:sodium symporter activity,CHEBI:29103,potassium(1+),has_primary_input,CHEBI:25414,monoatomic monocation
1291,GO:0010542,nitrate efflux transmembrane transporter activity,CHEBI:17632,nitrate,has_primary_input,CHEBI:17632,nitrate
1292,GO:1905131,carcinine transmembrane transporter activity,CHEBI:192797,carcininium,has_primary_input,CHEBI:65296,primary ammonium ion
1293,GO:1905131,carcinine transmembrane transporter activity,CHEBI:192797,carcininium,has_primary_input,CHEBI:192797,carcininium


In [14]:
# TODO filter chebi ancestors for smiles or formula or mass presence

In [15]:
df_go_chebi_primary = df_go_chebi[df_go_chebi.chebi_go_relation == "has_primary_input"].reset_index(drop=True)
df_go_chebi_primary = df_go_chebi_primary[df_go_chebi_primary.chebi_id.isin(chebi_id_to_properties.keys())]
df_go_chebi_primary

Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,L-ornithine transmembrane transporter activity,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,S-adenosyl-L-methionine transmembrane transpor...,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
3,GO:0000100,S-methylmethionine transmembrane transporter a...,CHEBI:58252,S-methyl-L-methionine zwitterion,has_primary_input
4,GO:0000102,L-methionine secondary active transmembrane tr...,CHEBI:57844,L-methionine zwitterion,has_primary_input
6,GO:0000297,spermine transmembrane transporter activity,CHEBI:45725,spermine(4+),has_primary_input
...,...,...,...,...,...
679,GO:1903089,5-amino-1-ribofuranosylimidazole-4-carboxamide...,CHEBI:28498,acadesine,has_primary_input
680,GO:1903425,fluoride transmembrane transporter activity,CHEBI:17051,fluoride,has_primary_input
681,GO:1904680,peptide transmembrane transporter activity,CHEBI:16670,peptide,has_primary_input
682,GO:1905131,carcinine transmembrane transporter activity,CHEBI:192797,carcininium,has_primary_input


In [1]:
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids={9606},
    swissprot_only=False,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=False,
    max_sequence_evidence_code=1,
)
df_uniprot_goa

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity


In [5]:
df_go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000295,CHEBI:61293,adenyl nucleotide,has_primary_input
4,GO:0000514,CHEBI:14321,glutamate(1-),has_primary_input
...,...,...,...,...
693,GO:1901480,CHEBI:30823,oleate,has_primary_input
694,GO:1901505,CHEBI:63299,carbohydrate derivative,has_primary_input
695,GO:1901682,CHEBI:26835,sulfur molecular entity,has_primary_input
696,GO:1902282,CHEBI:29103,potassium(1+),has_primary_input


## Merge

In [3]:
import pandas as pd

df_uniprot_go_transporter = pd.merge(
    df_uniprot_goa, df_sequences, left_on="Uniprot", right_index=True, how="inner"
).reset_index(drop=True)
df_uniprot_go_transporter

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,sequence,reviewed,protein_existence,organism_id,protein_names
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MQGARAPRDQGQSPGRMSALGRSSVILLTYVLAATELTCLFMQFSI...,False,1,9606,Solute carrier family 22 (Organic cation trans...
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)


In [4]:
df_uniprot_goa_chebi = df_uniprot_goa.merge(
    df_go_chebi, how="left", left_on="go_id_ancestor", right_on="go_id"
)
df_uniprot_goa_chebi

Unnamed: 0,Uniprot,qualifier,go_id_x,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,go_id_y,chebi_id,chebi_term,chebi_go_relation
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,,,,
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,,,,
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
75020,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
75021,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,,,,
75022,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,GO:0042887,CHEBI:32988,amide,has_primary_input
75023,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,GO:1904680,CHEBI:16670,peptide,has_primary_input
