# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [1]:
from subpred.util import load_df
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

# ORGANISM_IDS = {559292}
ORGANISM_IDS = None
SWISSPROT_ONLY = False
MAX_SEQUENCE_EVIDENCE_CODE = 2
EXCLUDE_IEA_GO_TERMS = False
REMOVE_PROTEINS_WITHOUT_GENE_NAMES = False

df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids=ORGANISM_IDS,
    swissprot_only=SWISSPROT_ONLY,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=EXCLUDE_IEA_GO_TERMS,
    max_sequence_evidence_code=MAX_SEQUENCE_EVIDENCE_CODE,
    remove_proteins_without_gene_names=REMOVE_PROTEINS_WITHOUT_GENE_NAMES,
)

## Create GO ChEBI mapping with ancestors

In [10]:

# load go chebi data from quickgo
# TODO update identifiers in goa and here?

go_chebi = load_df("go_chebi")

go_chebi = go_chebi[go_chebi.go_id.isin(df_uniprot_goa.go_id_ancestor)]
go_chebi = go_chebi[go_chebi.relation == "has_primary_input"]

# load is_a chebi graph
graph_chebi = load_df("chebi_obo")
graph_chebi = graph_chebi.edge_subgraph(
    edges=[
        (source, sink, key)
        for source, sink, key in graph_chebi.edges(keys=True)
        if key == "is_a"
    ]
)

# add ancestor chebi ids
go_chebi["chebi_id_ancestor"] = go_chebi.chebi_id.transform(
    lambda x: set(nx.descendants(graph_chebi, x) | {x})
)
go_chebi = go_chebi.explode("chebi_id_ancestor")
chebi_id_to_term = {k: v for k, v in graph_chebi.nodes(data="name")}
go_chebi["chebi_term_ancestor"] = go_chebi.chebi_id_ancestor.map(chebi_id_to_term)
go_chebi = go_chebi.reset_index(drop=True)

# filter by chebi terms that have properties (meaning they are molecules)
chebi_id_to_properties = {
    chebi_id: properties_list
    for chebi_id, properties_list in graph_chebi.nodes(data="property_value")
    if properties_list
}
go_chebi = go_chebi[
    go_chebi.chebi_id_ancestor.isin(chebi_id_to_properties.keys())
].reset_index(drop=True)

# add names of go ids
graph_go = load_df("go_obo")
go_term_to_name = {k: v for k, v in graph_go.nodes(data="name")}
go_chebi.insert(column="go_term", value=go_chebi.go_id.map(go_term_to_name), loc=1)
go_chebi

Unnamed: 0,go_id,go_term,chebi_id,chebi_term,relation,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:25414,monoatomic monocation
1,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:15378,hydron
2,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:24636,proton
3,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:33251,monoatomic hydrogen
4,GO:0042958,maltodextrin transmembrane transporter activity,CHEBI:25140,maltodextrin,has_primary_input,CHEBI:37163,glucan
...,...,...,...,...,...,...,...
1290,GO:0009674,potassium:sodium symporter activity,CHEBI:29103,potassium(1+),has_primary_input,CHEBI:29103,potassium(1+)
1291,GO:0010542,nitrate efflux transmembrane transporter activity,CHEBI:17632,nitrate,has_primary_input,CHEBI:17632,nitrate
1292,GO:1905131,carcinine transmembrane transporter activity,CHEBI:192797,carcininium,has_primary_input,CHEBI:192797,carcininium
1293,GO:1905131,carcinine transmembrane transporter activity,CHEBI:192797,carcininium,has_primary_input,CHEBI:65296,primary ammonium ion


## Uniprot-GO-ChEBI frame with ancestor chebi terms

In [17]:
uniprot_go_chebi = df_uniprot_goa.drop(["go_id", "go_term"], axis=1).merge(
    go_chebi[["go_id", "go_term", "chebi_id_ancestor", "chebi_term_ancestor"]].drop_duplicates(), how="left", left_on="go_term_ancestor", right_on="go_term"
).drop(["go_id", "go_term"], axis=1)

uniprot_go_chebi

Unnamed: 0,Uniprot,qualifier,evidence_code,aspect,go_id_ancestor,go_term_ancestor,chebi_id_ancestor,chebi_term_ancestor
0,A0A014M993,enables,IEA,F,GO:0015267,channel activity,,
1,A0A014M993,enables,IEA,F,GO:0022803,passive transmembrane transporter activity,,
2,A0A014M993,enables,IEA,F,GO:0022829,wide pore channel activity,,
3,A0A014M993,enables,IEA,F,GO:0022857,transmembrane transporter activity,,
4,A0A014M993,enables,IEA,F,GO:0015288,porin activity,,
...,...,...,...,...,...,...,...,...
1225591,Z4YKJ7,enables,IEA,F,GO:0015293,symporter activity,,
1225592,Z4YKJ7,enables,IEA,F,GO:0015370,solute:sodium symporter activity,CHEBI:29101,sodium(1+)
1225593,Z4YKJ7,enables,IEA,F,GO:0015370,solute:sodium symporter activity,CHEBI:25414,monoatomic monocation
1225594,Z4YKJ7,enables,IEA,F,GO:0005342,organic acid transmembrane transporter activity,,


## Fill in missing ChEBI terms

In [19]:
uniprot_go_chebi[['go_id_ancestor', 'go_term_ancestor', 'chebi_id_ancestor', 'chebi_term_ancestor']]

Unnamed: 0,go_id_ancestor,go_term_ancestor,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015267,channel activity,,
1,GO:0022803,passive transmembrane transporter activity,,
2,GO:0022829,wide pore channel activity,,
3,GO:0022857,transmembrane transporter activity,,
4,GO:0015288,porin activity,,
...,...,...,...,...
1225591,GO:0015293,symporter activity,,
1225592,GO:0015370,solute:sodium symporter activity,CHEBI:29101,sodium(1+)
1225593,GO:0015370,solute:sodium symporter activity,CHEBI:25414,monoatomic monocation
1225594,GO:0005342,organic acid transmembrane transporter activity,,


In [36]:
import numpy as np

searchterm_to_chebi = {
    "protein": "CHEBI:36080",
    "ion": "CHEBI:24870",
    "anion": "CHEBI:22563",
    "cation": "CHEBI:36916",
    # "monoatomic ion": "CHEBI:24867",
    "monoatomic anion": "CHEBI:23905",
    "monoatomic cation": "CHEBI:23906",
    "proton" : "CHEBI:24636"
}


for key, val in searchterm_to_chebi.items():
    go_chebi_direct = go_chebi_direct.assign(
        substrates=np.where(
            go_chebi_direct.go_term_ancestor.str.contains(key)
            & go_chebi_direct.substrates.isna(),
            val,
            go_chebi_direct.substrates,
        )
    )

In [37]:
go_chebi_direct

Unnamed: 0,go_term_ancestor,substrates
0,transmembrane transporter activity,
1,channel activity,
2,passive transmembrane transporter activity,
3,porin activity,
4,wide pore channel activity,
...,...,...
801,high-affinity copper ion transmembrane transpo...,CHEBI:24870
802,thiamine:proton symporter activity,CHEBI:24636
803,high-affinity thiamine:proton symporter activity,CHEBI:24636
804,carcinine transmembrane transporter activity,"(primary ammonium ion, carcininium)"


# OLD CODE

In [1]:
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids={9606},
    swissprot_only=False,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=False,
    max_sequence_evidence_code=1,
)
df_uniprot_goa

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity


In [5]:
df_go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000295,CHEBI:61293,adenyl nucleotide,has_primary_input
4,GO:0000514,CHEBI:14321,glutamate(1-),has_primary_input
...,...,...,...,...
693,GO:1901480,CHEBI:30823,oleate,has_primary_input
694,GO:1901505,CHEBI:63299,carbohydrate derivative,has_primary_input
695,GO:1901682,CHEBI:26835,sulfur molecular entity,has_primary_input
696,GO:1902282,CHEBI:29103,potassium(1+),has_primary_input


In [None]:
df_go_chebi_primary = df_go_chebi[
    df_go_chebi.chebi_go_relation == "has_primary_input"
].reset_index(drop=True)
df_go_chebi_primary = df_go_chebi_primary[
    df_go_chebi_primary.chebi_id.isin(chebi_id_to_properties.keys())
]
df_go_chebi_primary

## Merge

In [3]:
import pandas as pd

df_uniprot_go_transporter = pd.merge(
    df_uniprot_goa, df_sequences, left_on="Uniprot", right_index=True, how="inner"
).reset_index(drop=True)
df_uniprot_go_transporter

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,sequence,reviewed,protein_existence,organism_id,protein_names
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MQGARAPRDQGQSPGRMSALGRSSVILLTYVLAATELTCLFMQFSI...,False,1,9606,Solute carrier family 22 (Organic cation trans...
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)


In [4]:
df_uniprot_goa_chebi = df_uniprot_goa.merge(
    df_go_chebi, how="left", left_on="go_id_ancestor", right_on="go_id"
)
df_uniprot_goa_chebi

Unnamed: 0,Uniprot,qualifier,go_id_x,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,go_id_y,chebi_id,chebi_term,chebi_go_relation
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,,,,
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,,,,
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
75020,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
75021,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,,,,
75022,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,GO:0042887,CHEBI:32988,amide,has_primary_input
75023,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,GO:1904680,CHEBI:16670,peptide,has_primary_input
