# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [54]:
from subpred.util import load_df
import networkx as nx
import pandas as pd


In [55]:
df_uniprot = load_df("uniprot")
df_uniprot

Unnamed: 0_level_0,gene_names,protein_names,reviewed,protein_existence,sequence,organism_id
Uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A0A0C5B5G6,MT-RNR1,Mitochondrial-derived peptide MOTS-c (Mitochon...,True,1,MRWQEMGYIFYPRKLR,9606
A0A1B0GTW7,CIROP LMLN2,Ciliated left-right organizer metallopeptidase...,True,1,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,9606
A0JNW5,BLTP3B KIAA0701 SHIP164 UHRF1BP1L,Bridge-like lipid transfer protein family memb...,True,1,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,9606
A0JP26,POTEB3,POTE ankyrin domain family member B3,True,1,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,9606
A0PK11,CLRN2,Clarin-2,True,1,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,9606
...,...,...,...,...,...,...
X5L4R4,NOD-2,Nucleotide-binding oligomerization domain-cont...,False,2,MSPGCYKGWPFNCHLSHEEDKRRNETLLQEAETSNLQITASFVSGL...,586796
X5MBL2,GT34D,"Putative galacto(Gluco)mannan alpha-1,6-galact...",False,2,KVLYDRAFNSSDDQSALVYLLLKEKDKWADRIFIEHKYYLNGYWLD...,3352
X5MFI4,GT34D,"Putative galacto(Gluco)mannan alpha-1,6-galact...",False,2,MDEDVLCKGPLHGGSARSLKGSLKRLKRIMESLNDGLIFMGGAVSA...,3352
X5MI49,GT34A,"Putative galacto(Gluco)mannan alpha-1,6-galact...",False,2,MVNDSKLETISGNMVQKRKSFDGLPFWTVSIAGGLLLCWSLWRICF...,3352


In [56]:
def get_sequence_dataset(
    organism_ids: set = None,
    swissprot_only: bool = False,
    max_sequence_evidence_code: int = 2,
):
    df_uniprot = load_df("uniprot")
    if swissprot_only:
        df_uniprot = df_uniprot[df_uniprot.reviewed]
    df_uniprot = df_uniprot[df_uniprot.protein_existence <= max_sequence_evidence_code]
    if organism_ids:
        df_uniprot = df_uniprot[df_uniprot.organism_id.isin(organism_ids)]
    df_uniprot = df_uniprot[["sequence", "reviewed", "protein_existence", "organism_id", "protein_names"]].drop_duplicates()
    return df_uniprot


In [57]:
def get_go_subgraph(
    graph_go, root_node="GO:0022857", keys={"is_a"}, namespaces={"molecular_function"}
):
    graph_go_subgraph = graph_go.subgraph(
        {
            node
            for node, namespace in graph_go.nodes(data="namespace")
            if namespace in namespaces
        }
    )
    graph_go_subgraph = graph_go_subgraph.subgraph(
        nx.ancestors(graph_go, root_node) | {root_node}
    )

    graph_go_subgraph = graph_go_subgraph.edge_subgraph(
        {
            (go1, go2, key)
            for go1, go2, key in graph_go_subgraph.edges(keys=True)
            if key in keys
        }
    )

    return graph_go_subgraph


In [58]:
def get_go_id_update_dict(graph_go):
    go_id_update_dict = dict()
    for go_term, alt_ids in graph_go.nodes(data="alt_id"):
        if not alt_ids:
            go_id_update_dict[go_term] = go_term
            continue
        for alt_id in alt_ids:
            go_id_update_dict[alt_id] = go_term
    for go_term in graph_go.nodes():
        go_id_update_dict[go_term] = go_term
    return go_id_update_dict


In [59]:
def get_go_annotations(
    go_id_update_dict,
    proteins_subset: set = None,
    go_ids_subset: set = None,
    qualifiers_keep:set=None,
    aspects_keep:set=None,
    evidence_codes_remove:set=None,
):
    df_uniprot_goa = load_df("go")

    # update go identifiers in annotation dataset to match go graph
    df_uniprot_goa["go_id"] = df_uniprot_goa.go_id.map(go_id_update_dict)
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.go_id.isnull()].reset_index(
        drop=True
    )

    # filtering out "not" annotations explicitly
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.qualifier.str.startswith("NOT")]

    # filtering for parameters
    if qualifiers_keep:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.qualifier.isin(qualifiers_keep)]
    if aspects_keep:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.aspect.isin(aspects_keep)]
    if evidence_codes_remove:
        df_uniprot_goa = df_uniprot_goa[
            ~df_uniprot_goa.evidence_code.isin(evidence_codes_remove)
        ]

    if proteins_subset:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.Uniprot.isin(proteins_subset)]

    # filter annotations by protein subset
    if go_ids_subset:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.go_id.isin(go_ids_subset)]

    # cleanup
    df_uniprot_goa = df_uniprot_goa.drop_duplicates().reset_index(drop=True)
    # df_uniprot_goa = (
    #     df_uniprot_goa[["Uniprot", "go_id"]].drop_duplicates().reset_index(drop="True")
    # )

    return df_uniprot_goa


def add_ancestors(df_uniprot_goa, graph_go):
    df_uniprot_goa["ancestor"] = df_uniprot_goa.go_id.map(
        lambda go_id: {go_id} | set(nx.descendants(graph_go, go_id))
    )

    df_uniprot_goa = df_uniprot_goa.explode("ancestor")
    df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)

    return df_uniprot_goa


In [60]:
# First, get all sequences with filtering criteriou:
df_sequences = get_sequence_dataset(
    organism_ids={9606}, swissprot_only=False, max_sequence_evidence_code=1
)

# Then, get all transporter go terms with is_a
graph_go = load_df("go_obo")
go_id_to_term = {go_id: go_term for go_id, go_term in graph_go.nodes(data="name")}
go_term_to_id = {go_term: go_id for go_id, go_term in graph_go.nodes(data="name")}
go_id_update_dict = get_go_id_update_dict(graph_go=graph_go)
graph_go_transmembrane_transport = get_go_subgraph(
    graph_go=graph_go,
    root_node=go_term_to_id["transmembrane transporter activity"],
    keys={"is_a"},
    namespaces={"molecular_function"},
)


# get all go annotations, filtered by parameters, with updated ids
df_uniprot_goa = get_go_annotations(
    go_id_update_dict=go_id_update_dict,
    proteins_subset=set(df_sequences.index),
    go_ids_subset=set(graph_go_transmembrane_transport.nodes()),
    qualifiers_keep={"enables"},
    aspects_keep={"F"},
    # evidence_codes_remove={"IEA"},
)
# add ancestors
df_uniprot_goa = add_ancestors(
    df_uniprot_goa=df_uniprot_goa, graph_go=graph_go_transmembrane_transport
)
# add go terms
df_uniprot_goa["go_term"] = df_uniprot_goa.go_id.map(go_id_to_term)
df_uniprot_goa["go_term_ancestor"] = df_uniprot_goa.ancestor.map(go_id_to_term)
# sort columns
df_uniprot_goa = df_uniprot_goa[
    ["Uniprot", "qualifier", "go_id", "go_term","evidence_code","aspect", "ancestor", "go_term_ancestor"]
]

df_uniprot_goa

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,ancestor,go_term_ancestor
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity
...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity


In [86]:
df_uniprot_go_transporter = pd.merge(
    df_uniprot_goa, df_sequences, left_on="Uniprot", right_index=True, how="inner"
).reset_index(drop=True)
df_uniprot_go_transporter

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,ancestor,go_term_ancestor,sequence,reviewed,protein_existence,organism_id,protein_names
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MQGARAPRDQGQSPGRMSALGRSSVILLTYVLAATELTCLFMQFSI...,False,1,9606,Solute carrier family 22 (Organic cation trans...
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)


In [75]:
# TODO annotate with primary substrate
df_go_chebi = load_df("go_chebi")
graph_chebi = load_df("chebi_obo")

chebi_id_update_dict = get_go_id_update_dict(graph_chebi)

df_go_chebi["go_id"] = df_go_chebi.go_id.map(go_id_update_dict)
df_go_chebi["chebi_id"] = df_go_chebi.chebi_id.map(chebi_id_update_dict)
# assert not df_go_chebi.go_id.isnull().any()
df_go_chebi = df_go_chebi[~df_go_chebi.go_id.isnull()]
df_go_chebi = df_go_chebi[~df_go_chebi.chebi_id.isnull()]

df_go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,relation
0,GO:0006633,CHEBI:28868,fatty acid anion,has_primary_output
1,GO:0006629,CHEBI:18059,lipid,has_primary_input_or_output
2,GO:0006665,CHEBI:26739,sphingolipid,has_primary_input_or_output
3,GO:0006754,CHEBI:30616,ATP(4-),has_primary_output
4,GO:0051028,CHEBI:33699,messenger RNA,has_primary_input
...,...,...,...,...
21355,GO:0033521,CHEBI:58404,"(E)-3,7,11,15-tetramethylhexadec-2-en-1-yl dip...",has_primary_output
21356,GO:0140786,CHEBI:28300,glutamine,has_input
21357,GO:1904474,CHEBI:57504,L-dopa zwitterion,has_input
21358,GO:0061431,CHEBI:64558,methionine zwitterion,has_input


In [85]:
relation_value_counts = df_go_chebi[df_go_chebi.go_id.isin(graph_go_transmembrane_transport.nodes())].relation.value_counts()
relation_value_counts[relation_value_counts > 0]

relation
has_primary_input    681
has_participant      467
has_input              1
Name: count, dtype: int64

In [83]:
df_go_chebi_primary = df_go_chebi[df_go_chebi.relation == "has_primary_input"].reset_index(drop=True)
df_go_chebi_primary

Unnamed: 0,go_id,chebi_id,chebi_term,relation
0,GO:0051028,CHEBI:33699,messenger RNA,has_primary_input
1,GO:0006401,CHEBI:33697,ribonucleic acid,has_primary_input
2,GO:0006406,CHEBI:33699,messenger RNA,has_primary_input
3,GO:0006851,CHEBI:29108,calcium(2+),has_primary_input
4,GO:0006811,CHEBI:24867,monoatomic ion,has_primary_input
...,...,...,...,...
1988,GO:0019458,CHEBI:64558,methionine zwitterion,has_primary_input
1989,GO:0044577,CHEBI:15936,aldehydo-D-xylose,has_primary_input
1990,GO:0140125,CHEBI:26948,vitamin B1,has_primary_input
1991,GO:0140145,CHEBI:23378,copper cation,has_primary_input


In [84]:
df_go_chebi_participant = df_go_chebi[df_go_chebi.relation == "has_participant"].reset_index(drop=True)
df_go_chebi_participant

Unnamed: 0,go_id,chebi_id,chebi_term,relation
0,GO:0004315,CHEBI:78776,O-(S-3-oxoacylpantetheine-4'-phosphoryl)-L-ser...,has_participant
1,GO:0004315,CHEBI:78449,O-(S-malonylpantetheine-4'-phosphoryl)serine(2...,has_participant
2,GO:0004315,CHEBI:16526,carbon dioxide,has_participant
3,GO:0004315,CHEBI:64479,O-(pantetheine-4'-phosphoryl)serine(1-) residue,has_participant
4,GO:0004315,CHEBI:138651,O-(S-fatty acylpantetheine-4'-phosphoryl)-L-se...,has_participant
...,...,...,...,...
16799,GO:0050328,CHEBI:16240,hydrogen peroxide,has_participant
16800,GO:0050002,CHEBI:142235,Se-L-selenocysteine-S-L-cysteine residue,has_participant
16801,GO:0050002,CHEBI:30000,L-selenocysteine residue,has_participant
16802,GO:0050002,CHEBI:57726,D-proline zwitterion,has_participant


In [None]:
# TODO ancestors?