# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [1]:
from subpred.util import load_df
import networkx as nxs

In [2]:
def get_sequence_dataset(
    organism_ids: set = None, swissprot_only: bool = False, max_sequence_evidence_code: int = 2
):
    df_uniprot = load_df("uniprot")
    if swissprot_only:
        df_uniprot = df_uniprot[df_uniprot.reviewed]
    df_uniprot = df_uniprot[df_uniprot.protein_existence <= max_sequence_evidence_code]
    if organism_ids:
        df_uniprot = df_uniprot[df_uniprot.organism_id.isin(organism_ids)]
    df_uniprot = df_uniprot.sequence.to_frame().drop_duplicates()
    return df_uniprot

In [3]:
def get_go_subgraph(
    graph_go, root_node="GO:0022857", keys={"is_a"}, namespaces={"molecular_function"}
):
    graph_go_subgraph = graph_go.subgraph(
        {
            node
            for node, namespace in graph_go.nodes(data="namespace")
            if namespace in namespaces
        }
    )
    graph_go_subgraph = graph_go_subgraph.subgraph(nx.ancestors(graph_go, root_node) | {root_node})

    graph_go_subgraph = graph_go_subgraph.edge_subgraph(
        {(go1, go2, key) for go1, go2, key in graph_go_subgraph.edges(keys=True) if key in keys}
    )

    return graph_go_subgraph    


In [4]:
def get_go_id_update_dict(graph_go):
    go_id_update_dict = dict()
    for go_term, alt_ids in graph_go.nodes(data="alt_id"):
        if not alt_ids:
            go_id_update_dict[go_term] = go_term
            continue
        for alt_id in alt_ids:
            go_id_update_dict[alt_id] = go_term
    for go_term in graph_go.nodes():
        go_id_update_dict[go_term] = go_term
    return go_id_update_dict


In [5]:
def get_go_annotations(
    graph_go,
    proteins_subset: set = None,
    go_ids_subset: set = None,
    qualifiers_keep={"enables"},
    aspects_keep={"F"},
    evidence_codes_remove={"IEA"},
):
    df_uniprot_goa = load_df("go")

    # update go identifiers in annotation dataset
    go_id_update_dict = get_go_id_update_dict(graph_go=graph_go)
    df_uniprot_goa["go_id"] = df_uniprot_goa.go_id.map(go_id_update_dict)
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.go_id.isnull()].reset_index(
        drop=True
    )

    # filtering out "not" annotations explicitly
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.qualifier.str.startswith("NOT")]

    # filtering for parameters
    df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.qualifier.isin(qualifiers_keep)]
    df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.aspect.isin(aspects_keep)]
    df_uniprot_goa = df_uniprot_goa[
        ~df_uniprot_goa.evidence_code.isin(evidence_codes_remove)
    ]

    if proteins_subset:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.Uniprot.isin(proteins_subset)]

    # filter annotations by protein subset
    if go_ids_subset:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.go_id.isin(go_ids_subset)]

    # cleanup
    df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)
    df_uniprot_goa = (
        df_uniprot_goa[["Uniprot", "go_id"]].drop_duplicates().reset_index(drop="True")
    )

    return df_uniprot_goa


def add_ancestors(df_uniprot_goa, graph_go):
    df_uniprot_goa["ancestor"] = df_uniprot_goa.go_id.map(
        lambda go_id: {go_id} | set(nx.descendants(graph_go, go_id))
    )

    df_uniprot_goa = df_uniprot_goa.explode("ancestor")
    df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)

    return df_uniprot_goa


In [6]:
# First, get all sequences with filtering criteriou:
df_sequences = get_sequence_dataset(
    organism_ids={9606}, swissprot_only=False, max_sequence_evidence_code=1
)

# Then, get all transporter go terms with is_a
graph_go = load_df("go_obo")
go_id_to_term = {go_id: go_term for go_id, go_term in graph_go.nodes(data="name")}
go_term_to_id = {go_term: go_id for go_id, go_term in graph_go.nodes(data="name")}
graph_go_transmembrane_transport = get_go_subgraph(
    graph_go=graph_go,
    root_node=go_term_to_id["transmembrane transporter activity"],
    keys={"is_a"},
    namespaces={"molecular_function"},
)

# get all go annotations, filtered by parameters, with added ancestors, updated ids
df_uniprot_goa = get_go_annotations(
    graph_go=graph_go,
    proteins_subset=set(df_sequences.index),
    go_ids_subset=set(graph_go_transmembrane_transport.nodes()),
    qualifiers_keep={"enables"},
    aspects_keep={"F"},
    evidence_codes_remove={"IEA"},
)
df_uniprot_goa = add_ancestors(
    df_uniprot_goa=df_uniprot_goa, graph_go=graph_go_transmembrane_transport
)

df_uniprot_goa["go_term"]=df_uniprot_goa.go_id.map(go_id_to_term)
df_uniprot_goa["go_term_ancestor"]=df_uniprot_goa.ancestor.map(go_id_to_term)

df_uniprot_goa = df_uniprot_goa[
    ["Uniprot", "go_id", "go_term", "ancestor", "go_term_ancestor"]
]

df_uniprot_goa

# Merge the two


Unnamed: 0,Uniprot,go_id,ancestor
0,A0AV02,GO:0015379,GO:0015377
1,A0AV02,GO:0015379,GO:0022857
2,A0AV02,GO:0015379,GO:1901702
3,A0AV02,GO:0015379,GO:0046873
4,A0AV02,GO:0015379,GO:0015294
...,...,...,...
21562,Q9Y6R1,GO:0008510,GO:0015291
21563,Q9Y6R1,GO:0015293,GO:0015291
21564,Q9Y6R1,GO:0015293,GO:0022804
21565,Q9Y6R1,GO:0015293,GO:0015293


In [10]:

df_uniprot_goa

Unnamed: 0,Uniprot,go_id,go_term,ancestor,go_term_ancestor
0,A0AV02,GO:0015379,potassium:chloride symporter activity,GO:0015377,chloride:monoatomic cation symporter activity
1,A0AV02,GO:0015379,potassium:chloride symporter activity,GO:0022857,transmembrane transporter activity
2,A0AV02,GO:0015379,potassium:chloride symporter activity,GO:1901702,salt transmembrane transporter activity
3,A0AV02,GO:0015379,potassium:chloride symporter activity,GO:0046873,metal ion transmembrane transporter activity
4,A0AV02,GO:0015379,potassium:chloride symporter activity,GO:0015294,solute:monoatomic cation symporter activity
...,...,...,...,...,...
21562,Q9Y6R1,GO:0008510,sodium:bicarbonate symporter activity,GO:0015291,secondary active transmembrane transporter act...
21563,Q9Y6R1,GO:0015293,symporter activity,GO:0015291,secondary active transmembrane transporter act...
21564,Q9Y6R1,GO:0015293,symporter activity,GO:0022804,active transmembrane transporter activity
21565,Q9Y6R1,GO:0015293,symporter activity,GO:0015293,symporter activity
