# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [126]:
from subpred.util import load_df
import networkx as nx
from copy import deepcopy

In [127]:
def get_sequence_dataset(
    organism_ids: set = None, swissprot_only: bool = False, max_sequence_evidence_code: int = 2
):
    df_uniprot = load_df("uniprot")
    if swissprot_only:
        df_uniprot = df_uniprot[df_uniprot.reviewed]
    df_uniprot = df_uniprot[df_uniprot.protein_existence <= max_sequence_evidence_code]
    if organism_ids:
        df_uniprot = df_uniprot[df_uniprot.organism_id.isin(organism_ids)]
    df_uniprot = df_uniprot.sequence.to_frame().drop_duplicates()
    return df_uniprot

In [129]:
def get_go_subgraph(
    graph_go, root_node="GO:0022857", keys={"is_a"}, namespaces={"molecular_function"}
):
    graph_go2 = graph_go.subgraph(
        {
            node
            for node, namespace in graph_go.nodes(data="namespace")
            if namespace in namespaces
        }
    )
    graph_go2 = graph_go2.subgraph(nx.ancestors(graph_go, root_node) | {root_node})

    graph_go2 = graph_go2.edge_subgraph(
        {(go1, go2, key) for go1, go2, key in graph_go2.edges(keys=True) if key in keys}
    )

    return graph_go2    


In [130]:
def get_go_id_update_dict(graph_go):
    go_id_update_dict = dict()
    for go_term, alt_ids in graph_go.nodes(data="alt_id"):
        if not alt_ids:
            go_id_update_dict[go_term] = go_term
            continue
        for alt_id in alt_ids:
            go_id_update_dict[alt_id] = go_term
    for go_term in graph_go.nodes():
        go_id_update_dict[go_term] = go_term
    return go_id_update_dict


In [None]:
def get_go_annotations(
    proteins_subset: set = None,
    qualifiers_keep={"enables"},
    aspects_keep={"F"},
    evidence_codes_remove={"IEA"},
):
    df_uniprot_goa = load_df("go")

    # update go identifiers in annotation dataset
    go_id_update_dict = get_go_id_update_dict(graph_go=graph_go)
    df_uniprot_goa["go_id"] = df_uniprot_goa.go_id.map(go_id_update_dict)
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.go_id.isnull()].reset_index(
        drop=True
    )

    # filtering out "not" annotations explicitly
    df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.qualifier.str.startswith("NOT")]

    # filtering for parameters
    df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.qualifier.isin(qualifiers_keep)]
    df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.aspect.isin(aspects_keep)]
    df_uniprot_goa = df_uniprot_goa[
        ~df_uniprot_goa.evidence_code.isin(evidence_codes_remove)
    ]


    # filter annotations by protein subset
    if proteins_subset:
        df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.go_id.isin(proteins_subset)]

    # cleanup
    df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)
    df_uniprot_goa = (
        df_uniprot_goa[["Uniprot", "go_id"]].drop_duplicates().reset_index(drop="True")
    )

    return df_uniprot_goa


def add_ancestors(df_uniprot_goa, graph_go):
    df_uniprot_goa["ancestor"] = df_uniprot_goa.go_id.map(
        lambda go_id: {go_id} | set(nx.descendants(graph_go, go_id))
    )

    df_uniprot_goa = df_uniprot_goa.explode("ancestor")
    df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)

    return df_uniprot_goa


In [155]:
# First, get all sequences with filtering criteriou:
df_sequences = get_sequence_dataset(
    organism_ids={9606}, swissprot_only=False, max_sequence_evidence_code=1
)
df_sequences

# Then, get all transporter go terms with is_a
graph_go = load_df("go_obo")
go_id_to_term = {go_id: go_term for go_id, go_term in graph_go.nodes(data="name")}
go_term_to_id = {go_term: go_id for go_id, go_term in graph_go.nodes(data="name")}
graph_go_transmembrane_transport = get_go_subgraph(
    graph_go=graph_go,
    root_node=go_term_to_id["transmembrane transporter activity"],
    keys={"is_a"},
    namespaces={"molecular_function"},
)

# get all go annotations, filtered by parameters, with added ancestors, updated ids
# ancestors in graph_to_transmembrane transport
df_uniprot_goa = get_go_annotations(
    proteins_subset=set(graph_go_transmembrane_transport.nodes()),
    qualifiers_keep={"enables"},
    aspects_keep={"F"},
    evidence_codes_remove={"IEA"},
)
df_uniprot_goa = add_ancestors(
    df_uniprot_goa=df_uniprot_goa, graph_go=graph_go_transmembrane_transport
)
df_uniprot_goa

# Merge the two


Unnamed: 0,Uniprot,go_id,ancestor
0,A0A059WQD2,GO:0005315,GO:0005315
0,A0A059WQD2,GO:0005315,GO:0022804
0,A0A059WQD2,GO:0005315,GO:0015291
0,A0A059WQD2,GO:0005315,GO:0015318
0,A0A059WQD2,GO:0005315,GO:0022857
...,...,...,...
17730,X5LXB2,GO:0042626,GO:0022857
17730,X5LXB2,GO:0042626,GO:0042626
17730,X5LXB2,GO:0042626,GO:0022804
17730,X5LXB2,GO:0042626,GO:0015399
