# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Innner merge that dataset with GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [13]:
from subpred.util import load_df
import networkx as nx
from copy import deepcopy

In [14]:
def get_sequence_dataset(
    organism_ids: set = None, swissprot_only: bool = False, max_sequence_evidence_code: int = 2
):
    df_uniprot = load_df("uniprot")
    if swissprot_only:
        df_uniprot = df_uniprot[df_uniprot.reviewed]
    df_uniprot = df_uniprot[df_uniprot.protein_existence <= max_sequence_evidence_code]
    if organism_ids:
        df_uniprot = df_uniprot[df_uniprot.organism_id.isin(organism_ids)]
    df_uniprot = df_uniprot.sequence.to_frame().drop_duplicates()
    return df_uniprot

In [15]:
def get_go_id_update_dict(graph_go):
    go_id_update_dict = dict()
    for go_term, alt_ids in graph_go.nodes(data="alt_id"):
        if not alt_ids:
            go_id_update_dict[go_term] = go_term
            continue
        for alt_id in alt_ids:
            go_id_update_dict[alt_id] = go_term
    for go_term in graph_go.nodes():
        go_id_update_dict[go_term] = go_term
    return go_id_update_dict

In [16]:
def get_goa_dataset(
    proteins:set,
    go_root_node: str,
):
    protein_annotations = None

    df_uniprot_goa = load_df("go")
    # df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.Uniprot.isin(proteins)]

    graph_go = load_df("go_obo")
    go_id_update_dict = get_go_id_update_dict(graph_go=graph_go)


    return protein_annotations


In [17]:
# First, get all sequences with filtering criteriou:
df_sequences = get_sequence_dataset(organism_ids={9606}, swissprot_only=False, max_sequence_evidence_code=1)
df_sequences

# Then, get all transporter go terms with is_a

# get all go annotations


# Merge the two

Unnamed: 0_level_0,sequence
Uniprot,Unnamed: 1_level_1
A0A0C5B5G6,MRWQEMGYIFYPRKLR
A0A1B0GTW7,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...
A0JNW5,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...
A0JP26,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...
A0PK11,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...
...,...
X6R922,MSELLRARSQSSERGNDQESSQPVGSVIVQEPTEEKRQEEEPPTDN...
X6RB39,MESPASSQPASMPQSKERLVISMVTIYGPQQVPICAVNPVLSTLQT
X6REB3,MNQPQRMAPVGTDKELSDLLDFSMMFPLPVTNGKGRPASLAGAQFG...
X6RGR3,MQADKCRTSSRSVKKELVIESPLQYKDAAQGEVEAESPGPVPAKPK...


In [18]:
# go subset with ancestors, filtered by evidence_code, aspect, qualifier, go_ids, graph edges

keys = {"is_a"}
qualifiers_keep = {"enables"}
aspects_keep = {"F"}
evidence_codes_remove = {"IEA"}
df_uniprot_goa = load_df("go")
graph_go = load_df("go_obo")



# update go identifiers in annotation dataset
go_id_update_dict = get_go_id_update_dict(graph_go=graph_go)
df_uniprot_goa["go_id"] = df_uniprot_goa.go_id.map(go_id_update_dict)
df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.go_id.isnull()].reset_index(drop=True)

# filtering out "not" annotations explicitly
df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.qualifier.str.startswith("NOT")]

# filtering for parameters
df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.qualifier.isin(qualifiers_keep)]
df_uniprot_goa = df_uniprot_goa[df_uniprot_goa.aspect.isin(aspects_keep)]
df_uniprot_goa = df_uniprot_goa[~df_uniprot_goa.evidence_code.isin(evidence_codes_remove)]

# add ancestors connected with specified edge labels
edges_with_keys = {(go1, go2, key) for go1, go2, key in graph_go.edges(keys=True) if key in keys}
graph_go_filtered = deepcopy(graph_go.edge_subgraph(edges_with_keys))
# df_uniprot_goa1 = df_uniprot_goa[]

# TODO does it make a differenct if I use is_a graph for adding ancestors?
# filter annotations by root node


df_uniprot_goa = df_uniprot_goa.reset_index(drop=True)
df_uniprot_goa.go_id.map(lambda go_id: set(nx.descendants(graph_go, go_id)))

0         {GO:0003824, GO:0003674, GO:0016798, GO:001679...
1                                  {GO:0003674, GO:0005488}
2                                  {GO:0003674, GO:0005488}
3         {GO:0004721, GO:0016791, GO:0016787, GO:004257...
4         {GO:0004721, GO:0016791, GO:0016787, GO:004257...
                                ...                        
443023    {GO:0110165, GO:0005643, GO:0005622, GO:000519...
443024    {GO:0051179, GO:0005215, GO:0051234, GO:005508...
443025     {GO:0032182, GO:0003674, GO:0005488, GO:0005515}
443026     {GO:0032182, GO:0003674, GO:0005488, GO:0005515}
443027     {GO:0032182, GO:0003674, GO:0005488, GO:0005515}
Name: go_id, Length: 443028, dtype: object

In [48]:
import requests, sys
import json
import pandas as pd

REDOWNLOAD_TRANSPORTER_GO_TERMS = False
if REDOWNLOAD_TRANSPORTER_GO_TERMS:
  requestURL = "https://www.ebi.ac.uk/QuickGO/services/ontology/go/slim?slimsToIds=GO%3A0022857&relations=is_a"

  r = requests.get(requestURL, headers={ "Accept" : "application/json"})

  if not r.ok:
    r.raise_for_status()
    sys.exit()

  responseBody = r.text

  json_transporter_goslim_isa = json.loads(responseBody)["results"]

  with open("../data/raw/gene_ontology/goslim_transmembrane_transport.json", "w") as json_file:
      json.dump(json_transporter_goslim_isa, json_file)

In [51]:
df_goslim_transporter = pd.read_json("../data/raw/gene_ontology/goslim_transmembrane_transport.json")
assert not (df_goslim_transporter.slimsToIds.apply(len) != 1).any()
df_goslim_transporter["slimsToIds"] = df_goslim_transporter.slimsToIds.apply(lambda x: x[0])
df_goslim_transporter

Unnamed: 0,slimsFromId,slimsToIds
0,GO:0015313,GO:0022857
1,GO:0015554,GO:0022857
2,GO:0015312,GO:0022857
3,GO:0042933,GO:0022857
4,GO:0015553,GO:0022857
...,...,...
1027,GO:0033266,GO:0022857
1028,GO:0015615,GO:0022857
1029,GO:0015614,GO:0022857
1030,GO:0015612,GO:0022857


In [83]:
import pandas as pd
df_goslim_transporter = pd.DataFrame(json_transporter_goslim_isa)

go_term_to_name = {term: name for term, name in graph_go.nodes(data="name")}
df_goslim_transporter["go_term"] = df_goslim_transporter.slimsFromId.map(go_term_to_name)

assert not (df_goslim_transporter.slimsToIds.apply(len) != 1).any()

df_goslim_transporter["slimsToIds"] = df_goslim_transporter.slimsToIds.apply(lambda x: x[0])

df_goslim_transporter[df_goslim_transporter.go_term.isnull()]  # TODO contains terms that are too new

Unnamed: 0,slimsFromId,slimsToIds,go_term
405,GO:0180009,GO:0022857,
406,GO:0180003,GO:0022857,
478,GO:0170003,GO:0022857,
479,GO:0170001,GO:0022857,
483,GO:0170004,GO:0022857,
525,GO:0170022,GO:0022857,
526,GO:0170023,GO:0022857,
527,GO:0170020,GO:0022857,
528,GO:0170021,GO:0022857,
535,GO:0160042,GO:0022857,


In [82]:
graph_go2 = graph_go.subgraph(nx.ancestors(graph_go,"GO:0022857") | {"GO:0022857"})

graph_go2 = graph_go2.edge_subgraph({(go1, go2, key) for go1, go2, key in graph_go2.edges(keys=True) if key in keys})

df_go2 = pd.DataFrame(graph_go2.nodes(), columns=["go_id"])
go_term_to_name = {term: name for term, name in graph_go.nodes(data="name")}

df_go2["go_term"] = df_go2.go_id.map(go_term_to_name)

df_go2[df_go2.go_term.isnull()]

Unnamed: 0,go_id,go_term


In [87]:
df_goslim_transporter = df_goslim_transporter.drop("slimsToIds", axis=1).rename(columns={"slimsFromId" : "go_id"})

In [89]:
df_go2[~df_go2.go_id.isin(pd.merge(df_go2, df_goslim_transporter, how="inner", on=["go_id", "go_term"]).go_id)]

Unnamed: 0,go_id,go_term
23,GO:1903280,negative regulation of calcium:sodium antiport...
27,GO:1904959,regulation of cytochrome-c oxidase activity
30,GO:1903406,regulation of P-type sodium:potassium-exchangi...
31,GO:1903049,negative regulation of acetylcholine-gated cat...
41,GO:1903954,positive regulation of voltage-gated potassium...
...,...,...
1120,GO:1902160,negative regulation of cyclic nucleotide-gated...
1125,GO:0010361,regulation of anion channel activity by blue l...
1131,GO:0031586,"negative regulation of inositol 1,4,5-trisphos..."
1147,GO:0106427,negative regulation of kainate selective gluta...


In [90]:
df_goslim_transporter[~df_goslim_transporter.go_id.isin(pd.merge(df_go2, df_goslim_transporter, how="inner", on=["go_id", "go_term"]).go_id)]

Unnamed: 0,go_id,go_term
405,GO:0180009,
406,GO:0180003,
478,GO:0170003,
479,GO:0170001,
483,GO:0170004,
525,GO:0170022,
526,GO:0170023,
527,GO:0170020,
528,GO:0170021,
535,GO:0160042,
