# New data pipeline

Input: Uniprot, GO graph, UniprotGOA, root node
Output: A list of protein sequences and go annotations, including ancestors, under the root node

Next: Inner merge of that dataset with the GO to ChEBI annotations
Then: Create Matrices for ChEBI terms and GO terms

In [1]:
from subpred.util import load_df
import networkx as nx
import pandas as pd
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

# Dataset configuration; every constant is passed unchanged to
# get_transmembrane_transporter_dataset below.
# NOTE(review): 559292 is presumably the NCBI taxonomy id of S. cerevisiae
# (the same id is exported to "go_yeast.tsv" further down) — confirm.
ORGANISM_IDS = {559292}
# ORGANISM_IDS = None
SWISSPROT_ONLY = False
MAX_SEQUENCE_EVIDENCE_CODE = 2
EXCLUDE_IEA_GO_TERMS = False
REMOVE_PROTEINS_WITHOUT_GENE_NAMES = False

# Returns three frames: the protein sequences, the Uniprot-GO annotations
# (with GO ancestors) and the GO-ChEBI annotations.
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids=ORGANISM_IDS,
    swissprot_only=SWISSPROT_ONLY,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=EXCLUDE_IEA_GO_TERMS,
    max_sequence_evidence_code=MAX_SEQUENCE_EVIDENCE_CODE,
    remove_proteins_without_gene_names=REMOVE_PROTEINS_WITHOUT_GENE_NAMES,
)

## Create GO ChEBI mapping with ancestors

In [7]:
# load go chebi data from quickgo
# TODO update identifiers in goa and here?
# TODO filter by 3star?
from subpred.chebi_annotations import get_id_update_dict

go_chebi = load_df("go_chebi")
graph_chebi = load_df("chebi_obo")
graph_go = load_df("go_obo")

chebi_id_update_dict = get_id_update_dict(graph_chebi)
go_id_update_dict = get_id_update_dict(graph_go)
# NOTE(review): the identifier update is applied to df_go_chebi (returned by
# get_transmembrane_transporter_dataset), not to the freshly loaded go_chebi
# that the rest of this cell works on — confirm which frame was meant. It also
# mutates df_go_chebi in place, so re-running this cell maps the ids again.
df_go_chebi["go_id"] = df_go_chebi.go_id.map(go_id_update_dict)
df_go_chebi["chebi_id"] = df_go_chebi.chebi_id.map(chebi_id_update_dict)

# Keep the "has_primary_input" annotations of GO terms that occur in the
# protein dataset. The .copy() detaches the result from the loaded frame, so
# the column assignments below work on an independent frame and do not raise
# SettingWithCopyWarning.
go_chebi = go_chebi[
    go_chebi.go_id.isin(df_uniprot_goa.go_id_ancestor)
    & (go_chebi.relation == "has_primary_input")
].copy()

# restrict the chebi graph to its is_a edges
graph_chebi = graph_chebi.edge_subgraph(
    edges=[
        (source, sink, key)
        for source, sink, key in graph_chebi.edges(keys=True)
        if key == "is_a"
    ]
)

# add ancestor chebi ids: one row per (annotation, ancestor), where every term
# is also listed as its own ancestor.
# NOTE(review): presumably the is_a edges point from a term to its parent, so
# the graph "descendants" are the ontology ancestors — confirm.
# NOTE(review): nx.descendants raises NetworkXError for a chebi_id that is not
# a node of the is_a subgraph.
go_chebi["chebi_id_ancestor"] = go_chebi.chebi_id.map(
    lambda chebi_id: nx.descendants(graph_chebi, chebi_id) | {chebi_id}
)
go_chebi = go_chebi.explode("chebi_id_ancestor")
chebi_id_to_term = {
    chebi_id: name for chebi_id, name in graph_chebi.nodes(data="name")
}
go_chebi["chebi_term_ancestor"] = go_chebi.chebi_id_ancestor.map(chebi_id_to_term)
go_chebi = go_chebi.reset_index(drop=True)

# filter by chebi terms that have any molecular properties
chebi_id_to_properties = {
    chebi_id: properties_list
    for chebi_id, properties_list in graph_chebi.nodes(data="property_value")
    if properties_list
}
go_chebi = go_chebi[go_chebi.chebi_id_ancestor.isin(chebi_id_to_properties.keys())]

# additional filter for chebi terms that have a formula: the property name is
# the last path segment of the first whitespace-separated token of the string
go_chebi = go_chebi[
    go_chebi.chebi_id_ancestor.apply(
        lambda chebi_id: any(
            prop.split()[0].split("/")[-1].strip() == "formula"
            for prop in chebi_id_to_properties[chebi_id]
        )
    )
]

# add names of go ids
go_term_to_name = {go_id: name for go_id, name in graph_go.nodes(data="name")}
go_chebi = go_chebi.reset_index(drop=True)
go_chebi.insert(column="go_term", value=go_chebi.go_id.map(go_term_to_name), loc=1)
go_chebi = go_chebi.rename(columns={"relation": "chebi_go_relation"})

go_chebi

Unnamed: 0,go_id,go_term,chebi_id,chebi_term,chebi_go_relation,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:15378,hydron
1,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:33251,monoatomic hydrogen
2,GO:0015078,proton transmembrane transporter activity,CHEBI:24636,proton,has_primary_input,CHEBI:24636,proton
3,GO:0015085,calcium ion transmembrane transporter activity,CHEBI:29108,calcium(2+),has_primary_input,CHEBI:39123,calcium cation
4,GO:0015085,calcium ion transmembrane transporter activity,CHEBI:29108,calcium(2+),has_primary_input,CHEBI:39124,calcium ion
...,...,...,...,...,...,...,...
351,GO:0005476,carnitine:acyl carnitine antiporter activity,CHEBI:17387,O-acylcarnitine,has_primary_input,CHEBI:33308,carboxylic ester
352,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input,CHEBI:58243,5'-adenylyl sulfate(2-)
353,GO:1902557,5'-adenylyl sulfate transmembrane transporter ...,CHEBI:58243,5'-adenylyl sulfate(2-),has_primary_input,CHEBI:58958,organosulfate oxoanion
354,GO:0005346,purine ribonucleotide transmembrane transporte...,CHEBI:26400,purine ribonucleotide,has_primary_input,CHEBI:26401,purines


356

411

5229

In [5]:
from subpred.chebi_annotations import get_properties_counts

# Tally which property annotations occur in the ChEBI data; the displayed
# result is a Counter keyed by property name ('formula', 'charge', ...).
# NOTE(review): presumably counted over all ChEBI entries — confirm in subpred.
get_properties_counts("../data/datasets/")

Counter({'ANY': 153044,
         'formula': 152971,
         'charge': 152837,
         'mass': 152022,
         'monoisotopicmass': 151970,
         'smiles': 150736,
         'inchi': 141392,
         'inchikey': 141392,
         'NONE': 11475})

In [16]:
# Per ChEBI id, keep only the property strings that encode a formula;
# ids whose properties contain no formula map to an empty list.
{
    chebi_id: list(
        filter(
            lambda prop: prop.startswith(
                "http://purl.obolibrary.org/obo/chebi/formula"
            ),
            properties,
        )
    )
    for chebi_id, properties in chebi_id_to_properties.items()
}

{'CHEBI:33429': [],
 'CHEBI:30151': ['http://purl.obolibrary.org/obo/chebi/formula "Al" xsd:string'],
 'CHEBI:16042': ['http://purl.obolibrary.org/obo/chebi/formula "X" xsd:string'],
 'CHEBI:17051': ['http://purl.obolibrary.org/obo/chebi/formula "F" xsd:string'],
 'CHEBI:28741': ['http://purl.obolibrary.org/obo/chebi/formula "FNa" xsd:string',
  'http://purl.obolibrary.org/obo/chebi/formula "Na.F" xsd:string'],
 'CHEBI:32129': ['http://purl.obolibrary.org/obo/chebi/formula "Ag.F.2H3N" xsd:string',
  'http://purl.obolibrary.org/obo/chebi/formula "AgFH6N2" xsd:string'],
 'CHEBI:30340': ['http://purl.obolibrary.org/obo/chebi/formula "AgF" xsd:string'],
 'CHEBI:51990': ['http://purl.obolibrary.org/obo/chebi/formula "C16H36FN" xsd:string'],
 'CHEBI:49499': ['http://purl.obolibrary.org/obo/chebi/formula "BeF2" xsd:string'],
 'CHEBI:66871': ['http://purl.obolibrary.org/obo/chebi/formula "FH4N" xsd:string'],
 'CHEBI:66872': ['http://purl.obolibrary.org/obo/chebi/formula "FK" xsd:string'],
 'CH

## Uniprot-GO-ChEBI frame with ancestor chebi terms

In [18]:
# Attach the ChEBI ancestor annotations to every (protein, GO ancestor) row.
# The join key is the GO term name; the left join keeps GO terms that have no
# ChEBI annotation (their chebi columns stay NaN).
uniprot_go_chebi = pd.merge(
    df_uniprot_goa.drop(columns=["go_id", "go_term"]),
    go_chebi.loc[
        :, ["go_id", "go_term", "chebi_id_ancestor", "chebi_term_ancestor"]
    ].drop_duplicates(),
    how="left",
    left_on="go_term_ancestor",
    right_on="go_term",
).drop(columns=["go_id", "go_term"])

uniprot_go_chebi

Unnamed: 0,Uniprot,qualifier,evidence_code,aspect,go_id_ancestor,go_term_ancestor,chebi_id_ancestor,chebi_term_ancestor
0,A0A014M993,enables,IEA,F,GO:0015288,porin activity,,
1,A0A014M993,enables,IEA,F,GO:0022829,wide pore channel activity,,
2,A0A014M993,enables,IEA,F,GO:0015267,channel activity,,
3,A0A014M993,enables,IEA,F,GO:0022857,transmembrane transporter activity,,
4,A0A014M993,enables,IEA,F,GO:0022803,passive transmembrane transporter activity,,
...,...,...,...,...,...,...,...,...
6474854,Z4YKJ7,enables,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...,CHEBI:36916,cation
6474855,Z4YKJ7,enables,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...,CHEBI:24870,ion
6474856,Z4YKJ7,enables,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...,CHEBI:23367,molecular entity
6474857,Z4YKJ7,enables,IEA,F,GO:0022890,inorganic cation transmembrane transporter act...,CHEBI:24431,chemical entity


## Fill in missing ChEBI terms

In [19]:
# Drop the Uniprot accessions: keep only the distinct relations between
# GO terms and ChEBI terms, renumbered from zero.
go_chebi_map = uniprot_go_chebi.loc[
    :,
    [
        "go_id_ancestor",
        "go_term_ancestor",
        "chebi_id_ancestor",
        "chebi_term_ancestor",
    ],
].drop_duplicates(ignore_index=True)

In [None]:
def annotate_go_term_molecule(
    go_chebi_map,
    go_search_string,
    chebi_id,
    chebi_term,
    excluded_go_terms,
):
    """Annotate every GO term whose name contains a keyword with a ChEBI term.

    Parameters
    ----------
    go_chebi_map : pd.DataFrame
        Frame with the columns ``go_id_ancestor``, ``go_term_ancestor``,
        ``chebi_id_ancestor`` and ``chebi_term_ancestor``. It is not modified.
    go_search_string : str
        Literal (non-regex, case-sensitive) substring searched in
        ``go_term_ancestor``.
    chebi_id, chebi_term : str
        ChEBI identifier and name assigned to the matching GO terms.
    excluded_go_terms : collection of str or None
        Exact GO term names that must not be annotated even though they
        contain the keyword. Any iterable works (an empty ``{}`` included).

    Returns
    -------
    pd.DataFrame
        A new frame in which the un-annotated (NaN ChEBI) rows of the matching
        GO terms are replaced by rows carrying ``chebi_id``/``chebi_term``.
        Existing annotations are kept, duplicates are removed, and the rows
        are sorted by GO id and ChEBI id with a fresh 0..n-1 index.
    """
    excluded = list(excluded_go_terms) if excluded_go_terms else []

    # Rows whose GO term contains the keyword and is not explicitly excluded.
    # na=False treats a missing GO term name as "no match" instead of
    # propagating NaN into the boolean mask.
    is_match = go_chebi_map.go_term_ancestor.str.contains(
        go_search_string, regex=False, na=False
    ) & ~go_chebi_map.go_term_ancestor.isin(excluded)

    # Matching rows without a ChEBI term are placeholders that get replaced.
    # They are removed with a boolean mask rather than by index label, so rows
    # that merely share an index label with a placeholder are never dropped.
    is_placeholder = is_match & go_chebi_map.chebi_id_ancestor.isna()

    # One new row per matching GO term, mapped to the given ChEBI id/term.
    new_rows = (
        go_chebi_map.loc[is_match, ["go_id_ancestor", "go_term_ancestor"]]
        .drop_duplicates()
        .assign(chebi_id_ancestor=chebi_id, chebi_term_ancestor=chebi_term)
    )

    # drop_duplicates covers GO terms that already carried this annotation.
    return (
        pd.concat([go_chebi_map[~is_placeholder], new_rows], axis=0)
        .drop_duplicates()
        .sort_values(["go_id_ancestor", "chebi_id_ancestor"])
        .reset_index(drop=True)
    )




# Manual annotations for GO terms that have no ChEBI term yet.
# Each entry: (keyword searched in the GO term name, ChEBI term, ChEBI id,
# exact GO term names to leave out although they contain the keyword).
# The entries are applied in order; GO term names are matched case-sensitively.
keyword_chebiterm_chebi_id = [
    ("protein", "protein", "CHEBI:36080", set()),  # TODO phosphotransferase?
    ("monoatomic ion", "monoatomic ion", "CHEBI:24867", set()),
    ("monoatomic anion", "monoatomic anion", "CHEBI:23905", set()),
    ("monoatomic cation", "monoatomic cation", "CHEBI:23906", set()),
    ("zinc", "zinc(2+)", "CHEBI:29105", set()),
    ("NAD(P)+", "NAD(P)+", "CHEBI:13390", set()),
    ("NADH", "NADH", "CHEBI:16908", set()),
    (
        "calcium",
        "calcium(2+)",
        "CHEBI:29108",
        {
            # calcium regulates these channels, it is not the substrate
            "small conductance calcium-activated potassium channel activity",
            "intermediate conductance calcium-activated potassium channel activity",
            "large conductance calcium-activated potassium channel activity",
        },
    ),
    ("arsenate ion", "arsenate ion", "CHEBI:22629", set()),
    ("amino acid", "amino acid", "CHEBI:33709", set()),
    ("sulfur compound", "sulfur molecular entity", "CHEBI:26835", set()),
    ("salt", "salt", "CHEBI:24866", set()),
    (
        "poly-beta-1,6-N-acetyl-D-glucosamine",
        "poly-beta-1,6-N-acetyl-D-glucosamine",
        "CHEBI:67063",
        set(),
    ),
    ("carbohydrate derivative", "carbohydrate derivative", "CHEBI:63299", set()),
    ("oligopeptide", "oligopeptide", "CHEBI:25676", set()),
    ("lead ion", "lead(2+)", "CHEBI:49807", set()),
    ("oligosaccharide", "oligosaccharide", "CHEBI:50699", set()),
    (
        "sodium",
        "sodium(1+)",
        "CHEBI:29101",
        {
            "sodium-dependent organic anion transmembrane transporter activity",
            "sodium-independent organic anion transmembrane transporter activity",
            "sodium-dependent multivitamin transmembrane transporter activity",
            "intracellular sodium activated potassium channel activity",
        },
    ),
    ("methionine", "methionine", "CHEBI:16811", set()),
    (
        "proton",
        "proton",
        "CHEBI:24636",
        {
            "proton motive force dependent protein transmembrane transporter activity",
            "proton-dependent oligopeptide secondary active transmembrane transporter activity",
        },
    ),
    ("peptidoglycan", "peptidoglycan", "CHEBI:8005", set()),
    (
        "polysaccharide",
        "polysaccharide",
        "CHEBI:18154",
        {"lipopolysaccharide transmembrane transporter activity"},
    ),
    ("lipopolysaccharide", "lipopolysaccharide", "CHEBI:16412", set()),
    ("secondary active ammonium transmembrane", "ammonium", "CHEBI:28938", set()),
    ("quaternary ammonium", "quaternary ammonium ion", "CHEBI:35267", set()),
    (
        "hexose",
        "hexose",
        "CHEBI:18133",
        {
            "hexose phosphate transmembrane transporter activity",
            "hexose-phosphate:inorganic phosphate antiporter activity",
        },
    ),
    ("hexose phosphate", "hexose phosphate", "CHEBI:47878", set()),
    ("hexose-phosphate", "hexose phosphate", "CHEBI:47878", set()),
]

for search_term, chebi_term, chebi_id, exclude_set in keyword_chebiterm_chebi_id:
    go_chebi_map = annotate_go_term_molecule(
        go_chebi_map,
        go_search_string=search_term,
        chebi_id=chebi_id,
        chebi_term=chebi_term,
        excluded_go_terms=exclude_set,
    )
    # show every GO term containing the keyword together with its annotation,
    # so that wrong matches can be spotted and added to the exclusion set
    print("=" * 180)
    print(
        go_chebi_map[
            go_chebi_map.go_term_ancestor.str.contains(search_term, regex=False)
        ][["go_term_ancestor", "chebi_term_ancestor"]].to_string()
    )

                                                                                          go_term_ancestor chebi_term_ancestor
324                                                             protein transmembrane transporter activity             protein
398                                                                      protein-exporting ATPase activity             protein
399                                                     mitochondrial protein-transporting ATPase activity             protein
405                                       protein-N(PI)-phosphohistidine-sugar phosphotransferase activity             protein
428                               proton motive force dependent protein transmembrane transporter activity             protein
889                                                                   protein-transporting ATPase activity             protein
892                                                                  ABC-type protein transporter activity     

In [None]:
# Number of distinct proteins per GO term name in the go_term_ancestor column.
# GroupBy.size() counts the rows of each group directly; it replaces
# .apply(len), which runs a Python call per group and triggers the pandas
# deprecation warning about apply operating on the grouping columns.
go_term_to_samples = (
    df_uniprot_goa[["Uniprot", "go_term_ancestor"]]
    .drop_duplicates()
    .groupby("go_term_ancestor")
    .size()
)
go_term_to_samples

go_term_ancestor
(+)-abscisic acid D-glucopyranosyl ester transmembrane transporter activity      2
(R)-carnitine transmembrane transporter activity                                19
(R)-carnitine:4-(trimethylammonio)butanoate antiporter activity                  2
2-keto-3-deoxygluconate:proton symporter activity                                3
3'-phosphoadenosine 5'-phosphosulfate transmembrane transporter activity        18
                                                                              ... 
zinc efflux active transmembrane transporter activity                            3
zinc efflux transmembrane transporter activity                                  24
zinc ion transmembrane transporter activity                                    587
zinc:bicarbonate symporter activity                                              4
zinc:proton antiporter activity                                                 20
Length: 806, dtype: int64

In [20]:
# GO terms that still have no ChEBI annotation
go_chebi_map.loc[go_chebi_map["chebi_id_ancestor"].isna()]

Unnamed: 0,go_id_ancestor,go_term_ancestor,chebi_id_ancestor,chebi_term_ancestor
0,GO:0015288,porin activity,,
1,GO:0022829,wide pore channel activity,,
2,GO:0015267,channel activity,,
3,GO:0022857,transmembrane transporter activity,,
4,GO:0022803,passive transmembrane transporter activity,,
...,...,...,...,...
13893,GO:0030233,deoxynucleotide transmembrane transporter acti...,,
13912,GO:0010290,chlorophyll catabolite transmembrane transport...,,
13983,GO:0015089,high-affinity copper ion transmembrane transpo...,,
13984,GO:0034216,high-affinity thiamine:proton symporter activity,,


In [27]:
# Un-annotated GO terms ordered by the number of proteins they cover, shown
# one page (rows 250-299) at a time. The NaN filter is evaluated on the merged
# frame itself instead of a mask built from go_chebi_map, so it no longer
# depends on the merge preserving the left frame's index.
(
    go_chebi_map.merge(
        go_term_to_samples.rename("samples"),
        how="left",
        left_on="go_term_ancestor",
        right_index=True,
    )
    .loc[lambda merged: merged.chebi_id_ancestor.isna()]
    .sort_values("samples", ascending=False)
    .iloc[250:300]
)

Unnamed: 0,go_id_ancestor,go_term_ancestor,chebi_id_ancestor,chebi_term_ancestor,samples
13669,GO:0140417,ATP-sensitive calcium-release channel activity,,,1
13984,GO:0034216,high-affinity thiamine:proton symporter activity,,,1
11606,GO:0042960,antimonite secondary active transmembrane tran...,,,1
11634,GO:0043893,acetate:monoatomic cation symporter activity,,,1
11133,GO:0015503,glutathione-regulated potassium exporter activity,,,1
11071,GO:0015156,melibiose transmembrane transporter activity,,,1
13985,GO:0034215,thiamine:proton symporter activity,,,1
11174,GO:0099520,monoatomic ion antiporter activity involved in...,,,1
11276,GO:0009675,high-affinity sulfate:proton symporter activity,,,1
11277,GO:0008512,sulfate:proton symporter activity,,,1


## Substrates with smiles that are not annotated:

proton

maltose

calcium

chloride 

adenosine 3',5'-bisphosphate

potassium 

sodium

carboxylic acid

L-glutamate

zinc 

methionine 

UDP-N-acetylgalactosamine

iron-nicotianamine

beta-glucoside	CHEBI:22798 - β-D-glucoside

cystine CHEBI:17376 - cystine

nitrate CHEBI:17632 - nitrate

NAD(P)+

lysine 

auxin CHEBI:142254 - auxin a

ammonium CHEBI:28938 - ammonium 

glucose CHEBI:4167 - D-glucopyranose

L-histidine

-L-ascorbate

tryptophan 

L-sorbose

copper

thiamine

acetate

sulfate





In [8]:
# GO terms mentioning "hexose" that are still without a ChEBI annotation
print(
    go_chebi_map.loc[
        go_chebi_map.chebi_id_ancestor.isna()
        & go_chebi_map.go_term_ancestor.str.contains("hexose"),
        ["go_term_ancestor", "chebi_term_ancestor"],
    ].to_string()
)

Empty DataFrame
Columns: [go_term_ancestor, chebi_term_ancestor]
Index: []


In [9]:
# All GO terms whose name contains "organic anion", with their ChEBI annotation
print(
    go_chebi_map.loc[
        go_chebi_map.go_term_ancestor.str.contains("organic anion"),
        ["go_term_ancestor", "chebi_term_ancestor"],
    ].to_string()
)

                                                         go_term_ancestor chebi_term_ancestor
291                            solute:inorganic anion antiporter activity                 NaN
372                      organic anion transmembrane transporter activity                 NaN
486                    inorganic anion transmembrane transporter activity                 NaN
773   sodium-independent organic anion transmembrane transporter activity                 NaN
1286    ATPase-coupled inorganic anion transmembrane transporter activity                 NaN
1287    sodium-dependent organic anion transmembrane transporter activity                 NaN


In [53]:
# TODO add ancestors again!
# TODO symporters and antiporters with only one chebi substrate

In [2]:
# # for key, val in searchterm_to_chebi.items():
# #     go_chebi_direct = go_chebi_direct.assign(
# #         substrates=np.where(
# #             go_chebi_direct.go_term_ancestor.str.contains(key)
# #             & go_chebi_direct.substrates.isna(),
# #             val,
# #             go_chebi_direct.substrates,
# #         )
# #     )

# from rpy2.robjects import r, StrVector, packages, pandas2ri
# import rpy2.robjects as ro

# packages.importr("GOSemSim")
# packages.importr("org.Sc.sgd.db")
# packages.importr("Rcpp")
# godata_func = r["godata"]

# go_data_r = godata_func(
#     "org.Sc.sgd.db",
#     # keytype="ENTREZID",
#     ont="MF",
#     computeIC=True,
#     processTCSS=True,
#     # cutoff=TODO
# )

# matrix = r["matrix"](
#     0.0,
#     nrow=len(go_terms),
#     ncol=len(go_terms),

# )

R[write to console]: Error in (function ()  : 
  org.Sc.sgdPFAM is defunct. Please use select() if you need access to
  PFAM or PROSITE accessions.

  warn(str(rre))
R[write to console]: preparing gene to GO mapping data...

R[write to console]: preparing IC data...

R[write to console]: preparing TCSS data...

R[write to console]: As cutoff value is not provided, default value based on human will be used



In [1]:
go_terms = ['GO:1901505', 'GO:0015215', 'GO:0015605', 'GO:0000295',
       'GO:0015216', 'GO:0008514', 'GO:0022857', 'GO:0015932',
       'GO:0005347', 'GO:1901702', 'GO:0005346', 'GO:0015291',
       'GO:0005471', 'GO:0022804', 'GO:0015217', 'GO:0015297',
       'GO:0022890', 'GO:0015399', 'GO:0015075', 'GO:0008324',
       'GO:0015453', 'GO:0022853', 'GO:0015318', 'GO:0008121',
       'GO:0015078', 'GO:0004129', 'GO:0022803', 'GO:0005261',
       'GO:0005216', 'GO:0015252', 'GO:0046933', 'GO:0015267',
       'GO:0044769', 'GO:0019829', 'GO:0046961', 'GO:0042626',
       'GO:0042625', 'GO:0009678', 'GO:0046943', 'GO:0005342',
       'GO:0015171', 'GO:0015174', 'GO:0061459', 'GO:0015179',
       'GO:0072349', 'GO:0022832', 'GO:0008509', 'GO:0022839',
       'GO:0022836', 'GO:0008308', 'GO:0005253', 'GO:0005244',
       'GO:0022829', 'GO:0015288', 'GO:0140358', 'GO:0008553',
       'GO:0015662', 'GO:0015173', 'GO:1901474', 'GO:0005290',
       'GO:0005287', 'GO:0005291', 'GO:0008320', 'GO:0022884',
       'GO:0015450', 'GO:0005372', 'GO:0015250', 'GO:0015294',
       'GO:0005351', 'GO:0015295', 'GO:0005402', 'GO:0015144',
       'GO:0015293', 'GO:0015157', 'GO:0015154', 'GO:0005363',
       'GO:0015151', 'GO:0042947', 'GO:0008566', 'GO:0046915',
       'GO:0046873', 'GO:0005381', 'GO:0015093', 'GO:0015079',
       'GO:0140107', 'GO:0035673', 'GO:0015421', 'GO:0042887',
       'GO:0140359', 'GO:1904680', 'GO:0015440', 'GO:0015103',
       'GO:0080139', 'GO:0015562', 'GO:0015149', 'GO:0015145',
       'GO:0051119', 'GO:0005354', 'GO:0005355', 'GO:0015085',
       'GO:0005388', 'GO:0005384', 'GO:0140613', 'GO:0015081',
       'GO:0008554', 'GO:0008556', 'GO:0140679', 'GO:0015086',
       'GO:0015127', 'GO:0005310', 'GO:0015124', 'GO:0071916',
       'GO:0015205', 'GO:0015212', 'GO:0015214', 'GO:0005337',
       'GO:0015175', 'GO:0001761', 'GO:0015192', 'GO:0015193',
       'GO:0015203', 'GO:0015651', 'GO:0015101', 'GO:0015220',
       'GO:0005275', 'GO:1901618', 'GO:0034228', 'GO:0015665',
       'GO:0015226', 'GO:1901235', 'GO:0005385', 'GO:0008028',
       'GO:0089721', 'GO:0005353', 'GO:0015146', 'GO:0015578',
       'GO:0005315', 'GO:0015168', 'GO:0015166', 'GO:0015254',
       'GO:0097079', 'GO:0001406', 'GO:0015169', 'GO:0015343',
       'GO:0034634', 'GO:0008519', 'GO:0015123', 'GO:0090482',
       'GO:0015233', 'GO:0005365', 'GO:0005366', 'GO:0034257',
       'GO:0000064', 'GO:0015187', 'GO:1901682', 'GO:0015116',
       'GO:0015131', 'GO:0015556', 'GO:0034658', 'GO:1901239',
       'GO:0008137', 'GO:0033284', 'GO:0033283', 'GO:0033285',
       'GO:0008559', 'GO:0042910', 'GO:0015087', 'GO:0000006',
       'GO:0015489', 'GO:0005280', 'GO:0015185', 'GO:0005416',
       'GO:0015355', 'GO:0015495', 'GO:0042937', 'GO:0015141',
       'GO:0015138', 'GO:0005469', 'GO:0015204', 'GO:0015606',
       'GO:0015095', 'GO:0015136', 'GO:0005302', 'GO:0015186',
       'GO:0015188', 'GO:0015658', 'GO:0005247', 'GO:0015108',
       'GO:0005254', 'GO:0015196', 'GO:0022893', 'GO:0015244',
       'GO:0015218', 'GO:0005352', 'GO:0015213', 'GO:0008521',
       'GO:0071077', 'GO:0015296', 'GO:0015377', 'GO:0015379',
       'GO:0008271', 'GO:0008551', 'GO:0015370', 'GO:0005436',
       'GO:0015319', 'GO:0015228', 'GO:0043865', 'GO:0015191',
       'GO:0000099', 'GO:0015513', 'GO:0015112', 'GO:0050833',
       'GO:0005375', 'GO:0000095', 'GO:0005300', 'GO:0001409',
       'GO:0043682', 'GO:0140581', 'GO:0015431', 'GO:0015434',
       'GO:0015230', 'GO:0051724', 'GO:0005338', 'GO:0005462',
       'GO:0015165', 'GO:0015172', 'GO:0005313', 'GO:0015183',
       'GO:0015189', 'GO:0022889', 'GO:0015194', 'GO:0005458',
       'GO:0036080', 'GO:0015386', 'GO:0051139', 'GO:0022821',
       'GO:0140828', 'GO:0005249', 'GO:0005267', 'GO:0022843',
       'GO:0022841', 'GO:0022842', 'GO:0022840', 'GO:0015114',
       'GO:0005477', 'GO:0005427', 'GO:0015322', 'GO:0005262',
       'GO:0008381', 'GO:0015275', 'GO:0140135', 'GO:0000319',
       'GO:0015369', 'GO:0015368', 'GO:0015385', 'GO:0005245',
       'GO:0008331', 'GO:0005294', 'GO:0000102', 'GO:1901680',
       'GO:0005364', 'GO:0015574', 'GO:0015225', 'GO:0015234',
       'GO:0090422', 'GO:0000297', 'GO:0005452', 'GO:0005227',
       'GO:0005371', 'GO:0005274', 'GO:1903089', 'GO:0015117',
       'GO:0015140', 'GO:0015104', 'GO:0015105', 'GO:0015620',
       'GO:0032217', 'GO:1903425', 'GO:0033229', 'GO:0046964',
       'GO:1902557', 'GO:0005476', 'GO:0015227', 'GO:0005272',
       'GO:0000100', 'GO:0000007', 'GO:0005459', 'GO:0005460']

# Small sample of GO ids (the first 24 entries of go_terms) for quick experiments
go_terms_sample = [
    f"GO:{accession}"
    for accession in (
        "1901505 0015215 0015605 0000295 0015216 0008514 "
        "0022857 0015932 0005347 1901702 0005346 0015291 "
        "0005471 0022804 0015217 0015297 0022890 0015399 "
        "0015075 0008324 0015453 0022853 0015318 0008121"
    ).split()
]


In [9]:
from rpy2.robjects import r, StrVector, packages, pandas2ri
import rpy2.robjects as ro


def get_semantic_similarity(go_terms, measure, organism, ont="MF"):
    """Compute all-vs-all semantic similarities of GO terms with GOSemSim (R).

    The GO terms are passed to GOSemSim's ``mgoSim`` as both the first and the
    second term set with ``combine=NULL``, i.e. without aggregating the
    pairwise scores.

    Parameters
    ----------
    go_terms
        GO identifiers to compare with each other.
    measure
        Similarity measure; must be one of "Resnik", "Lin", "Rel", "Jiang",
        "TCSS", "Wang".
    organism
        Organism whose annotation database is used; only "yeast"
        (org.Sc.sgd.db) is supported so far.
    ont
        GO sub-ontology, one of "CC", "MF", "BP".

    Returns
    -------
    The R data.frame of scores converted with the pandas2ri converter
    (displayed below as a frame with GO ids as row and column labels).

    Raises
    ------
    AssertionError
        If measure, organism or ont is not one of the supported values.
    """
    # DOC: https://yulab-smu.top/biomedical-knowledge-mining-book/GOSemSim.html
    assert measure in {"Resnik", "Lin", "Rel", "Jiang", "TCSS", "Wang"}
    assert organism in {"yeast"}
    assert ont in {"CC", "MF", "BP"}

    # load the R packages into the embedded R session before the helper is defined
    packages.importr("GOSemSim")
    # add libraries for other organism here, also install via conda
    # alternative: use AnnotationForge to create custom org.db
    organism_to_orgdb = {"yeast": "org.Sc.sgd.db"}
    packages.importr(organism_to_orgdb[organism])
    packages.importr("Rcpp")
    # (Re)define the R helper on every call: it builds the GOSemSim annotation
    # data (with information content and TCSS data) and scores the term set
    # against itself.
    r(
      """
      get_semantic_similarity_scores <- function(go_filter_input,measure, ont, orgDbName){
        # also add libraries for other organism here:
        go_annot <- godata(
          OrgDb = orgDbName,
          ont=ont, 
          computeIC = TRUE, 
          processTCSS = TRUE,
          # keytype = "UNIPROT"
          )
        
        # unique_go_terms <- unique(as.vector(go_annot@geneAnno[["GO"]]))
        # go_filter <- c()
        # for (go_term in go_filter_input){
        #   if (go_term %in% unique_go_terms){
        #     go_filter <- c(go_filter, go_term)
        #   }
        # }
        
        values <- mgoSim(as.character(go_filter_input),as.character(go_filter_input),semData=go_annot,measure=measure,combine=NULL)
        
        return (as.data.frame(values))
      }
    """
    )

    # TODO tcss cutoff?
    matr = r["get_semantic_similarity_scores"](
        go_terms, measure=measure, ont=ont, orgDbName=organism_to_orgdb[organism]
    )

    # convert the returned R data.frame with the pandas-aware converter
    with (ro.default_converter + pandas2ri.converter).context():
        df_go_similarity = ro.conversion.get_conversion().rpy2py(matr)

    return df_go_similarity


# Resnik similarity of all GO terms in go_terms, using the yeast annotations
get_semantic_similarity(go_terms, "Resnik", "yeast", "MF")

R[write to console]: Error in (function ()  : 
  org.Sc.sgdPFAM is defunct. Please use select() if you need access to
  PFAM or PROSITE accessions.



  warn(str(rre))
R[write to console]: preparing gene to GO mapping data...

R[write to console]: preparing IC data...

R[write to console]: preparing TCSS data...

R[write to console]: As cutoff value is not provided, default value based on human will be used



Unnamed: 0,GO:1901505,GO:0015215,GO:0015605,GO:0000295,GO:0015216,GO:0008514,GO:0022857,GO:0015932,GO:0005347,GO:1901702,...,GO:0033229,GO:0046964,GO:1902557,GO:0005476,GO:0015227,GO:0005272,GO:0000100,GO:0000007,GO:0005459,GO:0005460
GO:1901505,0.579,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.579,0.299,...,0.299,0.579,0.579,0.299,0.299,0.299,0.299,0.299,0.579,0.579
GO:0015215,0.299,0.598,0.585,0.598,0.598,0.299,0.299,0.569,0.598,0.299,...,0.299,0.598,0.598,0.299,0.299,0.299,0.299,0.299,0.569,0.569
GO:0015605,0.299,0.585,0.585,0.585,0.585,0.299,0.299,0.299,0.585,0.299,...,0.299,0.585,0.585,0.299,0.299,0.299,0.299,0.299,0.299,0.299
GO:0000295,0.299,0.598,0.585,0.614,0.611,0.299,0.299,0.569,0.614,0.299,...,0.299,0.614,0.614,0.299,0.299,0.299,0.299,0.299,0.569,0.569
GO:0015216,0.299,0.598,0.585,0.611,0.611,0.299,0.299,0.569,0.611,0.299,...,0.299,0.611,0.611,0.299,0.299,0.299,0.299,0.299,0.569,0.569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:0005272,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.475,0.475,...,0.299,0.299,0.299,0.299,0.299,1.000,0.299,0.497,0.299,0.299
GO:0000100,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.299,...,0.616,0.616,0.616,0.765,0.299,0.299,1.000,0.299,0.299,0.299
GO:0000007,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.299,0.475,0.475,...,0.299,0.299,0.299,0.299,0.299,0.497,0.299,0.932,0.299,0.299
GO:0005459,0.579,0.569,0.299,0.569,0.569,0.299,0.299,0.569,0.579,0.299,...,0.299,0.579,0.579,0.299,0.299,0.299,0.299,0.299,0.932,0.842


# Testing for additional GO similarity measures

In [1]:
from subpred.go_semantic_similarity import get_go_data_df

# Export the GO data of organism 559292 for aspect "F" as a tab-separated file.
# NOTE(review): aspect "F" presumably selects the molecular function ontology — confirm.
df_go_yeast = get_go_data_df({559292}, aspect="F")
df_go_yeast.to_csv("go_yeast.tsv", sep="\t")

# OLD CODE

In [1]:
from subpred.transmembrane_transporters import get_transmembrane_transporter_dataset

# Legacy cell: loads the dataset for organism 9606 and rebinds df_sequences,
# df_uniprot_goa and df_go_chebi, replacing the frames created by the first cell.
df_sequences, df_uniprot_goa, df_go_chebi = get_transmembrane_transporter_dataset(
    organism_ids={9606},
    swissprot_only=False,
    datasets_path="../data/datasets/",
    exclude_iea_go_terms=False,
    max_sequence_evidence_code=1,
)
df_uniprot_goa

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity


In [5]:
df_go_chebi

Unnamed: 0,go_id,chebi_id,chebi_term,chebi_go_relation
0,GO:0000064,CHEBI:46911,L-ornithinium(1+),has_primary_input
1,GO:0000095,CHEBI:59789,S-adenosyl-L-methionine zwitterion,has_primary_input
2,GO:0000099,CHEBI:26834,sulfur-containing amino acid,has_primary_input
3,GO:0000295,CHEBI:61293,adenyl nucleotide,has_primary_input
4,GO:0000514,CHEBI:14321,glutamate(1-),has_primary_input
...,...,...,...,...
693,GO:1901480,CHEBI:30823,oleate,has_primary_input
694,GO:1901505,CHEBI:63299,carbohydrate derivative,has_primary_input
695,GO:1901682,CHEBI:26835,sulfur molecular entity,has_primary_input
696,GO:1902282,CHEBI:29103,potassium(1+),has_primary_input


In [None]:
# Primary-input GO-ChEBI annotations whose ChEBI id has molecular properties.
# The index is renumbered after the first filter only, as before.
df_go_chebi_primary = (
    df_go_chebi.loc[lambda df: df.chebi_go_relation == "has_primary_input"]
    .reset_index(drop=True)
    .loc[lambda df: df.chebi_id.isin(chebi_id_to_properties.keys())]
)
df_go_chebi_primary

## Merge

In [3]:
import pandas as pd

# Add the sequence data (indexed by Uniprot accession) to every GO annotation
# row; the inner join drops annotations of proteins without a sequence entry.
df_uniprot_go_transporter = df_uniprot_goa.merge(
    df_sequences,
    how="inner",
    left_on="Uniprot",
    right_index=True,
).reset_index(drop=True)
df_uniprot_go_transporter

Unnamed: 0,Uniprot,qualifier,go_id,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,sequence,reviewed,protein_existence,organism_id,protein_names
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MQGARAPRDQGQSPGRMSALGRSSVILLTYVLAATELTCLFMQFSI...,False,1,9606,Solute carrier family 22 (Organic cation trans...
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MSTAIREVGVWRQTRTLLLKNYLIKCRTKKSSVQEILFPLFFLFWL...,False,1,9606,Cholesterol transporter ABCA5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67172,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67173,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67174,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)
67175,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,MRLPDLRPWTSLLLVDAALLWLLQGPLGTLLPQGLPGLWLEGTLRL...,False,1,9606,Antigen peptide transporter 2 (TAP2)


In [4]:
# Left join of the GO annotations (by ancestor GO id) with the GO-ChEBI
# annotations; GO terms without a ChEBI annotation keep NaN in the ChEBI columns.
df_uniprot_goa_chebi = pd.merge(
    df_uniprot_goa,
    df_go_chebi,
    how="left",
    left_on="go_id_ancestor",
    right_on="go_id",
)
df_uniprot_goa_chebi

Unnamed: 0,Uniprot,qualifier,go_id_x,go_term,evidence_code,aspect,go_id_ancestor,go_term_ancestor,go_id_y,chebi_id,chebi_term,chebi_go_relation
0,A0A024RCG2,enables,GO:0022857,transmembrane transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
1,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0140359,ABC-type transporter activity,,,,
2,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022857,transmembrane transporter activity,,,,
3,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0015399,primary active transmembrane transporter activity,,,,
4,A0A075B778,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
75020,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0042626,ATPase-coupled transmembrane transporter activity,,,,
75021,X5CMH5,enables,GO:0140359,ABC-type transporter activity,IEA,F,GO:0022804,active transmembrane transporter activity,,,,
75022,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:0042887,amide transmembrane transporter activity,GO:0042887,CHEBI:32988,amide,has_primary_input
75023,X5CMH5,enables,GO:1904680,peptide transmembrane transporter activity,IEA,F,GO:1904680,peptide transmembrane transporter activity,GO:1904680,CHEBI:16670,peptide,has_primary_input
