# Improving the annotation of metabolite transporters

First off, we are importing packages that might come in handy.

In [1]:
import ast
import pickle
from io import StringIO

import networkx as nx
import pandas as pd
import requests
from Bio import SeqIO

Importing the most recent TC numbers, the corresponding ChEBI IDs of substrates, UniProt IDs and GO terms from TCDB.

In [2]:
# TCDB endpoints downloaded by fetch_data() and decoded by parse_data().
# TC number -> substrate ChEBI IDs (tab-separated text).
tc_chebi_url = "https://www.tcdb.org/cgi-bin/substrates/getSubstrates.py"
# UniProt accession / TC number pairs (tab-separated text).
uniprot_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/acc2tcid.py"
# GO term / TC number pairs (tab-separated text; only the first two columns are used).
go_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/go.py"
# Protein sequences in FASTA format.
fasta_tcdb_url = "https://www.tcdb.org/public/tcdb"

def fetch_data(url, timeout=60):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled server would
        block the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def _nonblank_lines(text):
    """Split ``text`` on newlines, dropping blank lines and trailing carriage returns."""
    return [line.rstrip("\r") for line in text.strip().split("\n") if line.strip()]


def parse_data(tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text):
    """Turn the four raw TCDB text dumps into DataFrames.

    Parameters
    ----------
    tc_chebi_text : str
        Lines of ``<TC number>\\t<entry>|<entry>|...`` where each entry starts
        with ``CHEBI:<id>`` optionally followed by ``;<anything>``.
    uniprot_tc_text : str
        Two tab-separated fields per line (UniProt ID, TC number).
    go_tc_text : str
        Tab-separated lines; only the first two fields (GO term, TC number)
        are kept.
    fasta_tcdb_text : str
        FASTA text whose headers are ``|``-separated, with the UniProt ID in
        the third field and the TC number leading the fourth field.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_chebi, df_uniprot, df_go, df_fasta)`` with columns
        ``["TC Number", "CHEBI IDs"]`` (a list of ID strings per row),
        ``["UniProt ID", "TC Number"]``, ``["GO Term", "TC Number"]`` and
        ``["TC Number", "UniProt ID", "AA Sequence"]``.

    Raises
    ------
    ValueError
        If a non-blank TC-ChEBI line does not hold exactly two tab-separated fields.
    IndexError
        If a FASTA header has fewer than four ``|``-separated fields.
    """
    # TC-CHEBI: blank lines and Windows line endings are tolerated so that a
    # stray empty line cannot break the two-field unpacking below.
    tc_chebi_data = []
    for line in _nonblank_lines(tc_chebi_text):
        tc_number, chebi_ids = line.split("\t")
        chebi_id_list = [
            entry.split(";")[0].replace("CHEBI:", "") for entry in chebi_ids.split("|")
        ]
        tc_chebi_data.append([tc_number, chebi_id_list])
    df_chebi = pd.DataFrame(tc_chebi_data, columns=["TC Number", "CHEBI IDs"])

    # UniProt-TC
    uniprot_tc_data = [line.split("\t") for line in _nonblank_lines(uniprot_tc_text)]
    df_uniprot = pd.DataFrame(uniprot_tc_data, columns=["UniProt ID", "TC Number"])

    # GO-TC (any columns after the first two are ignored)
    go_tc_data = [line.split("\t")[:2] for line in _nonblank_lines(go_tc_text)]
    df_go = pd.DataFrame(go_tc_data, columns=["GO Term", "TC Number"])

    # FASTA-TC
    tc_data = []
    for record in SeqIO.parse(StringIO(fasta_tcdb_text), "fasta"):
        header_fields = record.description.split("|")
        uniprot = header_fields[2]
        # The fourth field is "<TC number> <free-text description>".
        tc_number = header_fields[3].split()[0]
        tc_data.append([tc_number, uniprot, str(record.seq)])
    df_fasta = pd.DataFrame(tc_data, columns=["TC Number", "UniProt ID", "AA Sequence"])

    return df_chebi, df_uniprot, df_go, df_fasta

In [3]:
# Download the four raw TCDB dumps (requires network access). Kept in its own
# cell so the parsing cell below can be re-run without downloading again.
tc_chebi_text = fetch_data(tc_chebi_url)
uniprot_tc_text = fetch_data(uniprot_tc_url)
go_tc_text = fetch_data(go_tc_url)
fasta_tcdb_text = fetch_data(fasta_tcdb_url)

In [22]:
df_chebi, df_uniprot, df_go, df_fasta = parse_data(tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text)

# One row per (TC number, sequence, ChEBI ID, GO term): attach sequences to the
# substrate table, unpack the per-TC list of ChEBI IDs, then attach GO terms.
# Left joins keep TC numbers that have no sequence or no GO term.
df_merged = (
    df_chebi
    .merge(df_fasta, on="TC Number", how="left")
    .explode("CHEBI IDs")
    .merge(df_go[["TC Number", "GO Term"]], on="TC Number", how="left")
)
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term
0,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016021
1,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337
2,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853
3,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869
4,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854
...,...,...,...,...,...
154251,2.A.1.49.2,37550,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085
154252,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016021
154253,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267
154254,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515


Applying the correct GO terms and labels, because TCDB inconsistently refers to either the main ID or an alternative ID of a GO term. The file go_term_label.tsv is based on the go.owl file from geneontology.org.

In [23]:
# Table with the columns "GO Term", "GO Label" and "Alternative ID".
go = pd.read_csv("go/go_term_label.tsv", sep="\t")
# "GO Term" -> "GO Label" lookup.
go_term_to_label = go.set_index("GO Term")["GO Label"].to_dict()

# "Alternative ID" -> "GO Term" lookup.
# NOTE(review): presumably one row per (GO Term, Alternative ID) pair — confirm against the TSV.
alt_id_to_main = go.set_index("Alternative ID")["GO Term"].to_dict()

def resolve_go_term_and_label(go_term):
    """Map a GO ID to its main GO term and that term's label.

    Missing values are passed through unchanged with a ``None`` label. An ID
    found in ``alt_id_to_main`` is replaced by its main term; any other ID is
    kept as is. The label is ``None`` when the main term is not in
    ``go_term_to_label``.

    Returns
    -------
    tuple
        ``(main_term, label)``.
    """
    if not pd.isna(go_term):
        canonical_term = alt_id_to_main.get(go_term, go_term)
        return canonical_term, go_term_to_label.get(canonical_term)
    return go_term, None

# Resolve every row's GO term; the (term, label) pair is expanded into two
# new columns.
df_merged[["Resolved GO Term", "GO Label"]] = df_merged["GO Term"].apply(
    lambda raw_term: pd.Series(resolve_go_term_and_label(raw_term))
)
# Replace the raw TCDB term with the resolved one under the original column name.
df_merged = (
    df_merged
    .drop(columns="GO Term")
    .rename(columns={"Resolved GO Term": "GO Term"})
)

The initial rough draft of the dataframe

In [24]:
# Remove rows that are identical in every column.
# NOTE(review): duplicates presumably appear when several alternative GO IDs
# resolve to the same main term — confirm.
df_merged = df_merged.drop_duplicates()
# Save the first-draft table to the working directory (index column omitted).
df_merged.to_csv("tcdb_data_combined.csv", index=False)
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term,GO Label
0,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane
1,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity
2,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport
3,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus
4,2.A.57.1.4,2472,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport
...,...,...,...,...,...,...
154251,2.A.1.49.2,37550,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport
154252,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane
154253,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity
154254,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding


In [25]:
# Number of distinct ChEBI IDs before mapping to primary IDs.
len(pd.unique(df_merged["CHEBI IDs"]))

1666

Making an easy way to convert from primary/secondary ChEBI ID to primary ChEBI ID only. The file primary_secondary_chebi_ids.tsv stems from ChEBI_complete_3star.sdf provided by ebi.ac.uk/chebi

In [26]:
prim_sec_chebi = pd.read_csv("chebi_data/primary_secondary_chebi_ids.tsv", sep="\t")

# secondary ChEBI ID (as a string) -> primary ChEBI ID (as an int)
secondary_to_primary = {}

for primary_raw, secondary_raw in zip(
    prim_sec_chebi["Primary_CHEBI_ID"], prim_sec_chebi["Secondary_CHEBI_IDs"]
):
    primary_id = int(primary_raw)
    # The column stores a Python list literal. literal_eval only accepts
    # literals, so nothing in the file can execute code the way eval() could.
    secondary_ids = ast.literal_eval(secondary_raw)

    for s_id in secondary_ids:
        # get_primary_id() looks keys up as strings, so store them as strings.
        secondary_to_primary[str(s_id)] = primary_id

def get_primary_id(chebi_id):
    """Return the primary ChEBI ID for ``chebi_id``.

    The input is coerced to ``int``. Its string form is then looked up in
    ``secondary_to_primary``; IDs without an entry are returned as that int.
    """
    numeric_id = int(chebi_id)
    lookup_key = str(numeric_id)
    return secondary_to_primary.get(lookup_key, numeric_id)

Convert all ChEBI IDs to primary IDs for the TCDB df

In [27]:
# Cast the ID strings to numbers (any non-numeric value raises), then map
# every ID to its primary ChEBI ID.
df_merged["CHEBI IDs"] = (
    pd.to_numeric(df_merged["CHEBI IDs"], errors="raise")
    .apply(get_primary_id)
)
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term,GO Label
0,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane
1,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity
2,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport
3,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus
4,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport
...,...,...,...,...,...,...
154251,2.A.1.49.2,37550,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport
154252,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane
154253,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity
154254,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding


In [28]:
# Number of distinct ChEBI IDs after mapping to primary IDs.
len(pd.unique(df_merged["CHEBI IDs"]))

1532

Creating the filter that is applied to only find accurate ChEBIs. See chebi_data/leaf_children.ipynb for more info regarding the filter choice.

The method used is Method 2, which gives the best performance (it keeps the most IDs for which info is obtainable).
This filter aims to keep only leaf nodes, plus parents that have no children present in the df from TCDB.

First, loading the hierarchy as of 2024-08-01.

In [97]:
# Child/parent edge list; both columns hold identifiers containing
# "CHEBI_<digits>", from which the numeric part is extracted as int.
df_hierarchy = pd.read_csv("chebi_data/chebiHierarchy.tsv", sep="\t").assign(
    child=lambda edges: edges['child'].str.extract(r'CHEBI_(\d+)').astype(int),
    parent=lambda edges: edges['parent'].str.extract(r'CHEBI_(\d+)').astype(int),
)

# Every primary ChEBI ID that occurs in the TCDB table.
all_primary_chebi_tcdb = set(df_merged["CHEBI IDs"].unique())

# Same edge list with both endpoints mapped to primary IDs.
df_hierarchy_prim = df_hierarchy.assign(
    child=df_hierarchy["child"].apply(get_primary_id),
    parent=df_hierarchy["parent"].apply(get_primary_id),
)

Finding all the parents in the original (TCDB) df that have a child listed in the df.

In [98]:
# Parents of any edge whose child occurs in the TCDB table.
parents_w_children_in_tcdb = set(
    df_hierarchy_prim.loc[df_hierarchy_prim["child"].isin(all_primary_chebi_tcdb), "parent"]
)

# Of those parents, the ones that are themselves TCDB substrates get removed.
chebi_ids_to_remove = parents_w_children_in_tcdb & all_primary_chebi_tcdb

filtered_chebis = all_primary_chebi_tcdb.difference(chebi_ids_to_remove)
len(chebi_ids_to_remove)

201

Applying the filter where there are only leaf nodes and parents without children, and reducing df_merged correspondingly.

In [99]:
# Keep only the rows whose ChEBI ID passed the filter.
df_filtered = df_merged.loc[df_merged["CHEBI IDs"].isin(filtered_chebis)]
df_filtered

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term,GO Label
0,2.A.16.3.3,17632,T2I8Z9,MASSLSSFEAETTVPPSRIGTFFNDVLANFHPIYFVINMGLGITSL...,,
1,2.A.16.3.3,48854,T2I8Z9,MASSLSSFEAETTVPPSRIGTFFNDVLANFHPIYFVINMGLGITSL...,,
5,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0016020,membrane
6,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0022857,transmembrane transporter activity
7,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0055085,transmembrane transport
...,...,...,...,...,...,...
154251,2.A.50.4.1,18035,O75907,MGDRGSSRRRRTGSRPSSHGGGGPAAAEEEVRDAAAGPDVGAAGDA...,,
154252,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0000139,Golgi membrane
154253,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0016020,membrane
154254,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0005351,carbohydrate:proton symporter activity


Now I want to append the ChEBI information to the substrates: charge, formula, molecular weight (MW) and SMILES, to be precise. First, I need to make sure that the primary ChEBI ID is used in both dfs, to reduce mismatches. chebiDf.tsv is derived from chebi.owl, provided by ebi.ac.uk/chebi.

In [100]:
# Forward slash: "\c" is an invalid escape sequence, and a backslash path only
# resolves on Windows. Every other path in this notebook uses "/".
chebi_df = pd.read_csv("chebi_data/chebiDf.tsv", sep="\t")
# Keep only the numeric part of identifiers containing "CHEBI_<digits>".
chebi_df["chebi"] = chebi_df["chebi"].str.extract(r"CHEBI_(\d+)").astype(int)
chebi_df = chebi_df.drop(columns=["inchi", "inchikey"])

# Primary ID for every entry, used as the join key against the TCDB table.
chebi_df.loc[:, "chebi_primary"] = chebi_df["chebi"].apply(get_primary_id)

In [101]:
# Left join: substrates without a ChEBI entry keep their row with empty
# charge/formula/mass/smiles columns.
df_chebi_info = (
    df_filtered
    .merge(chebi_df, left_on="CHEBI IDs", right_on="chebi_primary", how="left")
    .rename(columns={"label": "chebi_label"})
)
df_chebi_info

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term,GO Label,charge,chebi,formula,chebi_label,mass,smiles,chebi_primary
0,2.A.16.3.3,17632,T2I8Z9,MASSLSSFEAETTVPPSRIGTFFNDVLANFHPIYFVINMGLGITSL...,,,-1.0,17632.0,NO3,nitrate,62.00490,[O-][N+]([O-])=O,17632.0
1,2.A.16.3.3,48854,T2I8Z9,MASSLSSFEAETTVPPSRIGTFFNDVLANFHPIYFVINMGLGITSL...,,,0.0,48854.0,H2O3S,sulfurous acid,82.08008,OS(O)=O,48854.0
2,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0016020,membrane,,,,,,,
3,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0022857,transmembrane transporter activity,,,,,,,
4,2.A.1.1.51,10085,Q2MEV7,MGLEDNRMVKRFVNVGEKKAGSTAMAIIVGLFAASGGVLFGYDTGT...,GO:0055085,transmembrane transport,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64009,2.A.50.4.1,18035,O75907,MGDRGSSRRRRTGSRPSSHGGGGPAAAEEEVRDAAAGPDVGAAGDA...,,,,,,,,,
64010,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0000139,Golgi membrane,0.0,67119.0,C15H24N2O17P2,UDP-alpha-D-galactose,566.30180,OC[C@H]1O[C@H](OP(O)(=O)OP(O)(=O)OC[C@H]2O[C@H...,67119.0
64011,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0016020,membrane,0.0,67119.0,C15H24N2O17P2,UDP-alpha-D-galactose,566.30180,OC[C@H]1O[C@H](OP(O)(=O)OP(O)(=O)OC[C@H]2O[C@H...,67119.0
64012,2.A.7.12.3,67119,Q9C5H6,MKNGIAECPACHSKLVSPGSKTISRAYDDHKIRVSSKQRVLNVLLV...,GO:0005351,carbohydrate:proton symporter activity,0.0,67119.0,C15H24N2O17P2,UDP-alpha-D-galactose,566.30180,OC[C@H]1O[C@H](OP(O)(=O)OP(O)(=O)OC[C@H]2O[C@H...,67119.0


Alternatively, without using the filter at all....

In [9]:
# Forward slash: "\c" is an invalid escape sequence, and a backslash path only
# resolves on Windows. Every other path in this notebook uses "/".
chebi_df = pd.read_csv("chebi_data/chebiDf.tsv", sep="\t")
# Keep only the numeric part of identifiers containing "CHEBI_<digits>".
chebi_df["chebi"] = chebi_df["chebi"].str.extract(r"CHEBI_(\d+)").astype(int)
chebi_df = chebi_df.drop(columns=["inchi", "inchikey"])
chebi_df.loc[:, "chebi_primary"] = chebi_df["chebi"].apply(get_primary_id)

# Unfiltered variant: joins on df_merged instead of df_filtered and overwrites
# any df_chebi_info produced by the filtered variant.
df_chebi_info = df_merged.merge(chebi_df, left_on="CHEBI IDs", right_on="chebi_primary", how="left")
df_chebi_info = df_chebi_info.rename(columns={"label": "chebi_label"})
df_chebi_info

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,AA Sequence,GO Term,GO Label,charge,chebi,formula,chebi_label,mass,smiles,chebi_primary
0,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane,0.0,16335.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,16335.0
1,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity,0.0,16335.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,16335.0
2,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport,0.0,16335.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,16335.0
3,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus,0.0,16335.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,16335.0
4,2.A.57.1.4,16335,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport,0.0,16335.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,16335.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93903,2.A.1.49.2,37550,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport,0.0,37550.0,C18H38NO5P,sphingosine 1-phosphate,379.47180,CCCCCCCCCCCCC\C=C\[C@@H](O)[C@@H](N)COP(O)(O)=O,37550.0
93904,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane,,,,,,,
93905,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity,,,,,,,
93906,1.A.27.1.4,22563,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding,,,,,,,


**TODO:** Worth noting that e- (ChEBI 10545) is missing from the ChEBI info. It should be inserted manually with all its properties. Change "is" to "was" in this note once that is done.

In [10]:
# Rows for which the left join found no ChEBI entry.
df_missing_chebis = df_chebi_info[df_chebi_info['chebi'].isna()]
missing_chebi_ids = set(df_missing_chebis['CHEBI IDs'])

# The first figure counts table rows, not IDs; the second counts distinct IDs.
print(f"There are {len(df_missing_chebis)} rows whose ChEBI ID cannot be connected to any info, i.e. is too broad, like 'molecule' or 'polypeptide'.")
print(f"In total, these rows cover {len(missing_chebi_ids)} distinct ChEBI IDs.")
missing_chebi_ids

There are 33527 ChEBI IDs that cannot be connected to any info, i.e. are too broad, like 'molecule' or 'polypeptide'.
In total, there are 506 distinct ChEBI IDs.


{2426,
 2633,
 2641,
 2679,
 2782,
 2784,
 2786,
 2835,
 2970,
 3020,
 3098,
 3371,
 3473,
 3523,
 3815,
 3890,
 4062,
 4291,
 4634,
 4654,
 4705,
 5026,
 5044,
 5077,
 5172,
 5182,
 5249,
 5256,
 5306,
 5417,
 5418,
 5466,
 5476,
 5481,
 5656,
 5709,
 5745,
 5761,
 5975,
 6078,
 6104,
 6126,
 6351,
 6486,
 6494,
 6495,
 6606,
 6718,
 6923,
 6984,
 7201,
 7507,
 7731,
 7755,
 7758,
 8005,
 8132,
 8134,
 8150,
 8301,
 8322,
 8526,
 8678,
 8755,
 8850,
 9160,
 9201,
 9203,
 9251,
 9431,
 9500,
 9840,
 9929,
 9948,
 10036,
 10074,
 10081,
 10085,
 10394,
 10404,
 10427,
 10545,
 10652,
 12936,
 14311,
 14386,
 14575,
 14911,
 15332,
 15693,
 15705,
 15748,
 15791,
 15824,
 15866,
 15889,
 15904,
 16024,
 16038,
 16042,
 16110,
 16247,
 16336,
 16337,
 16374,
 16412,
 16443,
 16541,
 16670,
 16733,
 16961,
 16975,
 16988,
 17002,
 17029,
 17089,
 17234,
 17283,
 17315,
 17334,
 17522,
 17548,
 17593,
 17636,
 17757,
 17761,
 17855,
 17909,
 17984,
 18035,
 18059,
 18133,
 18154,
 18246,
 1

Next up, removing all instances from the df where there is no info to get on the ChEBI substrate. Or not, if everything is desired.

In [11]:
# Optional variant (disabled): keep only rows whose ChEBI ID has an entry in chebi_df.
# valid_chebis = chebi_df["chebi"]
# df_chebi_info_filtered = df_chebi_info[df_chebi_info["CHEBI IDs"].isin(valid_chebis)]

# Tidy the table: drop the join helper columns and reorder.
# df_chebi_info_filtered = df_chebi_info_filtered.drop(columns=["chebi", "chebi_primary"]) # use together with the two disabled lines above
df_chebi_info_filtered = df_chebi_info.drop(columns=["chebi", "chebi_primary"]) # active variant: rows without ChEBI info are kept

# Move "CHEBI IDs" to column position 5. The pop must happen before the
# insert, because the position is counted without the popped column.
column_to_move = df_chebi_info_filtered.pop("CHEBI IDs")
df_chebi_info_filtered.insert(5, "CHEBI IDs", column_to_move)

df_chebi_info_filtered = df_chebi_info_filtered.reset_index(drop=True)
# Optional export (disabled):
# df_chebi_info_filtered.to_csv("tcdb_data_chebi.csv", index=False)
df_chebi_info_filtered

Unnamed: 0,TC Number,UniProt ID,AA Sequence,GO Term,GO Label,CHEBI IDs,charge,formula,chebi_label,mass,smiles
0,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
1,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
2,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
3,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
4,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
...,...,...,...,...,...,...,...,...,...,...,...
93903,2.A.1.49.2,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport,37550,0.0,C18H38NO5P,sphingosine 1-phosphate,379.47180,CCCCCCCCCCCCC\C=C\[C@@H](O)[C@@H](N)COP(O)(O)=O
93904,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane,22563,,,,,
93905,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity,22563,,,,,
93906,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding,22563,,,,,


Obtain all different ChEBIs in the df

In [12]:
# Distinct ChEBI IDs present in the tidied table.
chebis = set(df_chebi_info_filtered["CHEBI IDs"])
len(chebis)

1532

Add the number of descendants for each ChEBI ID

In [13]:
# Load the pickled graph G that nx.descendants() is run against below.
# SECURITY: pickle.load can execute arbitrary code, so only load a file that
# you created yourself or otherwise trust.
with open("chebi_data/hierarchy_chebi.pkl", "rb") as f:
    G = pickle.load(f)

def get_all_descendants_count(node):
    """Return how many descendants ``node`` has in the graph ``G``.

    Returns ``-1`` when ``nx.descendants`` raises ``NetworkXError`` for the node.
    """
    try:
        reachable = nx.descendants(G, node)
    except nx.NetworkXError:
        return -1
    return len(reachable)

In [14]:
df_chebi_info_filtered["CHEBI IDs"] = pd.to_numeric(df_chebi_info_filtered["CHEBI IDs"], errors="raise")
df_chebi_info_filtered["CHEBI IDs"] = df_chebi_info_filtered["CHEBI IDs"].apply(get_primary_id)

# Count descendants once per distinct ID rather than once per row: the table
# repeats each ID many times and every call is a graph traversal.
descendant_counts = {
    chebi_id: get_all_descendants_count(chebi_id)
    for chebi_id in df_chebi_info_filtered["CHEBI IDs"].unique().tolist()
}
df_chebi_info_filtered["ChEBI Descendants"] = df_chebi_info_filtered["CHEBI IDs"].map(descendant_counts)
# Optional export (disabled):
# df_chebi_info_filtered.to_csv("tcdb_df_nofilter.csv", index=False)
df_chebi_info_filtered

Unnamed: 0,TC Number,UniProt ID,AA Sequence,GO Term,GO Label,CHEBI IDs,charge,formula,chebi_label,mass,smiles,ChEBI Descendants
0,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
1,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
2,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
3,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
4,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
...,...,...,...,...,...,...,...,...,...,...,...,...
93903,2.A.1.49.2,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport,37550,0.0,C18H38NO5P,sphingosine 1-phosphate,379.47180,CCCCCCCCCCCCC\C=C\[C@@H](O)[C@@H](N)COP(O)(O)=O,0
93904,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane,22563,,,,,,9684
93905,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity,22563,,,,,,9684
93906,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding,22563,,,,,,9684


Just misc testing and analysis

In [48]:
# Fixed seed so that re-running the notebook writes the same ten-row excerpt.
df_chebi_info_filtered.sample(n=10, random_state=42).to_csv("tcdb_excerpt.csv", index=False)

In [55]:
# Rows without a GO term, and how many distinct TC numbers they cover.
go_gone = df_chebi_info_filtered.loc[df_chebi_info_filtered["GO Term"].isna()]
len(set(go_gone["TC Number"]))

4968

In [16]:
# Drop the rows flagged with -1 (ChEBI ID not found in the hierarchy graph)
# and save the result.
filtered_df = df_chebi_info_filtered.loc[df_chebi_info_filtered["ChEBI Descendants"] != -1]
filtered_df.to_csv("DF.csv", index=False)
filtered_df

Unnamed: 0,TC Number,UniProt ID,AA Sequence,GO Term,GO Label,CHEBI IDs,charge,formula,chebi_label,mass,smiles,ChEBI Descendants
0,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0016020,membrane,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
1,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0005337,nucleoside transmembrane transporter activity,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
2,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015853,adenine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
3,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0032869,cellular response to insulin stimulus,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
4,2.A.57.1.4,O54699,MAHGNAPRDSYHLVGISFFILGLGTLLPWNFFITAIPYFQGRLAGT...,GO:0015854,guanine transport,16335,0.0,C10H13N5O4,adenosine,267.24152,Nc1ncnc2n(cnc12)[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,0
...,...,...,...,...,...,...,...,...,...,...,...,...
93903,2.A.1.49.2,Q9H2V7,MAGSDTAPFLSQADDPDDGPVPGTPGLPGSTGNPKSEEPEVPDQEG...,GO:0055085,transmembrane transport,37550,0.0,C18H38NO5P,sphingosine 1-phosphate,379.47180,CCCCCCCCCCCCC\C=C\[C@@H](O)[C@@H](N)COP(O)(O)=O,0
93904,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0016020,membrane,22563,,,,,,9684
93905,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005267,potassium channel activity,22563,,,,,,9684
93906,1.A.27.1.4,Q63113,MEGITCAFLLVLAGLPVLEANGPVDKGSPFYYDWESLQLGGMIFGG...,GO:0005515,protein binding,22563,,,,,,9684


In [29]:
# Number of distinct ChEBI IDs left in the final table.
len(pd.unique(filtered_df["CHEBI IDs"]))

1446

### Analysis of dataframe

In [None]:
df = pd.read_csv("DF.csv")
df_go_notnan = df.loc[df['GO Term'].notna()]
# This value is not displayed: only a cell's last expression is shown.
len(df_go_notnan["TC Number"].unique())
# True when every TC number has either a GO term on all of its rows or on none.
overlap_check = (
    df.groupby('TC Number')['GO Term']
    .apply(lambda terms: terms.isna().all() or terms.notna().all())
    .all()
)
overlap_check

np.True_

In [4]:
# Number of distinct GO Term values (a missing value counts as one).
len(pd.unique(df["GO Term"]))

2717