# Improving the annotation of metabolite transporters

First off, we are importing packages that might come in handy.

In [1]:
import ast
from io import StringIO

import cobra
import numpy as np
import pandas as pd
import requests
import scipy as sp
from Bio import SeqIO

Importing the most recent TC numbers with the corresponding ChEBI IDs of substrates, UniProt IDs, and GO terms from TCDB.

In [2]:
# TCDB download endpoints. Each one is fetched as plain text with fetch_data()
# and then handed to parse_data().
# Tab-separated: TC number and its "|"-separated substrate ChEBI entries.
tc_chebi_url = "https://www.tcdb.org/cgi-bin/substrates/getSubstrates.py"
# Tab-separated: UniProt accession and TC number.
uniprot_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/acc2tcid.py"
# Tab-separated: GO term and TC number (only the first two fields are used).
go_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/go.py"
# FASTA file; the TC number is read from the fourth "|"-separated header field.
fasta_tcdb_url = "https://www.tcdb.org/public/tcdb"

def fetch_data(url, timeout=60):
    """Download ``url`` with an HTTP GET request and return the body as text.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout ``requests``
        waits indefinitely, so a stalled server would hang the whole notebook.
        Defaults to 60 seconds.

    Returns
    -------
    str
        The decoded response body (``response.text``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def _tsv_rows(text):
    """Split tab-separated ``text`` into one list of fields per non-blank line."""
    # splitlines() also consumes "\r\n" endings, so no stray "\r" is left in the
    # last field of a line; blank lines (and an entirely empty download) are
    # skipped instead of producing a one-element row that breaks unpacking.
    return [line.split("\t") for line in text.strip().splitlines() if line.strip()]


def parse_data(tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text):
    """Parse the four raw TCDB downloads into DataFrames.

    Parameters
    ----------
    tc_chebi_text : str
        Tab-separated lines of ``<TC number>\\t<ChEBI entries>``, where the
        entries are separated by ``|`` and each entry starts with
        ``CHEBI:<id>`` optionally followed by ``;<more text>``.
    uniprot_tc_text : str
        Tab-separated lines of ``<UniProt ID>\\t<TC number>``.
    go_tc_text : str
        Tab-separated lines whose first two fields are the GO term and the
        TC number; any further fields are ignored.
    fasta_tcdb_text : str
        FASTA records whose header has the TC number as the first
        whitespace-delimited token of the fourth ``|``-separated field.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_chebi, df_uniprot, df_go, df_fasta)`` with the columns
        ``["TC Number", "CHEBI IDs"]`` (``CHEBI IDs`` holds a list of ID
        strings without the ``CHEBI:`` prefix),
        ``["UniProt ID", "TC Number"]``, ``["GO Term", "TC Number"]`` and
        ``["TC Number", "AA Sequence"]``.

    Raises
    ------
    ValueError
        If a non-blank TC-ChEBI line does not have exactly two fields, or a
        UniProt/GO row does not fit the two expected columns.
    IndexError
        If a FASTA header has fewer than four ``|``-separated fields.
    """
    # TC-CHEBI
    tc_chebi_data = []
    for tc_number, chebi_ids in _tsv_rows(tc_chebi_text):
        # Keep only the numeric ID of every "CHEBI:<id>;<...>" entry.
        chebi_id_list = [
            entry.split(";")[0].replace("CHEBI:", "") for entry in chebi_ids.split("|")
        ]
        tc_chebi_data.append([tc_number, chebi_id_list])
    df_chebi = pd.DataFrame(tc_chebi_data, columns=["TC Number", "CHEBI IDs"])

    # UniProt-TC
    df_uniprot = pd.DataFrame(_tsv_rows(uniprot_tc_text), columns=["UniProt ID", "TC Number"])

    # GO-TC
    go_tc_data = [fields[:2] for fields in _tsv_rows(go_tc_text)]
    df_go = pd.DataFrame(go_tc_data, columns=["GO Term", "TC Number"])

    # FASTA-TC
    fasta_io = StringIO(fasta_tcdb_text)
    tc_data = []
    for record in SeqIO.parse(fasta_io, "fasta"):
        header = record.description
        tc_number = header.split("|")[3].split()[0]
        tc_data.append([tc_number, str(record.seq)])
    df_fasta = pd.DataFrame(tc_data, columns=["TC Number", "AA Sequence"])

    return df_chebi, df_uniprot, df_go, df_fasta

In [3]:
# Download the four raw TCDB tables (same order as the URL constants).
tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text = [
    fetch_data(source_url)
    for source_url in (tc_chebi_url, uniprot_tc_url, go_tc_url, fasta_tcdb_url)
]

df_chebi, df_uniprot, df_go, df_fasta = parse_data(
    tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text
)

# Build one long table: one row per (TC number, UniProt ID, ChEBI ID, GO term,
# sequence) combination. All joins are left joins keyed on "TC Number".
# NOTE(review): the FASTA join uses the TC number only, so for TC numbers with
# several proteins every UniProt ID is paired with every sequence of that
# TC number - confirm this is intended before using "AA Sequence" per protein.
df_merged = (
    df_chebi
    .merge(df_uniprot, on="TC Number", how="left")
    .explode('CHEBI IDs')
    .merge(df_go[["TC Number", "GO Term"]], on="TC Number", how="left")
    .merge(df_fasta, on="TC Number", how="left")
)

df_merged.to_csv("tcdb_data_combined.csv", index=False)

In [4]:
# Show the combined TCDB table built in the previous cell.
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence
0,1.B.14.1.31,23066,O68590,,MSRQSTDTAVSSQRLLASAIGVAITAIAAPQAAQADEAGQKKTDKD...
1,4.C.3.1.1,2455,Q6Q2Z6,GO:0005829,MVPTVSLEPTGHSCWDEPLSIAVRGLAPEQPVTLRTALRDEKGALF...
2,4.C.3.1.1,2455,Q6Q2Z6,GO:0005777,MVPTVSLEPTGHSCWDEPLSIAVRGLAPEQPVTLRTALRDEKGALF...
3,4.C.3.1.1,2455,Q6Q2Z6,GO:0004091,MVPTVSLEPTGHSCWDEPLSIAVRGLAPEQPVTLRTALRDEKGALF...
4,4.C.3.1.1,2455,Q6Q2Z6,GO:0016290,MVPTVSLEPTGHSCWDEPLSIAVRGLAPEQPVTLRTALRDEKGALF...
...,...,...,...,...,...
1757316,2.A.17.3.8,4634,Q9LFB8,GO:0005886,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...
1757317,2.A.17.3.8,4634,Q9LFB8,GO:0042936,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...
1757318,2.A.17.3.8,4634,Q9LFB8,GO:0042938,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...
1757319,2.A.17.3.8,4634,Q9LFB8,GO:0009860,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...


Making an easy way to convert any ChEBI ID (primary or secondary) to its primary ChEBI ID.

In [5]:
prim_sec_chebi = pd.read_csv("chebi_data/primary_secondary_chebi_ids.tsv", sep="\t")

# Lookup table: secondary ChEBI ID (as str) -> primary ChEBI ID (as int).
# Keys are stored as strings because get_primary_id() looks them up via str().
secondary_to_primary = {}

for primary_raw, secondary_raw in zip(
    prim_sec_chebi["Primary_CHEBI_ID"], prim_sec_chebi["Secondary_CHEBI_IDs"]
):
    # An empty cell is read as NaN; such a row has no secondary IDs to register.
    if pd.isna(secondary_raw):
        continue

    primary_id = int(primary_raw)
    # The column holds the textual form of a Python collection. literal_eval
    # parses literals only, so unlike eval() it cannot execute code that might
    # be embedded in the file.
    secondary_ids = ast.literal_eval(secondary_raw)

    for s_id in secondary_ids:
        secondary_to_primary[str(s_id)] = primary_id

def get_primary_id(chebi_id):
    """Return the primary ChEBI ID for ``chebi_id``.

    The input is converted with ``int()`` first. Its string form is then looked
    up in the module-level ``secondary_to_primary`` mapping; if it is registered
    there the mapped value is returned, otherwise the integer itself is returned.
    """
    numeric_id = int(chebi_id)
    lookup_key = str(numeric_id)
    if lookup_key in secondary_to_primary:
        return secondary_to_primary[lookup_key]
    return numeric_id

Convert all ChEBI IDs to primary IDs for the TCDB df

In [6]:
# Cast the ChEBI IDs to numbers; errors="raise" makes any value that cannot be
# parsed stop the notebook instead of being converted silently.
chebi_ids_numeric = pd.to_numeric(df_merged["CHEBI IDs"], errors="raise")
df_merged["CHEBI IDs"] = chebi_ids_numeric
# Add the primary ID of every ChEBI ID as a new column.
df_merged["ChEBI Primary IDs"] = df_merged["CHEBI IDs"].map(get_primary_id)

Creating the filter that is applied to only find accurate ChEBIs. See chebi_data/leaf_children.ipynb for more info regarding the filter choice.

The method used is Method 2, which gives the best performance (it retains the most IDs for which information is obtainable).
This filter aims to only leave in leaf nodes, and parents that have no children present in the df from TCDB.

First, load the hierarchy as of 2024-08-01.

In [7]:
# Child/parent table of the ChEBI hierarchy; both columns carry "CHEBI_<digits>"
# identifiers, of which only the digits are kept (as integers).
df_hierarchy = pd.read_csv("chebi_data/chebiHierarchy.tsv", sep="\t")
for edge_column in ("child", "parent"):
    df_hierarchy[edge_column] = (
        df_hierarchy[edge_column].str.extract(r'CHEBI_(\d+)').astype(int)
    )

# Every primary ChEBI ID that occurs in the TCDB table.
all_primary_chebi_tcdb = set(df_merged["ChEBI Primary IDs"].unique())

# Copy of the hierarchy in which both ends of each edge are primary IDs.
df_hierarchy_prim = df_hierarchy.assign(
    child=df_hierarchy["child"].apply(get_primary_id),
    parent=df_hierarchy["parent"].apply(get_primary_id),
)

Finding all the parents in the original (TCDB) df that have at least one child that is also listed in the df.

In [8]:
# Hierarchy edges whose child is a TCDB substrate. Both columns were mapped to
# primary IDs, which can collapse an edge into child == parent; such self-edges
# are ignored so that an ID is never discarded for being its own "parent".
child_in_tcdb = df_hierarchy_prim["child"].isin(all_primary_chebi_tcdb)
is_self_edge = df_hierarchy_prim["child"] == df_hierarchy_prim["parent"]
parents_w_children_in_tcdb = set(
    df_hierarchy_prim.loc[child_in_tcdb & ~is_self_edge, "parent"]
)

# TCDB substrates that are a parent of another TCDB substrate get removed ...
chebi_ids_to_remove = parents_w_children_in_tcdb.intersection(all_primary_chebi_tcdb)

# ... leaving only IDs with no child present in the TCDB table.
filtered_chebis = all_primary_chebi_tcdb - chebi_ids_to_remove

Applying the filter where there are only leaf nodes and parents without children, and reducing df_merged correspondingly.

In [9]:
# Keep only the rows whose primary ChEBI ID passed the filter above.
passes_filter = df_merged["ChEBI Primary IDs"].isin(filtered_chebis)
df_filtered = df_merged.loc[passes_filter]
df_filtered

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,ChEBI Primary IDs
10,2.A.39.2.3,2470,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708
11,2.A.39.2.3,2470,Q708J7,GO:0015205,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708
12,2.A.39.2.3,2470,Q708J7,GO:0015931,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708
13,2.A.39.2.3,2470,Q708J7,GO:0055085,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708
14,2.A.39.2.3,5563,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16235
...,...,...,...,...,...,...
1757316,2.A.17.3.8,4634,Q9LFB8,GO:0005886,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634
1757317,2.A.17.3.8,4634,Q9LFB8,GO:0042936,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634
1757318,2.A.17.3.8,4634,Q9LFB8,GO:0042938,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634
1757319,2.A.17.3.8,4634,Q9LFB8,GO:0009860,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634


Now I want to append all the ChEBI information to the substrates: charge, formula, MW, and SMILES, to be precise. But first I need to make sure that the primary ChEBI ID is used in both dfs, to reduce mismatches.

In [10]:
# Forward slash instead of "\": "\c" is an invalid escape sequence in a normal
# string literal, and a backslash path only resolves on Windows. "/" works on
# every platform and matches the other chebi_data reads in this notebook.
chebi_df = pd.read_csv("chebi_data/chebiDf.tsv", sep="\t")
# Reduce the "CHEBI_<digits>" identifier to its integer part.
chebi_df["chebi"] = chebi_df["chebi"].str.extract(r"CHEBI_(\d+)").astype(int)
chebi_df = chebi_df.drop(columns=["inchi", "inchikey"])

# Primary ID of every entry, used as the join key against the TCDB table.
chebi_df.loc[:, "chebi_primary"] = chebi_df["chebi"].apply(get_primary_id)

In [11]:
# Attach the ChEBI properties to every TCDB row via the primary ChEBI ID.
# A left join keeps TCDB rows without a match; their ChEBI columns are NaN.
df_chebi_info = pd.merge(
    df_filtered,
    chebi_df,
    how="left",
    left_on="ChEBI Primary IDs",
    right_on="chebi_primary",
)
df_chebi_info

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,ChEBI Primary IDs,charge,chebi,formula,label,mass,smiles,chebi_primary
0,2.A.39.2.3,2470,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
1,2.A.39.2.3,2470,Q708J7,GO:0015205,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
2,2.A.39.2.3,2470,Q708J7,GO:0015931,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
3,2.A.39.2.3,2470,Q708J7,GO:0055085,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
4,2.A.39.2.3,5563,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16235,0.0,16235.0,C5H5N5O,guanine,151.1260,C12=C(N=C(NC1=O)N)NC=N2,16235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500240,2.A.17.3.8,4634,Q9LFB8,GO:0005886,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500241,2.A.17.3.8,4634,Q9LFB8,GO:0042936,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500242,2.A.17.3.8,4634,Q9LFB8,GO:0042938,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500243,2.A.17.3.8,4634,Q9LFB8,GO:0009860,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,


Note: the electron (e-, ChEBI 10545) is missing from the ChEBI data. TODO: insert it manually with all its properties, then change "is" to "was" in this note.

In [12]:
# Rows for which the merge found no ChEBI entry (the "chebi" column is NaN).
lacks_chebi_info = df_chebi_info['chebi'].isna()
df_missing_chebis = df_chebi_info.loc[lacks_chebi_info]
df_missing_chebis

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,ChEBI Primary IDs,charge,chebi,formula,label,mass,smiles,chebi_primary
13,3.A.1.134.10,71644,F2HM69,,MLSLKLAANNIKKGFKSFAPFLMASVTMFVMIFVTASIALSPSISK...,71644,,,,,,,
14,3.A.1.134.10,71644,F2HM69,,MLLEVKHLKKIFKTRFSKEETTALVDIDFGVEEGEYIAIMGESGSG...,71644,,,,,,,
15,3.A.1.134.10,71644,F2HM70,,MLSLKLAANNIKKGFKSFAPFLMASVTMFVMIFVTASIALSPSISK...,71644,,,,,,,
16,3.A.1.134.10,71644,F2HM70,,MLLEVKHLKKIFKTRFSKEETTALVDIDFGVEEGEYIAIMGESGSG...,71644,,,,,,,
42,2.A.1.2.77,10426,Q8NKG7,,MANNSGTTTVQLDDVLERSSTLNTLNNIDTVQHHEPRTSFANNRQQ...,35627,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500240,2.A.17.3.8,4634,Q9LFB8,GO:0005886,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500241,2.A.17.3.8,4634,Q9LFB8,GO:0042936,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500242,2.A.17.3.8,4634,Q9LFB8,GO:0042938,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,
500243,2.A.17.3.8,4634,Q9LFB8,GO:0009860,MEDDKDIYTKDGTLDIHKKPANKNKTGTWKACRFILGTECCERLAY...,4634,,,,,,,


Next up, removing all instances from the df where there is no info to get on the ChEBI substrate

In [13]:
# The ChEBI properties were joined on chebi_df["chebi_primary"], so the same
# column decides which rows actually received information. Testing against the
# raw "chebi" column could drop rows that matched through a secondary entry, or
# keep rows whose ChEBI columns are all NaN.
valid_chebis = chebi_df["chebi_primary"]
df_chebi_info_filtered = df_chebi_info[df_chebi_info["ChEBI Primary IDs"].isin(valid_chebis)]
df_chebi_info_filtered.to_csv("tcdb_data_chebi.csv", index=False)
df_chebi_info_filtered

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,ChEBI Primary IDs,charge,chebi,formula,label,mass,smiles,chebi_primary
0,2.A.39.2.3,2470,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
1,2.A.39.2.3,2470,Q708J7,GO:0015205,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
2,2.A.39.2.3,2470,Q708J7,GO:0015931,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
3,2.A.39.2.3,2470,Q708J7,GO:0055085,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16708,0.0,16708.0,C5H5N5,adenine,135.1269,Nc1ncnc2[nH]cnc12,16708.0
4,2.A.39.2.3,5563,Q708J7,GO:0016020,MSSDPEKNLGMPEKTSVNSYDSMDPSSSSSGADAEIETTKLNFIDR...,16235,0.0,16235.0,C5H5N5O,guanine,151.1260,C12=C(N=C(NC1=O)N)NC=N2,16235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500231,1.A.8.12.8,5585,Q8VZW1,GO:0005886,MADISGNGYGNAREEVVMVNLKDEVEHQQEMEDIHNPRPLKKQDSL...,15377,0.0,15377.0,H2O,water,18.0153,[H]O[H],15377.0
500232,1.A.8.12.8,5585,Q8VZW1,GO:0015105,MADISGNGYGNAREEVVMVNLKDEVEHQQEMEDIHNPRPLKKQDSL...,15377,0.0,15377.0,H2O,water,18.0153,[H]O[H],15377.0
500233,1.A.8.12.8,5585,Q8VZW1,GO:0015250,MADISGNGYGNAREEVVMVNLKDEVEHQQEMEDIHNPRPLKKQDSL...,15377,0.0,15377.0,H2O,water,18.0153,[H]O[H],15377.0
500234,1.A.8.12.8,5585,Q8VZW1,GO:0031347,MADISGNGYGNAREEVVMVNLKDEVEHQQEMEDIHNPRPLKKQDSL...,15377,0.0,15377.0,H2O,water,18.0153,[H]O[H],15377.0
