# Improving the annotation of metabolite transporters

First off, we are importing packages that might come in handy.

In [2]:
import ast
from io import StringIO

import cobra
import numpy as np
import pandas as pd
import requests
import scipy as sp
from Bio import SeqIO

Importing the most recent TC numbers, the corresponding ChEBI IDs of their substrates, UniProt IDs and GO terms from TCDB.

In [3]:
# TCDB endpoints used by this notebook; each one is downloaded as plain text.
# TC number -> ChEBI IDs of the transported substrates.
tc_chebi_url = "https://www.tcdb.org/cgi-bin/substrates/getSubstrates.py"
# UniProt accession -> TC number.
uniprot_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/acc2tcid.py"
# GO term -> TC number.
go_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/go.py"
# FASTA file with the TCDB protein sequences.
fasta_tcdb_url = "https://www.tcdb.org/public/tcdb"

def fetch_data(url, timeout=60):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without a timeout ``requests`` waits indefinitely,
        so a stalled server would hang the notebook.

    Returns
    -------
    str
        The decoded response body (``response.text``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    response.raise_for_status()
    return response.text

def _nonblank_lines(text):
    """Split ``text`` on newlines, dropping blank lines and trailing carriage returns."""
    lines = (raw.rstrip("\r") for raw in text.strip().split("\n"))
    return [line for line in lines if line.strip()]


def parse_data(tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text):
    """Turn the four raw TCDB text downloads into DataFrames.

    Blank lines and Windows line endings are ignored, so an empty download
    yields an empty frame with the expected columns instead of an error or a
    row of empty values.

    Parameters
    ----------
    tc_chebi_text : str
        Lines of ``<TC number>\\t<substrates>`` where substrates are
        ``|``-separated entries of the form ``CHEBI:<id>;<name>``.
    uniprot_tc_text : str
        Tab-separated lines with two fields: UniProt ID and TC number.
    go_tc_text : str
        Tab-separated lines; only the first two fields (GO term, TC number)
        are kept.
    fasta_tcdb_text : str
        FASTA records. The TC number is read from the first whitespace-
        delimited token of the fourth ``|``-separated header field.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_chebi, df_uniprot, df_go, df_fasta)`` with columns
        ``["TC Number", "CHEBI IDs"]`` (IDs as a list of strings without the
        ``CHEBI:`` prefix), ``["UniProt ID", "TC Number"]``,
        ``["GO Term", "TC Number"]`` and ``["TC Number", "AA Sequence"]``.

    Raises
    ------
    ValueError
        If a non-blank TC-ChEBI line does not consist of exactly two
        tab-separated fields.
    """
    # TC-CHEBI
    tc_chebi_data = []
    for line in _nonblank_lines(tc_chebi_text):
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError(
                f"Expected '<TC number>\\t<substrates>' in TC-ChEBI data, got: {line!r}"
            )
        tc_number, substrates = fields
        # Each entry looks like "CHEBI:<id>;<name>"; keep only the bare id.
        chebi_id_list = [
            entry.split(";")[0].replace("CHEBI:", "") for entry in substrates.split("|")
        ]
        tc_chebi_data.append([tc_number, chebi_id_list])

    df_chebi = pd.DataFrame(tc_chebi_data, columns=["TC Number", "CHEBI IDs"])

    # UniProt-TC
    uniprot_tc_data = [line.split("\t") for line in _nonblank_lines(uniprot_tc_text)]
    df_uniprot = pd.DataFrame(uniprot_tc_data, columns=["UniProt ID", "TC Number"])

    # GO-TC (any fields after the second are ignored)
    go_tc_data = [line.split("\t")[:2] for line in _nonblank_lines(go_tc_text)]
    df_go = pd.DataFrame(go_tc_data, columns=["GO Term", "TC Number"])

    # FASTA-TC
    fasta_io = StringIO(fasta_tcdb_text)
    tc_data = []
    for record in SeqIO.parse(fasta_io, "fasta"):
        header = record.description
        # Fourth "|" field, first token = TC number; the rest is free text.
        tc_number = header.split("|")[3].split()[0]
        sequence = str(record.seq)
        tc_data.append([tc_number, sequence])
    df_fasta = pd.DataFrame(tc_data, columns=["TC Number", "AA Sequence"])

    return df_chebi, df_uniprot, df_go, df_fasta

In [4]:
# Download the four raw TCDB tables.
tc_chebi_text = fetch_data(tc_chebi_url)
uniprot_tc_text = fetch_data(uniprot_tc_url)
go_tc_text = fetch_data(go_tc_url)
fasta_tcdb_text = fetch_data(fasta_tcdb_url)

df_chebi, df_uniprot, df_go, df_fasta = parse_data(
    tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text
)

# Build one long table: one row per (TC number, UniProt ID, ChEBI ID, GO term,
# sequence) combination. All joins are left joins starting from the substrate
# table, so TC numbers without substrates are not included.
# NOTE(review): every join is on "TC Number" only, so if several proteins share
# a TC number each UniProt ID is paired with each sequence — confirm intended.
df_merged = (
    df_chebi
    .merge(df_uniprot, how="left", on="TC Number")
    .explode("CHEBI IDs")  # one row per substrate instead of a list per TC number
    .merge(df_go[["TC Number", "GO Term"]], how="left", on="TC Number")
    .merge(df_fasta, how="left", on="TC Number")
)

df_merged.to_csv("tcdb_data_combined.csv", index=False)

In [58]:
# Show the combined table; the last expression of a cell is rendered as its output.
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence
0,1.A.4.1.4,3308,Q13507,GO:0005887,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
1,1.A.4.1.4,3308,Q13507,GO:0005515,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
2,1.A.4.1.4,3308,Q13507,GO:0015279,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
3,1.A.4.1.4,3308,Q13507,GO:0006816,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
4,1.A.4.1.4,3308,Q13507,GO:0007602,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
...,...,...,...,...,...
1756879,3.A.3.3.5,5584,Q86DE0,GO:0016021,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756880,3.A.3.3.5,5584,Q86DE0,GO:0005524,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756881,3.A.3.3.5,5584,Q86DE0,GO:0015662,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756882,3.A.3.3.5,5584,Q86DE0,GO:0006754,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...


Loading the filtered ChEBI list, from which all parent terms have been removed, and reducing `df_merged` to the ChEBI IDs it contains.

In [59]:
# Read the comma-separated list of ChEBI IDs that survived the parent filtering.
with open("chebi_data/chebis_filtered.txt", "r") as f:
    content = f.read().strip()

# Strip whitespace/newlines around each ID and drop empty entries, so that
# "1, 2,\n3" or a trailing comma does not silently lose matches.
filtered_chebis = [chebi.strip() for chebi in content.split(",") if chebi.strip()]

# The IDs from the file are strings; compare on string values so the filter
# also works when "CHEBI IDs" was loaded as integers (e.g. from the CSV export).
df_filtered = df_merged[df_merged["CHEBI IDs"].astype(str).isin(filtered_chebis)]
df_filtered

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence
0,1.A.4.1.4,3308,Q13507,GO:0005887,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
1,1.A.4.1.4,3308,Q13507,GO:0005515,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
2,1.A.4.1.4,3308,Q13507,GO:0015279,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
3,1.A.4.1.4,3308,Q13507,GO:0006816,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
4,1.A.4.1.4,3308,Q13507,GO:0007602,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...
...,...,...,...,...,...
1756879,3.A.3.3.5,5584,Q86DE0,GO:0016021,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756880,3.A.3.3.5,5584,Q86DE0,GO:0005524,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756881,3.A.3.3.5,5584,Q86DE0,GO:0015662,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...
1756882,3.A.3.3.5,5584,Q86DE0,GO:0006754,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...


Now I want to append the ChEBI information to the substrates: charge, formula, molecular weight (MW) and SMILES, to be precise. To reduce mismatches, I first need to make sure that both DataFrames use the primary ChEBI ID.

In [61]:
# Per-compound ChEBI table. Forward slashes keep the path portable (a backslash
# only resolves on Windows and "\c" is an invalid escape sequence).
chebi_df = pd.read_csv("chebi_data/chebiDf.tsv", sep="\t")
# Reduce identifiers containing "CHEBI_<digits>" to the integer ID.
chebi_df["chebi"] = chebi_df["chebi"].str.extract(r"CHEBI_(\d+)", expand=False).astype(int)
chebi_df = chebi_df.drop(columns=["inchi", "inchikey"])

prim_sec_chebi = pd.read_csv("chebi_data/primary_secondary_chebi_ids.tsv", sep="\t")

# Maps secondary ChEBI ID (as a string) -> primary ChEBI ID (as an int).
secondary_to_primary = {}

for primary_raw, secondary_raw in zip(
    prim_sec_chebi["Primary_CHEBI_ID"], prim_sec_chebi["Secondary_CHEBI_IDs"]
):
    if pd.isna(secondary_raw):
        # No secondary IDs recorded for this primary ID.
        continue
    primary_id = int(primary_raw)
    # The column stores a Python literal; literal_eval parses it without
    # executing arbitrary code from the file, unlike eval.
    secondary_ids = ast.literal_eval(secondary_raw)

    for s_id in secondary_ids:
        # Keys are stored as strings because the lookup uses str(chebi_id).
        secondary_to_primary[str(s_id)] = primary_id

def get_primary_id(chebi_id):
    """Return the primary ChEBI ID for ``chebi_id``.

    The argument is converted with ``int``; its decimal string is then looked
    up in ``secondary_to_primary``. On a hit the mapped value is returned,
    otherwise the converted integer itself.
    """
    numeric_id = int(chebi_id)
    lookup_key = str(numeric_id)
    if lookup_key in secondary_to_primary:
        return secondary_to_primary[lookup_key]
    return numeric_id

# Primary ID for every compound in the ChEBI table.
chebi_df["chebi_primary"] = chebi_df["chebi"].apply(get_primary_id)

# df_filtered was produced by boolean indexing, so writing into it through
# .loc raises SettingWithCopyWarning. Work on an explicit copy instead; the
# cell stays safe to re-run because the column is simply overwritten.
df_filtered = df_filtered.copy()
df_filtered["ChEBI Primary IDs"] = df_filtered["CHEBI IDs"].apply(get_primary_id).astype(int)

In [62]:
# Attach the ChEBI properties to every transporter row via the primary ChEBI
# ID. A left join keeps rows whose ID has no entry in chebi_df; their ChEBI
# columns are then empty.
df_chebi_info = pd.merge(
    df_filtered,
    chebi_df,
    how="left",
    left_on="ChEBI Primary IDs",
    right_on="chebi_primary",
)
df_chebi_info.to_csv("tcdb_data_chebi.csv", index=False)
df_chebi_info

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,ChEBI Primary IDs,charge,chebi,formula,label,mass,smiles,chebi_primary
0,1.A.4.1.4,3308,Q13507,GO:0005887,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...,29108,2.0,29108.0,Ca,calcium(2+),40.07800,[Ca++],29108.0
1,1.A.4.1.4,3308,Q13507,GO:0005515,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...,29108,2.0,29108.0,Ca,calcium(2+),40.07800,[Ca++],29108.0
2,1.A.4.1.4,3308,Q13507,GO:0015279,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...,29108,2.0,29108.0,Ca,calcium(2+),40.07800,[Ca++],29108.0
3,1.A.4.1.4,3308,Q13507,GO:0006816,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...,29108,2.0,29108.0,Ca,calcium(2+),40.07800,[Ca++],29108.0
4,1.A.4.1.4,3308,Q13507,GO:0007602,MEGSPSLRRMTVMREKGRRQAVRGPAFMFNDRGTSLTAEEERFLDA...,29108,2.0,29108.0,Ca,calcium(2+),40.07800,[Ca++],29108.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649150,3.A.3.3.5,5584,Q86DE0,GO:0016021,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...,15378,1.0,15378.0,H,hydron,1.00794,[H+],15378.0
1649151,3.A.3.3.5,5584,Q86DE0,GO:0005524,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...,15378,1.0,15378.0,H,hydron,1.00794,[H+],15378.0
1649152,3.A.3.3.5,5584,Q86DE0,GO:0015662,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...,15378,1.0,15378.0,H,hydron,1.00794,[H+],15378.0
1649153,3.A.3.3.5,5584,Q86DE0,GO:0006754,MGDTGPKGVPGTNDAGEVHKPQKPQRRQSVLSKAISEHREGDDGSV...,15378,1.0,15378.0,H,hydron,1.00794,[H+],15378.0


In [47]:
# "CHEBI IDs" holds strings (the "CHEBI:" prefix is stripped as text), so a
# comparison with the integer 1 never matches; compare on string values.
df_chebi_info[df_chebi_info["CHEBI IDs"].astype(str) == "1"]

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence,charge,chebi,formula,label,mass,smiles
