# Improving the annotation of metabolite transporters

First off, we are importing packages that might come in handy.

In [1]:
import cobra
import numpy as np
import scipy as sp
import pandas as pd
import requests
from io import StringIO
from Bio import SeqIO

Importing the most recent TC numbers, the corresponding ChEBI IDs of their substrates, UniProt IDs and GO terms from TCDB

In [4]:
# TCDB download endpoints consumed by fetch_data / parse_data below.
# Tab-separated mapping: TC number -> "|"-separated ChEBI substrate entries.
tc_chebi_url = "https://www.tcdb.org/cgi-bin/substrates/getSubstrates.py"
# Tab-separated mapping parsed as (UniProt ID, TC number).
uniprot_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/acc2tcid.py"
# Tab-separated mapping whose first two fields are parsed as (GO term, TC number).
go_tc_url = "https://www.tcdb.org/cgi-bin/projectv/public/go.py"
# FASTA file of TCDB protein sequences; the TC number is read from each header.
fasta_tcdb_url = "https://www.tcdb.org/public/tcdb"

def fetch_data(url, timeout=60):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed straight to ``requests.get``).
        Without a timeout ``requests`` waits indefinitely, so an unresponsive
        server would hang the notebook cell forever.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as data.
    response.raise_for_status()
    return response.text

def _nonblank_lines(text):
    """Return the lines of ``text`` that contain more than whitespace."""
    # splitlines() (unlike split("\n")) also consumes "\r\n" line endings, so
    # a stray carriage return never stays glued to the last field of a row
    # (which would silently break later merges on "TC Number").
    return [line for line in text.strip().splitlines() if line.strip()]


def parse_data(tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text):
    """Parse the four raw TCDB downloads into DataFrames.

    Parameters
    ----------
    tc_chebi_text : str
        Tab-separated lines ``<TC number><TAB><entries>`` where the entries
        are separated by "|" and each entry starts with ``CHEBI:<id>``
        optionally followed by ";" and further text.
    uniprot_tc_text : str
        Tab-separated lines holding a UniProt ID and a TC number.
    go_tc_text : str
        Tab-separated lines whose first two fields are a GO term and a TC
        number; any further fields are ignored.
    fasta_tcdb_text : str
        FASTA records whose "|"-separated header carries the TC number as the
        first whitespace-delimited token of the fourth field.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_chebi, df_uniprot, df_go, df_fasta)`` with the columns
        ``["TC Number", "CHEBI IDs"]`` (the latter a list of ID strings with
        the "CHEBI:" prefix removed), ``["UniProt ID", "TC Number"]``,
        ``["GO Term", "TC Number"]`` and ``["TC Number", "AA Sequence"]``.

    Raises
    ------
    ValueError
        If a non-blank TC-ChEBI line does not consist of exactly two
        tab-separated fields.

    Notes
    -----
    Blank lines and CRLF line endings are tolerated in the three
    tab-separated inputs, and empty inputs yield empty DataFrames.
    """
    # TC-CHEBI
    tc_chebi_data = []
    for line in _nonblank_lines(tc_chebi_text):
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError(
                f"Malformed TC-ChEBI line (expected 2 tab-separated fields): {line!r}"
            )
        tc_number, chebi_ids = fields
        # Keep only the numeric identifier: drop the text after ";" and the
        # "CHEBI:" prefix.
        chebi_id_list = [
            entry.split(";")[0].replace("CHEBI:", "")
            for entry in chebi_ids.split("|")
        ]
        tc_chebi_data.append([tc_number, chebi_id_list])

    df_chebi = pd.DataFrame(tc_chebi_data, columns=["TC Number", "CHEBI IDs"])

    # UniProt-TC
    uniprot_tc_data = [line.split("\t") for line in _nonblank_lines(uniprot_tc_text)]
    df_uniprot = pd.DataFrame(uniprot_tc_data, columns=["UniProt ID", "TC Number"])

    # GO-TC (only the first two fields are used)
    go_tc_data = [line.split("\t")[:2] for line in _nonblank_lines(go_tc_text)]
    df_go = pd.DataFrame(go_tc_data, columns=["GO Term", "TC Number"])

    # FASTA-TC
    fasta_io = StringIO(fasta_tcdb_text)
    tc_data = []
    for record in SeqIO.parse(fasta_io, "fasta"):
        header = record.description
        # Fourth "|"-separated field starts with the TC number, followed by
        # free text after whitespace.
        tc_number = header.split("|")[3].split()[0]
        sequence = str(record.seq)
        tc_data.append([tc_number, sequence])
    df_fasta = pd.DataFrame(tc_data, columns=["TC Number", "AA Sequence"])

    return df_chebi, df_uniprot, df_go, df_fasta

In [3]:
# Download the four TCDB exports (same order as the URL definitions).
tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text = (
    fetch_data(source_url)
    for source_url in (tc_chebi_url, uniprot_tc_url, go_tc_url, fasta_tcdb_url)
)

# Turn each raw text into its own DataFrame.
df_chebi, df_uniprot, df_go, df_fasta = parse_data(
    tc_chebi_text, uniprot_tc_text, go_tc_text, fasta_tcdb_text
)

# Combine everything on the TC number, one row per ChEBI substrate ID.
# NOTE(review): every join is keyed on "TC Number" alone, so a TC number with
# several proteins/GO terms/sequences appears to yield every combination of
# them (the displayed result pairs one UniProt ID with several sequences).
# Confirm whether that cross product is intended.
df_merged = (
    df_chebi
    .merge(df_uniprot, on="TC Number", how="left")
    .explode("CHEBI IDs")
    .merge(df_go[["TC Number", "GO Term"]], on="TC Number", how="left")
    .merge(df_fasta, on="TC Number", how="left")
)

# Persist the combined table for later use.
df_merged.to_csv("tcdb_data_combined.csv", index=False)

In [12]:
# Preview the merged table (large frames are shown abbreviated).
df_merged

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence
0,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MSAVEIRDLRLSYNQQPLFAGFNFVLPKGKWTTLLGASGIGKSTLV...
1,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MRGNMKIKPLIHVVLLSLMLLALWQSAVFFFDIPRYMLPPPNDVLA...
2,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MQKLVKKTLTVAFAMGVSLSVSAKEKLTLLLDWFVNPDHAAIIVAQ...
3,3.A.1.17.6,8675,Q9CLG8,GO:0016887,MSAVEIRDLRLSYNQQPLFAGFNFVLPKGKWTTLLGASGIGKSTLV...
4,3.A.1.17.6,8675,Q9CLG8,GO:0016887,MRGNMKIKPLIHVVLLSLMLLALWQSAVFFFDIPRYMLPPPNDVLA...
...,...,...,...,...,...
1756776,3.A.1.1.60,28300,I7G593,,MSSPERVASNAVIYAGLLLGAVITLLPFGLGLLTSFTSAQQFVTES...
1756777,3.A.1.1.60,28300,I7G593,,MATPRVRTTALAYALVAPSLFGVVTFLLLPMLVVVWLSLHRWDLLG...
1756778,3.A.1.1.60,28300,I7FQ33,,MRRSTLLAGGLAVTMAVLLVIAMLMGRTTEPAGKTVVTVRLWDPQV...
1756779,3.A.1.1.60,28300,I7FQ33,,MSSPERVASNAVIYAGLLLGAVITLLPFGLGLLTSFTSAQQFVTES...


Loading the filtered ChEBI list, from which all parent entries have been removed, and reducing df_merged accordingly

In [16]:
# Read the comma-separated list of ChEBI IDs to keep.
with open("chebi_data/chebis_filtered.txt", "r") as f:
    content = f.read().strip()

# Trim whitespace/newlines around every ID and drop empty tokens; otherwise a
# file formatted like "1, 2,\n3" would silently fail to match the ID strings
# stored in df_merged["CHEBI IDs"].
filtered_chebis = [token.strip() for token in content.split(",") if token.strip()]

# Keep only the rows whose substrate is in the filtered ChEBI list.
df_filtered = df_merged[df_merged["CHEBI IDs"].isin(filtered_chebis)]
df_filtered

Unnamed: 0,TC Number,CHEBI IDs,UniProt ID,GO Term,AA Sequence
0,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MSAVEIRDLRLSYNQQPLFAGFNFVLPKGKWTTLLGASGIGKSTLV...
1,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MRGNMKIKPLIHVVLLSLMLLALWQSAVFFFDIPRYMLPPPNDVLA...
2,3.A.1.17.6,8675,Q9CLG8,GO:0005524,MQKLVKKTLTVAFAMGVSLSVSAKEKLTLLLDWFVNPDHAAIIVAQ...
3,3.A.1.17.6,8675,Q9CLG8,GO:0016887,MSAVEIRDLRLSYNQQPLFAGFNFVLPKGKWTTLLGASGIGKSTLV...
4,3.A.1.17.6,8675,Q9CLG8,GO:0016887,MRGNMKIKPLIHVVLLSLMLLALWQSAVFFFDIPRYMLPPPNDVLA...
...,...,...,...,...,...
1756767,2.A.12.1.13,2342,Q8SUG7,GO:0006810,MSENREIDATDRRDKTFDKEKLRPHVYSSVAGGMRSTSGDTKAVLL...
1756768,2.A.12.1.13,2359,Q8SUG7,GO:0016021,MSENREIDATDRRDKTFDKEKLRPHVYSSVAGGMRSTSGDTKAVLL...
1756769,2.A.12.1.13,2359,Q8SUG7,GO:0005886,MSENREIDATDRRDKTFDKEKLRPHVYSSVAGGMRSTSGDTKAVLL...
1756770,2.A.12.1.13,2359,Q8SUG7,GO:0005524,MSENREIDATDRRDKTFDKEKLRPHVYSSVAGGMRSTSGDTKAVLL...
