# Training machine learning models on pairs of substrates in individual organisms

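After creating the set of primary ChEBI substrates from a set of transmembrane transport proteins, the next task is to create a pipeline for generating these datasets automatically. Next, the resulting datasets are used to train machine learning models on pairs of substrates in individual organisms.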

## Generalized dataset creation method

- filter uniprot proteins generally
- filter again for transmembrane transporter activity
- get network of GO terms that are descendants of transmembrane transporter activity
    - filter network for terms that annotate proteins in the dataset
    - annotate with number of proteins
    - annotate with ChEBI substrates
- get ChEBI network
    - filter for GO terms in filtered GO network
- 70% sequence identity?
- feature generation
- svm pipeline

End results: 
- Annotated GO network for protein dataset
- Annotated ChEBI network for protein dataset
- Dataframe with accession, organism id, sequence, transport GO terms, and transported substrate ChEBI term

In [70]:
from subpred.util import load_df
from subpred.graph import preprocess_data, get_substrate_matrix
from subpred.pssm import calculate_pssm_feature
from subpred.compositions import calculate_aac, calculate_paac
import pandas as pd

In [71]:
# Folder containing the dataset files, relative to this notebook.
DATASET_FOLDER_PATH = "../data/datasets"

# Organism ids that make up each named dataset.
DATASET_NAME_TO_ORGANISM_IDS = dict(
    human={9606},
    athaliana={3702},
    ecoli={83333},
    yeast={559292},
)
# "all" collects the id of every single-organism dataset defined above.
DATASET_NAME_TO_ORGANISM_IDS["all"] = {
    next(iter(organism_id_set))
    for organism_id_set in DATASET_NAME_TO_ORGANISM_IDS.values()
    if len(organism_id_set) == 1
}
DATASET_NAME_TO_ORGANISM_IDS

{'human': {9606},
 'athaliana': {3702},
 'ecoli': {83333},
 'yeast': {559292},
 'all': {3702, 9606, 83333, 559292}}

In [72]:
# Work on the "athaliana" dataset; pick another key of DATASET_NAME_TO_ORGANISM_IDS
# to rerun the notebook for a different organism set.
organism_ids = DATASET_NAME_TO_ORGANISM_IDS["athaliana"]

(
    df_uniprot,
    df_uniprot_goa,
    graph_go_filtered,
    graph_chebi_filtered,
) = preprocess_data(
    datasets_folder_path=DATASET_FOLDER_PATH,
    organism_ids=organism_ids,
)


43248
164519
60547


In [89]:
# TODO: rename? Check what get_substrate_matrix actually does and whether anything
# needs to change.

df_substrate_overlaps, dict_chebi_to_uniprot = get_substrate_matrix(
    df_uniprot_goa=df_uniprot_goa,
    graph_go=graph_go_filtered,
    graph_chebi=graph_chebi_filtered,
    datasets_folder_path=DATASET_FOLDER_PATH,
    # Bounds of 0 and one million: presumably wide enough that no substrate is
    # dropped by the overlap filter -- TODO confirm against get_substrate_matrix.
    min_overlap=0,
    max_overlap=10**6,
)
# The overlap table must have exactly one row per key of the ChEBI -> UniProt mapping.
assert len(dict_chebi_to_uniprot.keys()) == df_substrate_overlaps.shape[0]

1995
245
244


In [74]:
# Build both lookup directions (ChEBI term <-> name) in one pass over the
# (term, name) pairs yielded by the filtered ChEBI graph.
chebi_name_to_term = {}
chebi_term_to_name = {}
for chebi_term, chebi_name in graph_chebi_filtered.nodes(data="name"):
    chebi_name_to_term[chebi_name] = chebi_term
    chebi_term_to_name[chebi_term] = chebi_name

# Number of proteins recorded for each substrate, keyed by the substrate's name.
molecule_counts = {}
for chebi_term, proteins in dict_chebi_to_uniprot.items():
    molecule_counts[chebi_term_to_name[chebi_term]] = len(proteins)

# Display the substrates ordered from most to fewest proteins.
sorted(
    molecule_counts.items(),
    key=lambda name_and_count: name_and_count[1],
    reverse=True,
)

[('monoatomic ion', 188),
 ('monoatomic cation', 162),
 ('inorganic cation', 159),
 ('metal cation', 83),
 ('proton', 82),
 ('organic anion', 70),
 ('carboxylic acid anion', 50),
 ('organic acid', 50),
 ('carbohydrate derivative', 46),
 ('carbohydrate', 43),
 ('inorganic anion', 40),
 ('biomacromolecule', 36),
 ('potassium(1+)', 35),
 ('chemical entity', 33),
 ('amino acid', 32),
 ('calcium(2+)', 27),
 ('transition element cation', 26),
 ('monoatomic anion', 22),
 ('amide', 22),
 ('water', 21),
 ('ion', 20),
 ('organic phosphate', 18),
 ('monosaccharide', 18),
 ('nitrate', 17),
 ('nucleotide-sugar', 16),
 ('sucrose', 15),
 ('L-alpha-amino acid zwitterion', 15),
 ('purine nucleotide', 14),
 ('sulfur molecular entity', 14),
 ('nucleotide', 14),
 ('adenyl nucleotide', 14),
 ('pyrimidine nucleotide-sugar', 13),
 ('purine ribonucleotide', 12),
 ('manganese cation', 11),
 ('ATP(4-)', 11),
 ('nucleobase', 10),
 ('nucleoside', 10),
 ('ADP(3-)', 10),
 ('hexose', 9),
 ('iron cation', 9),
 ('mono

In [75]:
# TODO sequence clustering?
# TODO turn into function labels -> df_labels

# The two substrate names (keys of chebi_name_to_term) used as class labels.
labels = ("potassium(1+)", "calcium(2+)")

# One [protein, label] record for every protein listed under each selected substrate.
# NOTE(review): a protein listed under both substrates would get two rows (and a
# duplicated index entry) -- confirm whether that is intended.
protein_to_label = [
    [protein, label]
    for label in labels
    for protein in dict_chebi_to_uniprot[chebi_name_to_term[label]]
]

df_labels = pd.DataFrame.from_records(
    protein_to_label,
    columns=["Uniprot", "label"],
    index="Uniprot",
)

df_labels

Unnamed: 0_level_0,label
Uniprot,Unnamed: 1_level_1
Q84WG1,potassium(1+)
Q93VD3,potassium(1+)
Q9M7K4,potassium(1+)
Q9LD18,potassium(1+)
Q8LBL1,potassium(1+)
...,...
Q945S5,calcium(2+)
Q39253,calcium(2+)
Q9LEQ3,calcium(2+)
Q39254,calcium(2+)


Create sequence-based features

In [76]:
# Sequence column for the labelled proteins, in the row order of df_labels.
df_sequences = df_uniprot.loc[df_labels.index, "sequence"]

In [77]:
# Composition features computed from the labelled sequences
# (presumably amino-acid and pseudo-amino-acid composition -- TODO confirm in subpred.compositions).
df_aac, df_paac = (
    calculate_aac(df_sequences),
    calculate_paac(df_sequences),
)

In [78]:
def _uniref_pssm_feature(sequences, uniref_identity, iterations, psiblast_threads=16):
    """Compute the PSSM feature of ``sequences`` against one UniRef database.

    The working folder and the database path are both derived from
    ``uniref_identity`` and ``iterations``, so the folder name always matches the
    settings actually used. Previously the UniRef90 / 1-iteration run pointed at the
    ``pssm_uniref90_3it`` folder, i.e. it shared its working folder with the
    3-iteration run.

    Parameters
    ----------
    sequences
        The sequences to featurise; passed through unchanged as the first argument
        of ``calculate_pssm_feature``.
    uniref_identity : int
        Identity level of the UniRef database to search (50 or 90 in this notebook).
    iterations : int
        Forwarded as ``iterations``.
    psiblast_threads : int, default 16
        Forwarded as ``psiblast_threads``.

    Returns
    -------
    Whatever ``calculate_pssm_feature`` returns for these settings.
    """
    uniref_name = "uniref{}".format(uniref_identity)
    return calculate_pssm_feature(
        sequences,
        # NOTE(review): presumably intermediate BLAST output is kept/reused in this
        # folder, so each (database, iterations) pair needs its own -- TODO confirm.
        tmp_folder="../data/intermediate/blast/pssm_{}_{}it".format(uniref_name, iterations),
        blast_db="../data/raw/uniref/{0}/{0}.fasta".format(uniref_name),
        iterations=iterations,
        psiblast_threads=psiblast_threads,
        verbose=False,
    )


# One PSSM feature set per (UniRef identity level, number of iterations) combination.
df_pssm_50_1 = _uniref_pssm_feature(df_sequences, uniref_identity=50, iterations=1)
df_pssm_50_3 = _uniref_pssm_feature(df_sequences, uniref_identity=50, iterations=3)
df_pssm_90_1 = _uniref_pssm_feature(df_sequences, uniref_identity=90, iterations=1)
df_pssm_90_3 = _uniref_pssm_feature(df_sequences, uniref_identity=90, iterations=3)