# Training machine learning models on pairs of substrates in individual organisms

After creating the set of primary ChEBI substrates from a set of transmembrane transport proteins, the next task is to create a pipeline for generating these datasets automatically. Next, machine learning models are trained on pairs of substrates from these datasets.

## Generalized dataset creation method

- filter uniprot proteins generally
- filter again for transmembrane transporter activity
- get network of GO terms that are descendants of transmembrane transporter activity
    - filter network for GO terms that have proteins in the dataset
    - annotate with number of proteins
    - annotate with ChEBI substrates
- get chebi network
    - filter for GO terms in filtered GO network
- 70% sequence identity?
- feature generation
- svm pipeline

End results: 
- Annotated GO network for protein dataset
- Annotated ChEBI network for protein dataset
- Dataframe with accession, organism id, sequence, transport GO terms, and transported substrate ChEBI term

In [1]:
from subpred.util import load_df
from subpred.graph import preprocess_data, get_substrate_matrix
from subpred.pssm import calculate_pssm_feature
from subpred.compositions import calculate_aac, calculate_paac
import pandas as pd

In [2]:
# Folder path handed to the subpred helpers as `datasets_folder_path`.
DATASET_FOLDER_PATH = "../data/datasets"

# Organism ids belonging to each named dataset.
DATASET_NAME_TO_ORGANISM_IDS = {
    "human": {9606},
    "athaliana": {3702},
    "ecoli": {83333},
    "yeast": {559292},
}
# "all" gathers the id of every dataset that consists of exactly one organism.
DATASET_NAME_TO_ORGANISM_IDS["all"] = {
    next(iter(ids))
    for ids in DATASET_NAME_TO_ORGANISM_IDS.values()
    if len(ids) == 1
}
DATASET_NAME_TO_ORGANISM_IDS

{'human': {9606},
 'athaliana': {3702},
 'ecoli': {83333},
 'yeast': {559292},
 'all': {3702, 9606, 83333, 559292}}

In [3]:
# Restrict the analysis to the "athaliana" dataset and load the filtered
# protein table, GO annotation table, GO graph and ChEBI graph for it.
organism_ids = DATASET_NAME_TO_ORGANISM_IDS["athaliana"]
(
    df_uniprot,
    df_uniprot_goa,
    graph_go_filtered,
    graph_chebi_filtered,
) = preprocess_data(
    datasets_folder_path=DATASET_FOLDER_PATH,
    organism_ids=organism_ids,
)


43248
164519
60547


In [4]:
# TODO rename? check what the method actually does, do we need to change anything?

# NOTE(review): min_overlap=0 and max_overlap=1e6 presumably disable any
# filtering by overlap size — confirm in subpred.graph.
df_substrate_overlaps, dict_chebi_to_uniprot = get_substrate_matrix(
    df_uniprot_goa=df_uniprot_goa,
    graph_go=graph_go_filtered,
    graph_chebi=graph_chebi_filtered,
    datasets_folder_path=DATASET_FOLDER_PATH,
    min_overlap=0,
    max_overlap=1_000_000,
)
# Sanity check: one matrix row per ChEBI term in the term -> proteins mapping.
assert len(dict_chebi_to_uniprot) == df_substrate_overlaps.shape[0]

1995
247
246


In [5]:
# Build both directions of the ChEBI term <-> name lookup in a single pass
# over the graph's nodes and their "name" attribute.
chebi_name_to_term = {}
chebi_term_to_name = {}
for chebi_term, chebi_name in graph_chebi_filtered.nodes(data="name"):
    chebi_name_to_term[chebi_name] = chebi_term
    chebi_term_to_name[chebi_term] = chebi_name

# Number of proteins associated with each substrate, keyed by substrate name.
molecule_counts = {
    chebi_term_to_name[term_id]: len(accessions)
    for term_id, accessions in dict_chebi_to_uniprot.items()
}
# Show the substrates with the most proteins first.
sorted(molecule_counts.items(), key=lambda name_and_count: name_and_count[1], reverse=True)

[('monoatomic ion', 336),
 ('monoatomic cation', 276),
 ('inorganic cation', 249),
 ('metal cation', 138),
 ('proton', 125),
 ('organic anion', 82),
 ('carbohydrate', 71),
 ('inorganic anion', 70),
 ('carboxylic acid anion', 58),
 ('organic acid', 58),
 ('potassium(1+)', 56),
 ('carbohydrate derivative', 53),
 ('monoatomic anion', 46),
 ('biomacromolecule', 45),
 ('calcium(2+)', 44),
 ('chemical entity', 36),
 ('ion', 34),
 ('transition element cation', 33),
 ('amino acid', 32),
 ('amide', 30),
 ('organic phosphate', 27),
 ('water', 22),
 ('sulfur molecular entity', 22),
 ('monosaccharide', 22),
 ('nitrate', 18),
 ('purine nucleotide', 18),
 ('nucleotide', 18),
 ('adenyl nucleotide', 18),
 ('nucleotide-sugar', 17),
 ('chloride', 16),
 ('purine ribonucleotide', 16),
 ('sucrose', 15),
 ('ATP(4-)', 15),
 ('L-alpha-amino acid zwitterion', 15),
 ('peptide', 14),
 ('ADP(3-)', 14),
 ('pyrimidine nucleotide-sugar', 13),
 ('iron cation', 12),
 ('manganese cation', 12),
 ('oligopeptide', 12),
 (

In [6]:
# TODO sequence clustering?
# TODO turn into function labels -> df_labels

# The two substrate names that serve as class labels.
labels = ("potassium(1+)", "calcium(2+)")

# One [accession, label] record for every protein listed under each label.
# NOTE(review): a protein listed under more than one label gets one row per
# label, which duplicates its index entry — confirm whether that is intended.
protein_to_label = [
    [accession, substrate_name]
    for substrate_name in labels
    for accession in dict_chebi_to_uniprot[chebi_name_to_term[substrate_name]]
]

df_labels = pd.DataFrame.from_records(
    protein_to_label,
    columns=["Uniprot", "label"],
    index="Uniprot",
)

df_labels

Unnamed: 0_level_0,label
Uniprot,Unnamed: 1_level_1
A0A1P8BFX8,potassium(1+)
A0A1P8B0E9,potassium(1+)
A0A1P8B652,potassium(1+)
Q39128,potassium(1+)
F4JHE9,potassium(1+)
...,...
Q9M2L4,calcium(2+)
O81108,calcium(2+)
Q9FLS8,calcium(2+)
Q9FG04,calcium(2+)


## Feature creation

In [7]:
# Pull the sequence of every labelled protein, in the order of df_labels,
# and report how many sequences were selected.
df_sequences = df_uniprot.loc[df_labels.index]["sequence"]
print(len(df_sequences))

100


In [8]:
# Composition-based features computed from the selected protein sequences.
# NOTE(review): presumably amino-acid composition (AAC) and pseudo amino-acid
# composition (PAAC) — confirm against subpred.compositions.
df_aac = calculate_aac(df_sequences)
df_paac = calculate_paac(df_sequences)

In [9]:
# PSSM features for every combination of BLAST database (UniRef50, UniRef90)
# and number of PSI-BLAST iterations (1, 3).
#
# Each combination needs its own tmp_folder: the log output shows that PSSMs
# already present in tmp_folder are reused, so sharing a folder between two
# configurations would make one of them silently pick up the other's results.
df_pssm_50_1 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_1it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True
)
df_pssm_50_3 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_3it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True
)
# Fixed: this 1-iteration run previously pointed at the "pssm_uniref90_3it"
# folder, colliding with the 3-iteration run below.
df_pssm_90_1 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_1it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True
)
df_pssm_90_3 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_3it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True
)

PSSM for accession A0A1P8BFX8 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8B0E9 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8B652 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q39128 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession F4JHE9 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q84TI7 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q9LKW9 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession F4JV33 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q84WG1 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q38898 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8B951 was foun

KeyboardInterrupt: 

In [None]:
# TODO combine features, separated by <type>__<feature> to use with custom transformer
# TODO test custom transformer method

from subpred.custom_transformers import FeatureCombinator

## ML pipeline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report
from subpred.custom_transformers import FeatureCombinator

# splitting data
# TODO: X (feature matrix) and y (labels) are placeholders; they must be
# assigned before the rest of this cell can run.
X = None
y = None
# train test eval split
# `stratify` expects the array of class labels to stratify on, not a boolean,
# so the label vector itself is passed to keep class proportions in both parts.
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# model
# Features are standardized before the SVM; parameter names in the grid use
# the "svc__" prefix that make_pipeline assigns to the SVC step.
model = make_pipeline(StandardScaler(), SVC(random_state=1))
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", "auto"],
    "svc__class_weight": ["balanced", None],
}

# hyperparam optim & crossval
# NOTE(review): scoring="f1" is the binary F1 score with the default positive
# label; if y holds string class names a scorer such as "f1_macro" may be
# needed — confirm once y is defined.
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gridsearch.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best parameter setting.
print("Best train score:",gridsearch.best_score_)
print("Best train params",gridsearch.best_params_)
model_optim = gridsearch.best_estimator_

# eval
# Predict on the held-out features (not the labels) and print the report so
# that its line breaks are rendered instead of shown as an escaped string.
y_pred = model_optim.predict(X_eval)
print(classification_report(y_true=y_eval, y_pred=y_pred))

## Comparisons

Compare training results with: 

- Average sequence similarity
- GO term similarity
  - How many proteins in common?
  - Semantic similarity?
