# Training machine learning models on pairs of substrates in individual organisms

After creating the set of primary ChEBI substrates from a set of transmembrane transport proteins, the next task is to create a pipeline for generating these datasets automatically. Next, the 

## Generalized dataset creation method

- filter uniprot proteins generally
- filter again for transmembrane transporter activity
- get network of GO terms that are descendants of transmembrane transporter activity
    - filter network for proteins with in dataset with term
    - annotate with number of proteins
    - annotate with ChEBI substrates
- get chebi network
    - filter for GO terms in filtered GO network
- 70% sequence identity?
- feature generation
- svm pipeline

End results: 
- Annotated GO network for protein dataset
- Annotated chebi network for protein dataset
- Dataframe with accession, organism id, sequence, transport go terms, and transported substrate chebi term

In [10]:
from subpred.util import load_df
from subpred.graph import preprocess_data, get_substrate_matrix
from subpred.pssm import calculate_pssm_feature
from subpred.compositions import calculate_aac, calculate_paac
import pandas as pd

In [11]:
DATASET_FOLDER_PATH = "../data/datasets"

DATASET_NAME_TO_ORGANISM_IDS = {"human": {9606}, "athaliana": {3702}, "ecoli": {83333}, "yeast": {559292}}
DATASET_NAME_TO_ORGANISM_IDS["all"] = {list(s)[0] for s in DATASET_NAME_TO_ORGANISM_IDS.values() if len(s) == 1}
DATASET_NAME_TO_ORGANISM_IDS

{'human': {9606},
 'athaliana': {3702},
 'ecoli': {83333},
 'yeast': {559292},
 'all': {3702, 9606, 83333, 559292}}

In [12]:
organism_ids = DATASET_NAME_TO_ORGANISM_IDS["athaliana"]
df_uniprot, df_uniprot_goa, graph_go_filtered, graph_chebi_filtered = preprocess_data(
    organism_ids=organism_ids, datasets_folder_path=DATASET_FOLDER_PATH
)


43248
164519
60547


In [13]:
# TODO go through method code 

df_substrate_overlaps, dict_chebi_to_uniprot = get_substrate_matrix(
    datasets_folder_path=DATASET_FOLDER_PATH,
    graph_chebi=graph_chebi_filtered,
    graph_go=graph_go_filtered,
    df_uniprot_goa=df_uniprot_goa,
    min_overlap=0, max_overlap=int(1e6)
)
assert df_substrate_overlaps.shape[0] == len(dict_chebi_to_uniprot.keys())

1995
247
246


In [14]:
chebi_name_to_term = {name : term for term, name in graph_chebi_filtered.nodes(data="name")}
chebi_term_to_name = {term : name for term, name in graph_chebi_filtered.nodes(data="name")}
molecule_counts = {
    chebi_term_to_name[term] : len(proteins) for term, proteins in dict_chebi_to_uniprot.items()
}
sorted(molecule_counts.items(), key=lambda item: item[1], reverse=True)

[('monoatomic ion', 336),
 ('monoatomic cation', 276),
 ('inorganic cation', 249),
 ('metal cation', 138),
 ('proton', 125),
 ('organic anion', 82),
 ('carbohydrate', 71),
 ('inorganic anion', 70),
 ('carboxylic acid anion', 58),
 ('organic acid', 58),
 ('potassium(1+)', 56),
 ('carbohydrate derivative', 53),
 ('monoatomic anion', 46),
 ('biomacromolecule', 45),
 ('calcium(2+)', 44),
 ('chemical entity', 36),
 ('ion', 34),
 ('transition element cation', 33),
 ('amino acid', 32),
 ('amide', 30),
 ('organic phosphate', 27),
 ('water', 22),
 ('sulfur molecular entity', 22),
 ('monosaccharide', 22),
 ('nitrate', 18),
 ('purine nucleotide', 18),
 ('nucleotide', 18),
 ('adenyl nucleotide', 18),
 ('nucleotide-sugar', 17),
 ('chloride', 16),
 ('purine ribonucleotide', 16),
 ('sucrose', 15),
 ('ATP(4-)', 15),
 ('L-alpha-amino acid zwitterion', 15),
 ('peptide', 14),
 ('ADP(3-)', 14),
 ('pyrimidine nucleotide-sugar', 13),
 ('iron cation', 12),
 ('manganese cation', 12),
 ('oligopeptide', 12),
 (

In [15]:
# TODO sequence clustering?
# TODO turn into function labels -> df_labels

labels = "potassium(1+)", 'calcium(2+)'

protein_to_label = list()
for label in labels:
    label_proteins = dict_chebi_to_uniprot[chebi_name_to_term[label]]
    for protein in label_proteins:
        protein_to_label.append([protein, label])

df_labels = pd.DataFrame.from_records(protein_to_label,columns=["Uniprot", "label"], index="Uniprot")

df_labels = df_labels[~df_labels.index.duplicated()]
df_labels.label.value_counts()

label
potassium(1+)    56
calcium(2+)      42
Name: count, dtype: int64

## Feature creation

In [16]:
df_sequences = df_uniprot.loc[df_labels.index].sequence
print(df_sequences.shape[0])

98


In [17]:
df_aac = calculate_aac(df_sequences)
df_paac = calculate_paac(df_sequences)

In [18]:
# TODO cpu count too high?
df_pssm_50_1 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_1it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_50_1"
)
df_pssm_50_3 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_3it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_50_3"
)
df_pssm_90_1 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_3it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_90_1"
)
df_pssm_90_3 = calculate_pssm_feature(
    df_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_3it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_90_3"
)

PSSM for accession A0A1P8BFX1 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q8RWU6 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q84WG1 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q9S6Z8 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q9LQL2 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1I9LLU1 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession P92960 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession F4JQC2 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession F4JU14 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession O22920 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q8LPL8 was found in tmp

In [19]:
df_paac.index.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [22]:
# TODO combine features, separated by <type>__<feature> to use with custom transformer
# TODO test custom transformer method


df_features = pd.concat(
    [
        df_aac,
        df_paac,
        df_pssm_50_1,
        df_pssm_50_3,
        df_pssm_90_1,
        df_pssm_90_3,
    ], axis=1
)
df_features

Unnamed: 0,AAC__A,AAC__C,AAC__D,AAC__E,AAC__F,AAC__G,AAC__H,AAC__I,AAC__K,AAC__L,...,PSSM_90_3__VL,PSSM_90_3__VK,PSSM_90_3__VM,PSSM_90_3__VF,PSSM_90_3__VP,PSSM_90_3__VS,PSSM_90_3__VT,PSSM_90_3__VW,PSSM_90_3__VY,PSSM_90_3__VV
A0A1P8BFX1,0.052030,0.015228,0.058376,0.065990,0.059645,0.058376,0.024112,0.077411,0.062183,0.120558,...,0.552437,0.322009,0.559823,0.429838,0.323486,0.379616,0.462334,0.302806,0.350074,0.782866
Q8RWU6,0.065421,0.007477,0.035514,0.044860,0.093458,0.084112,0.031776,0.056075,0.018692,0.121495,...,0.484642,0.237201,0.498294,0.401024,0.226962,0.298635,0.407850,0.257679,0.310580,0.697952
Q84WG1,0.073559,0.009940,0.033797,0.035785,0.079523,0.059642,0.027833,0.083499,0.043738,0.135189,...,0.508897,0.274021,0.503559,0.418149,0.266904,0.362989,0.425267,0.291815,0.341637,0.779359
Q9S6Z8,0.051471,0.012255,0.063725,0.044118,0.051471,0.046569,0.039216,0.083333,0.051471,0.115196,...,0.658228,0.224684,0.594937,0.582278,0.224684,0.382911,0.449367,0.237342,0.335443,0.879747
Q9LQL2,0.068404,0.021173,0.052117,0.053746,0.060261,0.076547,0.011401,0.065147,0.050489,0.094463,...,0.552013,0.283557,0.536913,0.458054,0.239933,0.340604,0.458054,0.226510,0.338926,0.859060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q8GXJ4,0.063608,0.010428,0.044838,0.050052,0.056309,0.063608,0.015641,0.062565,0.041710,0.085506,...,0.504890,0.128362,0.479218,0.338631,0.079462,0.215159,0.363081,0.050122,0.207824,1.000000
A0A1P8AXT8,0.071121,0.015086,0.047414,0.065733,0.039871,0.078664,0.015086,0.079741,0.056034,0.098060,...,0.449084,0.093686,0.405295,0.318737,0.064155,0.174134,0.316701,0.059063,0.200611,1.000000
Q9M2L4,0.078049,0.015610,0.044878,0.067317,0.039024,0.076098,0.014634,0.081951,0.059512,0.093659,...,0.447393,0.088152,0.403791,0.327014,0.061611,0.163033,0.317536,0.054976,0.211374,1.000000
Q9LY28,0.081420,0.014614,0.058455,0.073069,0.035491,0.077244,0.016701,0.062630,0.064718,0.112735,...,0.531390,0.352018,0.515695,0.405830,0.311659,0.392377,0.439462,0.266816,0.352018,0.780269


In [26]:
from subpred.custom_transformers import FeatureCombinator

# TODO figure out if I should implement get_support instead.
fc = FeatureCombinator(feature_names=df_features.columns, feature_types="AAC")

fc.fit_transform(df_features, df_labels) 



array([[0.05203046, 0.01522843, 0.05837563, ..., 0.06472081, 0.01142132,
        0.04441624],
       [0.06542056, 0.00747664, 0.03551402, ..., 0.07476636, 0.00560748,
        0.03364486],
       [0.07355865, 0.00994036, 0.03379722, ..., 0.07355865, 0.01192843,
        0.02186879],
       ...,
       [0.07804878, 0.01560976, 0.04487805, ..., 0.0995122 , 0.01073171,
        0.01365854],
       [0.08141962, 0.01461378, 0.05845511, ..., 0.05845511, 0.0125261 ,
        0.03757829],
       [0.07142857, 0.02040816, 0.04591837, ..., 0.06122449, 0.00510204,
        0.02040816]])

## ML pipeline

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report
from subpred.custom_transformers import FeatureCombinator

# splitting data
X = None
y = None
# train test eval split
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# model
model = make_pipeline(StandardScaler(), SVC(random_state=1))
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", "auto"],
    "svc__class_weight": ["balanced", None],
}

# hyperparam optim & crossval
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gridsearch.fit(X_train, y_train)
print("Best train score:",gridsearch.best_score_)
print("Best train params",gridsearch.best_params_)
model_optim = gridsearch.best_estimator_

# eval
y_pred = model_optim.predict(y_eval)
classification_report(y_true=y_eval, y_pred=y_pred)

InvalidParameterError: The 'stratify' parameter of train_test_split must be an array-like or None. Got True instead.

## Comparisons

Compare training results with: 

- Average sequence similarity
- GO term similarity
  - How many protein in common?
  - Semantic similarity?
