# Training machine learning models on pairs of substrates in individual organisms

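After creating the set of primary ChEBI substrates from a set of transmembrane transport proteins, the next task is to create a pipeline for generating these datasets automatically. Next, the generated datasets are used to train and evaluate machine learning models on pairs of substrates in individual organisms.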

## Generalized dataset creation method

- filter uniprot proteins generally
- filter again for transmembrane transporter activity
- get network of GO terms that are descendants of transmembrane transporter activity
    - filter network for terms that annotate proteins in the dataset
    - annotate with number of proteins
    - annotate with ChEBI substrates
- get chebi network
    - filter for GO terms in filtered GO network
- cluster sequences at 70% sequence identity (cd-hit)
- feature generation
- svm pipeline

End results: 
- Stats for organisms, substrates, features, train/test

In [1]:
from subpred.util import load_df
from subpred.graph import preprocess_data, get_substrate_matrix
from subpred.pssm import calculate_pssm_feature
from subpred.compositions import calculate_aac, calculate_paac
import pandas as pd
from subpred.cdhit import cd_hit
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from subpred.custom_transformers import FeatureCombinator, get_feature_type_combinations

## Functions

### Dataset

In [2]:
def get_classification_task(
    organism_ids: set,
    labels: set,
    clustering_threshold: int = None,
    dataset_folder_path: str = "../data/datasets",
) -> pd.DataFrame:
    """Assemble a single-label substrate classification dataset.

    The preprocessed Uniprot/GO/ChEBI data is loaded for the given organisms,
    every requested ChEBI substrate name is mapped to the proteins annotated
    with it, and the protein sequences are attached. Optionally the sequences
    are reduced to cd-hit cluster representatives.

    Args:
        organism_ids: Organism identifiers forwarded to ``preprocess_data``.
        labels: ChEBI term names (the ``name`` attribute of the ChEBI graph
            nodes) that become the class labels.
        clustering_threshold: Identity threshold forwarded to ``cd_hit``.
            Clustering is skipped when the value is falsy (``None`` or 0).
        dataset_folder_path: Folder that holds the preprocessed datasets.

    Returns:
        DataFrame indexed by Uniprot accession with the columns ``sequence``
        and ``label``.

    Raises:
        KeyError: If a label is not the name of any node in the filtered
            ChEBI graph, or has no entry in the substrate-to-protein mapping.
    """
    # TODO handling for multi-substrate
    # TODO ability to use go terms or chebi terms (compare sample count, performance)

    (
        df_uniprot,
        df_uniprot_goa,
        graph_go_filtered,
        graph_chebi_filtered,
    ) = preprocess_data(
        organism_ids=organism_ids, datasets_folder_path=dataset_folder_path
    )
    # TODO go through method code
    df_substrate_overlaps, dict_chebi_to_uniprot = get_substrate_matrix(
        datasets_folder_path=dataset_folder_path,
        graph_chebi=graph_chebi_filtered,
        graph_go=graph_go_filtered,
        df_uniprot_goa=df_uniprot_goa,
        min_overlap=0,
        max_overlap=int(1e6),
    )
    # Sanity check: one row in the overlap matrix per substrate in the mapping.
    assert df_substrate_overlaps.shape[0] == len(dict_chebi_to_uniprot)

    # Lookup tables between ChEBI term identifiers and their readable names.
    chebi_term_to_name = dict(graph_chebi_filtered.nodes(data="name"))
    chebi_name_to_term = {name: term for term, name in chebi_term_to_name.items()}

    molecule_counts = {
        chebi_term_to_name[term]: len(proteins)
        for term, proteins in dict_chebi_to_uniprot.items()
    }
    print(sorted(molecule_counts.items(), key=lambda item: item[1], reverse=True))

    unknown_labels = sorted(set(labels) - chebi_name_to_term.keys())
    if unknown_labels:
        raise KeyError(f"Labels not found among ChEBI term names: {unknown_labels}")

    # Iterate labels and proteins in sorted order: a protein annotated with more
    # than one requested label keeps only its first occurrence below, and set
    # iteration order would otherwise make that choice differ between runs.
    protein_to_label = [
        (protein, label)
        for label in sorted(labels)
        for protein in sorted(dict_chebi_to_uniprot[chebi_name_to_term[label]])
    ]

    df_labels = pd.DataFrame.from_records(
        protein_to_label, columns=["Uniprot", "label"], index="Uniprot"
    )

    # Keep a single label per protein (first occurrence wins).
    df_labels = df_labels[~df_labels.index.duplicated()]  # TODO series?
    print(df_labels.label.value_counts())
    df_sequences = df_uniprot.loc[df_labels.index].sequence.to_frame()
    print("number of sequences", df_sequences.shape[0])
    if clustering_threshold:
        # Reduce redundancy: keep only the representative of each cluster.
        cluster_representatives = cd_hit(
            df_sequences.sequence, identity_threshold=clustering_threshold
        )
        print(cluster_representatives)
        df_sequences = df_sequences.loc[cluster_representatives]
        df_labels = df_labels.loc[cluster_representatives]
    return pd.concat([df_sequences, df_labels], axis=1)

### Features

In [3]:
def get_features(series_sequences: pd.Series) -> pd.DataFrame:
    """Compute all sequence-derived feature sets and join them column-wise.

    The result contains the AAC and PAAC compositions followed by four PSSM
    feature sets: UniRef50 and UniRef90 as BLAST databases, each with one and
    with three PSI-BLAST iterations.

    Args:
        series_sequences: Protein sequences; the series is passed unchanged to
            every feature calculation.

    Returns:
        DataFrame with the feature sets concatenated along the column axis, in
        the order AAC, PAAC, PSSM_50_1, PSSM_50_3, PSSM_90_1, PSSM_90_3.
    """
    feature_frames = [
        calculate_aac(series_sequences),
        calculate_paac(series_sequences),
    ]
    # Each (database, iterations) combination gets its own tmp folder. The
    # UniRef90 / 1-iteration run previously pointed at the 3-iteration folder.
    # NOTE(review): tmp_folder presumably holds intermediate PSI-BLAST results
    # that are reused between runs - confirm in subpred.pssm.
    for uniref_identity, iterations in ((50, 1), (50, 3), (90, 1), (90, 3)):
        feature_frames.append(
            calculate_pssm_feature(
                series_sequences,
                tmp_folder=(
                    "../data/intermediate/blast/"
                    f"pssm_uniref{uniref_identity}_{iterations}it"
                ),
                blast_db=(
                    f"../data/raw/uniref/uniref{uniref_identity}/"
                    f"uniref{uniref_identity}.fasta"
                ),
                iterations=iterations,
                psiblast_threads=-1,
                verbose=False,
                feature_name=f"PSSM_{uniref_identity}_{iterations}",
            )
        )
    return pd.concat(feature_frames, axis=1)

### Eval

In [4]:
# TODO try removing worst sample according to percentages
# TODO feature selection, regularization
# TODO cd-hit
# TODO determinism
# TODO also comparative analysis of features?
# TODO compare to protein embeddings and BLAST
# TODO parameter for using featurecombinator
# TODO separate functions

def evaluate(df_dataset, df_features):
    """Tune an SVM pipeline with grid search and print its evaluation results.

    The samples are split 80/20 (stratified by label, fixed random state). On
    the training part a 5-fold cross-validated grid search is run over the SVM
    ``C`` value and the feature-type combinations; the refitted best pipeline
    is then evaluated on the held-out part.

    Args:
        df_dataset: DataFrame with a ``label`` column, indexed like
            ``df_features``.
        df_features: Feature matrix, one row per sample. Every index value
            must also be present in ``df_dataset``.

    Returns:
        None. The best cross-validation score and parameters, a classification
        report and the predicted class probabilities are printed.

    Raises:
        KeyError: If ``df_features`` contains samples missing in ``df_dataset``.
    """
    # Bring the labels into the row order of the feature matrix so that X and y
    # describe the same samples even when the caller did not align the frames.
    labels_aligned = df_dataset.label.loc[df_features.index]

    # converting data to numpy
    label_encoder = LabelEncoder()
    label_encoder.fit(sorted(df_dataset.label.unique()))
    feature_names = df_features.columns.values
    X = df_features.values
    y = label_encoder.transform(labels_aligned)

    # train test eval split
    X_train, X_eval, y_train, y_eval = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y
    )

    feature_type_combinations = get_feature_type_combinations(feature_names=feature_names)
    feature_combinator = FeatureCombinator(feature_names=df_features.columns)
    model = make_pipeline(
        StandardScaler(), feature_combinator, SVC(random_state=1, probability=True)
    )
    param_grid = {
        "svc__C": [0.1, 1, 10],
        # "svc__gamma": ["scale", "auto"],
        # "svc__class_weight": ["balanced", None],
        "featurecombinator__feature_types": feature_type_combinations,
        # "selectpercentile__percentile": list(range(1, 101, 5)),
    }

    # The plain "f1" scorer is only defined for binary targets; use the
    # macro-averaged variant as soon as there are more than two classes.
    scoring = "f1" if len(label_encoder.classes_) == 2 else "f1_macro"

    # hyperparam optim & crossval
    gridsearch = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
        # verbose=20
    )
    gridsearch.fit(X_train, y_train)
    # best_score_ is the mean cross-validated score of the best parameter set.
    print("Best train score:", gridsearch.best_score_)
    print("Best train params", gridsearch.best_params_)
    model_optim = gridsearch.best_estimator_

    # eval
    y_pred = model_optim.predict(X_eval)
    print(classification_report(y_true=y_eval, y_pred=y_pred))
    print(model_optim.predict_proba(X_eval))  # TODO compare with actual labels


## Main

In [5]:
def main(organism_ids: set, labels: set, clustering_threshold: int = 70):
    """Run dataset creation, feature generation and evaluation for one task.

    Args:
        organism_ids: Organism identifiers the dataset is restricted to.
        labels: ChEBI substrate names used as class labels.
        clustering_threshold: Identity threshold handed to
            ``get_classification_task`` for the cd-hit clustering step.
            Defaults to 70, the value that was previously hard-coded.

    Returns:
        None. Results are printed by the called functions.
    """
    # TODO get rid of unnecessary prints
    df_dataset = get_classification_task(
        organism_ids=organism_ids,
        labels=labels,
        clustering_threshold=clustering_threshold,
    )

    df_features = get_features(df_dataset.sequence)

    # Sort the samples by accession and bring the dataset into exactly the same
    # row order, so that features and labels line up in evaluate().
    df_features = df_features.loc[df_features.index.sort_values()]
    df_dataset = df_dataset.loc[df_features.index]

    print(df_dataset.shape, df_features.shape)

    evaluate(df_dataset=df_dataset, df_features=df_features)


In [6]:
# Substrate classes (ChEBI names) that the classifier has to tell apart.
labels = {"potassium(1+)", "calcium(2+)"}

# One dataset per organism, keyed by a short readable name.
dataset_name_to_organism_ids = {
    "human": {9606},
    "athaliana": {3702},
    "ecoli": {83333},
    "yeast": {559292},
}
# Pooled dataset: the single id of every one-organism entry defined above.
dataset_name_to_organism_ids["all"] = {
    next(iter(ids))
    for ids in dataset_name_to_organism_ids.values()
    if len(ids) == 1
}

# Run the complete pipeline once per dataset, in definition order.
for dataset_name in dataset_name_to_organism_ids:
    organism_ids = dataset_name_to_organism_ids[dataset_name]
    print(dataset_name)
    main(organism_ids=organism_ids, labels=labels)

human
43248
164519
60547
1995
477
474
[('monoatomic ion', 1455), ('monoatomic cation', 1138), ('inorganic cation', 1061), ('metal cation', 852), ('calcium(2+)', 356), ('potassium(1+)', 289), ('organic anion', 266), ('sodium(1+)', 239), ('monoatomic anion', 238), ('inorganic anion', 234), ('proton', 197), ('chloride', 187), ('organic acid', 186), ('carboxylic acid anion', 183), ('chemical entity', 144), ('amino acid', 111), ('L-alpha-amino acid zwitterion', 86), ('carbohydrate derivative', 78), ('sulfur molecular entity', 66), ('ion', 58), ('monocarboxylic acid', 58), ('amide', 55), ('transition element cation', 49), ('carbohydrate', 40), ('organic cation', 40), ('amino acid derivative', 38), ('organic phosphate', 35), ('nucleotide-sugar', 32), ('biomacromolecule', 32), ('nucleotide', 31), ('pyrimidine nucleotide-sugar', 30), ('dicarboxylic acid', 28), ('purine nucleotide', 27), ('monosaccharide', 27), ('glucose', 25), ('hexose', 25), ('hydrogencarbonate', 24), ('adenyl nucleotide', 24)

## Comparisons

Compare training results with: 

- Average sequence similarity
- GO term similarity
  - How many proteins in common?
  - Semantic similarity?
