# Training machine learning models on pairs of substrates in individual organisms

After creating the set of primary ChEBI substrates from a set of transmembrane transport proteins, the next task is to create a pipeline for generating these datasets automatically. Next, the 

## Generalized dataset creation method

- filter uniprot proteins generally
- filter again for transmembrane transporter activity
- get network of GO terms that are descendants of transmembrane transporter activity
    - filter network for proteins with in dataset with term
    - annotate with number of proteins
    - annotate with ChEBI substrates
- get chebi network
    - filter for GO terms in filtered GO network
- 70% sequence identity?
- feature generation
- svm pipeline

End results: 
- Annotated GO network for protein dataset
- Annotated chebi network for protein dataset
- Dataframe with accession, organism id, sequence, transport go terms, and transported substrate chebi term

In [1]:
from subpred.util import load_df
from subpred.graph import preprocess_data, get_substrate_matrix
from subpred.pssm import calculate_pssm_feature
from subpred.compositions import calculate_aac, calculate_paac
import pandas as pd

In [2]:
DATASET_FOLDER_PATH = "../data/datasets"

DATASET_NAME_TO_ORGANISM_IDS = {"human": {9606}, "athaliana": {3702}, "ecoli": {83333}, "yeast": {559292}}
DATASET_NAME_TO_ORGANISM_IDS["all"] = {list(s)[0] for s in DATASET_NAME_TO_ORGANISM_IDS.values() if len(s) == 1}
DATASET_NAME_TO_ORGANISM_IDS

{'human': {9606},
 'athaliana': {3702},
 'ecoli': {83333},
 'yeast': {559292},
 'all': {3702, 9606, 83333, 559292}}

In [3]:
organism_ids = DATASET_NAME_TO_ORGANISM_IDS["athaliana"]
df_uniprot, df_uniprot_goa, graph_go_filtered, graph_chebi_filtered = preprocess_data(
    organism_ids=organism_ids, datasets_folder_path=DATASET_FOLDER_PATH
)


43248
164519
60547


In [4]:
# TODO go through method code 

df_substrate_overlaps, dict_chebi_to_uniprot = get_substrate_matrix(
    datasets_folder_path=DATASET_FOLDER_PATH,
    graph_chebi=graph_chebi_filtered,
    graph_go=graph_go_filtered,
    df_uniprot_goa=df_uniprot_goa,
    min_overlap=0, max_overlap=int(1e6)
)
assert df_substrate_overlaps.shape[0] == len(dict_chebi_to_uniprot.keys())

1995
247
246


In [5]:
chebi_name_to_term = {name : term for term, name in graph_chebi_filtered.nodes(data="name")}
chebi_term_to_name = {term : name for term, name in graph_chebi_filtered.nodes(data="name")}
molecule_counts = {
    chebi_term_to_name[term] : len(proteins) for term, proteins in dict_chebi_to_uniprot.items()
}
sorted(molecule_counts.items(), key=lambda item: item[1], reverse=True)

[('monoatomic ion', 336),
 ('monoatomic cation', 276),
 ('inorganic cation', 249),
 ('metal cation', 138),
 ('proton', 125),
 ('organic anion', 82),
 ('carbohydrate', 71),
 ('inorganic anion', 70),
 ('carboxylic acid anion', 58),
 ('organic acid', 58),
 ('potassium(1+)', 56),
 ('carbohydrate derivative', 53),
 ('monoatomic anion', 46),
 ('biomacromolecule', 45),
 ('calcium(2+)', 44),
 ('chemical entity', 36),
 ('ion', 34),
 ('transition element cation', 33),
 ('amino acid', 32),
 ('amide', 30),
 ('organic phosphate', 27),
 ('water', 22),
 ('sulfur molecular entity', 22),
 ('monosaccharide', 22),
 ('nitrate', 18),
 ('purine nucleotide', 18),
 ('nucleotide', 18),
 ('adenyl nucleotide', 18),
 ('nucleotide-sugar', 17),
 ('chloride', 16),
 ('purine ribonucleotide', 16),
 ('sucrose', 15),
 ('ATP(4-)', 15),
 ('L-alpha-amino acid zwitterion', 15),
 ('peptide', 14),
 ('ADP(3-)', 14),
 ('pyrimidine nucleotide-sugar', 13),
 ('iron cation', 12),
 ('manganese cation', 12),
 ('oligopeptide', 12),
 (

In [6]:
# TODO sequence clustering?
# TODO turn into function labels -> df_labels

labels = "potassium(1+)", 'calcium(2+)'

protein_to_label = list()
for label in labels:
    label_proteins = dict_chebi_to_uniprot[chebi_name_to_term[label]]
    for protein in label_proteins:
        protein_to_label.append([protein, label])

df_labels = pd.DataFrame.from_records(protein_to_label,columns=["Uniprot", "label"], index="Uniprot")

df_labels = df_labels[~df_labels.index.duplicated()]
df_labels.label.value_counts()

label
potassium(1+)    56
calcium(2+)      42
Name: count, dtype: int64

## Feature creation

In [7]:
series_sequences = df_uniprot.loc[df_labels.index].sequence
print(series_sequences.shape[0])

98


In [8]:
from subpred.cdhit import cd_hit

cluster_representatives = cd_hit(series_sequences, identity_threshold=70)
print(cluster_representatives)
series_sequences = series_sequences.loc[cluster_representatives]
df_labels = df_labels.loc[cluster_representatives]

cd-hit: clustered 98 sequences into 63 clusters at threshold 70
['Q8L7Z0', 'Q8GXE6', 'O22920', 'Q2UVJ5', 'Q9S6Z8', 'A0A1P8B1L6', 'A0A1P8B951', 'Q9LKW9', 'A0A1P8BFX8', 'P92960', 'F4KDC7', 'Q39016', 'Q9LD18', 'Q9M7K4', 'O22397', 'Q9FY75', 'Q9LQL2', 'F4IIZ3', 'Q8RWN2', 'Q38898', 'Q9M8S6', 'Q8RWU6', 'Q68KI4', 'Q93VD3', 'Q9XFR0', 'Q9FE38', 'O65718', 'O80739', 'Q84TI7', 'Q9ZTZ7', 'A0A1P8B8U1', 'O22881', 'Q84WG1', 'O82226', 'Q38998', 'Q8LBL1', 'Q8GYU6', 'A0A178UK01', 'Q945S5', 'A0A1P8AYB6', 'A0A1I9LSS9', 'Q9SX28', 'A8MR98', 'Q9LEQ3', 'Q94KI8', 'O22218', 'Q8GXJ4', 'Q9FI43', 'P92939', 'F4HZU9', 'F4IGU5', 'Q9SY55', 'Q9FLS8', 'Q37145', 'Q39254', 'A0A1P8AP52', 'Q9LF79', 'Q93YT1', 'Q9T0H9', 'Q8L7E9', 'Q94AX5', 'Q9C8E7', 'O23087']


In [9]:
df_aac = calculate_aac(series_sequences)
df_paac = calculate_paac(series_sequences)

In [10]:
df_pssm_50_1 = calculate_pssm_feature(
    series_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_1it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_50_1"
)
df_pssm_50_3 = calculate_pssm_feature(
    series_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref50_3it",
    blast_db="../data/raw/uniref/uniref50/uniref50.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_50_3"
)
df_pssm_90_1 = calculate_pssm_feature(
    series_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_3it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=1,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_90_1"
)
df_pssm_90_3 = calculate_pssm_feature(
    series_sequences,
    tmp_folder="../data/intermediate/blast/pssm_uniref90_3it",
    blast_db="../data/raw/uniref/uniref90/uniref90.fasta",
    iterations=3,
    psiblast_threads=-1,
    verbose=True,
    feature_name="PSSM_90_3"
)

PSSM for accession Q8L7Z0 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q8GXE6 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession O22920 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q2UVJ5 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q9S6Z8 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8B1L6 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8B951 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession Q9LKW9 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession A0A1P8BFX8 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession P92960 was found in tmp folder ../data/intermediate/blast/pssm_uniref50_1it
PSSM for accession F4KDC7 was found in

In [11]:
# TODO combine features, separated by <type>__<feature> to use with custom transformer
# TODO test custom transformer method


df_features = pd.concat(
    [
        df_aac,
        df_paac,
        df_pssm_50_1,
        df_pssm_50_3,
        df_pssm_90_1,
        df_pssm_90_3,
    ], axis=1
)
df_features = df_features.loc[df_features.index.sort_values()]

df_labels = df_labels.loc[df_features.index]
df_features = df_features.loc[df_labels.index]
print(
    df_labels.shape, df_features.shape
)

(63, 1) (63, 2020)


In [12]:
from subpred.custom_transformers import FeatureCombinator

# TODO figure out if I should implement get_support instead.
fc = FeatureCombinator(feature_names=df_features.columns, feature_types="AAC")

# fc.fit_transform(df_features, df_labels) 

## ML pipeline

In [13]:
# TODO try removing worst sample according to percentages
# TODO feature selection, regularization
# TODO cd-hit

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report
from subpred.custom_transformers import FeatureCombinator

# converting data to numpy
label_encoder = LabelEncoder()
label_encoder.fit(sorted(df_labels.label.unique()))
sample_names = df_features.index.values
feature_names = df_features.columns.values
X = df_features.values
y = label_encoder.transform(df_labels.label)
# train test eval split
(
    X_train,
    X_eval,
    y_train,
    y_eval,
    sample_names_train,
    sample_names_eval,
) = train_test_split(X, y, sample_names, test_size=0.2, random_state=1, stratify=y)

# model
model = make_pipeline(
    StandardScaler(), SelectPercentile(), SVC(random_state=1, probability=True)
)
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", "auto"],
    "svc__class_weight": ["balanced", None],
    "selectpercentile__percentile": list(range(1, 101)),
}

# hyperparam optim & crossval
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    # verbose=20
)
gridsearch.fit(X_train, y_train)
print("Best train score:", gridsearch.best_score_)
print("Best train params", gridsearch.best_params_)
model_optim = gridsearch.best_estimator_

# eval
y_pred = model_optim.predict(X_eval)
print(classification_report(y_true=y_eval, y_pred=y_pred))
model_optim.predict_proba(X_eval)  # TODO compare with actual labels


Best train score: 0.8197668997668999
Best train params {'selectpercentile__percentile': 95, 'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale'}
              precision    recall  f1-score   support

           0       0.83      0.83      0.83         6
           1       0.86      0.86      0.86         7

    accuracy                           0.85        13
   macro avg       0.85      0.85      0.85        13
weighted avg       0.85      0.85      0.85        13



array([[0.42302103, 0.57697897],
       [0.56410701, 0.43589299],
       [0.72057401, 0.27942599],
       [0.31074607, 0.68925393],
       [0.1328623 , 0.8671377 ],
       [0.11531936, 0.88468064],
       [0.7979374 , 0.2020626 ],
       [0.30185713, 0.69814287],
       [0.71324271, 0.28675729],
       [0.14169745, 0.85830255],
       [0.82812382, 0.17187618],
       [0.38600574, 0.61399426],
       [0.5253791 , 0.4746209 ]])

In [None]:
feature_types = set([feature_name.split("__")[0] for feature_name in feature_names])

from itertools import combinations
from copy import deepcopy

feature_type_combinations = set()
for tuple_length in range(2,len(feature_types)+1):
    feature_type_combinations.update(set(combinations(feature_types, tuple_length)))

feature_type_combinations.update(feature_types)

{('AAC', 'PSSM_50_1'),
 ('AAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('AAC', 'PSSM_90_3'),
 ('PAAC', 'AAC'),
 ('PAAC', 'AAC', 'PSSM_50_1'),
 ('PAAC', 'AAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('PAAC', 'AAC', 'PSSM_90_3'),
 ('PAAC', 'PSSM_50_1'),
 ('PAAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('PAAC', 'PSSM_90_3'),
 ('PSSM_50_1', 'PSSM_90_3'),
 ('PSSM_50_3', 'AAC'),
 ('PSSM_50_3', 'AAC', 'PSSM_50_1'),
 ('PSSM_50_3', 'AAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('PSSM_50_3', 'AAC', 'PSSM_90_3'),
 ('PSSM_50_3', 'PAAC'),
 ('PSSM_50_3', 'PAAC', 'AAC'),
 ('PSSM_50_3', 'PAAC', 'AAC', 'PSSM_50_1'),
 ('PSSM_50_3', 'PAAC', 'AAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('PSSM_50_3', 'PAAC', 'AAC', 'PSSM_90_3'),
 ('PSSM_50_3', 'PAAC', 'PSSM_50_1'),
 ('PSSM_50_3', 'PAAC', 'PSSM_50_1', 'PSSM_90_3'),
 ('PSSM_50_3', 'PAAC', 'PSSM_90_3'),
 ('PSSM_50_3', 'PSSM_50_1'),
 ('PSSM_50_3', 'PSSM_50_1', 'PSSM_90_3'),
 ('PSSM_50_3', 'PSSM_90_1'),
 ('PSSM_50_3', 'PSSM_90_1', 'AAC'),
 ('PSSM_50_3', 'PSSM_90_1', 'AAC', 'PSSM_50_1'),
 ('PSSM_50_3', 'PSSM_90_1',

In [67]:
# TODO second pipeline with featurecombinator

## Comparisons

Compare training results with: 

- Average sequence similarity
- GO term similarity
  - How many protein in common?
  - Semantic similarity?
