# This is Step 3 in the Pipeline - Training ML Prediction Model
With this notebook we can train various ML classifiers to tackle multi-lable prediction problem. We are predicting Spec2Vec embeddings from molecular fingerprints.

### Imports

In [27]:
from sklearn.metrics import accuracy_score, f1_score, log_loss, precision_score, recall_score, jaccard_score, roc_auc_score, hamming_loss, label_ranking_loss, coverage_error
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import  ClassifierChain
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from mass_spectra.similarity_voting import SimilarityVoting
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import pickle
from random import shuffle, seed
from math import ceil
import os

### Parameters

In [28]:
RANDOM_STATE = 27082023
seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# path to merged fingerprint and embedding data (fingerprint columns should be prefixed with 'fingerprint_' and embedding columns should be prefixed with 'embedding_').
MERGED_PATH = './source/embedding/tms_maccs/merged.csv'
MODEL_OUTPUT_FOLDER = "./source/model/tms_maccs/"

In [29]:
assert os.path.isfile(MERGED_PATH)
assert os.path.isdir(MODEL_OUTPUT_FOLDER)
assert MERGED_PATH.endswith('.csv')

In [30]:
ESTIMATOR = LogisticRegression(random_state=RANDOM_STATE)

In [31]:
MODEL = OneVsRestClassifier(ESTIMATOR, n_jobs=-1)

In [32]:
MODEL_OUTPUT_FOLDER = f'{MODEL_OUTPUT_FOLDER}{MODEL.__class__.__name__}_{ESTIMATOR.__class__.__name__}'
os.makedirs(f'{MODEL_OUTPUT_FOLDER}/models', exist_ok=False)
os.makedirs(f'{MODEL_OUTPUT_FOLDER}/unseen_inchi_keys_models', exist_ok=False)

### Metrics Definition
Creates metrics which can be called with (y_true, y_prob, y_pred) for easier use. It also creates multiple combinations of metrics for different averaging methods.

In [33]:
Y_PRED_SCORES = [accuracy_score, log_loss, hamming_loss] # input y predictions and y true
Y_PRED_SCORES_WITH_AVERAGING = [f1_score, precision_score, recall_score, jaccard_score] # input y predictions and y true and use one of the following: "micro", "macro", "weighted", "samples"
Y_PROB_SCORES = [roc_auc_score, label_ranking_loss, coverage_error] # input y probabilities and y true

In [34]:
METRICS = []
METRIC_NAMES = []
for metric in Y_PRED_SCORES:
    METRICS.append(lambda y_true, y_prob, y_pred, metric=metric: metric(y_true, y_pred))
    METRIC_NAMES.append(metric.__name__)
for metric in Y_PRED_SCORES_WITH_AVERAGING:
    for average in ["micro", "macro", "weighted", "samples"]:
        zero_division = 0 if metric.__name__ == "jaccard_score" else np.nan
        METRICS.append(lambda y_true, y_prob, y_pred, metric=metric, average=average: metric(y_true, y_pred, average=average, zero_division=zero_division))
        METRIC_NAMES.append(metric.__name__ + "__" + average)
for metric in Y_PROB_SCORES:
    METRICS.append(lambda y_true, y_prob, y_pred, metric=metric: metric(y_true, y_prob))
    METRIC_NAMES.append(metric.__name__)

In [35]:
class Metrics:
    def __init__(self, metrics, metric_names, repeats=2, folds=5):
        self.metrics = metrics
        self.metric_names = metric_names
        
        self.repeats = repeats
        self.folds = folds
        self.i = 0

        self.results = pd.DataFrame(columns=['repeat', 'fold', 'model_training_data_path'] + self.metric_names)
    
    def evaluate(self, y_true, y_prob, y_pred, model_training_data_path=None):
        entry = {
            'repeat': self.i // self.folds,
            'fold': self.i % self.folds,
            'model_training_data_path': model_training_data_path
        }
        for metric, metric_name in zip(self.metrics, self.metric_names):
            try:
                entry[metric_name] = metric(y_true, y_prob, y_pred)
            except ValueError as e:
                print("Warning: ", e)
                entry[metric_name] = np.nan
        
        self.results = pd.concat([self.results, pd.DataFrame(entry, index=[0])], ignore_index=True)
        self.i += 1
    
    def store(self, filename):
        self.results.to_csv(filename, index=False)

### Load Data

In [36]:
merged_df = pd.read_csv(MERGED_PATH)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3025 entries, 0 to 3024
Columns: 467 entries, inchi_key to embedding_299
dtypes: float64(466), object(1)
memory usage: 10.8+ MB


In [37]:
f'Number of NaNs: {merged_df.isna().sum().sum()}' # should be 0

'Number of NaNs: 0'

In [38]:
X = merged_df.filter(regex='^embedding_')
y = merged_df.filter(regex='^fingerprint_')
X.shape, y.shape

((3025, 300), (3025, 166))

In [39]:
X = X.to_numpy()
y = y.to_numpy()

### Train- K-fold Cross Validation

In [40]:
REPEATS = 2
K = 5
metrics = Metrics(METRICS, METRIC_NAMES, REPEATS, K)

for i in tqdm(range(REPEATS), desc="Repeats"):
    kf = KFold(n_splits=K, shuffle=True, random_state=RANDOM_STATE + i)

    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X, y)), desc="Fold", total=K):
        # train
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MODEL.fit(X_train, y_train)

        # predict
        y_pred = MODEL.predict(X_test)
        y_prob = MODEL.predict_proba(X_test)

        # store train data
        model_training_data_path = f'{MODEL_OUTPUT_FOLDER}/models/{i}_{fold}.pkl'
        with open(model_training_data_path, "wb") as f:
            pickle.dump({
                "model": MODEL,
                "X_train": X_train,
                "y_train": y_train,
                "X_test": X_test,
                "y_test": y_test,
            }, f)

        # evaluate
        metrics.evaluate(y_test, y_prob, y_pred, model_training_data_path=model_training_data_path)
        
metrics.store(f'{MODEL_OUTPUT_FOLDER}/metrics.csv')

Repeats:   0%|          | 0/2 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]





















Fold:   0%|          | 0/5 [00:00<?, ?it/s]





















In [41]:
metrics.results.describe()

Unnamed: 0,accuracy_score,log_loss,hamming_loss,f1_score__micro,f1_score__macro,f1_score__weighted,f1_score__samples,precision_score__micro,precision_score__macro,precision_score__weighted,...,recall_score__macro,recall_score__weighted,recall_score__samples,jaccard_score__micro,jaccard_score__macro,jaccard_score__weighted,jaccard_score__samples,roc_auc_score,label_ranking_loss,coverage_error
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0
mean,0.060496,273.823179,0.047163,0.886091,0.569981,0.8856,0.887575,0.893221,0.581024,0.894957,...,0.566313,0.879092,0.888414,0.795508,0.478783,0.820357,0.809726,,0.020618,59.741983
std,0.008649,10.952253,0.002151,0.004672,0.007729,0.005202,0.004048,0.003414,0.011741,0.003476,...,0.012845,0.006948,0.005103,0.007515,0.008929,0.006996,0.006062,,0.001522,1.614533
min,0.044628,258.728597,0.043861,0.876722,0.557985,0.875114,0.880635,0.888146,0.560778,0.88812,...,0.538535,0.865589,0.879676,0.780503,0.463335,0.807065,0.7996,,0.018639,56.704132
25%,0.056612,266.456173,0.045835,0.883493,0.563243,0.882572,0.885157,0.891211,0.572672,0.893247,...,0.561979,0.875198,0.885079,0.791301,0.472386,0.816186,0.806053,,0.019547,59.19876
50%,0.060331,274.842925,0.046754,0.886685,0.573175,0.885612,0.887811,0.893128,0.583809,0.894849,...,0.566068,0.878853,0.888429,0.796436,0.48258,0.820623,0.809918,,0.020482,59.727273
75%,0.066942,282.187079,0.048755,0.8892,0.574847,0.88964,0.889599,0.894935,0.587085,0.897842,...,0.571242,0.885018,0.891925,0.800504,0.483973,0.825789,0.812804,,0.021571,60.830579
max,0.072727,291.408865,0.050961,0.892106,0.580663,0.891919,0.89326,0.898244,0.599626,0.899495,...,0.584225,0.888018,0.896502,0.805226,0.491301,0.829325,0.818455,,0.023544,62.34876


### Train With Unseen InChI Keys

In [42]:
def split_dataset(X, y, test_inchi_keys=[]):
    # get index from merged_df
    test_index = merged_df[merged_df['inchi_key'].isin(test_inchi_keys)].index
    train_index = merged_df[~merged_df['inchi_key'].isin(test_inchi_keys)].index

    # split X and y
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    return X_train, X_test, y_train, y_test

In [43]:
all_inchi_keys = list(merged_df['inchi_key'].unique())
shuffle(all_inchi_keys)

In [44]:
hidden_inchi_keys = 10

REPEATS = 1
K = ceil(len(all_inchi_keys) / hidden_inchi_keys)
metrics = Metrics(METRICS, METRIC_NAMES, REPEATS, K)

for i in tqdm(range(REPEATS), desc="Repeats"):
    # Reshuffle
    shuffle(all_inchi_keys)

    for end_i in tqdm(range(hidden_inchi_keys, len(all_inchi_keys), hidden_inchi_keys), desc="Fold", total=K):
        start_i = end_i - hidden_inchi_keys
        if end_i + hidden_inchi_keys > len(all_inchi_keys):
            end_i = len(all_inchi_keys)

        # train
        test_inchi_keys = all_inchi_keys[start_i:end_i]
        X_train, X_test, y_train, y_test = split_dataset(X, y, test_inchi_keys)

        MODEL.fit(X_train, y_train)

        # predict
        y_pred = MODEL.predict(X_test)
        y_prob = MODEL.predict_proba(X_test)

        # store train data
        model_training_data_path = f'{MODEL_OUTPUT_FOLDER}/unseen_inchi_keys_models/{start_i}_{end_i}.pkl'
        with open(model_training_data_path, "wb") as f:
            pickle.dump({
                "model": MODEL,
                "X_train": X_train,
                "y_train": y_train,
                "X_test": X_test,
                "y_test": y_test,
            }, f)

        # evaluate
        metrics.evaluate(y_test, y_prob, y_pred, model_training_data_path=model_training_data_path)

metrics.store(f'{MODEL_OUTPUT_FOLDER}/unseen_inchi_keys_metrics.csv')

Repeats:   0%|          | 0/1 [00:00<?, ?it/s]

Fold:   0%|          | 0/11 [00:00<?, ?it/s]







































In [45]:
metrics.results.describe()

Unnamed: 0,accuracy_score,log_loss,hamming_loss,f1_score__micro,f1_score__macro,f1_score__weighted,f1_score__samples,precision_score__micro,precision_score__macro,precision_score__weighted,...,recall_score__macro,recall_score__weighted,recall_score__samples,jaccard_score__micro,jaccard_score__macro,jaccard_score__weighted,jaccard_score__samples,roc_auc_score,label_ranking_loss,coverage_error
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0
mean,0.01113,424.566557,0.092088,0.775859,0.25889,0.773875,0.782796,0.793424,0.288535,0.816057,...,0.257364,0.761034,0.78272,0.634953,0.215581,0.70803,0.661918,,0.060978,86.985411
std,0.016413,98.875142,0.017652,0.03453,0.035569,0.042875,0.031595,0.042661,0.050794,0.0409,...,0.033516,0.048066,0.044381,0.045504,0.026899,0.051132,0.040484,,0.017093,12.870083
min,0.0,319.35694,0.069487,0.716793,0.192055,0.701357,0.729632,0.713917,0.206776,0.737883,...,0.185742,0.695205,0.718066,0.558595,0.171804,0.629265,0.596405,,0.037725,68.509554
25%,0.000814,352.628408,0.082038,0.759725,0.240218,0.740072,0.765759,0.770019,0.251891,0.804088,...,0.244163,0.713567,0.744621,0.612625,0.199911,0.664556,0.637618,,0.050587,76.458898
50%,0.006109,374.26365,0.087952,0.784315,0.26113,0.784945,0.789165,0.796566,0.291311,0.813472,...,0.259274,0.775593,0.796822,0.645179,0.213979,0.714587,0.670641,,0.059758,86.720199
75%,0.0122,506.283899,0.102982,0.791712,0.272094,0.804507,0.793287,0.815267,0.307515,0.844446,...,0.265805,0.803753,0.818344,0.655235,0.222074,0.750225,0.672976,,0.066355,96.86568
max,0.05414,598.977735,0.118474,0.819505,0.318622,0.829669,0.82683,0.864236,0.379419,0.867883,...,0.312009,0.815877,0.839398,0.694205,0.267533,0.777955,0.719308,,0.08978,108.083333
