In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell is executed, so edits to library code on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [18]:
# --------------------
# Import general stuff
# --------------------
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.pipeline import make_pipeline

# ----------------------------------
# Import Retention Order SVM classes
# ----------------------------------
from rosvm.ranksvm.rank_svm_cls import KernelRankSVC
from rosvm.ranksvm.analysis_utils import RankSVMAnalyzer
from rosvm.ranksvm.tutorial.utils import read_dataset
from rosvm.feature_extraction.featurizer_cls import CircularFPFeaturizer

## Load Example Data

We work with the five datasets used in the ECCB 2018 publication. Those include retention times measured on different chromatographic systems.

In [4]:
# The first return value holds the features shipped with the file; it is
# discarded because the features are calculated later in this notebook.
_, y, mol = read_dataset("./ECCB2018_data.csv")
datasets = y.get_unique_dss()

# Short summary of what was loaded
print("Number of Measurements:", len(y))
print("Datasets:", datasets)

Number of Measurements: 1081
Datasets: ['Eawag_XBridgeC18', 'FEM_long', 'LIFE_old', 'RIKEN', 'UFZ_Phenomenex']


## Build Sklearn Pipeline

We use [sklearn pipelines](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline) to streamline the calculation of the molecule features (here FCFP fingerprints) and the ROSVM model training with hyperparameter estimation.

#### First: Get the components of our pipeline

In [52]:
# Featurizer that calculates FCFP fingerprints from the given molecule
# representations (SMILES).
#
# With "only_freq_subs" enabled, the final fingerprint vector only contains
# substructures that appear often in the training set.
fprinter = CircularFPFeaturizer(
    fp_type="FCFP",
    radius=2,
    only_freq_subs=True,
    min_subs_freq=0.01,  # presumably the threshold for "often" -- TODO confirm in the featurizer docs
    output_dense_matrix=True,
)

# RankSVM model: "minmax" kernel and a fixed random seed, everything else is
# left at its default value.
ranksvm = KernelRankSVC(kernel="minmax", random_state=102)

#### Second: We build the sklearn pipeline

In [53]:
# Chain the featurizer and the RankSVM into one estimator. make_pipeline names
# each step after its lowercased class name, which is why the grid search
# addresses the SVM parameters with the "kernelranksvc__" prefix.
pipeline = make_pipeline(fprinter, ranksvm)

#### Third: Set up the sklearn GridSearch 

In [54]:
# Candidate values for the parameter C of the pipeline's RankSVM step
param_grid = {"kernelranksvc__C": [1, 2, 4]}

# Group-aware 3-fold splitter used for the model selection
inner_cv = GroupKFold(n_splits=3)

# Exhaustive search over the grid; runs in a single job
gridsearchcv = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, n_jobs=1)

## Optimize the Model (pipeline)

In [55]:
# Nested cross-validation: the outer 5-fold GroupKFold provides the held-out
# test sets, and on each outer training set the grid search selects C using its
# own GroupKFold. The molecules are passed as groups, so the same group never
# ends up on both sides of a split.
test_score = []
for train, test in GroupKFold(n_splits=5).split(y, groups=mol):
    # NOTE(review): this rebinds `ranksvm` (the KernelRankSVC created earlier) to
    # the fitted GridSearchCV object. The binding is kept for compatibility;
    # re-running the pipeline cell afterwards would pick up the wrong object.
    ranksvm = gridsearchcv.fit(mol[train], y[train], groups=mol[train])

    # Score of the refitted best model on the held-out outer fold
    fold_score = ranksvm.score(mol[test], y[test])
    test_score.append(fold_score)  # keep the per-fold scores; the list was never filled before
    print(fold_score)

0.8743825034247346
0.8372415687727628
0.8457403726608881
0.851810942645446
0.8487800861354519
