In [6]:
import os

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Dataset

In [7]:
# Load the augmented dataset, using the `id` column as the row index.
df = pd.read_csv('data/augmented_dataset.csv', index_col='id')

# The label is stored in the `class` column; every other column is a feature.
target = 'class'
features = [column for column in df.columns if column != target]

# Split the frame into the feature matrix and the label vector.
X = df.loc[:, features]
y = df.loc[:, target]

In [8]:
# Preview the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6cjp,4.404118,1.005294,1.486471,1.531176,0.7,0.777059,0.663529,0.409471,84.794118,45.235294,...,11.005714,5.52646,5.52646,3.745058,-3.78,23.380599,10.436197,5.962061,7.176604,1
3p5l,4.313,0.828,1.137,1.127,0.633,0.851,0.603,0.4004,87.09,53.0,...,5.62519,3.427906,3.427906,2.301945,-1.05,12.408054,5.071992,2.45606,3.701974,1
1utj,4.362727,0.625455,0.891818,0.800909,0.390909,0.679091,0.838182,0.457545,66.718182,24.909091,...,2.764237,1.1315,1.1315,0.644677,-0.82,5.319276,2.478229,1.073982,1.647798,1
2zif,4.3925,1.061875,1.035,1.114375,0.524375,0.740625,0.70375,0.436688,77.7375,20.5,...,8.482719,4.698579,4.698579,3.064262,-1.99,19.762817,7.94124,3.987346,5.812639,1
1qxl,4.428,0.931333,1.318,1.330667,0.643333,0.789333,0.738,0.412667,84.586667,44.866667,...,11.900591,6.209351,6.209351,4.22592,-3.91,24.222061,11.482918,6.361879,7.946855,1


# Training

## Configurations

All the configurations have the form:

$$ \text{scaler} \mapsto \text{reducer} \mapsto \text{classifier}$$

In [9]:
# Pipeline skeleton shared by every candidate: scale, reduce with PCA, classify.
# The last step is only a placeholder at this point.
steps = [
    ('scaler', StandardScaler()),
    ('reducer', PCA(random_state=1)),
    ('classifier', 'passthrough'),  # Replaced by the estimator chosen in param_grid.
]

# Numbers of principal components tried for both classifier families.
pca_components = [20, 50, 100, 300]

# Random-forest candidates.
random_forest_grid = {
    'reducer__n_components': pca_components,
    'classifier': [RandomForestClassifier(random_state=1)],
    'classifier__n_estimators': [100, 200, 500],
    'classifier__min_samples_split': [2, 5, 10],
}

# XGBoost candidates.
xgboost_grid = {
    'reducer__n_components': pca_components,
    'classifier': [XGBClassifier(random_state=1)],
    'classifier__n_estimators': [50, 100, 500, 1000, 2000],
    'classifier__learning_rate': [0.1, 0.5, 1.],
    'classifier__max_depth': [2, 4, 6, 8],
}

# One sub-grid per classifier family; each is expanded independently.
param_grid = [random_forest_grid, xgboost_grid]

## CV configuration

In [10]:
# Assemble the scaler => reducer => classifier pipeline from the configured steps.
pipeline = Pipeline(steps=steps)

# Split the data into train (90%) and test sets; the split is shuffled,
# stratified on the labels, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    shuffle=True,
    stratify=y,
    random_state=1,
)

# Hyperparameters are tuned with shuffled, stratified 5-fold cross-validation.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Exhaustive search over param_grid using ROC AUC as the validation metric.
# refit=True retrains the best configuration on all the data given to fit().
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=kf,
    refit=True,
    n_jobs=-1,
)

## Tuning hyperparameters and refitting on the whole training set

In [None]:
# Run the grid search on the training split only; X_test / y_test are not used here.
grid.fit(X=X_train, y=y_train)

# Save the best model and the grid

In [None]:
# Pipeline refitted with the best-scoring hyperparameters (available because
# the search was created with refit=True).
best_model = grid.best_estimator_
# Bare expression so the notebook displays the selected pipeline.
best_model

In [None]:
# Make sure the output directory exists before writing: joblib.dump does not
# create missing parent directories, and failing here would happen only after
# the whole grid search has already been run.
os.makedirs('saved_models', exist_ok=True)

# Persist the refitted best pipeline and the full search object (which also
# holds the cross-validation results) as separate joblib files.
dump(best_model, 'saved_models/best_model.joblib')
dump(grid, 'saved_models/gridsearch_cv.joblib')