In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Shared random number generator with a fixed seed (1), so that a fresh
# kernel run draws the same random sequence every time.
rng = np.random.RandomState(seed=1)

# Dataset

In [3]:
# Load the augmented dataset; the 'id' column is used as the row index.
df = pd.read_csv('data/augmented_dataset.csv', index_col='id')

# The label is stored in the 'class' column; every other column is a feature.
target = 'class'
features = [column for column in df.columns if column != target]

# Feature matrix and label vector.
X = df.loc[:, features]
y = df.loc[:, target]

In [4]:
# Preview the first three rows of the loaded frame.
df.head(n=3)

Unnamed: 0_level_0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10gs,-0.117565,0.164055,-0.386466,-0.524996,-0.655534,-0.31002,0.380819,0.619936,0.361497,-0.406576,...,-0.00139,-0.155634,-0.155634,-0.29124,-0.271137,0.118107,0.286221,0.33583,0.278359,1
11gs,-0.102087,0.072364,-0.433112,-0.522733,-1.105537,-0.766021,0.94046,1.220417,0.111012,-0.628022,...,0.183152,-0.016953,-0.016953,-0.23294,-0.122073,0.596794,0.680634,0.736305,0.819202,1
13gs,-1.386729,2.196977,0.058786,0.142759,-0.968929,1.071801,-0.915684,0.522422,0.520896,1.053327,...,-0.365395,-0.512734,-0.512734,-0.596358,-0.479828,-0.291246,-0.312462,-0.317647,-0.368448,1


# Training

## Configurations

All the configurations have the form:

$$ \text{scaler} \mapsto \text{reducer} \mapsto \text{classifier}$$

In [5]:
# Candidate PCA dimensionalities; every classifier family is tried with each.
pca_components = [20, 50, 100, 200, 300]

# Pipeline skeleton. The DummyClassifier is only a placeholder: each sub-grid
# below swaps the 'classifier' step for a concrete estimator.
steps = [
    ('scaler', StandardScaler()),
    ('reducer', PCA(random_state=rng)),
    ('classifier', DummyClassifier()),
]

# One sub-grid per classifier family; GridSearchCV explores each of them.
param_grid = [
    # Random forest.
    {
        'reducer__n_components': pca_components,
        'classifier': [RandomForestClassifier(random_state=rng)],
        'classifier__n_estimators': [100, 200, 300, 500],
        'classifier__min_samples_split': [2, 5, 10],
    },
    # Support vector machine with an RBF kernel.
    {
        'reducer__n_components': pca_components,
        'classifier': [SVC()],
        'classifier__kernel': ['rbf'],
        'classifier__C': [0.1, 1, 5, 10, 20],
        'classifier__gamma': [0.1, 1, 5, 10],
    },
    # Gradient-boosted trees (XGBoost).
    {
        'reducer__n_components': pca_components,
        'classifier': [XGBClassifier(random_state=rng)],
        'classifier__n_estimators': [50, 100, 200, 500, 1000, 2000],
        'classifier__learning_rate': [0.1, 0.5, 1.0],
        'classifier__max_depth': [2, 4, 6, 8],
    },
]

## CV configuration

In [6]:
# Assemble the scaler => reducer => classifier pipeline.
pipeline = Pipeline(steps=steps)

# Hold out 10% of the data as a test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    shuffle=True,
    stratify=y,
    random_state=1,
)

# Stratified, shuffled 5-fold cross-validation for hyperparameter tuning.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Exhaustive search scored by ROC AUC, parallelised over all cores.
# refit=True retrains the best configuration on the full training split
# once the search has finished.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=kf,
    refit=True,
    n_jobs=-1,
)

## Tuning hyperparameters and refitting on the whole training set

In [None]:
# Run the full grid search on the training split.
grid.fit(X=X_train, y=y_train)

In [None]:
# Tabulate the cross-validation results with the best-ranked configurations
# first (rank 1 = highest mean validation AUC). All rows are kept; sorting only
# ensures that the rows pandas shows at the top are the most informative ones.
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')