# Supervised Learning Models

In [1]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [7]:
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import (
    RandomUnderSampler,
    EditedNearestNeighbours
)
from imblearn.combine import SMOTEENN
from train import grid_search
from utils import get_data
import config

In [3]:
# Read 'magic04.data' from the configured dataset root, labelling the columns
# with config.FEATURE_NAMES. The four values returned by get_data are unpacked
# into train/test features and targets.
# NOTE(review): split ratio and shuffling are decided inside get_data — confirm there.
X_train, X_test, y_train, y_test = get_data(
    os.path.join(config.DATASET_ROOT, 'magic04.data'),
    columns=config.FEATURE_NAMES,
)

# TODO: Add trivial and baseline classifiers

In [4]:
# Candidate model classes (not instances) handed one by one to grid_search
# in the cells below.
# NOTE(review): LogisticRegression is imported in this notebook but is not
# listed here — confirm whether that is intentional.
estimators = [
    RandomForestClassifier,
    MLPClassifier,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    SVC,
]

## Fit estimators without any oversampling/undersampling

In [5]:
# Baseline run: grid-search every candidate on the training split as-is,
# with no resampling step.
# NOTE(review): save=True presumably persists each search result — confirm in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
    )

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=10; total time=   0.4s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=10; total time=   0.4s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=10; total time=   0.4s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=10; total time=   0.4s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=10; total time=   0.4s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=20; total time=   0.6s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=20; total time=   0.6s
[CV] END estimator__criterion=gini, estimator__min_samples_split=10, estimator__n_estimators=20; total time=   0.6s
[CV] END e

KeyboardInterrupt: 

## Fit estimators with oversampling techniques
* SMOTE
* ADASYN

In [None]:
# Repeat the grid search for every candidate, this time with SMOTE as the sampler.
# The SMOTE class itself (not an instance) is passed; grid_search is presumably
# responsible for constructing it — confirm in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
        sampler=SMOTE,
    )

In [None]:
# Repeat the grid search for every candidate, this time with ADASYN as the sampler.
# The ADASYN class itself (not an instance) is passed; grid_search is presumably
# responsible for constructing it — confirm in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
        sampler=ADASYN,
    )

## Fit estimators with undersampling techniques
* RandomUnderSampler
* EditedNearestNeighbours

In [None]:
# Repeat the grid search for every candidate, this time with RandomUnderSampler
# as the sampler. The class itself (not an instance) is passed; grid_search is
# presumably responsible for constructing it — confirm in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
        sampler=RandomUnderSampler,
    )

In [None]:
# Repeat the grid search for every candidate, this time with
# EditedNearestNeighbours as the sampler. The class itself (not an instance) is
# passed; grid_search is presumably responsible for constructing it — confirm
# in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
        sampler=EditedNearestNeighbours,
    )

## Fit estimators with a combination of oversampling and undersampling
* SMOTEENN

In [None]:
# Repeat the grid search for every candidate, this time with SMOTEENN (the
# combined over-/under-sampling method imported from imblearn.combine) as the
# sampler. The class itself (not an instance) is passed; grid_search is
# presumably responsible for constructing it — confirm in train.grid_search.
for estimator in estimators:
    grid_search(
        estimator,
        X_train,
        y_train,
        save=True,
        sampler=SMOTEENN,
    )