# Description

In delgado et al., the best performing models were Linear kernel SVM and MLP for both binary and quaternary classification

Performed grid hyperparameter search
and 5-fold cross validation

Classical ML Experiments I want to do:


Bayesian Optimize all

Binary classification:
- diestrus vs rest
- pro-est vs met-die

Quaternary classification

LinearSVC

MLP, architectural search

W/out GAN augmented, with GAN Augmented

## Caution:
We should only use GAN data to augment train, not on validaiton set.
What if we train on GAN data but then don't have GAN data in our validation. How will it perform?

# Imports

In [1]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import sklearn
import pandas as pd
import os
import errno
import glob
from tqdm import tqdm_notebook as tqdm
from time import gmtime, strftime

from skopt.space import Real, Integer
from skopt.utils import use_named_args



# Setup output directories

In [2]:
root = "../experiments/delgado/"

In [3]:
EXPERIMENT_NAME = strftime("%Y-%m-%d_%H-%M_%S", gmtime())
OUT_DIR = root + EXPERIMENT_NAME

x = input("Are you sure you want to make a new experiment at " + OUT_DIR + "? (y/N): ")
if x.lower().strip() != "y":
    assert False, "User chose not to make a new experiment."

try:
    os.makedirs(OUT_DIR)
except OSError as exc:
    if exc.errno == errno.EEXIST:
        c = input(f"The path {OUT_DIR} already exists. Are you sure you want to continue? (y/N): ")
        if c.lower().strip() != "y":
            assert False, "User chose not to continue"
    else:
        raise

Are you sure you want to make a new experiment at ../experiments/delgado/2019-07-19_05-48_28? (y/N): y


# Load data

In [4]:
DATA_DIR = "../data/gan_augmented"
train = os.path.join(DATA_DIR, "train")
train

('../data/gan_augmented/train', '../data/gan_augmented/test')

In [5]:
df = pd.read_csv(DATA_DIR + "/train.csv")
len(df)

32640

In [6]:
no_augmentation = df[df["file"].str.startswith("gan_")==False]
len(no_augmentation)

14376

In [7]:
DATASETS = {
    "BASE": no_augmentation,
    "GAN_AUGMENTED": df,
}

# Define search space

In [8]:
# Define desired models and search spaces. Add more if desired

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

MAX_N_HIDDEN = 5

DESIRED_MODELS = {
    "LinearSVC": {
        "model": SVC,
        "space": [
            Real(10, 10**6, prior="log-uniform", name="C")
        ]
    },
    "MLP": {
        "model": MLPClassifier,
        "space": [
            Real(0.001, 1.0, prior="log-uniform", name="learning_rate_init"),
        ] + [Integer(0, 150, name=f"n_layer_{i}") for i in range(MAX_N_HIDDEN)] # add on hidden
    }
    
}

# Choose class groupings

In [9]:
def regroup_classes(y, *classes):
    """
    Regroup class labels
    
    Args
        y: a series
        *classes: tuple grouping classes together. E.g. ("proestrus", "estrus")
    """
    for group in classes:
        if type(group) is tuple or type(group) is list:
            combined_name = "_".join(group)
            for g in group:
                y = y.replace(g, combined_name)
    return y

In [10]:
DESIRED_CLASS_GROUPS = {
    "4-way": ("proestrus", "estrus", "metestrus", "diestrus"),     # 4-way
    "binary_pe-v-md": (("proestrus", "estrus"), ("metestrus", "diestrus")), # binary v1
    "binary_pem-v-d": (("proestrus", "estrus", "metestrus"), ("diestrus")), # binary v2
}

# Define Optimization Loop

In [11]:
def replace_mlp_params(params):
    # create hidden layer if greater than zero
    hidden_layer_sizes = [
        params[f"n_layer_{i}"] for i in range(MAX_N_HIDDEN) 
        if params[f"n_layer_{i}"] > 0
    ]
    params["hidden_layer_sizes"] = hidden_layer_sizes
    # get rid of the dummy params
    [params.pop(f"n_layer_{i}") for i in range(MAX_N_HIDDEN)]
    return params
    

In [12]:
from skopt import gp_minimize
from sklearn.model_selection import cross_val_score

NUM_POINTS = 25

def run_optimization(
    desired_models=DESIRED_MODELS, 
    desired_datasets=DATASETS, 
    desired_class_groups=DESIRED_CLASS_GROUPS
):
    for mname, model_params in tqdm(desired_models.items()):
        for dname, data in tqdm(desired_datasets.items(), leave=False):
            for gname, class_group in tqdm(desired_class_groups.items(), leave=False):
                print(mname, dname, gname)

                time_start = gmtime()

                X = data.drop(columns=["file", "label", "patch_num"])
                y = data["label"]
                y = regroup_classes(y, *class_group)

                space = model_params["space"]

                @use_named_args(space)
                def objective(**params):
                    if mname is "MLP":
                       params = replace_mlp_params(params)

                    model = model_params["model"](**params)
                    score = -np.mean(cross_val_score(model, X, y, cv=5, n_jobs=-1))
                    return score

                results_gp = gp_minimize(objective, space, n_calls=NUM_POINTS, random_state=0, verbose=True)

                time_end = gmtime()

                save_results(mname, dname, gname, space, results_gp, time_start, time_end)

In [13]:
import json
from skopt.plots import plot_convergence

format_time = lambda time: strftime("%Y-%m-%d_%H-%M_%S", time)

def save_results(model_name, data_name, gname, space, results_gp, time_start, time_end):
    fname = "_".join((gname, data_name, model_name))
    
    # pickle the result itself
    skopt.dump(results_gp, fname + ".pkl")
    
    # save run info
    info = {
        "time_start": format_time(time_start),
        "time_end": format_time(time_end),
        "duration": format_time(time_end - time_start),
        "model": model_name,
        "dataset": data_name,
        "class_group": gname,
        "space": space,
        "best_validation": -results_gp.fun # add negative because this gp minimizes, flip back to positive
    }
    best_parameters = {space[i].name: value for i, value in enumerate(results_gp.x)}
    info["best_parameters"] = best_parameters
    with open(os.path.join(OUT_DIR, fname), "w") as f:
        json.dump(info, f + ".json")
    
    # save convergence plot
    plot_path = os.path.join(OUT_DIR, fname + "_plot.png")
    plot_convergence(results_gp).savefig(plot_path)

# Run

In [14]:
run_optimization()

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

LinearSVC BASE 4-way
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 91.4654
Function value obtained: -0.5676
Current minimum: -0.5676
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 96.0004
Function value obtained: -0.5676
Current minimum: -0.5676
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 98.8383
Function value obtained: -0.5676
Current minimum: -0.5676
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 93.9874
Function value obtained: -0.5676
Current minimum: -0.5676
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 100.9579
Function value obtained: -0.5676
Current minimum: -0.567

KeyboardInterrupt: 