In [4]:
"""
The intent of this notebook is model selection and 
evaluation for the MVP of our brainNN classifier.
"""
import sys
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tornado import gen
from tornado.ioloop import IOLoop
import aimetrics as aim
import aimetrics.metrics as aim_metrics
import seaborn as sns
%matplotlib inline

In [5]:
# Class labels handed to the metrics helper, and the shared store that the
# coroutines in later cells fill with per-model results.
labels = ['small_drone', 'person']
metrics = {}

# Read the four pre-split CSV files; the first column of each is the index.
X_trn_val, y_trn_val, X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'output/bnn-mvp/X_trn_val.csv',
        'output/bnn-mvp/y_trn_val.csv',
        'output/bnn-mvp/X_test.csv',
        'output/bnn-mvp/y_test.csv',
    )
)

In [6]:
from sklearn.cross_validation import  StratifiedKFold

# NOTE(review): the `sklearn.cross_validation` module no longer exists in
# current scikit-learn releases (superseded by `sklearn.model_selection`,
# whose StratifiedKFold has a different call signature) -- confirm the
# pinned environment before re-running.

# Five folds built from the 'small_drone' label column.
skf = StratifiedKFold(y_trn_val['small_drone'], n_folds=5)



@gen.coroutine
def get_default_metrics():
    """Collect per-fold metrics for the 'bnn' classifier with default parameters.

    For every (train, validation) index pair produced by the module-level
    ``skf``, ``aim_metrics.remote_classifier_metrics`` is called against
    'http://localhost:3002/' with the matching slices of ``X_trn_val`` and
    ``y_trn_val``.  All calls are made first and then yielded together as one
    list; whatever the yield resolves to is stored in ``metrics['default']``.
    """
    folds = list(skf)  # materialised so the folds can be inspected when debugging
    pending = []
    for train_idx, valid_idx in folds:
        pending.append(
            aim_metrics.remote_classifier_metrics(
                'http://localhost:3002/',
                'bnn',
                X_trn_val.iloc[train_idx].values,
                y_trn_val.iloc[train_idx].values,
                X_trn_val.iloc[valid_idx].values,
                y_trn_val.iloc[valid_idx].values,
                labels
            )
        )
    metrics['default'] = yield pending


# Queue the coroutine on the IOLoop singleton instead of calling it directly.
IOLoop.instance().add_callback(get_default_metrics)

In [None]:
# Hyper-parameter grid for the search below.
# row_range starts at 0, which produces an empty 'hiddenLayers' list in
# get_param_metrics ([ncols] * 0).
row_range = range(0, 6)                  # number of hidden layers
col_range = range(2,7)                   # units per hidden layer
rate_range = np.arange(0.1, 0.8, 0.1)    # candidate learning rates
n_folds = 5                              # folds evaluated per parameter set

# One model is trained per (rows, cols, rate, fold) combination.
# The original line was missing the closing parenthesis of print(...),
# which made the whole cell a SyntaxError.
print("TESTING %d MODELS" % (len(row_range)*len(col_range)*len(rate_range)*n_folds))

@gen.coroutine
def get_param_grid_metrics():
    """Run ``get_param_metrics`` for every combination in the parameter grid.

    Combinations are generated with ``row_range`` as the outermost loop,
    then ``col_range``, then ``rate_range``.  Every call is made before the
    single ``yield``, so the whole list is awaited at once; a completion
    message is printed afterwards.
    """
    searches = []
    for nrows in row_range:
        for ncols in col_range:
            for rate in rate_range:
                searches.append(get_param_metrics(nrows, ncols, learning_rate=rate))
    yield searches
    print("PARAMETER SEARCH COMPLETE")
    
                
                
@gen.coroutine
def get_param_metrics(n_hidden_rows, ncols, learning_rate=None):
    """Get the cross-validated metrics for one parameter set (grid topology).

    Parameters
    ----------
    n_hidden_rows : int
        Number of hidden layers; 0 gives an empty ``hiddenLayers`` list.
    ncols : int
        Number of units in each hidden layer.
    learning_rate : float, optional
        Sent to the classifier as ``learningRate``.  When ``None`` the key is
        left out of the model parameters entirely.

    The per-fold results are stored in the module-level ``metrics`` dict
    under the key ``"<n_hidden_rows>x<ncols>x<learning_rate>"`` and a
    completion line is printed.
    """
    skf = StratifiedKFold(y_trn_val['small_drone'],  n_folds)
    params = {'hiddenLayers': [ncols] * n_hidden_rows}
    # Test against None explicitly: a plain truthiness check would silently
    # discard an explicitly requested rate of 0 / 0.0.
    if learning_rate is not None:
        params['learningRate'] = learning_rate
    key = 'x'.join((str(n_hidden_rows), str(ncols), str(learning_rate)))
    # Every fold's request is created before the yield, so the folds are
    # awaited together as one list.
    metrics[key] = yield [
        aim_metrics.remote_classifier_metrics(
            'http://localhost:3002/', 
            'bnn',
            X_trn_val.iloc[trn_ind].values, 
            y_trn_val.iloc[trn_ind].values,
            X_trn_val.iloc[val_ind].values,
            y_trn_val.iloc[val_ind].values,
            labels,
            model_params = params,
        ) 
        for trn_ind, val_ind in skf
    ]
    print("%s Complete" % key)

# Queue the full grid search on the IOLoop singleton.
IOLoop.instance().add_callback(get_param_grid_metrics)

In [8]:
def get_score_report(key, score, agg):
    """Return one summary statistic of a score across the folds of a model.

    ``metrics[key]`` is iterated and ``score`` is looked up in each entry;
    the collected values are summarised with ``pandas.Series.describe`` and
    the entry named ``agg`` (e.g. 'mean', 'std') is returned.
    """
    per_fold = [fold_result[score] for fold_result in metrics[key]]
    summary = pd.Series(per_fold).describe()
    return summary[agg]

In [13]:
scores = ['roc_auc', 'acc', 'f1_score']
aggs = ['mean', 'std']
# One column per (score, aggregate) pair, named "<score>_<agg>";
# one row per key currently present in `metrics`.
report = pd.DataFrame({
        score + "_" + agg: pd.Series({
                key: get_score_report(key, score, agg) for key in metrics.keys()
        }) for score in scores for agg in aggs
})
# DataFrame.sort() has been removed from pandas; sort_values() is its
# replacement and returns a new, sorted frame (best mean ROC AUC first).
report.sort_values("roc_auc_mean", ascending=False)




Unnamed: 0,acc_mean,acc_std,f1_score_mean,f1_score_std,roc_auc_mean,roc_auc_std
1x4x0.7,0.720635,5.678903e-02,0.693801,5.954350e-02,0.775120,0.026029
1x6x0.6,0.717460,6.085806e-02,0.691295,6.203974e-02,0.774402,0.024508
1x5x0.7,0.717460,6.085806e-02,0.689795,6.313095e-02,0.773923,0.024962
1x5x0.6,0.717460,6.085806e-02,0.691295,6.203974e-02,0.772727,0.027629
1x4x0.6,0.717460,6.085806e-02,0.691295,6.203974e-02,0.772488,0.029170
1x2x0.6,0.720635,6.106471e-02,0.697888,5.774796e-02,0.772488,0.027699
1x4x0.5,0.717460,6.085806e-02,0.691295,6.203974e-02,0.772488,0.026199
1x6x0.7,0.714286,5.611959e-02,0.687389,5.499315e-02,0.772249,0.028653
2x6x0.7,0.720635,4.840619e-02,0.693770,5.010105e-02,0.771770,0.027621
1x5x0.5,0.717460,6.085806e-02,0.691295,6.203974e-02,0.771770,0.028198


In [14]:
import json

# NOTE(review): these paths start with "../output/" while the data-loading
# cell reads from "output/" -- confirm which working directory is intended.

# Raw per-fold metrics for every parameter set, as JSON.
with open("../output/bnn-mvp/param_metrics.json", 'w') as metrics_file:
    json.dump(metrics, fp=metrics_file)

# Aggregated mean/std table, as CSV.
report.to_csv("../output/bnn-mvp/param_metrics_summary.csv")

In [11]:
"""
Conclusion:

The 1-layer NNs almost categorically outperformed the others
in roc_auc. Of those, 1x4x0.7 was the best performer by a very
small margin.  Therefore, it will be the selected model for our 
MVP.  However, I want to confirm this with an increased search
space this evening, in order to rule out somewhat larger 
dimensions.

Selected Params: 
{
    hiddenLayers: [4],
    learningRate: 0.7
}
"""
pass