In [1]:
import pandas as pd
import numpy as np
import pickle

## Multiclass

In [9]:
train_data = pd.read_csv('data/training_data.csv').set_index('Unnamed: 0')
test_data = pd.read_csv('data/test_data.csv').set_index('Unnamed: 0')

print train_data.columns
print train_data.shape
print train_data.head(5)

print test_data.shape
print test_data.head(5)


X_train, y_train = train_data[[el for el in train_data.columns if el != 'privacy']].as_matrix(), train_data['privacy'].as_matrix()
X_test, y_test = test_data[[el for el in test_data.columns if el != 'privacy']].as_matrix(), test_data['privacy'].as_matrix()

Index([u'room_area', u'total_floor_area', u'room_to_total_area',
       u'centered_isovist', u'centered_isovist_to_room_floor',
       u'centered_isovist_to_total_floor', u'number_of_visual_neighbors',
       u'privacy', u'degree_centrality', u'betweenness_centrality'],
      dtype='object')
(67, 10)
            room_area  total_floor_area  room_to_total_area  centered_isovist  \
Unnamed: 0                                                                      
1               4.900             144.3               0.034            17.600   
2              10.961             125.4               0.087            21.244   
3              12.000             162.0               0.074            15.600   
4              11.900             222.1               0.054            32.500   
5              31.800             108.3               0.294            53.100   

            centered_isovist_to_room_floor  centered_isovist_to_total_floor  \
Unnamed: 0                                         

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC


model_fitting_dict = {
    'Nearest Neighbours' : {
        'model': KNeighborsClassifier,
        'random_grid': {'n_neighbors': range(1,20)},
        'default': {'n_neighbors': 1},
        'optimize': True,
        'scale': True,
        'n_iter': 19
    },
    'Random Forest' : {
        'model': RandomForestClassifier,
        'random_grid': {'bootstrap': [True, False],
                         'max_depth': [10, 30, 50, 100, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200, 350, 500, 1000]},
        'default': None,
        'optimize': True,
        'scale': True,
        'n_iter': 100
    },
    'Logistic Regression' : {
        'model': LogisticRegression,
        'random_grid': None,
        'default': {'multi_class':'multinomial', 'solver':'saga', 'max_iter':1000},
        'optimize': False,
        'scale': True
    },
    'Logistic Regression - L1 Penalty' : {
        'model': LogisticRegression,
        'random_grid': None,
        'default': {'multi_class':'multinomial', 'solver':'saga', 'max_iter':1000, 'penalty':'l1'},
        'optimize': False,
        'scale': True
    },
    'Gradient Boosted Trees' : {
        'model': XGBClassifier,
        'random_grid': {
                        'min_child_weight': [1, 5, 10, 20],
                        'gamma': [0.5, 1, 1.5, 2, 5],
                        'subsample': [0.6, 0.8, 1.0],
                        'colsample_bytree': [0.6, 0.8, 1.0],
                        'max_depth': [10, 50, 100, 200, 400]
                        },
        'default': None,
        'optimize': True,
        'scale': True,
        'n_iter': 100
    },
    'SVM' : {
        'model': SVC,
        'random_grid': {'C': np.logspace(start=-0.5, stop=2, num=25, base=10)},
        'default': {'kernel':'linear', 'probability':True},
        'optimize': True,
        'scale': True,
        'n_iter': 24
    },
    'SVM (RBF Kernel)' : {
        'model': SVC,
        'random_grid': {
                        'C': np.logspace(start=-0.5, stop=2, num=25, base=10)
                        },
        'default': {'kernel':'rbf', 'probability':True},
        'optimize': True,
        'scale': True,
        'n_iter': 24
    }
}

In [22]:
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss


def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

def fitting_model_from_dict(X_train, y_train, X_test, y_test, outcome_label = [0,1,2], 
                            opt_metric='accuracy', model_dict=model_fitting_dict):
    
    scaler = StandardScaler()
    X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.fit_transform(X_test)
    out_list = []
    
    for model_name, model_value in model_dict.iteritems():
        out_model = []
        print 'Working on %s'%(model_name)
        
        # Fitting the model as is first
        if model_value['default']:
            model = model_value['model'](**model_value['default'])
        else:
            model = model_value['model']()
            
        model.fit(X_train, y_train)
        y_predict = model.predict(X_test)
        y_predict_dict = Counter(zip(y_test, y_predict))
        
        out_model.append(model_name)
        for class_label in outcome_label:
            out_model.append(y_predict_dict[(class_label, class_label)]/float(sum(y_test==class_label)))
        out_list.append(out_model)
        
        
        # Fitting the model scaled as necessary
        if model_value['scale']:
            out_model = []
            
            if model_value['default']:
                model = model_value['model'](**model_value['default'])
            else:
                model = model_value['model']()
            model.fit(X_train_scaled, y_train)
            y_predict = model.predict(X_test_scaled)
            y_predict_dict = Counter(zip(y_test, y_predict))

            out_model.append(model_name +' (Scaled)')
            for class_label in outcome_label:
                out_model.append(y_predict_dict[(class_label, class_label)]/float(sum(y_test==class_label)))
            out_list.append(out_model)
            
        # Optimize the model if necessary    
        if model_value['optimize']:
            out_model = []
            print 'Optimizing %s'%(model_name)
            
            if model_value['default']:
                model = model_value['model'](**model_value['default'])
            else:
                model = model_value['model']()
                
            print model_value
            
            random_grid = model_value['random_grid']
            random_opt = RandomizedSearchCV(estimator = model, 
                               param_distributions = random_grid,
                               n_iter = model_value['n_iter'],
                               cv = 3, 
                               verbose=1, 
                               random_state=42, 
                               n_jobs = -1,
                               scoring = opt_metric)
            
            random_opt.fit(X_train, y_train)
            
            if model_value['default']:
                final_model_params_dict = merge_two_dicts(model_value['default'], random_opt.best_params_)
            else:
                final_model_params_dict = random_opt.best_params_
                
            model = model_value['model'](**final_model_params_dict)
            model.fit(X_train, y_train)
            y_predict = model.predict(X_test)
            y_predict_dict = Counter(zip(y_test, y_predict))

            out_model.append(model_name+' (Optimized)')
            for class_label in outcome_label:
                out_model.append(y_predict_dict[(class_label, class_label)]/float(sum(y_test==class_label)))
            out_list.append(out_model)
            
    return out_list

In [20]:
import pickle
from datetime import datetime

output_list_models = fitting_model_from_dict(X_train, y_train, X_test, y_test,
                                            model_dict=model_fitting_dict)

filename_out = 'output_list_results_multiclass_%s.pkl'%(datetime.strftime(datetime.today(), format='%Y-%m-%d'))
pickle.dump(output_list_models, open(filename_out, 'wb'))

Working on Nearest Neighbours
Optimizing Nearest Neighbours
{'n_iter': 19, 'scale': True, 'default': {'n_neighbors': 1}, 'random_grid': {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}, 'model': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'optimize': True}
Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=-1)]: Done  57 out of  57 | elapsed:    0.1s finished


Working on SVM
Optimizing SVM
{'n_iter': 24, 'scale': True, 'default': {'kernel': 'linear', 'probability': True}, 'random_grid': {'C': array([  0.31622777,   0.40194503,   0.51089698,   0.64938163,
         0.82540419,   1.04913973,   1.33352143,   1.69498815,
         2.15443469,   2.73841963,   3.48070059,   4.42418555,
         5.62341325,   7.14770577,   9.08517576,  11.54781985,
        14.67799268,  18.65663579,  23.71373706,  30.1416253 ,
        38.3118685 ,  48.69675252,  61.89658189,  78.67438077,
       100.        ])}, 'model': <class 'sklearn.svm.classes.SVC'>, 'optimize': True}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.1min finished


Working on SVM (RBF Kernel)
Optimizing SVM (RBF Kernel)
{'n_iter': 24, 'scale': True, 'default': {'kernel': 'rbf', 'probability': True}, 'random_grid': {'C': array([  0.31622777,   0.40194503,   0.51089698,   0.64938163,
         0.82540419,   1.04913973,   1.33352143,   1.69498815,
         2.15443469,   2.73841963,   3.48070059,   4.42418555,
         5.62341325,   7.14770577,   9.08517576,  11.54781985,
        14.67799268,  18.65663579,  23.71373706,  30.1416253 ,
        38.3118685 ,  48.69675252,  61.89658189,  78.67438077,
       100.        ])}, 'model': <class 'sklearn.svm.classes.SVC'>, 'optimize': True}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.2s finished
  if diff:
  if diff:


Working on Logistic Regression
Working on Gradient Boosted Trees
Optimizing Gradient Boosted Trees
{'n_iter': 100, 'scale': True, 'default': None, 'random_grid': {'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'gamma': [0.5, 1, 1.5, 2, 5], 'min_child_weight': [1, 5, 10, 20], 'max_depth': [10, 50, 100, 200, 400]}, 'model': <class 'xgboost.sklearn.XGBClassifier'>, 'optimize': True}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:    4.3s
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    6.1s finished
  if diff:


Working on Logistic Regression - L1 Penalty
Working on Random Forest
Optimizing Random Forest
{'n_iter': 100, 'scale': True, 'default': None, 'random_grid': {'bootstrap': [True, False], 'min_samples_leaf': [1, 2, 4], 'n_estimators': [50, 100, 200, 350, 500, 1000], 'min_samples_split': [2, 5, 10], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 30, 50, 100, None]}, 'model': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'optimize': True}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.9min finished


In [23]:
print output_list_models

[['Nearest Neighbours', 0.4166666666666667, 0.0, 0.6153846153846154], ['Nearest Neighbours (Scaled)', 0.3333333333333333, 0.0, 0.5384615384615384], ['Nearest Neighbours (Optimized)', 0.5, 0.0, 0.5384615384615384], ['SVM', 0.6666666666666666, 0.0, 0.6153846153846154], ['SVM (Scaled)', 0.6666666666666666, 0.0, 0.5384615384615384], ['SVM (Optimized)', 0.5833333333333334, 0.0, 0.5384615384615384], ['SVM (RBF Kernel)', 0.8333333333333334, 0.0, 0.3076923076923077], ['SVM (RBF Kernel) (Scaled)', 0.6666666666666666, 0.0, 0.6923076923076923], ['SVM (RBF Kernel) (Optimized)', 0.8333333333333334, 0.0, 0.3076923076923077], ['Logistic Regression', 0.75, 0.0, 0.6923076923076923], ['Logistic Regression (Scaled)', 0.6666666666666666, 0.0, 0.6153846153846154], ['Gradient Boosted Trees', 0.6666666666666666, 0.0, 0.23076923076923078], ['Gradient Boosted Trees (Scaled)', 0.6666666666666666, 0.0, 0.5384615384615384], ['Gradient Boosted Trees (Optimized)', 0.6666666666666666, 0.0, 0.3076923076923077], ['Log

## Binary case

In [24]:
train_data_binary = pd.read_csv('data/training_data_binary.csv').set_index('Unnamed: 0')
test_data_binary = pd.read_csv('data/test_data_binary.csv').set_index('Unnamed: 0')

print train_data_binary.columns
print train_data_binary.shape
print train_data_binary.head(5)

print test_data_binary.shape
print test_data_binary.head(5)


X_train_binary, y_train_binary = train_data_binary[[el for el in train_data_binary.columns if el != 'privacy_binary']].as_matrix(), train_data_binary['privacy_binary'].as_matrix()
X_test_binary, y_test_binary = test_data_binary[[el for el in test_data_binary.columns if el != 'privacy_binary']].as_matrix(), test_data_binary['privacy_binary'].as_matrix()

Index([u'room_area', u'total_floor_area', u'room_to_total_area',
       u'centered_isovist', u'centered_isovist_to_room_floor',
       u'centered_isovist_to_total_floor', u'number_of_visual_neighbors',
       u'privacy_binary', u'degree_centrality', u'betweenness_centrality'],
      dtype='object')
(67, 10)
            room_area  total_floor_area  room_to_total_area  centered_isovist  \
Unnamed: 0                                                                      
1               1.800             144.3               0.012             5.200   
2              10.961             125.4               0.087            21.244   
3              21.200             222.1               0.095            24.200   
4              11.900             222.1               0.054            32.500   
5               2.000             162.0               0.012            13.600   

            centered_isovist_to_room_floor  centered_isovist_to_total_floor  \
Unnamed: 0                                  

In [25]:
model_fitting_dict_binary = {
    'Nearest Neighbours' : {
        'model': KNeighborsClassifier,
        'random_grid': {'n_neighbors': range(1,20)},
        'default': {'n_neighbors': 1},
        'optimize': True,
        'scale': True,
        'n_iter': 19
    },
    'Random Forest' : {
        'model': RandomForestClassifier,
        'random_grid': {'bootstrap': [True, False],
                         'max_depth': [10, 30, 50, 100, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200, 350, 500, 1000]},
        'default': None,
        'optimize': True,
        'scale': True,
        'n_iter': 100
    },
    'Logistic Regression' : {
        'model': LogisticRegression,
        'random_grid': None,
        'default': {'multi_class':'ovr', 'solver':'saga', 'max_iter':1000},
        'optimize': False,
        'scale': True
    },
    'Logistic Regression - L1 Penalty' : {
        'model': LogisticRegression,
        'random_grid': None,
        'default': {'multi_class':'ovr', 'solver':'saga', 'max_iter':1000, 'penalty':'l1'},
        'optimize': False,
        'scale': True
    },
    'Gradient Boosted Trees' : {
        'model': XGBClassifier,
        'random_grid': {
                        'min_child_weight': [1, 5, 10, 20],
                        'gamma': [0.5, 1, 1.5, 2, 5],
                        'subsample': [0.6, 0.8, 1.0],
                        'colsample_bytree': [0.6, 0.8, 1.0],
                        'max_depth': [10, 50, 100, 200, 400]
                        },
        'default': None,
        'optimize': True,
        'scale': True,
        'n_iter': 100
    },
    'SVM' : {
        'model': SVC,
        'random_grid': {'C': np.logspace(start=-0.5, stop=2, num=25, base=10)},
        'default': {'kernel':'linear', 'probability':True},
        'optimize': True,
        'scale': True,
        'n_iter': 24
    },
    'SVM (RBF Kernel)' : {
        'model': SVC,
        'random_grid': {
                        'C': np.logspace(start=-0.5, stop=2, num=25, base=10)
                        },
        'default': {'kernel':'rbf', 'probability':True},
        'optimize': True,
        'scale': True,
        'n_iter': 24
    }
}

In [26]:
output_list_models_binary = fitting_model_from_dict(X_train_binary, y_train_binary, X_test_binary, y_test_binary,
                                             outcome_label = [0,1],
                                             model_dict=model_fitting_dict_binary)

filename_out = 'output_list_results_binary_%s.pkl'%(datetime.strftime(datetime.today(), format='%Y-%m-%d'))
pickle.dump(output_list_models_binary, open(filename_out, 'wb'))

Working on Nearest Neighbours
Optimizing Nearest Neighbours
{'n_iter': 19, 'scale': True, 'default': {'n_neighbors': 1}, 'random_grid': {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}, 'model': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'optimize': True}
Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=-1)]: Done  57 out of  57 | elapsed:    0.1s finished


Working on SVM
Optimizing SVM
{'n_iter': 24, 'scale': True, 'default': {'kernel': 'linear', 'probability': True}, 'random_grid': {'C': array([  0.31622777,   0.40194503,   0.51089698,   0.64938163,
         0.82540419,   1.04913973,   1.33352143,   1.69498815,
         2.15443469,   2.73841963,   3.48070059,   4.42418555,
         5.62341325,   7.14770577,   9.08517576,  11.54781985,
        14.67799268,  18.65663579,  23.71373706,  30.1416253 ,
        38.3118685 ,  48.69675252,  61.89658189,  78.67438077,
       100.        ])}, 'model': <class 'sklearn.svm.classes.SVC'>, 'optimize': True}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   39.7s finished


Working on SVM (RBF Kernel)
Optimizing SVM (RBF Kernel)
{'n_iter': 24, 'scale': True, 'default': {'kernel': 'rbf', 'probability': True}, 'random_grid': {'C': array([  0.31622777,   0.40194503,   0.51089698,   0.64938163,
         0.82540419,   1.04913973,   1.33352143,   1.69498815,
         2.15443469,   2.73841963,   3.48070059,   4.42418555,
         5.62341325,   7.14770577,   9.08517576,  11.54781985,
        14.67799268,  18.65663579,  23.71373706,  30.1416253 ,
        38.3118685 ,  48.69675252,  61.89658189,  78.67438077,
       100.        ])}, 'model': <class 'sklearn.svm.classes.SVC'>, 'optimize': True}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Working on Logistic Regression
Working on Gradient Boosted Trees
Optimizing Gradient Boosted Trees
{'n_iter': 100, 'scale': True, 'default': None, 'random_grid': {'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'gamma': [0.5, 1, 1.5, 2, 5], 'min_child_weight': [1, 5, 10, 20], 'max_depth': [10, 50,

[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.2s finished
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.6s finished
  if diff:


Working on Logistic Regression - L1 Penalty
Working on Random Forest
Optimizing Random Forest
{'n_iter': 100, 'scale': True, 'default': None, 'random_grid': {'bootstrap': [True, False], 'min_samples_leaf': [1, 2, 4], 'n_estimators': [50, 100, 200, 350, 500, 1000], 'min_samples_split': [2, 5, 10], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 30, 50, 100, None]}, 'model': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'optimize': True}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.3min finished


In [27]:
print output_list_models_binary

[['Nearest Neighbours', 0.7857142857142857, 0.5333333333333333], ['Nearest Neighbours (Scaled)', 0.7857142857142857, 0.4666666666666667], ['Nearest Neighbours (Optimized)', 0.7857142857142857, 0.6], ['SVM', 0.7857142857142857, 0.5333333333333333], ['SVM (Scaled)', 0.7142857142857143, 0.6], ['SVM (Optimized)', 0.7857142857142857, 0.5333333333333333], ['SVM (RBF Kernel)', 0.9285714285714286, 0.2], ['SVM (RBF Kernel) (Scaled)', 0.7857142857142857, 0.6], ['SVM (RBF Kernel) (Optimized)', 0.9285714285714286, 0.2], ['Logistic Regression', 0.8571428571428571, 0.5333333333333333], ['Logistic Regression (Scaled)', 0.7857142857142857, 0.4666666666666667], ['Gradient Boosted Trees', 0.9285714285714286, 0.4666666666666667], ['Gradient Boosted Trees (Scaled)', 0.8571428571428571, 0.6], ['Gradient Boosted Trees (Optimized)', 0.9285714285714286, 0.4666666666666667], ['Logistic Regression - L1 Penalty', 0.8571428571428571, 0.5333333333333333], ['Logistic Regression - L1 Penalty (Scaled)', 0.78571428571

## Getting Latex tables

In [28]:
from datetime import datetime

def create_latex_table_from_list_of_list(output_list, y_test, class_labels):
    
    final_line = ' ?? ?hline'.replace('?', '\\')
    
    col_names = ['Model'] + ['%s Class Accuracy'%(class_label) for class_label in class_labels]
    output = [' & '.join(col_names) + final_line]
    
    for line in output_list:
        value_line = '%s & '%(line[0])
        
        for j, class_label in enumerate(class_labels):
            last_line = False
            if j == len(class_labels)-1:
                last_line = True
            n_class_label = sum(y_test==class_label)
            value_line += '%s \\%% (%s / %s) %s '%(round(line[j+1]*100, 2), int(line[j+1]*n_class_label), 
                                                   n_class_label, '&' if not last_line else final_line)
            
        
        output += [value_line]
        
    for line in output:
        print line
        
        
with open('output_list_results_multiclass_%s.pkl'%(datetime.strftime(datetime.today(), format='%Y-%m-%d')), 'rb') as f:
    output_list_models = pickle.load(f)
    
    
with open('output_list_results_binary_%s.pkl'%(datetime.strftime(datetime.today(), format='%Y-%m-%d')), 'rb') as f:
    output_list_models_binary = pickle.load(f)    

            
            
create_latex_table_from_list_of_list(output_list_models, y_test, [0,1,2])

print '\n'

create_latex_table_from_list_of_list(output_list_models_binary, y_test_binary, [0,1])
    

Model & 0 Class Accuracy & 1 Class Accuracy & 2 Class Accuracy \\ \hline
Nearest Neighbours & 41.67 \% (5 / 12) & 0.0 \% (0 / 4) & 61.54 \% (8 / 13)  \\ \hline 
Nearest Neighbours (Scaled) & 33.33 \% (4 / 12) & 0.0 \% (0 / 4) & 53.85 \% (7 / 13)  \\ \hline 
Nearest Neighbours (Optimized) & 50.0 \% (6 / 12) & 0.0 \% (0 / 4) & 53.85 \% (7 / 13)  \\ \hline 
SVM & 66.67 \% (8 / 12) & 0.0 \% (0 / 4) & 61.54 \% (8 / 13)  \\ \hline 
SVM (Scaled) & 66.67 \% (8 / 12) & 0.0 \% (0 / 4) & 53.85 \% (7 / 13)  \\ \hline 
SVM (Optimized) & 58.33 \% (7 / 12) & 0.0 \% (0 / 4) & 53.85 \% (7 / 13)  \\ \hline 
SVM (RBF Kernel) & 83.33 \% (10 / 12) & 0.0 \% (0 / 4) & 30.77 \% (4 / 13)  \\ \hline 
SVM (RBF Kernel) (Scaled) & 66.67 \% (8 / 12) & 0.0 \% (0 / 4) & 69.23 \% (9 / 13)  \\ \hline 
SVM (RBF Kernel) (Optimized) & 83.33 \% (10 / 12) & 0.0 \% (0 / 4) & 30.77 \% (4 / 13)  \\ \hline 
Logistic Regression & 75.0 \% (9 / 12) & 0.0 \% (0 / 4) & 69.23 \% (9 / 13)  \\ \hline 
Logistic Regression (Scaled) & 66.