# Classifying using ellipse features

**Methods:**
>1. Load intermediate data
>2. Concat new features
>3. Use gridsearchcv to find best hyperparameters for RF
>4. Use `cross_val_predict` on RF

In [20]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.svm as skl_svm
import sklearn.cross_validation as skl_cv
import seaborn as sns
import os
import sys

# Root of the MNIST project checkout. It is also used further down to build the
# output path for the prediction-results CSV.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# unchanged on another machine.
base_path = '/home/lundi/Python/MNIST/'
# Put the project's libraries/ directory on the import path; presumably this is
# where the MNIST_data_processor / MNIST_model_functions modules imported below
# live. base_path already ends with '/', so the appended entry contains a
# doubled slash ('.../MNIST//libraries/').
sys.path.append(base_path + '/libraries/')

import time

import sklearn.linear_model as skl_lm
import sklearn.ensemble as skl_ensemble
import sklearn.grid_search as skl_gs

import MNIST_data_processor as mdp
import MNIST_model_functions as mmf

# Instantiate the project helper classes. Each instance is bound to the same
# name as its class/module (the modules themselves are only bound as mdp / mmf).
# MNIST_model_functions is used later for cross_val_predict_proba;
# MNIST_data_processor is not referenced in the cells shown in this notebook.
MNIST_data_processor = mdp.MNIST_data_processor()
MNIST_model_functions = mmf.MNIST_model_functions()

import glob

## 1. Load intermediate data

In [59]:
# Intermediate training table; it must contain at least the `label` and
# `ellipse_count` columns, which are the only ones read here.
data = pd.read_csv('./../../data/intermediate/train_with_ellipse_count.csv')

# Target vector: the digit label of every row.
y = data['label']

# Seed feature frame with two columns:
#   datum_index   -- the row's position in `data` (used as the merge key when
#                    the generated feature files are joined in the next step)
#   ellipse_count -- the one feature taken from this table
X = (
    data.drop(['label'], axis=1)['ellipse_count']
        .reset_index()
        .rename(columns={'index': 'datum_index'})
)

## 2. Concat new features

In [60]:
# Join every generated-feature CSV onto X, keyed on datum_index.
#
# glob.glob returns paths in arbitrary (filesystem) order, so the list is
# sorted: the column order of X -- and with it the fitted forests -- then no
# longer depends on the machine the notebook runs on.
feature_files = sorted(glob.glob('./../../data/generated_features/*'))
if not feature_files:
    raise ValueError('no generated feature files found in ./../../data/generated_features/')

for filename in feature_files:
    new_features = pd.read_csv(filename)

    X = pd.merge(X, new_features, on = ['datum_index'])

    # pd.merge defaults to an inner join, so a feature file with missing or
    # duplicated datum_index values would silently drop or repeat rows. X and y
    # are matched to each other purely by row position downstream, so fail
    # loudly as soon as the rows stop lining up with y.
    if not np.array_equal(X['datum_index'].values, y.index.values):
        raise ValueError(
            'merging %s broke the row alignment with y (%d rows in X, %d in y)'
            % (filename, len(X), len(y))
        )

# Remove the non-feature columns before modelling: the merge key, and the
# `label` column that arrives with the merged feature data (it would leak the
# target into the features).
X = X.drop(['label', 'datum_index'], axis=1)

## 3. Use gridsearchcv to find best hyperparameters for RF

In [62]:
start_time = time.time()

param_grid = [
    {'n_estimators': [100,1000], 'criterion': ['entropy'], 'max_depth': [2,6,9,15]}
]

rf_gs_clf = skl_gs.GridSearchCV(estimator=skl_ensemble.RandomForestClassifier(verbose=False), 
                                param_grid = param_grid, cv=5, n_jobs=6)
rf_gs_clf.fit(X, y)

end_time = time.time()

print 'Elapsed Time: ', (end_time - start_time) / 60.0, ' mins'

Elapsed Time:  0.708839865526  mins


In [63]:
for params, mean_score, scores in rf_gs_clf.grid_scores_:
    print mean_score, params

0.55048990202 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 2}
0.553689262148 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 2}
0.621875624875 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 6}
0.620675864827 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 6}
0.634873025395 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 9}
0.635672865427 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 9}
0.615676864627 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 15}
0.617676464707 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 15}


## 4. Use `cross_val_predict` on RF

In [64]:
# Out-of-fold predictions: every row is predicted by a forest trained on the
# other four folds, so y_pred can be compared with y without leakage.
# random_state is fixed so the breakdown below is reproducible.
# NOTE(review): n_estimators=5000 / max_depth=30 lie outside the grid searched
# above (max_depth only went up to 15) -- confirm this choice is intentional
# rather than using rf_gs_clf.best_params_.
y_pred = skl_cv.cross_val_predict(
    skl_ensemble.RandomForestClassifier(n_estimators=5000, max_depth=30,
                                        criterion='entropy', random_state=0),
    X = X, y = y, cv = 5)

# Two-column frame: `pred` (out-of-fold prediction) next to `label` (truth).
# The predictions are given y's index explicitly: pd.concat aligns on index
# labels, so a bare pd.Series(y_pred) would only line up with y when y happens
# to carry a default 0..n-1 index.
prediction_data = pd.concat([pd.Series(y_pred, index=y.index, name='pred'), y], axis=1)

# For each true label, the fraction of its rows assigned to each predicted
# digit (the fractions within one label sum to 1).
pred_by_label = prediction_data.groupby(['label'])['pred']
pred_by_label.value_counts() / pred_by_label.count()

label  pred
0      0       0.829960
       2       0.058704
       3       0.046559
       8       0.026316
       4       0.020243
       5       0.012146
       9       0.004049
       6       0.002024
1      1       0.899642
       4       0.037634
       5       0.025090
       3       0.021505
       7       0.007168
       9       0.005376
       2       0.001792
       6       0.001792
2      2       0.451376
       6       0.233028
       4       0.097248
       3       0.075229
       5       0.075229
       0       0.029358
       8       0.025688
       1       0.009174
       7       0.001835
       9       0.001835
3      3       0.397917
       5       0.231250
       4       0.120833
       7       0.081250
                 ...   
6      5       0.007752
       4       0.005814
       0       0.003876
       1       0.003876
       3       0.001938
7      7       0.689723
       4       0.084980
       3       0.075099
       5       0.069170
       9       0.059289
    

In [66]:
# Same per-label prediction fractions as above, reshaped into a 10x10 table:
# one column per true label, one row per predicted digit. Combinations that
# never occur become 0 and values are rounded to three decimals.
grouped_pred = prediction_data.groupby(['label'])['pred']
pred_fraction = grouped_pred.value_counts() / grouped_pred.count()
pred_fraction.unstack().fillna(0).round(3).T

label,0,1,2,3,4,5,6,7,8,9
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.83,0.0,0.029,0.025,0.027,0.011,0.004,0.0,0.006,0.01
1,0.0,0.9,0.009,0.021,0.073,0.03,0.004,0.018,0.0,0.004
2,0.059,0.002,0.451,0.079,0.126,0.102,0.205,0.004,0.029,0.002
3,0.047,0.022,0.075,0.398,0.149,0.292,0.002,0.075,0.019,0.015
4,0.02,0.038,0.097,0.121,0.212,0.164,0.006,0.085,0.023,0.031
5,0.012,0.025,0.075,0.231,0.151,0.203,0.008,0.069,0.027,0.019
6,0.002,0.002,0.233,0.002,0.017,0.009,0.758,0.0,0.004,0.0
7,0.0,0.007,0.002,0.081,0.191,0.119,0.0,0.69,0.002,0.14
8,0.026,0.0,0.026,0.025,0.021,0.034,0.014,0.0,0.858,0.008
9,0.004,0.005,0.002,0.017,0.034,0.036,0.0,0.059,0.031,0.77


In [73]:
import sklearn.metrics as skl_metrics

print skl_metrics.classification_report(y_true = prediction_data['label'], y_pred = prediction_data['pred'])

             precision    recall  f1-score   support

          0       0.88      0.83      0.85       494
          1       0.87      0.90      0.88       558
          2       0.45      0.45      0.45       545
          3       0.36      0.40      0.38       480
          4       0.26      0.21      0.23       477
          5       0.24      0.20      0.22       469
          6       0.73      0.76      0.74       516
          7       0.57      0.69      0.63       506
          8       0.84      0.86      0.85       478
          9       0.80      0.77      0.78       478

avg / total       0.60      0.61      0.61      5001



So 0s, 1s, 8s, and 9s are pretty good (F1 ≥ 0.78). 2, 3, 4, and 5 are still problematic (F1 ≤ 0.45); 6 and 7 fall in between (F1 of 0.74 and 0.63).

## Generating prediction probabilities

In [67]:
rf_clf = skl_ensemble.RandomForestClassifier(n_estimators=5000, max_depth=30, criterion='entropy')
start_time = time.time()

sf_results = MNIST_model_functions.cross_val_predict_proba(
                        estimator = rf_clf, 
                        X = X, y = y, 
                        cv=5, 
                        model_name = 'RF_ellipses'
                    )
sf_results.to_csv(base_path + '/data/prediction_results/2016.11.7-rf_ellipse_results_v2.csv')
end_time = time.time()

print 'Elapsed Time: ', (end_time - start_time) / 60.0, ' mins'

Elapsed Time:  4.14005558491  mins
