# Classifying using ellipse features

>1. Load ellipse data
>2. Use gridsearchcv to find best hyperparameters for RF
>3. Use cross-val-predict on RF
>4. Use predict_proba using optimized model
>5. Export predictions

In [5]:
%matplotlib inline
# Analysis stack plus stdlib helpers used by this notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.svm as skl_svm
# NOTE(review): sklearn.cross_validation (and sklearn.grid_search below) are
# legacy module paths; newer scikit-learn releases provide these tools under
# sklearn.model_selection -- confirm the installed version before re-running.
import sklearn.cross_validation as skl_cv
import seaborn as sns
import os
import sys

# Absolute base directory: used here to locate the helper libraries and again
# in section 4 as the prefix of the exported predictions CSV.
base_path = '/home/lundi/Python/MNIST/'
# Extend the import search path before the MNIST_* imports further down;
# presumably those modules live in this directory -- verify.
sys.path.append(base_path + '/libraries/')

import time

import sklearn.linear_model as skl_lm
import sklearn.ensemble as skl_ensemble
import sklearn.grid_search as skl_gs

# Project-local helper modules (not part of any installed package).
import MNIST_data_processor as mdp
import MNIST_model_functions as mmf

# Helper instances; note that each instance name reuses the class name.
# MNIST_model_functions supplies cross_val_predict_proba in section 4.
MNIST_data_processor = mdp.MNIST_data_processor()
MNIST_model_functions = mmf.MNIST_model_functions()

## 1. Load ellipse data

In [3]:
# Read the ellipse data CSV, then separate the target column ('label') from
# the remaining columns, which serve as predictors.
ellipse_csv = './../../data/processed/data_ellipses.csv'
data = pd.read_csv(ellipse_csv)

y = data['label']
X = data.drop(['label'], axis=1)

## 2. Use gridsearchcv to find best hyperparameters for RF

In [10]:
# 5-fold grid search over forest size and tree depth for a random forest
# fitted on X / y; the wall-clock time of the whole search is reported.
start_time = time.time()

param_grid = [
    {'n_estimators': [100, 1000, 5000], 'criterion': ['entropy'], 'max_depth': [2, 3, 6, 9, 15]}
]

# Pin the forest's random_state: an unseeded forest draws different bootstrap
# samples and feature subsets on every run, so the small score differences
# between grid points would not be reproducible.
rf_gs_clf = skl_gs.GridSearchCV(
    estimator=skl_ensemble.RandomForestClassifier(verbose=False, random_state=0),
    param_grid=param_grid, cv=5, n_jobs=6)
rf_gs_clf.fit(X, y)

end_time = time.time()

# A single pre-formatted argument prints the same text under the Python 2
# print statement and the print() function.
print('Elapsed Time:  %s  mins' % ((end_time - start_time) / 60.0))

Elapsed Time:  1.30886906783  mins


In [11]:
# One line per grid point: mean cross-validated score followed by the
# hyperparameter combination that produced it.
for grid_entry in rf_gs_clf.grid_scores_:
    params, mean_score, scores = grid_entry
    print('%s %s' % (mean_score, params))

0.470105978804 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 2}
0.470105978804 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 2}
0.469306138772 {'n_estimators': 5000, 'criterion': 'entropy', 'max_depth': 2}
0.469306138772 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 3}
0.470105978804 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 3}
0.470105978804 {'n_estimators': 5000, 'criterion': 'entropy', 'max_depth': 3}
0.470305938812 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 6}
0.471105778844 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 6}
0.471105778844 {'n_estimators': 5000, 'criterion': 'entropy', 'max_depth': 6}
0.471105778844 {'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 9}
0.471105778844 {'n_estimators': 1000, 'criterion': 'entropy', 'max_depth': 9}
0.471105778844 {'n_estimators': 5000, 'criterion': 'entropy', 'max_depth': 9}
0.470705858828 {'n_estimators': 100, 'criterion': 'entropy', 'max_de

## 3. Use cross-val-predict on RF

In [16]:
# Out-of-fold class predictions for every sample, from a 5-fold cross-validation
# of a random forest with fixed hyperparameters.
# random_state is pinned so the predictions (and any breakdown computed from
# them) come out the same on every run.
rf_cv_estimator = skl_ensemble.RandomForestClassifier(
    n_estimators=100, max_depth=15, criterion='entropy', random_state=0)
y_pred = skl_cv.cross_val_predict(rf_cv_estimator, X=X, y=y, cv=5)

In [22]:
# Pair each out-of-fold prediction with its true label (columns: 'pred', 'label').
# The predictions are placed on y's own index: concat(axis=1) aligns rows by
# index label, so a bare pd.Series(y_pred) (fresh 0..n-1 index) would only line
# up with y as long as y happens to carry the default index.
prediction_data = pd.concat([pd.Series(y_pred, index=y.index, name='pred'), y], axis=1)

In [24]:
# For each true label, the fraction of its samples assigned to each predicted
# class: per-group prediction counts divided by the group's total.
pred_by_label = prediction_data.groupby(['label'])['pred']
pred_by_label.value_counts() / pred_by_label.count()

label  pred
0      0       0.777328
       3       0.119433
       2       0.028340
       5       0.026316
       8       0.024291
       9       0.016194
       6       0.008097
1      1       0.878136
       3       0.112903
       5       0.005376
       2       0.001792
       8       0.001792
2      3       0.572477
       6       0.315596
       5       0.056881
       2       0.027523
       8       0.023853
       1       0.003670
3      3       0.916667
       5       0.033333
       9       0.014583
       2       0.012500
       8       0.012500
       6       0.010417
4      3       0.823899
       1       0.073375
       9       0.044025
       5       0.029350
       2       0.020964
       8       0.006289
                 ...   
5      2       0.057569
       5       0.042644
       8       0.023454
       6       0.012793
       9       0.006397
       1       0.004264
6      6       0.707364
       3       0.168605
       8       0.050388
       5       0.046512
    

## 4. Use predict_proba using optimized model

In [25]:
# Cross-validated class probabilities from the project helper, written to CSV.
# random_state is pinned so the exported probabilities are reproducible.
rf_clf = skl_ensemble.RandomForestClassifier(
    n_estimators=100, max_depth=15, criterion='entropy', random_state=0)
start_time = time.time()

# NOTE(review): cross_val_predict_proba is defined in MNIST_model_functions,
# outside this notebook; the next cell reads 'Actual', 'Predicted' and a '1'
# column from its result -- confirm the full schema against the helper.
sf_results = MNIST_model_functions.cross_val_predict_proba(
    estimator=rf_clf,
    X=X, y=y,
    cv=5,
    model_name='RF_ellipses'
)
# The export is deliberately left inside the timed region, as before.
sf_results.to_csv(base_path + '/data/prediction_results/2016.11.7-rf_ellipse_results.csv')
end_time = time.time()

# A single pre-formatted argument prints the same text under the Python 2
# print statement and the print() function.
print('Elapsed Time:  %s  mins' % ((end_time - start_time) / 60.0))

Elapsed Time:  0.0177988171577  mins


In [29]:
# Keep the true label, the predicted label and the '1' column, then show only
# the rows whose '1' value exceeds 0.8.
exp_1 = sf_results.loc[:, ['Actual', 'Predicted', '1']]
exp_1[exp_1['1'] > 0.8]

Unnamed: 0,Actual,Predicted,1
0,1,1,0.84727
2,1,1,0.84727
12,1,1,0.84727
15,1,1,0.84727
18,7,1,0.84727
35,1,1,0.84727
37,1,1,0.84727
38,1,1,0.84727
41,1,1,0.84727
52,1,1,0.84727


This works okay. The issue is that, for digits that aren't included in the dummy-variable list, the model gets the predictions very wrong.

## 5. Export predictions