In [15]:
import pandas as pd
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [6]:
rf = RandomForestClassifier()

param_grid = {"n_estimators": [100, 300],\
              "min_samples_split": [4, 10]}

cv = GridSearchCV(rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2)

In [7]:
cv.fit(train.loc[:, 'pixel0':], train.loc[:, 'label'])

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] min_samples_split=4, n_estimators=100 ...........................
[CV] .................. min_samples_split=4, n_estimators=100 -  24.5s
[CV] min_samples_split=4, n_estimators=100 ...........................
[CV] .................. min_samples_split=4, n_estimators=100 -  24.4s
[CV] min_samples_split=4, n_estimators=100 ...........................
[CV] .................. min_samples_split=4, n_estimators=100 -  25.4s
[CV] min_samples_split=4, n_estimators=300 ...........................
[CV] .................. min_samples_split=4, n_estimators=300 - 1.3min
[CV] min_samples_split=4, n_estimators=300 ...........................
[CV] .................. min_samples_split=4, n_estimators=300 - 1.0min
[CV] min_samples_split=4, n_estimators=300 ...........................
[CV] .................. min_samples_split=4, n_estimators=300 -  53.0s
[CV] min_samples_split=10, n_estimators=100 ..........................
[CV] ............

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [4, 10], 'n_estimators': [100, 300]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=2)

In [8]:
cv.best_params_

{'min_samples_split': 4, 'n_estimators': 300}

In [10]:
print(classification_report(train.loc[:, 'label'],\
                            cv.predict(train.loc[:, 'pixel0':])))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      4132
          1       1.00      1.00      1.00      4684
          2       1.00      1.00      1.00      4177
          3       1.00      1.00      1.00      4351
          4       1.00      1.00      1.00      4072
          5       1.00      1.00      1.00      3795
          6       1.00      1.00      1.00      4137
          7       1.00      1.00      1.00      4401
          8       1.00      1.00      1.00      4063
          9       1.00      1.00      1.00      4188

avg / total       1.00      1.00      1.00     42000



In [12]:
prediction = cv.predict(test)

In [19]:
prediction.shape

(28000,)

In [24]:
np.arange(1, 2, 1)

array([1])

In [25]:
submission = pd.DataFrame({'ImageId': np.arange(1, test.shape[0] + 1, 1),\
                           'Label': prediction})

In [26]:
submission.to_csv('../submission/Modeling_v1.csv', index=False)