# random forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# model validation
from sklearn.metrics import accuracy_score

In [3]:
train_labels = pd.read_csv("./data/train_labels.csv", header=None)
train_data = pd.read_csv("./data/train_data.csv", header=None)

In [4]:
# scale
scaled_features = preprocessing.scale(train_data)
pd.DataFrame(scaled_features).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,-1.571333,-1.423949,-0.935285,-1.169432,-0.772396,-1.258998,-0.811812,-0.985136,-0.581884,-0.756522,...,1.279245,1.266232,-0.158097,0.297566,0.164625,0.499226,0.921775,-0.185416,0.092879,0.006096
1,-0.602171,0.267213,0.055395,-0.711935,-1.073589,-0.876857,-0.373028,0.155161,0.39948,0.963212,...,-0.897103,-0.852736,0.074136,1.103637,0.111916,-0.726099,0.269456,-1.177036,-0.694615,-0.2238
2,-0.584983,0.148239,0.606352,0.806748,0.07589,-0.140405,-0.116555,0.220999,-0.266751,-0.562176,...,-0.406852,1.244176,0.375294,1.291144,0.824687,2.77078,-0.179373,-0.786025,0.560227,-0.548004
3,0.193432,1.292285,0.85447,0.486911,0.341579,0.529897,0.640292,0.450361,0.375602,0.654629,...,-0.975856,0.528008,-0.66158,0.90356,-1.144311,0.899273,-1.229087,0.110236,2.960029,-0.956088
4,-0.825078,-0.712986,-0.865418,-1.029277,-1.073589,-1.243794,-0.87486,-1.021802,-0.95912,-0.545164,...,-0.189255,1.433153,-0.525024,-1.507232,-0.966014,-0.535219,-0.432634,-1.336981,-1.421282,0.632431


In [5]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, train_labels[0], test_size=0.30)

There are three features worth exploring with the sklearn RandomForestClassifier, in order of importance:

- `n_estimators`
- `max_features`
- `criterion`

n_estimators is not really worth optimizing. The more estimators you give it, the better it will do. 500 or 1000 is usually sufficient.

In [6]:
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
predictions = rfc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6103896103896104

### grid search

In [11]:
from sklearn.model_selection import GridSearchCV

In [14]:
# # use a full grid over all parameters
# param_grid = {"max_depth": [3, None],
#               "max_features": [1, 3, 10],
#               "min_samples_split": [2, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}

# # run grid search
# rfc = RandomForestClassifier(n_estimators=1000)

# grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5)
# grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, None], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

`estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),`

In [16]:
grid_search.get_params()

{'cv': 5,
 'error_score': 'raise',
 'estimator__bootstrap': True,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 1000,
 'estimator__n_jobs': 1,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
             oob_score=False, 

In [18]:
new_pred = grid_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=new_pred)

0.6157372039724981

In [19]:
from sklearn.metrics import log_loss

new_proba = grid_search.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=new_proba)

1.2302562626206417

In [20]:
new_proba

array([[0.181, 0.166, 0.138, ..., 0.04 , 0.047, 0.056],
       [0.346, 0.159, 0.066, ..., 0.033, 0.022, 0.021],
       [0.695, 0.177, 0.021, ..., 0.015, 0.005, 0.011],
       ...,
       [0.724, 0.069, 0.016, ..., 0.043, 0.001, 0.013],
       [0.585, 0.167, 0.007, ..., 0.036, 0.002, 0.011],
       [0.673, 0.104, 0.015, ..., 0.043, 0.002, 0.009]])

### on submission data

In [27]:
submission_data = pd.read_csv("./data/test_data.csv", header=None)
submission_proba = grid_search.predict_proba(submission_data)

In [36]:
dummy = pd.read_csv("./data/dummy_solution_accuracy.csv")
dummy.drop("Sample_label", inplace=True, axis=1)
print(dummy.shape)
dummy.head()

(6544, 1)


Unnamed: 0,Sample_id
0,1
1,2
2,3
3,4
4,5


In [37]:
submission_proba[:, 0].shape

(6544,)

In [38]:
submission2 = dummy.copy()

In [39]:
for i in range(10):
    submission2[f'Class_{i+1}'] = submission_proba[:, i]

In [40]:
submission2.head()

Unnamed: 0,Sample_id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10
0,1,0.234,0.175,0.215,0.007,0.154,0.059,0.012,0.131,0.009,0.004
1,2,0.232,0.178,0.213,0.007,0.156,0.058,0.012,0.13,0.01,0.004
2,3,0.238,0.169,0.21,0.006,0.159,0.056,0.012,0.138,0.008,0.004
3,4,0.246,0.169,0.204,0.005,0.164,0.048,0.013,0.141,0.006,0.004
4,5,0.246,0.167,0.202,0.006,0.165,0.049,0.014,0.141,0.006,0.004


In [41]:
submission2.to_csv("./submission/logloss_random_forest.csv", index=False)