## 3 - Random Forest

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV
from operator import itemgetter

In [2]:
# Load the training and test CSVs from ./input in a single paired assignment
# (the `_prep` suffix suggests they were preprocessed elsewhere -- confirm).
train_data, test_data = (
    pd.read_csv('./input/train_prep.csv'),
    pd.read_csv('./input/test_prep.csv'),
)

In [3]:
# Split the training frame by position: every column except the last is a
# feature, the last column is the target. `.iloc` is used instead of `.ix`
# because `.ix` is deprecated (and removed in later pandas releases) and is
# ambiguous between label-based and positional lookup.
param = train_data.iloc[:, :-1]  # all rows, all but the last column
res = train_data.iloc[:, -1]  # all rows, only the last column
# Drop the ID column from the test data. `axis` is passed by keyword.
# NOTE: this rebinds `test_data`, so re-running this cell without reloading
# the CSV fails because 'ID' has already been removed.
test_data = test_data.drop('ID', axis=1)

In [4]:
# create output file
def create_output_file(data, file_name):
    """Write one-hot encoded class predictions to ``./output/Sub-<file_name>``.

    Each prediction is a 1-based class label; label ``k`` sets the ``k``-th of
    the five outcome columns to 1 and the others to 0. The CSV gets an ``ID``
    index column that starts at 1.

    Parameters
    ----------
    data : iterable of int
        Predicted class labels, each expected to lie in ``1..5``.
    file_name : str
        Suffix appended to ``'./output/Sub-'`` to form the target path. The
        ``./output`` directory must already exist.

    Raises
    ------
    IndexError
        If a label lies outside ``1..5``. Labels of 0 or below are rejected
        explicitly: plain list indexing would wrap around via negative
        indices and silently flag the wrong class.
    """
    classes = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    rows = []
    for pred in data:
        if not 1 <= pred <= len(classes):
            raise IndexError(
                'prediction %r is outside the valid label range 1..%d'
                % (pred, len(classes)))
        row = [0] * len(classes)
        row[pred - 1] = 1  # labels are 1-based, list positions 0-based
        rows.append(row)
    output = pd.DataFrame(rows, columns=classes)
    output.index += 1  # submission IDs start at 1, not 0
    output.to_csv('./output/Sub-' + file_name, index_label='ID')

In [5]:
# Baseline forest. max_features='sqrt' makes every split consider
# sqrt(n_features) candidate features (for classifiers this is what the
# former 'auto' setting meant). An alternative tried here was
# n_estimators=100 with max_features=None, i.e. all features at every split.
# random_state is fixed so the cross-validation scores below and the
# predictions made from this estimator are repeatable between runs.
rfc = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=0)
cross_validation.cross_val_score(rfc, param, res)

array([ 0.59447935,  0.59775533,  0.60357023])

In [6]:
# Train the baseline forest on the full training set, predict a class label
# for every test row and write them out as a submission file.
rfc.fit(param, res)  # fit the data
predictions = rfc.predict(test_data)  # make predictions
create_output_file(predictions, 'RF.csv')
# Function-call form of print: same output under Python 2, also valid under
# Python 3, and consistent with the print(...) calls used in report().
print(predictions)

[5 1 5 ..., 5 1 3]


### With Pipeline and GridSearchCV

In [7]:
# Two-step pipeline: recursive feature elimination with cross-validation
# (ranked by a small 20-tree forest, scored by negative log loss), followed
# by the 100-tree forest whose hyper-parameters are searched below.
# Both forests get a fixed random_state so that score differences between
# parameter combinations come from the parameters, not from seed noise.
pipeline = Pipeline([
    ('featureSelection', RFECV(estimator=RandomForestClassifier(n_estimators=20, random_state=0),
                               scoring='neg_log_loss')),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=0))
])
# Search grid for the final classifier; the "clf__" prefix routes each entry
# to the 'clf' pipeline step. 3 * 2 * 3 * 3 * 2 * 2 = 216 combinations.
params = {
    "clf__max_depth": [2, 3, None],
    "clf__max_features": [0.75, 1.0],
    "clf__min_samples_split": [2, 3, 10],
    "clf__min_samples_leaf": [2, 3, 10],
    "clf__bootstrap": [True, False],
    "clf__criterion": ["gini", "entropy"]
}

In [22]:
def report(grid_scores, n_top=3):
    """Print a short summary of the best-scoring grid-search entries.

    Entries are ordered by their second tuple field (descending) and the
    first ``n_top`` are printed with their rank, mean validation score, the
    standard deviation of their per-fold scores, and their parameters.

    Parameters
    ----------
    grid_scores : iterable
        Tuple-like score records exposing ``mean_validation_score``,
        ``cv_validation_scores`` and ``parameters`` attributes.
    n_top : int, optional
        Number of entries to print (default 3).
    """
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        print("Model with rank: {0}".format(rank))
        summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score,
            np.std(entry.cv_validation_scores))
        print(summary)
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [23]:
# Exhaustive search over `params` for the pipeline, using every available
# core (n_jobs=-1). fit() returns the fitted search object, so construction
# and fitting are chained; the top-ranked combinations are then summarised.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    n_jobs=-1,
).fit(param, res)
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.608 (std: 0.003)
Parameters: {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': 2, 'clf__min_samples_leaf': 2, 'clf__max_features': 0.5, 'clf__min_samples_split': 3}

Model with rank: 2
Mean validation score: 0.608 (std: 0.003)
Parameters: {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__min_samples_leaf': 2, 'clf__max_features': 1.0, 'clf__min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.558 (std: 0.070)
Parameters: {'clf__bootstrap': True, 'clf__criterion': 'gini', 'clf__max_depth': 2, 'clf__min_samples_leaf': 3, 'clf__max_features': 0.25, 'clf__min_samples_split': 3}



In [24]:
# Per-class probabilities for every test row from the fitted grid search.
# Computed once and reused for the preview, rather than running
# predict_proba over the whole test set a second time to show five rows.
predictions = grid_search.predict_proba(test_data)
predictions[:5]

array([[ 0.22198464,  0.        ,  0.15504756,  0.44868443,  0.17428337],
       [ 0.37186383,  0.0128928 ,  0.06081248,  0.04027921,  0.51415168],
       [ 0.29865831,  0.00257245,  0.13388114,  0.31360071,  0.25128739],
       [ 0.4221105 ,  0.00484381,  0.0541711 ,  0.24561823,  0.27325635],
       [ 0.4221105 ,  0.00484381,  0.0541711 ,  0.24561823,  0.27325635]])

In [18]:
# Hard class labels from the fitted grid search, written out as the
# grid-search submission file.
predictions = grid_search.predict(test_data)
create_output_file(data=predictions, file_name='RF-gridSearch.csv')