## 4 - Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd

In [5]:
# Load the prepared training and test CSV files.
train_data = pd.read_csv('./input/train_prep.csv')
test_data = pd.read_csv('./input/test_prep.csv')

# Split the training frame by position: all rows with every column except the
# last go to `param`, all rows of the last column go to `res`.
# `.iloc` replaces the deprecated `.ix` indexer (removed in later pandas
# releases) and is always positional, regardless of the column label type.
param = train_data.iloc[:, :-1]
res = train_data.iloc[:, -1]

# Drop the ID column from the test data. `axis` is passed by keyword because
# the positional form is deprecated in newer pandas.
test_data = test_data.drop('ID', axis=1)

In [6]:
def create_output_file(data, file_name):
    """Write predictions as a one-hot encoded CSV file.

    Parameters
    ----------
    data : iterable of int
        Predicted class labels. A label ``k`` puts a 1 in position ``k - 1``
        of the row (so label 1 marks the first column) and 0 elsewhere.
    file_name : str
        Name appended to ``'./output/Sub-'`` to form the destination path.

    The CSV has an ``ID`` index column starting at 1, followed by the five
    outcome columns listed in ``class_names``.
    """
    class_names = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']

    def one_hot(label):
        # Start from an all-zero row and flag the column for this label.
        row = [0] * 5
        row[label - 1] = 1
        return row

    frame = pd.DataFrame([one_hot(label) for label in data], columns=class_names)
    frame.index.names = ['ID']
    frame.index += 1  # IDs are 1-based in the written file
    frame.to_csv('./output/Sub-' + file_name, index_label='ID')

In [13]:
# Baseline: a decision tree with default hyper-parameters, scored with
# cross-validation on the training features (`param`) and target (`res`).
# random_state is fixed so that the tree, and therefore the scores, can be
# reproduced between runs; 0 is an arbitrary seed.
dt = DecisionTreeClassifier(random_state=0)
cross_validation.cross_val_score(dt, param, res)

array([ 0.59290844,  0.59753086,  0.59840575])

In [18]:
# Fit the baseline tree on the full training data, predict the test set and
# write the predictions to ./output/Sub-DT.csv.
dt.fit(param, res)
pred = dt.predict(test_data)
create_output_file(pred, "DT.csv")
# Call form of print: valid on both Python 2 and Python 3 and consistent with
# the print(...) calls used in report().
print(pred)

[4 1 5 ..., 5 1 3]


### Pipeline and GridSearchCV

In [25]:
# Hyper-parameter grid. Keys use the pipeline convention
# "<step name>__<parameter>", so "featureSelection__k" targets SelectKBest and
# every "clf__*" entry targets the decision tree.
params = {
    "featureSelection__k" : [2, 3, 4],
    "clf__criterion": ["gini", "entropy"],
    "clf__max_features": [0.75, 1.0],
    "clf__min_samples_split": [2, 3, 10, 30],
    "clf__min_samples_leaf": [2, 3, 10, 30],
    "clf__min_weight_fraction_leaf": [0.0, 0.1, 0.3],
    "clf__max_depth": [3, 10, 30, None]
}
# Two-step pipeline: keep the k best features according to the f_classif
# score, then fit a decision tree on them.
# random_state is fixed on the tree so the grid-search ranking can be
# reproduced between runs (max_features < 1.0 makes the tree draw random
# feature subsets); 0 is an arbitrary seed.
pipeline = Pipeline([
        ('featureSelection', SelectKBest(f_classif)),
        ('clf', DecisionTreeClassifier(random_state=0))
    ])
# n_jobs=-1 asks for the candidates to be evaluated in parallel on all cores.
grid_search = GridSearchCV(pipeline, params, n_jobs=-1)

In [28]:
def report(grid_scores, n_top=3):
    """Print a summary of the best-scoring grid-search candidates.

    Parameters
    ----------
    grid_scores : iterable
        Score records. Each record is ranked by its element at index 1 and
        must expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, optional
        Number of highest-ranked records to print (default 3).

    For each printed record the rank, the mean validation score, the standard
    deviation of the per-fold scores and the parameters are shown, followed
    by an empty line.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        spread = np.std(entry.cv_validation_scores)
        summary = [
            "Model with rank: {0}".format(rank),
            "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                entry.mean_validation_score, spread),
            "Parameters: {0}".format(entry.parameters),
            "",
        ]
        for line in summary:
            print(line)

In [29]:
# Run the grid search on the training features (`param`) and target (`res`),
# then print the top-ranked parameter combinations (report() shows the best
# three by default).
grid_search.fit(param, res)
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.634 (std: 0.004)
Parameters: {'featureSelection__k': 3, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 10, 'clf__max_features': 0.75, 'clf__min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.634 (std: 0.003)
Parameters: {'featureSelection__k': 3, 'clf__criterion': 'entropy', 'clf__max_depth': 30, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 30, 'clf__max_features': 0.75, 'clf__min_samples_split': 2}

Model with rank: 3
Mean validation score: 0.634 (std: 0.004)
Parameters: {'featureSelection__k': 3, 'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 10, 'clf__max_features': 0.75, 'clf__min_samples_split': 2}



In [36]:
# Predict test-set labels with the grid search fitted in the earlier cell and
# write them to ./output/Sub-DT-grid-search.csv via create_output_file.
pred = grid_search.predict(test_data)
create_output_file(pred, "DT-grid-search.csv")
# Last expression of the cell: display the predicted class probabilities for
# the first five test rows.
grid_search.predict_proba(test_data)[:5]

array([[ 0.16666667,  0.        ,  0.16666667,  0.16666667,  0.5       ],
       [ 0.53795066,  0.00189753,  0.0256167 ,  0.26091082,  0.17362429],
       [ 0.392     ,  0.        ,  0.02      ,  0.1       ,  0.488     ],
       [ 0.01612903,  0.01612903,  0.08064516,  0.08064516,  0.80645161],
       [ 0.4424981 ,  0.00152323,  0.03884235,  0.31302361,  0.20411272]])