# Shelter Animal Outcomes 6

## Decision Trees

In case you're wondering which decision tree I'm using, I'm using the
one scikit learn is providing, which accordng to it's [user guide](http://scikit-learn.org/0.15/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart) 
> scikit-learn uses an optimised version of the CART algorithm.

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv('../Shelter_train.csv')
df_test = pd.read_csv('../Shelter_test.csv')

In [3]:
X = df_train.ix[:, :-1]
y = df_train.ix[:, -1]
df_test = df_test.drop('ID', 1)

In [4]:
clf = DecisionTreeClassifier()
cross_validation.cross_val_score(clf, X, y, scoring="log_loss")

array([-2.93792338, -2.91144915, -2.94305274])

In [5]:
%timeit clf.fit(X, y)

10 loops, best of 3: 22.5 ms per loop


In [6]:
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [7]:
params = {
    "featureSelection__k" : [2, 3, 4, 5, 6, 7, 8],
    "clf__criterion": ["gini", "entropy"],
    "clf__max_features": [0.33, 0.66, 1.0],
    "clf__min_samples_split": [1, 3, 10, 30],
    "clf__min_samples_leaf": [1, 3, 10, 30],
    "clf__min_weight_fraction_leaf": [0.0, 0.1, 0.3],
    "clf__max_depth": [3, 10, 30, None]
    }

In [8]:
pipeline = Pipeline([
        ('featureSelection', SelectKBest(f_classif)),
        ('clf', DecisionTreeClassifier())
    ])
grid_search = GridSearchCV(pipeline, params, n_jobs=-1, scoring='log_loss')
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)
predictions = grid_search.predict_proba(df_test)
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
output.index.names = ['ID']
output.index += 1
output.head()

GridSearchCV took 518.45 seconds for 8064 candidate parameter settings.
Model with rank: 1
Mean validation score: -0.954 (std: 0.005)
Parameters: {'featureSelection__k': 3, 'clf__criterion': 'gini', 'clf__max_depth': 3, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 1, 'clf__max_features': 1.0, 'clf__min_samples_split': 1}

Model with rank: 2
Mean validation score: -0.954 (std: 0.005)
Parameters: {'featureSelection__k': 4, 'clf__criterion': 'gini', 'clf__max_depth': 3, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 1, 'clf__max_features': 1.0, 'clf__min_samples_split': 1}

Model with rank: 3
Mean validation score: -0.954 (std: 0.005)
Parameters: {'featureSelection__k': 5, 'clf__criterion': 'gini', 'clf__max_depth': 3, 'clf__min_weight_fraction_leaf': 0.0, 'clf__min_samples_leaf': 1, 'clf__max_features': 1.0, 'clf__min_samples_split': 1}



Unnamed: 0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
1,0.062141,0.003655,0.11436,0.363969,0.455875
2,0.485691,0.001363,0.033161,0.316072,0.163714
3,0.679824,0.002204,0.017231,0.086756,0.213985
4,0.062141,0.003655,0.11436,0.363969,0.455875
5,0.485691,0.001363,0.033161,0.316072,0.163714


In [9]:
output.to_csv('../submission-DecisionTree.3.0.csv', index_label = 'ID')