# Shelter Animal Outcomes 10

## Gradient Tree Boosting

In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.feature_selection import RFECV
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv('../Shelter_train.csv')
df_test = pd.read_csv('../Shelter_test.csv')

In [3]:
X = df_train.ix[:, :-1]
y = df_train.ix[:, -1]
df_test = df_test.drop('ID', 1)

In [4]:
clf = GradientBoostingClassifier()
cross_validation.cross_val_score(clf, X, y, scoring="log_loss")

array([-0.95119628, -0.94774087, -0.93670263])

In [14]:
params = {
          "clf__max_features": [0.1, 0.3, 1.0],
          "clf__min_samples_split": [1, 3, 10],
          "clf__max_depth": [3, 5, 6],
          "clf__n_estimators" : sp_randint(10, 400),
          "clf__max_leaf_nodes": [3, 5, 7, None]
}          

In [6]:
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [15]:
pipeline = Pipeline([
        ('featureSelection', RFECV(estimator=GradientBoostingClassifier(n_estimators=10), scoring='log_loss')),
        ('clf', GradientBoostingClassifier())
        ])
rand_search = RandomizedSearchCV(pipeline, params, n_iter=50, n_jobs=-1, scoring='log_loss')
start = time()
rand_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(rand_search.grid_scores_)))
report(rand_search.grid_scores_)
predictions = rand_search.predict_proba(df_test)
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
output.index.names = ['ID']
output.index += 1
output.head()

GridSearchCV took 1252.21 seconds for 50 candidate parameter settings.
Model with rank: 1
Mean validation score: -0.945 (std: 0.006)
Parameters: {'clf__max_features': 1.0, 'clf__max_depth': 3, 'clf__n_estimators': 74, 'clf__min_samples_split': 10, 'clf__max_leaf_nodes': 7}

Model with rank: 2
Mean validation score: -0.945 (std: 0.006)
Parameters: {'clf__max_features': 1.0, 'clf__max_depth': 3, 'clf__n_estimators': 66, 'clf__min_samples_split': 1, 'clf__max_leaf_nodes': 7}

Model with rank: 3
Mean validation score: -0.946 (std: 0.007)
Parameters: {'clf__max_features': 1.0, 'clf__max_depth': 3, 'clf__n_estimators': 140, 'clf__min_samples_split': 1, 'clf__max_leaf_nodes': 5}



Unnamed: 0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
1,0.074442,0.003995,0.076139,0.295358,0.550065
2,0.596681,0.001717,0.030891,0.235492,0.135218
3,0.663599,0.002926,0.034327,0.090181,0.208966
4,0.09334,0.004693,0.079784,0.370129,0.452055
5,0.516111,0.002105,0.021441,0.285548,0.174794


In [10]:
output.to_csv('../submission-GradientBoostingClassifier.3.0.csv', index_label = 'ID')