In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import cPickle as pickle
from scipy import sparse

In [3]:
trainData = pd.read_csv('../processed/train_processed.csv', index_col ='listing_id')
trainData.shape

(49352, 15)

In [4]:
# training data
X = trainData.drop('interest_level', axis=1)
y = trainData.interest_level

Follow guide to tuning GradientBoostingClassifier() outlined [here](https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/)

In [50]:
# First find a learning rate where classifier is optimized for low number of trees
clf = GradientBoostingClassifier(min_samples_split=500, 
                                 min_samples_leaf=50, 
                                 max_depth=6,
                                 max_features='sqrt',
                                 subsample=0.8,
                                 learning_rate=0.45)

param_grid = {'n_estimators': range(20,91,10)}
grid = GridSearchCV(clf, param_grid,cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 2min 53s


[mean: -0.61170, std: 0.00523, params: {'n_estimators': 20},
 mean: -0.60224, std: 0.00551, params: {'n_estimators': 30},
 mean: -0.59911, std: 0.00561, params: {'n_estimators': 40},
 mean: -0.59797, std: 0.00529, params: {'n_estimators': 50},
 mean: -0.59604, std: 0.00491, params: {'n_estimators': 60},
 mean: -0.59636, std: 0.00476, params: {'n_estimators': 70},
 mean: -0.59556, std: 0.00600, params: {'n_estimators': 80},
 mean: -0.59568, std: 0.00598, params: {'n_estimators': 90}]

In [53]:
grid.best_params_, grid.best_score_

({'n_estimators': 80}, -0.59556040843719538)

In [56]:
# tune max_depth and min_samples_split
clf = GradientBoostingClassifier(n_estimators=80, 
                                 min_samples_leaf=50, 
                                 max_features='sqrt',
                                 subsample=0.8,
                                 learning_rate=0.45)
param_grid2 = {'max_depth':range(2,6), 'min_samples_split':range(200,1001,200)}
grid = GridSearchCV(clf, param_grid2, cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 6min 52s


[mean: -0.61799, std: 0.00516, params: {'min_samples_split': 200, 'max_depth': 2},
 mean: -0.61748, std: 0.00409, params: {'min_samples_split': 400, 'max_depth': 2},
 mean: -0.61741, std: 0.00492, params: {'min_samples_split': 600, 'max_depth': 2},
 mean: -0.61793, std: 0.00519, params: {'min_samples_split': 800, 'max_depth': 2},
 mean: -0.61868, std: 0.00496, params: {'min_samples_split': 1000, 'max_depth': 2},
 mean: -0.60266, std: 0.00596, params: {'min_samples_split': 200, 'max_depth': 3},
 mean: -0.60239, std: 0.00568, params: {'min_samples_split': 400, 'max_depth': 3},
 mean: -0.60226, std: 0.00593, params: {'min_samples_split': 600, 'max_depth': 3},
 mean: -0.60223, std: 0.00719, params: {'min_samples_split': 800, 'max_depth': 3},
 mean: -0.60359, std: 0.00453, params: {'min_samples_split': 1000, 'max_depth': 3},
 mean: -0.59668, std: 0.00467, params: {'min_samples_split': 200, 'max_depth': 4},
 mean: -0.59575, std: 0.00575, params: {'min_samples_split': 400, 'max_depth': 4},
 m

In [57]:
grid.best_params_, grid.best_score_

({'max_depth': 5, 'min_samples_split': 1000}, -0.5942223526416639)

In [58]:
# tune min_samples_split further
clf = GradientBoostingClassifier(n_estimators=80, 
                                 min_samples_leaf=50, 
                                 max_features='sqrt',
                                 subsample=0.8,
                                 learning_rate=0.45,
                                 max_depth=5)
param_grid2 = {'min_samples_split':range(800,2001,200)}
grid = GridSearchCV(clf, param_grid2, cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 2min 55s


[mean: -0.59358, std: 0.00566, params: {'min_samples_split': 800},
 mean: -0.59390, std: 0.00596, params: {'min_samples_split': 1000},
 mean: -0.59369, std: 0.00509, params: {'min_samples_split': 1200},
 mean: -0.59413, std: 0.00469, params: {'min_samples_split': 1400},
 mean: -0.59285, std: 0.00632, params: {'min_samples_split': 1600},
 mean: -0.59227, std: 0.00602, params: {'min_samples_split': 1800},
 mean: -0.59248, std: 0.00667, params: {'min_samples_split': 2000}]

In [59]:
grid.best_params_, grid.best_score_

({'min_samples_split': 1800}, -0.59226525041550504)

In [60]:
# tune min_samples_leaf
clf = GradientBoostingClassifier(n_estimators=80, 
                                 max_features='sqrt',
                                 subsample=0.8,
                                 learning_rate=0.45,
                                 max_depth=5,
                                 min_samples_split=1800)
param_grid2 = {'min_samples_leaf':range(20,101,10)}
grid = GridSearchCV(clf, param_grid2, cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 3min 39s


[mean: -0.59445, std: 0.00480, params: {'min_samples_leaf': 20},
 mean: -0.59521, std: 0.00461, params: {'min_samples_leaf': 30},
 mean: -0.59464, std: 0.00523, params: {'min_samples_leaf': 40},
 mean: -0.59374, std: 0.00549, params: {'min_samples_leaf': 50},
 mean: -0.59472, std: 0.00594, params: {'min_samples_leaf': 60},
 mean: -0.59354, std: 0.00648, params: {'min_samples_leaf': 70},
 mean: -0.59443, std: 0.00502, params: {'min_samples_leaf': 80},
 mean: -0.59403, std: 0.00654, params: {'min_samples_leaf': 90},
 mean: -0.59441, std: 0.00648, params: {'min_samples_leaf': 100}]

In [62]:
grid.best_params_, grid.best_score_

({'min_samples_leaf': 70}, -0.59354023097023789)

In [66]:
# tune max_features
clf = GradientBoostingClassifier(n_estimators=80, 
                                 min_samples_leaf=70,
                                 subsample=0.8,
                                 learning_rate=0.45,
                                 max_depth=5,
                                 min_samples_split=1800)
param_grid2 = {'max_features':range(5,15,2)}
grid = GridSearchCV(clf, param_grid2, cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 3min 21s


[mean: -0.59058, std: 0.00605, params: {'max_features': 5},
 mean: -0.58788, std: 0.00565, params: {'max_features': 7},
 mean: -0.58773, std: 0.00491, params: {'max_features': 9},
 mean: -0.58864, std: 0.00600, params: {'max_features': 11},
 mean: -0.58846, std: 0.00498, params: {'max_features': 13}]

In [67]:
grid.best_params_, grid.best_score_

({'max_features': 9}, -0.58772797704169022)

In [72]:
# tune subsample
clf = GradientBoostingClassifier(n_estimators=80, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.45,
                                 max_depth=5,
                                 min_samples_split=1800)
param_grid = {'subsample':[0.85,0.9,0.95,1.0]}
grid = GridSearchCV(clf, param_grid, cv=5, scoring='log_loss')
%time grid.fit(X,y)
grid.grid_scores_

Wall time: 2min 43s


[mean: -0.58722, std: 0.00592, params: {'subsample': 0.85},
 mean: -0.58762, std: 0.00623, params: {'subsample': 0.9},
 mean: -0.58483, std: 0.00473, params: {'subsample': 0.95},
 mean: -0.58487, std: 0.00610, params: {'subsample': 1.0}]

In [73]:
grid.best_params_, grid.best_score_

({'subsample': 0.95}, -0.58482872338536507)

Now have all parameters. Next, lower learning rate and increase number of estimators proportionally.
clf = GradientBoostingClassifier(n_estimators=80, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.45,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)

In [74]:
clf = GradientBoostingClassifier(n_estimators=240, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.15,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)
score = cross_val_score(clf, X, y, cv=5, scoring='log_loss')
score.mean()

array([-0.58759273, -0.57170269, -0.58058868, -0.57808455, -0.58550717])

In [76]:
clf = GradientBoostingClassifier(n_estimators=720, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.05,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)
score = cross_val_score(clf, X, y, cv=5, scoring='log_loss')
score.mean()

-0.57954212242857783

In [77]:
clf = GradientBoostingClassifier(n_estimators=3600, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.01,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)
score = cross_val_score(clf, X, y, cv=5, scoring='log_loss')
score.mean()
# yields score of 0.60057 on kaggle

-0.57898731859836439

In [78]:
clf = GradientBoostingClassifier(n_estimators=7200, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.005,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)
score = cross_val_score(clf, X, y, cv=5, scoring='log_loss')
score.mean()
# yields score of 0.60017 on kaggle

-0.57887873550077473

Predict on testing set using this model

In [5]:
X_test = pd.read_csv('../processed/test_processed.csv', index_col ='listing_id')
X_test.head()

Unnamed: 0_level_0,bathrooms,bedrooms,latitude,longitude,price,num_photos,num_features,num_description,building_id,created_year,created_month,created_day,manager_id,created_day_of_week
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7142618,1.0,1,40.7185,-73.9865,2950,8,6,78,4412,2016,6,11,2694,5
7210040,1.0,2,40.7278,-74.0,2850,3,3,35,0,2016,6,24,3145,4
7103890,1.0,1,40.7306,-73.989,3758,6,3,333,2257,2016,6,3,2346,4
7143442,1.0,2,40.7109,-73.9571,3300,6,10,204,4368,2016,6,11,179,5
6860601,2.0,2,40.765,-73.9845,4900,7,14,174,3530,2016,4,12,2764,1


In [6]:
clf = GradientBoostingClassifier(n_estimators=3600, 
                                 min_samples_leaf=70, 
                                 max_features=9,
                                 learning_rate=0.01,
                                 max_depth=5,
                                 min_samples_split=1800,
                                 subsample=0.95)
clf.fit(X,y)

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=5, max_features=9, max_leaf_nodes=None,
              min_samples_leaf=70, min_samples_split=1800,
              min_weight_fraction_leaf=0.0, n_estimators=3600,
              presort='auto', random_state=None, subsample=0.95, verbose=0,
              warm_start=False)

In [7]:
y_pred = clf.predict_proba(X_test) # predict

In [8]:
# prepare to write to file
y_pred_df = pd.DataFrame(y_pred, columns=['low','medium','high'],index=X_test.index)
y_pred_df = y_pred_df[['high','medium','low']]
y_pred_df.head()

Unnamed: 0_level_0,high,medium,low
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7142618,0.099107,0.469895,0.430998
7210040,0.084911,0.135442,0.779646
7103890,0.026705,0.185972,0.787323
7143442,0.037543,0.215968,0.746489
6860601,0.013975,0.109131,0.876894


In [9]:
y_pred_df.to_csv('../processed/submission1.csv') # write to file