# Classifying with GBM using deeper trees

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.svm as skl_svm
import sklearn.cross_validation as skl_cv
import sklearn.ensemble as skl_ensemble
import seaborn as sns
import os

import time
from sklearn.grid_search import GridSearchCV

In [6]:
# Load the digit training data. Column 0 is skipped as the label column and
# every remaining column is used as a feature (assumes 'label' is the first
# column, exactly as the original positional slice did -- TODO confirm).
digit_data = pd.read_csv('Data/train.csv')
X = digit_data.iloc[:, 1:]
y = digit_data['label']

# Train on a small subset to keep GBM fitting time manageable.
# `.loc` slices by label and includes BOTH endpoints, so with the default
# integer index this keeps rows 0..5000 (5001 rows) -- the same rows the
# deprecated `.ix` indexer selected here.
X_subset = X.loc[0:5000, :]
y_subset = y.loc[0:5000]

# Hold out 20% of the subset for evaluation. The seed is fixed so that the
# split is identical on every Restart & Run All.
# NOTE(review): `sklearn.cross_validation` only exists in older scikit-learn
# releases; newer releases expose train_test_split from
# `sklearn.model_selection` -- confirm against the installed version.
X_train, X_test, y_train, y_test = skl_cv.train_test_split(
    X_subset, y_subset, test_size=0.2, random_state=0)

In [7]:
# Sanity check: dimensions of the training split as (rows, feature columns).
X_train.shape

(4000, 784)

In [10]:
# Baseline model: 600 boosting stages, learning rate 0.1, trees limited to at
# most 15 leaves. verbose=True prints the per-iteration training loss.
# random_state pins the estimator's internal randomness so a refit on the same
# data is repeatable.
gbm_clf = skl_ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=600,
    learning_rate=0.1,
    max_leaf_nodes=15,
    random_state=0,
)

In [11]:
# Fit the classifier on the training split and measure wall-clock time.
start_time = time.time()
gbm_clf.fit(X_train, y_train)
end_time = time.time()

# A single pre-formatted argument keeps this line valid on both Python 2 and
# Python 3 (the bare `print` statement is a SyntaxError on Python 3). The
# double spaces reproduce exactly what the old comma-separated print emitted.
print('Elapsed Time:  %s  seconds' % (end_time - start_time))

      Iter       Train Loss   Remaining Time 
         1        7146.9828           11.04m
         2        6015.0066           11.09m
         3        5219.7225           11.15m
         4        4629.6947           11.11m
         5        4142.4426           11.04m
         6        3720.7394           11.07m
         7        3359.2531           11.12m
         8        3049.2468           11.17m
         9        2762.7042           11.24m
        10        2523.1591           11.29m
        20        1136.6834           11.59m
        30         630.5898           11.46m
        40         390.1343           11.20m
        50         250.6399           10.95m
        60         169.7409           10.68m
        70         117.1191           10.44m
        80          82.6114           10.22m
        90          59.6453            9.96m
       100          42.5923            9.73m
       200           2.8789            6.99m
       300           1.4560            3.83m
       40

In [12]:
# Hold-out accuracy: share of test rows whose predicted label equals the true label.
np.mean(gbm_clf.predict(X_test) == y_test.values)

0.93306693306693311

So the GBM gets 93.3% accuracy using max_leaf_nodes=15, n_estimators=600, and the subset of data (n=4,000). Let's try going deeper and adding more trees:

In [13]:
# Second model: twice as many boosting stages (1200) and a larger leaf budget
# (25 leaves per tree), same learning rate. random_state pins the estimator's
# internal randomness so a refit on the same data is repeatable.
gbm_clf = skl_ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=1200,
    learning_rate=0.1,
    max_leaf_nodes=25,
    random_state=0,
)

# Fit on the training split and measure wall-clock time.
start_time = time.time()
gbm_clf.fit(X_train, y_train)
end_time = time.time()

# A single pre-formatted argument keeps this line valid on both Python 2 and
# Python 3 (the bare `print` statement is a SyntaxError on Python 3). The
# double spaces reproduce exactly what the old comma-separated print emitted.
print('Elapsed Time:  %s  seconds' % (end_time - start_time))

      Iter       Train Loss   Remaining Time 
         1        7143.2101           42.62m
         2        6010.3450           44.99m
         3        5218.9431           45.02m
         4        4624.0047           44.85m
         5        4128.7856           45.21m
         6        3716.1754           45.27m
         7        3361.3323           44.92m
         8        3036.5369           44.64m
         9        2759.6336           44.53m
        10        2512.1649           43.64m
        20        1136.1035           34.25m
        30         629.0533           31.18m
        40         381.9679           29.18m
        50         245.4960           27.62m
        60         164.3440           26.78m
        70         114.0362           25.77m
        80          82.3250           25.15m
        90          59.6250           25.05m
       100          43.6793           25.01m
       200           2.8258           19.36m
       300           1.4489           12.60m
       40

In [14]:
# Hold-out accuracy: share of test rows whose predicted label equals the true label.
np.mean(gbm_clf.predict(X_test) == y_test.values)

0.93906093906093902

Slight improvement to 93.9% up from 93.3%. This isn't quite what I was hoping for. I will now reduce the learning rate:

In [15]:
# Third model: same size as before (1200 stages, 25 leaves) but with the
# learning rate cut tenfold to 0.01. random_state pins the estimator's
# internal randomness so a refit on the same data is repeatable.
gbm_clf = skl_ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=1200,
    learning_rate=0.01,
    max_leaf_nodes=25,
    random_state=0,
)

# Fit on the training split and measure wall-clock time.
start_time = time.time()
gbm_clf.fit(X_train, y_train)
end_time = time.time()

# A single pre-formatted argument keeps this line valid on both Python 2 and
# Python 3 (the bare `print` statement is a SyntaxError on Python 3). The
# double spaces reproduce exactly what the old comma-separated print emitted.
print('Elapsed Time:  %s  seconds' % (end_time - start_time))

# Hold-out accuracy (last expression, so the notebook displays it).
(gbm_clf.predict(X_test) == y_test.values).mean()

      Iter       Train Loss   Remaining Time 
         1        8988.4191           22.54m
         2        8785.9181           22.34m
         3        8597.2597           22.14m
         4        8414.8648           22.22m
         5        8246.2158           22.30m
         6        8083.5764           22.47m
         7        7930.9643           22.50m
         8        7785.5677           22.42m
         9        7647.4831           22.34m
        10        7511.9254           22.28m
        20        6410.4709           22.02m
        30        5579.0527           21.91m
        40        4928.7215           22.05m
        50        4389.0527           22.32m
        60        3943.2290           22.52m
        70        3561.2868           22.23m
        80        3214.8356           22.17m
        90        2920.3107           22.05m
       100        2663.8260           21.89m
       200        1199.1041           20.38m
       300         655.6966           18.37m
       40

0.93106893106893107

In [17]:
# Fourth model: 2000 stages at learning rate 0.01 with the leaf budget raised
# to 75. random_state pins the estimator's internal randomness so a refit on
# the same data is repeatable.
# NOTE(review): in the recorded output the training loss at every logged
# iteration matches the max_leaf_nodes=25 run to within ~1e-3, which suggests
# the larger leaf budget is not changing the fitted trees. Possibly another
# limit (e.g. the default max_depth) is the binding constraint -- verify
# against the installed scikit-learn version.
gbm_clf = skl_ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=2000,
    learning_rate=0.01,
    max_leaf_nodes=75,
    random_state=0,
)

# Fit on the training split and measure wall-clock time.
start_time = time.time()
gbm_clf.fit(X_train, y_train)
end_time = time.time()

# A single pre-formatted argument keeps this line valid on both Python 2 and
# Python 3 (the bare `print` statement is a SyntaxError on Python 3). The
# double spaces reproduce exactly what the old comma-separated print emitted.
print('Elapsed Time:  %s  seconds' % (end_time - start_time))

# Hold-out accuracy (last expression, so the notebook displays it).
(gbm_clf.predict(X_test) == y_test.values).mean()

      Iter       Train Loss   Remaining Time 
         1        8988.4189           37.28m
         2        8785.9180           37.11m
         3        8597.2597           36.99m
         4        8414.8647           37.75m
         5        8246.2157           37.56m
         6        8083.5764           37.46m
         7        7930.9643           37.43m
         8        7785.5676           37.52m
         9        7647.4830           37.42m
        10        7511.9254           37.35m
        20        6410.4709           37.22m
        30        5579.0527           37.21m
        40        4928.7215           37.08m
        50        4389.0527           36.88m
        60        3943.2289           36.67m
        70        3561.2868           36.55m
        80        3214.8356           36.62m
        90        2920.3107           36.68m
       100        2663.8260           36.73m
       200        1199.1041           36.45m
       300         655.6966           34.57m
       40

0.93706293706293708

It looks like gradient boosting hits a wall at around 94%. I will investigate which digits it gets wrong: