In [49]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

Populating the interactive namespace from numpy and matplotlib


In [47]:
# Load the augmented train and test sets, then keep only the first split
# yielded by generate_xv_splits as the train / held-out pair used below.
# (Presumably "xv" means cross-validation -- confirm in soln.dataset.)
# NOTE(review): aug_test_set is not referenced by any other visible cell.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))

CPU times: user 6.94 s, sys: 12 ms, total: 6.96 s
Wall time: 7.05 s
CPU times: user 48 ms, sys: 0 ns, total: 48 ms
Wall time: 48.3 ms


In [53]:
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)

%time X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
%time X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values

CPU times: user 448 ms, sys: 4 ms, total: 452 ms
Wall time: 469 ms
CPU times: user 684 ms, sys: 152 ms, total: 836 ms
Wall time: 846 ms
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 128 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26311 entries, 0 to 26310
Data columns (total 439 columns):
annual_usage                                          int64
min_order_quantity                                    int64
bracket_pricing                                       bool
quantity                                              int64
diameter                                              float64
wall_thickness                                        float64
length                                                float64
num_bends                                             int64
bend_radius                                           float64
end_a_1x                                              bool
end_a_2x                                              bool
end_x_1x              

In [54]:
regressors = [
    DummyRegressor(strategy='constant', constant=0.0),
    DummyRegressor(strategy='mean'),
    RandomForestRegressor(n_estimators=20),
    # RandomForestRegressor(n_estimators=100),
]

for reg in regressors:
    %time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
    print "{}:".format(reg)
    print "    train RMSLE {}".format(train_rmsle)
    print "    test RMSLE {}".format(test_rmsle)
    print

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 40.1 ms
DummyRegressor(constant=array(0.0), quantile=None, strategy='constant'):
    train RMSLE 2.35046007432
    test RMSLE 2.34246970634

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 39.1 ms
DummyRegressor(constant=None, quantile=None, strategy='mean'):
    train RMSLE 0.822919215283
    test RMSLE 0.825379493313

CPU times: user 41.1 s, sys: 108 ms, total: 41.2 s
Wall time: 41.6 s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE 0.0916904766464
    test RMSLE 0.301066553903



In [55]:
# Feature importances of the last regressor evaluated above (the random
# forest); the trailing semicolon keeps the return value from being echoed.
print_feature_importances(X_train_feats, regressors[-1]);

adj_quantity 0.445235752164
diameter 0.123237915271
annual_usage 0.06907998983
min_order_quantity 0.0527020677599
quantity 0.0402767943547
length 0.0380894863912
components other 0.0378619089086
quote_age 0.0225214442638
end_a EF-003 0.0148951127114
bend_radius 0.0109940859102
wall_thickness 0.00974902346859
supplier S-0041 0.00718882546679
bracketing_pattern (1, 3, 5, 7, 9) 0.00684502056722
end_x EF-003 0.00648649061299
supplier S-0054 0.00643135981038
num_bends 0.0064171957895
supplier S-0026 0.00560650610537
supplier S-0072 0.00414361601223
supplier S-0066 0.00403968477172
bracketing_pattern (5, 19, 20) 0.00331378363415
bracketing_pattern (1, 6, 20) 0.00281509478651
components C-1727 0.00225658375011
bracketing_pattern other 0.0020292992603
num_boss 0.00190118557789
end_x_2x 0.00188276319971
supplier S-0064 0.00178852178181
end_a EF-023 0.00151213984157
specs SP-0080 0.00146276243595
components C-1631 0.00140847231859
components C-1630 0.00140297641023
end_a_2x 0.00138686714417
comp

In [41]:
# Feature importances of the first individual tree of the forest, for
# comparison with the ensemble-level importances.
# `reg` is the loop variable left over from the evaluation cell, i.e. the last
# regressor in `regressors` (the random forest).
# The trailing semicolon suppresses the returned list, which would otherwise
# be echoed as a second, duplicate copy of the printed ranking.
print_feature_importances(X_train_feats, reg.estimators_[0]);

adj_quantity 0.383636015766
diameter 0.11265538548
quantity 0.102695954927
annual_usage 0.0695016293123
min_order_quantity 0.0562030720638
length 0.0381054124711
components other 0.0315695194441
quote_date_days_since_1900 0.0254373647876
end_a EF-003 0.0230288972112
supplier S-0054 0.0135606124729
bend_radius 0.0120611480668
num_bends 0.00732055109556
wall_thickness 0.00693447963585
bracketing (1, 3, 5, 7, 9) 0.00655966182841
supplier S-0026 0.00653500089742
supplier S-0041 0.00649436623324
end_x EF-003 0.00597924355396
supplier S-0066 0.00425901069624
bracketing other 0.00362096411591
supplier S-0072 0.00351878035727
end_a_2x 0.00322958809483
bracketing (1, 6, 20) 0.00272968076017
supplier S-0064 0.00271985531768
components C-1727 0.00266456998671
specs SP-0061 0.00201049974401
end_x EF-023 0.0019811121209
end_x_2x 0.0018908052284
bracketing (1, 2, 3, 4, 5) 0.00179992430602
components C-1629 0.00175922658999
bracketing (5, 19, 20) 0.00175655502861
components C-1445 0.00169516258566
ma

[('adj_quantity', 0.38363601576561601),
 ('diameter', 0.11265538548010036),
 ('quantity', 0.10269595492660942),
 ('annual_usage', 0.06950162931234817),
 ('min_order_quantity', 0.056203072063815751),
 ('length', 0.03810541247114544),
 ('components other', 0.031569519444086327),
 ('quote_date_days_since_1900', 0.02543736478764887),
 ('end_a EF-003', 0.0230288972111805),
 ('supplier S-0054', 0.013560612472883808),
 ('bend_radius', 0.01206114806682235),
 ('num_bends', 0.0073205510955589494),
 ('wall_thickness', 0.0069344796358468477),
 ('bracketing (1, 3, 5, 7, 9)', 0.0065596618284128761),
 ('supplier S-0026', 0.0065350008974187574),
 ('supplier S-0041', 0.0064943662332401595),
 ('end_x EF-003', 0.0059792435539641866),
 ('supplier S-0066', 0.004259010696238426),
 ('bracketing other', 0.0036209641159054905),
 ('supplier S-0072', 0.0035187803572677184),
 ('end_a_2x', 0.0032295880948316454),
 ('bracketing (1, 6, 20)', 0.0027296807601701685),
 ('supplier S-0064', 0.0027198553176849168),
 ('com

In [42]:
# Pass the first tree of the forest (`reg` is the loop variable left over from
# the evaluation cell) to the project helper together with the feature frame.
# Presumably this renders the tree to "tree0.pdf", truncated at depth 5 --
# confirm against soln.utils.dump_decision_tree.
dump_decision_tree("tree0.pdf", X_train_feats, reg.estimators_[0], max_depth=5)