In [37]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import load_raw_data
from soln.dataset import generate_xv_splits_np
from soln.featurizer import CustomFeaturizer
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

Populating the interactive namespace from numpy and matplotlib


In [34]:
raw = load_raw_data()
featurizer = CustomFeaturizer()
xv_iter = generate_xv_splits_np(raw, featurizer)
X_train_feats, X_train_np, y_train_np, X_test_np, y_test_np = next(xv_iter)

print "Have {} features:".format(len(X_train_feats.columns))
for feature_name in X_train_feats.columns:
    print feature_name

Populating the interactive namespace from numpy and matplotlib
Have 375 features:
supplier other
supplier S-0042
supplier S-0005
supplier S-0026
supplier S-0027
supplier S-0072
supplier S-0062
supplier S-0064
supplier S-0043
supplier S-0066
supplier S-0041
supplier S-0105
supplier S-0080
supplier S-0081
supplier S-0104
supplier S-0013
supplier S-0014
supplier S-0070
supplier S-0031
supplier S-0030
supplier S-0058
supplier S-0054
supplier S-0092
material_id other
material_id nan
material_id SP-0046
material_id SP-0041
material_id SP-0033
material_id SP-0048
material_id SP-0034
material_id SP-0035
material_id SP-0036
material_id SP-0037
material_id SP-0030
material_id SP-0019
material_id SP-0008
material_id SP-0038
material_id SP-0039
material_id SP-0029
material_id SP-0028
end_a other
end_a EF-005
end_a NONE
end_a EF-002
end_a EF-003
end_a EF-008
end_a EF-009
end_a EF-023
end_a EF-021
end_a EF-012
end_a EF-017
end_a EF-016
end_a EF-015
end_a EF-019
end_a EF-018
end_x other
end_x EF-005


In [38]:
regressors = [
    DummyRegressor(strategy='constant', constant=0.0),
    DummyRegressor(strategy='mean'),
    RandomForestRegressor(n_estimators=20),
    # RandomForestRegressor(n_estimators=100),
]

for reg in regressors:
    %time train_rmsle, test_rmsle = eval_regressor(reg, X_train_np, y_train_np, X_test_np, y_test_np)
    print "{}:".format(reg)
    print "    train RMSLE {}".format(train_rmsle)
    print "    test RMSLE {}".format(test_rmsle)
    print

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 35.8 ms
DummyRegressor(constant=array(0.0), quantile=None, strategy='constant'):
    train RMSLE 2.35046007432
    test RMSLE 2.34246970634

CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 35.2 ms
DummyRegressor(constant=None, quantile=None, strategy='mean'):
    train RMSLE 0.822919215283
    test RMSLE 0.825379493313

CPU times: user 36.5 s, sys: 144 ms, total: 36.6 s
Wall time: 37.3 s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE 0.0903810777825
    test RMSLE 0.300728731422



In [39]:
print_feature_importances(X_train_feats, reg);

adj_quantity 0.444030230844
diameter 0.120529653932
annual_usage 0.0680515542304
min_order_quantity 0.054354945467
quantity 0.0395873474987
components other 0.0392623258328
length 0.0366953608724
quote_date_days_since_1900 0.0239282722878
end_a EF-003 0.0139932220966
bend_radius 0.011008115476
wall_thickness 0.0102929364964
supplier S-0054 0.008412278169
supplier S-0041 0.00744124466613
supplier S-0026 0.00687500563005
num_bends 0.0065907304928
end_x EF-003 0.00648410990929
bracketing (1, 3, 5, 7, 9) 0.00610923597856
supplier S-0066 0.00457019910945
supplier S-0072 0.00393774747837
bracketing (1, 6, 20) 0.00288902635824
bracketing other 0.00287625814832
bracketing (5, 19, 20) 0.00266951156344
components C-1727 0.00227011185598
num_boss 0.00194199132624
supplier S-0064 0.00186583451689
end_a EF-012 0.00180744358061
end_a_2x 0.00177282584323
end_x_2x 0.00176174542546
components C-1630 0.00155186800539
end_x NONE 0.00153874881138
specs SP-0070 0.00148629141618
end_x EF-023 0.0014834946398

In [40]:
X_train_feats['components C-1312'].value_counts()

0    23201
2     3024
1       86
dtype: int64

In [41]:
print_feature_importances(X_train_feats, reg.estimators_[0])

adj_quantity 0.383636015766
diameter 0.11265538548
quantity 0.102695954927
annual_usage 0.0695016293123
min_order_quantity 0.0562030720638
length 0.0381054124711
components other 0.0315695194441
quote_date_days_since_1900 0.0254373647876
end_a EF-003 0.0230288972112
supplier S-0054 0.0135606124729
bend_radius 0.0120611480668
num_bends 0.00732055109556
wall_thickness 0.00693447963585
bracketing (1, 3, 5, 7, 9) 0.00655966182841
supplier S-0026 0.00653500089742
supplier S-0041 0.00649436623324
end_x EF-003 0.00597924355396
supplier S-0066 0.00425901069624
bracketing other 0.00362096411591
supplier S-0072 0.00351878035727
end_a_2x 0.00322958809483
bracketing (1, 6, 20) 0.00272968076017
supplier S-0064 0.00271985531768
components C-1727 0.00266456998671
specs SP-0061 0.00201049974401
end_x EF-023 0.0019811121209
end_x_2x 0.0018908052284
bracketing (1, 2, 3, 4, 5) 0.00179992430602
components C-1629 0.00175922658999
bracketing (5, 19, 20) 0.00175655502861
components C-1445 0.00169516258566
ma

[('adj_quantity', 0.38363601576561601),
 ('diameter', 0.11265538548010036),
 ('quantity', 0.10269595492660942),
 ('annual_usage', 0.06950162931234817),
 ('min_order_quantity', 0.056203072063815751),
 ('length', 0.03810541247114544),
 ('components other', 0.031569519444086327),
 ('quote_date_days_since_1900', 0.02543736478764887),
 ('end_a EF-003', 0.0230288972111805),
 ('supplier S-0054', 0.013560612472883808),
 ('bend_radius', 0.01206114806682235),
 ('num_bends', 0.0073205510955589494),
 ('wall_thickness', 0.0069344796358468477),
 ('bracketing (1, 3, 5, 7, 9)', 0.0065596618284128761),
 ('supplier S-0026', 0.0065350008974187574),
 ('supplier S-0041', 0.0064943662332401595),
 ('end_x EF-003', 0.0059792435539641866),
 ('supplier S-0066', 0.004259010696238426),
 ('bracketing other', 0.0036209641159054905),
 ('supplier S-0072', 0.0035187803572677184),
 ('end_a_2x', 0.0032295880948316454),
 ('bracketing (1, 6, 20)', 0.0027296807601701685),
 ('supplier S-0064', 0.0027198553176849168),
 ('com

In [42]:
dump_decision_tree("tree0.pdf", X_train_feats, reg.estimators_[0], max_depth=5)