In [22]:
%pylab inline

import pandas as pd

from soln.dataset import get_dev_split
from soln.dataset import get_extended_X
from soln.dataset import load_raw_data
from soln.dataset import log_transform_y
from soln.featurizer import CustomFeaturizer
from soln.utils import dump_decision_tree
from soln.utils import print_feature_importances

raw = load_raw_data()
X_train, y_train, X_test, y_test = get_dev_split(raw)
X_train = get_extended_X(X_train, raw)
X_test = get_extended_X(X_test, raw)
y_train = log_transform_y(y_train)
y_test = log_transform_y(y_test)

featurizer = CustomFeaturizer()
featurizer.fit(X_train)
X_train_feats = featurizer.transform(X_train)
X_test_feats = featurizer.transform(X_test)
print "Have {} features:".format(len(X_train_feats.columns))
for feature_name in X_train_feats.columns:
    print feature_name

X_train_np = X_train_feats.astype(np.float).values
X_test_np = X_test_feats.astype(np.float).values
y_train_np = y_train.values
y_test_np = y_test.values

Populating the interactive namespace from numpy and matplotlib
Have 343 features:
supplier other
supplier S-0042
supplier S-0005
supplier S-0026
supplier S-0027
supplier S-0072
supplier S-0062
supplier S-0064
supplier S-0043
supplier S-0066
supplier S-0041
supplier S-0105
supplier S-0080
supplier S-0081
supplier S-0104
supplier S-0013
supplier S-0014
supplier S-0070
supplier S-0031
supplier S-0030
supplier S-0058
supplier S-0054
supplier S-0092
material_id other
material_id nan
material_id SP-0046
material_id SP-0041
material_id SP-0033
material_id SP-0048
material_id SP-0034
material_id SP-0035
material_id SP-0036
material_id SP-0037
material_id SP-0030
material_id SP-0019
material_id SP-0008
material_id SP-0038
material_id SP-0039
material_id SP-0029
material_id SP-0028
end_a other
end_a EF-005
end_a NONE
end_a EF-002
end_a EF-003
end_a EF-008
end_a EF-009
end_a EF-023
end_a EF-021
end_a EF-012
end_a EF-017
end_a EF-016
end_a EF-015
end_a EF-019
end_a EF-018
end_x other
end_x EF-005


In [23]:
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regressors = [
    DummyRegressor(strategy='constant', constant=0.0),
    DummyRegressor(strategy='mean'),
    RandomForestRegressor(n_estimators=20),
    # RandomForestRegressor(n_estimators=100),
]

for reg in regressors:
    %time reg.fit(X_train_np, y_train_np)
    y_train_pred = reg.predict(X_train_np)
    train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
    y_test_pred = reg.predict(X_test_np)
    test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
    print "{}:".format(reg)
    print "    train RMSLE {}".format(train_rmsle)
    print "    test RMSLE {}".format(test_rmsle)
    print

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 412 µs
DummyRegressor(constant=array(0.0), quantile=None, strategy='constant'):
    train RMSLE 2.35046007432
    test RMSLE 2.34246970634

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 408 µs
DummyRegressor(constant=None, quantile=None, strategy='mean'):
    train RMSLE 0.822919215283
    test RMSLE 0.825379493313

CPU times: user 32.2 s, sys: 40 ms, total: 32.3 s
Wall time: 32.4 s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE 0.091241971812
    test RMSLE 0.296693938282



In [24]:
print_feature_importances(X_train_feats, reg);

adj_quantity 0.444719426938
diameter 0.120579102418
annual_usage 0.0709923502694
min_order_quantity 0.0553211598815
quantity 0.0385604708165
components other 0.0383921152957
length 0.0372597884342
quote_date_days_since_1900 0.030524871225
end_a EF-003 0.0141104313117
bend_radius 0.0108907245945
wall_thickness 0.0104257317084
supplier S-0054 0.0100041754148
supplier S-0041 0.00789361729129
supplier S-0026 0.00711349416617
num_bends 0.00680852904217
end_x EF-003 0.0065214897831
supplier S-0072 0.00524943745687
supplier S-0066 0.00515054602316
components C-1727 0.00236124269879
num_boss 0.00207226437354
end_a EF-012 0.00199074003296
end_x_2x 0.00176665045233
supplier S-0064 0.0017477122995
supplier S-0058 0.00164208209753
components C-1630 0.00162453979885
end_a_2x 0.00161137334629
end_x NONE 0.00160327985822
material_id SP-0028 0.00156733924232
specs SP-0070 0.00149862864491
end_x EF-023 0.0014720562278
material_id SP-0008 0.00144375627895
components C-1631 0.00137150650922
components C-

In [5]:
X_train_feats['components C-1312'].value_counts()

0    23201
2     3024
1       86
dtype: int64

In [62]:
print_feature_importances(X_train_feats, reg.estimators_[0])

components C-1866 0.473946886108
components C-2043 0.132258895405
components C-1711 0.103026552394
components C-1718 0.0667651257513
components C-1781 0.0503431931246
components C-1963 0.0217178299554
components C-1715 0.0138885396242
components C-0434 0.0133121347389
components C-1405 0.0114387077952
components C-1229 0.0113480251671
components C-0318 0.0103564362084
components C-1869 0.00751755131568
components C-0539 0.00735320516576
components C-1625 0.00705279309686
components C-0211 0.00642005487505
components C-0215 0.00595768551119
components C-0165 0.00537435925421
components C-1867 0.00434385184868
components C-1313 0.00430578048576
components C-1848 0.00390244888641
components C-0679 0.00371728966952
components C-1244 0.00233939739964
components C-1243 0.00198830772496
components C-0095 0.00190053360815
components C-0007 0.00177721941835
components C-0001 0.00177557996401
components C-0250 0.00177514852976
components C-1622 0.00173145757323
components C-1889 0.0017155156506


[('components C-1866', 0.4739468861084461),
 ('components C-2043', 0.13225889540518981),
 ('components C-1711', 0.10302655239352888),
 ('components C-1718', 0.066765125751279658),
 ('components C-1781', 0.050343193124589097),
 ('components C-1963', 0.02171782995541284),
 ('components C-1715', 0.013888539624212441),
 ('components C-0434', 0.013312134738944562),
 ('components C-1405', 0.011438707795235811),
 ('components C-1229', 0.01134802516712693),
 ('components C-0318', 0.010356436208370914),
 ('components C-1869', 0.007517551315681755),
 ('components C-0539', 0.0073532051657581136),
 ('components C-1625', 0.0070527930968613357),
 ('components C-0211', 0.0064200548750458723),
 ('components C-0215', 0.0059576855111870396),
 ('components C-0165', 0.005374359254210313),
 ('components C-1867', 0.0043438518486750262),
 ('components C-1313', 0.0043057804857639858),
 ('components C-1848', 0.0039024488864073453),
 ('components C-0679', 0.0037172896695233351),
 ('components C-1244', 0.0023393

In [6]:
dump_decision_tree("tree0.pdf", X_train_feats, reg.estimators_[0], max_depth=5)