In [114]:
%pylab inline

import pandas as pd

from soln.dataset import get_dev_split
from soln.dataset import get_extended_X
from soln.dataset import load_raw_data
from soln.dataset import log_transform_y
from soln.featurizer import CustomFeaturizer
from soln.utils import dump_decision_tree
from soln.utils import print_feature_importances

raw = load_raw_data()
X_train, y_train, X_test, y_test = get_dev_split(raw)
X_train = get_extended_X(X_train, raw)
X_test = get_extended_X(X_test, raw)
y_train = log_transform_y(y_train)
y_test = log_transform_y(y_test)

featurizer = CustomFeaturizer()
featurizer.fit(X_train)
X_train_feats = featurizer.transform(X_train)
X_test_feats = featurizer.transform(X_test)
print "Have {} features:".format(len(X_train_feats.columns))
for feature_name in X_train_feats.columns:
    print feature_name

X_train_np = X_train_feats.astype(np.float).values
X_test_np = X_test_feats.astype(np.float).values
y_train_np = y_train.values
y_test_np = y_test.values

Populating the interactive namespace from numpy and matplotlib
Have 319 features:
supplier other
supplier S-0042
supplier S-0005
supplier S-0026
supplier S-0027
supplier S-0072
supplier S-0062
supplier S-0064
supplier S-0043
supplier S-0066
supplier S-0041
supplier S-0105
supplier S-0080
supplier S-0081
supplier S-0104
supplier S-0013
supplier S-0014
supplier S-0070
supplier S-0018
supplier S-0031
supplier S-0030
supplier S-0058
supplier S-0054
supplier S-0092
material_id other
material_id nan
material_id SP-0046
material_id SP-0041
material_id SP-0033
material_id SP-0048
material_id SP-0034
material_id SP-0035
material_id SP-0036
material_id SP-0037
material_id SP-0030
material_id SP-0019
material_id SP-0008
material_id SP-0038
material_id SP-0039
material_id SP-0029
material_id SP-0028
specs other
specs SP-0065
specs SP-0002
specs SP-0050
specs SP-0051
specs SP-0057
specs SP-0058
specs SP-0079
specs SP-0024
specs SP-0070
specs SP-0017
specs SP-0072
specs SP-0016
specs SP-0012
specs S

In [117]:
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regressors = [
    DummyRegressor(strategy='constant', constant=0.0),
    DummyRegressor(strategy='mean'),
    RandomForestRegressor(n_estimators=20),
    # RandomForestRegressor(n_estimators=100),
]

for reg in regressors:
    %time reg.fit(X_train_np, y_train_np)
    y_train_pred = reg.predict(X_train_np)
    train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
    y_test_pred = reg.predict(X_test_np)
    test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
    print "{}:".format(reg)
    print "    train RMSLE {}".format(train_rmsle)
    print "    test RMSLE {}".format(test_rmsle)
    print

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 488 µs
DummyRegressor(constant=array(0.0), quantile=None, strategy='constant'):
    train RMSLE 2.36481494682
    test RMSLE 2.20644848894

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 424 µs
DummyRegressor(constant=None, quantile=None, strategy='mean'):
    train RMSLE 0.823952519474
    test RMSLE 0.81837011564

CPU times: user 31.2 s, sys: 68 ms, total: 31.2 s
Wall time: 31.4 s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE 0.0984281175707
    test RMSLE 0.268775456581



In [109]:
print_feature_importances(X_train_feats, reg);

quantity 0.466492578635
diameter 0.126308829005
annual_usage 0.0905259793468
min_order_quantity 0.0691032434657
length 0.0393822121507
components other 0.0354056411308
wall_thickness 0.0156418114726
bend_radius 0.012337387556
supplier S-0041 0.00948070089575
num_bends 0.0093841490115
material_id SP-0029 0.00713877905487
supplier S-0054 0.00703947433933
supplier S-0026 0.00629206568391
supplier S-0066 0.00601687077107
supplier S-0064 0.00529551893215
supplier S-0072 0.00406896260012
specs SP-0061 0.0035180953258
components C-0063 0.00262230818874
components C-1637 0.00256364198925
components C-1621 0.00226425430418
num_boss 0.00220942422283
material_id SP-0035 0.00210894482869
end_a_2x 0.00193805046441
components C-1312 0.00193031676606
material_id SP-0028 0.00192456685897
end_x_2x 0.00172402090401
specs SP-0070 0.00171794993353
specs SP-0080 0.00170977327003
supplier S-0058 0.00166934346652
components C-1629 0.00159241813081
specs SP-0063 0.00152687402877
specs SP-0058 0.00143605183122

In [106]:
X_train_feats['components C-1312'].value_counts()

0    24175
2     2907
1      104
dtype: int64

In [62]:
print_feature_importances(X_train_feats, reg.estimators_[0])

components C-1866 0.473946886108
components C-2043 0.132258895405
components C-1711 0.103026552394
components C-1718 0.0667651257513
components C-1781 0.0503431931246
components C-1963 0.0217178299554
components C-1715 0.0138885396242
components C-0434 0.0133121347389
components C-1405 0.0114387077952
components C-1229 0.0113480251671
components C-0318 0.0103564362084
components C-1869 0.00751755131568
components C-0539 0.00735320516576
components C-1625 0.00705279309686
components C-0211 0.00642005487505
components C-0215 0.00595768551119
components C-0165 0.00537435925421
components C-1867 0.00434385184868
components C-1313 0.00430578048576
components C-1848 0.00390244888641
components C-0679 0.00371728966952
components C-1244 0.00233939739964
components C-1243 0.00198830772496
components C-0095 0.00190053360815
components C-0007 0.00177721941835
components C-0001 0.00177557996401
components C-0250 0.00177514852976
components C-1622 0.00173145757323
components C-1889 0.0017155156506


[('components C-1866', 0.4739468861084461),
 ('components C-2043', 0.13225889540518981),
 ('components C-1711', 0.10302655239352888),
 ('components C-1718', 0.066765125751279658),
 ('components C-1781', 0.050343193124589097),
 ('components C-1963', 0.02171782995541284),
 ('components C-1715', 0.013888539624212441),
 ('components C-0434', 0.013312134738944562),
 ('components C-1405', 0.011438707795235811),
 ('components C-1229', 0.01134802516712693),
 ('components C-0318', 0.010356436208370914),
 ('components C-1869', 0.007517551315681755),
 ('components C-0539', 0.0073532051657581136),
 ('components C-1625', 0.0070527930968613357),
 ('components C-0211', 0.0064200548750458723),
 ('components C-0215', 0.0059576855111870396),
 ('components C-0165', 0.005374359254210313),
 ('components C-1867', 0.0043438518486750262),
 ('components C-1313', 0.0043057804857639858),
 ('components C-1848', 0.0039024488864073453),
 ('components C-0679', 0.0037172896695233351),
 ('components C-1244', 0.0023393

In [110]:
dump_decision_tree("tree0.pdf", X_train_feats, reg.estimators_[0], max_depth=5)