In [11]:
%pylab inline

import pandas as pd

from soln.dataset import get_dev_split
from soln.dataset import get_extended_X
from soln.dataset import load_raw_data
from soln.dataset import log_transform_y

raw = load_raw_data()
X_train, y_train, X_test, y_test = get_dev_split(raw)
X_train = get_extended_X(X_train, raw)
X_test = get_extended_X(X_test, raw)
y_train = log_transform_y(y_train)
y_test = log_transform_y(y_test)

Populating the interactive namespace from numpy and matplotlib


In [48]:
from soln.featurizer import CustomFeaturizer
featurizer = CustomFeaturizer()
featurizer.fit(X_train)
X_train_feats = featurizer.transform(X_train)
X_test_feats = featurizer.transform(X_test)
list(X_train_feats.columns)

['supplier other',
 'supplier S-0042',
 'supplier S-0005',
 'supplier S-0026',
 'supplier S-0027',
 'supplier S-0072',
 'supplier S-0062',
 'supplier S-0064',
 'supplier S-0043',
 'supplier S-0066',
 'supplier S-0041',
 'supplier S-0105',
 'supplier S-0080',
 'supplier S-0081',
 'supplier S-0104',
 'supplier S-0013',
 'supplier S-0014',
 'supplier S-0070',
 'supplier S-0018',
 'supplier S-0031',
 'supplier S-0030',
 'supplier S-0058',
 'supplier S-0054',
 'supplier S-0092',
 'material_id other',
 'material_id nan',
 'material_id SP-0046',
 'material_id SP-0041',
 'material_id SP-0033',
 'material_id SP-0048',
 'material_id SP-0034',
 'material_id SP-0035',
 'material_id SP-0036',
 'material_id SP-0037',
 'material_id SP-0030',
 'material_id SP-0019',
 'material_id SP-0008',
 'material_id SP-0038',
 'material_id SP-0039',
 'material_id SP-0029',
 'material_id SP-0028',
 'specs other',
 'specs SP-0065',
 'specs SP-0002',
 'specs SP-0050',
 'specs SP-0051',
 'specs SP-0057',
 'specs SP-00

In [49]:
X_train_np = X_train_feats.astype(np.float).values
X_test_np = X_test_feats.astype(np.float).values
y_train_np = y_train.values
y_test_np = y_test.values

In [50]:
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

regressors = [
    DummyRegressor(strategy='constant', constant=0.0),
    DummyRegressor(strategy='mean'),
    RandomForestRegressor(n_estimators=20),
    # RandomForestRegressor(n_estimators=100),
]

for reg in regressors:
    %time reg.fit(X_train_np, y_train_np)
    y_train_pred = reg.predict(X_train_np)
    train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
    y_test_pred = reg.predict(X_test_np)
    test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
    print "{}:".format(reg)
    print "    train RMSLE {}".format(train_rmsle)
    print "    test RMSLE {}".format(test_rmsle)
    print

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 358 µs
DummyRegressor(constant=array(0.0), quantile=None, strategy='constant'):
    train RMSLE 2.36481494682
    test RMSLE 2.20644848894

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 407 µs
DummyRegressor(constant=None, quantile=None, strategy='mean'):
    train RMSLE 0.823952519474
    test RMSLE 0.81837011564

CPU times: user 11.7 s, sys: 16 ms, total: 11.7 s
Wall time: 11.8 s
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False):
    train RMSLE 0.109795964546
    test RMSLE 0.350591729874



In [42]:
X_train_feats['specs SP-0080'].value_counts()

False    22355
True      4831
dtype: int64

In [51]:
feat_imps = list(zip(X_train_feats.columns, reg.feature_importances_))
feat_imps.sort(key=lambda (feat, imp): imp, reverse=True)
for feat, imp in feat_imps:
    print feat, imp

quantity 0.469463539106
diameter 0.137506020656
annual_usage 0.0984722305463
min_order_quantity 0.0681419620087
length 0.0528168244548
wall_thickness 0.0219557048529
bend_radius 0.0149763605285
num_bends 0.0138102001164
supplier S-0041 0.0110584699468
material_id SP-0029 0.00914507019142
supplier S-0064 0.00815279461689
supplier S-0026 0.00805146870822
supplier S-0054 0.0079115047761
supplier S-0066 0.00779712546724
num_boss 0.00582718653476
material_id SP-0028 0.00565149023247
supplier S-0072 0.00468928163961
material_id SP-0035 0.00437321884996
specs SP-0061 0.0040876394703
num_other 0.00360641113785
end_a_2x 0.0029645681614
end_x_2x 0.0024377393125
supplier S-0058 0.00218060485395
supplier S-0013 0.00201474787753
specs SP-0007 0.00189362695369
bracket_pricing 0.00184348304616
specs SP-0080 0.00181614661454
specs SP-0063 0.00162670619245
specs SP-0058 0.00137305134097
material_id SP-0038 0.00136699598792
end_a_1x 0.00134224682791
specs SP-0004 0.00132877747478
specs SP-0012 0.0013208