In [1]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

# Lift the cap on displayed columns so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Build aug_train_set / aug_test_set with the soln.dataset helper; %time
# reports how long the call takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 13.6 s, sys: 208 ms, total: 13.8 s
Wall time: 14.3 s


In [31]:
# Use only the first split produced by generate_xv_splits (next() pulls a
# single item). The X_test / y_test names below therefore refer to the
# held-out part of that split of aug_train_set, not to aug_test_set.
# NOTE(review): presumably a cross-validation fold -- confirm in soln.dataset.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))

CPU times: user 76 ms, sys: 8 ms, total: 84 ms
Wall time: 97.7 ms


In [6]:
# Approach 1: Keep only the examples with the () bracket.

print X_train.shape, y_train.shape
train_is = (X_train.bracketing_pattern == ())
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape

(27270, 53) (27270,)
(4249, 53) (4249,)
(2943, 53) (2943,)
(493, 53) (493,)


In [17]:
# Approach 2: In train, keep all examples with bracket=() or adj_qty=1.
# In test, keep all examples with bracket=().

print X_train.shape, y_train.shape
train_is = ((X_train.bracketing_pattern == ()) | (X_train.adj_quantity == 1))
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = (X_test.bracketing_pattern == ())
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape

# Toss bracketing info.
X_train.bracketing_pattern = 666
X_test.bracketing_pattern = 666

(27270, 53) (27270,)
(7473, 53) (7473,)
(2943, 53) (2943,)
(493, 53) (493,)


In [32]:
# Approach 3: Instead of just the empty bracket, train and test on all uncommon brackets.

common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]

print X_train.shape, y_train.shape
train_is = ~X_train.bracketing_pattern.isin(common_brackets)
X_train = X_train[train_is].reset_index(drop=True)
y_train = y_train[train_is].reset_index(drop=True)
print X_train.shape, y_train.shape

print X_test.shape, y_test.shape
test_is = ~X_test.bracketing_pattern.isin(common_brackets)
X_test = X_test[test_is].reset_index(drop=True)
y_test = y_test[test_is].reset_index(drop=True)
print X_test.shape, y_test.shape

(27270, 53) (27270,)
(8221, 53) (8221,)
(2943, 53) (2943,)
(987, 53) (987,)


In [33]:
# Fit the featurizer on the training frame only, then apply that same fitted
# featurizer to both the training and the test frame.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# List every generated feature column together with its dtype.
X_train_feats.info(verbose=True)

CPU times: user 456 ms, sys: 0 ns, total: 456 ms
Wall time: 468 ms
CPU times: user 376 ms, sys: 64 ms, total: 440 ms
Wall time: 443 ms
CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 76 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8221 entries, 0 to 8220
Data columns (total 499 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x         

In [34]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape

(8221, 499) (987, 499) (8221,) (987,)


In [35]:
# NOTE(review): this import sits mid-notebook; the other imports live in the
# first cell.
import xgboost as xgb

# Booster settings handed to xgb.train in the training cell below.
params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'silent': 1,
    'max_depth': 8,
}

# The training matrix carries its labels; the test matrix is built without
# labels, so test error is computed separately against y_test_np.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

In [36]:
# Train for a fixed number of boosting rounds, then score the model on both
# the training and the test matrix.
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
# Both scores are sqrt(mean_squared_error(...)) on the targets as given.
# NOTE(review): the *_rmsle names suggest y is already log-transformed
# upstream -- confirm in soln.dataset.
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 55.3 s, sys: 116 ms, total: 55.4 s
Wall time: 33.5 s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.54 ms
CPU times: user 240 ms, sys: 0 ns, total: 240 ms
Wall time: 141 ms
0.163275589812 0.345135857409


In [30]:
# Ten most frequent bracketing patterns in the augmented training set.
aug_train_set['bracketing_pattern'].value_counts().head(10)

(1, 2, 5, 10, 25, 50, 100, 250)    17640
()                                  4742
(1, 6, 20)                          2022
(1, 2, 3, 5, 10, 20)                 516
(1, 2, 5, 10, 25, 50, 100)           497
(5, 19, 20)                          330
(1, 2, 5, 10, 25, 50)                186
(1, 3, 5, 7, 9)                      175
(1, 2, 3, 4, 5)                      165
(2, 4, 6, 8)                         140
dtype: int64

In [37]:
# Check RMSLE on () bracket.

indices = (X_test.bracketing_pattern == ())
print indices.mean()
bra_y_test = y_test[indices]
bra_y_test_pred = pd.Series(y_test_pred)[indices]
print y_test.shape, y_test_pred.shape
print bra_y_test.shape, bra_y_test_pred.shape
bra_test_rmsle = np.sqrt(mean_squared_error(bra_y_test, bra_y_test_pred))
print bra_test_rmsle

0.499493414387
(987,) (987,)
(493,) (493,)
0.378887498903
