In [219]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

Populating the interactive namespace from numpy and matplotlib


In [220]:
# Load the train and test sets, then take only the first item yielded by
# generate_xv_splits() as this notebook's (train, holdout) pair. Note that
# X_test / y_test below are that holdout split, not aug_test_set.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))

CPU times: user 13.9 s, sys: 72 ms, total: 13.9 s
Wall time: 14.4 s
CPU times: user 180 ms, sys: 8 ms, total: 188 ms
Wall time: 198 ms


In [221]:
# Fit the featurizer on the training split only, then apply the same fitted
# featurizer to both the training and the holdout split.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# verbose=True requests the full per-column summary of the feature frame.
X_train_feats.info(verbose=True)

CPU times: user 1.46 s, sys: 4 ms, total: 1.47 s
Wall time: 1.48 s
CPU times: user 1.38 s, sys: 260 ms, total: 1.64 s
Wall time: 1.67 s
CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 178 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27270 entries, 0 to 27269
Data columns (total 596 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x   

In [None]:
# Experiment to determine whether component_clusters help:
# Drop all features except components and component_clusters.
#
# Disabled by default. Set the flag to True to run the experiment. With the
# flag off this cell is a no-op, so "Restart & Run All" continues past it
# instead of aborting on an AssertionError.
RUN_COMPONENTS_ONLY_EXPERIMENT = False

if RUN_COMPONENTS_ONLY_EXPERIMENT:
    # str.startswith accepts a tuple of prefixes; the trailing space in each
    # prefix is part of the match.
    chosen_cols = [
        col for col in X_train_feats.columns
        if col.startswith(('components ', 'component_clusters '))
    ]

    # NOTE: this overwrites the full feature frames in the kernel; re-run the
    # featurizer cell to get all columns back.
    X_train_feats = X_train_feats[chosen_cols]
    X_test_feats = X_test_feats[chosen_cols]
    X_train_feats.info(verbose=True)

In [222]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape

(27270, 596) (2943, 596) (27270,) (2943,)


In [223]:
# Inspired by https://www.kaggle.com/kumareshd/caterpillar-tube-pricing/xgbooost-222/code

import xgboost as xgb

# Booster settings; the training cells hand these to xgb.train().
params = dict(
    objective='reg:linear',
    eta=0.02,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.6,
    # NOTE(review): scale_pos_weight is normally described as a
    # class-imbalance setting; presumably carried over from the script
    # linked above -- TODO confirm it is intended for this objective.
    scale_pos_weight=0.8,
    silent=1,
    max_depth=8,
    max_delta_step=2,
)

# Only the training matrix carries labels; the holdout matrix is built from
# features alone and is used for prediction.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

In [None]:
# Experiment: Try linear booster instead of tree booster.
#
# Disabled by default. Set the flag to True to run the experiment. With the
# flag off this cell is a no-op, so "Restart & Run All" continues past it
# instead of aborting on an AssertionError.
RUN_LINEAR_BOOSTER_EXPERIMENT = False

if RUN_LINEAR_BOOSTER_EXPERIMENT:
    # NOTE: this replaces `params`, `xgtrain` and `xgtest` from the tree
    # booster cell; re-run that cell to restore them.
    params = {
        'booster': 'gblinear',
        'objective': 'reg:linear',
        'silent': 1,
    }

    from sklearn.preprocessing import StandardScaler
    # Fit the scaler on the training features only, then apply the same
    # scaling to both matrices.
    scaler = StandardScaler()
    scaler.fit(X_train_np)
    xgtrain = xgb.DMatrix(scaler.transform(X_train_np), label=y_train_np)
    xgtest = xgb.DMatrix(scaler.transform(X_test_np))

In [224]:
# Train a booster for a fixed number of rounds, then score it on both the
# training matrix and the holdout matrix.
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
# This is the root of the mean squared error between targets and predictions.
# It is called RMSLE presumably because the targets are already log-costs
# (a later cell stores them as `true_log_cost`) -- TODO confirm.
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
# Printed as: training error, holdout error.
print train_rmsle, test_rmsle

CPU times: user 2min 39s, sys: 476 ms, total: 2min 40s
Wall time: 1min 48s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 4.77 ms
CPU times: user 820 ms, sys: 0 ns, total: 820 ms
Wall time: 514 ms
0.127432962457 0.224804181574


In [None]:
## NOTE: The cells below have not been re-evaluated since new features were added; their outputs may be stale.

In [34]:
# Check RMSLE on well-behaved bracket.
from soln.bracket import brapa
indices = (X_test.bracketing_pattern == brapa)
print indices.mean()
bra_y_test = y_test[indices]
bra_y_test_pred = pd.Series(y_test_pred)[indices]
print y_test.shape, y_test_pred.shape
print bra_y_test.shape, bra_y_test_pred.shape
bra_test_rmsle = np.sqrt(mean_squared_error(bra_y_test, bra_y_test_pred))
print bra_test_rmsle

0.559972816854
(2943,) (2943,)
(1648,) (1648,)
0.0884511080473


In [228]:
# Check if RMSLE on rows with unknown components is worse.

from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components
comp_types, group_dfs, cluster_dfs = load_raw_components()
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)

from soln.utils import count_components
train_counts = count_components(X_train, cinfo_df)
train_counts.rename(columns={'count': 'train_count'}, inplace=True)
test_counts = count_components(X_test, cinfo_df)
test_counts.rename(columns={'count': 'test_count'}, inplace=True)
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')

known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print len(all_counts), len(known_cids)

has_unk = []
for cids in X_test.components:
    has_unk.append(any([cid not in known_cids for cid in cids]))
print len(X_test), len(has_unk)

X_test['has_unk'] = has_unk
print X_test.has_unk.value_counts()
print X_test.has_unk.value_counts(normalize=True)
tmp_df = X_test[['tube_assembly_id', 'has_unk']].drop_duplicates()
print len(X_test), len(tmp_df)
print tmp_df.has_unk.value_counts()
print tmp_df.has_unk.value_counts(normalize=True)

2047 1141
2943 2943
False    2791
True      152
dtype: int64
False    0.948352
True     0.051648
dtype: float64
2943 895
False    828
True      67
dtype: int64
False    0.92514
True     0.07486
dtype: float64


In [235]:
X_test['true_log_cost'] = y_test
X_test['pred_log_cost'] = y_test_pred
print len(X_test)
print "overall RMSLE:", sqrt(mean_squared_error(X_test.true_log_cost.values, X_test.pred_log_cost.values))

tmp_df = X_test[X_test.has_unk == False]
print len(tmp_df)
print "has_unk=False RMSLE:", sqrt(mean_squared_error(tmp_df.true_log_cost.values, tmp_df.pred_log_cost.values))

tmp_df = X_test[X_test.has_unk == True]
print len(tmp_df)
print "has_unk=True RMSLE:", sqrt(mean_squared_error(tmp_df.true_log_cost.values, tmp_df.pred_log_cost.values))

2943
overall RMSLE: 0.224804181574
2791
has_unk=False RMSLE: 0.20667325039
152
has_unk=True RMSLE: 0.440662251413


In [11]:
num_rounds = 2000
%time model = xgb.train(plst, xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 4min 9s, sys: 580 ms, total: 4min 10s
Wall time: 2min 24s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 6.43 ms
CPU times: user 2.54 s, sys: 4 ms, total: 2.55 s
Wall time: 1.45 s
0.102381677438 0.217054198541


In [12]:
num_rounds = 3000
%time model = xgb.train(plst, xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 6min 15s, sys: 956 ms, total: 6min 16s
Wall time: 3min 40s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 4.18 ms
CPU times: user 4.55 s, sys: 4 ms, total: 4.56 s
Wall time: 2.72 s
0.0842968028879 0.214548119854


In [13]:
num_rounds = 4000
%time model = xgb.train(plst, xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 8min 17s, sys: 1.16 s, total: 8min 18s
Wall time: 4min 48s
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 4.06 ms
CPU times: user 6.24 s, sys: 12 ms, total: 6.25 s
Wall time: 3.6 s
0.072606964701 0.213577193377


In [14]:
num_rounds = 6000
%time model = xgb.train(plst, xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 12min 22s, sys: 2.23 s, total: 12min 24s
Wall time: 7min 8s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 5.71 ms
CPU times: user 9.56 s, sys: 16 ms, total: 9.58 s
Wall time: 5.54 s
0.0576814524533 0.212653378762


In [15]:
num_rounds = 10000
%time model = xgb.train(plst, xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 20min 30s, sys: 3.06 s, total: 20min 33s
Wall time: 11min 48s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 4.39 ms
CPU times: user 15.8 s, sys: 52 ms, total: 15.9 s
Wall time: 9.15 s
0.0430123921517 0.211972759348
