In [1]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model

# Show every column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the augmented train and test sets in one call; %time reports how long
# the load takes.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 13.5 s, sys: 196 ms, total: 13.7 s
Wall time: 13.9 s


In [3]:
from itertools import islice
fold_number = 0
%time X_train, y_train, X_test, y_test = next(islice(generate_xv_splits(aug_train_set), fold_number, None))
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

CPU times: user 88 ms, sys: 40 ms, total: 128 ms
Wall time: 125 ms
(27270, 53) (27270,) (2943, 53) (2943,)


In [4]:
# Layer 2: Uncommon brackets.
#
# Hyperparameters for the layer-2 model. The dict is handed to train_model and,
# later in the notebook, its items are passed straight to xgb.train.
# NOTE(review): key names follow xgboost's parameter vocabulary -- see the
# xgboost parameter docs for their exact meaning in the installed version.

layer2_params = {
    'objective': 'reg:linear',
    'silent': 1,
    # Also read directly by the notebook as the third argument of xgb.train.
    'num_rounds': 1000,
    'gamma': 0.0,
    'eta': 0.02,
    'max_depth': 8,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
}

# Bracketing patterns treated as "common". Rows whose pattern is NOT in this
# list are the ones layer 2 is responsible for.
common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]


def layer2_get_indices(X):
    """Return a boolean mask selecting the rows of ``X`` with an uncommon bracket.

    A row is selected when its ``bracketing_pattern`` value is not one of the
    tuples listed in ``common_brackets``. The mask carries the same index as
    ``X``.
    """
    is_common = X['bracketing_pattern'].isin(common_brackets)
    return ~is_common

In [5]:
# Featurizer passed to train_model (and later to eval_model) alongside the
# layer-2 row selector.
layer2_featurizer = AllCategoricalsFeaturizer()
# Train layer 2 on the training fold. The returned object is indexed with
# 'model' and 'X_train' further down in the notebook.
%time layer2 = train_model(layer2_params, layer2_get_indices, layer2_featurizer, X_train, y_train)

CPU times: user 1min, sys: 428 ms, total: 1min
Wall time: 39.9 s


In [6]:
# Score the layer-2 model on the training fold and on the held-out fold,
# using the same row selector and featurizer for both.
layer2_train_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_train, y_train)
layer2_test_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_test, y_test)

print("Train on uncommon brackets, test on uncommon brackets:")
# Shapes, in order: rows used for training, rows scored on the training fold,
# rows scored on the held-out fold.
print(layer2['X_train'].shape)
print(layer2_train_results['X_eval'].shape)
print(layer2_test_results['X_eval'].shape)
print("train RMSLE %s" % (layer2_train_results['rmsle'],))
print("test RMSLE %s" % (layer2_test_results['rmsle'],))

Train on uncommon brackets, test on uncommon brackets:
(8221, 53)
(8221, 53)
(987, 53)
train RMSLE 0.163275589812
test RMSLE 0.345135857409


In [73]:
# Try training layer2 on all instances, but assigning higher weights to the
# uncommon brackets.

low_weight = 0.1
high_weight = 16.0

# Boolean mask, indexed like X_train, that is True for uncommon-bracket rows.
high_weight_is = layer2_get_indices(X_train)

# Build the weights on X_train's own index so the boolean assignment below
# lines up label-for-label with the mask. A default 0..n-1 index only works
# when the fold's index happens to be 0..n-1 as well; otherwise pandas either
# raises or assigns the high weight to the wrong rows.
weights = pd.Series(low_weight, index=X_train.index)
weights[high_weight_is] = high_weight
print(weights.value_counts())

# Featurize every training row (not just the uncommon-bracket subset).
featurizer = AllCategoricalsFeaturizer()
featurizer.fit(X_train)
X_train_feats = featurizer.transform(X_train)
# np.float64 instead of the `np.float` alias, which newer NumPy removed.
X_train_np = X_train_feats.astype(np.float64).values
y_train_np = y_train.values
# Rows, labels and weights are all passed as plain arrays in X_train's row
# order, so they are matched by position.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np, weight=weights.values)

0.1     19049
16.0     8221
dtype: int64


In [74]:
# Train on every training row, using the per-row weights attached to xgtrain.
# The hyperparameters are passed as (key, value) pairs, and
# layer2_params['num_rounds'] is passed as the third positional argument
# (num_boost_round in xgboost's API -- confirm for the installed version).
# NOTE(review): 'num_rounds' is also still present in the pairs handed to
# xgb.train; presumably xgboost ignores it -- confirm.
%time model = xgb.train(layer2_params.items(), xgtrain, layer2_params['num_rounds'])

CPU times: user 2min 37s, sys: 412 ms, total: 2min 38s
Wall time: 1min 32s


In [75]:
# Score the weighted model on the held-out fold, restricted by the same
# uncommon-bracket selector used for layer 2.
test_results = eval_model(model, layer2_get_indices, featurizer, X_test, y_test)

print("Train on weigh, test on uncommon brackets:")
print(test_results['X_eval'].shape)
print("test RMSLE %s" % (test_results['rmsle'],))

Train on weigh, test on uncommon brackets:
(987, 53)
test RMSLE 0.349621353972
