In [9]:
%pylab inline

from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_model
from soln.utils import train_model

# Lift the column display limit so wide DataFrames are shown without truncation.
pd.options.display.max_columns = None

Populating the interactive namespace from numpy and matplotlib


In [5]:
# Load the augmented train/test sets, then take only the first split yielded by
# generate_xv_splits as the train / held-out pair used in the rest of the notebook.
# Note that X_test / y_test are carved out of aug_train_set; aug_test_set is not
# used by any cell visible here.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
# Sanity check: the X and y shapes of each split should agree on the row count.
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

CPU times: user 13.5 s, sys: 100 ms, total: 13.6 s
Wall time: 14.2 s
CPU times: user 128 ms, sys: 20 ms, total: 148 ms
Wall time: 152 ms
(27270, 53) (27270,) (2943, 53) (2943,)


In [4]:
# Layer 1: Everything.
#
# Settings handed to train_model for the model that is fit on all rows.
# NOTE(review): 'num_rounds' is presumably consumed by train_model itself rather
# than forwarded to xgboost as a booster parameter -- confirm in soln.utils.
layer1_params = dict(
    objective='reg:linear',
    silent=1,
    num_rounds=1000,
    gamma=0.0,
    eta=0.02,
    max_depth=8,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.6,
)

def layer1_get_indices(X):
    """Return an all-True boolean NumPy mask with one entry per row of X.

    Layer 1 covers everything, so no row is filtered out.
    """
    row_count = len(X)
    select_all = np.ones((row_count,), dtype=bool)
    return select_all

In [7]:
# Train the layer-1 model on the rows selected by layer1_get_indices (all of them).
# The featurizer instance created here is the one passed again whenever the layer-1
# model is evaluated in later cells.
# NOTE(review): the featurizer is presumably fitted inside train_model -- confirm.
layer1_featurizer = AllCategoricalsFeaturizer()
# train_model's result is read later through its 'model' and 'X_train' keys.
%time layer1 = train_model(layer1_params, layer1_get_indices, layer1_featurizer, X_train, y_train)

CPU times: user 2min 43s, sys: 992 ms, total: 2min 44s
Wall time: 1min 44s


In [8]:
# Score the layer-1 model twice with the same row selector and featurizer:
# once on the training split and once on the held-out split.
layer1_train_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_train, y_train)
layer1_test_results = eval_model(layer1['model'], layer1_get_indices, layer1_featurizer, X_test, y_test)
print "Train on everything, test on everything:"
# Shapes of the frame the model was trained on and of the frames it was evaluated on.
print layer1['X_train'].shape
print layer1_train_results['X_eval'].shape
print layer1_test_results['X_eval'].shape
# The gap between these two numbers indicates how much the model overfits.
print "train RMSLE", layer1_train_results['rmsle']
print "test RMSLE", layer1_test_results['rmsle']

Train on everything, test on everything:
(27270, 53)
(27270, 53)
(2943, 53)
train RMSLE 0.124960740984
test RMSLE 0.227403087285


In [10]:
# Layer 2: Uncommon brackets.
#
# Settings handed to train_model for the model that is fit only on rows whose
# bracketing pattern is not one of the common ones.
# NOTE(review): 'num_rounds' is presumably consumed by train_model itself rather
# than forwarded to xgboost as a booster parameter -- confirm in soln.utils.
layer2_params = dict(
    objective='reg:linear',
    silent=1,
    num_rounds=1000,
    gamma=0.0,
    eta=0.02,
    max_depth=8,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.6,
)

# Bracketing patterns treated as "common"; a row with any other pattern
# (including the empty tuple) is handled by layer 2.
common_brackets = [
    (1, 2, 5, 10, 25, 50, 100, 250),
    (1, 6, 20),
    (1, 2, 3, 5, 10, 20),
    (1, 2, 5, 10, 25, 50, 100),
    (5, 19, 20),
]


def layer2_get_indices(X):
    """Return a boolean mask that is True where X.bracketing_pattern is not in common_brackets."""
    is_common = X.bracketing_pattern.isin(common_brackets)
    return ~is_common

In [11]:
# Train the layer-2 model on only the rows selected by layer2_get_indices
# (bracketing pattern not in common_brackets), with its own featurizer instance
# that is passed again whenever the layer-2 model is evaluated.
layer2_featurizer = AllCategoricalsFeaturizer()
%time layer2 = train_model(layer2_params, layer2_get_indices, layer2_featurizer, X_train, y_train)

CPU times: user 59.6 s, sys: 324 ms, total: 59.9 s
Wall time: 40.3 s


In [12]:
# Baseline for layer 2: score the layer-1 model (trained on everything) on only the
# held-out rows selected by layer2_get_indices, so it can be compared with the
# specialised layer-2 model on the same rows.
print "Train on everything, test on uncommon brackets:"
# tmp is a scratch result that is only read inside this cell.
tmp = eval_model(layer1['model'], layer2_get_indices, layer1_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']

Train on everything, test on uncommon brackets:
(987, 53)
test RMSLE 0.369369099906


In [13]:
# Score the layer-2 model on the uncommon-bracket rows of the training split and of
# the held-out split. layer2_test_results is kept because a later cell reads its
# 'eval_is' and 'y_eval_pred' entries when combining the layers.
layer2_train_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_train, y_train)
layer2_test_results = eval_model(layer2['model'], layer2_get_indices, layer2_featurizer, X_test, y_test)
print "Train on uncommon brackets, test on uncommon brackets:"
# Shapes of the frame the model was trained on and of the frames it was evaluated on.
print layer2['X_train'].shape
print layer2_train_results['X_eval'].shape
print layer2_test_results['X_eval'].shape
print "train RMSLE", layer2_train_results['rmsle']
print "test RMSLE", layer2_test_results['rmsle']

Train on uncommon brackets, test on uncommon brackets:
(8221, 53)
(8221, 53)
(987, 53)
train RMSLE 0.163275589812
test RMSLE 0.345135857409


In [17]:
# Combine layers 1 and 2 on the held-out split: start from a copy of the layer-1
# predictions, then overwrite the entries flagged by layer 2's 'eval_is' with the
# layer-2 predictions.
# NOTE(review): assumes 'eval_is' is a boolean mask aligned with y_test_pred and that
# 'y_eval_pred' holds one value per selected row, in the same order -- confirm
# against eval_model.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
# Square root of the mean squared error, compared positionally via .values.
# NOTE(review): labelled RMSLE, which presumes the targets are already on a log
# scale -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
print "Layer 1 and layer 2 together:"
print y_test_pred.shape
print "test RMSLE", rmsle

Layer 1 and layer 2 together:
(2943,)
test RMSLE 0.214255159159


In [18]:
# Layer 3: Empty bracket.
#
# Settings handed to train_model for the model that is fit only on rows whose
# bracketing pattern is the empty tuple.
# NOTE(review): 'num_rounds' is presumably consumed by train_model itself rather
# than forwarded to xgboost as a booster parameter -- confirm in soln.utils.
layer3_params = dict(
    objective='reg:linear',
    silent=1,
    num_rounds=1000,
    gamma=0.0,
    eta=0.02,
    max_depth=8,
    min_child_weight=6,
    subsample=0.7,
    colsample_bytree=0.6,
)

def layer3_get_indices(X):
    """Return a boolean mask that is True where X.bracketing_pattern equals the empty tuple."""
    empty_pattern = ()
    has_empty_pattern = X.bracketing_pattern == empty_pattern
    return has_empty_pattern

In [19]:
# Train the layer-3 model on only the rows selected by layer3_get_indices
# (bracketing pattern equal to the empty tuple), with its own featurizer instance
# that is passed again whenever the layer-3 model is evaluated.
layer3_featurizer = AllCategoricalsFeaturizer()
%time layer3 = train_model(layer3_params, layer3_get_indices, layer3_featurizer, X_train, y_train)

CPU times: user 35 s, sys: 208 ms, total: 35.2 s
Wall time: 25.6 s


In [21]:
# Baseline for layer 3: score the layer-1 model (trained on everything) on only the
# held-out rows selected by layer3_get_indices.
print "Train on everything, test on empty bracket:"
# tmp is a scratch result that is only read inside this cell.
tmp = eval_model(layer1['model'], layer3_get_indices, layer1_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']

Train on everything, test on empty bracket:
(493, 53)
test RMSLE 0.402430706857


In [22]:
# Second baseline for layer 3: score the layer-2 model (trained on uncommon brackets)
# on only the held-out rows selected by layer3_get_indices.
print "Train on uncommon brackets, test on empty bracket:"
# tmp is a scratch result that is only read inside this cell.
tmp = eval_model(layer2['model'], layer3_get_indices, layer2_featurizer, X_test, y_test)
print tmp['X_eval'].shape
print "test RMSLE", tmp['rmsle']

Train on uncommon brackets, test on empty bracket:
(493, 53)
test RMSLE 0.378887498903


In [23]:
# Score the layer-3 model on the empty-bracket rows of the training split and of the
# held-out split. layer3_test_results is kept because a later cell reads its
# 'eval_is' and 'y_eval_pred' entries when combining the layers.
layer3_train_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_train, y_train)
layer3_test_results = eval_model(layer3['model'], layer3_get_indices, layer3_featurizer, X_test, y_test)
print "Train on empty bracket, test on empty bracket:"
# Shapes of the frame the model was trained on and of the frames it was evaluated on.
print layer3['X_train'].shape
print layer3_train_results['X_eval'].shape
print layer3_test_results['X_eval'].shape
print "train RMSLE", layer3_train_results['rmsle']
print "test RMSLE", layer3_test_results['rmsle']

Train on empty bracket, test on empty bracket:
(4249, 53)
(4249, 53)
(493, 53)
train RMSLE 0.146880176789
test RMSLE 0.377893012301


In [24]:
# Combine all three layers on the held-out split: start from a copy of the layer-1
# predictions, overwrite the layer-2 rows, then overwrite the layer-3 rows.
# Order matters: the empty tuple is not in common_brackets, so every layer-3 row is
# also a layer-2 row, and applying layer 3 last lets it take precedence there.
# NOTE(review): assumes each 'eval_is' is a boolean mask aligned with y_test_pred and
# each 'y_eval_pred' holds one value per selected row, in the same order -- confirm
# against eval_model.
y_test_pred = pd.Series(layer1_test_results['y_eval_pred'], copy=True)
y_test_pred[layer2_test_results['eval_is']] = layer2_test_results['y_eval_pred']
y_test_pred[layer3_test_results['eval_is']] = layer3_test_results['y_eval_pred']
# Square root of the mean squared error, compared positionally via .values.
# NOTE(review): labelled RMSLE, which presumes the targets are already on a log
# scale -- confirm.
rmsle = np.sqrt(mean_squared_error(y_test.values, y_test_pred.values))
print "Layer 1 and layer 2 and layer 3 together:"
print y_test_pred.shape
print "test RMSLE", rmsle

Layer 1 and layer 2 and layer 3 together:
(2943,)
test RMSLE 0.213960742254
