In [17]:
%pylab inline

from collections import Counter
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import pandas as pd
import xgboost as xgb

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import inverse_log_transform_y
from soln.dataset import log_transform_y
from soln.bracket import brapa
from soln.bracket import fc_vals
from soln.bracket import generate_bracket_csv
from soln.utils import eval_regressor
from soln.utils import print_brackets
from soln.utils import print_feature_importances

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Build the augmented train and test sets once (timed). Later cells draw
# their train/validation splits from aug_train_set via generate_xv_splits.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 13 s, sys: 168 ms, total: 13.2 s
Wall time: 13.4 s


In [3]:
bracket = pd.read_csv('bracket.csv')
print bracket.shape
bracket[:5]

(2205, 4)


Unnamed: 0,tube_assembly_id,fixed_cost_class,fixed_cost,var_cost
0,TA-18908,1,19.043385,2.76479
1,TA-18906,1,19.043385,2.868769
2,TA-18907,1,19.043385,3.430745
3,TA-18902,1,19.043385,1.759423
4,TA-18903,1,19.043385,1.825497


In [4]:
# Get train and test set only for the well-behaved bracket.

X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape

X_train = X_train[X_train.bracketing_pattern == brapa]
X_test = X_test[X_test.bracketing_pattern == brapa]
print X_train.shape, X_test.shape

y_train = X_train.pop('log_cost')
y_test = X_test.pop('log_cost')

print X_train.bracketing_pattern.value_counts()
print X_test.bracketing_pattern.value_counts()
print X_train.supplier.value_counts()
print X_test.supplier.value_counts()

(27270, 50) (2943, 50)
(15992, 50) (1648, 50)
(1, 2, 5, 10, 25, 50, 100, 250)    15992
dtype: int64
(1, 2, 5, 10, 25, 50, 100, 250)    1648
dtype: int64
S-0066    15992
dtype: int64
S-0066    1648
dtype: int64


In [8]:
# Evaluate original xgb on the well-behaved bracket.

featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)

%time X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
%time X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values

params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'scale_pos_weight': 0.8,  # undocumented?!
    'silent': 1,
    'max_depth': 8,
    'max_delta_step': 2,
}

xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 784 ms, sys: 0 ns, total: 784 ms
Wall time: 791 ms
CPU times: user 656 ms, sys: 0 ns, total: 656 ms
Wall time: 651 ms
CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 93.3 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15992 entries, 0 to 27223
Data columns (total 248 columns):
annual_usage                                          int64
min_order_quantity                                    int64
bracket_pricing                                       bool
quantity                                              int64
diameter                                              float64
wall_thickness                                        float64
length                                                float64
num_bends                                             int64
bend_radius                                           float64
end_a_1x                                              bool
end_a_2x                                              bool
end_x_1x                 

In [9]:
# Try to predict fixed_cost_class and log(var_cost) independently, then combine the two.

X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))
X_train['log_cost'] = y_train
X_test['log_cost'] = y_test
print X_train.shape, X_test.shape

X_train = X_train[(X_train.bracketing_pattern == brapa) & (X_train.adj_quantity == 1)]
X_test = X_test[(X_test.bracketing_pattern == brapa) & (X_test.adj_quantity == 1)]
log_cost_train = X_train.pop('log_cost')
log_cost_test = X_test.pop('log_cost')
print X_train.shape, X_test.shape

X_train = X_train.merge(bracket, on='tube_assembly_id')
X_test = X_test.merge(bracket, on='tube_assembly_id')
X_train.pop('fixed_cost')
X_test.pop('fixed_cost')
log_var_cost_train = log_transform_y(X_train.pop('var_cost'))
log_var_cost_test = log_transform_y(X_test.pop('var_cost'))
fcc_train = X_train.pop('fixed_cost_class')
fcc_test = X_test.pop('fixed_cost_class')
print X_train.shape, X_test.shape

featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
X_train_feats.info(verbose=True)

%time X_train_np = X_train_feats.astype(np.float).values
%time X_test_np = X_test_feats.astype(np.float).values

(27270, 50) (2943, 50)
(1999, 49) (206, 49)
(1999, 49) (206, 49)
CPU times: user 100 ms, sys: 0 ns, total: 100 ms
Wall time: 102 ms
CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 109 ms
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 37.3 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1998
Data columns (total 160 columns):
annual_usage                                          int64
min_order_quantity                                    int64
bracket_pricing                                       bool
quantity                                              int64
diameter                                              float64
wall_thickness                                        float64
length                                                float64
num_bends                                             int64
bend_radius                                           float64
end_a_1x                                              bool
end_a_2x             

In [13]:
# The classification part: predict fixed_cost_class.

y_train = fcc_train
y_test = fcc_test
y_train_np = y_train.values
y_test_np = y_test.values

xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

params = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'scale_pos_weight': 0.8,  # undocumented?!
    'silent': 1,
    'max_depth': 8,
    'max_delta_step': 2,
}

num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)

print "on train:"
print accuracy_score(y_train_np, y_train_pred)
print confusion_matrix(y_train_np, y_train_pred)

%time y_test_pred = model.predict(xgtest)
pred_fixed_cost_class_test = y_test_pred

print
print "on test:"
print accuracy_score(y_test_np, y_test_pred)
print confusion_matrix(y_test_np, y_test_pred)

CPU times: user 26.7 s, sys: 88 ms, total: 26.8 s
Wall time: 15.2 s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.62 ms
on train:
0.973986993497
[[  30    0    0    0]
 [   0 1174    1    4]
 [   0    0  216   12]
 [   0   34    1  527]]
CPU times: user 140 ms, sys: 0 ns, total: 140 ms
Wall time: 77.9 ms

on test:
0.936893203883
[[  1   2   0   0]
 [  0 138   0   2]
 [  0   3  24   2]
 [  0   4   0  30]]


In [15]:
# The regression part: predict var_cost.

y_train = log_var_cost_train
y_test = log_var_cost_test
y_train_np = y_train.values
y_test_np = y_test.values

params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'scale_pos_weight': 0.8,  # undocumented?!
    'silent': 1,
    'max_depth': 8,
    'max_delta_step': 2,
}

xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
pred_log_var_cost_test = y_test_pred
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 10.2 s, sys: 48 ms, total: 10.3 s
Wall time: 5.71 s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 484 µs
CPU times: user 44 ms, sys: 0 ns, total: 44 ms
Wall time: 23.8 ms
0.0560411692289 0.129193580971


In [18]:
# Combine predictions.

print accuracy_score(fcc_test, pred_fixed_cost_class_test)
print np.sqrt(mean_squared_error(log_var_cost_test, pred_log_var_cost_test))
df = pd.DataFrame()
df['tube_assembly_id'] = X_test.tube_assembly_id
df['pred_fixed_cost_class'] = pred_fixed_cost_class_test
df['pred_fixed_cost'] = np.array(fc_vals)[df.pred_fixed_cost_class]
df['pred_var_cost'] = inverse_log_transform_y(pred_log_var_cost_test)
print df.shape
df[:5]

0.936893203883
0.129193580971
(206, 4)


Unnamed: 0,tube_assembly_id,pred_fixed_cost_class,pred_fixed_cost,pred_var_cost
0,TA-00093,1,19.043385,2.509289
1,TA-00125,3,23.633726,4.808931
2,TA-00173,2,20.295284,4.184652
3,TA-00264,1,19.043385,2.43019
4,TA-00334,1,19.043385,2.554564


In [19]:
# Evaluate on all quantities.

_, _, X_test_full, y_test_full = next(generate_xv_splits(aug_train_set))
X_test_full['log_cost'] = y_test_full
print X_test_full.shape

X_test_full = X_test_full[X_test_full.bracketing_pattern == brapa]
print X_test_full.shape

X_test_full = X_test_full.merge(bracket, on='tube_assembly_id')
X_test_full = X_test_full.merge(df, on='tube_assembly_id')
print X_test_full.shape

X_test_full['pred_cost'] = X_test_full.pred_fixed_cost / X_test_full.adj_quantity + X_test_full.pred_var_cost
X_test_full['pred_log_cost'] = log_transform_y(X_test_full.pred_cost)
X_test_full['err2'] = (X_test_full.log_cost.values - X_test_full.pred_log_cost.values) ** 2
X_test_full.sort('err2', ascending=False, inplace=True)
print X_test_full.shape

print X_test_full.err2.describe()
print np.sqrt(mean_squared_error(X_test_full.log_cost.values, X_test_full.pred_log_cost.values))

X_test_full[:10]

(2943, 50)
(1648, 50)
(1648, 56)
(1648, 59)
count    1.648000e+03
mean     1.145908e-02
std      6.121045e-02
min      3.300686e-10
25%      4.241907e-05
50%      3.428373e-04
75%      2.197979e-03
max      1.031941e+00
Name: err2, dtype: float64
0.107047074172


Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall_thickness,...,log_cost,fixed_cost_class,fixed_cost,var_cost,pred_fixed_cost_class,pred_fixed_cost,pred_var_cost,pred_cost,pred_log_cost,err2
1567,TA-20766,S-0066,2013-11-02,1,0,True,250,SP-0029,12.7,0.89,...,2.979539,3,23.633726,18.49766,3,23.633726,6.031068,6.125603,1.963694,1.031941
1566,TA-20766,S-0066,2013-11-02,1,0,True,100,SP-0029,12.7,0.89,...,2.985178,3,23.633726,18.49766,3,23.633726,6.031068,6.267405,1.983399,1.003561
1565,TA-20766,S-0066,2013-11-02,1,0,True,50,SP-0029,12.7,0.89,...,2.994553,3,23.633726,18.49766,3,23.633726,6.031068,6.503742,2.015402,0.958737
1564,TA-20766,S-0066,2013-11-02,1,0,True,25,SP-0029,12.7,0.89,...,3.014818,3,23.633726,18.49766,3,23.633726,6.031068,6.976417,2.076489,0.880461
1563,TA-20766,S-0066,2013-11-02,1,0,True,10,SP-0029,12.7,0.89,...,3.082232,3,23.633726,18.49766,3,23.633726,6.031068,8.39444,2.240118,0.709156
1562,TA-20766,S-0066,2013-11-02,1,0,True,5,SP-0029,12.7,0.89,...,3.185531,3,23.633726,18.49766,3,23.633726,6.031068,10.757813,2.464518,0.519859
791,TA-06066,S-0066,2013-12-02,1,0,True,250,SP-0029,25.4,3.05,...,2.576576,2,20.295284,11.986175,2,20.295284,6.218779,6.29996,1.987869,0.346577
790,TA-06066,S-0066,2013-12-02,1,0,True,100,SP-0029,25.4,3.05,...,2.583743,2,20.295284,11.986175,2,20.295284,6.218779,6.421731,2.004412,0.335624
789,TA-06066,S-0066,2013-12-02,1,0,True,50,SP-0029,25.4,3.05,...,2.595228,2,20.295284,11.986175,2,20.295284,6.218779,6.624684,2.031391,0.317912
375,TA-03006,S-0066,2013-09-01,0,0,True,250,SP-0029,19.05,1.65,...,3.085035,2,20.295284,20.70167,2,20.295284,11.410233,11.491414,2.525042,0.313592
