In [44]:
# `%pylab inline` pulls numpy and matplotlib into the interactive namespace;
# it is the only thing that makes the bare `np` name used by later cells
# available, since numpy is never imported explicitly in this notebook.
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import dump_decision_tree
from soln.utils import eval_regressor
from soln.utils import print_feature_importances

# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [26]:
# Fetch the augmented train and test sets from soln.dataset; timed because it
# is the slowest data step (about 14 s in the recorded run).
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 13.7 s, sys: 76 ms, total: 13.8 s
Wall time: 14 s


In [27]:
# Take only the first split yielded by generate_xv_splits; the generator is
# not kept, so any further splits are never consumed. Note that X_test/y_test
# are therefore held out from aug_train_set, not taken from aug_test_set.
# Presumably "xv" stands for cross-validation -- confirm in soln.dataset.
%time X_train, y_train, X_test, y_test = next(generate_xv_splits(aug_train_set))

CPU times: user 108 ms, sys: 8 ms, total: 116 ms
Wall time: 123 ms


In [28]:
# Keep only the test examples that have unknown components.

from soln.dataset import get_component_info_df
from soln.dataset import load_raw_components
comp_types, group_dfs, cluster_dfs = load_raw_components()
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)

from soln.utils import count_components
train_counts = count_components(X_train, cinfo_df)
train_counts.rename(columns={'count': 'train_count'}, inplace=True)
test_counts = count_components(X_test, cinfo_df)
test_counts.rename(columns={'count': 'test_count'}, inplace=True)
all_counts = cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
all_counts = all_counts.merge(train_counts, on='component_id')
all_counts = all_counts.merge(test_counts, on='component_id')

known_cids = set(all_counts.component_id[all_counts.train_count > 0].values)
print len(all_counts), len(known_cids)

has_unk = []
for cids in X_test.components:
    has_unk.append(any([cid not in known_cids for cid in cids]))
print len(X_test), len(has_unk)

X_test['has_unk'] = has_unk
print X_test.has_unk.value_counts()
print X_test.has_unk.value_counts(normalize=True)
tmp_df = X_test[['tube_assembly_id', 'has_unk']].drop_duplicates()
print len(X_test), len(tmp_df)
print tmp_df.has_unk.value_counts()
print tmp_df.has_unk.value_counts(normalize=True)

X_test_orig = X_test
y_test_orig = y_test
print X_train.shape, y_train.shape
print X_test_orig.shape, y_test_orig.shape
X_test = X_test_orig[X_test_orig.has_unk == True].reset_index(drop=True)
X_test.pop('has_unk')
y_test = y_test_orig[X_test_orig.has_unk == True].reset_index(drop=True)
print X_test.shape, y_test.shape

2047 1141
2943 2943
False    2791
True      152
dtype: int64
False    0.948352
True     0.051648
dtype: float64
2943 895
False    828
True      67
dtype: int64
False    0.92514
True     0.07486
dtype: float64
(27270, 50) (27270,)
(2943, 51) (2943,)
(152, 50) (152,)


In [29]:
# Fit the featurizer on the training split only, then apply the fitted
# transform to both the training rows and the (filtered) test rows.
featurizer = AllCategoricalsFeaturizer()
%time featurizer.fit(X_train)
%time X_train_feats = featurizer.transform(X_train)
%time X_test_feats = featurizer.transform(X_test)
# List every generated feature column together with its dtype.
X_train_feats.info(verbose=True)

CPU times: user 1.46 s, sys: 0 ns, total: 1.46 s
Wall time: 1.51 s
CPU times: user 1.35 s, sys: 280 ms, total: 1.63 s
Wall time: 1.64 s
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 40 ms
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27270 entries, 0 to 27269
Data columns (total 596 columns):
annual_usage                                           int64
min_order_quantity                                     int64
bracket_pricing                                        bool
quantity                                               int64
diameter                                               float64
wall_thickness                                         float64
length                                                 float64
num_bends                                              int64
bend_radius                                            float64
end_a_1x                                               bool
end_a_2x                                               bool
end_x_1x      

In [30]:
X_train_np = X_train_feats.astype(np.float).values
y_train_np = y_train.values
X_test_np = X_test_feats.astype(np.float).values
y_test_np = y_test.values
print X_train_np.shape, X_test_np.shape, y_train_np.shape, y_test_np.shape

(27270, 596) (152, 596) (27270,) (152,)


In [31]:
import xgboost as xgb

# Booster hyperparameters; they are handed to xgb.train() in the training cell.
# The exact meaning of each key is defined by the xgboost parameter reference.
params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'min_child_weight': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'silent': 1,
    'max_depth': 8,
}

# The training matrix carries its labels; the test matrix is built without
# labels, and its predictions are scored against y_test_np separately.
xgtrain = xgb.DMatrix(X_train_np, label=y_train_np)
xgtest = xgb.DMatrix(X_test_np)

In [32]:
# Train for a fixed number of rounds; no evaluation set is passed to xgb.train.
num_rounds = 1000
%time model = xgb.train(params.items(), xgtrain, num_rounds)
%time y_train_pred = model.predict(xgtrain)
# Plain RMSE on the targets as given. The *_rmsle names presumably reflect
# that the targets are already log costs (a later cell stores y_test as
# 'true_log_cost') -- TODO confirm against soln.dataset.
train_rmsle = np.sqrt(mean_squared_error(y_train_np, y_train_pred))
%time y_test_pred = model.predict(xgtest)
test_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
print train_rmsle, test_rmsle

CPU times: user 2min 27s, sys: 336 ms, total: 2min 27s
Wall time: 1min 27s
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 3.96 ms
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 20.8 ms
0.127241842406 0.431898659154


In [62]:
in_test_not_train = all_counts[(all_counts.train_count == 0) & (all_counts.test_count > 0)]
print len(in_test_not_train)
print in_test_not_train.component_group_id.value_counts()

78
other       50
straight    10
boss         8
threaded     6
elbow        2
adaptor      2
dtype: int64


In [64]:
# Attach the true and predicted targets to a copy of the test rows and show
# the ten rows with the largest squared error.
df = X_test.copy()
df['true_log_cost'] = y_test
df['pred_log_cost'] = y_test_pred
residual = df['true_log_cost'] - df['pred_log_cost']
df['err2'] = residual ** 2
df = df.sort('err2', ascending=False)
df.head(10)

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall_thickness,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,num_other,specs,components,quote_age,adj_quantity,adj_bracketing,bracketing_pattern,ends,end_a_forming,end_1x_count,end_x_forming,end_2x_count,end_forming_count,component_groups,component_types,unique_feature_count,orientation_count,groove_count,total_component_weight,component_end_forms,component_connection_types,component_max_length,component_max_overall_length,component_max_bolt_pattern_wide,component_max_bolt_pattern_long,component_max_thickness,component_min_thread_pitch,component_min_thread_size,component_part_names,true_log_cost,pred_log_cost,err2
60,TA-05245,S-0090,2005-02-23,0,1,True,1,SP-0035,19.05,1.24,25,1,44.45,False,False,False,False,EF-003,EF-017,0,0,0,[],"[C-0443, C-1486]",38404,1,False,(),"(EF-003, EF-017)",False,0,True,0,1,"[adaptor, threaded]","[CP-028, CP-014]",0,0,0,0.0,"[A-007, 9999, A-001, A-004]","[B-007, 9999, B-006]",0,33.0,0.0,0.0,0,14,0.875,(),0.909402,2.25546,1.811871
105,TA-15518,S-0066,2014-04-03,600,20,False,20,SP-0029,25.4,1.65,31,2,63.5,False,False,False,False,EF-003,EF-003,0,0,0,[],"[C-1625, C-1632, C-0302, C-1008]",41730,20,False,(),"(EF-003, EF-003)",False,0,False,0,0,"[nut, sleeve, other, other]","[CP-025, CP-024, other, other]",1,0,0,0.506,[],[B-002],9999,0.0,0.0,0.0,0,12,9999.0,"(PLUG AS-LD STOR, ELBOW-CAST)",3.618106,2.286615,1.772868
119,TA-17657,S-0042,2013-10-29,1,1,False,1,SP-0035,9.52,1.65,61,4,19.05,False,False,False,False,EF-017,EF-017,0,0,0,[],"[C-1468, C-1473]",41574,1,False,(),"(EF-017, EF-017)",True,0,True,0,2,"[threaded, other]","[CP-014, other]",0,0,0,0.128,"[A-001, A-004]",[B-006],0,18.54,0.0,0.0,0,18,0.625,"(FITTING-A/C,)",1.246692,2.569293,1.749274
120,TA-18071,S-0066,2013-04-11,273,0,True,5,SP-0029,88.9,1.65,27,1,88.9,True,True,False,True,EF-023,EF-009,0,0,0,[],[C-1289],41373,5,True,"(5, 20)","(EF-023, EF-009)",False,1,True,2,1,[other],[other],0,0,0,0.699,[],[],0,0.0,0.0,0.0,0,9999,9999.0,"(FLANGE,)",4.53996,3.364467,1.381784
42,TA-03160,S-0066,2009-01-02,64300,1,False,1,SP-0029,25.4,1.65,19,0,0.0,True,True,True,True,EF-023,EF-023,0,0,0,[],"[C-0330, C-0330]",39813,1,False,(),"(EF-023, EF-023)",False,2,False,2,0,"[other, other]","[other, other]",0,0,0,2.0,[],[],0,0.0,0.0,0.0,0,9999,9999.0,"(ELBOW, ELBOW)",1.468163,2.534086,1.136191
112,TA-16772,S-0066,2013-08-08,304,25,False,25,SP-0029,152.4,1.65,12,0,0.0,True,True,True,True,EF-017,EF-017,3,0,0,[],"[C-1159, C-1161, C-1308]",41492,25,False,(),"(EF-017, EF-017)",True,2,True,2,2,"[other, other, boss]","[other, other, CP-020]",0,1,0,1.062,[],[9999],0,0.0,0.0,86.0,0,9999,9999.0,"(BOSS, BOSS)",3.935294,2.983759,0.905419
72,TA-07246,S-0066,2011-06-29,1,0,True,1,SP-0028,76.2,1.65,53,0,0.0,True,True,True,True,EF-017,EF-017,0,0,0,[],[C-0696],40721,1,True,"(1, 2, 3, 5)","(EF-017, EF-017)",True,2,True,2,2,[other],[other],0,0,0,0.176,[],[],0,0.0,0.0,0.0,0,9999,9999.0,"(COUPLING (1 BSP),)",5.449324,4.620416,0.687089
63,TA-05729,S-0081,2014-05-01,1,1,False,1,SP-0029,38.1,1.65,15,1,76.2,True,True,False,True,EF-023,EF-017,0,0,0,"[SP-0004, SP-0013, SP-0024, SP-0026]","[C-0626, C-0629]",41758,1,False,(),"(EF-023, EF-017)",False,1,True,2,1,"[other, other]","[other, other]",0,0,0,0.36,[],[],0,0.0,0.0,0.0,0,9999,9999.0,"(FLANGE, TUBE)",3.810203,3.04877,0.579781
115,TA-17269,S-0066,2013-05-15,5,0,True,1,SP-0048,31.75,3.96,124,2,63.5,False,False,False,False,EF-003,EF-003,0,0,0,[],"[C-0334, C-0679]",41407,1,True,"(1, 2, 5)","(EF-003, EF-003)",False,0,False,0,0,"[straight, straight]","[CP-006, CP-003]",0,1,1,0.578,[],[],0,0.0,26.19,52.37,32,9999,9999.0,(),5.855604,5.120413,0.540506
91,TA-11915,S-0111,2014-07-25,1091,8,False,1,SP-0029,38.1,4.78,161,2,76.2,False,False,False,False,NONE,NONE,0,0,0,[],"[C-1093, C-1093]",41843,8,False,(),"(NONE, NONE)",False,0,False,0,0,"[other, other]","[other, other]",0,0,0,1.226,[],[],0,0.0,0.0,0.0,0,9999,9999.0,"(BRACKET, BRACKET)",2.433555,3.149969,0.513249


In [55]:
# Approach 1: Replace unknown 'straight' components with their nearest known neighbor.

straight = pd.read_csv('straight_vecs.csv')
straight.set_index('component_id', drop=True, inplace=True)
straight_np = straight.astype(np.float).values
print straight.shape
print straight_np.shape

from scipy.cluster.vq import whiten
straight_np_wh = whiten(straight_np)
cid_to_row = {}
for i, cid in enumerate(straight.index):
    cid_to_row[cid] = straight_np_wh[i, :]

unknown_cids = set(in_test_not_train.component_id[in_test_not_train.component_group_id == 'straight'].values)
print unknown_cids

from scipy.spatial.distance import euclidean

cid_to_subst = {}
for cid in unknown_cids:
    cid_row = cid_to_row[cid]
    best_target_cid = None
    best_dist = np.inf
    for target_cid, target_cid_row in cid_to_row.iteritems():
        if target_cid in unknown_cids:
            continue
        dist = euclidean(cid_row, target_cid_row)
        if dist < best_dist:
            best_target_cid = target_cid
            best_dist = dist
    cid_to_subst[cid] = best_target_cid
    print "unknown cid {} mapped to known cid {} with dist {}".format(cid, best_target_cid, best_dist)

cid_to_subst

(361, 26)
(361, 26)
set(['C-0334', 'C-1494', 'C-1999', 'C-0141', 'C-1549', 'C-0621', 'C-1897', 'C-0466', 'C-1785', 'C-0362'])
unknown cid C-0334 mapped to known cid C-0741 with dist 0.424655790434
unknown cid C-1494 mapped to known cid C-1495 with dist 0.0148741682958
unknown cid C-1999 mapped to known cid C-0457 with dist 1.30878282013
unknown cid C-0141 mapped to known cid C-1996 with dist 0.459905470358
unknown cid C-1549 mapped to known cid C-1740 with dist 0.0863014648659
unknown cid C-0621 mapped to known cid C-1900 with dist 1.48704650335
unknown cid C-1897 mapped to known cid C-1344 with dist 0.887588960897
unknown cid C-0466 mapped to known cid C-1433 with dist 0.151038244132
unknown cid C-1785 mapped to known cid C-1329 with dist 0.23612852287
unknown cid C-0362 mapped to known cid C-0038 with dist 0.41829806389


{'C-0141': 'C-1996',
 'C-0334': 'C-0741',
 'C-0362': 'C-0038',
 'C-0466': 'C-1433',
 'C-0621': 'C-1900',
 'C-1494': 'C-1495',
 'C-1549': 'C-1740',
 'C-1785': 'C-1329',
 'C-1897': 'C-1344',
 'C-1999': 'C-0457'}

In [46]:
# Show the raw (unwhitened) feature rows of two straight components side by
# side, to eyeball how similar they are.
cids = ('C-0334', 'C-0741')
is_selected = straight.index.isin(cids)
straight[is_selected]

Unnamed: 0_level_0,bolt_pattern_long,bolt_pattern_wide,head_diameter,overall_length,thickness,groove,unique_feature,orientation,weight,MJ-001,MJ-002,MJ-003,MJ-007,MJ-other,CP-001,CP-002,CP-003,CP-004,CP-005,CP-006,CP-007,bolt_pattern_long_missing,bolt_pattern_wide_missing,head_diameter_missing,overall_length_missing,weight_missing
component_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
C-0334,71.77567,40.841225,47.63,27.906098,9.53,True,False,False,0.104,0,0,0,0,1,0,0,0,0,0,1,0,True,True,False,True,False
C-0741,71.77567,40.841225,50.8,27.906098,8.24,True,False,False,0.14,0,0,0,0,1,0,0,0,0,0,1,0,True,True,False,True,False


In [60]:
X_test_mangled = X_test.copy()
orig_components = X_test_mangled.pop('components')
subst_components = []
for cids in orig_components.values:
    subst_cids = []
    for cid in cids:
        if cid in unknown_cids:
            target_cid = cid_to_subst[cid]
        else:
            target_cid = cid
        subst_cids.append(target_cid)
    subst_components.append(subst_cids)
X_test_mangled['components'] = subst_components

X_test_mangled['orig_components'] = orig_components
print X_test_mangled[['tube_assembly_id', 'components', 'orig_components']][:10]
X_test_mangled.pop('orig_components')
None

# Note that we only make substitutions for 'straight' at the moment,
# so some component lists will remain unchanged...

  tube_assembly_id        components   orig_components
0         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
1         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
2         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
3         TA-00181  [C-1468, C-1480]  [C-1468, C-1480]
4         TA-00566  [C-1329, C-1329]  [C-1785, C-1785]
5         TA-00968          [C-1764]          [C-1764]
6         TA-01243          [C-1996]          [C-0141]
7         TA-01243          [C-1996]          [C-0141]
8         TA-01243          [C-1996]          [C-0141]
9         TA-01243          [C-1996]          [C-0141]


In [61]:
print X_test.shape, X_test_mangled.shape
X_test_mangled_feats = featurizer.transform(X_test_mangled)
X_test_mangled_np = X_test_mangled_feats.astype(np.float).values
xgtest_mangled = xgb.DMatrix(X_test_mangled_np)
y_test_mangled_pred = model.predict(xgtest_mangled)
test_mangled_rmsle = np.sqrt(mean_squared_error(y_test_np, y_test_mangled_pred))
print test_mangled_rmsle

(152, 50) (152, 50)
0.432077643867
