In [1]:
cd ..

/home/agrigorev/notebooks/home-depot/homedepot


In [2]:
import numpy as np
import pandas as pd

In [35]:
import xgboost as xgb

In [4]:
root_path = '/home/agrigorev/notebooks/home-depot/input'

df_train = pd.read_csv(root_path + '/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(root_path + '/test.csv', encoding="ISO-8859-1")

## Feature Preparation

In [None]:
full = 'w2v_features_cleaned_full.csv'
exl = 'w2v_features_cleaned_query_excluded.csv'

In [6]:
df_w2v_sim_full = pd.read_csv(full)
df_w2v_sim_exl = pd.read_csv(exl)

In [7]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.drop(['product_title', 'product_uid', 'search_term'], axis=1, inplace=1)

df_all = df_all.merge(df_w2v_sim_full, on='id')
df_all = df_all.merge(df_w2v_sim_exl, on='id')

In [25]:
w2v_all = list(df_w2v_sim_full.columns)
w2v_all.remove('id')

feature_groups = {
    'w2v_all': w2v_all,
    'w2v_bullets': [c for c in w2v_all if 'bullet' in c],
    'w2v_exl': [c + '_exl' for c in w2v_all],
    'w2v_exl_bullets': [c + '_exl' for c in w2v_bullets],
    'w2v_diff': [c + '_diff' for c in w2v_all],
    'w2v_diff_bullets': [c + '_diff' for c in bullets]
}

In [10]:
for c in w2v_all:
    df_all[c + '_diff'] = df_all[c] - df_all[c + '_exl']

In [26]:
for name, columns in feature_groups.items():
    df_all[name + '_min'] = df_all[columns].min(axis=1)
    df_all[name + '_min'] = df_all[columns].max(axis=1)
    df_all[name + '_mean'] = df_all[columns].mean(axis=1)
    df_all[name + '_std'] = df_all[columns].std(axis=1)
    df_all[name + '_mean+std'] = df_all[name + '_mean'] + df_all[name + '_std']
    df_all[name + '_mean-std'] = df_all[name + '_mean'] - df_all[name + '_std']

## Model

In [37]:
train_len = int(len(df_train) * 0.885)
counts = df_train[:train_len].groupby(['product_uid']).count()[['id']]

counts = counts[counts['id'] > 1]
counts = counts.add_suffix('_Count').reset_index()
valid_product_uids = set(counts['product_uid'].values)

inds = []

allowed_uids = df_train.loc[df_train['product_uid'].isin(valid_product_uids)]
# For now, always grab first row of valid product uid.
lastUid = 0

for idx, mrow in allowed_uids.iterrows():
    if lastUid == mrow['product_uid']:
        continue

    lastUid = mrow['product_uid']
    inds.append(idx)

validation_idx = np.array(inds + list(df_train[train_len:].index.values))
train_idx = np.array(df_train.loc[~df_train.index.isin(validation_idx)].index.values)

In [51]:
features = list(df_all.columns)
features.remove('id')
features.remove('relevance')

In [52]:
X_all = df_all[features].values

X = X_all[:len(df_train)]
y = df_train.relevance.values

X_test = X_all[len(df_train):]

In [53]:
dtrain = xgb.DMatrix(X[train_idx], label=y[train_idx], feature_names=features)
dvalid = xgb.DMatrix(X[validation_idx], label=y[validation_idx], feature_names=features)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [69]:
results = []

In [101]:
n_estimators = 10000
early_stopping_rounds = 50

xgb_pars = {
    'reg_alpha': 10, 
    'colsample_bytree': 0.8, 
    'silent': 1, 
    'eval_metric': 'rmse', 
    'learning_rate': 0.03, 
    'max_delta_step': 5, 
    'nthread': 8, 
    'min_child_weight': 1, 
    'subsample': 0.9, 
    'reg_lambda': 0.05, 
    'seed': 42, 
    'objective': 'reg:linear', 
    'max_depth': 10, 
    'gamma': 0.5
}

In [102]:
model = xgb.train(xgb_pars, dtrain, 10000, evals=watchlist, early_stopping_rounds=20, verbose_eval=10)
results.append((model.best_score, dict(xgb_pars)))
model.best_score, model.best_iteration

(0.496242, 132)

In [103]:
for score, par in reversed(results):
    print score, par

0.496242 {'reg_alpha': 10, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'rmse', 'learning_rate': 0.03, 'max_delta_step': 5, 'nthread': 8, 'min_child_weight': 1, 'subsample': 0.9, 'reg_lambda': 0.05, 'seed': 42, 'objective': 'reg:linear', 'max_depth': 10, 'gamma': 0.5}
0.496545 {'reg_alpha': 10, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'rmse', 'learning_rate': 0.03, 'max_delta_step': 5, 'nthread': 8, 'min_child_weight': 100, 'subsample': 0.9, 'reg_lambda': 0.05, 'seed': 42, 'objective': 'reg:linear', 'max_depth': 10, 'gamma': 0.5}
0.499169 {'reg_alpha': 10, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'rmse', 'learning_rate': 0.3, 'max_delta_step': 1, 'nthread': 8, 'min_child_weight': 100, 'subsample': 0.75, 'reg_lambda': 0.05, 'seed': 42, 'objective': 'reg:linear', 'max_depth': 10, 'gamma': 0.1}
0.499736 {'reg_alpha': 10, 'colsample_bytree': 0.8, 'silent': 1, 'eval_metric': 'rmse', 'learning_rate': 0.3, 'max_delta_step': 1, 'nthread': 8, 'min_child_weight': 

In [62]:
sorted(model.get_fscore().items(), key=lambda x: -x[1])

[('w2v_query_product_title_diff', 549),
 ('w2v_query_product_title_exl', 511),
 ('w2v_query_product_title', 504),
 ('w2v_query_all_text_diff', 474),
 ('w2v_query_product_description_diff', 464),
 ('w2v_query_brand', 461),
 ('w2v_query_product_description', 389),
 ('w2v_query_brand_exl', 383),
 ('w2v_exl_mean-std', 372),
 ('w2v_query_product_description_exl', 365),
 ('w2v_query_all_text_exl', 357),
 ('w2v_diff_mean-std', 357),
 ('w2v_query_all_text', 352),
 ('w2v_query_bullet02_exl', 349),
 ('w2v_query_material', 347),
 ('w2v_all_min', 345),
 ('w2v_query_bullet03_exl', 340),
 ('w2v_query_bullet01', 338),
 ('w2v_query_bullet01_exl', 333),
 ('w2v_query_bullet04_exl', 319),
 ('w2v_all_mean-std', 318),
 ('w2v_exl_min', 315),
 ('w2v_query_bullet02', 310),
 ('w2v_query_bullet03', 306),
 ('w2v_query_bullet04', 291),
 ('w2v_query_color', 291),
 ('w2v_query_bullet01_diff', 287),
 ('w2v_exl_bullets_mean-std', 266),
 ('w2v_query_bullet04_diff', 262),
 ('w2v_query_bullet05', 259),
 ('w2v_query_bull