In [1]:
import xgboost as xgb
import json
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('../data/parsed_train.tsv', sep='\t')

In [4]:
train[:5]

Unnamed: 0,record_number,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,category_id,item_price,quantity,declared_handling_days,bt,package_size,weight,tz_dis,dis,cross_city,cross_state,acc_hour,pay_hour,acc_date,target
0,1,0,0.0,3,5,13,27.95,1,3.0,0,2,5.0,2.0,3.001315,1.0,1.0,15,3,20190326,3
1,3,0,4.5,3,5,1,19.9,1,1.0,0,0,9.0,-2.0,1.114014,1.0,1.0,16,0,20190107,3
2,6,0,0.0,3,5,3,10.39,1,1.0,0,0,1.0,-2.0,1.608366,1.0,1.0,19,14,20190419,3
3,7,0,0.0,3,5,11,5.7,1,1.0,0,0,0.0,1.0,2.784471,1.0,1.0,17,9,20190208,3
4,9,3,0.0,2,8,18,5.55,1,5.0,0,1,0.0,-1.0,1.568966,1.0,1.0,9,4,20191012,3


In [3]:
c1 = pd.get_dummies(train.shipment_method_id, prefix='sm')
c2 = pd.get_dummies(train.category_id, prefix='ci')
c3 = pd.get_dummies(train.package_size, prefix='ps')
c4 = pd.get_dummies(train.cross_city, prefix='cc')
c5 = pd.get_dummies(train.cross_state, prefix='cs')

In [7]:
x = pd.concat([train.drop(['record_number', 'target', 'shipment_method_id', 'category_id', 'package_size', 'cross_city', 'cross_state'], axis=1), 
               c1, c2, c3, c4, c5], axis=1)
y = train.target

In [8]:
one_unit = int(x.shape[0] / 10)

654382

In [12]:
x_train, y_train = x[:9 * one_unit], y[:9 * one_unit]

In [13]:
x_test, y_test = x[9 * one_unit:], y[9 * one_unit:]

In [14]:
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    # larger -> under fit
    'gamma': 0,
    'max_depth': 10,
    # L2 Reg
    'lambda': 1,
    'subsample': 0.7,
    'colsample_bytree': 1,
    # child node number
    'min_child_weight': 3,
    # shrinkage
    'eta': 0.3,
    'seed': 1000,
    'nthread': 8,
}

In [15]:
dtrain = xgb.DMatrix(x_train, y_train, enable_categorical=True)
dtest = xgb.DMatrix(x_test, y_test, enable_categorical=True)

In [16]:
num_rounds = 100

In [17]:
def custom_asymmetric_eval(preds, train_data):
    labels = train_data.get_label()
    preds = np.round(preds)
    residual = (labels - preds).astype("int")
    loss = np.where(residual < 0, residual * -0.6, residual * 0.4) 
    return "ebay_loss", np.mean(loss)

In [18]:
results = {}
bst = xgb.train(params, dtrain, 
                num_boost_round=num_rounds, 
                evals=[(dtest, 'test1')], 
                feval=custom_asymmetric_eval, 
                evals_result=results,
                early_stopping_rounds=1)

[0]	test1-rmse:2.41163	test1-ebay_loss:0.79785
[1]	test1-rmse:2.01859	test1-ebay_loss:0.46148
[2]	test1-rmse:1.79371	test1-ebay_loss:0.42696
[3]	test1-rmse:1.67232	test1-ebay_loss:0.39051
[4]	test1-rmse:1.60878	test1-ebay_loss:0.39160


In [19]:
results['test1']['ebay_loss'][-1]

0.391605

In [20]:
print(bst.feature_types)

['float', 'int', 'int', 'float', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int']


In [21]:
bst.save_model('../para/xgb_category.json')

In [None]:
quiz_set = pd.read_csv('../data/parsed_quiz.tsv', sep='\t')

In [47]:
c1q = pd.get_dummies(quiz_set.shipment_method_id, prefix='sm')
c2q = pd.get_dummies(quiz_set.category_id, prefix='ci')
c3q = pd.get_dummies(quiz_set.package_size, prefix='ps')
c4q = pd.get_dummies(quiz_set.cross_city, prefix='cc')
c5q = pd.get_dummies(quiz_set.cross_state, prefix='cs')

In [None]:
x2 = pd.concat([quiz_set.drop(['record_number', 'shipment_method_id', 'category_id', 'package_size', 'cross_city', 'cross_state'], axis=1), 
               c1q, c2q, c3q, c4q, c5q], axis=1)

In [49]:
dtest2 = xgb.DMatrix(x2)

In [50]:
ypred = bst.predict(dtest2, iteration_range=(0, bst.best_iteration))

In [51]:
ypred.shape

(2500000,)

In [None]:
res = quiz_set.filter(items=['record_number'], axis=1)
res.shape

In [56]:
quiz_set['target'] = pd.Series(np.round(ypred))

In [57]:
quiz_set

Unnamed: 0,record_number,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,category_id,item_price,quantity,bt,package_size,weight,tz_dis,dis,cross_city,cross_state,acc_hour,target
0,15000001,0,0.00,3,5,2,28.00,1,0,0,4,2.0,3.0,1.0,1.0,5,2.0
1,15000002,0,0.00,3,5,9,9.95,1,0,0,13,-1.0,1.0,1.0,1.0,2,2.0
2,15000003,0,0.00,3,5,2,16.25,1,0,0,6,0.0,1.0,1.0,1.0,0,2.0
3,15000004,0,0.00,3,5,16,13.65,1,0,0,4,-1.0,1.0,1.0,1.0,19,2.0
4,15000005,0,0.00,3,5,1,13.99,1,0,3,2,1.0,2.0,1.0,1.0,14,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499995,17499996,0,0.00,3,5,10,7.71,1,0,0,8,-4.0,3.0,1.0,1.0,14,2.0
2499996,17499997,0,4.39,3,5,4,15.99,1,0,0,8,-1.0,0.0,1.0,1.0,19,2.0
2499997,17499998,5,0.00,2,5,9,34.20,2,0,0,42,-3.0,2.0,1.0,1.0,20,2.0
2499998,17499999,0,0.00,3,5,22,20.00,1,1,0,5,-2.0,1.0,1.0,1.0,11,2.0
