In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('data/train_merged.csv', parse_dates=['date'])
test = pd.read_csv('data/test_merged.csv', parse_dates=['date'])

In [3]:
cat_features = [
    'family', 'store_nbr', 'city', 'state', 'type', 'cluster',
    'is_holiday', 'payday',
]
for name in cat_features:
    le = LabelEncoder()
    train[name] = le.fit_transform(train[name])
    test[name] = le.fit_transform(test[name])

In [4]:
dates = sorted(train.date.unique())
border_date = dates[-30]
train_data = train[train.date <= border_date]
valid_data = train[train.date > border_date]

print(train_data.shape)
print(valid_data.shape)

(2949210, 18)
(51678, 18)


In [5]:
train_dates = train_data.date.unique()
valid_dates = valid_data.date.unique()
print(f'Train from {train_dates[0]} to {train_dates[-1]}')
print(f'Valid from {valid_dates[0]} to {valid_dates[-1]}')

Train from 2013-01-01T00:00:00.000000000 to 2017-07-17T00:00:00.000000000
Valid from 2017-07-18T00:00:00.000000000 to 2017-08-15T00:00:00.000000000


In [11]:
y_train = np.log(train_data[['sales']].values + 1)
y_valid = np.log(valid_data[['sales']].values + 1)

X_train = train_data.drop(['id', 'sales', 'year', 'date', 'transactions'], axis=1)
X_valid = valid_data.drop(['id', 'sales', 'year', 'date', 'transactions'], axis=1)

X_test = test.drop(['id', 'date', 'year', 'transactions'], axis=1)

In [7]:
X_test.head()

Unnamed: 0,store_nbr,family,onpromotion,city,state,type,cluster,transactions,dcoilwtico,is_holiday,month,day,day_of_week,payday
0,0,0,0,18,12,3,12,0.0,46.8,0,8,16,2,0
1,0,1,0,18,12,3,12,0.0,46.8,0,8,16,2,0
2,0,2,2,18,12,3,12,0.0,46.8,0,8,16,2,0
3,0,3,20,18,12,3,12,0.0,46.8,0,8,16,2,0
4,0,4,0,18,12,3,12,0.0,46.8,0,8,16,2,0


In [8]:
xgb_params = {
    'verbosity': 2,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 1,
    'learning_rate': 0.01,
    'subsample': 0.99,
    'colsample_bytree': 0.80,
    'reg_alpha': 10.0,
    'reg_lambda': 0.18,
    'min_child_weight': 47,
}

In [9]:
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_valid = xgb.DMatrix(X_valid, label=y_valid)
eval_list = [(xgb_train, 'train'), (xgb_valid, 'eval')]
eval_result = dict()

model = xgb.train(
    params=xgb_params,
    dtrain=xgb_train,
    evals=eval_list,
    evals_result=eval_result,
    num_boost_round=1000,
    early_stopping_rounds=100,
)

[22:03:07] INFO: /tmp/build/80754af9/xgboost-split_1619724447847/work/src/tree/updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[0]	train-rmse:3.59079	eval-rmse:3.97489
[22:03:08] INFO: /tmp/build/80754af9/xgboost-split_1619724447847/work/src/tree/updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[1]	train-rmse:3.56120	eval-rmse:3.94288
[22:03:09] INFO: /tmp/build/80754af9/xgboost-split_1619724447847/work/src/tree/updater_prune.cc:101: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[2]	train-rmse:3.53835	eval-rmse:3.90877
[22:03:10] INFO: /tmp/build/80754af9/xgboost-split_1619724447847/work/src/tree/updater_prune.cc:101: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[3]	train-rmse:3.51538	eval-rmse:3.87487
[22:03:11] INFO: /tmp/build/80754af9/xgboost-split_1619724447847/work/src/tree/updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[4]	train-rmse:3

In [12]:
xgb_test = xgb.DMatrix(X_test)
xgb_pred = np.exp(pd.Series(model.predict(xgb_test)).map(lambda x: max(x, 0)))

In [13]:
xgb_valid_pred = model.predict(xgb_valid)
mean_squared_error(y_valid, xgb_valid_pred, squared=False)

0.7188205010618849

In [14]:
sorted(model.get_fscore().items(), key=lambda x: x[1], reverse=True)

[('family', 18755),
 ('store_nbr', 11132),
 ('dcoilwtico', 7299),
 ('onpromotion', 4171),
 ('type', 3800),
 ('cluster', 3772),
 ('month', 3351),
 ('city', 2826),
 ('day_of_week', 2655),
 ('state', 2528),
 ('day', 1099),
 ('payday', 6)]

In [15]:
sub = test[['id']].copy()
sub['sales'] = xgb_pred
sub.head()

Unnamed: 0,id,sales
0,3000888,4.623218
1,3000889,1.16261
2,3000890,5.656198
3,3000891,2559.597321
4,3000892,1.210362


In [None]:
sub.to_csv('data/submission.csv', index=False)