In [1]:
import pandas as pd
from xgboost import XGBRegressor

In [2]:
# Load the pre-built table from the local pickle.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
data = pd.read_pickle('data.pkl')

# Keep only rows whose date_block_num is strictly greater than 11.
# NOTE(review): presumably the earlier blocks are dropped because the
# 12-period lag columns cannot be populated for them -- confirm.
data = data[data['date_block_num'].gt(11)]

In [3]:
# Columns kept for modelling. The order of this list becomes the column order
# of the frame (and therefore of the feature matrix built from it).
selected_columns = [
    # block index, shop/item identifiers and the monthly count
    'date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
    # location and category descriptors
    'zip_code', 'item_category_id', 'category_type', 'category_device',
    # item_cnt_month lag columns
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_lag_6', 'item_cnt_month_lag_12',
    # date_avg lag column
    'date_avg_item_cnt_lag_1',
    # date_item_avg lag columns
    'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    # date_shop_avg lag columns
    'date_shop_avg_item_cnt_lag_1', 'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3', 'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    # category / shop-category / city / item-city lag columns
    'date_cat_avg_item_cnt_lag_1', 'date_shop_cat_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1', 'date_item_city_avg_item_cnt_lag_1',
    # price trend column
    'delta_price_trend_with_lag_1',
    # calendar columns
    'month', 'days_in_month',
    # first/last sold columns
    'item_last_sold_in_given_shop', 'item_last_sold',
    'item_first_sold_in_given_shop', 'item_first_sold',
]

# Raises KeyError if any listed column is missing from the loaded table.
data = data[selected_columns]

In [4]:
# Split by date_block_num: blocks below 33 are used for training, block 33
# for validation, and block 34 is the set to predict.
target_column = 'item_cnt_month'
in_train = data['date_block_num'].lt(33)
in_valid = data['date_block_num'].eq(33)
in_test = data['date_block_num'].eq(34)

# Features are every remaining column except the target; date_block_num
# itself stays in the feature matrices.
X_train = data[in_train].drop(columns=target_column)
Y_train = data[in_train][target_column]

X_valid = data[in_valid].drop(columns=target_column)
Y_valid = data[in_valid][target_column]

# No Y_test is built here: only the features of block 34 are extracted.
X_test = data[in_test].drop(columns=target_column)

In [5]:
# Hyperparameters for the gradient-boosted regressor. n_estimators is only an
# upper bound here because early stopping is enabled in fit() below.
model_params = {
    'max_depth': 8,
    'n_estimators': 1000,
    'min_child_weight': 300,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.3,   # learning rate (echoed as learning_rate in the fitted repr)
    'seed': 42,   # fixed for repeatable column/row subsampling
}
model = XGBRegressor(**model_params)

# Both splits are evaluated every round; in the log, validation_0 is the
# training split and validation_1 is the block-33 validation split.
# Per the XGBoost docs, early stopping watches the LAST eval_set entry, so
# training halts once validation RMSE has not improved for 10 rounds.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
# deprecated in newer XGBoost releases in favour of constructor arguments --
# check the installed version before upgrading.
fit_options = {
    'eval_metric': "rmse",
    'eval_set': [(X_train, Y_train), (X_valid, Y_valid)],
    'verbose': True,
    'early_stopping_rounds': 10,
}

# Kept as the final expression so the fitted estimator is displayed.
model.fit(X_train, Y_train, **fit_options)

[0]	validation_0-rmse:1.05975	validation_1-rmse:1.03916
[1]	validation_0-rmse:0.96160	validation_1-rmse:0.95909
[2]	validation_0-rmse:0.90291	validation_1-rmse:0.90921
[3]	validation_0-rmse:0.86797	validation_1-rmse:0.90514
[4]	validation_0-rmse:0.84860	validation_1-rmse:0.89572
[5]	validation_0-rmse:0.83588	validation_1-rmse:0.88417
[6]	validation_0-rmse:0.82393	validation_1-rmse:0.89419
[7]	validation_0-rmse:0.81340	validation_1-rmse:0.88053
[8]	validation_0-rmse:0.80711	validation_1-rmse:0.88699
[9]	validation_0-rmse:0.80222	validation_1-rmse:0.88297
[10]	validation_0-rmse:0.79597	validation_1-rmse:0.89285
[11]	validation_0-rmse:0.79053	validation_1-rmse:0.89726
[12]	validation_0-rmse:0.78842	validation_1-rmse:0.89816
[13]	validation_0-rmse:0.78578	validation_1-rmse:0.89264
[14]	validation_0-rmse:0.78447	validation_1-rmse:0.89290
[15]	validation_0-rmse:0.78174	validation_1-rmse:0.88821
[16]	validation_0-rmse:0.78025	validation_1-rmse:0.88916
[17]	validation_0-rmse:0.77951	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=8,
             min_child_weight=300, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=24, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=0.8, tree_method='approx', validate_parameters=1,
             verbosity=None)

In [6]:
# Read the test file; only its row index is used below, for the ID column.
test = pd.read_csv('data/technical/test.csv')

# Predictions are clipped into the [0, 20] range.
# Y_pred (validation predictions) is not used again in this cell.
Y_pred = model.predict(X_valid).clip(min=0, max=20)
Y_test = model.predict(X_test).clip(min=0, max=20)

# NOTE(review): ID is the positional index of test.csv, so this assumes the
# rows of X_test are in the same order (and count) as the file -- confirm.
submission_columns = {
    "ID": test.index,
    "item_cnt_month": Y_test,
}
submission = pd.DataFrame(submission_columns)

# index=False: write only the two columns above, without the frame index.
submission.to_csv('xgb_submission.csv', index=False)