In [1]:
import pandas as pd

In [2]:
# Load the two pickled tables in one pass: `df` is the frame that is filtered
# and modelled below, `df_og` is the frame whose columns later overwrite some
# of `df`'s columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
df, df_og = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data.pkl', 'data_og.pkl')
)

In [3]:
# Columns kept in `df`, grouped for readability. The groups are concatenated
# in the order listed, which is also the resulting column order of `df`.
key_and_target_cols = [
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_month',
]
attribute_cols = [
    'zip_code',
    'item_category_id',
    'category_type',
    'category_device',
]
item_cnt_lag_cols = [
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
]
avg_item_cnt_lag_cols = [
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
]
price_and_calendar_cols = [
    'delta_price_trend_with_lag_1',
    'month',
    'days_in_month',
]
sale_history_cols = [
    'item_last_sold_in_given_shop',
    'item_last_sold',
    'item_first_sold_in_given_shop',
    'item_first_sold',
]

# Rebind `df` to just these columns; everything else in the loaded frame is dropped.
df = df[
    key_and_target_cols
    + attribute_cols
    + item_cnt_lag_cols
    + avg_item_cnt_lag_cols
    + price_and_calendar_cols
    + sale_history_cols
]

In [4]:
# Overwrite a set of `df` columns with the corresponding columns of `df_og`.
# Every target column already exists in `df`, so column order is unchanged.
# pandas aligns the assigned Series on the index, so this assumes both frames
# share the same row index -- TODO confirm.

# (column in df, column in df_og) pairs whose names differ between the frames.
renamed_overrides = (
    # author's note: methodological differences
    ('delta_price_trend_with_lag_1', 'delta_price_lag'),
    ('item_last_sold_in_given_shop', 'item_shop_last_sale'),
    ('item_last_sold', 'item_last_sale'),
    ('item_first_sold_in_given_shop', 'item_shop_first_sale'),
    ('item_first_sold', 'item_first_sale'),
    # author's note: different encodings
    ('zip_code', 'city_code'),
    ('category_type', 'type_code'),
    ('category_device', 'subtype_code'),
)
for target_col, source_col in renamed_overrides:
    df[target_col] = df_og[source_col]

# Columns that carry the same name in both frames.
same_name_overrides = (
    # author's note: different encodings
    'month',
    # author's note: max 0.002% diff, sometimes even 0.0001%
    'date_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    # author's note: has actual discrepancies
    'date_item_city_avg_item_cnt_lag_1',
)
for shared_col in same_name_overrides:
    df[shared_col] = df_og[shared_col]

In [5]:
def _features_and_target(row_mask):
    """Return (features, target) for the rows of `df` selected by `row_mask`.

    Features are every column except 'item_cnt_month'; the target is that column.
    """
    selected = df[row_mask]
    return selected.drop(columns=['item_cnt_month']), selected['item_cnt_month']


# Split on date_block_num: blocks below 33 are used for fitting, block 33 for
# validation, and block 34 is the set that gets predicted (features only).
X_train, Y_train = _features_and_target(df.date_block_num < 33)
X_valid, Y_valid = _features_and_target(df.date_block_num == 33)
X_test = _features_and_target(df.date_block_num == 34)[0]

In [6]:
from xgboost import XGBRegressor

# Hyper-parameters passed to the regressor, collected in one place.
xgb_params = dict(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is reported on both the training and validation sets every round;
# training halts after 10 rounds without improvement (xgboost documents that
# the last eval_set entry drives early stopping).
# NOTE(review): newer xgboost releases expect eval_metric and
# early_stopping_rounds on the constructor rather than fit() -- confirm
# against the installed version before upgrading.
watchlist = [(X_train, Y_train), (X_valid, Y_valid)]
model.fit(
    X_train,
    Y_train,
    eval_metric="rmse",
    eval_set=watchlist,
    verbose=True,
    early_stopping_rounds=10,
)

# Predictions are clipped to the [0, 20] range for both splits.
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# Write the submission file: one row per test row, keyed by the positional
# index of test.csv.
# NOTE(review): assumes X_test rows are in the same order as test.csv -- confirm.
test = pd.read_csv('data/technical/test.csv')
submission = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_test})
submission.to_csv('xgb_submission.csv', index=False)

[0]	validation_0-rmse:1.08755	validation_1-rmse:1.07128
[1]	validation_0-rmse:0.99487	validation_1-rmse:1.00890
[2]	validation_0-rmse:0.94664	validation_1-rmse:0.97641
[3]	validation_0-rmse:0.92128	validation_1-rmse:0.95682
[4]	validation_0-rmse:0.90107	validation_1-rmse:0.94439
[5]	validation_0-rmse:0.89068	validation_1-rmse:0.93898
[6]	validation_0-rmse:0.88412	validation_1-rmse:0.93673
[7]	validation_0-rmse:0.87773	validation_1-rmse:0.93371
[8]	validation_0-rmse:0.87294	validation_1-rmse:0.93200
[9]	validation_0-rmse:0.86775	validation_1-rmse:0.92944
[10]	validation_0-rmse:0.86428	validation_1-rmse:0.92943
[11]	validation_0-rmse:0.86203	validation_1-rmse:0.92930
[12]	validation_0-rmse:0.86061	validation_1-rmse:0.92967
[13]	validation_0-rmse:0.85936	validation_1-rmse:0.92966
[14]	validation_0-rmse:0.85675	validation_1-rmse:0.93274
[15]	validation_0-rmse:0.85577	validation_1-rmse:0.93290
[16]	validation_0-rmse:0.85502	validation_1-rmse:0.93290
[17]	validation_0-rmse:0.85404	validation