Based on this notebook: https://www.kaggle.com/dlarionov/feature-engineering-xgboost

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product

import time

In [2]:
# Daily sales records used for training.
train = pd.read_csv('data/technical/sales_train.csv')
# Read the test pairs with ID as the index so it does not have to be dropped later.
test = pd.read_csv('data/technical/test.csv', index_col='ID')

# Lookup tables: items, shops and item categories.
items = pd.read_csv('data/info/items.csv')
shops = pd.read_csv('data/info/shops.csv')
cats = pd.read_csv('data/info/item_categories.csv')

## Monthly sales
The test set is the product of some shops and some items within month 34.
There are 5100 items * 42 shops = 214 200 pairs.
363 items are new compared to the train set.
Hence, for most of the items in the test set, the target value should be zero.

On the other hand, the train set contains only pairs which were sold
or returned in the past. The main idea is to calculate monthly sales and
<b>extend them with zero sales</b> for each unique pair within the month.
This way the train data will be similar to the test data.

In [3]:
ts = time.time()
cols = ['date_block_num','shop_id','item_id']

# For each of the 34 monthly blocks, pair every shop that appears in that
# month's sales with every item that appears in that month's sales.
monthly_grids = []
for month in range(34):
    month_sales = train[train.date_block_num == month]
    shop_ids = month_sales.shop_id.unique()
    item_ids = month_sales.item_id.unique()
    # itertools.product yields (month, shop, item) tuples; the column
    # labels are attached once all months are stacked.
    pairs = list(product([month], shop_ids, item_ids))
    monthly_grids.append(np.array(pairs, dtype='int16'))

matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)
# Downcast the key columns to the narrowest dtypes used by this notebook.
matrix = matrix.astype({
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
})
matrix.sort_values(cols, inplace=True)
time.time() - ts

12.504931211471558

In [4]:
# Revenue of each sales record: unit price times units sold that day.
train['revenue'] = train['item_price'].mul(train['item_cnt_day'])

In [5]:
ts = time.time()
# Sum the daily counts into one row per (month, shop, item).
group = (
    train
    .groupby(['date_block_num','shop_id','item_id'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Left join keeps every grid row; pairs with no sales in a month come back as NaN.
matrix = matrix.merge(group, on=cols, how='left')
# NaN -> 0 sales, then clip the target to [0, 20] (NB: the target is clipped here).
matrix['item_cnt_month'] = (
    matrix['item_cnt_month'].fillna(0).clip(lower=0, upper=20).astype(np.float16)
)
time.time() - ts


3.0037128925323486

In [6]:
# Tag the test pairs as month 34 and give the key columns the same
# dtypes as the corresponding columns of `matrix`.
test = test.assign(date_block_num=34).astype({
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
})

In [7]:
# Append the month-34 test pairs below the training grid.
# `keys=` was dropped: it labels the concatenated objects (it is not a list of
# columns) and has no effect when ignore_index=True, so passing `cols`
# there was only misleading.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# Values missing after the concat (presumably only item_cnt_month of the
# appended month-34 rows -- TODO confirm) are replaced with a 0 placeholder.
matrix.fillna(0, inplace=True) # 34 month

In [8]:
# Row selectors: months before 33 are used for fitting, month 34 is the test block.
is_train = matrix['date_block_num'] < 33
is_test = matrix['date_block_num'] == 34

X_train = matrix.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = matrix.loc[is_train, 'item_cnt_month']
X_test = matrix.loc[is_test].drop(columns=['item_cnt_month'])

In [9]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the raw (month, shop, item) keys.
LR = LinearRegression()
LR.fit(X_train, Y_train)
# The training target was clipped to [0, 20] when it was built, and the
# XGBoost predictions in this notebook are clipped the same way; clip the
# linear-model output too so the submission stays in the target's range.
prediction = LR.predict(X_test).clip(0, 20)

In [10]:
# Pair each test ID with its linear-regression prediction and write the CSV.
submission = pd.DataFrame(dict(ID=test.index, item_cnt_month=prediction))
submission.to_csv('lr_submission.csv', index=False)


In [11]:
# Time-based split: months < 33 for fitting, month 33 for validation,
# month 34 (the appended test pairs) for the final prediction.
is_train = matrix['date_block_num'] < 33
is_valid = matrix['date_block_num'] == 33
is_test = matrix['date_block_num'] == 34

X_train = matrix.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = matrix.loc[is_train, 'item_cnt_month']
X_valid = matrix.loc[is_valid].drop(columns=['item_cnt_month'])
Y_valid = matrix.loc[is_valid, 'item_cnt_month']
X_test = matrix.loc[is_test].drop(columns=['item_cnt_month'])

In [13]:
from xgboost import XGBRegressor

ts = time.time()

# Gradient-boosted trees evaluated on the month-33 hold-out.
# `eval_metric` and `early_stopping_rounds` are set on the estimator rather
# than passed to fit(): the fit()-time arguments were deprecated in
# XGBoost 1.6 and are rejected by recent releases.
# NOTE(review): requires XGBoost >= 1.6 -- confirm the installed version.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
    eval_metric="rmse",
    early_stopping_rounds=10)

# Early stopping monitors the last entry of eval_set (the validation split);
# the training split is included only so its RMSE is logged alongside.
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True)

time.time() - ts

[0]	validation_0-rmse:1.22610	validation_1-rmse:1.14870
[1]	validation_0-rmse:1.22089	validation_1-rmse:1.14107
[2]	validation_0-rmse:1.21086	validation_1-rmse:1.12997
[3]	validation_0-rmse:1.20266	validation_1-rmse:1.12365
[4]	validation_0-rmse:1.19784	validation_1-rmse:1.12078
[5]	validation_0-rmse:1.19569	validation_1-rmse:1.11900
[6]	validation_0-rmse:1.19464	validation_1-rmse:1.11881
[7]	validation_0-rmse:1.19368	validation_1-rmse:1.11819
[8]	validation_0-rmse:1.19287	validation_1-rmse:1.11764
[9]	validation_0-rmse:1.19229	validation_1-rmse:1.11755
[10]	validation_0-rmse:1.19185	validation_1-rmse:1.11741
[11]	validation_0-rmse:1.19169	validation_1-rmse:1.11727
[12]	validation_0-rmse:1.19140	validation_1-rmse:1.11740
[13]	validation_0-rmse:1.19117	validation_1-rmse:1.11745
[14]	validation_0-rmse:1.19090	validation_1-rmse:1.11675
[15]	validation_0-rmse:1.19055	validation_1-rmse:1.11690
[16]	validation_0-rmse:1.19025	validation_1-rmse:1.11685
[17]	validation_0-rmse:1.19017	validation

58.1401309967041

In [14]:
# Clip model outputs to [0, 20], the same range the training target was clipped to.
Y_pred = np.clip(model.predict(X_valid), 0, 20)
Y_test = np.clip(model.predict(X_test), 0, 20)

# Pair each test ID with its XGBoost prediction and write the CSV.
submission = pd.DataFrame(dict(ID=test.index, item_cnt_month=Y_test))
submission.to_csv('xgb_submission.csv', index=False)