Based on this notebook: https://www.kaggle.com/dlarionov/feature-engineering-xgboost

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product

import time

In [2]:
items = pd.read_csv('data/info/items.csv')
shops = pd.read_csv('data/info/shops.csv')
cats = pd.read_csv('data/info/item_categories.csv')
train = pd.read_csv('data/technical/sales_train.csv')
# set index to ID to avoid droping it later
test  = pd.read_csv('data/technical/test.csv').set_index('ID')

## Monthly sales
Test set is a product of some shops and some items within 34 month.
There are 5100 items * 42 shops = 214 200 pairs.
363 items are new compared to the train set.
Hence, for most of the items in the test set, the target value should be zero.

In the other hand train set contains only pairs which were sold
or returned to the past. The main idea is to calculate monthly sales and
<b>extend it with zero sales</b> for each unique pair within the month.
This way train data will be similar to test data.

In [3]:
ts = time.time()
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    # product makes a cartesian product of all the columns. the labels
    # will be added again later on
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
time.time() - ts

12.966606140136719

In [4]:
train['revenue'] = train['item_price'] *  train['item_cnt_day']

In [5]:
ts = time.time()
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
time.time() - ts

3.85969877243042

In [6]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

In [7]:
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)
matrix.fillna(0, inplace=True) # 34 month

In [8]:
X_train = matrix[matrix.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = matrix[matrix.date_block_num < 33]['item_cnt_month']
X_test = matrix[matrix.date_block_num == 34].drop(['item_cnt_month'], axis=1)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

true_false_matrix = [True, False]

# does not look like there are any more valid parameters to try from...
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression
parameters_lr = {'fit_intercept':true_false_matrix,
                 'normalize':true_false_matrix,
                 'copy_X':true_false_matrix}

ts = time.time()

model_lr = LinearRegression()
model_lr_cv = GridSearchCV(estimator=model_lr, param_grid=parameters_lr, n_jobs=-1)
model_lr_cv.fit(X_train, Y_train)

model_lr_cved = model_lr_cv.best_estimator_


time.time() - ts

16.571980237960815

In [10]:
prediction = model_lr_cved.predict(X_test)

In [11]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

from sklearn.linear_model import Ridge

ridge_alpha = [0.1, 0.25, 0.5, 0.75, 1, 5, 10]

parameters_ridge = {'alpha': ridge_alpha,
                    'fit_intercept': true_false_matrix,
                    'normalize': true_false_matrix,
                    'copy_X': true_false_matrix,
                    }

ts = time.time()

model_ridge = Ridge()
model_ridge_cv = GridSearchCV(estimator=model_ridge, param_grid=parameters_ridge, n_jobs=-1)
model_lr_cv.fit(X_train, Y_train)

model_ridge_cved = model_lr_cv.best_estimator_


time.time() - ts


16.027203798294067

In [12]:
prediction = model_ridge_cved.predict(X_test)

In [13]:
from sklearn.linear_model import SGDRegressor

sgd_alpha = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]

parameters_sgd = {
    'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': sgd_alpha,
    'fit_intercept': true_false_matrix,
}

ts = time.time()

model_sgd = SGDRegressor()
model_sgd_cv = GridSearchCV(estimator=model_sgd, param_grid=parameters_sgd, n_jobs=-1)
model_sgd_cv.fit(X_train, Y_train)

model_sgd_cved = model_sgd_cv.best_estimator_

time.time() - ts

KeyboardInterrupt: 

In [None]:
prediction = model_sgd_cved.predict(X_test)

In [None]:
submission = pd.DataFrame({
    "ID": test.index,
    "item_cnt_month": prediction
})
submission.to_csv('lr_submission.csv', index=False)


In [None]:
# X_train = matrix[matrix.date_block_num < 33].drop(['item_cnt_month'], axis=1)
# Y_train = matrix[matrix.date_block_num < 33]['item_cnt_month']
# X_valid = matrix[matrix.date_block_num == 33].drop(['item_cnt_month'], axis=1)
# Y_valid = matrix[matrix.date_block_num == 33]['item_cnt_month']
# X_test = matrix[matrix.date_block_num == 34].drop(['item_cnt_month'], axis=1)

In [None]:
# from xgboost import XGBRegressor
#
# ts = time.time()
#
# model = XGBRegressor(
#     max_depth=8,
#     n_estimators=1000,
#     min_child_weight=300,
#     colsample_bytree=0.8,
#     subsample=0.8,
#     eta=0.3,
#     seed=42)
#
# model.fit(
#     X_train,
#     Y_train,
#     eval_metric="rmse",
#     eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
#     verbose=True,
#     early_stopping_rounds = 10)
#
# time.time() - ts

In [None]:
# Y_pred = model.predict(X_valid).clip(0, 20)
# Y_test = model.predict(X_test).clip(0, 20)
#
# submission = pd.DataFrame({
#     "ID": test.index,
#     "item_cnt_month": Y_test
# })
# submission.to_csv('xgb_submission.csv', index=False)