In [1]:
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from itertools import product
import shap

In [2]:
"""
skip:
lags
shop cat
first sale date
lags
"""

In [2]:
path = '../data/'
sales_train = pd.read_csv(path + 'sales_train.csv')
items = pd.read_csv(path + 'items.csv')
shops = pd.read_csv(path + 'shops.csv')
item_categories = pd.read_csv(path + 'item_categories.csv')
test = pd.read_csv(path + 'test.csv')
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [3]:
sales_train = sales_train[sales_train['item_price'] > 0]
sales_train = sales_train[sales_train['item_price'] < 50000]
sales_train = sales_train[sales_train['item_cnt_day'] > 0]
sales_train = sales_train[sales_train['item_cnt_day'] < 1000]

In [4]:
sales_train.loc[sales_train['shop_id'] == 0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id'] == 1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id'] == 10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id'] == 39, 'shop_id'] = 40

test.loc[test['shop_id'] == 0, 'shop_id'] = 57
test.loc[test['shop_id'] == 1, 'shop_id'] = 58
test.loc[test['shop_id'] == 10, 'shop_id'] = 11
test.loc[test['shop_id'] == 39, 'shop_id'] = 40

In [6]:
"""unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]"""

In [5]:
shops['city'] = shops['shop_name'].str.extract(r'(\w+\.*\w*)')
shops.loc[shops.city == 'Выездная', 'city'] = 'Выездная торговля'
shops.loc[shops.city == 'РостовНаДону', 'city'] = 'Ростов-На-Дону'
shops.loc[shops.city == 'Сергиев', 'city'] = 'Сергиев Посад'
shops.loc[shops.city == 'Цифровой', 'city'] = 'Интернет'

label_encoder = LabelEncoder()
shops['city'] = label_encoder.fit_transform(shops['city'])
shops.head()

Unnamed: 0,shop_name,shop_id,city
0,"!Якутск Орджоникидзе, 56 фран",0,28
1,"!Якутск ТЦ ""Центральный"" фран",1,28
2,"Адыгея ТЦ ""Мега""",2,0
3,"Балашиха ТРК ""Октябрь-Киномир""",3,1
4,"Волжский ТЦ ""Волга Молл""",4,2


In [6]:
item_categories['category'] = item_categories['item_category_name'].apply(lambda x: x.split()[0])
item_categories['category'].value_counts()

Игры          14
Книги         13
Подарки       12
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Карты          5
Кино           5
Служебные      2
Чистые         2
PC             1
Билеты         1
Доставка       1
Элементы       1
Name: category, dtype: int64

In [7]:
def create_category(x):
    if len(item_categories[item_categories['category'] == x]) >= 5:
        return x
    else:
        return 'ост.'
    
    
item_categories['category'] = item_categories['category'].apply(create_category)
label_encoder = LabelEncoder()
item_categories['category'] = label_encoder.fit_transform(item_categories['category'])
item_categories = item_categories.drop('item_category_name', axis=1)

In [8]:
train = []
for i in sales_train['date_block_num'].unique():
    all_shops = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_items = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shops, all_items))))

train = pd.DataFrame(np.vstack(train), columns=['date_block_num', 'shop_id', 'item_id'])
train

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10812763,33,21,7635
10812764,33,21,7638
10812765,33,21,7640
10812766,33,21,7632


In [9]:
group = sales_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': 'sum', 'item_price': 'mean'}).reset_index()
group = group.rename(columns={'item_cnt_day': 'item_cnt_month', 'item_price': 'item_price_mean'})

train = train.merge(group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
train

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean
0,0,59,22154,1.0,999.0
1,0,59,2552,,
2,0,59,2554,,
3,0,59,2555,,
4,0,59,2564,,
...,...,...,...,...,...
10812763,33,21,7635,,
10812764,33,21,7638,,
10812765,33,21,7640,,
10812766,33,21,7632,,


In [10]:
test['date_block_num'] = 34

all_data = pd.concat([train, test.drop(columns='ID')], ignore_index=True, keys=['date_block_num', 'shop_id', 'item_id'])
all_data = all_data.fillna(0)
all_data['item_cnt_month'] = all_data['item_cnt_month'].clip(0,20)
all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean
0,0,59,22154,1.0,999.0
1,0,59,2552,0.0,0.0
2,0,59,2554,0.0,0.0
3,0,59,2555,0.0,0.0
4,0,59,2564,0.0,0.0


In [11]:
lags = pd.DataFrame()
for i in range(12, 35):
    temp = all_data[all_data.date_block_num == i]
    temp1 = all_data[(all_data.date_block_num == i - 1)]
    temp1 = temp1.groupby(['shop_id', 'item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp1 = temp1.rename({'item_cnt_month': 'item_cnt_last_1', 'item_price_mean': 'item_price_last_1'}, axis=1)
    
    temp1_item = all_data[(all_data.date_block_num == i - 1)]
    temp1_item = temp1_item.groupby(['item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp1_item = temp1_item.rename({'item_cnt_month': 'item_cnt_last_1_item', 'item_price_mean': 'item_price_last_1_item'}, axis=1)
    
    temp12 = all_data[(all_data.date_block_num == i - 12)]
    temp12 = temp12.groupby(['shop_id', 'item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp12 = temp12.rename({'item_cnt_month': 'item_cnt_last_12', 'item_price_mean': 'item_price_last_12'}, axis=1)
    
    temp12_item = all_data[(all_data.date_block_num == i - 12)]
    temp12_item = temp12_item.groupby(['item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp12_item = temp12_item.rename({'item_cnt_month': 'item_cnt_last_12_item', 'item_price_mean': 'item_price_last_12_item'}, axis=1)
    
    temp2 = all_data[(all_data.date_block_num == i - 2)]
    temp2 = temp2.groupby(['shop_id', 'item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp2 = temp2.rename({'item_cnt_month': 'item_cnt_last_2', 'item_price_mean': 'item_price_last_2'}, axis=1)
    
    temp2_item = all_data[(all_data.date_block_num == i - 2)]
    temp2_item = temp2_item.groupby(['item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp2_item = temp2_item.rename({'item_cnt_month': 'item_cnt_last_2_item', 'item_price_mean': 'item_price_last_2_item'}, axis=1)
    
    temp3 = all_data[(all_data.date_block_num == i - 3)]
    temp3 = temp3.groupby(['shop_id', 'item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp3 = temp3.rename({'item_cnt_month': 'item_cnt_last_3', 'item_price_mean': 'item_price_last_3'}, axis=1)
    
    temp3_item = all_data[(all_data.date_block_num == i - 3)]
    temp3_item = temp3_item.groupby(['item_id'], as_index=False).agg({'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
    temp3_item = temp3_item.rename({'item_cnt_month': 'item_cnt_last_3_item', 'item_price_mean': 'item_price_last_3_item'}, axis=1)
    
    temp = temp.merge(temp1, how='left', on=['shop_id', 'item_id'])
    temp = temp.merge(temp2, how='left', on=['shop_id', 'item_id'])
    temp = temp.merge(temp3, how='left', on=['shop_id', 'item_id'])
    temp = temp.merge(temp12, how='left', on=['shop_id', 'item_id'])
    temp = temp.merge(temp1_item, how='left', on=['item_id'])
    temp = temp.merge(temp2_item, how='left', on=['item_id'])
    temp = temp.merge(temp3_item, how='left', on=['item_id'])
    temp = temp.merge(temp12_item, how='left', on=['item_id'])
    
    lags = pd.concat([lags, temp], ignore_index=True)
    
lags = lags.fillna(0)
all_data = lags

In [12]:
all_data = all_data.merge(shops[['shop_id', 'city']], on='shop_id', how='left')
all_data = all_data.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_cnt_last_1,item_price_last_1,item_cnt_last_2,item_price_last_2,item_cnt_last_3,...,item_price_last_1_item,item_cnt_last_2_item,item_price_last_2_item,item_cnt_last_3_item,item_price_last_3_item,item_cnt_last_12_item,item_price_last_12_item,city,item_category_id,category
0,12,54,10297,4.0,749.0,3.0,749.0,0.0,0.0,0.0,...,416.555435,2.0,33.288889,0.0,0.0,0.0,0.0,26,37,4
1,12,54,10296,3.0,1599.0,0.0,0.0,0.0,0.0,0.0,...,660.456522,0.0,0.0,0.0,0.0,0.0,0.0,26,38,4
2,12,54,10298,14.0,399.0,20.0,399.0,20.0,399.0,7.0,...,364.23913,676.0,379.696333,144.0,292.978007,0.0,0.0,26,40,4
3,12,54,10300,3.0,699.0,1.0,699.0,20.0,694.0,0.0,...,358.167391,327.0,643.254691,53.0,372.143116,0.0,0.0,26,37,4
4,12,54,10284,1.0,299.0,0.0,0.0,0.0,0.0,0.0,...,23.130435,3.0,19.933333,5.0,23.130435,0.0,0.0,26,57,6


In [16]:
"""all_data['date_block_num'] = all_data['date_block_num'].astype("category")
all_data['shop_id'] = all_data['shop_id'].astype("category")
all_data['item_id'] = all_data['item_id'].astype("category")
all_data['city'] = all_data['city'].astype("category")
all_data['item_category_id'] = all_data['item_category_id'].astype("category")
all_data['category'] = all_data['category'].astype("category")"""

In [17]:
"""transformer = make_column_transformer(
    (OneHotEncoder(), ['category']),
    remainder='passthrough')

transformed = transformer.fit_transform(all_data)
all_data = pd.DataFrame(transformed)
all_data = all_data.rename(columns={10: 'date_block_num', 13: 'item_cnt_month'})
all_data.head()"""

In [13]:
X_train = all_data[all_data['date_block_num'] < 33].drop(columns=['item_cnt_month'])
y_train = all_data[all_data['date_block_num'] < 33]['item_cnt_month']
X_valid = all_data[all_data['date_block_num'] == 33].drop(columns=['item_cnt_month'])
y_valid = all_data[all_data['date_block_num'] == 33]['item_cnt_month']
X_test = all_data[all_data['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [19]:
model = xgb.XGBRegressor(
    seed = 42,
    subsample = 0.9,
    n_estimators=500,
    max_depth = 10,
    learning_rate = 0.3,
    gamma = 1,
    colsample_bytree = 0.9,
    early_stopping_rounds = 40)

rmse_cross_val = []

for i in range(13, 32):
    start = time.time()
    X_train = all_data[all_data['date_block_num'] < i].drop(columns=['item_cnt_month'])
    y_train = all_data[all_data['date_block_num'] < i]['item_cnt_month']
    X_valid = all_data[all_data['date_block_num'] == i].drop(columns=['item_cnt_month'])
    y_valid = all_data[all_data['date_block_num'] == i]['item_cnt_month']
    print('Fitting... {} example'.format(i))
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose = 10)
    end = time.time()
    print('Took ' + str(int(end - start)) + ' seconds to fit the model')
    rmse_cross_val.append([mean_squared_error(model.predict(X_train).clip(0, 20), y_train, squared=False), mean_squared_error(model.predict(X_valid).clip(0, 20), y_valid, squared=False)])

In [47]:
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from sklearn.model_selection import GridSearchCV

forecaster = xgb.XGBRegressor(seed=42,
                             gamma = 1,
                             early_stopping_rounds = 50)

cv = ExpandingWindowSplitter(
    initial_window=22,
    step_length=10,
    start_with_window=True,
    fh=[1])

params = {'learning_rate': [0.05, 0.1, 0.03],
              'max_depth': [5, 7, 10],
              'min_child_weight': [4],
              'subsample': [0.7, 0.9],
              'colsample_bytree': [0.7, 0.9],
              'n_estimators': [100, 250, 500]}

xgb_grid = GridSearchCV(
    estimator=forecaster,
    scoring='neg_root_mean_squared_error',
    param_grid=params,
    cv=cv,
    n_jobs=-1)
xgb_grid.fit(all_data.drop(columns=['item_cnt_month']), all_data['item_cnt_month'])
params = xgb_grid.best_params_
score = xgb_grid.score_
print('Best params: {}'.format(params))
print('Best score: {}'.format(score))

In [16]:
"""start = time.time()

model = xgb.XGBRegressor(
    seed = 42,
    subsample = 0.9,
    n_estimators=500,
    max_depth = 10,
    learning_rate = 0.3,
    gamma = 1,
    colsample_bytree = 0.9,
    early_stopping_rounds = 50)

print('Fitting...')
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose = 10)

end = time.time()
print('Took ' + str(int(end - start)) + ' seconds to fit the model')"""

In [None]:
"""explainer = shap.Explainer(model)
shap_values = explainer(X_train)
shap.plots.waterfall(shap_values[0])"""

In [19]:
preds_train = model.predict(X_train).clip(0, 20)
print('Train set RMSE = ' + str(mean_squared_error(preds_train, y_train, squared=False)))
preds_val = model.predict(X_valid).clip(0, 20)
print('Validation set RMSE = ' + str(mean_squared_error(preds_val, y_valid, squared=False)))

In [20]:
y_test = model.predict(X_test).clip(0, 20)
submission = pd.DataFrame({
    "ID": np.arange(y_test.shape[0]), 
    "item_cnt_month": y_test
})
submission.to_csv('s.csv', index=False)