In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Files read from ../data/input.
items = pd.read_csv('../data/input/items.csv')
test = pd.read_csv('../data/input/test.csv')

# Files read from ../data/output -- presumably written by an earlier
# preprocessing step (the `_ppd` suffix suggests so; TODO confirm).
shops = pd.read_csv('../data/output/shops_ppd.csv')
item_categories = pd.read_csv('../data/output/item_categories_ppd.csv')
sales_train = pd.read_csv('../data/output/sales_train_ppd.csv')
month_shop_item_sales = pd.read_csv('../data/output/month_shop_item_sales.csv')
month_shop_item_cnt = pd.read_csv('../data/output/month_shop_item_cnt.csv')

# Create Train Data

In [4]:
# Build the (shop_id, item_id, date_block_num) grid: every test pair is
# repeated for months 0..34, i.e. including the month to be predicted (34).
test_pairs = test[['shop_id', 'item_id']]
train_full_comb = pd.concat(
    # .assign() returns a new frame per month, so the slice of `test` is never
    # written to (no SettingWithCopyWarning), and a single concat over the
    # list avoids re-copying the growing frame on every iteration.
    [test_pairs.assign(date_block_num=i) for i in range(35)],
    axis=0
)

In [6]:
# Attach the monthly item count to every (month, shop, item) row of the grid.
# Left join: grid rows without a match keep NaN in the new column(s).
train_test = train_full_comb.merge(
    month_shop_item_cnt,
    how='left',
    on=['date_block_num', 'shop_id', 'item_id']
)

In [7]:
# Attach the monthly sales amount using the same (month, shop, item) key.
# Left join: rows without a match keep NaN in the new column(s).
train_test = train_test.merge(
    month_shop_item_sales,
    how='left',
    on=['date_block_num', 'shop_id', 'item_id']
)

In [8]:
# Enrich every row with descriptive attributes through three left joins:
#   item_id          -> item_category_id
#   item_category_id -> major_item_category
#   shop_id          -> shop_city_name
item_to_category = items[['item_id', 'item_category_id']]
category_to_major = item_categories[['item_category_id', 'major_item_category']]
shop_to_city = shops[['shop_id', 'shop_city_name']]

train_test = (
    train_test
    .merge(item_to_category, on='item_id', how='left')
    .merge(category_to_major, on='item_category_id', how='left')
    .merge(shop_to_city, on='shop_id', how='left')
)

# Lag Feature

In [12]:
# True target values are clipped into the [0, 20] range, so the training
# target is bounded the same way (NaN entries are left untouched by clip).
clipped_cnt = train_test['month_shop_item_cnt'].clip(lower=0, upper=20)
train_test['month_shop_item_cnt'] = clipped_cnt

In [13]:
# Order rows by shop, then item, then month (ascending is the default for
# every key) and renumber the index so it matches the new row order.
train_test = train_test.sort_values(by=['shop_id', 'item_id', 'date_block_num'])
train_test = train_test.reset_index(drop=True)

In [14]:
# Lag features: for each source column and each lag k, add `<column>_<k>`
# holding the value of that column k rows earlier *within the same
# (shop_id, item_id) pair*.
lag_col_list = ['month_shop_item_cnt', 'month_shop_item_sales']
lag_num_list = [1, 3, 6, 9, 12]
pair_keys = ['shop_id', 'item_id']

# Row order inside each pair must follow the calendar for shift() to mean
# "k months ago"; sort here so this cell does not depend on an earlier one.
# NOTE(review): a k-row shift equals a k-month lag only if every pair has one
# row per consecutive month, which the 35-month grid is built to provide --
# confirm the merged tables have unique (month, shop, item) keys.
train_test = train_test.sort_values(
    pair_keys + ['date_block_num']
).reset_index(drop=True)

# Shifting per pair (instead of shifting the whole frame) stops the first k
# months of one pair from inheriting the last k months of the previous pair.
by_pair = train_test.groupby(pair_keys, sort=False)
lagged_columns = {
    lag_col + '_' + str(lag): by_pair[lag_col].shift(lag)
    for lag_col in lag_col_list
    for lag in lag_num_list
}
# assign() overwrites same-named columns, so re-running the cell does not
# create duplicate lag columns.
train_test = train_test.assign(**lagged_columns)

# Missing history (and months without sales) count as zero.
train_test = train_test.fillna(0)
train_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,month_shop_item_cnt,month_shop_item_sales,item_category_id,major_item_category,shop_city_name,month_shop_item_cnt_1,month_shop_item_cnt_3,month_shop_item_cnt_6,month_shop_item_cnt_9,month_shop_item_cnt_12,month_shop_item_sales_1,month_shop_item_sales_3,month_shop_item_sales_6,month_shop_item_sales_9,month_shop_item_sales_12
0,2,30,0,0.0,0.0,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,30,1,0.0,0.0,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,30,2,1.0,359.0,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,30,3,0.0,0.0,40,Кино,Адыгея,1.0,0.0,0.0,0.0,0.0,359.0,0.0,0.0,0.0,0.0
4,2,30,4,0.0,0.0,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496995,59,22167,30,0.0,0.0,49,Книги,Ярославль,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496996,59,22167,31,0.0,0.0,49,Книги,Ярославль,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496997,59,22167,32,0.0,0.0,49,Книги,Ярославль,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496998,59,22167,33,0.0,0.0,49,Книги,Ярославль,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Months 12..33 form the training set: the 12-month lag needs a full year of
# history, so months 0..11 are left out. Month 34 is the prediction set.
in_train_window = train_test['date_block_num'].between(12, 33)
in_test_month = train_test['date_block_num'] == 34
train_ = train_test[in_train_window].reset_index(drop=True)
test_ = train_test[in_test_month].reset_index(drop=True)

# Separate the target from the model features; the month index and the
# same-month count/sales columns are not used as features.
non_feature_cols = ['date_block_num', 'month_shop_item_cnt', 'month_shop_item_sales']
train_y = train_['month_shop_item_cnt']
train_X = train_.drop(columns=non_feature_cols)
test_X = test_.drop(columns=non_feature_cols)

In [16]:
# Display the first rows of the training feature matrix.
train_X.head()

Unnamed: 0,shop_id,item_id,item_category_id,major_item_category,shop_city_name,month_shop_item_cnt_1,month_shop_item_cnt_3,month_shop_item_cnt_6,month_shop_item_cnt_9,month_shop_item_cnt_12,month_shop_item_sales_1,month_shop_item_sales_3,month_shop_item_sales_6,month_shop_item_sales_9,month_shop_item_sales_12
0,2,30,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,30,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,30,40,Кино,Адыгея,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,399.0,359.0
3,2,30,40,Кино,Адыгея,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,30,40,Кино,Адыгея,1.0,0.0,0.0,0.0,0.0,169.0,0.0,0.0,0.0,0.0


# Create Model

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Integer-encode the text columns for the model.
obj_col_list = ['major_item_category', 'shop_city_name']
for obj_col in obj_col_list:
    # Fit ONE encoder on the labels of both frames so a given category always
    # maps to the same integer in train and test. Fitting a second time on
    # test_X can silently assign different codes to the same label whenever
    # the two label sets differ.
    le = LabelEncoder()
    le.fit(pd.concat([train_X[obj_col], test_X[obj_col]], ignore_index=True))
    train_X[obj_col] = le.transform(train_X[obj_col])
    test_X[obj_col] = le.transform(test_X[obj_col])
    

# Hold out 33% of the training rows for validation, with a fixed seed so the
# split is reproducible.
# NOTE(review): the split is random across months, so validation rows can be
# older than training rows; consider a time-based hold-out -- confirm intent.
holdout_parts = train_test_split(train_X, train_y, test_size=0.33, random_state=2020)
tr_x, va_x, tr_y, va_y = holdout_parts

In [20]:
import lightgbm as lgb

# Wrap the train / validation splits as LightGBM datasets. The validation set
# is created with the training set as its reference, as the LightGBM Dataset
# documentation asks for validation data.
lgb_train = lgb.Dataset(tr_x, label=tr_y)
lgb_eval = lgb.Dataset(va_x, label=va_y, reference=lgb_train)

# LightGBM training parameters (L2 regression with gradient-boosted trees).
params = {
    'task':'train',
    'objective':'regression',
    'seed':71,
    'boosting_type':'gbdt',
    'verbose':0,
    'metrics':{'l2'},
    # Was misspelled 'num_leavers', which is not a LightGBM parameter. 31 is
    # the value the misspelled key carried, so the setting is now effective.
    'num_leaves':31,
    'learning_rate':0.1,
    'feature_fraction':0.9,
    # Numeric value instead of the string '0.8'.
    'bagging_fraction':0.8,
    'bagging_freq':5,
}

In [21]:
# Train for at most 100 rounds, stopping once the validation score has not
# improved for 10 rounds. Early stopping is passed as a callback because the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0;
# the callback form works on old and new versions alike.
training_callbacks = [lgb.early_stopping(stopping_rounds=10)]
if hasattr(lgb, 'log_evaluation'):
    # Newer LightGBM versions print per-iteration metrics only when this
    # callback is present; older versions lack it and log by default.
    training_callbacks.append(lgb.log_evaluation(period=1))

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=training_callbacks
)

# Predict month 34 with the best early-stopped iteration.
# - Columns are selected by the training feature list so the model always
#   receives the same features in the same order, even if test_X has gained
#   extra columns since it was created (this keeps the cell re-runnable).
# - Predictions are clipped to [0, 20], the same range the training target
#   was clipped to.
y_pred = np.clip(
    model.predict(test_X[train_X.columns], num_iteration=model.best_iteration),
    0,
    20
)

[1]	valid_0's l2: 0.737305
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.680999
[3]	valid_0's l2: 0.635324
[4]	valid_0's l2: 0.598331
[5]	valid_0's l2: 0.568018
[6]	valid_0's l2: 0.543121
[7]	valid_0's l2: 0.522878
[8]	valid_0's l2: 0.506439
[9]	valid_0's l2: 0.492694
[10]	valid_0's l2: 0.481456
[11]	valid_0's l2: 0.472177
[12]	valid_0's l2: 0.464711
[13]	valid_0's l2: 0.45839
[14]	valid_0's l2: 0.453057
[15]	valid_0's l2: 0.448763
[16]	valid_0's l2: 0.445113
[17]	valid_0's l2: 0.441947
[18]	valid_0's l2: 0.439344
[19]	valid_0's l2: 0.437226
[20]	valid_0's l2: 0.435296
[21]	valid_0's l2: 0.433617
[22]	valid_0's l2: 0.432317
[23]	valid_0's l2: 0.430935
[24]	valid_0's l2: 0.429815
[25]	valid_0's l2: 0.428869
[26]	valid_0's l2: 0.428051
[27]	valid_0's l2: 0.427285
[28]	valid_0's l2: 0.426717
[29]	valid_0's l2: 0.426018
[30]	valid_0's l2: 0.425498
[31]	valid_0's l2: 0.424832
[32]	valid_0's l2: 0.42439
[33]	valid_0's l2: 0.423927
[34]	valid_0's l2: 0.42352

# Output Submission

In [22]:
# Pair every test (shop_id, item_id) with its prediction in a separate frame;
# test_X itself is left untouched so the prediction cell can be re-run.
predictions = test_X[['shop_id', 'item_id']].assign(item_cnt_month=y_pred)

# Bring the predictions back into the original test row order / IDs.
submission = pd.merge(
    test,
    predictions,
    on=['shop_id', 'item_id'],
    how='left'
)
# A left join duplicates rows when the right side has repeated keys; fail
# loudly rather than write a submission with the wrong number of rows.
assert len(submission) == len(test), \
    'submission row count differs from test: duplicate (shop_id, item_id) keys'

submission[['ID', 'item_cnt_month']].to_csv('../data/output/submission.csv', index=False)