In [2]:
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import pprint

# plot
import seaborn as sns
import matplotlib.pyplot as plt

# preprocess
from sklearn.preprocessing import LabelEncoder

# model
import lightgbm as lgb

# optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error as MSE

# other
import pickle
import time
import sys
import gc
import datetime
import eli5



In [3]:
# set index to ID to avoid droping it later
test  = pd.read_csv('../data/input/test.csv').set_index('ID')

In [4]:
data = pd.read_pickle('../features/feature_2020-11-03-17-52-21.pkl')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5694622 entries, 4111216 to 9805837
Data columns (total 41 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   date_block_num                        int8   
 1   shop_id                               int8   
 2   item_id                               int16  
 3   item_cnt_month                        float16
 4   city_code                             int8   
 5   item_category_id                      int8   
 6   type_code                             int8   
 7   subtype_code                          int8   
 8   item_cnt_month_lag_1                  float16
 9   item_cnt_month_lag_2                  float16
 10  item_cnt_month_lag_3                  float16
 11  item_cnt_month_lag_4                  float16
 12  item_cnt_month_lag_5                  float16
 13  item_cnt_month_lag_6                  float16
 14  item_cnt_month_lag_9                  float16
 15  item_cnt_

In [5]:
data.drop(
    [
        'date_shop_type_avg_item_cnt_lag_1',
        'date_shop_subtype_avg_item_cnt_lag_1',
        'date_type_avg_item_cnt_lag_1',
        'date_subtype_avg_item_cnt_lag_1'
    ],
    axis=1,
    inplace=True
)

In [6]:
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

tr_idxes = []
va_idxes = []

# train:2013-01~2015-09
# valid:2015-10
tr_idxes.append(list(data[data.date_block_num < 33].index))
va_idxes.append(list(data[data.date_block_num == 33].index))

# train:2015-09
# valid:2015-10
tr_idxes.append(list(data[data.date_block_num == 32].index))
va_idxes.append(list(data[data.date_block_num == 33].index))

# train:2015-07~2015-09
# valid:2015-10
tr_idxes.append(list(data[(data.date_block_num < 33) & (data.date_block_num > 29)].index))
va_idxes.append(list(data[data.date_block_num == 33].index))

# train:2015-04~2015-09
# valid:2015-10
tr_idxes.append(list(data[(data.date_block_num < 33) & (data.date_block_num > 26)].index))
va_idxes.append(list(data[data.date_block_num == 33].index))

# train:2014-10~2015-09
# valid:2015-10
tr_idxes.append(list(data[(data.date_block_num < 33) & (data.date_block_num > 20)].index))
va_idxes.append(list(data[data.date_block_num == 33].index))

# train:2013-01~2014-10
# valid:2014-11
tr_idxes.append(list(data[data.date_block_num < 22].index))
va_idxes.append(list(data[data.date_block_num == 22].index))

# train:2014-10
# valid:2014-11
tr_idxes.append(list(data[data.date_block_num == 21].index))
va_idxes.append(list(data[data.date_block_num == 22].index))

# train:2014-08~2014-10
# valid:2014-11
tr_idxes.append(list(data[(data.date_block_num < 22) & (data.date_block_num > 18)].index))
va_idxes.append(list(data[data.date_block_num == 22].index))

# train:2014-05~2014-10
# valid:2014-11
tr_idxes.append(list(data[(data.date_block_num < 22) & (data.date_block_num > 15)].index))
va_idxes.append(list(data[data.date_block_num == 22].index))

# train:2013-11~2014-10
# valid:2014-11
tr_idxes.append(list(data[(data.date_block_num < 22) & (data.date_block_num > 9)].index))
va_idxes.append(list(data[data.date_block_num == 22].index))

In [7]:
def score(params):
    params['max_depth'] = int(params['max_depth'])
    #params['num_leaves'] = int(params['num_leaves'])
    #params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    
    model = lgb.LGBMRegressor(**params, random_state=0, n_jobs=-1)
    model.fit(X=X_train, y=Y_train, eval_set=(X_valid, Y_valid))
    Y_pred = model.predict(X_valid)
    score = np.sqrt(MSE(y_true=Y_valid, y_pred=Y_pred))
    print(f'params: {params}, score:{score:.4f}')
    
    history.append((model, score))
    return {'loss':score, 'status':STATUS_OK}

In [8]:
param_space = {
    #'alpha' : hp.loguniform(label='alpha', low=np.log(1e-8), high=np.log(1.0)),
    'bagging_fraction': hp.quniform(
        label='bagging_fraction', low=0.6, high=0.95, q=0.05
    ),
    'feature_fraction': hp.quniform(
        label='feature_fraction', low=0.6, high=0.95, q=0.05
    ),
    #'gamma': hp.quniform(label='gamma', low=0.1, high=0.4, q=0.1), 
    'lambda' : hp.loguniform(
        label='lambda', low=np.log(1e-6), high=np.log(10.0)
    ),
    'max_depth': hp.quniform(
        label='max_depth', low=3, high=9, q=1
    ),
    'min_child_weight': hp.quniform(
        label='min_child_weight', low=1, high=5, q=1
    ),
}

In [9]:
history = []
best = []

train_X = data.drop(['item_cnt_month'], axis=1)
train_y = data['item_cnt_month']

for i in np.arange(len(tr_idxes)):
    X_train = train_X.loc[tr_idxes[i]]
    Y_train = train_y.loc[tr_idxes[i]]
    X_valid = train_X.loc[va_idxes[i]]
    Y_valid = train_y.loc[va_idxes[i]]
    
    fmin(
        fn=score,
        space=param_space, 
        algo=tpe.suggest, 
        trials=Trials(),
        max_evals=10
    )
    
    history = sorted(history, key=lambda tpl:tpl[1])
    best.append(history[0])

[1]	valid_0's l2: 1.06869                                                                                              
[2]	valid_0's l2: 1.01998                                                                                              
[3]	valid_0's l2: 0.980564                                                                                             
[4]	valid_0's l2: 0.949404                                                                                             
[5]	valid_0's l2: 0.922086                                                                                             
[6]	valid_0's l2: 0.899875                                                                                             
[7]	valid_0's l2: 0.882106                                                                                             
[8]	valid_0's l2: 0.867086                                                                                             
[9]	valid_0's l2: 0.854364              

In [10]:
best

[(LGBMRegressor(bagging_fraction=0.7000000000000001, feature_fraction=0.8,
                lambda=4.88659358360843e-05, max_depth=5, min_child_weight=2.0,
                random_state=0),
  0.87844211580088),
 (LGBMRegressor(bagging_fraction=0.7000000000000001, feature_fraction=0.8,
                lambda=4.88659358360843e-05, max_depth=5, min_child_weight=2.0,
                random_state=0),
  0.87844211580088),
 (LGBMRegressor(bagging_fraction=0.7000000000000001, feature_fraction=0.8,
                lambda=4.88659358360843e-05, max_depth=5, min_child_weight=2.0,
                random_state=0),
  0.87844211580088),
 (LGBMRegressor(bagging_fraction=0.7000000000000001, feature_fraction=0.8,
                lambda=4.88659358360843e-05, max_depth=5, min_child_weight=2.0,
                random_state=0),
  0.87844211580088),
 (LGBMRegressor(bagging_fraction=0.8, feature_fraction=0.8,
                lambda=0.00014792744086714114, max_depth=7, min_child_weight=1.0,
                random

In [11]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
submission = pd.DataFrame({"ID": test.index})
importance_list = []
perm_df = pd.DataFrame(train_X.columns.values, columns=['feature'])

for i, model_i in enumerate(best):
    submission = pd.concat(
        [submission, pd.Series(model_i[0].predict(X_test).clip(0, 20), name="pred_"+str(i).zfill(2))],
        axis=1
    )
    
    pickle.dump(
        model_i[0],
        open('../models/model_' + dt + '_lightgbm_num_' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    pickle.dump(
        model_i[0].get_params,
        open('../logs/params_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    pickle.dump(
        model_i[1],
        open('../logs/train_score_' + dt + '_num' + str(i).zfill(2) + '.pickle', 'wb')
    )
    
    importance_list.append(model_i[0].feature_importances_.tolist())
    
    cols = 'weight' + str(i).zfill(2)
    tmp = eli5.explain_weights_df(model_i[0]).rename(columns={'weight':cols})
    perm_df = pd.concat([perm_df, tmp[cols]], axis=1)

In [15]:
submission['mean'] = submission.iloc[:, 1:].mean(axis='columns')

pickle.dump(
    submission,
    open('../logs/test_preds' + dt + '.pickle', 'wb')
)

submission.rename(columns={'mean': 'item_cnt_month'}, inplace=True)
submission = submission[['ID', 'item_cnt_month']]

submission.to_csv(
    '../data/output/sub' + dt + '_lightgbm.csv',
    index=False
)

## Check Importance

### lgb importance method

In [21]:
importance_df = pd.DataFrame(importance_list, columns=train_X.columns)
display(importance_df.mean().sort_values(ascending=False))

delta_price_lag                      209.4
item_cnt_month_lag_1                 203.6
date_item_city_avg_item_cnt_lag_1    199.6
item_category_id                     194.3
date_shop_cat_avg_item_cnt_lag_1     169.0
date_cat_avg_item_cnt_lag_1          168.0
item_first_sale                      160.4
item_cnt_month_lag_2                 137.5
subtype_code                         125.8
date_shop_avg_item_cnt_lag_2         119.5
item_id                              117.2
item_last_sale                       114.5
date_block_num                       107.8
month                                 78.9
item_cnt_month_lag_3                  73.4
item_cnt_month_lag_6                  71.3
date_avg_item_cnt_lag_1               67.2
shop_id                               66.8
item_shop_last_sale                   65.1
item_cnt_month_lag_5                  55.2
date_shop_avg_item_cnt_lag_1          50.9
date_city_avg_item_cnt_lag_1          50.7
item_cnt_month_lag_12                 47.7
item_shop_f

### eli5 permutation importance

In [21]:
perm_df['mean'] = perm_df.mean(axis=1)
display(perm_df)

Unnamed: 0,feature,weight00,weight01,weight02,weight03,weight04,weight05,weight06,weight07,weight08,weight09,mean
0,date_block_num,0.534864,0.534864,0.534864,0.534864,0.534864,0.488173,0.488173,0.488173,0.488173,0.488173,0.511519
1,shop_id,0.17042,0.17042,0.17042,0.17042,0.17042,0.178561,0.178561,0.178561,0.178561,0.178561,0.174491
2,item_id,0.052797,0.052797,0.052797,0.052797,0.052797,0.047315,0.047315,0.047315,0.047315,0.047315,0.050056
3,city_code,0.043986,0.043986,0.043986,0.043986,0.043986,0.039866,0.039866,0.039866,0.039866,0.039866,0.041926
4,item_category_id,0.029927,0.029927,0.029927,0.029927,0.029927,0.028854,0.028854,0.028854,0.028854,0.028854,0.029391
5,type_code,0.021162,0.021162,0.021162,0.021162,0.021162,0.022269,0.022269,0.022269,0.022269,0.022269,0.021715
6,subtype_code,0.017052,0.017052,0.017052,0.017052,0.017052,0.020991,0.020991,0.020991,0.020991,0.020991,0.019022
7,item_cnt_month_lag_1,0.015421,0.015421,0.015421,0.015421,0.015421,0.020937,0.020937,0.020937,0.020937,0.020937,0.018179
8,item_cnt_month_lag_2,0.014864,0.014864,0.014864,0.014864,0.014864,0.015876,0.015876,0.015876,0.015876,0.015876,0.01537
9,item_cnt_month_lag_3,0.010857,0.010857,0.010857,0.010857,0.010857,0.014462,0.014462,0.014462,0.014462,0.014462,0.012659


In [22]:
display(perm_df.set_index('feature')['mean'])

feature
date_block_num                       0.511519
shop_id                              0.174491
item_id                              0.050056
city_code                            0.041926
item_category_id                     0.029391
type_code                            0.021715
subtype_code                         0.019022
item_cnt_month_lag_1                 0.018179
item_cnt_month_lag_2                 0.015370
item_cnt_month_lag_3                 0.012659
item_cnt_month_lag_4                 0.011557
item_cnt_month_lag_5                 0.011281
item_cnt_month_lag_6                 0.010522
item_cnt_month_lag_9                 0.009610
item_cnt_month_lag_12                0.008406
date_avg_item_cnt_lag_1              0.008013
date_shop_avg_item_cnt_lag_1         0.007040
date_shop_avg_item_cnt_lag_2         0.006015
date_shop_avg_item_cnt_lag_3         0.005113
date_shop_avg_item_cnt_lag_4         0.004876
date_shop_avg_item_cnt_lag_5         0.004034
date_shop_avg_item_cnt_lag