In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import time


In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from module.prepare_data import *

In [24]:
ORIGINAL = '../input/m5-forecasting-accuracy/'
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())
FINAL_TARGETS = 'sales'
SAV_BASE_PATH = '../cache/ver2'
PKL_BASE_PATH = '../cache'
START_TRAIN = 0
END_TRAIN = 1913
P_HORIZON = 28
SEED = 42
VER = 2
LGB_PARAMS = {
                    'boosting_type': 'gbdt',
                    'objective': 'regression',
                    # 'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 1400,
                    'boost_from_average': False,
                    'verbose': -1,
                } 

CAT_COLUMNS = [
    'item_id',
 'dept_id',
 'cat_id',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI'
]

NUM_COLUMNS = [
 'release',
 'sell_price',
 'price_max',
 'price_min',
 'price_std',
 'price_mean',
 'price_norm',
 'price_nunique',
 'item_nunique',
 'price_momentum',
 'price_momentum_m',
 'price_momentum_y',
 'tm_d',
 'tm_w',
 'tm_m',
 'tm_y',
 'tm_wm',
 'tm_dw',
 'tm_w_end',
 'enc_store_id_cat_id_mean',
 'enc_store_id_cat_id_std',
 'enc_store_id_dept_id_mean',
 'enc_store_id_dept_id_std',
 'enc_store_id_item_id_mean',
 'enc_store_id_item_id_std',
 'enc_store_id_tm_dw_item_id_mean',
 'enc_store_id_tm_dw_item_id_std',
 'enc_store_id_tm_dw_mean',
 'enc_store_id_tm_dw_std',
 'rolling_mean_7',
 'rolling_std_7',
 'rolling_mean_14',
 'rolling_std_14',
 'rolling_mean_30',
 'rolling_std_30',
 'rolling_mean_60',
 'rolling_std_60',
 'rolling_mean_180',
 'rolling_std_180']

########################### set seed
#################################################################################
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

########################### Train Models
#################################################################################
def train_model(grid_df, features_columns, categorical_features, target, shift, save_base_path):
    
    for store_id in STORES_IDS:
        print('Train', store_id)
        train_mask = (grid_df['d'] > START_TRAIN) & (grid_df['d'] <= (END_TRAIN - P_HORIZON))
        valid_mask = (grid_df['d'] > (END_TRAIN - P_HORIZON - 200)) & (grid_df['d'] <= (END_TRAIN))
        train_data = lgb.Dataset(grid_df[train_mask][features_columns+categorical_features], label=grid_df[train_mask][target])
        valid_data = lgb.Dataset(grid_df[valid_mask][features_columns+categorical_features], label=grid_df[valid_mask][target])
        seed_everything(SEED)
        estimator = lgb.train(LGB_PARAMS, train_data, valid_sets=[valid_data], verbose_eval=100, categorical_feature=categorical_features)
        model_name = f'{save_base_path}/lgb_model_{store_id}_shift{shift}_v{VER}.bin'
        pickle.dump(estimator, open(model_name, 'wb'))
    return

In [25]:
BASE_GRID_DF = load_base_features(PKL_BASE_PATH, SAV_BASE_PATH, FINAL_TARGETS)
print(BASE_GRID_DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 44 columns):
 #   Column                           Dtype   
---  ------                           -----   
 0   id                               category
 1   item_id                          category
 2   dept_id                          category
 3   cat_id                           category
 4   store_id                         category
 5   state_id                         category
 6   d                                int16   
 7   sales                            float64 
 8   release                          int16   
 9   sell_price                       float16 
 10  price_max                        float16 
 11  price_min                        float16 
 12  price_std                        float16 
 13  price_mean                       float16 
 14  price_norm                       float16 
 15  price_nunique                    float16 
 16  item_nunique                     i

In [None]:
start_shift = 4
for target in ['sales']:
    
    for model in ['lgb']:

        for store_id in STORES_IDS:
        
            for shift in range(start_shift,29):
        
                print(f'{model} train {store_id} {shift} {target}')
                rolling_features_df = load_rolling_features(BASE_GRID_DF.copy(), SAV_BASE_PATH, target=FINAL_TARGETS, shift=shift)
                grid_df = pd.concat([BASE_GRID_DF, rolling_features_df.iloc[:, 3:]], axis=1)
                grid_df = grid_df[grid_df['store_id']==store_id]
                del rolling_features_df
                lag_columns = [f'sales_lag_{i}' for i in range(shift,shift+15)]
                train_model(grid_df, NUM_COLUMNS+lag_columns, CAT_COLUMNS, target, shift, SAV_BASE_PATH)

lgb train CA_1 4 sales
Train CA_1


New categorical_feature is ['cat_id', 'dept_id', 'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2', 'item_id', 'snap_CA', 'snap_TX', 'snap_WI']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[100]	valid_0's rmse: 2.12383
[200]	valid_0's rmse: 2.10996
[300]	valid_0's rmse: 2.10514
[400]	valid_0's rmse: 2.10232
[500]	valid_0's rmse: 2.0988
[600]	valid_0's rmse: 2.09385
[700]	valid_0's rmse: 2.08838
[800]	valid_0's rmse: 2.08182
[900]	valid_0's rmse: 2.07616
[1000]	valid_0's rmse: 2.07125
[1100]	valid_0's rmse: 2.06655
[1200]	valid_0's rmse: 2.06257
[1300]	valid_0's rmse: 2.05858
[1400]	valid_0's rmse: 2.05494
Train CA_2
[100]	valid_0's rmse: 2.12383
[200]	valid_0's rmse: 2.10996
[300]	valid_0's rmse: 2.10514
[400]	valid_0's rmse: 2.10232
[500]	valid_0's rmse: 2.0988
[600]	valid_0's rmse: 2.09385
[700]	valid_0's rmse: 2.08838
[800]	valid_0's rmse: 2.08182
[900]	valid_0's rmse: 2.07616
[1000]	valid_0's rmse: 2.07125
[1100]	valid_0's rmse: 2.06655
[1200]	valid_0's rmse: 2.06257
[1300]	valid_0's rmse: 2.05858
[1400]	valid_0's rmse: 2.05494
Train CA_3
[100]	valid_0's rmse: 2.12383
[200]	valid_0's rmse: 2.10996
[300]	valid_0's rmse: 2.10514
[400]	valid_0's rmse: 2.10232
[500]	vali

In [28]:
shift

4