In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
import pickle
import random
import time

import lightgbm as lgb
import numpy as np
import pandas as pd


In [3]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from module.prepare_data import *

In [7]:
# Directory the raw competition CSV files are read from.
ORIGINAL = '../input/m5-forecasting-accuracy/'
# Distinct store ids, in order of first appearance in the sales file.
# Only the 'store_id' column is parsed; none of the other columns of this
# file are used here, so reading them would be wasted work.
STORES_IDS = list(
    pd.read_csv(ORIGINAL + 'sales_train_validation.csv', usecols=['store_id'])['store_id'].unique()
)
# Target column name: handed to the feature loaders and used as the label.
FINAL_TARGETS = 'sales'
# Passed to the feature loaders; also the directory pickled models are written to.
SAV_BASE_PATH = '../cache/ver2'
# Passed to load_base_features() as its first argument.
PKL_BASE_PATH = '../cache'
# Day-index bounds used by train_model(): rows with
# START_TRAIN < d <= END_TRAIN - P_HORIZON are fitted on, and the P_HORIZON
# days ending at END_TRAIN form the validation set.
START_TRAIN = 0
END_TRAIN = 1913
P_HORIZON = 28
# Seed handed to seed_everything() before each model fit.
SEED = 42
# Version tag embedded in the model file names.
VER = 2
# Parameter set handed unchanged to lgb.train() by train_model().
LGB_PARAMS = dict(
    boosting_type='gbdt',
    objective='regression',
    # tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2 ** 11 - 1,
    min_data_in_leaf=2 ** 12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=-1,
)

# Column names passed to train_model()/predict_samples() as the
# categorical feature group (appended after the numeric features).
CAT_COLUMNS = (
    ['item_id', 'dept_id', 'cat_id']
    + [f'event_{field}_{slot}' for slot in (1, 2) for field in ('name', 'type')]
    + [f'snap_{state}' for state in ('CA', 'TX', 'WI')]
)

# Numeric feature column names; the per-shift lag columns are appended to
# this list at each call site.
NUM_COLUMNS = (
    ['release', 'sell_price']
    + [f'price_{stat}' for stat in ('max', 'min', 'std', 'mean', 'norm', 'nunique')]
    + ['item_nunique', 'price_momentum', 'price_momentum_m', 'price_momentum_y']
    + [f'tm_{part}' for part in ('d', 'w', 'm', 'y', 'wm', 'dw', 'w_end')]
    + [
        f'enc_store_id_{group}_{stat}'
        for group in ('cat_id', 'dept_id', 'item_id', 'tm_dw_item_id', 'tm_dw')
        for stat in ('mean', 'std')
    ]
    + [
        f'rolling_{stat}_{window}'
        for window in (7, 14, 30, 60, 180)
        for stat in ('mean', 'std')
    ]
)


########################### root_mean_sqared_error
#################################################################################
def root_mean_sqared_error(y, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    Args:
        y: Observed values; must support element-wise ``y - y_pred``.
        y_pred: Predicted values, compatible with ``y`` for subtraction.

    Returns:
        The RMSE as computed by NumPy.
    """
    residuals = y - y_pred
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

########################### set seed
#################################################################################
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Value passed to both seeding functions (defaults to 0).
    """
    # Python's generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

########################### Train Models
#################################################################################
def train_model(grid_df, base_path, store_id, shift, features_columns, categorical_features, target, verbose=1):
    """Fit one LightGBM model on ``grid_df`` and pickle it to disk.

    Rows with ``START_TRAIN < d <= END_TRAIN - P_HORIZON`` are used for
    fitting; the following ``P_HORIZON`` days (up to and including
    ``END_TRAIN``) are passed to LightGBM as the validation set.

    Args:
        grid_df: DataFrame containing a numeric day column ``'d'``, the
            ``target`` column and every column named in the two feature lists.
        base_path: Directory the pickled model is written to.
        store_id: Store identifier; printed and embedded in the file name.
        shift: Shift value; only used in the file name.
        features_columns: List of feature column names.
        categorical_features: List of column names appended after
            ``features_columns`` to form the model input.
        target: Name of the label column.
        verbose: A progress line is printed when greater than 0.

    Returns:
        None. The model is written to
        ``{base_path}/lgb_model_{store_id}_shift{shift}_v{VER}.bin``.
    """
    if verbose > 0:
        print('Train', store_id)

    model_inputs = features_columns + categorical_features
    day = grid_df['d']
    train_mask = (day > START_TRAIN) & (day <= (END_TRAIN - P_HORIZON))
    valid_mask = (day > (END_TRAIN - P_HORIZON)) & (day <= (END_TRAIN))

    # Apply each row filter once and take features and label from the same
    # slice, rather than filtering the whole frame twice per split.
    train_rows = grid_df[train_mask]
    valid_rows = grid_df[valid_mask]
    train_data = lgb.Dataset(train_rows[model_inputs], label=train_rows[target])
    valid_data = lgb.Dataset(valid_rows[model_inputs], label=valid_rows[target])

    seed_everything(SEED)
    # NOTE(review): `verbose_eval` was removed from lgb.train() in LightGBM 4.0
    # (replacement: callbacks=[lgb.log_evaluation(100)]) - confirm the
    # installed version before upgrading.
    estimator = lgb.train(LGB_PARAMS, train_data, valid_sets=[valid_data], verbose_eval=100)

    model_name = f'{base_path}/lgb_model_{store_id}_shift{shift}_v{VER}.bin'
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)
    return

########################### Validation
#################################################################################
def predict_samples(grid_df, base_path, store_id, shift_day, features_columns, categorical_features, target, verbose=1):
    """Load the pickled model for a store/shift pair and predict on ``grid_df``.

    Args:
        grid_df: DataFrame containing every column named in the two feature
            lists and, when ``target`` is given, the ``target`` column.
        base_path: Directory holding the model files written by train_model().
        store_id: Store identifier used in the model file name and the log line.
        shift_day: Shift value used in the model file name and the log line.
        features_columns: List of feature column names.
        categorical_features: List of column names appended after
            ``features_columns`` to form the model input.
        target: Name of the label column to score against, or ``None`` to
            skip scoring.
        verbose: The score is printed when greater than 0.

    Returns:
        Tuple ``(y_pred, rmse_score)``; ``rmse_score`` is ``None`` when
        ``target`` is ``None``.
    """
    model_path = f'{base_path}/lgb_model_{store_id}_shift{shift_day}_v{VER}.bin'
    # SECURITY: unpickling can execute arbitrary code; only load model files
    # produced by this project.
    with open(model_path, 'rb') as model_file:
        estimator = pickle.load(model_file)

    y_pred = estimator.predict(grid_df[features_columns + categorical_features])

    if target is None:
        return y_pred, None

    rmse_score = root_mean_sqared_error(grid_df[target].values, y_pred)
    if verbose > 0:
        print(f'{store_id} {shift_day} rmse score {rmse_score}')
    return y_pred, rmse_score

In [5]:
# Base feature grid shared by every store/shift model below.
BASE_GRID_DF = load_base_features(PKL_BASE_PATH, SAV_BASE_PATH, FINAL_TARGETS)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
BASE_GRID_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 44 columns):
 #   Column                           Dtype   
---  ------                           -----   
 0   id                               category
 1   item_id                          category
 2   dept_id                          category
 3   cat_id                           category
 4   store_id                         category
 5   state_id                         category
 6   d                                int16   
 7   sales                            float64 
 8   release                          int16   
 9   sell_price                       float16 
 10  price_max                        float16 
 11  price_min                        float16 
 12  price_std                        float16 
 13  price_mean                       float16 
 14  price_norm                       float16 
 15  price_nunique                    float16 
 16  item_nunique                     i

In [None]:
# First shift to train; every store is trained for shifts start_shift..P_HORIZON.
start_shift = 1
for target in ['sales']:
    for model in ['lgb']:
        # The rolling-feature load and the full-frame concat take no store
        # argument, so they run once per shift and are reused for every store
        # instead of being recomputed for each (store, shift) pair.
        # NOTE(review): the joined frame now stays alive while the stores of
        # one shift are trained - confirm this fits in memory.
        for shift in range(start_shift, P_HORIZON + 1):
            # NOTE(review): the .copy() guards BASE_GRID_DF in case the loader
            # mutates its input - drop it if load_rolling_features does not.
            rolling_features_df = load_rolling_features(BASE_GRID_DF.copy(), SAV_BASE_PATH, target=FINAL_TARGETS, shift=shift)
            # The first three columns of the rolling frame are skipped
            # (presumably keys/target already present in BASE_GRID_DF - verify).
            shift_grid_df = pd.concat([BASE_GRID_DF, rolling_features_df.iloc[:, 3:]], axis=1)
            del rolling_features_df
            # Fifteen consecutive lag columns starting at the current shift.
            lag_columns = [f'sales_lag_{i}' for i in range(shift, shift + 15)]
            for store_id in STORES_IDS:
                print(f'{model} train {store_id} {shift} {target}')
                grid_df = shift_grid_df[shift_grid_df['store_id'] == store_id]
                train_model(grid_df, SAV_BASE_PATH, store_id, shift, NUM_COLUMNS + lag_columns, CAT_COLUMNS, target)
            # Release the joined frame before the next shift is loaded.
            del shift_grid_df

lgb train CA_1 1 sales
Train CA_1




In [36]:
# One record per (store, shift) with the validation RMSE of the stored model.
his = []
for target in ['sales']:
    for model in ['lgb']:
        # Rolling features and the joined frame take no store argument, so
        # they are built once per shift and reused for every store.
        for shift in range(1, P_HORIZON + 1):
            rolling_features_df = load_rolling_features(BASE_GRID_DF.copy(), SAV_BASE_PATH, target=FINAL_TARGETS, shift=shift)
            lag_columns = [f'sales_lag_{i}' for i in range(shift, shift + 15)]
            grid_df = pd.concat([BASE_GRID_DF, rolling_features_df.iloc[:, 3:]], axis=1)
            # Day scored for this shift inside the validation window, and the
            # matching day after END_TRAIN.
            valid_day = END_TRAIN - P_HORIZON + shift
            test_day = END_TRAIN + shift
            for store_id in STORES_IDS:
                print(f'test {model} {store_id} {shift}')
                store_mask = grid_df['store_id'] == store_id
                valid_grid_df = grid_df[store_mask & (grid_df['d'] == valid_day)]
                # Not used inside the loop; later cells read the value left
                # over from the final iteration.
                test_grid_df = grid_df[store_mask & (grid_df['d'] == test_day)]

                y_pred, score = predict_samples(valid_grid_df, SAV_BASE_PATH, store_id, shift, NUM_COLUMNS + lag_columns, CAT_COLUMNS, FINAL_TARGETS)
                his.append({'store_id': store_id, 'shift': shift, 'score': score})

# Restore the store-major, shift-minor record order of a store-outer loop.
his.sort(key=lambda record: (STORES_IDS.index(record['store_id']), record['shift']))
                
                
                
                
                

test lgb CA_1 1


Exception: stop

In [47]:
# Fifteen consecutive lag column names starting at the current `shift`.
lag_columns = [f'sales_lag_{shift + offset}' for offset in range(15)]

In [38]:
# Inspect the last rows of the validation slice left over from the loop above.
valid_grid_df.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end,enc_store_id_cat_id_mean,enc_store_id_cat_id_std,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_store_id_item_id_mean,enc_store_id_item_id_std,enc_store_id_tm_dw_item_id_mean,enc_store_id_tm_dw_item_id_std,enc_store_id_tm_dw_mean,enc_store_id_tm_dw_std,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7,sales_lag_8,sales_lag_9,sales_lag_10,sales_lag_11,sales_lag_12,sales_lag_13,sales_lag_14,sales_lag_15,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,rolling_mean_180,rolling_std_180
45177281,FOODS_3_823_CA_1_validation,FOODS_3_823,FOODS_3,FOODS,CA_1,CA,1886,0.0,127,2.880859,2.980469,2.480469,0.152222,2.755859,0.966309,5.0,161,1.0,1.019531,0.995117,,,,,0,0,0,28,13,3,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.807617,1.381836,0.810547,1.328125,1.570312,4.085938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.466553,1.041992,1.049805,1.333008,1.544922,1.797852
45177282,FOODS_3_824_CA_1_validation,FOODS_3_824,FOODS_3,FOODS,CA_1,CA,1886,0.0,0,2.480469,2.679688,2.470703,0.086365,2.630859,0.925293,3.0,138,0.925293,0.938477,0.962891,,,,,0,0,0,28,13,3,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.710938,1.195312,0.743652,1.189453,1.570312,4.085938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.960464e-08,0.0,0.0,0.0,5.960464e-08
45177283,FOODS_3_825_CA_1_validation,FOODS_3_825,FOODS_3,FOODS,CA_1,CA,1886,1.0,1,3.980469,4.378906,3.980469,0.189697,4.121094,0.908691,3.0,165,1.0,0.963379,1.0,,,,,0,0,0,28,13,3,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.958984,1.362305,0.98877,1.331055,1.570312,4.085938,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,3.0,3.0,0.856934,0.689941,0.856934,0.949219,1.099609,1.061523,0.866699,0.947266,0.733398,1.116211
45177284,FOODS_3_826_CA_1_validation,FOODS_3_826,FOODS_3,FOODS,CA_1,CA,1886,0.0,211,1.280273,1.280273,1.280273,0.0,1.280273,1.0,1.0,36,1.0,1.0,1.0,,,,,0,0,0,28,13,3,5,4,0,0,2.429688,5.925781,3.132812,7.273438,1.606445,1.863281,1.649414,1.834961,1.570312,4.085938,0.0,0.0,3.0,0.0,3.0,2.0,4.0,3.0,2.0,1.0,1.0,0.0,4.0,0.0,1.0,1.713867,1.704102,1.642578,1.549805,1.433594,1.356445,1.366211,1.583008,0.888672,1.374023
45177285,FOODS_3_827_CA_1_validation,FOODS_3_827,FOODS_3,FOODS,CA_1,CA,1886,3.0,403,1.0,1.0,1.0,0.0,1.0,1.0,1.0,137,1.0,1.0,1.0,,,,,0,0,0,28,13,3,5,4,0,0,2.429688,5.925781,3.132812,7.273438,3.474609,3.431641,3.087891,2.765625,1.570312,4.085938,3.0,6.0,4.0,3.0,1.0,5.0,4.0,9.0,6.0,5.0,1.0,2.0,1.0,3.0,5.0,3.714844,1.603516,3.785156,2.292969,4.234375,3.234375,3.699219,3.054688,3.849609,3.666016


In [39]:
# Inspect the last rows of the test slice left over from the loop above.
test_grid_df.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end,enc_store_id_cat_id_mean,enc_store_id_cat_id_std,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_store_id_item_id_mean,enc_store_id_item_id_std,enc_store_id_tm_dw_item_id_mean,enc_store_id_tm_dw_item_id_std,enc_store_id_tm_dw_mean,enc_store_id_tm_dw_std,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7,sales_lag_8,sales_lag_9,sales_lag_10,sales_lag_11,sales_lag_12,sales_lag_13,sales_lag_14,sales_lag_15,rolling_mean_7,rolling_std_7,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60,rolling_mean_180,rolling_std_180
46031001,FOODS_3_823_CA_1_validation,FOODS_3_823,FOODS_3,FOODS,CA_1,CA,1914,,127,2.980469,2.980469,2.480469,0.152222,2.755859,1.0,5.0,236,1.0,1.042969,1.030273,,,,,0,0,0,25,17,4,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.807617,1.381836,0.810547,1.328125,1.570312,4.085938,1.0,1.0,4.0,0.0,2.0,0.0,1.0,2.0,0.0,4.0,2.0,4.0,1.0,0.0,0.0,1.286133,1.379883,1.571289,1.504883,0.733398,1.285156,0.683105,1.242188,1.555664,1.825195
46031002,FOODS_3_824_CA_1_validation,FOODS_3_824,FOODS_3,FOODS,CA_1,CA,1914,,0,2.480469,2.679688,2.470703,0.086365,2.630859,0.925293,3.0,138,1.0,0.952637,0.962891,,,,,0,0,0,25,17,4,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.710938,1.195312,0.743652,1.189453,1.570312,4.085938,0.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.856934,0.899902,0.571289,0.755859,0.799805,1.5625,0.399902,1.166992,0.133301,0.696289
46031003,FOODS_3_825_CA_1_validation,FOODS_3_825,FOODS_3,FOODS,CA_1,CA,1914,,1,3.980469,4.378906,3.980469,0.189697,4.121094,0.908691,3.0,165,1.0,0.969238,1.0,,,,,0,0,0,25,17,4,5,4,0,0,2.429688,5.925781,3.132812,7.273438,0.958984,1.362305,0.98877,1.331055,1.570312,4.085938,2.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,3.0,2.0,1.0,1.142578,1.069336,1.286133,0.994629,1.133789,0.899414,1.099609,0.986328,0.850098,1.075195
46031004,FOODS_3_826_CA_1_validation,FOODS_3_826,FOODS_3,FOODS,CA_1,CA,1914,,211,1.280273,1.280273,1.280273,0.0,1.280273,1.0,1.0,36,1.0,1.0,1.0,,,,,0,0,0,25,17,4,5,4,0,0,2.429688,5.925781,3.132812,7.273438,1.606445,1.863281,1.649414,1.834961,1.570312,4.085938,0.0,2.0,3.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.291016,0.928711,1.328125,0.933105,1.412109,1.183594,1.396484,1.016602,1.404297
46031005,FOODS_3_827_CA_1_validation,FOODS_3_827,FOODS_3,FOODS,CA_1,CA,1914,,403,1.0,1.0,1.0,0.0,1.0,1.0,1.0,137,1.0,1.0,1.0,,,,,0,0,0,25,17,4,5,4,0,0,2.429688,5.925781,3.132812,7.273438,3.474609,3.431641,3.087891,2.765625,1.570312,4.085938,21.0,3.0,5.0,3.0,3.0,0.0,14.0,19.0,5.0,5.0,8.0,3.0,7.0,5.0,11.0,7.0,7.59375,7.214844,6.316406,5.535156,5.0,4.816406,4.242188,4.035156,3.894531


In [46]:
# Display the lag feature names currently bound to `lag_columns`.
lag_columns

['sales_lag_16',
 'sales_lag_17',
 'sales_lag_18',
 'sales_lag_19',
 'sales_lag_20',
 'sales_lag_21',
 'sales_lag_22',
 'sales_lag_23',
 'sales_lag_24',
 'sales_lag_25',
 'sales_lag_26',
 'sales_lag_27',
 'sales_lag_28',
 'sales_lag_29',
 'sales_lag_30']

In [58]:
# Score the stored model on the validation slice for the current store/shift.
y_pred, score = predict_samples(
    grid_df=valid_grid_df,
    base_path=SAV_BASE_PATH,
    store_id=store_id,
    shift_day=shift,
    features_columns=NUM_COLUMNS + lag_columns,
    categorical_features=CAT_COLUMNS,
    target=FINAL_TARGETS,
)

CA_1 1 rmse score 2.0721474071529995


In [59]:

# Predict on the test slice; no target is given, so `score` comes back as None.
y_pred, score = predict_samples(
    grid_df=test_grid_df,
    base_path=SAV_BASE_PATH,
    store_id=store_id,
    shift_day=shift,
    features_columns=NUM_COLUMNS + lag_columns,
    categorical_features=CAT_COLUMNS,
    target=None,
)

In [68]:
# One row per id of the test slice, plus the forecast column for this shift.
forecast_column = f'F{shift}'
submission = test_grid_df[['id']].copy()
submission = submission.reset_index(drop=True)
submission[forecast_column] = y_pred

In [69]:
# Display the submission frame built in the previous cell.
submission

Unnamed: 0,id,F1
0,HOBBIES_1_001_CA_1_validation,0.761683
1,HOBBIES_1_002_CA_1_validation,0.124459
2,HOBBIES_1_003_CA_1_validation,0.294389
3,HOBBIES_1_004_CA_1_validation,1.335258
4,HOBBIES_1_005_CA_1_validation,0.854439
...,...,...
3044,FOODS_3_823_CA_1_validation,1.193504
3045,FOODS_3_824_CA_1_validation,0.766795
3046,FOODS_3_825_CA_1_validation,0.980805
3047,FOODS_3_826_CA_1_validation,1.020902


In [None]:
        temp_df = base_test[day_mask][['id',TARGET]]
        temp_df.columns = ['id','F'+str(PREDICT_DAY)]
        if 'id' in list(all_preds):
            all_preds = all_preds.merge(temp_df, on=['id'], how='left')
        else:
            all_preds = temp_df.copy()