# LightGBM (lgb) model
- <font color=red>Check before running:</font>
    - <font color=red>store id</font>
    - <font color=red>features</font>
    - <font color=red>version</font>
    - <font color=red>original file</font>


In [1]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random, time
warnings.filterwarnings('ignore')

from math import ceil
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

## Load data

In [2]:
# Project directories, relative to the notebook's working directory.
# Adjust these three paths when running on a different machine.
MainFilePath    = 'MainData/'   # feature pickles read by get_data_by_store
ModelFilePath   = 'Model/'      # trained models and per-store test frames are stored here
PredictFilePath = 'Predict/'    # the final prediction table is written here

In [3]:
def get_data_by_store(store):
    """Load the full feature table and keep only the rows of one store.

    Three pickles under ``MainFilePath`` are joined column-wise with
    ``pd.concat(axis=1)`` (i.e. aligned on their index):

    * ``grid_part123.pkl``  - all columns
    * ``lag_rolling.pkl``   - everything after the first 3 columns
    * ``encoding.pkl``      - everything after the first 10 columns

    NOTE(review): the skipped leading columns are presumably key columns
    that the grid already contains - confirm against the feature notebook.

    Parameters
    ----------
    store
        Value compared with the ``store_id`` column.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``store_id`` equals ``store`` (original index kept).
    """
    # (file name, index of the first column to keep)
    sources = (
        ('grid_part123.pkl', 0),
        ('lag_rolling.pkl', 3),
        ('encoding.pkl', 10),
    )
    frames = [pd.read_pickle(MainFilePath + file_name).iloc[:, first_col:]
              for file_name, first_col in sources]

    combined = pd.concat(frames, axis=1)
    del frames  # release the per-file frames before filtering

    # Leave only the requested store
    return combined[combined['store_id'] == store]

In [4]:
def seed_everything(seed=0):
    """Seed the global random generators of ``random`` and ``numpy``.

    Parameters
    ----------
    seed : int, default 0
        Value handed to ``random.seed`` and then to ``np.random.seed``.
    """
    for set_seed in (random.seed, np.random.seed):
        set_seed(seed)

## Set global vars

In [5]:
# Run configuration
VER = 5          # model version; embedded in the model and prediction file names
SEED = 1         # seed for the Python/NumPy RNGs and for LightGBM ('seed' in lgb_params)
USE_AUX = True   # use pretrained models or not (not referenced in the cells shown here)

seed_everything(SEED)

In [6]:
TARGET = 'sales'     # label column
START_TRAIN = 0      # not referenced in the cells shown here
# END_TRAIN = 1913   # alternative setting, kept for reference
END_TRAIN = 1941     # last day index (column 'd') included in training
P_HORIZON = 28       # length in days of the validation window

# One model is trained per store.
STORES_IDS = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]

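- Train on all days up to and including `END_TRAIN` (1941 as configured above; 1913 in the alternative setting).
- The "fake" validation set is the last 28 days (`P_HORIZON`) of that training range (days 1914–1941 here), so it overlaps the training data and only serves as a monitoring proxy.
- The test (prediction) period is the 28 days after `END_TRAIN`: days 1942–1969 here (1914–1941 when `END_TRAIN = 1913`).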

## Model train

<font color=red>**Pay attention to feature selection**</font>

In [7]:
# Columns that lead both reference lists below.
_LEADING_COLUMNS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
                    'sales', 'release', 'wm_yr_wk']

# Grouping keys of the encoding columns; each key has a '_mean' and a '_std' column.
_ENCODING_KEYS = [
    'state_id', 'store_id', 'cat_id', 'dept_id',
    'state_id_cat_id', 'state_id_dept_id',
    'store_id_cat_id', 'store_id_dept_id',
    'item_id', 'item_id_state_id', 'item_id_store_id',
]

# Reference list: leading columns followed by enc_<key>_mean / enc_<key>_std pairs.
Customized_features = _LEADING_COLUMNS + [
    'enc_' + key + '_' + stat
    for key in _ENCODING_KEYS
    for stat in ('mean', 'std')
]

# Reference list: leading columns followed by price, event, SNAP and calendar columns.
Basic_features = _LEADING_COLUMNS + [
    # price
    'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
    'price_norm', 'price_nunique', 'item_nunique',
    'price_momentum_m', 'price_momentum_y', 'price_momentum',
    # events
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    # SNAP flags
    'snap_CA', 'snap_TX', 'snap_WI',
    # calendar
    'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_dw', 'tm_wm', 'tm_w_end',
]

# Lag_features are important

In [8]:
# Columns excluded from the model inputs (following Konstantin Yakovlev's idea
# of dropping some features). Any column of the loaded frame that is NOT listed
# here is used as a feature by the training loop.
remove_features = [
    # target and identifier columns
    TARGET, 'id', 'item_id', 'dept_id', 'cat_id', 'state_id', 'store_id',
    # date / day-index columns
    'date', 'wm_yr_wk', 'd',
    # count-type columns
    'price_nunique', 'item_nunique',
    # rolling statistics of the target
    'rolling_mean7', 'rolling_std7', 'rolling_mean28', 'rolling_std28',
    # encodings whose names do not involve store_id
    'enc_state_id_mean', 'enc_state_id_std',
    'enc_cat_id_mean', 'enc_cat_id_std',
    'enc_dept_id_mean', 'enc_dept_id_std',
    'enc_state_id_cat_id_mean', 'enc_state_id_cat_id_std',
    'enc_state_id_dept_id_mean', 'enc_state_id_dept_id_std',
    'enc_item_id_mean', 'enc_item_id_std',
    'enc_item_id_state_id_mean', 'enc_item_id_state_id_std',
]

In [9]:
# LightGBM training parameters shared by all per-store models.
lgb_params = {
    # objective and monitoring
    'boosting_type': 'gbdt',            # standard gradient-boosted trees
    'objective': 'tweedie',
    'metric': ['rmse'],                 # rmse on the validation set is only a proxy metric
    'tweedie_variance_power': 1.1,

    # row sampling (LightGBM aliases of bagging_fraction / bagging_freq)
    'subsample': 0.5,
    'subsample_freq': 1,

    # tree size and learning speed
    'learning_rate': 0.075,
    'num_leaves': 2**11-1,              # 2047 leaves per tree
    'min_data_in_leaf': 2**12-1,        # at least 4095 rows per leaf
    'feature_fraction': 0.5,            # fraction of features sampled per tree

    # boosting length
    'n_estimators': 1500,               # upper bound on the number of boosting rounds
    'max_bin': 100,
    'boost_from_average': False,
    'early_stopping_rounds': 30,        # stop when the validation metric stalls for 30 rounds

    # reproducibility and logging
    'seed': SEED,
    'verbose': -1,
}

In [10]:
# Train one LightGBM model per store and persist, for each store, both the
# model and the tail of the data needed later to rebuild lag/rolling features.
for store_id in STORES_IDS:
    print('Training '+store_id+' ...')

    # Feature table restricted to the current store.
    df_combined = get_data_by_store(store_id)
    # Model inputs: every loaded column that is not explicitly excluded.
    features_columns = [col for col in df_combined.columns if col not in remove_features]

    # Row masks:
    #   train - every day up to and including END_TRAIN
    #   valid - the last P_HORIZON days of the training range; it is a subset
    #           of the training rows, so the reported metric is only a proxy
    #   preds - the last 100 training days plus everything after END_TRAIN
    train_ind = df_combined['d'] <= END_TRAIN
    valid_ind = train_ind & (df_combined['d'] > (END_TRAIN - P_HORIZON))
    preds_ind = df_combined['d'] > (END_TRAIN - 100)

    # A single .loc call selects rows and columns in one step instead of
    # materialising an intermediate row-filtered frame.
    train_data = lgb.Dataset(df_combined.loc[train_ind, features_columns],
                             label=df_combined.loc[train_ind, TARGET])
    valid_data = lgb.Dataset(df_combined.loc[valid_ind, features_columns],
                             label=df_combined.loc[valid_ind, TARGET])

    # Save the part of the dataset needed for prediction, then drop the full
    # frame to free RAM before training.
    df_combined = df_combined[preds_ind].reset_index(drop=True)
    df_combined.to_pickle(ModelFilePath+'test_'+store_id+'.pkl')
    del df_combined

    # Training (re-seed so each store starts from the same RNG state).
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    # Persist the model; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    with open(ModelFilePath+model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Free some RAM before the next store.
    del train_data, valid_data, estimator
    gc.collect()

    # Column list (and order) the prediction cell must feed to the models.
    MODEL_FEATURES = features_columns

Training CA_1 ...
Training until validation scores don't improve for 30 rounds
[100]	valid_0's rmse: 2.01953
[200]	valid_0's rmse: 2.00714
[300]	valid_0's rmse: 1.99507
[400]	valid_0's rmse: 1.98447
[500]	valid_0's rmse: 1.97546
[600]	valid_0's rmse: 1.9669
[700]	valid_0's rmse: 1.96078
[800]	valid_0's rmse: 1.95455
[900]	valid_0's rmse: 1.9483
[1000]	valid_0's rmse: 1.94101
[1100]	valid_0's rmse: 1.935
[1200]	valid_0's rmse: 1.93014
[1300]	valid_0's rmse: 1.92448
[1400]	valid_0's rmse: 1.91884
[1500]	valid_0's rmse: 1.91277
Did not meet early stopping. Best iteration is:
[1500]	valid_0's rmse: 1.91277
Training CA_2 ...
Training until validation scores don't improve for 30 rounds
[100]	valid_0's rmse: 1.91214
[200]	valid_0's rmse: 1.88881
[300]	valid_0's rmse: 1.87615
[400]	valid_0's rmse: 1.86611
[500]	valid_0's rmse: 1.85678
[600]	valid_0's rmse: 1.84865
[700]	valid_0's rmse: 1.84066
[800]	valid_0's rmse: 1.83446
[900]	valid_0's rmse: 1.82839
[1000]	valid_0's rmse: 1.82181
[1100]	val

In [11]:
# NOTE(review): scratch cell with no effect on the pipeline - looks like a leftover; consider deleting.
2+2

4

## Predict

**Functions to make predictions**

In [12]:
def get_base_test():
    """Read the stored per-store test frames and stack them into one frame.

    For every store in ``STORES_IDS`` the file
    ``ModelFilePath + 'test_<store>.pkl'`` is read. The training cell writes
    these files with the rows where ``d > END_TRAIN - 100``, i.e. the last
    100 training days (history needed to compute lag and rolling features)
    plus the days to predict.

    The ``store_id`` column of each frame is overwritten with the store name
    before stacking.

    Returns
    -------
    pandas.DataFrame
        All stores concatenated row-wise with a fresh 0..n-1 index; an empty
        frame when ``STORES_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle(ModelFilePath+'test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not store_frames:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop re-copies all
    # previously read rows on every iteration.
    return pd.concat(store_frames).reset_index(drop=True)

In [13]:
# Day offsets / window lengths of the features rebuilt at prediction time.
shifts = [1, 7, 28]      # plain lags                -> 'lag<shift>'
windows = [7, 28]        # rolling on unshifted data -> 'rolling_mean<w>', 'rolling_std<w>'
lags = [7,28]            # lag applied before rolling
lag_windows = [7,28]     # window applied after the lag -> 'lag<lag>rolling<w>'


def make_lag_roll(df):
    """Recompute lag and rolling features of ``TARGET`` per ``id``, in place.

    Every feature is built with ``groupby('id')[TARGET].transform`` and so
    follows the row order of ``df`` within each id.
    NOTE(review): this assumes rows are ordered by day within each id - confirm.

    Columns written (existing ones are overwritten):

    * ``lag<s>`` for s in ``shifts`` - target shifted by s rows
    * ``rolling_mean<w>`` / ``rolling_std<w>`` for w in ``windows`` -
      rolling statistics of the unshifted target, cast to float16
    * ``lag<l>rolling<w>`` for l in ``lags`` and w in ``lag_windows`` -
      rolling mean of the target shifted by l rows, cast to float16

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'`` and ``TARGET``; it is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    def per_id(func):
        # Apply `func` to each id's target series, aligned with df's rows.
        return df.groupby('id')[TARGET].transform(func)

    for n_rows in shifts:
        df['lag' + str(n_rows)] = per_id(lambda s: s.shift(n_rows))

    for span in windows:
        suffix = str(span)
        df['rolling_mean' + suffix] = per_id(lambda s: s.rolling(span).mean()).astype(np.float16)
        df['rolling_std' + suffix] = per_id(lambda s: s.rolling(span).std()).astype(np.float16)

    for offset, span in [(lag, lag_window) for lag in lags for lag_window in lag_windows]:
        column = 'lag' + str(offset) + 'rolling' + str(span)
        df[column] = per_id(lambda s: s.shift(offset).rolling(span).mean()).astype(np.float16)

    return df

In [14]:
# Recursive day-by-day forecast: the prediction for day k is written back into
# base_test[TARGET], so the lag/rolling features of later days are built on
# top of the earlier predictions.
all_preds = pd.DataFrame()
base_test = get_base_test() # read data from all stores
main_time = time.time()

for PREDICT_DAY in range(1, P_HORIZON + 1):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Rebuild the features on a copy so base_test keeps only the raw columns
    # plus the predictions written so far.
    df_grid = make_lag_roll(base_test.copy())

    # Rows of the day being predicted. Computed once per day instead of being
    # taken from the last iteration of the store loop.
    day_ind = base_test['d']==(END_TRAIN+PREDICT_DAY)

    for store_id in STORES_IDS:
        # NOTE: pickle.load executes arbitrary code - only load model files
        # produced by the training cell of this notebook.
        model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
        with open(ModelFilePath+model_name, 'rb') as model_file:
            estimator = pickle.load(model_file)

        # Rows of the current store on the current day.
        total_ind = day_ind & (base_test['store_id']==store_id)

        # .loc writes into base_test itself. The former chained form
        # base_test[TARGET][total_ind] = ... may assign to a temporary copy
        # (and always does under pandas copy-on-write).
        base_test.loc[total_ind, TARGET] = estimator.predict(df_grid.loc[total_ind, MODEL_FEATURES])

    # Collect the day's predictions as column F<day>, keyed by id.
    temp_df = base_test.loc[day_ind, ['id', TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]

    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy() # attach 'id' on the first day

    print('#'*10, ' {:0.2f} min round |'.format(((time.time() - start_time) / 60)),
                  ' {:0.2f} min total |'.format(((time.time() - main_time) / 60)),
                  ' {:0.2f} day sales |'.format((temp_df['F'+str(PREDICT_DAY)].sum())))
    del temp_df

# Persist the wide table: one row per id, columns F1..F<P_HORIZON>.
all_preds = all_preds.reset_index(drop=True)
name = 'preds_all_ver'+ str(VER) +'.pkl'
all_preds.to_pickle(PredictFilePath+name)

Predict | Day: 1
##########  2.05 min round |  2.05 min total |  40135.17 day sales |
Predict | Day: 2
##########  2.05 min round |  4.11 min total |  38238.37 day sales |
Predict | Day: 3
##########  2.03 min round |  6.14 min total |  38784.35 day sales |
Predict | Day: 4
##########  2.02 min round |  8.16 min total |  37818.38 day sales |
Predict | Day: 5
##########  2.03 min round |  10.19 min total |  42831.77 day sales |
Predict | Day: 6
##########  2.00 min round |  12.19 min total |  50056.59 day sales |
Predict | Day: 7
##########  1.99 min round |  14.18 min total |  50739.56 day sales |
Predict | Day: 8
##########  1.99 min round |  16.17 min total |  46827.92 day sales |
Predict | Day: 9
##########  2.00 min round |  18.17 min total |  40191.24 day sales |
Predict | Day: 10
##########  1.96 min round |  20.14 min total |  44428.61 day sales |
Predict | Day: 11
##########  1.95 min round |  22.09 min total |  46219.84 day sales |
Predict | Day: 12
##########  1.96 min round 

In [15]:
# NOTE(review): scratch cell with no effect on the pipeline - looks like a leftover; consider deleting.
2+2

4