## Set the path to the top-level project folder
Folder name: SUBMISSION MODEL

In [1]:
# Root of the project tree; the data, log and model paths used in this notebook
# are built from it by plain string concatenation, so keep the trailing slash.
dir_ = 'INPUT-PROJECT-DIRECTORY/submission_model/' # the only path that has to be edited

#### Set the other directories (derived from `dir_`)

In [2]:
# Sub-directories of the project root `dir_`.
raw_data_dir = f'{dir_}2. data/'
# Pickled feature tables that prepare_data() reads.
processed_data_dir = f'{dir_}2. data/processed/'
log_dir = f'{dir_}4. logs/'
# Destination of the pickled estimators written by the training cell.
model_dir = f'{dir_}5. models/'

In [3]:
####################################################################################
################## 2-3. nonrecursive model by store & dept #########################
####################################################################################

In [4]:
# Validation splits to train for; every entry is used as a key into the
# `validation` dict by the training cell.
cvs = ['private']

In [5]:
# Store ids: '<state>_<n>' for four CA stores and three each for TX and WI.
STORES = [
    f'{prefix}_{number}'
    for prefix, count in (('CA', 4), ('TX', 3), ('WI', 3))
    for number in range(1, count + 1)
]
# Department ids: '<category>_<n>'.
DEPTS = [
    f'{prefix}_{number}'
    for prefix, count in (('HOBBIES', 2), ('HOUSEHOLD', 2), ('FOODS', 3))
    for number in range(1, count + 1)
]

In [6]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

import os, sys, gc, time, warnings, pickle, psutil, random

warnings.filterwarnings('ignore')

In [7]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (including int8) is left untouched.
    Integer columns become the smallest of int8/int16/int32/int64 whose range
    contains the column's min and max. Float columns become float16 or
    float32 when the value range allows it, and float64 otherwise.

    The range checks are inclusive, so a column whose extreme value sits
    exactly on a dtype limit (e.g. 127 or -128 for int8) still gets the
    narrow type.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # min/max are NaN for an empty column; every comparison is then
            # false and the column keeps its dtype.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: only the value *range* is checked. float16 keeps roughly
            # three significant decimal digits, so precision can be lost.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns stay/are float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [8]:
# Smallest value of column `d` kept by prepare_data(); earlier rows are dropped.
FIRST_DAY = 710

# Columns excluded from the model inputs by the training cell.
# 'item_id' is intentionally not listed here, so it remains a model feature.
remove_feature = ['id',
                  'state_id',
                  'store_id',
                  'dept_id',
                  'cat_id',
                  'date', 'wm_yr_wk', 'd', 'sales']

cat_var = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Drop the removed columns while keeping the declaration order. A set
# difference would give an order that changes between interpreter runs.
cat_var = [name for name in cat_var if name not in remove_feature]

In [9]:
# Columns taken from grid_part_2.pkl.
grid2_colnm = [
    'sell_price',
    'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm',
    'price_nunique', 'item_nunique',
    'price_momentum', 'price_momentum_m', 'price_momentum_y',
]

# Columns taken from grid_part_3.pkl.
grid3_colnm = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end',
]

# Columns taken from lags_df_28.pkl: sales_lag_28 .. sales_lag_42, followed by
# rolling mean/std pairs for each window length.
lag_colnm = (
    [f'sales_lag_{shift}' for shift in range(28, 43)]
    + [f'rolling_{stat}_{window}'
       for window in (7, 14, 30, 60, 180)
       for stat in ('mean', 'std')]
)

# Columns taken from mean_encoding_df.pkl.
mean_enc_colnm = ['enc_item_id_store_id_mean', 'enc_item_id_store_id_std']

In [10]:
########################### Make grid
#################################################################################
def prepare_data(store, state):
    """Assemble the feature table for one store/department pair.

    The three grid pickles are joined column-wise, the rows are restricted to
    ``store_id == store``, ``dept_id == state`` and ``d >= FIRST_DAY``, and
    the lag and mean-encoding columns are attached for the surviving index.
    The result is passed through ``reduce_mem_usage`` before being returned.

    Args:
        store: Value matched against the ``store_id`` column.
        state: Value matched against the ``dept_id`` column (despite its
            name, this is a department id, not a state).

    Returns:
        The assembled DataFrame.
    """
    base_parts = [
        pd.read_pickle(processed_data_dir+"grid_part_1.pkl"),
        pd.read_pickle(processed_data_dir+"grid_part_2.pkl")[grid2_colnm],
        pd.read_pickle(processed_data_dir+"grid_part_3.pkl")[grid3_colnm],
    ]
    grid_df = pd.concat(base_parts, axis=1)
    del base_parts
    gc.collect()

    wanted_rows = (grid_df['store_id'] == store) & (grid_df['dept_id'] == state)
    grid_df = grid_df[wanted_rows]
    grid_df = grid_df[grid_df['d'] >= FIRST_DAY]

    # Attach the auxiliary feature tables one after another, keeping only the
    # rows whose index is present in the filtered grid.
    auxiliary_tables = (
        ("lags_df_28.pkl", lag_colnm),
        ("mean_encoding_df.pkl", mean_enc_colnm),
    )
    for file_name, columns in auxiliary_tables:
        extra = pd.read_pickle(processed_data_dir+file_name)[columns]
        extra = extra[extra.index.isin(grid_df.index)]
        grid_df = pd.concat([grid_df, extra], axis=1)
        del extra
        gc.collect()

    return reduce_mem_usage(grid_df)

In [11]:
# Day-index pair [start, end] for every validation split. The training cell
# fits on rows with d <= start and evaluates on rows with start < d <= end.
validation = dict(
    cv1=[1551, 1610],
    cv2=[1829, 1857],
    cv3=[1857, 1885],
    cv4=[1885, 1913],
    public=[1913, 1941],
    private=[1941, 1969],
)

### cv1 : 2015-04-28 ~ 2015-06-26

### cv2 : 2016-02-01 ~ 2016-02-28

### cv3 : 2016-02-29 ~ 2016-03-27

### cv4 : 2016-03-28 ~ 2016-04-24

In [12]:
########################### Model params
#################################################################################
# Keyword arguments unpacked into LGBMRegressor(**lgb_params) by the training cell.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.015,
    num_leaves=2**8 - 1,
    min_data_in_leaf=2**8 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=3000,
    boost_from_average=False,
    verbose=-1,
    seed=1995,
)

In [None]:
########################### Train Models
#################################################################################
# For every requested validation split, fit one QRX-wrapped LGBMRegressor per
# (store, department) pair and pickle the fitted estimator into `model_dir`.
from lightgbm import LGBMRegressor
from gluonts.model.rotbaum._model import QRX
rmsse_bycv = dict()

for cv in cvs:
    print('cv : day', validation[cv])

    pred_list = []
    for store in STORES:
        # NOTE: `state` holds a department id here; the name is kept to match
        # the signature of prepare_data().
        for state in DEPTS:

            print(store,state, 'start')
            grid_df = prepare_data(store, state)

            model_var = grid_df.columns[~grid_df.columns.isin(remove_feature)]

            tr_mask = (grid_df['d'] <= validation[cv][0]) & (grid_df['d'] >= FIRST_DAY)
            vl_mask = (grid_df['d'] > validation[cv][0]) & (grid_df['d'] <= validation[cv][1])

            # Materialise each row selection once instead of re-filtering the
            # frame for every argument below.
            train_df = grid_df[tr_mask]
            valid_df = grid_df[vl_mask]

            estimator = QRX(model=LGBMRegressor(**lgb_params),
                        min_bin_size=200)
            estimator.fit(
                train_df[model_var],
                train_df['sales'],
                max_sample_size=1000000,
                seed=1,
                # Evaluation order: validation window first, training rows second.
                eval_set=[
                    (valid_df[model_var], valid_df['sales']),
                    (train_df[model_var], train_df['sales']),
                ],
                verbose=100,
                x_train_is_dataframe=True
            )
            model_name = model_dir+'non_recur_model_'+store+'_'+state+'.bin'
            # Use a context manager so the file is flushed and closed even if
            # pickling raises.
            with open(model_name, 'wb') as model_file:
                pickle.dump(estimator, model_file)

            # Release the large per-pair objects before loading the next pair.
            del grid_df, train_df, valid_df, estimator, tr_mask, vl_mask; gc.collect()

cv : day [1941, 1969]
CA_1 HOBBIES_1 start
[100]	valid_0's rmse: 1.76965	valid_1's rmse: 2.56407
[200]	valid_0's rmse: 2.1041	valid_1's rmse: 2.45938
[300]	valid_0's rmse: 2.19545	valid_1's rmse: 2.40284
[400]	valid_0's rmse: 2.21621	valid_1's rmse: 2.35022
[500]	valid_0's rmse: 2.21806	valid_1's rmse: 2.30324
[600]	valid_0's rmse: 2.22061	valid_1's rmse: 2.25746
[700]	valid_0's rmse: 2.22182	valid_1's rmse: 2.21407
[800]	valid_0's rmse: 2.22035	valid_1's rmse: 2.17136
[900]	valid_0's rmse: 2.21246	valid_1's rmse: 2.12996
[1000]	valid_0's rmse: 2.21424	valid_1's rmse: 2.08958
[1100]	valid_0's rmse: 2.20179	valid_1's rmse: 2.04951
[1200]	valid_0's rmse: 2.20023	valid_1's rmse: 2.01059
[1300]	valid_0's rmse: 2.19463	valid_1's rmse: 1.97334
[1400]	valid_0's rmse: 2.18705	valid_1's rmse: 1.93599
[1500]	valid_0's rmse: 2.18653	valid_1's rmse: 1.89932
[1600]	valid_0's rmse: 2.18435	valid_1's rmse: 1.8648
[1700]	valid_0's rmse: 2.17788	valid_1's rmse: 1.83184
[1800]	valid_0's rmse: 2.17166	va

[400]	valid_0's rmse: 2.37144	valid_1's rmse: 1.97145
[500]	valid_0's rmse: 2.35977	valid_1's rmse: 1.92399
[600]	valid_0's rmse: 2.35328	valid_1's rmse: 1.88372
[700]	valid_0's rmse: 2.34849	valid_1's rmse: 1.84789
[800]	valid_0's rmse: 2.33856	valid_1's rmse: 1.81521
[900]	valid_0's rmse: 2.33635	valid_1's rmse: 1.78406
[1000]	valid_0's rmse: 2.32503	valid_1's rmse: 1.75536
[1100]	valid_0's rmse: 2.31868	valid_1's rmse: 1.72732
[1200]	valid_0's rmse: 2.31701	valid_1's rmse: 1.70153
[1300]	valid_0's rmse: 2.31052	valid_1's rmse: 1.676
[1400]	valid_0's rmse: 2.30551	valid_1's rmse: 1.65112
[1500]	valid_0's rmse: 2.29228	valid_1's rmse: 1.62783
[1600]	valid_0's rmse: 2.2864	valid_1's rmse: 1.60531
[1700]	valid_0's rmse: 2.28195	valid_1's rmse: 1.58323
[1800]	valid_0's rmse: 2.28022	valid_1's rmse: 1.56198
[1900]	valid_0's rmse: 2.27783	valid_1's rmse: 1.54164
[2000]	valid_0's rmse: 2.27543	valid_1's rmse: 1.52168
[2100]	valid_0's rmse: 2.27075	valid_1's rmse: 1.50265
[2200]	valid_0's rm