## Please input your directory for the top level folder
folder name : SUBMISSION MODEL

In [1]:
dir_ = 'INPUT-PROJECT-DIRECTORY/submission_model/' # input only here

#### setting other directory

In [2]:
raw_data_dir = dir_+'2. data/'
processed_data_dir = dir_+'2. data/processed/'
log_dir = dir_+'4. logs/'
model_dir = dir_+'5. models/'
submission_dir = dir_+'6. submissions/'

In [3]:
####################################################################################
##################### 1-3. recursive model by store & dept #########################
####################################################################################

In [4]:
ver, KKK = 'priv', 0

In [5]:
STORES = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
DEPTS = ['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']

In [6]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool

warnings.filterwarnings('ignore')

In [7]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

In [8]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store, dept):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    df = df[df['d']>=START_TRAIN]
    
    df = df[(df['store_id']==store) & (df['dept_id']==dept)]

    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2
    
    df = pd.concat([df, df3], axis=1)
    del df3
    
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    df = df.reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES:
        for state_id in DEPTS:
            temp_df = pd.read_pickle(processed_data_dir+'test_'+store_id+'_'+state_id+'.pkl')
            temp_df['store_id'] = store_id
            temp_df['dept_id'] = state_id
            base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(LAG_DAY)).astype(np.float16)
    return lag_df[[col_name]]


def make_lag_roll(LAG_DAY):
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return lag_df[[col_name]]

In [9]:
########################### Model params
#################################################################################
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.015,
                    'num_leaves': 2**8-1,
                    'min_data_in_leaf': 2**8-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 3000,
                    'boost_from_average': False,
                    'verbose': -1
                } 

In [10]:
########################### Vars
#################################################################################
VER = 1                          
SEED = 42                        
seed_everything(SEED)            
lgb_params['seed'] = SEED        
N_CORES = psutil.cpu_count()     


#LIMITS and const
TARGET      = 'sales'            
START_TRAIN = 700                
END_TRAIN   = 1941 - 28*KKK      
P_HORIZON   = 28                 
USE_AUX     = False             

remove_features = ['id','cat_id', 'state_id','store_id','dept_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_item_id_store_id_mean','enc_item_id_store_id_std'] 

ORIGINAL = raw_data_dir
BASE     = processed_data_dir+'grid_part_1.pkl'
PRICE    = processed_data_dir+'grid_part_2.pkl'
CALENDAR = processed_data_dir+'grid_part_3.pkl'
LAGS     = processed_data_dir+'lags_df_28.pkl'
MEAN_ENC = processed_data_dir+'mean_encoding_df.pkl'


#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]
ROLS_SPLIT = []
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])

In [11]:
_, MODEL_FEATURES = get_data_by_store(STORES[-1], DEPTS[-1])
del _; gc.collect()

0

In [12]:
def pred_q(quantile):
    print(quantile)
    all_preds = pd.DataFrame()

    # Join back the Test dataset with 
    # a small part of the training data 
    # to make recursive features

    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

    main_time = time.time()

    for PREDICT_DAY in range(1,29):    
        print('Predict | Day:', PREDICT_DAY)
        start_time = time.time()

        for store_id in STORES:
            for state_id in DEPTS:

                model_path = model_dir+'lgb_model_'+store_id+'_'+state_id+'_v'+str(VER)+'.bin'
                if USE_AUX:
                    model_path = AUX_MODELS + model_path

                estimator = pickle.load(open(model_path, 'rb'))

                day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
                store_mask = base_test['store_id']==store_id
                state_mask = base_test['dept_id']==state_id

                mask = (day_mask)&(store_mask)&(state_mask)
                print('starting to predict')
                base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES], float(quantile))

        temp_df = base_test[day_mask][['id',TARGET]]
        temp_df.columns = ['id','F'+str(PREDICT_DAY)]
        if 'id' in list(all_preds):
            all_preds = all_preds.merge(temp_df, on=['id'], how='left')
        else:
            all_preds = temp_df.copy()

        print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                      ' %0.2f min total |' % ((time.time() - main_time) / 60),
                      ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
        del temp_df

    all_preds = all_preds.reset_index(drop=True)
    all_preds
    
    ########################### Export
    #################################################################################
    submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
    submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
    submission.to_csv(submission_dir+f'before_ensemble/submission_kaggle_recursive_store_dept_{quantile}.csv', index=False)

In [13]:
base_test = get_base_test()

In [None]:
import concurrent.futures
for quantile in ['0.005', '0.025', '0.165', '0.250', '0.500', '0.750', '0.835', '0.975', '0.995']:
     with concurrent.futures.ThreadPoolExecutor(
        ) as executor:
            executor.submit(
                pred_q, quantile
            )

0.005
Predict | Day: 1
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predi

starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
##########  32.53 min round |  189.93 min total |  1179.03 day sales |
Predict | Day: 7
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to predict
starting to 