In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs
from math import ceil
import lightgbm as lgb
from typing import Union
warnings.filterwarnings('ignore')

In [2]:
    
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

In [3]:
def rmse(y, y_pred):
    return np.sqrt(np.mean(np.square(y - y_pred)))

In [4]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read 
    # lags and mean encoding features
    # separately and drop items that we don't need.
    # As our Features Grids are aligned 
    # we can use index to keep only necessary rows
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit 
    
    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit 
    
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(LAG_DAY)).astype(np.float16)
    return lag_df[[col_name]]


def make_lag_roll(LAG_DAY):
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return lag_df[[col_name]]

In [5]:
########################### Init Metric
########################### https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834
#################################################################################
class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(self.group_ids):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def get_scale(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        return getattr(self, f'lv{lv}_scale')
        
    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')       
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())
        if VERBOSE:
            print(np.round(all_scores,3))
        return np.mean(all_scores)

    def full_score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())
        print(np.round(all_scores,3))
        return np.mean(all_scores)
    
class WRMSSEForLightGBM(WRMSSEEvaluator):

    def feval(self, preds, dtrain):
        preds = preds.reshape(self.valid_df[self.valid_target_columns].shape, order='F') #.transpose()
        score = self.score(preds)
        return 'WRMSSE', score, False
    
    def full_feval(self, preds, dtrain):
        preds = preds.reshape(self.valid_df[self.valid_target_columns].shape, order='F') #.transpose()
        score = self.full_score(preds)
        return 'WRMSSE', score, False
    
########################### Lgb evaluators
#################################################################################
def get_evaluators(items_ids):
    prices = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
    calendar = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
    train_fold_df = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
    train_fold_df = train_fold_df[train_fold_df['store_id'].isin(items_ids)].reset_index(drop=True)

    lgb_evaluators = []
    for i in range(2):
        temp_train = train_fold_df.iloc[:,:-28*(i+1)]
        if i==0:
            temp_valid = train_fold_df.iloc[:, -28*(i+1):]
        else:
            temp_valid = train_fold_df.iloc[:, -28*(i+1):-28*i]

        lgb_evaluator = WRMSSEForLightGBM(temp_train, temp_valid, calendar, prices)
        lgb_evaluators.append(lgb_evaluator)

    del train_fold_df, temp_train, temp_valid, prices, calendar
    return lgb_evaluators

In [6]:
########################### Model params
#################################################################################
import lightgbm as lgb
# lgb_params = {
#                     'boosting_type': 'gbdt',                      
#                     'metric': ['rmse'],           
#                     'subsample': 0.5,                
#                     'subsample_freq': 1,
#                     'learning_rate': 2000,           
#                     'num_leaves': 2**11-1,            
#                     'min_data_in_leaf': 2**12-1,     
#                     'feature_fraction': 0.5,
#                     'n_estimators': 1,            
#                     #'early_stopping_rounds': 30,     
#                     #'seed': SEED,
#                     'verbose': -1,
#                 } 

# Let's look closer on params

## 'boosting_type': 'gbdt'
# we have 'goss' option for faster training
# but it normally leads to underfit.
# Also there is good 'dart' mode
# but it takes forever to train
# and model performance depends 
# a lot on random factor 
# https://www.kaggle.com/c/home-credit-default-risk/discussion/60921

## 'objective': 'tweedie'
# Tweedie Gradient Boosting for Extremely
# Unbalanced Zero-inflated Data
# https://arxiv.org/pdf/1811.10192.pdf
# and many more articles about tweediie
#
# Strange (for me) but Tweedie is close in results
# to my own ugly loss.
# My advice here - make OWN LOSS function
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/140564
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/143070
# I think many of you already using it (after poisson kernel appeared) 
# (kagglers are very good with "params" testing and tuning).
# Try to figure out why Tweedie works.
# probably it will show you new features options
# or data transformation (Target transformation?).

## 'tweedie_variance_power': 1.1
# default = 1.5
# set this closer to 2 to shift towards a Gamma distribution
# set this closer to 1 to shift towards a Poisson distribution
# my CV shows 1.1 is optimal 
# but you can make your own choice

## 'metric': 'rmse'
# Doesn't mean anything to us
# as competition metric is different
# and we don't use early stoppings here.
# So rmse serves just for general 
# model performance overview.
# Also we use "fake" validation set
# (as it makes part of the training set)
# so even general rmse score doesn't mean anything))
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834

## 'subsample': 0.5
# Serves to fight with overfit
# this will randomly select part of data without resampling
# Chosen by CV (my CV can be wrong!)
# Next kernel will be about CV

##'subsample_freq': 1
# frequency for bagging
# default value - seems ok

## 'learning_rate': 0.03
# Chosen by CV
# Smaller - longer training
# but there is an option to stop 
# in "local minimum"
# Bigger - faster training
# but there is a chance to
# not find "global minimum" minimum

## 'num_leaves': 2**11-1
## 'min_data_in_leaf': 2**12-1
# Force model to use more features
# We need it to reduce "recursive"
# error impact.
# Also it leads to overfit
# that's why we use small 
# 'max_bin': 100

## l1, l2 regularizations
# https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
# Good tiny explanation
# l2 can work with bigger num_leaves
# but my CV doesn't show boost
                    
## 'n_estimators': 1400
# CV shows that there should be
# different values for each state/store.
# Current value was chosen 
# for general purpose.
# As we don't use any early stopings
# careful to not overfit Public LB.

##'feature_fraction': 0.5
# LightGBM will randomly select 
# part of features on each iteration (tree).
# We have maaaany features
# and many of them are "duplicates"
# and many just "noise"
# good values here - 0.5-0.7 (by CV)

## 'boost_from_average': False
# There is some "problem"
# to code boost_from_average for 
# custom loss
# 'True' makes training faster
# BUT carefull use it
# https://github.com/microsoft/LightGBM/issues/1514
# not our case but good to know cons

In [7]:
########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
# lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0              # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913         # End day of our train set 1913
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = True      # Use or not pretrained models
VERBOSE   = False 
#FEATURES to remove
## These features lead to overfit
## or values not present in test set
remove_features = ['id','state_id','items_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std'] 

#PATHS for Features
ORIGINAL = '../input/m5-forecasting-accuracy/'
BASE     = '../input/m5-simple-fe/grid_part_1.pkl'
PRICE    = '../input/m5-simple-fe/grid_part_2.pkl'
CALENDAR = '../input/m5-simple-fe/grid_part_3.pkl'
LAGS     = '../input/m5-lags-features/lags_df_28.pkl'
MEAN_ENC = '../input/m5-custom-features/mean_encoding_df.pkl'


# AUX(pretrained) Models paths
AUX_MODELS = '../input/store1-1600/'


#STORES ids
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())


#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]
ROLS_SPLIT = []
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])

In [8]:
########################### Aux Models
# If you don't want to wait hours and hours
# to have result you can train each store 
# in separate kernel and then just join result.

# If we want to use pretrained models we can 
## skip training 
## (in our case do dummy training
##  to show that we are good with memory
##  and you can safely use this (all kernel) code)
# if USE_AUX:
#     lgb_params['n_estimators'] = 4
    
# Here is some 'logs' that can compare
#Train CA_1
#[100]	valid_0's rmse: 2.02289
#[200]	valid_0's rmse: 2.0017
#[300]	valid_0's rmse: 1.99239
#[400]	valid_0's rmse: 1.98471
#[500]	valid_0's rmse: 1.97923
#[600]	valid_0's rmse: 1.97284
#[700]	valid_0's rmse: 1.96763
#[800]	valid_0's rmse: 1.9624
#[900]	valid_0's rmse: 1.95673
#[1000]	valid_0's rmse: 1.95201
#[1100]	valid_0's rmse: 1.9476
#[1200]	valid_0's rmse: 1.9434
#[1300]	valid_0's rmse: 1.9392
#[1400]	valid_0's rmse: 1.93446

#Train CA_2
#[100]	valid_0's rmse: 1.88949
#[200]	valid_0's rmse: 1.84767
#[300]	valid_0's rmse: 1.83653
#[400]	valid_0's rmse: 1.82909
#[500]	valid_0's rmse: 1.82265
#[600]	valid_0's rmse: 1.81725
#[700]	valid_0's rmse: 1.81252
#[800]	valid_0's rmse: 1.80736
#[900]	valid_0's rmse: 1.80242
#[1000]	valid_0's rmse: 1.79821
#[1100]	valid_0's rmse: 1.794
#[1200]	valid_0's rmse: 1.78973
#[1300]	valid_0's rmse: 1.78552
#[1400]	valid_0's rmse: 1.78158

In [9]:
def custom_loss(y_pred, y_true):
    grad = y_pred - y_true.get_label()
    hess = np.ones_like(y_pred)
    return grad, hess

In [10]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [11]:
#STORES_IDS[9:]

In [12]:
########################### Train Models#STORES_IDS[4:5]
#################################################################################
for store_id in STORES_IDS:
    print('Train', store_id)
    
    if store_id=='CA_1':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.03,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 1150, #1800,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }
    elif store_id=='CA_2':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.03,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 2000,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }
    elif store_id=='CA_3':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.03,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 1620,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }
    elif store_id=='CA_4':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.03,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 3000,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }   
    elif store_id=='TX_1':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.05,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 1620,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }    
    elif store_id=='TX_2':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.065,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 2200,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }        
    elif store_id=='TX_3':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.085,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 2800,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }    
    elif store_id=='WI_1':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.08,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 2600,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }
    elif store_id=='WI_2':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.085,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 3300,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }      
    elif store_id=='WI_3':
        lgb_params = {
                    'boosting_type': 'gbdt',                      
                    'metric': ['rmse'],           
                    'subsample': 0.5,                
                    'subsample_freq': 1,
                    'learning_rate': 0.08,           
                    'num_leaves': 2**11-1,            
                    'min_data_in_leaf': 2**12-1,     
                    'feature_fraction': 0.5,
                    'n_estimators': 2600,            
                    #'early_stopping_rounds': 30,     
                    #'seed': SEED,
                    'verbose': -1,
                }
    
    lgb_params['seed'] = SEED

    if USE_AUX:
        lgb_params['n_estimators'] = 2
        
    local_params = lgb_params.copy() 
   
    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)
    features_columns.remove('store_id')
    
    grid_df=reduce_mem_usage(grid_df, verbose=True)
    #print(grid_df.head())
    
    lgb_evaluators = get_evaluators(list(grid_df['store_id'].unique()))
    #
    #gc.collect()
    
    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validatio set)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')
    
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df
    block=0
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(
                        local_params,
                        train_data,
                        valid_sets = [valid_data],
                        verbose_eval = 20,
                        fobj = custom_loss, 
                        feval = lgb_evaluators[block].feval,
                    )
    
#     estimator = lgb.train(lgb_params,
#                           train_data,
#                           valid_sets = [valid_data],
#                           verbose_eval = 100,
#                           )
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
Mem. usage decreased to 618.06 Mb (4.2% reduction)
Train CA_2
Mem. usage decreased to 563.07 Mb (4.2% reduction)
Train CA_3
Mem. usage decreased to 614.07 Mb (4.2% reduction)
Train CA_4
Mem. usage decreased to 600.59 Mb (4.2% reduction)
Train TX_1
Mem. usage decreased to 619.30 Mb (4.2% reduction)
Train TX_2
Mem. usage decreased to 620.58 Mb (4.2% reduction)
Train TX_3
Mem. usage decreased to 611.48 Mb (4.2% reduction)
Train WI_1
Mem. usage decreased to 588.77 Mb (4.2% reduction)
Train WI_2
Mem. usage decreased to 599.82 Mb (4.2% reduction)
Train WI_3
Mem. usage decreased to 615.97 Mb (4.2% reduction)


In [13]:
#MODEL_FEATURES = features_columns  stop

In [14]:
########################### Predict
#################################################################################

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        print(grid_df[mask][MODEL_FEATURES])
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
              item_id    dept_id   cat_id  release  sell_price  price_max  \
304872  HOBBIES_1_001  HOBBIES_1  HOBBIES      224    8.382812   9.578125   
304873  HOBBIES_1_002  HOBBIES_1  HOBBIES       20    3.970703   3.970703   
304874  HOBBIES_1_003  HOBBIES_1  HOBBIES      300    2.970703   2.970703   
304875  HOBBIES_1_004  HOBBIES_1  HOBBIES        5    4.640625   4.640625   
304876  HOBBIES_1_005  HOBBIES_1  HOBBIES       16    2.880859   3.080078   
...               ...        ...      ...      ...         ...        ...   
307916    FOODS_3_823    FOODS_3    FOODS      127    2.980469   2.980469   
307917    FOODS_3_824    FOODS_3    FOODS        0    2.480469   2.679688   
307918    FOODS_3_825    FOODS_3    FOODS        1    3.980469   4.378906   
307919    FOODS_3_826    FOODS_3    FOODS      211    1.280273   1.280273   
307920    FOODS_3_827    FOODS_3    FOODS      403    1.000000   1.000000   

        price_min  price_std  price_mean  price_norm  ... 

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.754883,0.710449,0.723145,0.757324,0.931152,1.017578,1.149414,0.851074,0.856445,...,0.770996,0.943359,0.963867,0.812988,0.757812,0.724121,0.772949,0.876953,1.034180,0.944824
1,HOBBIES_1_002_CA_1_validation,0.191895,0.187134,0.182861,0.181030,0.222412,0.264648,0.295898,0.237549,0.222534,...,0.207520,0.222778,0.225098,0.194946,0.177856,0.185059,0.178589,0.192871,0.232544,0.247925
2,HOBBIES_1_003_CA_1_validation,0.445801,0.421631,0.418701,0.417480,0.580566,0.742676,0.761719,0.493164,0.496094,...,0.541992,0.700195,0.704102,0.445801,0.417236,0.445068,0.479980,0.608887,0.724121,0.685059
3,HOBBIES_1_004_CA_1_validation,1.622070,1.277344,1.333008,1.523438,1.917969,2.878906,3.189453,1.768555,1.469727,...,1.861328,2.650391,3.246094,1.698242,1.451172,1.448242,1.424805,2.052734,3.144531,3.394531
4,HOBBIES_1_005_CA_1_validation,0.942871,0.853027,0.860352,0.940430,1.067383,1.469727,1.518555,0.978516,0.969238,...,1.039062,1.597656,1.598633,1.072266,0.863770,0.962891,0.931152,1.103516,1.573242,1.517578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0.341064,0.305908,0.305908,0.291260,0.352295,0.438965,0.446533,0.415771,0.495605,...,0.464111,0.502930,0.563965,0.436035,0.378418,0.362061,0.356934,0.375977,0.407959,0.475830
30486,FOODS_3_824_WI_3_validation,0.265625,0.220215,0.183350,0.205322,0.247070,0.320801,0.316162,0.354492,0.364990,...,0.264160,0.410645,0.468262,0.283936,0.231201,0.213257,0.189331,0.194458,0.261963,0.265381
30487,FOODS_3_825_WI_3_validation,0.622559,0.521973,0.471191,0.488525,0.628418,0.775391,0.947266,1.078125,1.172852,...,0.919922,1.304688,1.520508,0.995117,0.659668,0.631348,0.598145,0.660645,0.791504,0.841309
30488,FOODS_3_826_WI_3_validation,0.998535,0.908691,0.758301,0.803223,0.883301,1.219727,1.148438,1.193359,1.250000,...,0.966309,1.224609,1.349609,0.951660,0.908203,0.859375,0.793945,0.947266,1.057617,1.175781


In [15]:
########################### Export
#################################################################################
# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v1'+str(VER)+'.csv', index=False)

In [16]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.754883,0.710449,0.723145,0.757324,0.931152,1.017578,1.149414,0.851074,0.856445,...,0.770996,0.943359,0.963867,0.812988,0.757812,0.724121,0.772949,0.876953,1.03418,0.944824
1,HOBBIES_1_002_CA_1_validation,0.191895,0.187134,0.182861,0.18103,0.222412,0.264648,0.295898,0.237549,0.222534,...,0.20752,0.222778,0.225098,0.194946,0.177856,0.185059,0.178589,0.192871,0.232544,0.247925
2,HOBBIES_1_003_CA_1_validation,0.445801,0.421631,0.418701,0.41748,0.580566,0.742676,0.761719,0.493164,0.496094,...,0.541992,0.700195,0.704102,0.445801,0.417236,0.445068,0.47998,0.608887,0.724121,0.685059
3,HOBBIES_1_004_CA_1_validation,1.62207,1.277344,1.333008,1.523438,1.917969,2.878906,3.189453,1.768555,1.469727,...,1.861328,2.650391,3.246094,1.698242,1.451172,1.448242,1.424805,2.052734,3.144531,3.394531
4,HOBBIES_1_005_CA_1_validation,0.942871,0.853027,0.860352,0.94043,1.067383,1.469727,1.518555,0.978516,0.969238,...,1.039062,1.597656,1.598633,1.072266,0.86377,0.962891,0.931152,1.103516,1.573242,1.517578
