# Training notebook

This notebook performs the training of our models.

It also includes code to make predictions to submit in the Kaggle challenge.

In [2]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
import lightgbm as lgb
# custom imports
from multiprocessing import Pool        # Multiprocess Runs
import ipdb

warnings.filterwarnings('ignore')
import wandb
from wandb.lightgbm import wandb_callback

## Variables

In [3]:
# SEED = 42                        # We want all things
# seed_everything(SEED)            # to be as deterministic 
# lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()       # Available CPU cores
ver='finalRun'

#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1941               # End day of our train set
P_HORIZON   = 28                 # Prediction horizon

# weight_features = ['Weight','ScalingFactor','CombinedWeight']
# FEATURES to remove: These features lead to overfit or values not present in test set
remove_features = ['id','store_id','state_id',
                   'date','wm_yr_wk','d',TARGET] #+ weight_features
# additional featues we tried to remove
# remove_features = remove_features + ['cat_id','event_name_1','event_type_1','event_name_2','event_type_2',
#                                     'snap_CA','snap_TX','snap_WI',
#                                     'price_momentum','price_nunique','tm_w_end']
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                  'enc_dept_id_mean','enc_dept_id_std',
                  'enc_item_id_mean','enc_item_id_std']
# mean features which we used with running with reduced features
#mean_features   = ['enc_item_id_mean','enc_item_id_std']

#PATHS for Features
#ORIGINAL = '../input/m5-forecasting-accuracy/'
BASE     = 'grid_part_1.pkl'
PRICE    = 'grid_part_2.pkl'
CALENDAR = 'grid_part_3.pkl'
LAGS     = 'lags_df_28.pkl'
MEAN_ENC = 'mean_encoding_df.pkl'


#STORES ids
STORES_IDS = pd.read_csv('sales_train_evaluation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())

#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]
ROLS_SPLIT = []
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])

## Helper Functions

In [4]:
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

In [5]:
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                pd.read_pickle(PRICE).iloc[:,2:],
                pd.read_pickle(CALENDAR).iloc[:,2:]],
                axis=1)

    # Leave only relevant store
    df = df[df['store_id']==store]
    # With memory limits we have to read 
    # lags and mean encoding features
    # separately and drop items that we don't need.
    # As our Features Grids are aligned 
    # we can use index to keep only necessary rows
    # Alignment is good for us as concat uses less memory than merge.

    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit 

    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit 
    
    # Weights featues df to test with weights
    #weights_df = df[['id','d']+[col for col in list(df) if col in weight_features]]
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d', 'store_id', 'state_id', TARGET]+features]

    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)

    return df, features#, weights_df

In [6]:
# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test

In [7]:
def make_lag(LAG_DAY):
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(LAG_DAY)).astype(np.float16)
    return lag_df[[col_name]]


def make_lag_roll(LAG_DAY):
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return lag_df[[col_name]]

# Loading the Data and Train the Models
To train the model we load the data per store

In [8]:
# WRMSSE evalutation function to be used in a LGBM model
# only computes the WRMSSE score for the lowest level
class eval_WRMSSE:
    def __init__(self,weights_df):
        self.weights_df = weights_df

    def eval(self,preds, val_data):
#         The weights and the validation set have the same size

        labels = val_data.get_label() 
        Ws = self.weights_df['Weight'].to_numpy()
        Ss = self.weights_df['ScalingFactor'].to_numpy()
        
        RMSSEs = np.sqrt(np.square(labels - preds)/Ss)
        WRMSEE = np.sum(Ws * RMSSEs)
        
        return 'WRMSSE', WRMSEE, False 

In [9]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'None',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'reg_alpha': 1, # l1 regularization 
                    'max_bin': 100,
                    'n_estimators': 2200,
                    'boost_from_average': False,
                    'verbose': -1,
                } 

In [10]:
total_weights = 0
comb_eval_score = 0
# for store_id in STORES_IDS:
for store_id in STORES_IDS:
    print('Train', store_id)
    #wandb.init(project='M5_competition')

    grid_df, features_columns = get_data_by_store(store_id)

    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validatio set) (only used like this for final model)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    #weights_df = weights_df.reset_index()
    
    weighting = dict(zip(np.arange(1,1942), np.arange(1/1941,2,2/1941)))
    time_weights = [weighting[d] for d in grid_df[train_mask]['d']]

    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET],
                       weight=time_weights)
    train_data.save_binary('train_data.bin')
    #train_data = lgb.Dataset('train_data.bin')
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])#,
                       #weight=weights_df[valid_mask]['CombinedWeight'])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df
    
    # TRAINING
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100
                          #feval = eval_WRMSSE(weights_df[valid_mask]).eval, # only used for training previous models
                          #callbacks=[wandb_callback()] # write model performance to wandb
                          )
    
    #Adding to the combined eval score
    #total_weights += weights_df['Weight'].sum()
    #comb_eval_score += estimator.best_score['valid_0']['WRMSSE'] * weights_df['Weight'].sum()

    # SAVE MODEL
    # - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(ver)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))
    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
Train CA_2
Train CA_3
Train CA_4
Train TX_1
Train TX_2
Train TX_3
Train WI_1
Train WI_2
Train WI_3


## Prediction

In [None]:
_,MODEL_FEATURES = get_data_by_store(STORES_IDS[0])

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with a small part of the training data to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()
    
    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(ver)+'.bin' 
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY) #1941 + i 
        ipdb.set_trace()
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
        
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
> [0;32m<ipython-input-9-2e9813213b14>[0m(35)[0;36m<module>[0;34m()[0m
[0;32m     34 [0;31m        [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 35 [0;31m        [0mstore_mask[0m [0;34m=[0m [0mbase_test[0m[0;34m[[0m[0;34m'store_id'[0m[0;34m][0m[0;34m==[0m[0mstore_id[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m<ipython-input-9-2e9813213b14>[0m(37)[0;36m<module>[0;34m()[0m
[0;32m     36 [0;31m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m        [0mmask[0m [0;34m=[0m [0;34m([0m[0mday_mask[0m[0;34m)[0m[0;34m&[0m[0;34m([0m[0mstore_mask[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m        [0mbase_test[0m[0;34m[[0m[0mTARGET[0m[0;34m][0m[0;34m[[0m[0mmask[0m[0;34m][0m [0;34m=[0m [0mestimator[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mgrid_df[0m[0;34m[[0m[0mmask[0m[0;34m][0m[0;34m[[0m[0mMODEL_FEATURES[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m<ipython-input-9-2e9813213b14>[0m(38)[0;36m<module>[0;34m()[0m
[0;32m     37 [0;31m        [0mmask[0m [0;34m=[0m [0;34m([0m[0mday_mask[0m[0;34m)[0m[0;34m&[0m[0;34m([0m[0mstore_mask[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 38 [0;31m        [0mbase_test[0m[0;34m[[0m[0mTARGET[0m[0;34m][0m[0;34m[[0m[0mmask[0m[0;34m][0m [0;34m=[0m [0mestimator[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mgrid_df[0m[0;34m[[0m[0mmask[0m[0;34m][0m[0;34m[[0m[0mMODEL_FEATURES[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     39 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m<ipython-input-9-2e9813213b14>[0m(23)[0;36m<module>[0;34m()[0m
[0;32m     22 [0;31m[0;34m[0m[0m
[0m[0;32m---> 23 [0;31m    [0;32mfor[0m [0mstore_id[0m [0;32min[0m [0mSTORES_IDS[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m[0;34m[0m[0m
[0m


ipdb>  base_test


                                    id     d store_id state_id  sales  \
0        HOBBIES_1_001_CA_1_evaluation  1842     CA_1       CA    4.0   
1        HOBBIES_1_002_CA_1_evaluation  1842     CA_1       CA    0.0   
2        HOBBIES_1_003_CA_1_evaluation  1842     CA_1       CA    1.0   
3        HOBBIES_1_004_CA_1_evaluation  1842     CA_1       CA    2.0   
4        HOBBIES_1_005_CA_1_evaluation  1842     CA_1       CA    5.0   
...                                ...   ...      ...      ...    ...   
3902715    FOODS_3_823_WI_3_evaluation  1969     WI_3       WI    NaN   
3902716    FOODS_3_824_WI_3_evaluation  1969     WI_3       WI    NaN   
3902717    FOODS_3_825_WI_3_evaluation  1969     WI_3       WI    NaN   
3902718    FOODS_3_826_WI_3_evaluation  1969     WI_3       WI    NaN   
3902719    FOODS_3_827_WI_3_evaluation  1969     WI_3       WI    NaN   

               item_id    dept_id   cat_id  release  sell_price  ...  \
0        HOBBIES_1_001  HOBBIES_1  HOBBIES      224

In [9]:
all_preds['id'] = all_preds['id'].map(lambda x: x.replace('evaluation','validation'))
all_preds

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.956285,0.958743,0.844446,0.863479,1.002949,1.306109,1.354877,1.112882,0.855429,...,1.051309,1.443870,1.210359,1.091003,0.860383,0.906896,0.974821,1.243794,1.394958,1.189766
1,HOBBIES_1_002_CA_1_validation,0.220830,0.191704,0.216290,0.196618,0.221648,0.296493,0.356911,0.256091,0.217043,...,0.291287,0.360629,0.376731,0.242613,0.223980,0.241911,0.249432,0.290434,0.366860,0.385570
2,HOBBIES_1_003_CA_1_validation,0.549056,0.475419,0.531760,0.493210,0.686122,0.883768,0.746115,0.514638,0.551073,...,0.702073,0.738867,0.754876,0.463662,0.413687,0.481749,0.423272,0.577301,0.644268,0.710385
3,HOBBIES_1_004_CA_1_validation,1.377182,1.347687,1.265495,1.253812,1.721266,2.567457,3.310869,1.939841,1.313381,...,1.810988,2.623314,3.231120,1.688413,1.407650,1.337308,1.379644,1.885810,2.562164,2.965730
4,HOBBIES_1_005_CA_1_validation,0.990363,1.014008,0.937470,0.869010,1.057996,1.401109,1.476086,1.005547,0.988524,...,1.204729,1.563928,1.587335,1.161014,1.070805,1.093340,1.087575,1.211521,1.586039,1.258535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0.474207,0.493655,0.473603,0.489015,0.492745,0.592487,0.712561,0.527754,0.501123,...,0.604021,0.740740,0.944261,0.650066,0.577998,0.583696,0.492935,0.508034,0.535456,0.684040
30486,FOODS_3_824_WI_3_validation,0.244012,0.270782,0.243153,0.210993,0.220497,0.254603,0.292987,0.251368,0.216406,...,0.238860,0.397177,0.412006,0.309413,0.376216,0.406834,0.272891,0.233452,0.254938,0.337766
30487,FOODS_3_825_WI_3_validation,0.584675,0.463950,0.418996,0.392635,0.474074,0.492743,0.647157,0.619673,0.418042,...,0.776835,1.056976,1.187486,1.028814,0.925572,0.895240,0.692968,0.600376,0.664559,0.742776
30488,FOODS_3_826_WI_3_validation,0.972861,1.146929,1.024173,0.882643,1.014146,1.161431,1.078507,1.325431,1.204841,...,1.087420,1.513481,1.521882,1.372155,1.597349,1.433423,1.111526,1.189550,1.403955,1.422572


## Submission

In [10]:
# Create Kaggle sumbission notebook for public leaderbord only
ORIGINAL = ''

# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v'+str(ver)+'.csv', index=False)