In [2]:
import os
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb

## Loading data

In [7]:
# Loading the data
train = pd.read_csv('input/train.csv', parse_dates=['date'])
test = pd.read_csv('input/test.csv', parse_dates=['date'])
sample_sub = pd.read_csv('input/sample_submission.csv')
print('Train shape:{}, Test shape:{}'.format(train.shape, test.shape))
train.head()

Train shape:(913000, 4), Test shape:(45000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


## Feature Engineering

In [8]:
# Concatenating train & test
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
df = pd.concat([train,test], sort=False)
print('Combined df shape:{}'.format(df.shape))
del train, test
gc.collect()

Combined df shape:(958000, 6)


76

### Date Features

In [9]:
# Extracting date features
df['dayofmonth'] = df.date.dt.day
df['dayofyear'] = df.date.dt.dayofyear
df['dayofweek'] = df.date.dt.dayofweek
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year
df['weekofyear'] = df.date.dt.weekofyear
df['is_month_start'] = (df.date.dt.is_month_start).astype(int)
df['is_month_end'] = (df.date.dt.is_month_end).astype(int)
df.head()

Unnamed: 0,date,store,item,sales,train_or_test,id,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2013-01-01,1,1,13.0,train,,1,1,1,1,2013,1,1,0
1,2013-01-02,1,1,11.0,train,,2,2,2,1,2013,1,0,0
2,2013-01-03,1,1,14.0,train,,3,3,3,1,2013,1,0,0
3,2013-01-04,1,1,13.0,train,,4,4,4,1,2013,1,0,0
4,2013-01-05,1,1,10.0,train,,5,5,5,1,2013,1,0,0


In [10]:
# Sorting the dataframe by store then item then date
#df.sort_values(by=['store','item','month','dayofweek'], axis=0, inplace=True)
df.sort_values(by=['store','item','date'], axis=0, inplace=True)

### Monthwise aggregated sales values

In [11]:
def create_sales_agg_monthwise_features(df, gpby_cols, target_col, agg_funcs):
    '''
    Creates various sales agg features with given agg functions  
    '''
    gpby = df.groupby(gpby_cols)
    newdf = df[gpby_cols].drop_duplicates().reset_index(drop=True)
    for agg_name, agg_func in agg_funcs.items():
        aggdf = gpby[target_col].agg(agg_func).reset_index()
        aggdf.rename(columns={target_col:target_col+'_'+agg_name}, inplace=True)
        newdf = newdf.merge(aggdf, on=gpby_cols, how='left')
    return newdf

### Features constructed from previous sales values

In [12]:
# Creating sales lag features
def create_sales_lag_feats(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
                gpby[target_col].shift(i).values + np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                            shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).median().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df

### OHE of categorical features

In [13]:
def one_hot_encoder(df, ohe_cols=['store','item','dayofmonth','dayofweek','month','weekofyear']):
    '''
    One-Hot Encoder function
    '''
    print('Creating OHE features..\nOld df shape:{}'.format(df.shape))
    df = pd.get_dummies(df, columns=ohe_cols)
    print('New df shape:{}'.format(df.shape))
    return df

### Log Sales 

In [14]:
# Converting sales to log(1+sales)
df['sales'] = np.log1p(df.sales.values)
df.sample(2)

Unnamed: 0,date,store,item,sales,train_or_test,id,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
391707,2015-08-02,5,22,4.521789,train,,2,214,6,8,2015,31,0,0
403581,2013-02-05,2,23,3.496508,train,,5,36,1,2,2013,6,0,0


## Time-based Validation set

In [15]:
# For validation 
# We can choose last 3 months of training period(Oct, Nov, Dec 2017) as our validation set to gauge the performance of the model.
# OR to keep months also identical to test set we can choose period (Jan, Feb, Mar 2017) as the validation set.
# Here we will go with the latter choice.
masked_series = (df.year==2017) & (df.month.isin([1,2,3]))
masked_series2 = (df.year==2017) & (~(df.month.isin([1,2,3])))
df.loc[(masked_series), 'train_or_test'] = 'val'
df.loc[(masked_series2), 'train_or_test'] = 'no_train'
print('Train shape: {}'.format(df.loc[df.train_or_test=='train',:].shape))
print('Validation shape: {}'.format(df.loc[df.train_or_test=='val',:].shape))
print('No train shape: {}'.format(df.loc[df.train_or_test=='no_train',:].shape))
print('Test shape: {}'.format(df.loc[df.train_or_test=='test',:].shape))

Train shape: (730500, 14)
Validation shape: (45000, 14)
No train shape: (137500, 14)
Test shape: (45000, 14)


## Model Validation

In [16]:
# Converting sales of validation period to nan so as to resemble test period
train = df.loc[df.train_or_test.isin(['train','val']), :]
Y_val = train.loc[train.train_or_test=='val', 'sales'].values.reshape((-1))
Y_train = train.loc[train.train_or_test=='train', 'sales'].values.reshape((-1))
train.loc[train.train_or_test=='val', 'sales'] = np.nan

# # Creating sales lag, rolling mean, rolling median, ohe features of the above train set
train = create_sales_lag_feats(train, gpby_cols=['store','item'], target_col='sales', 
                               lags=[91,98,105,112,119,126,182,364,546,728])

train = create_sales_rmean_feats(train, gpby_cols=['store','item'], 
                                 target_col='sales', windows=[364,546], 
                                 min_periods=10, win_type='triang') #98,119,91,182,

# # train = create_sales_rmed_feats(train, gpby_cols=['store','item'], 
# #                                 target_col='sales', windows=[364,546], 
# #                                 min_periods=10, win_type=None) #98,119,91,182,

train = create_sales_ewm_feats(train, gpby_cols=['store','item'], 
                               target_col='sales', 
                               alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5], 
                               shift=[91,98,105,112,119,126,182,364,546,728])

# # Creating sales monthwise aggregated values
# agg_df = create_sales_agg_monthwise_features(df.loc[df.train_or_test=='train', :], 
#                                              gpby_cols=['store','item','month'], 
#                                              target_col='sales', 
#                                              agg_funcs={'mean':np.mean, 
#                                              'median':np.median, 'max':np.max, 
#                                              'min':np.min, 'std':np.std})

# # Joining agg_df with train
# train = train.merge(agg_df, on=['store','item','month'], how='left')

# One-Hot Encoding 
train = one_hot_encoder(train, ohe_cols=['store','item','dayofweek','month']) 
#,'dayofmonth','weekofyear'

# Final train and val datasets
val = train.loc[train.train_or_test=='val', :]
train = train.loc[train.train_or_test=='train', :]
print('Train shape:{}, Val shape:{}'.format(train.shape, val.shape))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-vers

Creating OHE features..
Old df shape:(775500, 86)
New df shape:(775500, 161)
Train shape:(730500, 161), Val shape:(45000, 161)


## LightGBM Model

### Training features

In [17]:
avoid_cols = ['date', 'sales', 'train_or_test', 'id', 'year']
cols = [col for col in train.columns if col not in avoid_cols]
print('No of training features: {} \nAnd they are:{}'.format(len(cols), cols))

No of training features: 156 
And they are:['dayofmonth', 'dayofyear', 'weekofyear', 'is_month_start', 'is_month_end', 'sales_lag_91', 'sales_lag_98', 'sales_lag_105', 'sales_lag_112', 'sales_lag_119', 'sales_lag_126', 'sales_lag_182', 'sales_lag_364', 'sales_lag_546', 'sales_lag_728', 'sales_rmean_364', 'sales_rmean_546', 'sales_lag_91_ewm_0.95', 'sales_lag_98_ewm_0.95', 'sales_lag_105_ewm_0.95', 'sales_lag_112_ewm_0.95', 'sales_lag_119_ewm_0.95', 'sales_lag_126_ewm_0.95', 'sales_lag_182_ewm_0.95', 'sales_lag_364_ewm_0.95', 'sales_lag_546_ewm_0.95', 'sales_lag_728_ewm_0.95', 'sales_lag_91_ewm_0.9', 'sales_lag_98_ewm_0.9', 'sales_lag_105_ewm_0.9', 'sales_lag_112_ewm_0.9', 'sales_lag_119_ewm_0.9', 'sales_lag_126_ewm_0.9', 'sales_lag_182_ewm_0.9', 'sales_lag_364_ewm_0.9', 'sales_lag_546_ewm_0.9', 'sales_lag_728_ewm_0.9', 'sales_lag_91_ewm_0.8', 'sales_lag_98_ewm_0.8', 'sales_lag_105_ewm_0.8', 'sales_lag_112_ewm_0.8', 'sales_lag_119_ewm_0.8', 'sales_lag_126_ewm_0.8', 'sales_lag_182_ewm_0.

In [59]:
def smape(preds, target):
    '''
    Function to calculate SMAPE
    '''
    n = len(preds)
    masked_arr = ~((preds==0)&(target==0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds-target)
    denom = np.abs(preds)+np.abs(target)
    smape_val = (200*np.sum(num/denom))/n
    return smape_val

def lgbm_smape(preds, train_data):
    '''
    Custom Evaluation Function for LGBM
    '''
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False

In [66]:
# LightGBM parameters
lgb_params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'mae'}, 'num_leaves': 10, 'learning_rate': 0.02, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':15000, 'early_stopping_rounds':200, 'nthread':-1}

In [22]:
# Creating lgbtrain & lgbval
lgbtrain = lgb.Dataset(data=train.loc[:,cols].values, label=Y_train, 
                       feature_name=cols)
lgbval = lgb.Dataset(data=val.loc[:,cols].values, label=Y_val, 
                     reference=lgbtrain, feature_name=cols)

In [23]:
def lgb_validation(params, lgbtrain, lgbval, X_val, Y_val, verbose_eval):
    t0 = time.time()
    evals_result = {}
    model = lgb.train(params, lgbtrain, num_boost_round=params['num_boost_round'], 
                      valid_sets=[lgbtrain, lgbval], feval=lgbm_smape, 
                      early_stopping_rounds=params['early_stopping_rounds'], 
                      evals_result=evals_result, verbose_eval=verbose_eval)
    print(model.best_iteration)
    print('Total time taken to build the model: ', (time.time()-t0)/60, 'minutes!!')
    pred_Y_val = model.predict(X_val, num_iteration=model.best_iteration)
    pred_Y_val = np.expm1(pred_Y_val)
    Y_val = np.expm1(Y_val)
    val_df = pd.DataFrame(columns=['true_Y_val','pred_Y_val'])
    val_df['pred_Y_val'] = pred_Y_val
    val_df['true_Y_val'] = Y_val
    print(val_df.shape)
    print(val_df.sample(5))
    print('SMAPE for validation data is:{}'.format(smape(pred_Y_val, Y_val)))
    return model, val_df

In [45]:
# Training lightgbm model and validating
model, val_df = lgb_validation(lgb_params, lgbtrain, lgbval, val.loc[:,cols].values, 
                               Y_val, verbose_eval=500)

Training until validation scores don't improve for 200 rounds.
[500]	training's l1: 0.1467	training's SMAPE: 15.0186	valid_1's l1: 0.14078	valid_1's SMAPE: 14.4538
[1000]	training's l1: 0.137887	training's SMAPE: 14.1391	valid_1's l1: 0.138398	valid_1's SMAPE: 14.2159
[1500]	training's l1: 0.133905	training's SMAPE: 13.7381	valid_1's l1: 0.136968	valid_1's SMAPE: 14.0724
[2000]	training's l1: 0.131763	training's SMAPE: 13.5221	valid_1's l1: 0.135874	valid_1's SMAPE: 13.9624
[2500]	training's l1: 0.130434	training's SMAPE: 13.388	valid_1's l1: 0.135129	valid_1's SMAPE: 13.8873
[3000]	training's l1: 0.129592	training's SMAPE: 13.3031	valid_1's l1: 0.134586	valid_1's SMAPE: 13.8326
[3500]	training's l1: 0.12898	training's SMAPE: 13.2412	valid_1's l1: 0.134207	valid_1's SMAPE: 13.7944
[4000]	training's l1: 0.128515	training's SMAPE: 13.1942	valid_1's l1: 0.133934	valid_1's SMAPE: 13.767
[4500]	training's l1: 0.128112	training's SMAPE: 13.1535	valid_1's l1: 0.1337	valid_1's SMAPE: 13.7434
[

In [46]:
# Let's see top 25 features as identified by the lightgbm model.
print("Features importance...")
gain = model.feature_importance('gain')
feat_imp = pd.DataFrame({'feature':model.feature_name(), 
                         'split':model.feature_importance('split'), 
                         'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print('Top 25 features:\n', feat_imp.head(25))

Features importance...
Top 25 features:
                     feature  split       gain
67     sales_lag_91_ewm_0.5   2739  27.799887
57     sales_lag_91_ewm_0.6   1447  11.682958
74    sales_lag_364_ewm_0.5   3313   7.573448
1                 dayofyear   6125   7.563541
5              sales_lag_91   3770   4.797943
37     sales_lag_91_ewm_0.8   1547   3.228566
64    sales_lag_364_ewm_0.6   1605   2.809390
18    sales_lag_98_ewm_0.95   2467   2.613903
28     sales_lag_98_ewm_0.9   1087   2.400444
38     sales_lag_98_ewm_0.8    754   2.343548
47     sales_lag_91_ewm_0.7   1545   2.105054
2                weekofyear   2716   2.008840
68     sales_lag_98_ewm_0.5   2425   1.967989
6              sales_lag_98   3012   1.450472
29    sales_lag_105_ewm_0.9    746   1.390890
12            sales_lag_364   2696   1.346486
54    sales_lag_364_ewm_0.7   1299   1.329591
19   sales_lag_105_ewm_0.95   1799   1.272137
20   sales_lag_112_ewm_0.95   1301   1.117175
137             dayofweek_0   1645   1.

## Final Model

In [47]:
# Creating sales lag, rolling mean, rolling median, ohe features of the above train set
df_whole = create_sales_lag_feats(df, gpby_cols=['store','item'], target_col='sales', 
                                  lags=[91,98,105,112,119,126,182,364,546,728])
df_whole = create_sales_rmean_feats(df_whole, gpby_cols=['store','item'], 
                                    target_col='sales', windows=[364,546], 
                                    min_periods=10, win_type='triang')
# df = create_sales_rmed_feats(df, gpby_cols=['store','item'], target_col='sales', 
#                              windows=[364,546], min_periods=2) #98,119,
df_whole = create_sales_ewm_feats(df_whole, gpby_cols=['store','item'], target_col='sales', 
                                  alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5], 
                                  shift=[91,98,105,112,119,126,182,364,546,728])

# # Creating sales monthwise aggregated values
# agg_df = create_sales_agg_monthwise_features(df.loc[~(df.train_or_test=='test'), :], 
#                                              gpby_cols=['store','item','month'], 
#                                              target_col='sales', 
#                                              agg_funcs={'mean':np.mean, 
#                                              'median':np.median, 'max':np.max, 
#                                              'min':np.min, 'std':np.std})

# # Joining agg_df with df
# df = df.merge(agg_df, on=['store','item','month'], how='left')

# One-Hot Encoding
df_whole = one_hot_encoder(df_whole, ohe_cols=['store','item','dayofweek','month']) 
#'dayofmonth',,'weekofyear'

# Final train and test datasets
test = df_whole.loc[df_whole.train_or_test=='test', :]
train = df_whole.loc[~(df_whole.train_or_test=='test'), :]
print('Train shape:{}, Test shape:{}'.format(train.shape, test.shape))

Creating OHE features..
Old df shape:(958000, 86)
New df shape:(958000, 161)
Train shape:(913000, 161), Test shape:(45000, 161)


In [48]:
# LightGBM dataset
lgbtrain_all = lgb.Dataset(data=train.loc[:,cols].values, 
                           label=train.loc[:,'sales'].values.reshape((-1,)), 
                           feature_name=cols)

In [62]:
def lgb_train(params, lgbtrain_all, X_test, num_round):
    t0 = time.time()
    model = lgb.train(params, lgbtrain_all, num_boost_round=num_round, feval=lgbm_smape)
    test_preds = model.predict(X_test, num_iteration=num_round)
    print('Total time taken in model training: ', (time.time()-t0)/60, 'minutes!')
    return model, test_preds



In [73]:
# LightGBM parameters
lgb_params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'mae'}, 'num_leaves': 10, 'learning_rate': 0.02, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':15000, 'nthread':-1}

In [71]:
lgb_params

{'task': 'train',
 'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': {'mae'},
 'num_leaves': 10,
 'learning_rate': 0.02,
 'feature_fraction': 0.8,
 'max_depth': 5,
 'verbose': 0,
 'nthread': -1}

In [56]:
lgbtrain_all

<lightgbm.basic.Dataset at 0x24b0fb07d30>

In [57]:
test.loc[:,cols].values

array([[ 1.,  1.,  1., ...,  0.,  0.,  0.],
       [ 2.,  2.,  1., ...,  0.,  0.,  0.],
       [ 3.,  3.,  1., ...,  0.,  0.,  0.],
       ...,
       [29., 88., 13., ...,  0.,  0.,  0.],
       [30., 89., 13., ...,  0.,  0.,  0.],
       [31., 90., 13., ...,  0.,  0.,  0.]])

In [58]:
model.best_iteration

15000

In [60]:
lgbm_smape

<function __main__.lgbm_smape(preds, train_data)>

In [74]:
# Training lgb model on whole data(train+val)
lgb_model, test_preds = lgb_train(lgb_params, lgbtrain_all, test.loc[:,cols].values, model.best_iteration)
print('test_preds shape:{}'.format(test_preds.shape))

Total time taken in model training:  35.27308999300003 minutes!
test_preds shape:(45000,)


## Submission

In [75]:
# Create submission
sub = test.loc[:,['id','sales']]
sub['sales'] = np.expm1(test_preds)
sub['id'] = sub.id.astype(int)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,sales
0,0,11.344051
1,1,14.128562
2,2,13.301148
3,3,15.3471
4,4,17.911739


## WaveNet Model 

In [76]:
df.head(2)

Unnamed: 0,date,store,item,sales,train_or_test,id,dayofmonth,dayofyear,dayofweek,month,...,sales_lag_91_ewm_0.5,sales_lag_98_ewm_0.5,sales_lag_105_ewm_0.5,sales_lag_112_ewm_0.5,sales_lag_119_ewm_0.5,sales_lag_126_ewm_0.5,sales_lag_182_ewm_0.5,sales_lag_364_ewm_0.5,sales_lag_546_ewm_0.5,sales_lag_728_ewm_0.5
0,2013-01-01,1,1,2.639057,train,,1,1,1,1,...,,,,,,,,,,
1,2013-01-02,1,1,2.484907,train,,2,2,2,1,...,,,,,,,,,,


In [77]:
df.date.min(), df.date.max()

(Timestamp('2013-01-01 00:00:00'), Timestamp('2018-03-31 00:00:00'))