In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

In [2]:
cal_df = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
prices_df = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
sales_df_wide = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')

## **Data Preprocessing**

In [3]:
#there are many NaN values in event_name and event_type fields
#Filling them with 'NoEvent' and 'None' resp
cal_df['event_name_1'].fillna('NoEvent',inplace=True)
cal_df['event_type_1'].fillna('None',inplace=True)
cal_df['event_name_2'].fillna('NoEvent',inplace=True)
cal_df['event_type_2'].fillna('None',inplace=True)
#dropping redundant columns and changing datatypes to reduce memory usage
cal_df.drop(['date','weekday'],axis=1,inplace=True)
cal_df['wm_yr_wk'] = cal_df.wm_yr_wk.astype('int16')
cal_df['d'] = cal_df.d.str[2:].astype('int16')
for col in ['wday','month','snap_CA','snap_TX','snap_WI']:
    cal_df[col] = cal_df[col].astype('int8')
for col in ['event_name_1','event_type_1','event_name_2','event_type_2']:
    cal_df[col] = cal_df[col].astype('category')

In [4]:
# changing datatypes to reduce memory usage
prices_df['store_id'] = prices_df.store_id.astype('category')
prices_df['item_id'] = prices_df.item_id.astype('category')
prices_df['wm_yr_wk'] = prices_df.wm_yr_wk.astype('int16')
prices_df['sell_price'] = prices_df.sell_price.astype('float16')

In [5]:
#adding columns 'd_1942'...'d_1969' as we have to predict unit sales for these days and assigning value 0
for i in range(1942,1970):
    sales_df_wide['d_'+str(i)] = 0
#converting the wide-form data frame to long-form so that columns from other 2 data frames can be merged
train_df = pd.melt(sales_df_wide,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],\
                   var_name='d',value_name='units_sold')
# changing datatypes to reduce memory usage
train_df['d'] = train_df.d.str[2:].astype('int16')
train_df['units_sold'] = train_df.units_sold.astype('int16')
for col in ['id','item_id','dept_id','cat_id','store_id','state_id']:
  train_df[col] = train_df[col].astype('category')    

In [6]:
#merging with 'cal_df'
train_df = pd.merge(train_df,cal_df,on='d',how='left')
#merging with 'prices_df'
train_df = pd.merge(train_df,prices_df,on=['item_id','store_id','wm_yr_wk'],how='left')

In [7]:
del cal_df,prices_df,sales_df_wide

In [8]:
#sell_price data is not available for many rows. 
#For previous weeks filling this data by mean sell_prices for the item_id and store_id pair
train_df['sell_price'].fillna(train_df.groupby(['store_id','item_id'])['sell_price'].transform('mean'),inplace=True)

In [9]:
#calculating lag features and filling NA values with 0
for i in [7,14,21,28]:
    train_df['lag_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.shift(i))
    train_df['lag_'+str(i)].fillna(0,inplace=True)

In [10]:
#ref:https://www.kaggle.com/kyakovlev/m5-lags-features
#rolling window mean features and filling NA values with 0
for i in [7,14]:
    train_df['rolling_mean_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.rolling(i).mean())
    train_df['rolling_mean_'+str(i)].fillna(0,inplace=True)
#rolling window meadion features and filling NA values with 0
for i in [7,14]:
    train_df['rolling_median_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.rolling(i).median())
    train_df['rolling_median_'+str(i)].fillna(0,inplace=True)

In [11]:
#dropping features 'wm_yr_wk' and 'year' as they are similar to 'd'
train_df.drop(['wm_yr_wk','year'],axis=1,inplace=True)

In [12]:
for i in ['lag_7','lag_14','lag_21','lag_28','rolling_mean_7','rolling_mean_14','rolling_median_7','rolling_median_14']:
    train_df[i] = train_df[i].astype('float16')

In [13]:
from sklearn.model_selection import train_test_split
y = train_df['units_sold']
train_df.drop(['units_sold'],axis=1,inplace=True)
x_train,x_test,y_train,y_test = train_test_split(train_df, y, test_size=0.2, random_state=42)

In [14]:
del train_df

In [15]:
del y

In [16]:
from sklearn.preprocessing import LabelEncoder
#feature encoding
le = LabelEncoder()
x_train_item_cat = le.fit_transform(x_train.item_id.values)
x_test_item_cat = le.transform(x_test.item_id.values)

In [17]:
x_train_id_cat = le.fit_transform(x_train.id.values)
x_test_id_cat = le.transform(x_test.id.values)

In [18]:
x_train_event_name_1 = le.fit_transform(x_train.event_name_1.values)
x_test_event_name_1 = le.transform(x_test.event_name_1.values)

In [19]:
x_train_event_type_1 = le.fit_transform(x_train.event_type_1.values)
x_test_event_type_1 = le.transform(x_test.event_type_1.values)

In [20]:
x_train_event_name_2 = le.fit_transform(x_train.event_name_2.values)
x_test_event_name_2 = le.transform(x_test.event_name_2.values)

In [21]:
x_train_event_type_2 = le.fit_transform(x_train.event_type_2.values)
x_test_event_type_2 = le.transform(x_test.event_type_2.values)

In [22]:
x_train['item_id'] = x_train_item_cat
x_test['item_id'] = x_test_item_cat

x_train['id'] = x_train_id_cat
x_test['id'] = x_test_id_cat

x_train['event_name_1'] = x_train_event_name_1
x_test['event_name_1'] = x_test_event_name_1

x_train['event_type_1'] = x_train_event_type_1
x_test['event_type_1'] = x_test_event_type_1

x_train['event_name_2'] = x_train_event_name_2
x_test['event_name_2'] = x_test_event_name_2

x_train['event_type_2'] = x_train_event_type_2
x_test['event_type_2'] = x_test_event_type_2

In [23]:
del x_train_item_cat,x_train_id_cat,x_train_event_name_1,x_train_event_type_1,x_train_event_name_2,x_train_event_type_2

In [24]:
del x_test_item_cat,x_test_id_cat,x_test_event_name_1,x_test_event_type_1,x_test_event_name_2,x_test_event_type_2

In [25]:
x_train_dept_cat = le.fit_transform(x_train.dept_id.values)
x_test_dept_cat = le.transform(x_test.dept_id.values)

In [26]:
x_train_cat = le.fit_transform(x_train.cat_id.values)
x_test_cat = le.transform(x_test.cat_id.values)

In [27]:
x_train_store_cat = le.fit_transform(x_train.store_id.values)
x_test_store_cat = le.transform(x_test.store_id.values)

In [28]:
x_train_state_cat = le.fit_transform(x_train.state_id.values)
x_test_state_cat = le.transform(x_test.state_id.values)

In [29]:
x_train['dept_id'] = x_train_dept_cat
x_test['dept_id'] = x_test_dept_cat

x_train['cat_id'] = x_train_cat
x_test['cat_id'] = x_test_cat

x_train['store_id'] = x_train_store_cat
x_test['store_id'] = x_test_store_cat

x_train['state_id'] = x_train_state_cat
x_test['state_id'] = x_test_state_cat

In [30]:
del x_train_dept_cat,x_train_cat,x_train_store_cat,x_train_state_cat

In [31]:
del x_test_dept_cat,x_test_cat,x_test_store_cat,x_test_state_cat

In [32]:
for i in ['item_id','event_name_1','event_type_1','event_name_2','event_type_2']:
    x_train[i] = x_train[i].astype('int16')

In [33]:
for i in ['dept_id','cat_id','store_id','state_id']:
    x_train[i] = x_train[i].astype('int8')

In [34]:
for i in ['item_id','event_name_1','event_type_1','event_name_2','event_type_2']:
    x_test[i] = x_test[i].astype('int16')

In [35]:
for i in ['dept_id','cat_id','store_id','state_id']:
    x_test[i] = x_test[i].astype('int8')

In [36]:
x_train_1,x_train_2,y_train_1,y_train_2 = train_test_split(x_train, y_train, test_size=0.5, random_state=42)

In [37]:
del x_train

In [38]:
del y_train

In [39]:
#function to calculate assymmetric rmse,custom metric function
def armse(y_act,y_pred):
    score=0
    n = len(y_act)
    diff = np.array(y_pred) - np.array(y_act)
    for ele in diff:
        if ele<0:
            score += 4*(ele**2)
        else:
            score += ele**2
    return np.sqrt(score/n)

In [40]:
dt = DecisionTreeRegressor(max_depth=11,min_samples_split=3,random_state=0)
dt.fit(x_train_1,y_train_1)
y_pred2_dt = dt.predict(x_train_2)
y_test_dt = dt.predict(x_test)

In [41]:
rf = RandomForestRegressor(n_estimators=64,max_depth=12,min_samples_split=6,random_state=0,n_jobs=-1)
rf.fit(x_train_1,y_train_1)
y_pred2_rf = rf.predict(x_train_2)
y_test_rf = rf.predict(x_test)

In [42]:
xgb = XGBRegressor(learning_rate=0.140999404993131, max_depth=9, n_estimators=56,n_jobs=-1)
xgb.fit(x_train_1,y_train_1)
y_pred2_xgb = xgb.predict(x_train_2)
y_test_xgb = xgb.predict(x_test)

In [43]:
lgbm = LGBMRegressor(learning_rate=0.26661528358747144, max_depth=8, n_estimators=90, n_jobs=-1)
lgbm.fit(x_train_1,y_train_1)
y_pred2_lgbm = lgbm.predict(x_train_2)
y_test_lgbm = lgbm.predict(x_test)

In [44]:
cat = CatBoostRegressor(learning_rate=0.55,depth=3,logging_level="Silent")
cat.fit(x_train_1,y_train_1)
y_pred2_cat = cat.predict(x_train_2)
y_test_cat = cat.predict(x_test)

In [45]:
x_train_meta = np.vstack((y_pred2_dt,y_pred2_rf,y_pred2_xgb,y_pred2_lgbm,y_pred2_cat)).T
x_test_meta = np.vstack((y_test_dt,y_test_rf,y_test_xgb,y_test_lgbm,y_test_cat)).T

### **Linear Regression metamodel**

In [46]:
lr = LinearRegression(n_jobs=-1)
lr.fit(x_train_meta,y_train_2)
y_train_meta = lr.predict(x_train_meta)
y_test_meta = lr.predict(x_test_meta)

In [47]:
rmse_train = mse(y_train_2,y_train_meta,squared=False)
armse_train = armse(y_train_2,y_train_meta)
rmse_test = mse(y_test,y_test_meta,squared=False)
armse_test = armse(y_test,y_test_meta)

In [48]:
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Train: RMSE 1.8219553770324861 ARMSE 3.0875350801387773
Test: RMSE 1.8327137731488883 ARMSE 3.1106562614702455


### **LGBM metamodel**

In [52]:
for i in tqdm(range(5)):
    learning_rate = random.uniform(0.1,0.9)
    max_depth = random.sample(range(5,10),1)[0]
    n_estimators = random.sample(range(50,100),1)[0]
    lgbm_meta = LGBMRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, n_jobs=-1)
    lgbm_meta.fit(x_train_meta,y_train_2)
    y_train_metal = lgbm_meta.predict(x_train_meta)
    y_test_metal = lgbm_meta.predict(x_test_meta)

    rmse_trainl = mse(y_train_2,y_train_metal,squared=False)
    armse_trainl = armse(y_train_2,y_train_metal)
    rmse_testl = mse(y_test,y_test_metal,squared=False)
    armse_testl = armse(y_test,y_test_metal)

    print('n_estimators {} max_depth {} learning_rate {}'.format(n_estimators,max_depth,learning_rate))
    print('Train: RMSE {} ARMSE {}'.format(rmse_trainl,armse_trainl))
    print('Test: RMSE {} ARMSE {}'.format(rmse_testl,armse_testl))

 20%|██        | 1/5 [02:30<10:01, 150.43s/it]

n_estimators 96 max_depth 6 learning_rate 0.20753827078765896
Train: RMSE 1.8801581017361118 ARMSE 3.2065123002974736
Test: RMSE 1.9244260549011638 ARMSE 3.283624039531873


 40%|████      | 2/5 [04:29<06:35, 131.84s/it]

n_estimators 62 max_depth 9 learning_rate 0.26084700446759373
Train: RMSE 1.8803602919746554 ARMSE 3.2070156177109452
Test: RMSE 1.925361088850505 ARMSE 3.2858934992768054


 60%|██████    | 3/5 [06:21<04:05, 122.73s/it]

n_estimators 91 max_depth 8 learning_rate 0.30110023171353717
Train: RMSE 1.8763145426811887 ARMSE 3.1977594072580695
Test: RMSE 1.926117993688346 ARMSE 3.285551630254249


 80%|████████  | 4/5 [08:33<02:06, 126.47s/it]

n_estimators 61 max_depth 7 learning_rate 0.8329282892922588
Train: RMSE 1.8728021820041758 ARMSE 3.187133114522237
Test: RMSE 1.933608907514247 ARMSE 3.29472083137263


100%|██████████| 5/5 [10:03<00:00, 120.66s/it]

n_estimators 53 max_depth 9 learning_rate 0.8614458405145404
Train: RMSE 1.8734602459271434 ARMSE 3.188498470559769
Test: RMSE 1.9317071048384653 ARMSE 3.2900850732881235





In [55]:
from sklearn.linear_model import Lasso
for i in tqdm(range(5)):
    alpha = random.uniform(0.1,0.5)
    ls = Lasso(alpha=alpha,random_state=0)
    ls.fit(x_train_meta,y_train_2)
    y_train_metal = ls.predict(x_train_meta)
    y_test_metal = ls.predict(x_test_meta)

    rmse_trainl = mse(y_train_2,y_train_metal,squared=False)
    armse_trainl = armse(y_train_2,y_train_metal)
    rmse_testl = mse(y_test,y_test_metal,squared=False)
    armse_testl = armse(y_test,y_test_metal)

    print('n_estimators {}'.format(alpha))
    print('Train: RMSE {} ARMSE {}'.format(rmse_trainl,armse_trainl))
    print('Test: RMSE {} ARMSE {}'.format(rmse_testl,armse_testl))

 20%|██        | 1/5 [01:59<07:59, 119.95s/it]

n_estimators 0.4146175890124124
Train: RMSE 1.8275683147169144 ARMSE 3.1593565926409695
Test: RMSE 1.835770890723141 ARMSE 3.1783777994338487


 40%|████      | 2/5 [03:49<05:40, 113.66s/it]

n_estimators 0.11105005845304304
Train: RMSE 1.8235769088833396 ARMSE 3.107993311110767
Test: RMSE 1.8316917287528112 ARMSE 3.126542613251512


 60%|██████    | 3/5 [05:38<03:43, 111.76s/it]

n_estimators 0.22564883162655983
Train: RMSE 1.8245442702911896 ARMSE 3.126712880096213
Test: RMSE 1.8326885282811196 ARMSE 3.145437028703332


 80%|████████  | 4/5 [07:12<01:44, 104.70s/it]

n_estimators 0.43637860644238624
Train: RMSE 1.8280304571262973 ARMSE 3.1632572644208956
Test: RMSE 1.8362405326582416 ARMSE 3.1823132514984103


100%|██████████| 5/5 [08:46<00:00, 105.31s/it]

n_estimators 0.2676885500435674
Train: RMSE 1.8250633037539488 ARMSE 3.133784084267938
Test: RMSE 1.833219530525953 ARMSE 3.1525734004683303





1. RMSE of custom stacking regressor is greater as compared to some first cut single models such as LGBM regressor