In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor

In [2]:
cal_df = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
prices_df = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
sales_df_wide = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')

## **Data Preprocessing**

In [3]:
#there are many NaN values in event_name and event_type fields
#Filling them with 'NoEvent' and 'None' resp
cal_df['event_name_1'].fillna('NoEvent',inplace=True)
cal_df['event_type_1'].fillna('None',inplace=True)
cal_df['event_name_2'].fillna('NoEvent',inplace=True)
cal_df['event_type_2'].fillna('None',inplace=True)
#dropping redundant columns and changing datatypes to reduce memory usage
cal_df.drop(['date','weekday'],axis=1,inplace=True)
cal_df['wm_yr_wk'] = cal_df.wm_yr_wk.astype('int16')
cal_df['d'] = cal_df.d.str[2:].astype('int16')
for col in ['wday','month','snap_CA','snap_TX','snap_WI']:
    cal_df[col] = cal_df[col].astype('int8')
for col in ['event_name_1','event_type_1','event_name_2','event_type_2']:
    cal_df[col] = cal_df[col].astype('category')

In [4]:
# changing datatypes to reduce memory usage
prices_df['store_id'] = prices_df.store_id.astype('category')
prices_df['item_id'] = prices_df.item_id.astype('category')
prices_df['wm_yr_wk'] = prices_df.wm_yr_wk.astype('int16')
prices_df['sell_price'] = prices_df.sell_price.astype('float16')

In [5]:
#adding columns 'd_1942'...'d_1969' as we have to predict unit sales for these days and assigning value 0
for i in range(1942,1970):
    sales_df_wide['d_'+str(i)] = 0
#converting the wide-form data frame to long-form so that columns from other 2 data frames can be merged
train_df = pd.melt(sales_df_wide,id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],\
                   var_name='d',value_name='units_sold')
# changing datatypes to reduce memory usage
train_df['d'] = train_df.d.str[2:].astype('int16')
train_df['units_sold'] = train_df.units_sold.astype('int16')
for col in ['id','item_id','dept_id','cat_id','store_id','state_id']:
  train_df[col] = train_df[col].astype('category')    

In [6]:
#merging with 'cal_df'
train_df = pd.merge(train_df,cal_df,on='d',how='left')
#merging with 'prices_df'
train_df = pd.merge(train_df,prices_df,on=['item_id','store_id','wm_yr_wk'],how='left')

In [7]:
del cal_df,prices_df,sales_df_wide

In [8]:
#sell_price data is not available for many rows. 
#For previous weeks filling this data by mean sell_prices for the item_id and store_id pair
train_df['sell_price'].fillna(train_df.groupby(['store_id','item_id'])['sell_price'].transform('mean'),inplace=True)

In [9]:
#calculating lag features and filling NA values with 0
for i in [7,14,21,28]:
    train_df['lag_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.shift(i))
    train_df['lag_'+str(i)].fillna(0,inplace=True)

In [10]:
#ref:https://www.kaggle.com/kyakovlev/m5-lags-features
#rolling window mean features and filling NA values with 0
for i in [7,14]:
    train_df['rolling_mean_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.rolling(i).mean())
    train_df['rolling_mean_'+str(i)].fillna(0,inplace=True)
#rolling window meadion features and filling NA values with 0
for i in [7,14]:
    train_df['rolling_median_'+str(i)] = train_df.groupby('id')['units_sold'].transform(lambda x:x.rolling(i).median())
    train_df['rolling_median_'+str(i)].fillna(0,inplace=True)

In [11]:
#dropping features 'wm_yr_wk' and 'year' as they are similar to 'd'
train_df.drop(['wm_yr_wk','year'],axis=1,inplace=True)

In [12]:
for i in ['lag_7','lag_14','lag_21','lag_28','rolling_mean_7','rolling_mean_14','rolling_median_7','rolling_median_14']:
    train_df[i] = train_df[i].astype('float16')

In [13]:
y_train = train_df.loc[train_df.d.isin(range(1000,1914))]['units_sold']
y_test = train_df.loc[train_df.d.isin(range(1914,1942))]['units_sold']
train_df.drop(['units_sold'],axis=1,inplace=True)
x_train = train_df.loc[train_df.d.isin(range(1000,1914))]
x_test = train_df.loc[train_df.d.isin(range(1914,1942))]

In [14]:
del train_df

In [15]:
from sklearn.preprocessing import LabelEncoder
#feature encoding
le = LabelEncoder()
x_train_item_cat = le.fit_transform(x_train.item_id.values)
x_test_item_cat = le.transform(x_test.item_id.values)

In [16]:
x_train_id_cat = le.fit_transform(x_train.id.values)
x_test_id_cat = le.transform(x_test.id.values)

In [17]:
x_train_event_name_1 = le.fit_transform(x_train.event_name_1.values)
x_test_event_name_1 = le.transform(x_test.event_name_1.values)

In [18]:
x_train_event_type_1 = le.fit_transform(x_train.event_type_1.values)
x_test_event_type_1 = le.transform(x_test.event_type_1.values)

In [19]:
x_train_event_name_2 = le.fit_transform(x_train.event_name_2.values)
x_test_event_name_2 = le.transform(x_test.event_name_2.values)

In [20]:
x_train_event_type_2 = le.fit_transform(x_train.event_type_2.values)
x_test_event_type_2 = le.transform(x_test.event_type_2.values)

In [21]:
x_train_dept_cat = le.fit_transform(x_train.dept_id.values)
x_test_dept_cat = le.transform(x_test.dept_id.values)

In [22]:
x_train_cat = le.fit_transform(x_train.cat_id.values)
x_test_cat = le.transform(x_test.cat_id.values)

In [23]:
x_train_store_cat = le.fit_transform(x_train.store_id.values)
x_test_store_cat = le.transform(x_test.store_id.values)

In [24]:
x_train_state_cat = le.fit_transform(x_train.state_id.values)
x_test_state_cat = le.transform(x_test.state_id.values)

In [25]:
x_train['item_id'] = x_train_item_cat
x_test['item_id'] = x_test_item_cat

x_train['id'] = x_train_id_cat
x_test['id'] = x_test_id_cat

x_train['dept_id'] = x_train_dept_cat
x_test['dept_id'] = x_test_dept_cat

x_train['cat_id'] = x_train_cat
x_test['cat_id'] = x_test_cat

x_train['store_id'] = x_train_store_cat
x_test['store_id'] = x_test_store_cat

x_train['state_id'] = x_train_state_cat
x_test['state_id'] = x_test_state_cat

In [26]:
x_train['event_name_1'] = x_train_event_name_1
x_test['event_name_1'] = x_test_event_name_1

x_train['event_type_1'] = x_train_event_type_1
x_test['event_type_1'] = x_test_event_type_1

x_train['event_name_2'] = x_train_event_name_2
x_test['event_name_2'] = x_test_event_name_2

x_train['event_type_2'] = x_train_event_type_2
x_test['event_type_2'] = x_test_event_type_2

In [27]:
del x_train_item_cat,x_train_id_cat,x_train_event_name_1,x_train_event_type_1,x_train_event_name_2,x_train_event_type_2,x_train_dept_cat,x_train_cat,x_train_store_cat,x_train_state_cat

In [28]:
del x_test_item_cat,x_test_id_cat,x_test_event_name_1,x_test_event_type_1,x_test_event_name_2,x_test_event_type_2,x_test_dept_cat,x_test_cat,x_test_store_cat,x_test_state_cat

In [29]:
#function to calculate assymmetric rmse,custom metric function
def armse(y_act,y_pred):
    score=0
    n = len(y_act)
    diff = np.array(y_pred) - np.array(y_act)
    for ele in diff:
        if ele<0:
            score += 4*(ele**2)
        else:
            score += ele**2
    return np.sqrt(score/n)

### **Hyperparameter tuning: SGDRegressor**
* Considering SGDRegressor instead of SVR as the fit time complexity is more than quadratic and hard to fit for datasets with more than 20000 samples as per official documentation
* Ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [30]:
from sklearn.linear_model import SGDRegressor
sgd_rtrain = []
sgd_rtest = []
sgd_atrain = []
sgd_atest = []
for i in tqdm(range(2)):
    alpha = random.uniform(0.01,0.1)
    eta = random.uniform(0.001,0.5)
    sr = SGDRegressor(alpha=alpha,eta0=eta,max_iter=1500)
    sr.fit(x_train,y_train)
    y_test_sr = sr.predict(x_test)
    y_train_sr = sr.predict(x_train)
    rmse_train = mse(y_train,y_train_sr,squared=False)
    armse_train = armse(y_train,y_train_sr)    
    rmse_test = mse(y_test,y_test_sr,squared=False)
    armse_test = armse(y_test,y_test_sr)
    sgd_rtrain.extend([rmse_train])
    sgd_atrain.extend([armse_train])
    sgd_rtest.extend([rmse_test])
    sgd_atest.extend([armse_test])
    print('Params: {} {}'.format(alpha,eta))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 50%|█████     | 1/2 [4:12:04<4:12:04, 15124.24s/it]

Params: 0.016412257355770586 0.10437643824585228
Train: RMSE 6.587733983701801e+16 ARMSE 6.587776669851309e+16
Test: RMSE 6.537597784341029e+16 ARMSE 6.537688284043288e+16


100%|██████████| 2/2 [8:34:28<00:00, 15434.37s/it]  

Params: 0.010723422206099462 0.21328178114285148
Train: RMSE 7926531504009859.0 ARMSE 8062262019933451.0
Test: RMSE 7144434813039157.0 ARMSE 7458391190709563.0





1. SGDRegressor is taking a lot of iterations to converge and train time is very high

### **Hyperparameter tuning: DecisionTreeRegressor**

In [31]:
from sklearn.tree import DecisionTreeRegressor
dt_rtrain = []
dt_rtest = []
dt_atrain = []
dt_atest = []
for i in tqdm(range(5)):
    max_depth = random.sample(range(10,15),1)[0]
    min_samples_split = random.sample(range(2,8),1)[0]
    dt = DecisionTreeRegressor(max_depth=max_depth,min_samples_split=min_samples_split,random_state=0)
    dt.fit(x_train,y_train)
    y_train_dt = dt.predict(x_train)
    y_test_dt = dt.predict(x_test)
    rmse_train = mse(y_train,y_train_dt,squared=False)
    armse_train = armse(y_train,y_train_dt)    
    rmse_test = mse(y_test,y_test_dt,squared=False)
    armse_test = armse(y_test,y_test_dt)
    dt_rtrain.extend([rmse_train])
    dt_atrain.extend([armse_train])
    dt_rtest.extend([rmse_test])
    dt_atest.extend([armse_test])
    print('Params: max_depth {} min_samples_split {}'.format(max_depth,min_samples_split))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [05:33<22:15, 333.80s/it]

Params: max_depth 11 min_samples_split 6
Train: RMSE 1.7845744836566988 ARMSE 3.0174652866670773
Test: RMSE 1.7833629637072994 ARMSE 3.0133470743047304


 40%|████      | 2/5 [10:45<16:01, 320.53s/it]

Params: max_depth 10 min_samples_split 4
Train: RMSE 1.804485785509871 ARMSE 3.0543273518981047
Test: RMSE 1.7817758934714554 ARMSE 3.018006362334728


 60%|██████    | 3/5 [16:14<10:49, 324.55s/it]

Params: max_depth 11 min_samples_split 4
Train: RMSE 1.7830075852307998 ARMSE 3.015072410869413
Test: RMSE 1.7827317586354638 ARMSE 3.0129601208499257


 80%|████████  | 4/5 [22:35<05:46, 346.82s/it]

Params: max_depth 14 min_samples_split 2
Train: RMSE 1.7068921116109432 ARMSE 2.880097654270217
Test: RMSE 1.8106680280370904 ARMSE 3.0431061783694107


100%|██████████| 5/5 [28:57<00:00, 347.53s/it]

Params: max_depth 14 min_samples_split 3
Train: RMSE 1.7100263474107393 ARMSE 2.8847465239840866
Test: RMSE 1.810539943950743 ARMSE 3.0394217571016577





In [32]:
dt = DecisionTreeRegressor(max_depth=11,min_samples_split=3,random_state=0)
dt.fit(x_train,y_train)
y_train_dt = dt.predict(x_train)
y_test_dt = dt.predict(x_test)
rmse_train = mse(y_train,y_train_dt,squared=False)
armse_train = armse(y_train,y_train_dt)    
rmse_test = mse(y_test,y_test_dt,squared=False)
armse_test = armse(y_test,y_test_dt)
print('Params: max_depth {} min_samples_split {}'.format(max_depth,min_samples_split))
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Params: max_depth 14 min_samples_split 3
Train: RMSE 1.78111129155709 ARMSE 3.0120124515864624
Test: RMSE 1.7827317586354638 ARMSE 3.0129601208499257


### **Hyperparameter tuning: CatBoostRegressor**

In [33]:
for i in tqdm(range(5)):
    learning_rate = random.uniform(0.1,0.9)
    depth = random.sample(range(2,8),1)[0]
    cat = CatBoostRegressor(learning_rate=learning_rate,depth=depth,logging_level="Silent")
    cat.fit(x_train,y_train)
    y_train_cat = cat.predict(x_train)
    y_test_cat = cat.predict(x_test)
    rmse_train = mse(y_train,y_train_cat,squared=False)
    armse_train = armse(y_train,y_train_cat)    
    rmse_test = mse(y_test,y_test_cat,squared=False)
    armse_test = armse(y_test,y_test_cat)
    print('Params: learning_rate {} depth {}'.format(learning_rate,depth))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [31:07<2:04:31, 1867.93s/it]

Params: learning_rate 0.7412334808788812 depth 2
Train: RMSE 1.8021810495884045 ARMSE 3.059304725517802
Test: RMSE 1.7909334271996944 ARMSE 3.0135418560208636


 40%|████      | 2/5 [1:06:52<1:41:32, 2030.88s/it]

Params: learning_rate 0.7673233598801245 depth 4
Train: RMSE 1.6961734558458312 ARMSE 2.869178139862516
Test: RMSE 1.7663390614592756 ARMSE 2.992966444796189


 60%|██████    | 3/5 [1:51:44<1:17:45, 2332.78s/it]

Params: learning_rate 0.22140661374848813 depth 7
Train: RMSE 1.6495070380426642 ARMSE 2.7931136323697934
Test: RMSE 1.7389308414179723 ARMSE 2.9629930459293656


 80%|████████  | 4/5 [2:37:26<41:34, 2494.01s/it]  

Params: learning_rate 0.8571815579620572 depth 7
Train: RMSE 1.5670409977322248 ARMSE 2.644045537837446
Test: RMSE 1.908649938797318 ARMSE 2.9974419308488076


100%|██████████| 5/5 [3:10:47<00:00, 2289.50s/it]

Params: learning_rate 0.562297701812735 depth 3
Train: RMSE 1.7499802970350515 ARMSE 2.966272349899431
Test: RMSE 1.7631800861681584 ARMSE 2.986061769599048





In [30]:
cat = CatBoostRegressor(learning_rate=0.55,depth=3,logging_level="Silent")
cat.fit(x_train,y_train)
y_train_cat = cat.predict(x_train)
y_test_cat = cat.predict(x_test)
rmse_train = mse(y_train,y_train_cat,squared=False)
armse_train = armse(y_train,y_train_cat)    
rmse_test = mse(y_test,y_test_cat,squared=False)
armse_test = armse(y_test,y_test_cat)
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Train: RMSE 1.751827869842708 ARMSE 2.9703188675722347
Test: RMSE 1.7551337879956594 ARMSE 2.9900729685750433


In [31]:
from sklearn.linear_model import Lasso
for i in tqdm(range(5)):
    alpha = random.uniform(0.005,0.009)
    ls = Lasso(alpha=alpha,random_state=0)
    ls.fit(x_train,y_train)
    y_train_ls = ls.predict(x_train)
    y_test_ls = ls.predict(x_test)
    rmse_train = mse(y_train,y_train_ls,squared=False)
    armse_train = armse(y_train,y_train_ls)    
    rmse_test = mse(y_test,y_test_ls,squared=False)
    armse_test = armse(y_test,y_test_ls)
    print('Params: alpha {} '.format(alpha))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [06:02<24:09, 362.33s/it]

Params: alpha 0.00718355613033409 
Train: RMSE 1.895140900530189 ARMSE 3.2172276790456125
Test: RMSE 1.8054085225615664 ARMSE 3.0782701646682655


 40%|████      | 2/5 [12:07<18:11, 363.77s/it]

Params: alpha 0.006751722246115241 
Train: RMSE 1.895094239394977 ARMSE 3.2168084503064245
Test: RMSE 1.8053309434829417 ARMSE 3.077856927518901


 60%|██████    | 3/5 [17:51<11:49, 354.77s/it]

Params: alpha 0.00859126102141812 
Train: RMSE 1.8953110954064998 ARMSE 3.2186198958564463
Test: RMSE 1.8056850344909963 ARMSE 3.0796427844756256


 80%|████████  | 4/5 [24:06<06:03, 363.08s/it]

Params: alpha 0.005873329712020209 
Train: RMSE 1.8950084609232647 ARMSE 3.2159739053710563
Test: RMSE 1.805180706418586 ARMSE 3.077030553766479


100%|██████████| 5/5 [29:55<00:00, 359.06s/it]

Params: alpha 0.00816446194155392 
Train: RMSE 1.8952577141841063 ARMSE 3.218196763659029
Test: RMSE 1.8055942352989278 ARMSE 3.079222927743614





In [36]:
from sklearn.linear_model import Ridge
for i in tqdm(range(5)):
    alpha = random.uniform(0.1,0.9)
    rg = Ridge(alpha=alpha,random_state=0)
    rg.fit(x_train,y_train)
    y_train_rg = rg.predict(x_train)
    y_test_rg = rg.predict(x_test)
    rmse_train = mse(y_train,y_train_rg,squared=False)
    armse_train = armse(y_train,y_train_rg)    
    rmse_test = mse(y_test,y_test_rg,squared=False)
    armse_test = armse(y_test,y_test_rg)   
    print('Params: alpha {} '.format(alpha))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [00:40<02:43, 40.92s/it]

Params: alpha 0.3956833984551896 
Train: RMSE 1.8946867860908698 ARMSE 3.210782958578813
Test: RMSE 1.8042395400366795 ARMSE 3.0721695614509663


 40%|████      | 2/5 [01:22<02:03, 41.29s/it]

Params: alpha 0.44553481915343807 
Train: RMSE 1.8946867860908678 ARMSE 3.210782959592198
Test: RMSE 1.8042395400792437 ARMSE 3.072169562310709


 60%|██████    | 3/5 [02:04<01:23, 41.62s/it]

Params: alpha 0.8693474462713479 
Train: RMSE 1.8946867860908652 ARMSE 3.210782968204007
Test: RMSE 1.8042395404410987 ARMSE 3.0721695696204288


  overwrite_a=True).T
 80%|████████  | 4/5 [02:45<00:41, 41.31s/it]

Params: alpha 0.2662497227754937 
Train: RMSE 1.8946867860908663 ARMSE 3.2107829559489236
Test: RMSE 1.8042395399261693 ARMSE 3.0721695592185245


100%|██████████| 5/5 [03:26<00:00, 41.27s/it]

Params: alpha 0.8361823960826168 
Train: RMSE 1.8946867860908683 ARMSE 3.2107829675300064
Test: RMSE 1.8042395404127818 ARMSE 3.072169569048396





In [37]:
from sklearn.linear_model import ElasticNet
for i in tqdm(range(5)):
    alpha = random.uniform(0.001,0.1)
    l1 = random.uniform(0.01,0.09)
    es = ElasticNet(alpha=alpha,l1_ratio=l1,random_state=0)
    es.fit(x_train,y_train)
    y_train_es = es.predict(x_train)
    y_test_es = es.predict(x_test)
    rmse_train = mse(y_train,y_train_es,squared=False)
    armse_train = armse(y_train,y_train_es)    
    rmse_test = mse(y_test,y_test_es,squared=False)
    armse_test = armse(y_test,y_test_es)
    print('Params: alpha {} l1_ratio {}'.format(alpha,l1))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [05:14<20:58, 314.67s/it]

Params: alpha 0.08843952275650879 l1_ratio 0.04553986333402141
Train: RMSE 1.9019415376638384 ARMSE 3.257357986576797
Test: RMSE 1.810999765710858 ARMSE 3.1085529762663375


 40%|████      | 2/5 [11:38<17:45, 355.26s/it]

Params: alpha 0.04816350626542985 l1_ratio 0.04748638705329243
Train: RMSE 1.8978566829555845 ARMSE 3.2375959438217743
Test: RMSE 1.8075223394037363 ARMSE 3.093658978324611


 60%|██████    | 3/5 [18:51<13:01, 390.77s/it]

Params: alpha 0.05231500482294257 l1_ratio 0.015885460760238397
Train: RMSE 1.8982003665962248 ARMSE 3.239768625971121
Test: RMSE 1.8076493825357647 ARMSE 3.0950360943659185


 80%|████████  | 4/5 [26:07<06:48, 408.86s/it]

Params: alpha 0.03128961138808438 l1_ratio 0.050329493196139184
Train: RMSE 1.8963312171321756 ARMSE 3.2284739731289616
Test: RMSE 1.8061412702252628 ARMSE 3.08659194053629


100%|██████████| 5/5 [34:39<00:00, 415.81s/it]

Params: alpha 0.01578324417661788 l1_ratio 0.08924703975392977
Train: RMSE 1.89527675180217 ARMSE 3.2203335885170734
Test: RMSE 1.8051010213676857 ARMSE 3.080115044163228





### **Hyperparameter tuning: LinearRegression**

In [31]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(n_jobs=-1)
lr.fit(x_train,y_train)
y_pred_train = lr.predict(x_train)
y_pred_test = lr.predict(x_test)
rmse_train = mse(y_train,y_pred_train,squared=False)
armse_train = armse(y_train,y_pred_train)
rmse_test = mse(y_test,y_pred_test,squared=False)
armse_test = armse(y_test,y_pred_test)
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Train: RMSE 1.8946867865456574 ARMSE 3.2107820071548114
Test: RMSE 1.8042394992902844 ARMSE 3.072168306636584


1. Test error is significantly less compared to train error for Lasso,Ridge,ElasticNet and Linera regression models indicating bias

### **Hyperparameter tuning: Random forest**

In [None]:
for i in tqdm(range(5)):
    n_estimators = random.sample(range(50,100),1)[0]
    max_depth = random.sample(range(10,15),1)[0]
    min_samples_split = random.sample(range(2,8),1)[0]
    rf = RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split,random_state=0,n_jobs=-1)
    rf.fit(x_train,y_train)
    y_train_rf = rf.predict(x_train)
    y_test_rf = rf.predict(x_test)
    rmse_train = mse(y_train,y_train_rf,squared=False)
    armse_train = armse(y_train,y_train_rf)
    rmse_test = mse(y_test,y_test_rf,squared=False)
    armse_test = armse(y_test,y_test_rf)
    print('n_estimators {} max_depth {} min_samples_split{}'.format(n_estimators,max_depth,min_samples_split))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [2:05:54<8:23:36, 7554.20s/it]

n_estimators 89 max_depth 12 min_samples_split5
Train: RMSE 1.739818481119316 ARMSE 2.9447564065379686
Test: RMSE 1.754665108375681 ARMSE 2.9779525920756353


 40%|████      | 2/5 [4:03:46<6:03:32, 7270.96s/it]

n_estimators 81 max_depth 11 min_samples_split5
Train: RMSE 1.7626775616302255 ARMSE 2.9854358197273743
Test: RMSE 1.7578163464211263 ARMSE 2.9854907434536386


 60%|██████    | 3/5 [6:55:56<4:48:55, 8667.50s/it]

n_estimators 98 max_depth 14 min_samples_split2
Train: RMSE 1.681898912797003 ARMSE 2.8420517716217177
Test: RMSE 1.7497975900056884 ARMSE 2.963785901919721


 80%|████████  | 4/5 [8:32:50<2:05:41, 7541.02s/it]

n_estimators 64 max_depth 12 min_samples_split6
Train: RMSE 1.7419309167265904 ARMSE 2.9481936876144617
Test: RMSE 1.754608835395851 ARMSE 2.978621471515222


In [30]:
rf = RandomForestRegressor(n_estimators=64,max_depth=12,min_samples_split=6,random_state=0,n_jobs=-1)
rf.fit(x_train,y_train)
y_train_rf = rf.predict(x_train)
y_test_rf = rf.predict(x_test)
rmse_train = mse(y_train,y_train_rf,squared=False)
armse_train = armse(y_train,y_train_rf)     
rmse_test = mse(y_test,y_test_rf,squared=False)
armse_test = armse(y_test,y_test_rf)
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Train: RMSE 1.7419309167265904 ARMSE 2.9481936876144617
Test: RMSE 1.7546088353958513 ARMSE 2.978621471515222


### **Hyperparameter tuning: XGBoost**

In [30]:
for i in tqdm(range(5)):
    learning_rate = random.uniform(0.001,0.5)
    max_depth = random.sample(range(5,10),1)[0]
    n_estimators = random.sample(range(50,100),1)[0]
    xgb = XGBRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators,n_jobs=-1)
    xgb.fit(x_train,y_train)
    y_train_xgb = xgb.predict(x_train)
    y_test_xgb = xgb.predict(x_test)
    rmse_train = mse(y_train,y_train_xgb,squared=False)
    armse_train = armse(y_train,y_train_xgb)
    rmse_test = mse(y_test,y_test_xgb,squared=False)
    armse_test = armse(y_test,y_test_xgb) 
    print('n_estimators {} max_depth {} learning_rate {}'.format(n_estimators,max_depth,learning_rate))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

  "because it will generate extra copies and increase " +
 20%|██        | 1/5 [1:17:30<5:10:02, 4650.64s/it]

n_estimators 98 max_depth 6 learning_rate 0.3490151547801042
Train: RMSE 1.8136436939239502 ARMSE 3.106635784177288
Test: RMSE 1.8035304546356201 ARMSE 3.0966927337045553


  "because it will generate extra copies and increase " +
 40%|████      | 2/5 [2:06:32<3:02:17, 3645.74s/it]

n_estimators 71 max_depth 5 learning_rate 0.19968007556004053
Train: RMSE 1.8833829164505005 ARMSE 3.2429975927032846
Test: RMSE 1.8042649030685425 ARMSE 3.0757708207232493


  "because it will generate extra copies and increase " +
 60%|██████    | 3/5 [3:34:36<2:26:26, 4393.46s/it]

n_estimators 95 max_depth 7 learning_rate 0.3080987399329117
Train: RMSE 1.7820528745651245 ARMSE 3.0506597306768923
Test: RMSE 1.807724952697754 ARMSE 3.0881879331489936


  "because it will generate extra copies and increase " +
 80%|████████  | 4/5 [4:39:01<1:09:44, 4184.79s/it]

n_estimators 59 max_depth 8 learning_rate 0.49939053351680296
Train: RMSE 1.7330522537231445 ARMSE 2.957930478086835
Test: RMSE 1.8375163078308105 ARMSE 3.163573397079174


  "because it will generate extra copies and increase " +
100%|██████████| 5/5 [5:48:38<00:00, 4183.79s/it]  

n_estimators 56 max_depth 9 learning_rate 0.140999404993131
Train: RMSE 1.783154845237732 ARMSE 3.0569330466998235
Test: RMSE 1.7846338748931885 ARMSE 3.053612327391125





In [30]:
xgb = XGBRegressor(learning_rate=0.140999404993131, max_depth=9, n_estimators=56,n_jobs=-1)
xgb.fit(x_train,y_train)
y_train_xgb = xgb.predict(x_train)
y_test_xgb = xgb.predict(x_test)
rmse_train = mse(y_train,y_train_xgb,squared=False)
armse_train = armse(y_train,y_train_xgb)
rmse_test = mse(y_test,y_test_xgb,squared=False)
armse_test = armse(y_test,y_test_xgb) 
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

  "because it will generate extra copies and increase " +


Train: RMSE 1.783154845237732 ARMSE 3.0569330466998235
Test: RMSE 1.7846338748931885 ARMSE 3.053612327391125


### **Hyperparameter tuning: LGBM**

In [32]:
for i in tqdm(range(5)):
    learning_rate = random.uniform(0.001,0.5)
    max_depth = random.sample(range(5,10),1)[0]
    n_estimators = random.sample(range(50,100),1)[0]
    lgbm = LGBMRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, n_jobs=-1)
    lgbm.fit(x_train,y_train)
    y_train_lgbm = lgbm.predict(x_train)
    y_test_lgbm = lgbm.predict(x_test)
    rmse_train = mse(y_train,y_train_lgbm,squared=False)
    armse_train = armse(y_train,y_train_lgbm)     
    rmse_test = mse(y_test,y_test_lgbm,squared=False)
    armse_test = armse(y_test,y_test_lgbm) 
    print('n_estimators {} max_depth {} learning_rate {}'.format(n_estimators,max_depth,learning_rate))
    print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
    print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

 20%|██        | 1/5 [02:42<10:48, 162.21s/it]

n_estimators 61 max_depth 7 learning_rate 0.3457333321230572
Train: RMSE 1.7568617029348361 ARMSE 2.981913614940773
Test: RMSE 1.7513307099838027 ARMSE 2.9760653139877307


 40%|████      | 2/5 [05:40<08:34, 171.49s/it]

n_estimators 81 max_depth 8 learning_rate 0.4341742062501302
Train: RMSE 1.7300853992413916 ARMSE 2.92963106629136
Test: RMSE 1.7563508963367647 ARMSE 3.0056769765093057


 60%|██████    | 3/5 [08:47<05:57, 178.60s/it]

n_estimators 90 max_depth 8 learning_rate 0.26661528358747144
Train: RMSE 1.7434842718616461 ARMSE 2.958828656192167
Test: RMSE 1.7478131226481162 ARMSE 2.971252548937483


 80%|████████  | 4/5 [12:15<03:10, 190.40s/it]

n_estimators 98 max_depth 7 learning_rate 0.10022727873975876
Train: RMSE 1.7846721007422275 ARMSE 3.0376312270375863
Test: RMSE 1.7504116902142783 ARMSE 2.9809227868109742


100%|██████████| 5/5 [15:24<00:00, 184.90s/it]

n_estimators 68 max_depth 8 learning_rate 0.03694414621655303
Train: RMSE 1.8604932968978833 ARMSE 3.2916972390739128
Test: RMSE 1.7937277489180639 ARMSE 3.1734357884907385





In [34]:
lgbm = LGBMRegressor(learning_rate=0.26661528358747144, max_depth=8, n_estimators=90, n_jobs=-1)
lgbm.fit(x_train,y_train)
y_train_lgbm = lgbm.predict(x_train)
y_test_lgbm = lgbm.predict(x_test)
rmse_train = mse(y_train,y_train_lgbm,squared=False)
armse_train = armse(y_train,y_train_lgbm)
rmse_test = mse(y_test,y_test_lgbm,squared=False)
armse_test = armse(y_test,y_test_lgbm)   
print('Train: RMSE {} ARMSE {}'.format(rmse_train,armse_train))
print('Test: RMSE {} ARMSE {}'.format(rmse_test,armse_test))

Train: RMSE 1.7434842718616461 ARMSE 2.958828656192167
Test: RMSE 1.7478131226481162 ARMSE 2.971252548937483


In [1]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "Train RMSE", "Test RMSE", "Train ARMSE", "Test ARMSE"]
x.add_row(["LGBMRegressor", 1.743, 1.747, 2.958, 2.971])
x.add_row(["XGBRegressor", 1.783, 1.784, 3.056, 3.053])
x.add_row(["CatBoostRegressor", 1.751, 1.755, 2.97, 2.99])
x.add_row(["RandomForestRegressor", 1.741, 1.754, 2.948, 2.978])
x.add_row(["DecisionTreeRegressor", 1.781, 1.782, 3.012, 3.012])
print(x)

+-----------------------+------------+-----------+-------------+------------+
|         Model         | Train RMSE | Test RMSE | Train ARMSE | Test ARMSE |
+-----------------------+------------+-----------+-------------+------------+
|     LGBMRegressor     |   1.743    |   1.747   |    2.958    |   2.971    |
|      XGBRegressor     |   1.783    |   1.784   |    3.056    |   3.053    |
|   CatBoostRegressor   |   1.751    |   1.755   |     2.97    |    2.99    |
| RandomForestRegressor |   1.741    |   1.754   |    2.948    |   2.978    |
| DecisionTreeRegressor |   1.781    |   1.782   |    3.012    |   3.012    |
+-----------------------+------------+-----------+-------------+------------+
