In [16]:
import pandas as pd
from tqdm import tqdm 
from downcast import reduce
import numpy as np

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.naive import NaiveForecaster
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")



# Baseline models

#### Mean Models

In [2]:
sales_evaluation = pd.read_csv(r'dataset/sales_train_evaluation.csv')
sales_evaluation.index = sales_evaluation.id.str.replace('_evaluation', '')
sales_evaluation.drop(['id', 'item_id','dept_id','cat_id','store_id','state_id'], axis=1, inplace=True)
sales_evaluation.columns = [int(i.replace('d_', '')) for i in sales_evaluation.keys()]
sales_evaluation

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1,0,0,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
HOBBIES_1_002_CA_1,0,0,0,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
HOBBIES_1_003_CA_1,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
HOBBIES_1_004_CA_1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
HOBBIES_1_005_CA_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3,0,0,2,2,0,3,1,4,1,0,...,1,0,3,0,1,1,0,0,1,1
FOODS_3_824_WI_3,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,1,0,1,0
FOODS_3_825_WI_3,0,6,0,2,2,4,1,8,5,2,...,0,0,1,2,0,1,0,1,0,2
FOODS_3_826_WI_3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [3]:
def get_rmse_from_forecaster(forecaster):
    store_metric = {}
    for i in tqdm(range(len(sales_evaluation))):
        name = sales_evaluation.iloc[i].name
        time_series = sales_evaluation.iloc[i]
        y_train, y_test = temporal_train_test_split(time_series, test_size=28)
        forecaster.fit(y_train)
        y_pred = forecaster.predict(fh=[i for i in range(1, 29)])
        rmse = mean_squared_error(y_test, y_pred, squared=True)
        store_metric[name] = rmse
    mean_rmse = np.mean(list(store_metric.values()))
    return mean_rmse

In [9]:
mean_model_rmse = get_rmse_from_forecaster(NaiveForecaster(strategy="mean", sp=7))
print('Mean Model RMSE: ', mean_model_rmse)

100%|██████████| 30490/30490 [02:07<00:00, 239.96it/s]

Mean Model RMSE:  6.315840540585761





#### ETS

In [10]:
# Auto ETS
from sktime.forecasting.ets import AutoETS
# Remove warnings
import warnings
warnings.filterwarnings("ignore")


ETS_model_rmse = get_rmse_from_forecaster(AutoETS(auto=False, sp=7, n_jobs=-1))
print('ETS Model RMSE: ', ETS_model_rmse)

100%|██████████| 30490/30490 [08:42<00:00, 58.37it/s]

ETS Model RMSE:  5.039344784664702





#### Expontial Smooting

In [4]:
from sktime.forecasting.exp_smoothing import ExponentialSmoothing

exp_smoothing_model_rmse = get_rmse_from_forecaster(ExponentialSmoothing(trend='add', seasonal='add', sp=7))
print('Exp Smoothing Model RMSE: ', exp_smoothing_model_rmse)

100%|██████████| 30490/30490 [3:10:25<00:00,  2.67it/s]  

Exp Smoothing Model RMSE:  4.873291794654346





In [None]:
results = pd.DataFrame({'Model': ['Mean Model', 'ETS Model', 'Exp Smoothing'], 'RMSE': [mean_model_rmse, ETS_model_rmse, exp_smoothing_model_rmse]})

In [None]:
results

Unnamed: 0,Model,RMSE
0,Mean,6.315841
1,ETS,5.039345
2,Exp Smoothing,4.873292


# Regression Algorithms

In [17]:
sales_validation = pd.read_csv(r'dataset/sales_train_validation.csv')
sales_evaluation = pd.read_csv(r'dataset/sales_train_evaluation.csv')
calendar = pd.read_csv(r'dataset/calendar.csv')
prices = pd.read_csv(r'dataset/sell_prices.csv')
calendar = calendar.fillna('RegularDay')
sales_validation.id = sales_validation.id.str.replace('_validation', '')
sales_evaluation.id = sales_evaluation.id.str.replace('_evaluation', '')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split


# Featurization for train and test

In [19]:
def featurize_train_data(sales_data, calendar, prices):
    melted_sales_validation = pd.melt(sales_data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sales')
    df = pd.merge(melted_sales_validation, calendar, on='d', how='left')
    df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') 
    for i in tqdm(list(range(7,30,7))):
        df['lag_'+str(i)] = df.groupby(['id'])['sales'].shift(i)
        
    for i in tqdm(list(range(7,30,7))):
        df['rolling_mean_'+str(i)] = df.groupby(['id'])['sales'].shift(i).rolling(i).mean()
        df['rolling_std_'+str(i)] = df.groupby(['id'])['sales'].shift(i).rolling(i).std()
    df = df.fillna(0)
    cols = ['event_name_1','event_type_1','event_name_2','event_type_2','id','item_id','dept_id','cat_id','store_id','state_id']
    label_encoders = {}
    for i in tqdm(cols):
        labelencoder=LabelEncoder()
        df[i+'_encoded'] = labelencoder.fit_transform(df[i].astype(str))
        label_encoders[i] = labelencoder
    df.d = df.d.apply(lambda x: x.split('_')[1]).astype(int)

    df = reduce(df)
    
    x = df.drop(['id','item_id','dept_id','cat_id','store_id','state_id','weekday','date','month','year','event_name_1','event_type_1','event_name_2','event_type_2'], axis=1)
    y = df[['d','sales']]
    x_train=x.loc[(x['d']>=1115)&(x['d']<=1885)].copy()
    x_cv=x.loc[(x['d']>1885)].copy()

    y_train=y.loc[(x['d']>=1115)&(y['d']<=1885)].copy()
    y_cv=y.loc[(y['d']>1885)].copy()

    x_train.drop(['d', 'sales'], axis=1, inplace=True)
    x_cv.drop(['d', 'sales'], axis=1, inplace=True)

    y_train.drop(['d'], axis=1, inplace=True)
    y_cv.drop(['d'], axis=1, inplace=True)

    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(x_train)
    x_cv_std = scaler.transform(x_cv)

    return x_train.columns, scaler, label_encoders, x_train_std, x_cv_std, y_train, y_cv

def featurize_test_data(sales_data, calendar, prices, scaler, label_encoders):
    melted_sales_validation = pd.melt(sales_data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sales')
    df = pd.merge(melted_sales_validation, calendar, on='d', how='left')
    df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') 
    for i in tqdm(list(range(7,30,7))):
        df['lag_'+str(i)] = df.groupby(['id'])['sales'].shift(i)
        
    for i in tqdm(list(range(7,30,7))):
        df['rolling_mean_'+str(i)] = df.groupby(['id'])['sales'].shift(i).rolling(i).mean()
        df['rolling_std_'+str(i)] = df.groupby(['id'])['sales'].shift(i).rolling(i).std()
    df = df.fillna(0)
    cols = ['event_name_1','event_type_1','event_name_2','event_type_2','id','item_id','dept_id','cat_id','store_id','state_id']
    for i in tqdm(cols):
        df[i+'_encoded'] = label_encoders[i].transform(df[i].astype(str))
    df.d = df.d.apply(lambda x: x.split('_')[1]).astype(int)

    df = reduce(df)
    
    x = df.drop(['id','item_id','dept_id','cat_id','store_id','state_id','weekday','date','month','year','event_name_1','event_type_1','event_name_2','event_type_2'], axis=1)
    y = df[['d','sales']]
    x_test = x.loc[(x['d']>=1914)].copy()

    y_test = y.loc[(x['d']>=1914)].copy()

    x_test.drop(['d', 'sales'], axis=1, inplace=True)

    y_test.drop(['d'], axis=1, inplace=True)

    x_test_std = scaler.transform(x_test)

    return x_test.columns, x_test_std, y_test

train_features, scaler, label_encoders, x_train_std, x_cv_std, y_train, y_cv = featurize_train_data(sales_validation, calendar, prices)
test_features, x_test_std, y_test = featurize_test_data(sales_evaluation, calendar, prices, scaler, label_encoders)

100%|██████████| 4/4 [00:05<00:00,  1.34s/it]
100%|██████████| 4/4 [00:11<00:00,  2.92s/it]
100%|██████████| 10/10 [00:21<00:00,  2.12s/it]
100%|██████████| 4/4 [00:05<00:00,  1.35s/it]
100%|██████████| 4/4 [00:11<00:00,  2.96s/it]
100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


In [20]:
print(train_features)

Index(['wm_yr_wk', 'wday', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price',
       'lag_7', 'lag_14', 'lag_21', 'lag_28', 'rolling_mean_7',
       'rolling_std_7', 'rolling_mean_14', 'rolling_std_14', 'rolling_mean_21',
       'rolling_std_21', 'rolling_mean_28', 'rolling_std_28',
       'event_name_1_encoded', 'event_type_1_encoded', 'event_name_2_encoded',
       'event_type_2_encoded', 'id_encoded', 'item_id_encoded',
       'dept_id_encoded', 'cat_id_encoded', 'store_id_encoded',
       'state_id_encoded'],
      dtype='object')


# Hyperparameter Tuning

### Linear Regression

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import TweedieRegressor
from lightgbm import LGBMRegressor


In [25]:
def tune_sgd_regressor(x_train_std, y_train, x_cv_std, y_cv, l2_reg):
    train_scores = []
    cv_scores = []
    train_error = []
    cv_error = []
    for l2 in tqdm(l2_reg):
        sgd = SGDRegressor(loss='squared_error', penalty='l2', alpha=l2, random_state=0)
        sgd.fit(x_train_std, y_train)
        train_error_ = mean_squared_error(y_train, np.around(sgd.predict(x_train_std),0), squared=False)
        cv_error_ = mean_squared_error(y_cv, np.around(sgd.predict(x_cv_std),0), squared=False)
        train_error.append(train_error_)
        cv_error.append(cv_error_)
    return train_error, cv_error

C = [ 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 100, 10000]
train_error, cv_error = tune_sgd_regressor(x_train_std, y_train, x_cv_std, y_cv, l2_reg=C)
results_df_linear_regression = pd.DataFrame({'l2_reg':C, 'train_error':train_error, 'cv_error':cv_error})
results_df_linear_regression.sort_values(by='cv_error', ascending=True)

100%|██████████| 11/11 [03:16<00:00, 17.83s/it]


Unnamed: 0,l2_reg,train_error,cv_error
4,0.01,2.446404,2.178166
2,0.0001,2.447,2.178695
0,1e-06,2.446991,2.178708
1,1e-05,2.446991,2.17873
3,0.001,2.446901,2.178846
5,0.1,2.470704,2.206977
6,1.0,2.54564,2.287303
7,10.0,3.064857,2.801523
8,100.0,3.378298,3.123945
9,100.0,3.378298,3.123945


### Gaussian Naive Bayes

In [26]:
def tune_naive_bayes(x_train, y_train, x_cv, y_cv, var_smoothing):
    train_error = []
    cv_error = []
    for var in tqdm(var_smoothing):
        nb = GaussianNB(var_smoothing=var)
        nb.fit(x_train, y_train)
        train_error_ = mean_squared_error(y_train, np.around(nb.predict(x_train),0), squared=False)
        cv_error_ = mean_squared_error(y_cv, np.around(nb.predict(x_cv),0), squared=False)
        train_error.append(train_error_)
        cv_error.append(cv_error_)
    return train_error, cv_error
var_smoothing = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000]
train_error_nb, cv_error_nb = tune_naive_bayes(x_train_std, y_train, x_cv_std, y_cv, var_smoothing)
results_df_nb = pd.DataFrame({'var_smoothing':var_smoothing, 'train_error':train_error_nb, 'cv_error':cv_error_nb})
results_df_nb.sort_values(by='cv_error', ascending=True)

100%|██████████| 9/9 [05:03<00:00, 33.72s/it]


Unnamed: 0,var_smoothing,train_error,cv_error
5,10.0,6.074611,3.249828
6,100.0,3.562972,3.398215
7,1000.0,3.556178,3.398215
8,10000.0,3.556178,3.398215
4,1.0,10.412797,6.447999
3,0.1,13.181332,8.780273
2,0.01,16.99767,12.024037
1,0.001,22.145534,18.799591
0,0.0001,29.829064,28.107998


### Decision Tree Regressor

In [27]:
def tune_decision_tree_regressor(x_train, y_train, x_cv, y_cv, max_depths, min_samples_splits, min_samples_leafs, n_iter):
    train_error = []
    cv_error = []
    max_depths_ = []
    min_samples_splits_ = []
    min_samples_leafs_ = []
    for i in tqdm(range(n_iter)):
        max_depth = np.random.choice(max_depths, 1, replace=True)[0]
        min_samples_split = np.random.choice(min_samples_splits, 1, replace=True)[0]
        min_samples_leaf = np.random.choice(min_samples_leafs, 1, replace=True)[0]
        dt = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=0)
        dt.fit(x_train, y_train)
        train_error_ = mean_squared_error(y_train, np.around(dt.predict(x_train),0), squared=False)
        cv_error_ = mean_squared_error(y_cv, np.around(dt.predict(x_cv),0), squared=False)
        max_depths_.append(max_depth)
        min_samples_splits_.append(min_samples_split)
        min_samples_leafs_.append(min_samples_leaf)
        train_error.append(train_error_)
        cv_error.append(cv_error_)
    return train_error, cv_error, max_depths_, min_samples_splits_, min_samples_leafs_
max_depths = list(range(2, 20))
min_samples_splits = list(range(2, 30))
min_samples_leafs = list(range(2,30))
train_error_dt, cv_error_dt, max_depths_, min_samples_splits_, min_samples_leafs_ = tune_decision_tree_regressor(x_train_std, y_train, x_cv_std, y_cv, max_depths, min_samples_splits, min_samples_leafs, n_iter=20)
results_dt = pd.DataFrame({'max_depth':max_depths_, 'min_samples_split':min_samples_splits_, 'min_samples_leaf':min_samples_leafs_, 'train_error':train_error_dt, 'cv_error':cv_error_dt})
results_dt.sort_values(by='cv_error', ascending=True)

100%|██████████| 20/20 [06:33<00:00, 19.68s/it]


Unnamed: 0,max_depth,min_samples_split,min_samples_leaf,train_error,cv_error
16,11,3,5,2.259054,2.192753
13,10,24,16,2.343793,2.195772
12,9,19,27,2.381579,2.197011
11,8,4,2,2.36153,2.199432
2,11,26,14,2.310163,2.203802
6,11,9,14,2.310163,2.203802
7,6,16,23,2.463273,2.231982
5,5,12,11,2.502923,2.255038
9,16,23,25,2.24775,2.271997
19,19,27,28,2.230523,2.274726


### Tweedie Regressor

In [28]:

def tune_tweedie(x_train, y_train, x_cv, y_cv, alpha, power):
    train_error = []
    cv_error = []
    alpha_ = []
    power_ = []
    for var in tqdm(alpha):
        for p in tqdm(power):
            nb = TweedieRegressor(alpha=var, power=p)
            nb.fit(x_train, y_train)
            train_error_ = mean_squared_error(y_train, np.around(nb.predict(x_train),0), squared=False)
            cv_error_ = mean_squared_error(y_cv, np.around(nb.predict(x_cv),0), squared=False)
            train_error.append(train_error_)
            cv_error.append(cv_error_)
            alpha_.append(var)
            power_.append(p)
    return train_error, cv_error, alpha_, power_

alpha = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
power = [1.1, 1.3, 1.4]
train_error_tweedie, cv_error_tweedie, alpha_, power_ = tune_tweedie(x_train_std, y_train, x_cv_std, y_cv, alpha, power)
results_df_tweedie = pd.DataFrame({'alpha':alpha_, 'power':power_, 'train_error':train_error_tweedie, 'cv_error':cv_error_tweedie})
results_df_tweedie.sort_values(by='cv_error', ascending=True)


100%|██████████| 3/3 [01:11<00:00, 23.79s/it]
100%|██████████| 3/3 [01:06<00:00, 22.00s/it]
100%|██████████| 3/3 [00:43<00:00, 14.35s/it]
100%|██████████| 3/3 [00:24<00:00,  8.17s/it]
100%|██████████| 3/3 [00:13<00:00,  4.63s/it]
100%|██████████| 3/3 [00:12<00:00,  4.32s/it]
100%|██████████| 3/3 [00:14<00:00,  4.86s/it]
100%|██████████| 7/7 [04:06<00:00, 35.20s/it]


Unnamed: 0,alpha,power,train_error,cv_error
9,1.0,1.1,12.950694,2.719709
12,10.0,1.1,10.60614,2.790567
13,10.0,1.3,30.479093,2.794545
14,10.0,1.4,51.387356,2.826686
15,100.0,1.1,3.303286,3.056747
16,100.0,1.3,3.325098,3.078801
17,100.0,1.4,3.332904,3.085991
6,0.1,1.1,9.038981,3.114711
20,1000.0,1.4,3.400491,3.1441
18,1000.0,1.1,3.400119,3.1441


### LightGBM Regressor

In [29]:
def tune_lgbm(x_train, y_train, x_cv, y_cv, n_estimators, max_depths, num_leaves, learning_rates, reg_lambdas, n_iter):
    train_error = []
    cv_error = []
    n_estimators_ = []
    max_depths_ = []
    learning_rates_ = []
    num_leaves_ = []
    reg_lambdas_ = []
    for i in tqdm(range(n_iter)):
        n_estimator = np.random.choice(n_estimators, 1, replace=True)[0]
        max_depth = np.random.choice(max_depths, 1, replace=True)[0]
        learning_rate = np.random.choice(learning_rates, 1, replace=True)[0]
        num_leaf = np.random.choice(num_leaves, 1, replace=True)[0]
        reg_lambda = np.random.choice(reg_lambdas, 1, replace=True)[0]
        lgbm = LGBMRegressor(objective='tweedie', 
                            tweedie_variance_power=1.1,
                            n_estimators=n_estimator, 
                            num_leaves=num_leaf,
                            max_depth=max_depth, 
                            learning_rate=learning_rate, 
                            reg_lambda = reg_lambda,
                            n_jobs=-1,
                            random_state=0)
        lgbm.fit(x_train, y_train)
        train_error_ = mean_squared_error(y_train, np.around(lgbm.predict(x_train),0), squared=False)
        cv_error_ = mean_squared_error(y_cv, np.around(lgbm.predict(x_cv),0), squared=False)

        n_estimators_.append(n_estimator)
        max_depths_.append(max_depth)
        learning_rates_.append(learning_rate)
        num_leaves_.append(num_leaf)
        reg_lambdas_.append(reg_lambda)
        train_error.append(train_error_)
        cv_error.append(cv_error_)
    return train_error, cv_error, n_estimators_, max_depths_, learning_rates_, num_leaves_, reg_lambdas_

# Tune lgbm
n_estimators = [100, 1000]
max_depths = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
num_leaves = [100000]
learning_rates = [0.01, 0.1, 0.3]
reg_lambdas = [0.3, 0.5, 0.7, 0.9]

train_error_lgbm, cv_error_lgbm, n_estimators_, max_depths_, learning_rates_, num_leaves_, reg_lambdas_ = tune_lgbm(x_train_std, y_train, x_cv_std, y_cv, n_estimators, max_depths, num_leaves, learning_rates, reg_lambdas, n_iter = 10)

100%|██████████| 10/10 [18:52<00:00, 113.22s/it]


In [30]:
results_lgbm = pd.DataFrame({'n_estimators':n_estimators_, 'max_depth':max_depths_, 'learning_rate':learning_rates_, 'num_leaves':num_leaves_, 'reg_lambda':reg_lambdas_, 'train_error':train_error_lgbm, 'cv_error':cv_error_lgbm})
results_lgbm.sort_values(by='cv_error', ascending=True)

Unnamed: 0,n_estimators,max_depth,learning_rate,num_leaves,reg_lambda,train_error,cv_error
7,1000,10,0.01,100000,0.3,1.932674,2.096811
0,1000,5,0.01,100000,0.3,2.329517,2.132988
3,100,3,0.3,100000,0.5,2.360733,2.135046
1,100,12,0.1,100000,0.5,1.707952,2.135951
4,100,7,0.3,100000,0.9,2.06261,2.285918
9,1000,7,0.3,100000,0.3,1.498984,2.287496
6,1000,8,0.3,100000,0.5,1.299552,2.32315
2,100,11,0.01,100000,0.3,2.521621,2.380228
5,100,7,0.01,100000,0.5,2.642418,2.398913
8,1000,11,0.3,100000,0.9,0.737117,2.470182


# Result

In [35]:
best_models = pd.DataFrame({
    'Models':['Simple mean model', 'ETS Model', 'Exponential Smoothing', 
                    'linear regression (L2 regularization)', 'Gaussian Naive Bayes', 
                    'DecisionTree regressor', 'Tweedie regressor', 'LightGBM regressor'],
    'RMSE':[6.315841,5.039345, 4.873292, 2.178166, 3.249828, 2.192753,2.719709, 2.096811]})
best_models.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Models,RMSE
7,LightGBM regressor,2.096811
3,linear regression (L2 regularization),2.178166
5,DecisionTree regressor,2.192753
6,Tweedie regressor,2.719709
4,Gaussian Naive Bayes,3.249828
2,Exponential Smoothing,4.873292
1,ETS Model,5.039345
0,Simple mean model,6.315841


In [39]:
best_linear_reg = SGDRegressor(alpha=0.01, penalty='l2', random_state=0)
best_linear_reg.fit(x_train_std, y_train)
print('Train error:', mean_squared_error(y_train, best_linear_reg.predict(x_train_std), squared=False))
print('CV error:', mean_squared_error(y_cv, best_linear_reg.predict(x_cv_std), squared=False))
print('Test error:', mean_squared_error(y_test, best_linear_reg.predict(x_test_std), squared=False))

Train error: 2.429733028188093
CV error: 2.1574791043294232
Test error: 2.2686696080810513


In [40]:
best_decision_tree = DecisionTreeRegressor(max_depth=11, min_samples_split=3, min_samples_leaf=5, random_state=0)
best_decision_tree.fit(x_train_std, y_train)
print('Train error:', mean_squared_error(y_train, best_decision_tree.predict(x_train_std), squared=False))
print('CV error:', mean_squared_error(y_cv, best_decision_tree.predict(x_cv_std), squared=False))
print('Test error:', mean_squared_error(y_test, best_decision_tree.predict(x_test_std), squared=False))

Train error: 2.241267230273972
CV error: 2.1724272380972334
Test error: 2.3791142037350785


In [41]:
best_lightgbm = LGBMRegressor(objective='tweedie', tweedie_variance_power=1.1, n_estimators=1000, num_leaves=100000, max_depth=10, learning_rate=0.01, reg_lambda = 0.3, n_jobs=-1, random_state=0) 
best_lightgbm.fit(x_train_std, y_train)
print('Train error:', mean_squared_error(y_train, best_lightgbm.predict(x_train_std), squared=False))
print('CV error:', mean_squared_error(y_cv, best_lightgbm.predict(x_cv_std), squared=False))
print('Test error:', mean_squared_error(y_test, best_lightgbm.predict(x_test_std), squared=False)) 

Train error: 1.9126752394933386
CV error: 2.0743967552440843
Test error: 2.20561306979029


In [43]:
# Pandas table of best models
best_models = pd.DataFrame({
    'Models':['Linear Regression with L2 Regularization', 'DecisionTree regressor', 'LightGBM regressor'],
    'Train RMSE':[2.429733028188093,2.241267230273972, 1.9126752394933386],
    'CV RMSE':[2.1574791043294232, 2.1724272380972334, 2.0743967552440843],
    'Test RMSE':[2.2686696080810513, 2.3791142037350785, 2.20561306979029]})
best_models.sort_values(by='Test RMSE', ascending=True)

Unnamed: 0,Models,Train RMSE,CV RMSE,Test RMSE
2,LightGBM regressor,1.912675,2.074397,2.205613
0,Linear Regression with L2 Regularization,2.429733,2.157479,2.26867
1,DecisionTree regressor,2.241267,2.172427,2.379114
