# Other Decision Tree Ensembles
Until now, only a random forest has been used to predict future rentals. In this notebook, other learners (histogram-based gradient boosting trees (HGBT), XGBoost and LightGBM) are evaluated to see whether they make better predictions.

In [1]:
# Reading the dataset
import os
from pathlib import Path

import pandas as pd

# Location of the prepared rentals table. Set the RENTALS_CSV environment variable to
# run the notebook on another machine; the fallback is the original local path.
RENTALS_CSV = Path(os.environ.get(
    "RENTALS_CSV",
    r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_working_with_subway_stations.csv",
))
selected_rentals = pd.read_csv(RENTALS_CSV)

# Show the available columns as the cell output
selected_rentals.columns

Index(['#_rentals', 'datetime', 'year', 'month', 'day', 'hour', 'ID',
       'coordinates', '#_rentals_lag_1', 'name_of_day', 'weekend',
       'is_holiday', '#_rentals_lag_2', 'prev_day', 'prev_week', 'roll_avg',
       'w_avg_lag_1', 'w_avg_prev_day', 'w_avg_roll_avg', 'w_avg_lag_2',
       'temp', 'rhum', 'prcp', 'wspd', 'rush_hr', 'MapID', 'coco',
       'sb_st_800'],
      dtype='object')

## Creating Training, Test and Validation Sets

In [2]:
# Month/day masks reused by every split below.
# Only month and day are filtered on; the year column is not part of the selection.
month_col = selected_rentals.month
day_col = selected_rentals.day
in_feb = month_col == 2
in_mar = month_col == 3
in_apr = month_col == 4

# March experiment: train on 5 Feb - 17 Mar, validate on 18 - 24 Mar, test on 25 Mar onwards
train_set_mar = selected_rentals[(in_mar & (day_col < 18)) | (in_feb & (day_col >= 5))]
val_set_mar = selected_rentals[in_mar & (day_col < 25) & (day_col >= 18)]
test_set_mar = selected_rentals[in_mar & (day_col >= 25)]

# April experiment: train on 6 Mar - 16 Apr, validate on 17 - 23 Apr, test on 24 Apr onwards
train_set_apr = selected_rentals[(in_apr & (day_col < 17)) | (in_mar & (day_col >= 6))]
val_set_apr = selected_rentals[in_apr & (day_col < 24) & (day_col >= 17)]
test_set_apr = selected_rentals[in_apr & (day_col >= 24)]

## Selecting Features 

In [3]:
# Columns used for modelling: the target "#_rentals" first, then the features
model_columns = ["#_rentals", "month", "day", "hour", "ID", "#_rentals_lag_1", "MapID", "temp", "rhum", "prcp", "wspd", "coco", "name_of_day", "weekend", 'w_avg_lag_1', "roll_avg"]
# Categorical columns that are one-hot encoded
dummy_columns = ["ID", "MapID", "name_of_day"]

# creating dummies for the categorical features of the training sets
train_set_mar_dum = pd.get_dummies(train_set_mar[model_columns], columns=dummy_columns, drop_first=False)
train_set_apr_dum = pd.get_dummies(train_set_apr[model_columns], columns=dummy_columns, drop_first=False)

# Repeating for the test set. get_dummies only creates columns for categories present in
# the frame it is given, so each test frame is re-indexed to its training frame's columns:
# categories absent from the test window become all-zero columns, categories unseen in
# training are dropped, and the column order matches what the model is fitted on.
test_set_mar_dum = (
    pd.get_dummies(test_set_mar[model_columns], columns=dummy_columns, drop_first=False)
    .reindex(columns=train_set_mar_dum.columns, fill_value=0)
)
test_set_apr_dum = (
    pd.get_dummies(test_set_apr[model_columns], columns=dummy_columns, drop_first=False)
    .reindex(columns=train_set_apr_dum.columns, fill_value=0)
)

## Separating X and y

In [4]:
# Target column; it is the first column of every *_dum frame, everything else is a feature
target = "#_rentals"

# March: features / target for the training and test windows
X_train_mar, y_train_mar = train_set_mar_dum.drop(columns=target), train_set_mar_dum[target]
X_test_mar, y_test_mar = test_set_mar_dum.drop(columns=target), test_set_mar_dum[target]

# April: features / target for the training and test windows
X_train_apr, y_train_apr = train_set_apr_dum.drop(columns=target), train_set_apr_dum[target]
X_test_apr, y_test_apr = test_set_apr_dum.drop(columns=target), test_set_apr_dum[target]

## HGBT Learner - Default Hyperparameters
Works with minimal tuning, default parameters tend to work well.<br>
[Reference 1](https://www.restack.io/p/histgradientboostingregressor-vs-xgboost-answer) 
[Reference 2](https://machinelearningmastery.com/histogram-based-gradient-boosting-ensembles/)

In [6]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# March: fit HGBT with default hyperparameters, then predict the test window
model_mar = HistGradientBoostingRegressor(random_state=2).fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: same procedure
model_apr = HistGradientBoostingRegressor(random_state=2).fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Scores: one line per month, printed as "MSE R2"
for y_true, y_pred in ((y_test_mar, y_test_pred_mar), (y_test_apr, y_test_pred_apr)):
    print(mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred))

3.001851713985223 0.5564852728582502
4.487234867745212 0.5691226383591312


Provides slightly better performance compared to Random Forest. Average MSE = 3.74, average r2 score = 0.56

## Tuning - HGBT
The most influential hyperparameters are found to be *learning rate* and *max iterations*. Changing these values might influence learner performance.

In [14]:
# PREPARING VALIDATION SET

# Same column selection and one-hot encoding as used for the training and test frames
val_columns = ["#_rentals", "month", "day", "hour", "ID", "#_rentals_lag_1", "MapID", "temp", "rhum", "prcp", "wspd", "coco", "name_of_day", "weekend", 'w_avg_lag_1', "roll_avg"]
val_dummy_columns = ["ID", "MapID", "name_of_day"]

# Creating dummies. get_dummies only creates columns for categories present in the frame
# it is given, so each validation frame is re-indexed to its training frame's columns to
# guarantee the same feature layout (missing categories -> all-zero columns).
val_set_mar_dum = (
    pd.get_dummies(val_set_mar[val_columns], columns=val_dummy_columns, drop_first=False)
    .reindex(columns=train_set_mar_dum.columns, fill_value=0)
)
val_set_apr_dum = (
    pd.get_dummies(val_set_apr[val_columns], columns=val_dummy_columns, drop_first=False)
    .reindex(columns=train_set_apr_dum.columns, fill_value=0)
)

# Separating features from the target: March
X_val_mar = val_set_mar_dum[list(val_set_mar_dum.columns)[1:]]
y_val_mar = val_set_mar_dum["#_rentals"]

# Separating features from the target: April
X_val_apr = val_set_apr_dum[list(val_set_apr_dum.columns)[1:]]
y_val_apr = val_set_apr_dum["#_rentals"]

In [12]:
# Defining the hyperparameter space - attempt 1

import numpy as np

# The learning rates are listed explicitly: np.arange(0.06, 0.08, 0.01) only contains the
# end point because of float rounding and yields values such as 0.07999999999999999.
param_dist = {
    'learning_rate': [0.06, 0.07, 0.08],
    'max_iter': np.arange(100,400,100)
}

In [13]:
# Setting up the randomised search - attempt 1 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=25,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [14]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



Best hyperparameters found: {'max_iter': 300, 'learning_rate': 0.07999999999999999}


In [16]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2387339517335265 0.42371965616544316


In [17]:
# Search space - attempt 2
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
# NOTE(review): the learning-rate range runs up to ~1.0 - confirm 1.02 is not a typo for 0.12.
param_dist = dict(
    learning_rate=np.arange(0.08,1.02,0.01),
    max_iter=np.arange(100,400,100),
)

In [18]:
# Setting up the randomised search - attempt 2 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=25,                          # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [19]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'max_iter': 100, 'learning_rate': 0.10999999999999999}


In [20]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.280057012809935 0.41308254235074926


Performance on validation is worse, compared to the previous attempt.

In [21]:
# Search space - attempt 3: smaller learning rates combined with more boosting iterations
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.01,0.06,0.01),
    max_iter=np.arange(400,800,100),
)

In [22]:
# Setting up the randomised search - attempt 3 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=25,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [23]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



Best hyperparameters found: {'max_iter': 400, 'learning_rate': 0.05}


In [24]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2607087157418033 0.4180630552332213


Performance compared to the 1st attempt is poorer.

In [26]:
# Search space - attempt 4: finer learning-rate steps between 0.05 and 0.07
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.05,0.07,0.005),
    max_iter=np.arange(100,400,100),
)

In [27]:
# Setting up the randomised search - attempt 4 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=25,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [28]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



Best hyperparameters found: {'max_iter': 300, 'learning_rate': 0.05}


In [29]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2607087157418033 0.4180630552332213


In [30]:
# Search space - attempt 5: learning rates between 0.07 and 0.09, 300 or 350 iterations
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.07,0.09,0.005),
    max_iter=np.arange(300,400,50),
)

In [31]:
# Setting up the randomised search - attempt 5 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=25,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [32]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



Best hyperparameters found: {'max_iter': 300, 'learning_rate': 0.08000000000000002}


In [33]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2387339517335274 0.42371965616544305


Hence, these are the best hyperparameter values. Best performance on validation set.

## Performance on test set - tuned HGBT

In [15]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# March: fit HGBT with the tuned learning rate / iteration count, then predict the test window
model_mar = HistGradientBoostingRegressor(learning_rate = 0.08, max_iter = 300, random_state=2).fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: same hyperparameters (they were tuned on March only)
model_apr = HistGradientBoostingRegressor(learning_rate = 0.08, max_iter = 300, random_state=2).fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Scores: one line per month, printed as "MSE R2"
for y_true, y_pred in ((y_test_mar, y_test_pred_mar), (y_test_apr, y_test_pred_apr)):
    print(mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred))

2.9831656805109272 0.5592460791296272
4.473480913728476 0.5704433330839516


Performance improved slightly after tuning. Avg MSE = 3.73; avg r2 score = 0.565

## LightGBM - Default Hyperparameters
Learner can be more sensitive to tuning!

In [36]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# March: fit LightGBM with default hyperparameters, then predict the test window
model_mar = LGBMRegressor(random_state=2).fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: same procedure
model_apr = LGBMRegressor(random_state=2).fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Scores: one line per month, printed as "MSE R2"
for y_true, y_pred in ((y_test_mar, y_test_pred_mar), (y_test_apr, y_test_pred_apr)):
    print(mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1113
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.674524
2.9933826640152525 0.5577365499846594
4.513212151108916 0.5666282239484198


Performance is poorer compared to tuned HGBT, but better than RF.

## LightGBM - Tuning
It is sensitive to overfitting. [Hyperparameters to prioritise](https://medium.com/@sarahzouinina/a-deep-dive-into-lightgbm-how-to-choose-and-tune-parameters-7c584945842e)

In [41]:
# LightGBM search space - attempt 1: learning rate and number of leaves
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.06,0.10,0.01),
    num_leaves=np.arange(20,40,5),
)

In [42]:
# Setting up the randomised search - LightGBM attempt 1 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=30,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [43]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 35, 'learning_rate': 0.08999999999999998}


In [44]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.449567653126401 0.36944821500714176


In [45]:
# LightGBM search space - attempt 2: higher learning rates, more leaves
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.08,0.13,0.01),
    num_leaves=np.arange(25,50,5),
)

In [46]:
# Setting up the randomised search - LightGBM attempt 2 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=40,                          # upper bound on the sampled combinations
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [47]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 35, 'learning_rate': 0.11999999999999998}


In [48]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.3210483188524473 0.4025308267607326


Performance has improved.

In [50]:
# LightGBM search space - attempt 3: fine learning-rate grid, even leaf counts from 24 to 44
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.1,0.14,0.001),
    num_leaves=np.arange(24,46,2),
)

In [54]:
# Setting up the randomised search - LightGBM attempt 3 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=100,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [55]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 40, 'learning_rate': 0.10200000000000001}


In [56]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.3737965639740066 0.38895271632390127


Performance is slightly poorer than in the previous attempt; it's likely that the trees are overfitting.

In [58]:
# LightGBM search space - attempt 4: adds max_depth to the tuned parameters
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    learning_rate=np.arange(0.1,0.13,0.001),
    num_leaves=np.arange(27,51,3),
    max_depth=np.arange(6,16,2),
)

In [59]:
# Setting up the randomised search - LightGBM attempt 4 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=100,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [60]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 39, 'max_depth': 14, 'learning_rate': 0.11600000000000002}


In [61]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.3057229119686045 0.4064757933976063


In [63]:
# LightGBM search space - attempt 5: adds n_estimators and widens the leaf/depth ranges
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    n_estimators=np.arange(100,500,50),
    learning_rate=np.arange(0.08,0.13,0.001),
    num_leaves=np.arange(10,100,5),
    max_depth=np.arange(3,13,2),
)

In [64]:
# Setting up the randomised search - LightGBM attempt 5 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=500,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [65]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 20, 'n_estimators': 250, 'max_depth': 11, 'learning_rate': 0.12600000000000006}


In [66]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2653823566137135 0.41685999697498644


Best performance so far. Performance can be further increased.

In [70]:
# LightGBM search space - attempt 6: narrower ranges around the attempt-5 optimum
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    n_estimators=np.arange(150,300,25),
    learning_rate=np.arange(0.09,0.13,0.005),
    num_leaves=np.arange(15,40,5),
    max_depth=np.arange(8,12,2),
)

In [71]:
# Setting up the randomised search - LightGBM attempt 6 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=200,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [72]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'num_leaves': 25, 'n_estimators': 250, 'max_depth': 8, 'learning_rate': 0.095}


In [73]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.283472350296444 0.4122033883719961


Performance has decreased slightly

In [74]:
# LightGBM search space - attempt 7: adds reg_alpha / reg_lambda and much larger max_depth values
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    n_estimators=np.arange(240,260,10),
    learning_rate=np.arange(0.09,0.14,0.01),
    num_leaves=np.arange(20,60,5),
    max_depth=[20,25,35,40,55,75],
    reg_alpha=[0.1, 0.2, 0.3],
    reg_lambda=[0.1, 0.3, 0.5],
)

In [75]:
# Setting up the randomised search - LightGBM attempt 7 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=250,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [76]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'reg_lambda': 0.5, 'reg_alpha': 0.1, 'num_leaves': 30, 'n_estimators': 250, 'max_depth': 35, 'learning_rate': 0.09999999999999999}


In [77]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.265677415085941 0.4167840449407151


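Performance is on par with the best result so far (MSE 2.2657 here vs 2.2654 in attempt 5) and better than attempt 6. This suggests that, earlier, the trees were overfitting.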

In [79]:
# LightGBM search space - attempt 8: more estimators, wider learning-rate range, reg_lambda only
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    n_estimators=np.arange(240,380,10),
    learning_rate=np.arange(0.05,0.15,0.01),
    num_leaves=np.arange(20,40,5),
    max_depth=np.arange(5,35,5),
    reg_lambda=[0.4, 0.5, 0.6, 0.7],
)

In [80]:
# Setting up the randomised search - LightGBM attempt 8 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=500,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [81]:
# Run the search on the March training data (tuning only; the initial model is not re-trained)
# and report the winning hyperparameter combination
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'reg_lambda': 0.5, 'num_leaves': 30, 'n_estimators': 340, 'max_depth': 25, 'learning_rate': 0.1}


In [82]:
# Predict the March validation window with the best estimator from the search
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation scores, printed as "MSE R2"
val_mse = mean_squared_error(y_val_mar, y_val_pred_mar)
val_r2 = r2_score(y_val_mar, y_val_pred_mar)
print(val_mse, val_r2)

2.2341095046611623 0.4249100512755202


Best performance so far. Better than tuned HGBT.

In [84]:
# LightGBM search space - attempt 9: fine grid around the attempt-8 optimum
# NOTE(review): np.arange with a float step may include or drop the end point through rounding.
param_dist = dict(
    n_estimators=np.arange(310,370,5),
    learning_rate=np.arange(0.09,0.12,0.005),
    num_leaves=np.arange(30,40,1),
    max_depth=np.arange(25,35,2),
    reg_lambda=[0.5, 0.6, 0.7],
)

In [85]:
# Setting up the randomised search - LightGBM attempt 9 (March only)
from sklearn.model_selection import RandomizedSearchCV

# Per the scikit-learn docs the search fits clones of model_mar, so its earlier fit is not reused.
# NOTE(review): cv=None does not switch cross-validation off - scikit-learn falls back to
# its default 5-fold split, which ignores the temporal ordering. Confirm this is intended.
search_settings = dict(
    estimator=model_mar,
    param_distributions=param_dist,
    n_iter=200,                         # number of random combinations to try
    cv=None,
    scoring='neg_mean_squared_error',
    n_jobs=-1,                          # use all CPU cores
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)

In [86]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'reg_lambda': 0.6, 'num_leaves': 32, 'n_estimators': 335, 'max_depth': 33, 'learning_rate': 0.09}


In [87]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.298222324686777 0.40840654582774016


In [88]:
# Hyperparameter search space - attempt 10

param_dist = dict(
    n_estimators=[330, 335, 340, 345],
    learning_rate=[0.09, 0.10, 0.11, 0.12],
    num_leaves=np.arange(30, 40),
    max_depth=np.arange(16, 36, 2),
    reg_lambda=[0.5, 0.6],
)

In [89]:
# Setting up the randomised search for the March model - attempt 10
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=150,                        # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [90]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'reg_lambda': 0.6, 'num_leaves': 32, 'n_estimators': 345, 'max_depth': 32, 'learning_rate': 0.09}


In [91]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.285234883532066 0.41174968852157046


In [92]:
# Hyperparameter search space - attempt 11

param_dist = dict(
    n_estimators=[345, 350, 355, 360, 365],
    learning_rate=[0.08, 0.09, 0.10, 0.11],
    num_leaves=np.arange(32, 42, 2),
    max_depth=np.arange(20, 40, 2),
    reg_lambda=[0.6, 0.7, 0.8],
)

In [93]:
# Setting up the randomised search for the March model - attempt 11
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=200,                        # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [94]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
Best hyperparameters found: {'reg_lambda': 0.6, 'num_leaves': 32, 'n_estimators': 345, 'max_depth': 24, 'learning_rate': 0.09}


In [95]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.285234883532066 0.41174968852157046


In [100]:
# Hyperparameter search space - attempt 12 (shallow trees, many leaves)
# NOTE(review): a tree of depth d has at most 2**d leaves, so with
# max_depth 6 or 8 most of these num_leaves values cannot be reached -
# confirm that this range is intended.

param_dist = dict(
    n_estimators=[345, 350, 355, 360, 365],
    learning_rate=[0.08, 0.09, 0.10, 0.11],
    num_leaves=np.arange(60, 240, 40),
    max_depth=np.arange(6, 10, 2),
    reg_lambda=[0.5, 0.6],
)

In [101]:
# Setting up the randomised search for the March model - attempt 12
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=100,                        # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [102]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037




Best hyperparameters found: {'reg_lambda': 0.6, 'num_leaves': 140, 'n_estimators': 365, 'max_depth': 6, 'learning_rate': 0.08}


In [103]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.2228341933614137 0.42781246862959177


Best performance so far!

In [104]:
# Hyperparameter search space - attempt 13
# NOTE(review): a tree of depth d has at most 2**d leaves, so with
# max_depth 3 or 6 none of these num_leaves values (120+) can be reached -
# confirm that this range is intended.

param_dist = dict(
    n_estimators=np.arange(360, 400, 10),
    learning_rate=[0.05, 0.06, 0.07, 0.08],
    num_leaves=np.arange(120, 420, 30),
    max_depth=np.arange(3, 9, 3),
    reg_lambda=[0.6, 0.7, 0.8],
)

In [105]:
# Setting up the randomised search for the March model - attempt 13
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=80,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [106]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037




Best hyperparameters found: {'reg_lambda': 0.8, 'num_leaves': 330, 'n_estimators': 380, 'max_depth': 6, 'learning_rate': 0.08}


In [107]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.259378091661738 0.4184055758358952


In [108]:
# Hyperparameter search space - attempt 14
# NOTE(review): a tree of depth d has at most 2**d leaves, so for
# max_depth 6 these num_leaves values (320+) cannot be reached -
# confirm that this range is intended.

param_dist = dict(
    n_estimators=np.arange(360, 420, 10),
    learning_rate=[0.07, 0.08, 0.09],
    num_leaves=np.arange(320, 400, 20),
    max_depth=np.arange(6, 15, 3),
    reg_lambda=[0.8, 0.9, 1.0],
)

In [109]:
# Setting up the randomised search for the March model - attempt 14
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=100,                        # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [110]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037




Best hyperparameters found: {'reg_lambda': 0.8, 'num_leaves': 320, 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.08}


In [111]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.2536222730155586 0.4198872012634729


## Testing performance on test set - Tuned Model

In [83]:
# Train the tuned LightGBM configuration (best combination of search
# attempt 8) on the March training window
model_mar = LGBMRegressor(
    n_estimators=340,
    learning_rate=0.1,
    num_leaves=30,
    max_depth=25,
    reg_lambda=0.5,
    random_state=2,
)
model_mar.fit(X_train_mar, y_train_mar)

# Same configuration, trained on the April training window
model_apr = LGBMRegressor(
    n_estimators=340,
    learning_rate=0.1,
    num_leaves=30,
    max_depth=25,
    reg_lambda=0.5,
    random_state=2,
)
model_apr.fit(X_train_apr, y_train_apr)

# Predictions for the held-out test periods
y_test_pred_mar = model_mar.predict(X_test_mar)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test MSE and R^2: March first, then April
print(
    mean_squared_error(y_test_mar, y_test_pred_mar),
    r2_score(y_test_mar, y_test_pred_mar),
)
print(
    mean_squared_error(y_test_apr, y_test_pred_apr),
    r2_score(y_test_apr, y_test_pred_apr),
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1113
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.674524
2.916589988060225 0.5690824411104614
4.447067877573333 0.5729795897468821


In [112]:
# Train the alternative tuned LightGBM configuration (best combination of
# search attempt 12) on the March training window
model_mar = LGBMRegressor(
    n_estimators=365,
    learning_rate=0.08,
    num_leaves=140,
    max_depth=6,
    reg_lambda=0.6,
    random_state=2,
)
model_mar.fit(X_train_mar, y_train_mar)

# Same configuration, trained on the April training window
model_apr = LGBMRegressor(
    n_estimators=365,
    learning_rate=0.08,
    num_leaves=140,
    max_depth=6,
    reg_lambda=0.6,
    random_state=2,
)
model_apr.fit(X_train_apr, y_train_apr)

# Predictions for the held-out test periods
y_test_pred_mar = model_mar.predict(X_test_mar)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test MSE and R^2: March first, then April
print(
    mean_squared_error(y_test_mar, y_test_pred_mar),
    r2_score(y_test_mar, y_test_pred_mar),
)
print(
    mean_squared_error(y_test_apr, y_test_pred_apr),
    r2_score(y_test_apr, y_test_pred_apr),
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.451037






[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1113
[LightGBM] [Info] Number of data points in the train set: 58800, number of used features: 227
[LightGBM] [Info] Start training from score 1.674524






2.989223615422197 0.5583510371338161
4.513225158768416 0.5666269749150683


The upper (first) configuration provides superior performance on the test set for both March and April.

## Are these MSE values good enough?
This can be compared to the case where only the mean is predicted.

In [125]:
# Baseline: predict one constant (the mean of the targets) for every row.
# NOTE(review): the mean is taken from the test targets themselves, which is
# why R^2 comes out as exactly 0 - confirm that the training mean was not
# intended instead.
y_test_pred_mar = pd.Series(np.full(len(y_test_mar), np.mean(y_test_mar)))
y_test_pred_apr = pd.Series(np.full(len(y_test_apr), np.mean(y_test_apr)))

# Test MSE and R^2 of the baseline: March first, then April
print(
    mean_squared_error(y_test_mar, y_test_pred_mar),
    r2_score(y_test_mar, y_test_pred_mar),
)
print(
    mean_squared_error(y_test_apr, y_test_pred_apr),
    r2_score(y_test_apr, y_test_pred_apr),
)

6.768324770928779 0.0
10.414181080799667 0.0


The MSE scores of this mean-only baseline are much higher than those of the tuned models.

## XGboost - Default Hyperparameters

In [51]:
# XGBoost with default hyperparameters, one model per month
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Train on the March training window
model_mar = xgb.XGBRegressor(objective='reg:squarederror', random_state=2)
model_mar.fit(X_train_mar, y_train_mar)

# Train on the April training window
model_apr = xgb.XGBRegressor(objective='reg:squarederror', random_state=2)
model_apr.fit(X_train_apr, y_train_apr)

# Predictions for the held-out test periods
y_test_pred_mar = model_mar.predict(X_test_mar)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test MSE and R^2: March first, then April
print(
    mean_squared_error(y_test_mar, y_test_pred_mar),
    r2_score(y_test_mar, y_test_pred_mar),
)
print(
    mean_squared_error(y_test_apr, y_test_pred_apr),
    r2_score(y_test_apr, y_test_pred_apr),
)

3.1199026678719535 0.539043664932251
4.845478372784829 0.5347230434417725


With default hyperparameters, the performance of XGboost is poorer than that of LGBM (default hyperparameters) and HGBT (default hyperparameters). However, it's better than RF.

## XGboost - Tuning
It's very sensitive to hyperparameter tuning.

In [10]:
# Defining the hyperparameter space - attempt 1
import numpy as np

# np.arange with a float step carries floating-point noise; np.round cleans
# the values without changing how many candidates there are.
param_dist = {
    'n_estimators': np.arange(100, 300, 50),
    'learning_rate': np.round(np.arange(0.07, 0.14, 0.01), 2),
    'max_depth': np.arange(3, 15, 3),
    'subsample': np.round(np.arange(0.6, 1.0, 0.1), 2),
    'reg_lambda': [0.1, 0.3, 0.5, 0.7],
}

In [11]:
# Setting up the randomised search for the March model - attempt 1
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=75,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [12]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6, 'reg_lambda': 0.7, 'n_estimators': 250, 'max_depth': 6, 'learning_rate': 0.07}


In [15]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.381520334644271 0.38696444034576416


Performance appears to be very good on the first attempt.

In [16]:
# Defining the hyperparameter space - attempt 2

# np.arange with a float step carries floating-point noise (e.g. a subsample
# of 0.6000000000000001); np.round cleans the values without changing how many
# candidates there are.
param_dist = {
    'n_estimators': np.arange(150, 400, 50),
    'learning_rate': np.round(np.arange(0.05, 0.12, 0.01), 2),
    'max_depth': np.arange(4, 12, 2),
    'subsample': np.round(np.arange(0.3, 0.7, 0.1), 2),
    'reg_lambda': [0.4, 0.6, 0.7, 0.8],
}

In [17]:
# Setting up the randomised search for the March model - attempt 2
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=60,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [18]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6000000000000001, 'reg_lambda': 0.4, 'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.05}


In [19]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.4090554505767217 0.3798765540122986


Performance reduced. Attempt 1 was better.

In [20]:
# Defining the hyperparameter space - attempt 3

# np.arange with a float step carries floating-point noise (e.g. a subsample
# of 0.6500000000000001); np.round cleans the values without changing how many
# candidates there are.
param_dist = {
    'n_estimators': np.arange(170, 350, 30),
    'learning_rate': np.round(np.arange(0.04, 0.1, 0.02), 2),
    'max_depth': np.arange(5, 11, 2),
    'subsample': np.round(np.arange(0.5, 0.7, 0.05), 2),
    'reg_lambda': np.round(np.arange(0.3, 0.7, 0.2), 1),
}

In [21]:
# Setting up the randomised search for the March model - attempt 3
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=60,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [22]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6500000000000001, 'reg_lambda': 0.3, 'n_estimators': 260, 'max_depth': 7, 'learning_rate': 0.04}


In [23]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.4177641600910236 0.37763482332229614


In [24]:
# Defining the hyperparameter space - attempt 4

# np.arange with a float step carries floating-point noise (e.g. a learning
# rate of 0.049999999999999996); np.round cleans the values without changing
# how many candidates there are.
param_dist = {
    'n_estimators': np.arange(250, 450, 50),
    'learning_rate': np.round(np.arange(0.01, 0.07, 0.02), 2),
    'max_depth': np.arange(4, 8, 1),
    'subsample': np.round(np.arange(0.6, 0.8, 0.05), 2),
    'reg_lambda': np.round(np.arange(0.8, 1.4, 0.2), 1),
}

In [25]:
# Setting up the randomised search for the March model - attempt 4
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=60,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [26]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.65, 'reg_lambda': 1.2, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.049999999999999996}


In [27]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.336713119972192 0.3984984755516052


Best performance so far.

In [28]:
# Defining the hyperparameter space - attempt 5

# np.arange with a float step carries floating-point noise (e.g. a subsample
# of 0.6000000000000001); np.round cleans the values without changing how many
# candidates there are.
param_dist = {
    'n_estimators': np.arange(220, 360, 20),
    'learning_rate': np.round(np.arange(0.04, 0.08, 0.01), 2),
    'max_depth': np.arange(5, 9, 1),
    'subsample': np.round(np.arange(0.55, 0.65, 0.05), 2),
    'reg_lambda': np.round(np.arange(0.8, 1.8, 0.2), 1),
}

In [29]:
# Setting up the randomised search for the March model - attempt 5
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=60,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [30]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6000000000000001, 'reg_lambda': 1.4, 'n_estimators': 240, 'max_depth': 7, 'learning_rate': 0.07}


In [31]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.3475173210552907 0.3957173824310303


In [32]:
# Defining the hyperparameter space - attempt 6

# np.arange with a float step carries floating-point noise; np.round cleans
# the values without changing how many candidates there are.
# NOTE(review): with a float step, whether the stop value itself ends up in
# the range depends on rounding error (relevant for the very narrow
# subsample range 0.6-0.65) - confirm the intended candidates.
param_dist = {
    'n_estimators': np.arange(200, 440, 20),
    'learning_rate': np.round(np.arange(0.03, 0.11, 0.02), 2),
    'max_depth': np.arange(6, 8, 1),
    'subsample': np.round(np.arange(0.6, 0.65, 0.05), 2),
    'reg_lambda': np.round(np.arange(0.3, 2.1, 0.3), 1),
}

In [33]:
# Setting up the randomised search for the March model - attempt 6
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=80,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [34]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6, 'reg_lambda': 1.2, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.07}


In [35]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.330370119419308 0.4001312255859375


In [36]:
# Defining the hyperparameter space - attempt 7

# np.arange with a float step carries floating-point noise; np.round cleans
# the values without changing how many candidates there are.
param_dist = {
    'n_estimators': np.arange(180, 330, 30),
    'learning_rate': np.round(np.arange(0.07, 0.21, 0.03), 2),
    'max_depth': np.arange(3, 7, 2),
    'subsample': [0.4, 0.6, 0.8, 1.0],
    'reg_lambda': np.round(np.arange(0.3, 2.1, 0.3), 1),
}

In [37]:
# Setting up the randomised search for the March model - attempt 7
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=60,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [38]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.8, 'reg_lambda': 0.3, 'n_estimators': 180, 'max_depth': 5, 'learning_rate': 0.1}


In [39]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.472174235047276 0.36362898349761963


In [40]:
# Defining the hyperparameter space - attempt 8

# np.arange with a float step carries floating-point noise; np.round cleans
# the values without changing how many candidates there are.
param_dist = {
    'n_estimators': np.arange(150, 390, 30),
    'learning_rate': np.round(np.arange(0.04, 0.12, 0.02), 2),
    'max_depth': np.arange(4, 8, 2),
    'subsample': [0.6, 0.7, 0.8],
    'reg_lambda': np.round(np.arange(0.2, 2.0, 0.3), 1),
}

In [41]:
# Setting up the randomised search for the March model - attempt 8
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=90,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [42]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.7, 'reg_lambda': 0.5, 'n_estimators': 360, 'max_depth': 6, 'learning_rate': 0.04}


In [43]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.4335572953411404 0.3735694885253906


n_estimators is high. It's possible that the value is too high and is leading to over-fitting.

In [44]:
# Defining the hyperparameter space - attempt 9

# np.arange with a float step carries floating-point noise (e.g. a learning
# rate of 0.07999999999999999); np.round cleans the values without changing
# how many candidates there are.
param_dist = {
    'n_estimators': np.arange(160, 260, 20),
    'learning_rate': np.round(np.arange(0.04, 0.14, 0.02), 2),
    'max_depth': np.arange(4, 8, 2),
    'subsample': [0.6, 0.7, 0.8],
    'reg_lambda': np.round(np.arange(0.5, 1.5, 0.2), 1),
}

In [45]:
# Setting up the randomised search for the March model - attempt 9
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=90,                         # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [46]:
# Run the search on the March training data (the search fits clones, so the
# initial `model_mar` is not re-trained) and keep the winning combination.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6, 'reg_lambda': 0.7, 'n_estimators': 220, 'max_depth': 6, 'learning_rate': 0.07999999999999999}


In [47]:
# Predict the March validation set with the best estimator of this search
# (despite its name, `best_rf` is simply whatever estimator the search tuned).
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
print(
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)

2.4163018043936253 0.3780112862586975


In [52]:
# Defining the hyperparameter space - attempt 10

# np.arange with a float step carries floating-point noise; np.round cleans
# the values without changing how many candidates there are.
param_dist = {
    'n_estimators': np.arange(240, 300, 10),
    'learning_rate': np.round(np.arange(0.05, 0.08, 0.01), 2),
    'max_depth': [6, 7],
    'subsample': [0.5, 0.6],
    'reg_lambda': np.round(np.arange(0.8, 1.6, 0.2), 1),
}

In [54]:
# Setting up the randomised search for the March model - attempt 10
from sklearn.model_selection import RandomizedSearchCV

# The search clones `model_mar`, so only its configuration is reused.
# NOTE(review): cv=None does NOT disable cross-validation; scikit-learn falls
# back to its default 5-fold split, so candidates are still ranked by K-fold
# scores despite the temporal dependencies - to be confirmed.
random_search = RandomizedSearchCV(
    model_mar,
    param_dist,
    n_iter=120,                        # number of sampled combinations
    scoring='neg_mean_squared_error',
    n_jobs=-1,                         # use all CPU cores
    cv=None,
    random_state=42,
)

In [55]:
# Run the randomised search on the March training data. fit() hands back the
# fitted search object, so the winning combination is read directly from it.
# The search works on clones, so the initial model_mar is not re-trained.
best_params = random_search.fit(X_train_mar, y_train_mar).best_params_

# Report the winning hyperparameter combination
print("Best hyperparameters found:", best_params)

Best hyperparameters found: {'subsample': 0.6, 'reg_lambda': 1.4, 'n_estimators': 270, 'max_depth': 7, 'learning_rate': 0.07}


In [56]:
# Score the best configuration from the search on the March validation set
best_rf = random_search.best_estimator_
y_val_pred_mar = best_rf.predict(X_val_mar)

# Validation MSE followed by R^2
val_scores_mar = (
    mean_squared_error(y_val_mar, y_val_pred_mar),
    r2_score(y_val_mar, y_val_pred_mar),
)
print(*val_scores_mar)

2.3475485825199507 0.3957092761993408


## Tuned XGBoost - Performance on Test Set
Four configurations yield similar performance on the test set.

In [57]:
# Hyperparameters shared by the March and April models (configuration 1)
xgb_params = dict(
    objective='reg:squarederror',
    subsample=0.65,
    reg_lambda=1.2,
    n_estimators=300,
    max_depth=7,
    learning_rate=0.05,
    random_state=2,
)

# March: fit on the training set, then predict the test set
model_mar = xgb.XGBRegressor(**xgb_params)
model_mar.fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: fit on the training set, then predict the test set
model_apr = xgb.XGBRegressor(**xgb_params)
model_apr.fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test-set MSE and R^2 - March on the first line, April on the second
print(mean_squared_error(y_test_mar, y_test_pred_mar), r2_score(y_test_mar, y_test_pred_mar))
print(mean_squared_error(y_test_apr, y_test_pred_apr), r2_score(y_test_apr, y_test_pred_apr))

3.0285539964353303 0.552540123462677
4.797032504639687 0.5393749475479126


Avg MSE = 3.91

In [58]:
# Hyperparameters shared by the March and April models (configuration 2)
xgb_params = dict(
    objective='reg:squarederror',
    subsample=0.6,
    reg_lambda=1.4,
    n_estimators=240,
    max_depth=7,
    learning_rate=0.07,
    random_state=2,
)

# March: fit on the training set, then predict the test set
model_mar = xgb.XGBRegressor(**xgb_params)
model_mar.fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: fit on the training set, then predict the test set
model_apr = xgb.XGBRegressor(**xgb_params)
model_apr.fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test-set MSE and R^2 - March on the first line, April on the second
print(mean_squared_error(y_test_mar, y_test_pred_mar), r2_score(y_test_mar, y_test_pred_mar))
print(mean_squared_error(y_test_apr, y_test_pred_apr), r2_score(y_test_apr, y_test_pred_apr))

2.9533956630586378 0.563644528388977
4.760836494385807 0.5428506135940552


Avg MSE = 3.86

In [59]:
# Hyperparameters shared by the March and April models (configuration 3)
xgb_params = dict(
    objective='reg:squarederror',
    subsample=0.6,
    reg_lambda=1.2,
    n_estimators=300,
    max_depth=7,
    learning_rate=0.07,
    random_state=2,
)

# March: fit on the training set, then predict the test set
model_mar = xgb.XGBRegressor(**xgb_params)
model_mar.fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: fit on the training set, then predict the test set
model_apr = xgb.XGBRegressor(**xgb_params)
model_apr.fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test-set MSE and R^2 - March on the first line, April on the second
print(mean_squared_error(y_test_mar, y_test_pred_mar), r2_score(y_test_mar, y_test_pred_mar))
print(mean_squared_error(y_test_apr, y_test_pred_apr), r2_score(y_test_apr, y_test_pred_apr))

2.9755611360812035 0.5603696703910828
4.625242844973411 0.5558706521987915


Average MSE = 3.80 (Best of all 4 configurations)

In [60]:
# Hyperparameters shared by the March and April models (configuration 4)
xgb_params = dict(
    objective='reg:squarederror',
    subsample=0.6,
    reg_lambda=1.4,
    n_estimators=270,
    max_depth=7,
    learning_rate=0.07,
    random_state=2,
)

# March: fit on the training set, then predict the test set
model_mar = xgb.XGBRegressor(**xgb_params)
model_mar.fit(X_train_mar, y_train_mar)
y_test_pred_mar = model_mar.predict(X_test_mar)

# April: fit on the training set, then predict the test set
model_apr = xgb.XGBRegressor(**xgb_params)
model_apr.fit(X_train_apr, y_train_apr)
y_test_pred_apr = model_apr.predict(X_test_apr)

# Test-set MSE and R^2 - March on the first line, April on the second
print(mean_squared_error(y_test_mar, y_test_pred_mar), r2_score(y_test_mar, y_test_pred_mar))
print(mean_squared_error(y_test_apr, y_test_pred_apr), r2_score(y_test_apr, y_test_pred_apr))

2.9412073314296645 0.5654453635215759
4.755867745726371 0.5433276891708374


Avg MSE = 3.85