# Data Segmentation
Training, test and validation sets must be created keeping in mind the temporal nature of the dataset. Since we don't want temporal dependencies to be lost, <u>random sampling and standard k-fold cross validation</u> is not the best choice!

In [2]:
# imporing the dataset

import pandas as pd
selected_rentals = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_working_with_subway_stations.csv")
selected_rentals.head()

Unnamed: 0,#_rentals,datetime,year,month,day,hour,ID,coordinates,#_rentals_lag_1,name_of_day,...,w_avg_roll_avg,w_avg_lag_2,temp,rhum,prcp,wspd,rush_hr,MapID,coco,sb_st_800
0,0,2024-01-01 08:00:00,2024,1,1,8,0,POINT (-73.9383 40.7923272),,Monday,...,0.0,0.0,5.0,62.0,0.0,6.0,1,309.0,3.0,0.0
1,0,2024-01-01 10:00:00,2024,1,1,10,0,POINT (-73.9383 40.7923272),0.0,Monday,...,0.0,0.0,5.0,65.0,0.0,7.0,0,309.0,3.0,0.0
2,0,2024-01-01 12:00:00,2024,1,1,12,0,POINT (-73.9383 40.7923272),0.0,Monday,...,0.0,0.378298,4.0,82.0,0.0,6.0,0,309.0,3.0,0.0
3,0,2024-01-01 14:00:00,2024,1,1,14,0,POINT (-73.9383 40.7923272),0.0,Monday,...,0.0,0.655962,5.0,73.0,0.0,6.0,1,309.0,3.0,0.0
4,0,2024-01-01 16:00:00,2024,1,1,16,0,POINT (-73.9383 40.7923272),0.0,Monday,...,0.0,0.661161,7.0,60.0,0.0,6.0,1,309.0,3.0,0.0


## Swtiching the test set to one day
Instead of predciting over a week or several days, predicting for all seven time periods for a single day can lead to better prediction performance. Here, a rolling window approach could be followed where the model (trained on previous month data) is repeatedly trained for the subsequent test days. 

In [11]:
# What if forecasting is done only for the next coming day?? -- Testing for March

# creating dummies for ID, coco and name_of_day
selected_rentals_dum = pd.get_dummies(selected_rentals[['#_rentals', 'month', 'day', 'hour', 'ID',
       '#_rentals_lag_1', 'name_of_day', 'weekend',
       '#_rentals_lag_2', 'prev_day', 'prev_week', 'roll_avg',
       'w_avg_lag_1', 'w_avg_prev_day', 'w_avg_roll_avg', 'w_avg_lag_2',
       'temp', 'rhum', 'prcp', 'wspd', 'rush_hr', 'MapID', 'coco',
       'sb_st_800']], columns = ["ID", "coco", "name_of_day"], drop_first=False)

# Developing a training/test data for Mar

# training data
X_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=30) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=30) & (selected_rentals_dum["month"] == 3), "#_rentals"]

X_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >30) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >30) & (selected_rentals_dum["month"] == 3), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.8435239999999995 0.656937468574945


In [12]:
# What if forecasting is done only for the next coming day?? -- Testing for February

# creating dummies for ID, coco and name_of_day
selected_rentals_dum = pd.get_dummies(selected_rentals[['#_rentals', 'month', 'day', 'hour', 'ID',
       '#_rentals_lag_1', 'name_of_day', 'weekend',
       '#_rentals_lag_2', 'prev_day', 'prev_week', 'roll_avg',
       'w_avg_lag_1', 'w_avg_prev_day', 'w_avg_roll_avg', 'w_avg_lag_2',
       'temp', 'rhum', 'prcp', 'wspd', 'rush_hr', 'MapID', 'coco',
       'sb_st_800']], columns = ["ID", "coco", "name_of_day"], drop_first=False)

# Developing a training/test data for Feb

# training data
X_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=28) & (selected_rentals_dum["month"] == 2),list(selected_rentals_dum.columns)[1:]]
y_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=28) & (selected_rentals_dum["month"] == 2), "#_rentals"]

X_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >28) & (selected_rentals_dum["month"] == 2),list(selected_rentals_dum.columns)[1:]]
y_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >28) & (selected_rentals_dum["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

1.589596214285714 0.5483577960106691


In [14]:
# What if forecasting is done only for the next coming day?? -- Testing for January

# creating dummies for ID, coco and name_of_day
selected_rentals_dum = pd.get_dummies(selected_rentals[['#_rentals', 'month', 'day', 'hour', 'ID',
       '#_rentals_lag_1', 'name_of_day', 'weekend',
       '#_rentals_lag_2', 'prev_day', 'prev_week', 'roll_avg',
       'w_avg_lag_1', 'w_avg_prev_day', 'w_avg_roll_avg', 'w_avg_lag_2',
       'temp', 'rhum', 'prcp', 'wspd', 'rush_hr', 'MapID', 'coco',
       'sb_st_800']], columns = ["ID", "coco", "name_of_day"], drop_first=False)

# Developing a training/test data for Jan

# training data
X_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=30) & (selected_rentals_dum["month"] == 1),list(selected_rentals_dum.columns)[1:]]
y_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=30) & (selected_rentals_dum["month"] == 1), "#_rentals"]

X_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >30) & (selected_rentals_dum["month"] == 1),list(selected_rentals_dum.columns)[1:]]
y_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] >30) & (selected_rentals_dum["month"] == 1), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.0073266428571426 0.5705585316935134


Predicting only for the **next day** improved performance significantly!! <br>
Hopefully, average performance can be increased with model selection and tuning.

## Random Search
### Creating Discrete Training & Validation Sets
First, performing for March. <br>
The performance for the default parameters has been 0.656 (r2 score). <br>
This is the score to beat.

In [15]:
# creating dummies for ID, coco and name_of_day
selected_rentals_dum = pd.get_dummies(selected_rentals[['#_rentals', 'month', 'day', 'hour', 'ID',
       '#_rentals_lag_1', 'name_of_day', 'weekend',
       '#_rentals_lag_2', 'prev_day', 'prev_week', 'roll_avg',
       'w_avg_lag_1', 'w_avg_prev_day', 'w_avg_roll_avg', 'w_avg_lag_2',
       'temp', 'rhum', 'prcp', 'wspd', 'rush_hr', 'MapID', 'coco',
       'sb_st_800']], columns = ["ID", "coco", "name_of_day"], drop_first=False)

# training
X_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=20) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=20) & (selected_rentals_dum["month"] == 3), "#_rentals"]

# validation
X_val = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=27) & (selected_rentals_dum["day"] >20) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_val = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=27) & (selected_rentals_dum["day"] >20) & (selected_rentals_dum["month"] == 3), "#_rentals"]

In [22]:
# performance on the training set -- default hyperparameters

default_forest = RandomForestRegressor(random_state=2, bootstrap=False)    # bootstrapping is avoided due to time-series nature  
default_forest.fit(X_train, y_train)
y_train_pred = default_forest.predict(X_train)
print(mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred))

# This high level of performance means that it is overfitting heavily!

0.00047146071428571417 0.9999242337827164


In [77]:
import numpy as np

# Defining the hyperparameter space
param_dist = {
    'n_estimators': np.arange(25,35,2),     # Number of trees
    'max_depth': [8, 9],                 # Maximum depth of each tree
    'min_samples_split': [16], # Minimum samples required to split a node
    'min_samples_leaf': [2]     # Minimum samples at a leaf node
}

For **max_features**, since categorical feature variables are used as dummies we can't really use this hyperparameter. This is because it'll lead to tree generation from incomplete or missing data.

In [78]:
# Settting up the randomised search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Initialize RandomizedSearchCV with the already trained model
random_search = RandomizedSearchCV(
    estimator= default_forest,  # this model was already trained before
    param_distributions= param_dist,
    n_iter=20,  # Number of random combinations to try
    cv=None,  # Can't be used since temporal dependencies are present
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all CPU cores
    random_state=42
)

In [79]:
# Fit RandomizedSearchCV (this does tuning but does not re-train your initial model)
random_search.fit(X_train, y_train)

# Getting the best hyperparameters
best_params = random_search.best_params_
print("Best hyperparameters found:", best_params)

# Get the best model and evaluate on the validation set
best_rf = random_search.best_estimator_
y_val_pred = best_rf.predict(X_val)

# Calculate score on validation set
print(mean_squared_error(y_val, y_val_pred), r2_score(y_val, y_val_pred))



Best hyperparameters found: {'n_estimators': 27, 'min_samples_split': 16, 'min_samples_leaf': 2, 'max_depth': 8}
2.5633354438964155 0.3526797188563662


Areas in which certain hyperparameters are ranging:
- n_estimators: 26 or 90 (less confidence on this!)
- min_samples_split: 16 or 17
- min_samples_leaf: 1 or 2
- max_depth: in the range 8 to 10

# !Attention!
You might need to experiment with *max_features* hyperparameter for greater tree diversity!

## Re-Training the Model
After finding the optimal hyperparameter values, model is retrained by combining the training and validation sets to get better generalisation. Previously created dataframe with dummy variables is used.

In [80]:
# re-defining the training set
X_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=27) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_train = selected_rentals_dum.loc[(selected_rentals_dum["day"] <=27) & (selected_rentals_dum["month"] == 3), "#_rentals"]

# defining test set: March 28th 2024
# validation
X_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] ==28) & (selected_rentals_dum["month"] == 3),list(selected_rentals_dum.columns)[1:]]
y_test = selected_rentals_dum.loc[(selected_rentals_dum["day"] ==28) & (selected_rentals_dum["month"] == 3), "#_rentals"]

In [81]:
# Best parameters found from tuning
best_rf

In [82]:
# training the model with optimal hyperparameters: should update model parameters
best_rf.fit(X_train, y_train)

# Evaluating performance on the test set
y_test_pred = best_rf.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

1.376229672645924 0.03398268152203876


In [83]:
# What about rmse?
from sklearn.metrics import root_mean_squared_error
print(root_mean_squared_error(y_test, y_test_pred))

1.1731281569572543


- *The r2 score appears low, but the mse score is very very good.* This suggests that there might be an issue about using the right error metric! (MSE should be preferred).
- There are actually a couple of reasons for this, so you might need to study deeper.