### Import Required Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
import joblib

# Show DataFrames untruncated (no limit on cell width, row count or column count).
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
pd.options.display.max_columns = None

from utils.data_prep import load_and_prepare_data
from utils.pipeline_create import create_pipeline

### Preparing and loading the data for modelling

In [2]:
data = load_and_prepare_data()

# Three splits come back from the loader: train, val1 and val2.
# val2 is bound to the *_test names and is what the evaluation cells score on.
X_train, X_val, X_test = (data[key] for key in ('X_train', 'X_val1', 'X_val2'))
y_train, y_val, y_test = (data[key] for key in ('y_train', 'y_val1', 'y_val2'))

# Feature names are grouped by model family ('linear' / 'other') and then by
# kind ('num' / 'cat'); build one combined list per family.
features = data['features']
linear_features, other_features = (
    features[family]['num'] + features[family]['cat']
    for family in ('linear', 'other')
)

In [3]:
# Naive baseline: predict the mean of the training target for every row of
# the hold-out split (y_test).
train_target_mean = y_train.mean()
baseline_pred = [train_target_mean] * len(y_test)

# RMSE of the constant prediction against the hold-out targets.
mse_baseline = mean_squared_error(y_test, baseline_pred)
rmse_baseline = np.sqrt(mse_baseline)

print(f"RMSE of Baseline Model: {rmse_baseline}")

RMSE of Baseline Model: 0.9585047480524229


## Model Building

In [4]:
# Best parameters found by Optuna, kept next to the estimator they belong to:
# one (name, estimator class, keyword arguments) triple per base model.
#
# NOTE(review): none of these dicts sets a seed. Unless create_pipeline injects
# one, the tree ensembles and the 'sag' Ridge solver may give slightly different
# results from run to run -- confirm.
# NOTE(review): LightGBM documents `subsample` as taking effect only when
# `subsample_freq` > 0; `subsample_freq` is not set here -- confirm whether the
# value below is actually applied.
model_specs = [
    ('ridge', Ridge, {
        'alpha': 0.0002762284437551006,
        'solver': 'sag',
        'fit_intercept': True,
        'tol': 0.00013031761354747614,
    }),
    ('extratrees', ExtraTreesRegressor, {
        'n_estimators': 326,
        'max_depth': 24,
        'min_samples_split': 10,
        'min_samples_leaf': 1,
        'max_features': 'log2',
        'bootstrap': False,
    }),
    ('xgboost', XGBRegressor, {
        'n_estimators': 683,
        'learning_rate': 0.05710400944032593,
        'max_depth': 7,
        'subsample': 0.8708501983892822,
        'colsample_bytree': 0.5990353703327878,
        'reg_alpha': 3.254066913534751,
        'reg_lambda': 0.01593621652838458,
        'min_child_weight': 2,
        'gamma': 0.0009536580000644827,
        'booster': 'gbtree',
    }),
    ('lightgbm', LGBMRegressor, {
        'max_depth': 12,
        'num_leaves': 99,
        'n_estimators': 491,
        'learning_rate': 0.05954360974397409,
        'min_child_samples': 21,
        'subsample': 0.5066852407414704,
        'colsample_bytree': 0.8619994805735846,
        'reg_alpha': 3.595735178870403,
        'reg_lambda': 1.24462626575653,
        'verbose': -1,
    }),
    ('catboost', CatBoostRegressor, {
        'bootstrap_type': 'Bernoulli',
        'iterations': 406,
        'learning_rate': 0.028763633853386924,
        'depth': 10,
        'l2_leaf_reg': 11.869905054427921,
        'border_count': 125,
        'random_strength': 7.610826130799793,
        'grow_policy': 'Depthwise',
        'verbose': False,
    }),
]

# Unzip into the three parallel lists consumed by the pipeline-building cell.
model_names, models, params = map(list, zip(*model_specs))

In [5]:
# Build one pipeline per base model, keyed by the model's short name.
pipeline_dict = {
    name: create_pipeline(name, estimator_cls, estimator_kwargs, features)
    for name, estimator_cls, estimator_kwargs in zip(model_names, models, params)
}

In [6]:
# Append the first validation split to the training split so the stacked
# model is fitted on both; the val2 split (X_test / y_test) is left out.
X, y = (
    pd.concat(parts, axis=0)
    for parts in ([X_train, X_val], [y_train, y_val])
)

In [7]:
# Stack the five base pipelines under an ElasticNet meta-learner and fit the
# whole ensemble on the combined train + val1 data (X, y).
#
# The meta-learner's hyperparameters live in `meta_params`, a name of their own.
# Binding them to `params` would overwrite the per-model parameter list that the
# pipeline-building cell zips over, so re-running that cell after this one would
# feed dictionary keys to create_pipeline instead of parameter dicts.
meta_params = {
    'alpha': 0.0007174822830545591,
    'l1_ratio': 0.658029960276858,
    'fit_intercept': True,
    'tol': 0.00010062095710653478,
    'max_iter': 8792,
    'selection': 'cyclic',
}

meta_model = ElasticNet(**meta_params)

# Base estimators in the order they are handed to the stacker.
stack_order = ['ridge', 'extratrees', 'xgboost', 'catboost', 'lightgbm']

final_stack = StackingRegressor(
    estimators=[(name, pipeline_dict[name]) for name in stack_order],
    final_estimator=meta_model,
    # The meta-learner sees only the base-model predictions, not the raw features.
    passthrough=False,
    # Seeded, shuffled 5-fold split used to produce the out-of-fold predictions
    # the meta-learner is trained on.
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=1
)
final_stack.fit(X, y)
print("fitted model")

fitted model


### Model Performance Evaluation

In [9]:
# Score the fitted stack on the hold-out split (X_test / y_test).
y_pred = final_stack.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation:")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")

Model Evaluation:
RMSE: 0.059799511117813633
MAE: 0.0338187873366706
R2: 0.9961076947608092
