In [24]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [25]:
print("Loading the preprocessed data..\n")

try:
    X_processed_data=np.load('../data/processed/X_processed.npz',allow_pickle=True)
    X_processed=X_processed_data['arr_0'].item()
    
    y=joblib.load('../data/processed/y.pkl')
    
    preprocessor=joblib.load('../outputs/models/preprocessor.pkl')
except FileNotFoundError as e:
    print(f'Error Loading files : {e}')

Loading the preprocessed data..



In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)


In [27]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.01),
    'Random Forest': RandomForestRegressor(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
    'SVR': SVR(kernel='rbf')
}


In [28]:
results = {}
cv_scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    
    results[name] = {
        'model': model,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2
    }
    cv_scores[name] = cv_score
    print(f"{name} - R2: {r2:.4f}, RMSE: {rmse:.4f}, CV R2: {cv_score:.4f}")


Linear Regression - R2: 0.9889, RMSE: 6.1586, CV R2: 0.9912
Ridge Regression - R2: 0.9913, RMSE: 5.4675, CV R2: 0.9928
Lasso Regression - R2: 0.9909, RMSE: 5.5720, CV R2: 0.9927
Random Forest - R2: 0.9965, RMSE: 3.4805, CV R2: 0.9975
Gradient Boosting - R2: 0.9955, RMSE: 3.9115, CV R2: 0.9964
XGBoost - R2: 0.9957, RMSE: 3.8258, CV R2: 0.9975
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 699
[LightGBM] [Info] Number of data points in the train set: 5912, number of used features: 87
[LightGBM] [Info] Start training from score 251.071888
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 675
[LightGBM] [Info] Number of data points in the train se

In [29]:
performance_df = pd.DataFrame(results).T[['R2', 'RMSE', 'MAE']]
print(performance_df.sort_values('R2', ascending=False))


                         R2      RMSE       MAE
Random Forest      0.996466  3.480488  1.825813
XGBoost             0.99573  3.825765  2.052024
LightGBM           0.995654  3.859718  2.223702
Gradient Boosting  0.995536  3.911544  2.576186
Ridge Regression   0.991278  5.467512  3.026363
Lasso Regression   0.990942  5.571954  3.054838
Linear Regression  0.988934  6.158588  3.357782
SVR                0.908768   17.6833  7.260604


In [30]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Step 1: Pick the best model based on R2
best_model_name = performance_df['R2'].idxmax()
best_model = models[best_model_name]

# Step 2: Predict with best model
y_pred_best = best_model.predict(X_test)

# Step 3: Evaluate performance
r2 = r2_score(y_test, y_pred_best)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_best)) 
mae = mean_absolute_error(y_test, y_pred_best)

# Step 4: Print results
print(f"Best Model: {best_model_name}")
print(f"R2: {r2:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")


Best Model: Random Forest
R2: 0.996466
RMSE: 3.480488
MAE: 1.825813


In [31]:
import joblib

# Save the best model (RandomForest in your case)
joblib.dump(best_model, "../outputs/models/best_model.pkl")

print(" Model saved as best_model.pkl")


 Model saved as best_model.pkl
