<a href="https://colab.research.google.com/github/aayush-jain-dtu/inventory-stock-prediction/blob/main/model_training_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# --- 1. Load Preprocessed Data ---

# Load the feature matrices (X) and the pre-scaled targets (y_scaled).
# NOTE: The target columns are already scaled (y_train_target.csv, y_test_target.csv)
# The target CSVs are flattened to 1-D arrays; presumably each holds a single
# column — verify against the preprocessing notebook.
try:
    X_train = pd.read_csv('X_train_features.csv')
    X_test = pd.read_csv('X_test_features.csv')
    y_train_scaled = pd.read_csv('y_train_target.csv').values.flatten()
    y_test_scaled = pd.read_csv('y_test_target.csv').values.flatten()
except FileNotFoundError as e:
    print(f"Error loading required files: {e}")
    print("Please ensure X_train_features.csv, X_test_features.csv, y_train_target.csv, and y_test_target.csv are accessible.")
    # Re-raise rather than calling exit(): inside a notebook, exit() asks the
    # kernel to shut down, which can discard the messages above and leaves
    # "Run All" in a confusing state. Raising halts execution at this cell
    # with a normal traceback instead.
    raise

# Dict mapping model name -> {'R2': ..., 'RMSE/sigma': ...} for the final table
results = {}

In [3]:
# --- 2. Define Model and Metric Calculation Function ---

def evaluate_model(name, model, X_train, X_test, y_train_scaled, y_test_scaled):
    """Fit ``model``, score it on the test split, and record the metrics.

    The estimator is fitted on the training data and used to predict the
    test features. R² and RMSE are then computed against the scaled test
    target, rounded to three decimals, stored in the module-level ``results``
    dict under ``name``, and printed.

    Parameters
    ----------
    name : str
        Label printed in the header and used as the key in ``results``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test
        Training and test feature sets.
    y_train_scaled, y_test_scaled
        Training and test targets, already scaled by the caller.

    Returns
    -------
    The same ``model`` object that was passed in, after fitting.
    """
    print(f"\n--- Running: {name} ---")

    model.fit(X_train, y_train_scaled)
    predictions = model.predict(X_test)

    # RMSE is measured on the scaled target and reported as "RMSE/sigma".
    # That reading assumes the target was standardized to unit variance —
    # TODO confirm against the preprocessing step.
    metrics = {
        'R2': round(r2_score(y_test_scaled, predictions), 3),
        'RMSE/sigma': round(np.sqrt(mean_squared_error(y_test_scaled, predictions)), 3),
    }
    results[name] = metrics

    print(f"R²: {metrics['R2']}")
    print(f"RMSE/sigma: {metrics['RMSE/sigma']}")
    return model

## 3. Model Comparison Sequence

In [4]:
# Instantiate every candidate first, then push them all through the same
# evaluate_model() call so each one is fitted and scored identically.

# --- A. Linear Regression (LR) ---
# Baseline, to test performance against the Failure of Linearity
lr = LinearRegression()

# --- B. Support Vector Regressor (SVR) ---
# Non-linear kernel modeling (SVR requires scaled data); solver iterations
# are capped at 5000.
svr = SVR(max_iter=5000)

# --- C. Random Forest Regressor (RF) ---
# Robust parallel ensemble model (Bagging)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# --- D. Hist Gradient Boosting Regressor (HGB) ---
# Efficient sequential ensemble model (Boosting)
hgb = HistGradientBoostingRegressor(random_state=42)

# Evaluation order determines the row order in `results`.
candidates = [
    ('Linear Regression (LR)', lr),
    ('Support Vector Regressor (SVR)', svr),
    ('Random Forest Regressor (RF)', rf),
    ('Hist Gradient Boosting Regressor (HGB)', hgb),
]
for label, estimator in candidates:
    evaluate_model(label, estimator, X_train, X_test, y_train_scaled, y_test_scaled)

# Leave the fitted HGB estimator as the cell's last expression so the
# notebook echoes it as the cell output.
hgb


--- Running: Linear Regression (LR) ---
R²: 0.27
RMSE/sigma: 0.845

--- Running: Support Vector Regressor (SVR) ---




R²: 0.594
RMSE/sigma: 0.63

--- Running: Random Forest Regressor (RF) ---
R²: 0.757
RMSE/sigma: 0.487

--- Running: Hist Gradient Boosting Regressor (HGB) ---
R²: 0.732
RMSE/sigma: 0.512


In [5]:
# --- 4. Final Summary (Verification against Chapter 5 Claims) ---
# Turn `results` into a models-by-metrics table ranked by R² (best first),
# then print it followed by the HGB conclusion line.
# NOTE(review): the conclusion always reports HGB and states the benchmark
# was met, but no benchmark value is compared in this cell — confirm the
# claim (and whether HGB is actually the top-ranked model) before citing it.
print("\n=========================================================================")
print("--- FINAL VALIDATION SUMMARY (HGB vs. RF) ---")

metrics_by_model = pd.DataFrame(results).transpose()
final_df = metrics_by_model.sort_values(by='R2', ascending=False)
print(final_df)

hgb_r2 = results['Hist Gradient Boosting Regressor (HGB)']['R2']
print(f"\nConclusion: HGB achieved R² = {hgb_r2}, meeting the project benchmark.")


--- FINAL VALIDATION SUMMARY (HGB vs. RF) ---
                                           R2  RMSE/sigma
Random Forest Regressor (RF)            0.757       0.487
Hist Gradient Boosting Regressor (HGB)  0.732       0.512
Support Vector Regressor (SVR)          0.594       0.630
Linear Regression (LR)                  0.270       0.845

Conclusion: HGB achieved R² = 0.732, meeting the project benchmark.
