In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
import os

In [3]:
print("Loading the preprocessed data..\n")

# Load the feature matrix (X_processed), the target vector (y) and the fitted
# preprocessor. Three on-disk layouts are tried in order: separate .npy files,
# a combined .npz archive with 'X' / 'y' keys, and an X_processed.npz + y.pkl pair.
# Any failure is reported and re-raised: calling exit() in a notebook asks the
# kernel to shut down and hides the traceback needed to diagnose the problem.
try:
    # Method 1: Try loading from .npy files (recommended)
    if os.path.exists('../data/processed/X_processed.npy'):
        X_processed = np.load('../data/processed/X_processed.npy')
        y = np.load('../data/processed/y.npy')

    # Method 2: Try loading from compressed .npz file
    elif os.path.exists('../data/processed/processed_data.npz'):
        # The archive object holds an open file handle; the context manager
        # closes it as soon as both arrays have been read.
        with np.load('../data/processed/processed_data.npz') as data:
            X_processed = data['X']
            y = data['y']

    # Method 3: Fallback to the X_processed.npz + y.pkl pair
    else:
        # NOTE(review): allow_pickle=True and joblib.load can execute code
        # embedded in the file -- only use with files this project produced.
        with np.load('../data/processed/X_processed.npz', allow_pickle=True) as data:
            X_processed = data['arr_0']
        # A non-array object stored in an .npz (e.g. a sparse matrix) comes back
        # wrapped in a 0-d object array, which has no .toarray(); unwrap it
        # first so the sparse check below can actually trigger.
        if (
            isinstance(X_processed, np.ndarray)
            and X_processed.dtype == object
            and X_processed.ndim == 0
        ):
            X_processed = X_processed.item()
        if hasattr(X_processed, 'toarray'):  # If sparse matrix
            X_processed = X_processed.toarray()
        y = joblib.load('../data/processed/y.pkl')

    # Load preprocessor
    preprocessor = joblib.load('../outputs/models/preprocessor.pkl')

except FileNotFoundError as e:
    print(f' Error Loading files: {e}')
    raise
except Exception as e:
    print(f' Unexpected error: {e}')
    raise

# Convert to proper numpy arrays if needed
if not isinstance(X_processed, np.ndarray):
    X_processed = np.array(X_processed)
if not isinstance(y, np.ndarray):
    y = np.array(y)

# Fail early with a clear message instead of an obscure error at split/fit time.
if X_processed.ndim != 2 or X_processed.shape[:1] != y.shape[:1]:
    raise ValueError(
        f"Inconsistent data shapes: X_processed {X_processed.shape}, y {y.shape}"
    )

print(f"\nFinal data verification:")
print(f"X_processed: {X_processed.shape}, dtype: {X_processed.dtype}")
print(f"y: {y.shape}, dtype: {y.dtype}")

Loading the preprocessed data..


Final data verification:
X_processed: (7390, 2148), dtype: float64
y: (7390,), dtype: int64


In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42
    )
except Exception as e:
    # Print diagnostics about the input, then re-raise so the traceback is
    # preserved (exit() would ask the notebook kernel to shut down instead).
    print(f"❌ Error in train_test_split: {e}")
    print(f"X_processed type: {type(X_processed)}")
    print(f"X_processed shape: {getattr(X_processed, 'shape', 'No shape attribute')}")
    raise
else:
    print(f"\n✅ Data split successful:")
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"y_test: {y_test.shape}")


✅ Data split successful:
Training set: (5912, 2148)
Test set: (1478, 2148)
y_train: (5912,)
y_test: (1478,)


In [6]:
# Candidate regressors, keyed by the display name used in all later reports.
# Insertion order determines the order in which they are trained.
models = {}
models['Linear Regression'] = LinearRegression()
models['Ridge Regression'] = Ridge(alpha=1.0)
models['Lasso Regression'] = Lasso(alpha=0.01)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)

In [7]:
# results:   display name -> fitted estimator plus its test-set metrics
# cv_scores: display name -> mean R² over 5 cross-validation folds
results, cv_scores = {}, {}

for name in models:
    model = models[name]
    print(f"\nTraining {name}...")

    try:
        # Fit on the training split and score on the held-out test split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        # Cross-validation is given the training split only.
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
        cv_score = fold_scores.mean()
    except Exception as e:
        # Best effort: report the failure and move on to the next candidate.
        print(f" Error training {name}: {e}")
    else:
        results[name] = dict(model=model, MAE=mae, MSE=mse, RMSE=rmse, R2=r2)
        cv_scores[name] = cv_score
        print(f" {name}: R²={r2:.4f}, RMSE={rmse:.4f}, CV_R²={cv_score:.4f}")



Training Linear Regression...
 Linear Regression: R²=0.9889, RMSE=6.1597, CV_R²=0.9912

Training Ridge Regression...
 Ridge Regression: R²=0.9912, RMSE=5.4898, CV_R²=0.9928

Training Lasso Regression...
 Lasso Regression: R²=0.9909, RMSE=5.5719, CV_R²=0.9927

Training Random Forest...
 Random Forest: R²=0.9965, RMSE=3.4756, CV_R²=0.9975


In [8]:
# One row per model with its test-set metrics. `results` also stores the fitted
# estimator, so the transposed frame is object dtype; casting the selected
# metric columns to float makes sorting, idxmax and display behave numerically.
performance_df = pd.DataFrame(results).T[['R2', 'RMSE', 'MAE']].astype(float)
print("\n" + "="*60)
print("MODEL PERFORMANCE SUMMARY (Sorted by R² Score)")
print("="*60)
print(performance_df.sort_values('R2', ascending=False))



MODEL PERFORMANCE SUMMARY (Sorted by R² Score)
                         R2      RMSE       MAE
Random Forest      0.996476  3.475618  1.828286
Ridge Regression   0.991207  5.489794  3.028768
Lasso Regression   0.990942  5.571918  3.054642
Linear Regression   0.98893  6.159702   3.35824


In [10]:
# Pick the winner by cross-validated R², which was computed on the training
# split only. Choosing by test-set R² and then reporting that same test score
# would make the reported numbers optimistically biased.
valid_cv_scores = {
    model_name: score
    for model_name, score in cv_scores.items()
    if not np.isnan(score)
}
if not valid_cv_scores:
    raise RuntimeError("No model produced a valid cross-validation score; cannot select a best model.")
best_model_name = max(valid_cv_scores, key=valid_cv_scores.get)
best_model = results[best_model_name]['model']

# Held-out test metrics for the selected model.
y_pred_best = best_model.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)


In [12]:

# Report for the selected model: test-set metrics followed by its mean CV R².
banner = "=" * 50
print()
print(banner)
print("BEST MODEL RESULTS")
print(banner)
print(f" Best Model: {best_model_name}")
print(f" R² Score: {r2_best:.6f}")
print(f" RMSE: {rmse_best:.6f}")
print(f" MAE: {mae_best:.6f}")
print(f" Cross-Validation R²: {cv_scores[best_model_name]:.6f}")


BEST MODEL RESULTS
 Best Model: Random Forest
 R² Score: 0.996476
 RMSE: 3.475618
 MAE: 1.828286
 Cross-Validation R²: 0.997471


In [13]:
# Persist the selected estimator, creating the output directory if needed.
model_dir = '../outputs/models'
os.makedirs(model_dir, exist_ok=True)

model_path = "../outputs/models/best_model.pkl"
joblib.dump(best_model, model_path)
print(f"\n💾 Best model ({best_model_name}) saved as best_model.pkl")


💾 Best model (Random Forest) saved as best_model.pkl
