# In [None]:
# Cell 1: Imports
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
# Notebook-wide setup: suppress every warning and apply the shared plot look.
# NOTE(review): the blanket 'ignore' filter hides all warnings for the rest of
# the session, including any raised by the estimators trained below.
warnings.filterwarnings(action='ignore')

# Seaborn grid style plus a wide default canvas for figures that do not pass
# their own figsize.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

print("✅ All libraries imported successfully")

# Cell 2: Load Data and Base Models
# Read the four pre-split arrays (features / targets for train and test) from
# ../processed_data, in the same order as the names they are bound to.
X_train, X_test, y_train, y_test = (
    np.load(f'../processed_data/{split}.npy')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Load base models
# NOTE(review): joblib.load unpickles arbitrary objects; only point this at
# model files produced by this project.
base_models = {
    model_name: joblib.load(f'../saved_base_models/{model_name}_model.pkl')
    for model_name in ('knn', 'decision_tree', 'random_forest')
}

print("📊 Data Loaded:")
print(f"   Training: {X_train.shape}, Test: {X_test.shape}")
print(f"✅ Base models loaded: {list(base_models)}")

# Cell 3: Create Ensemble Models
# One estimator per ensemble strategy, keyed by the name later used for
# reporting and for the saved model filename:
#   * bagging  -> RandomForestRegressor (bootstrap-sampled trees)
#   * boosting -> GradientBoostingRegressor
#   * stacking -> StackingRegressor combining the base models loaded in Cell 2
#                 through a Ridge meta-learner
ensemble_models = {
    'bagging_random_forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        bootstrap=True
    ),
    'boosting_gradient_boost': GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    ),
    # Previously this entry was a bare Ridge fitted on the raw features, which
    # is not stacking and left `base_models` unused. StackingRegressor clones
    # and refits each base estimator, then trains the Ridge meta-learner on
    # their out-of-fold predictions (cv=5).
    # NOTE(review): assumes every object in `base_models` is a scikit-learn
    # compatible regressor (clonable, with fit/predict) -- confirm.
    'stacking_ridge': StackingRegressor(
        estimators=list(base_models.items()),
        final_estimator=Ridge(alpha=1.0, random_state=42),
        cv=5
    )
}

print("🔧 Ensemble Models Created:")
for name in ensemble_models:
    print(f"   ✓ {name.replace('_', ' ').upper()}")

# Cell 4: Train Ensemble Models with 10-Fold CV
# For every ensemble: estimate generalisation error with 10-fold CV, refit on
# the whole training split, then score once on the held-out test split.
#
# Cross-validation uses the TRAINING split only. The earlier version stacked
# train and test together for CV, so test rows influenced the CV estimate and
# the "CV RMSE vs Test RMSE" comparison was no longer between independent data.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# name -> fitted model, CV statistics, test metrics and test predictions
ensemble_results = {}

print("=" * 70)
print("TRAINING ENSEMBLE MODELS WITH 10-FOLD CROSS-VALIDATION")
print("=" * 70)

for name, model in ensemble_models.items():
    display_name = name.replace('_', ' ').upper()
    print(f"\n🔄 Training {display_name}...")

    # 10-fold cross-validation on the training data.
    # The scorer returns negated MSE, so flip the sign to get per-fold MSE.
    fold_scores = -cross_val_score(model, X_train, y_train, cv=cv,
                                   scoring='neg_mean_squared_error')

    cv_mean = np.mean(fold_scores)
    cv_std = np.std(fold_scores)
    # RMSE derived from the mean fold MSE (not the mean of per-fold RMSEs).
    cv_rmse = np.sqrt(cv_mean)

    # Train on full training set
    model.fit(X_train, y_train)

    # Test predictions
    y_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)

    ensemble_results[name] = {
        'model': model,
        'cv_mse_mean': cv_mean,
        'cv_mse_std': cv_std,
        'cv_rmse': cv_rmse,
        'cv_10fold_scores': fold_scores.tolist(),
        'test_mse': test_mse,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_r2': test_r2,
        'y_pred': y_pred
    }

    print(f"   ✅ {display_name} Trained")
    print(f"      10-Fold CV RMSE: {cv_rmse:.4f}")
    print(f"      Test RMSE: {test_rmse:.4f}")
    print(f"      Test R²: {test_r2:.4f}")

# Cell 5: Compare Ensemble Models
# One row per ensemble, with each metric pre-formatted to four decimals
# (so the table columns hold strings, not floats).
summary_data = [
    {
        'Model': model_key.replace('_', ' ').upper(),
        'CV RMSE': f"{metrics['cv_rmse']:.4f}",
        'Test RMSE': f"{metrics['test_rmse']:.4f}",
        'Test MAE': f"{metrics['test_mae']:.4f}",
        'Test R²': f"{metrics['test_r2']:.4f}",
    }
    for model_key, metrics in ensemble_results.items()
]
summary_df = pd.DataFrame(summary_data)

rule = "=" * 70
print(f"\n{rule}")
print("ENSEMBLE MODEL PERFORMANCE SUMMARY")
print(rule)
# `display` is the rich-output helper available inside the notebook session.
display(summary_df)

# Cell 6: Visualize Ensemble Performance
# 2x2 figure. Top row compares every model (CV vs test RMSE, test R²).
# Bottom row shows predicted-vs-actual scatter plots for the two models with
# the highest test R².
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

model_names = list(ensemble_results.keys())
display_names = [n.replace('_', ' ').upper() for n in model_names]
colors = ['#f59e0b', '#10b981', '#8b5cf6']
# Pin one colour per model (cycling if there are more models than colours) so
# a model keeps its bar colour in the scatter panels whatever its rank.
model_colors = {n: colors[i % len(colors)] for i, n in enumerate(model_names)}

# Plot 1: RMSE Comparison (paired bars: CV on the left, test on the right)
x = np.arange(len(model_names))
width = 0.35

cv_rmse = [ensemble_results[name]['cv_rmse'] for name in model_names]
test_rmse = [ensemble_results[name]['test_rmse'] for name in model_names]

rmse_ax = axes[0, 0]
rmse_ax.bar(x - width/2, cv_rmse, width, label='CV RMSE', color=colors, alpha=0.8)
rmse_ax.bar(x + width/2, test_rmse, width, label='Test RMSE', color=colors, alpha=0.5)
rmse_ax.set_xlabel('Ensemble Models', fontweight='bold')
rmse_ax.set_ylabel('RMSE', fontweight='bold')
rmse_ax.set_title('RMSE Comparison', fontweight='bold')
rmse_ax.set_xticks(x)
rmse_ax.set_xticklabels(display_names, rotation=15, ha='right')
rmse_ax.legend()
rmse_ax.grid(alpha=0.3)

# Plot 2: R² Score Comparison
r2_scores = [ensemble_results[name]['test_r2'] for name in model_names]
r2_ax = axes[0, 1]
r2_ax.bar(display_names, r2_scores, color=colors, alpha=0.8)
r2_ax.set_ylabel('R² Score', fontweight='bold')
r2_ax.set_title('R² Score Comparison', fontweight='bold')
r2_ax.grid(alpha=0.3)

# Plot 3 & 4: Prediction vs Actual (show 2 best models)
# Rank by test R² so the panels really show the best performers; the earlier
# code took the first two dict entries regardless of their scores.
ranked_names = sorted(model_names,
                      key=lambda n: ensemble_results[n]['test_r2'],
                      reverse=True)
for idx, name in enumerate(ranked_names[:2]):
    ax = axes[1, idx]
    y_pred = ensemble_results[name]['y_pred']

    ax.scatter(y_test, y_pred, alpha=0.6, color=model_colors[name])

    # Diagonal y = x across the joint value range marks a perfect prediction.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    ax.plot([min_val, max_val], [min_val, max_val],
            'r--', linewidth=2, label='Perfect')

    ax.set_xlabel('Actual', fontweight='bold')
    ax.set_ylabel('Predicted', fontweight='bold')
    ax.set_title(f'{name.replace("_", " ").upper()}\nR²: {ensemble_results[name]["test_r2"]:.3f}',
                 fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Cell 7: Save Ensemble Models
# Persist each fitted estimator as <name>_ensemble.pkl, creating the target
# directory first if it does not exist yet.
output_dir = '../saved_ensemble_models'
os.makedirs(output_dir, exist_ok=True)

for model_key in ensemble_results:
    fitted_model = ensemble_results[model_key]['model']
    joblib.dump(fitted_model, f'{output_dir}/{model_key}_ensemble.pkl')

print("\n💾 Ensemble models saved to ../saved_ensemble_models/")
print("✅ ENSEMBLE TRAINING COMPLETE!")