In [3]:
"""
AWS Spot Price Prediction Model
Version: 1.0.0
Date: November 2025

Production-ready walk-forward backtesting model for AWS Spot price prediction
with ultra-sensitive risk scoring.

Dependencies:
- pandas >= 1.3.0
- numpy >= 1.21.0
- scipy >= 1.7.0
- scikit-learn >= 1.0.0
- matplotlib >= 3.4.0
- seaborn >= 0.11.0
- tqdm >= 4.62.0

Usage:
    python spot_price_model.py

Outputs:
    - backtest_results.csv: Daily predictions with risk scores
    - complete_backtest.png: Comprehensive visualization
    - backtest_report.txt: Summary report
    
All outputs saved to './outputs/' directory
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import GradientBoostingRegressor, IsolationForest
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
import os
from datetime import datetime, timedelta
from tqdm import tqdm

sns.set_style("whitegrid")

TRAINING_DATA = '/Users/atharvapudale/Downloads/aws_2023_2024_complete_24months.csv'
TEST_Q1 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(1-2-3-25).csv'
TEST_Q2 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(4-5-6-25).csv'
TEST_Q3 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(7-8-9-25).csv'
EVENT_DATA = '/Users/atharvapudale/Downloads/aws_stress_events_2023_2025.csv'
OUTPUT_DIR = '/Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs'

os.makedirs(OUTPUT_DIR, exist_ok=True)


class CompleteBacktestModel:
    """
    Complete AWS Spot price prediction model with walk-forward backtesting
    and ultra-sensitive risk scoring.
    
    Attributes:
        region (str): AWS region (e.g., 'ap-south-1')
        pool_instance (str): Selected EC2 instance type
        pool_az (str): Selected availability zone
        baseline_stats (dict): Training data statistics (mean, std, median)
    """
    
    def __init__(self, region='ap-south-1'):
        self.region = region
        self.pool_instance = None
        self.pool_az = None
        
        self.price_model_gbm = None
        self.price_model_en = None
        self.price_scaler = StandardScaler()
        self.price_features = None
        
        self.baseline_stats = {}
        self.risk_scaler = RobustScaler()
        self.isolation_forest = None
        
    def load_data(self, train_path, test_paths, event_path):
        """
        Load training, test, and event data with validation.
        
        Args:
            train_path (str): Path to training data CSV (2023-2024)
            test_paths (list): List of paths to test data CSVs (2025)
            event_path (str): Path to event calendar CSV
            
        Returns:
            tuple: (train_df, test_df, event_df)
        """
        print("\n" + "="*80)
        print("LOADING DATA")
        print("="*80)
        
        train_df = pd.read_csv(train_path)
        train_df = self._standardize_columns(train_df)
        train_df = train_df[train_df['Region'] == self.region]
        
        pool_counts = train_df.groupby(['InstanceType', 'AZ']).size().sort_values(ascending=False)
        best_pool = pool_counts.idxmax()
        self.pool_instance = best_pool[0]
        self.pool_az = best_pool[1]
        
        print(f"Selected Pool: {self.pool_instance} @ {self.pool_az}")
        
        train_df = train_df[(train_df['InstanceType'] == self.pool_instance) & 
                            (train_df['AZ'] == self.pool_az)]
        
        test_dfs = []
        for path in test_paths:
            df = pd.read_csv(path)
            df = self._standardize_columns(df)
            df = df[df['Region'] == self.region]
            df = df[(df['InstanceType'] == self.pool_instance) & (df['AZ'] == self.pool_az)]
            test_dfs.append(df)
        test_df = pd.concat(test_dfs, ignore_index=True).sort_values('timestamp')
        
        event_df = pd.read_csv(event_path)
        event_df = self._standardize_event_columns(event_df)
        
        self.baseline_stats['mean'] = train_df['price_ratio'].mean()
        self.baseline_stats['std'] = train_df['price_ratio'].std()
        self.baseline_stats['median'] = train_df['price_ratio'].median()
        
        train_dates = train_df['timestamp'].dt.date
        test_dates = test_df['timestamp'].dt.date
        overlap = set(train_dates) & set(test_dates)
        
        print(f"Train: {len(train_df):,} records ({train_dates.min()} to {train_dates.max()})")
        print(f"Test: {len(test_df):,} records ({test_dates.min()} to {test_dates.max()})")
        print(f"Events: {len(event_df)}")
        print(f"Data leakage check: {len(overlap)} overlapping dates")
        print(f"Baseline: mean={self.baseline_stats['mean']:.4f}, std={self.baseline_stats['std']:.6f}")
        
        if len(overlap) > 0:
            print(f"WARNING: {len(overlap)} days overlap between train and test")
        
        return train_df, test_df, event_df
    
    def _standardize_columns(self, df):
        """Standardize column names and compute price ratio."""
        df.columns = df.columns.str.lower().str.strip()
        col_map = {}
        for col in df.columns:
            if 'time' in col or 'date' in col:
                col_map[col] = 'timestamp'
            elif 'spot' in col and 'price' in col:
                col_map[col] = 'SpotPrice'
            elif 'ondemand' in col or 'on_demand' in col or 'on-demand' in col:
                col_map[col] = 'OnDemandPrice'
            elif 'instance' in col and 'type' in col:
                col_map[col] = 'InstanceType'
            elif col in ['az', 'availability_zone']:
                col_map[col] = 'AZ'
            elif col in ['region']:
                col_map[col] = 'Region'
        
        df = df.rename(columns=col_map)
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        df['SpotPrice'] = pd.to_numeric(df['SpotPrice'], errors='coerce')
        df['OnDemandPrice'] = pd.to_numeric(df['OnDemandPrice'], errors='coerce')
        
        if 'Region' not in df.columns or df['Region'].isna().all():
            if 'AZ' in df.columns:
                df['Region'] = df['AZ'].str.extract(r'^([a-z]+-[a-z]+-\d+)')[0]
        
        df = df.dropna(subset=['SpotPrice', 'timestamp']).sort_values('timestamp')
        df['price_ratio'] = (df['SpotPrice'] / df['OnDemandPrice']).clip(0, 10)
        
        return df
    
    def _standardize_event_columns(self, df):
        """Standardize event calendar columns."""
        df.columns = df.columns.str.lower().str.strip()
        date_col = next((c for c in df.columns if 'date' in c), None)
        name_col = next((c for c in df.columns if 'event' in c or 'name' in c), None)
        rename_map = {}
        if date_col:
            rename_map[date_col] = 'event_date'
        if name_col:
            rename_map[name_col] = 'event_name'
        df = df.rename(columns=rename_map)
        df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
        return df.dropna(subset=['event_date'])
    
    def engineer_features(self, df):
        """
        Engineer features for price prediction.
        
        Features created:
        - Lag features (1h, 6h, 12h, 24h, 48h, 168h)
        - Rolling statistics (mean, std for various windows)
        - Rate of change (1h, 6h, 24h)
        - Temporal features (hour, day, month, weekend flag)
        
        Args:
            df (DataFrame): Input data with price_ratio
            
        Returns:
            tuple: (df with features, list of feature column names)
        """
        df = df.copy()
        
        for lag in [1, 6, 12, 24, 48, 168]:
            df[f'spot_lag_{lag}h'] = df['SpotPrice'].shift(lag)
            df[f'ratio_lag_{lag}h'] = df['price_ratio'].shift(lag)
        
        for window in [6, 12, 24, 168]:
            df[f'spot_mean_{window}h'] = df['SpotPrice'].rolling(window, min_periods=1).mean()
            df[f'spot_std_{window}h'] = df['SpotPrice'].rolling(window, min_periods=1).std()
        
        for period in [1, 6, 24]:
            df[f'price_change_{period}h'] = df['SpotPrice'].pct_change(period) * 100
        
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['month'] = df['timestamp'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['is_business_hours'] = ((df['hour'] >= 9) & (df['hour'] <= 17)).astype(int)
        
        feature_cols = ['price_ratio'] + [col for col in df.columns if 
                       ('lag_' in col or 'mean_' in col or 'std_' in col or 'change_' in col or
                        col in ['hour', 'day_of_week', 'month', 'is_weekend', 'is_business_hours'])]
        
        df[feature_cols] = df[feature_cols].fillna(method='bfill').fillna(0)
        
        return df, feature_cols
    
    def train_price_model(self, train_df, feature_cols):
        """
        Train ensemble price prediction model.
        
        Models:
        - Gradient Boosting Regressor (70% weight)
        - Elastic Net (30% weight)
        
        Args:
            train_df (DataFrame): Training data with features
            feature_cols (list): List of feature column names
        """
        print("\n" + "="*80)
        print("TRAINING PRICE PREDICTION MODEL")
        print("="*80)
        
        train_df = train_df.copy()
        train_df['target'] = train_df['price_ratio'].shift(-1)
        train_df = train_df.dropna(subset=['target'])
        
        X_train = train_df[feature_cols].values
        y_train = train_df['target'].values
        
        print(f"Training samples: {len(X_train):,}")
        print(f"Features: {len(feature_cols)}")
        print(f"Target: Next hour price_ratio")
        
        X_train_scaled = self.price_scaler.fit_transform(X_train)
        
        print("Training Gradient Boosting...")
        self.price_model_gbm = GradientBoostingRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=8,
            min_samples_split=10,
            subsample=0.8,
            random_state=42
        )
        self.price_model_gbm.fit(X_train_scaled, y_train)
        
        print("Training Elastic Net...")
        self.price_model_en = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42)
        self.price_model_en.fit(X_train_scaled, y_train)
        
        val_size = int(len(X_train_scaled) * 0.1)
        X_val = X_train_scaled[-val_size:]
        y_val = y_train[-val_size:]
        
        pred_gbm = self.price_model_gbm.predict(X_val)
        pred_en = self.price_model_en.predict(X_val)
        y_pred = pred_gbm * 0.7 + pred_en * 0.3
        
        mae = mean_absolute_error(y_val, y_pred)
        mape = np.mean(np.abs((y_val - y_pred) / y_val)) * 100
        
        print(f"Validation MAE: {mae:.6f}")
        print(f"Validation MAPE: {mape:.2f}%")
        
        self.price_features = feature_cols
        print("Training complete")
    
    def walk_forward_backtest(self, test_df):
        """
        Perform walk-forward backtesting: predict each day independently.
        
        Args:
            test_df (DataFrame): Test data with features
            
        Returns:
            DataFrame: Daily predictions with actual values
        """
        print("\n" + "="*80)
        print("WALK-FORWARD BACKTESTING")
        print("="*80)
        print("Predicting day-by-day without future knowledge")
        
        test_df = test_df.copy()
        daily_dates = test_df.groupby(test_df['timestamp'].dt.date).size().index
        predictions = []
        
        print(f"\nPredicting {len(daily_dates)} days...")
        for current_date in tqdm(daily_dates, desc="Backtesting"):
            available_data = test_df[test_df['timestamp'].dt.date <= current_date].copy()
            
            if len(available_data) < 168:
                continue
            
            X_current = available_data[self.price_features].tail(24).values
            X_current_scaled = self.price_scaler.transform(X_current)
            
            pred_gbm = self.price_model_gbm.predict(X_current_scaled)
            pred_en = self.price_model_en.predict(X_current_scaled)
            pred_ratio = (pred_gbm * 0.7 + pred_en * 0.3).mean()
            
            actual_day = test_df[test_df['timestamp'].dt.date == current_date]
            actual_ratio = actual_day['price_ratio'].mean()
            actual_spot = actual_day['SpotPrice'].mean()
            actual_od = actual_day['OnDemandPrice'].mean()
            
            predictions.append({
                'date': current_date,
                'predicted_ratio': pred_ratio,
                'actual_ratio': actual_ratio,
                'predicted_spot': pred_ratio * actual_od,
                'actual_spot': actual_spot,
                'on_demand': actual_od
            })
        
        backtest_df = pd.DataFrame(predictions)
        
        mae = mean_absolute_error(backtest_df['actual_spot'], backtest_df['predicted_spot'])
        rmse = np.sqrt(mean_squared_error(backtest_df['actual_spot'], backtest_df['predicted_spot']))
        mape = np.mean(np.abs((backtest_df['actual_spot'] - backtest_df['predicted_spot']) / 
                              backtest_df['actual_spot'])) * 100
        
        print(f"\nDays predicted: {len(backtest_df)}")
        print(f"MAE: ${mae:.6f}")
        print(f"RMSE: ${rmse:.6f}")
        print(f"MAPE: {mape:.2f}%")
        print(f"Avg predicted ratio: {backtest_df['predicted_ratio'].mean():.4f}")
        print(f"Avg actual ratio: {backtest_df['actual_ratio'].mean():.4f}")
        
        return backtest_df
    
    def calculate_ultra_sensitive_risk(self, test_df, backtest_df):
        """
        Calculate ultra-sensitive risk scores using statistical methods.
        
        Risk components:
        - Statistical anomaly detection (50%): Control charts, Z-scores
        - ML anomaly detection (30%): Isolation Forest
        - Z-score intensity (20%): Deviation magnitude
        
        Args:
            test_df (DataFrame): Hourly test data
            backtest_df (DataFrame): Daily predictions
            
        Returns:
            tuple: (backtest_df with risk scores, test_df with features)
        """
        print("\n" + "="*80)
        print("CALCULATING ULTRA-SENSITIVE RISK SCORES")
        print("="*80)
        
        test_df = test_df.copy()
        
        test_df['z_score'] = (test_df['price_ratio'] - self.baseline_stats['mean']) / self.baseline_stats['std']
        
        test_df['ucl'] = self.baseline_stats['mean'] + 3 * self.baseline_stats['std']
        test_df['lcl'] = self.baseline_stats['mean'] - 3 * self.baseline_stats['std']
        test_df['beyond_limits'] = ((test_df['price_ratio'] > test_df['ucl']) | 
                                     (test_df['price_ratio'] < test_df['lcl'])).astype(int)
        
        test_df['stat_anomaly_score'] = 0.0
        test_df.loc[test_df['beyond_limits'] == 1, 'stat_anomaly_score'] += 50
        test_df.loc[test_df['z_score'].abs() >= 2.0, 'stat_anomaly_score'] += 25
        
        ml_features = ['price_ratio', 'z_score']
        for lag in [1, 6, 24]:
            ml_features.append(f'ratio_lag_{lag}h')
        
        X_ml = test_df[ml_features].fillna(0).values
        X_ml_scaled = self.risk_scaler.fit_transform(X_ml)
        
        self.isolation_forest = IsolationForest(contamination=0.10, random_state=42, n_estimators=100)
        ml_anomaly = self.isolation_forest.fit_predict(X_ml_scaled)
        ml_score = self.isolation_forest.score_samples(X_ml_scaled)
        
        test_df['ml_anomaly'] = (ml_anomaly == -1).astype(int)
        test_df['ml_anomaly_score'] = (1 - (ml_score - ml_score.min()) / 
                                       (ml_score.max() - ml_score.min() + 1e-6)) * 100
        
        test_df['sensitive_risk_score'] = (
            test_df['stat_anomaly_score'] * 0.50 +
            test_df['ml_anomaly_score'] * 0.30 +
            (test_df['z_score'].abs() / 3.0).clip(0, 1) * 100 * 0.20
        ).clip(0, 100)
        
        daily_risk = test_df.groupby(test_df['timestamp'].dt.date).agg({
            'sensitive_risk_score': 'mean',
            'z_score': lambda x: x.abs().max(),
            'stat_anomaly_score': lambda x: (x > 0).sum(),
            'ml_anomaly': 'sum'
        }).reset_index()
        daily_risk.columns = ['date', 'avg_risk', 'max_z_score', 'anomaly_hours', 'ml_anomaly_hours']
        
        backtest_df = backtest_df.merge(daily_risk, on='date', how='left')
        
        print(f"Avg risk: {backtest_df['avg_risk'].mean():.1f}/100")
        print(f"Max risk: {backtest_df['avg_risk'].max():.1f}/100")
        print(f"High risk days (>70): {(backtest_df['avg_risk']>70).sum()}")
        print(f"Max Z-score: {backtest_df['max_z_score'].max():.1f} sigma")
        
        return backtest_df, test_df
    
    def create_comprehensive_visualizations(self, backtest_df, test_df):
        """Create comprehensive backtest visualization with 8 subplots."""
        print("\n" + "="*80)
        print("CREATING VISUALIZATION")
        print("="*80)
        
        fig = plt.figure(figsize=(24, 20))
        gs = GridSpec(6, 3, figure=fig, hspace=0.4, wspace=0.3)
        
        ax1 = fig.add_subplot(gs[0, :])
        ax1.plot(backtest_df['date'], backtest_df['actual_spot'], 
                label='Actual Spot Price', linewidth=2, color='steelblue', marker='o', markersize=3)
        ax1.plot(backtest_df['date'], backtest_df['predicted_spot'], 
                label='Predicted Spot Price', linewidth=2, color='orange', linestyle='--', marker='s', markersize=3)
        ax1.plot(backtest_df['date'], backtest_df['on_demand'], 
                label='On-Demand Price', linewidth=1, color='gray', alpha=0.5)
        ax1.set_title(f'BACKTEST: Predicted vs Actual Prices (2025) - {self.pool_instance} @ {self.pool_az}',
                     fontsize=14, fontweight='bold')
        ax1.set_ylabel('Price (USD)')
        ax1.legend()
        ax1.grid(alpha=0.3)
        
        ax2 = fig.add_subplot(gs[1, :])
        backtest_df['abs_error'] = abs(backtest_df['predicted_spot'] - backtest_df['actual_spot'])
        backtest_df['pct_error'] = abs((backtest_df['predicted_spot'] - backtest_df['actual_spot']) / 
                                       backtest_df['actual_spot']) * 100
        ax2.bar(backtest_df['date'], backtest_df['abs_error'], color='coral', alpha=0.7, edgecolor='black', linewidth=0.5)
        ax2.set_title('Prediction Error Over Time', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Absolute Error (USD)')
        ax2.axhline(y=backtest_df['abs_error'].mean(), color='red', linestyle='--', 
                   label=f'Mean Error: ${backtest_df["abs_error"].mean():.5f}')
        ax2.legend()
        ax2.grid(alpha=0.3, axis='y')
        
        ax3 = fig.add_subplot(gs[2, :])
        colors = ['red' if r > 70 else 'orange' if r > 40 else 'steelblue' 
                 for r in backtest_df['avg_risk']]
        ax3.bar(backtest_df['date'], backtest_df['avg_risk'], color=colors, alpha=0.7, edgecolor='black', linewidth=0.5)
        ax3.axhline(y=70, color='red', linestyle='--', alpha=0.5, label='High Risk')
        ax3.axhline(y=40, color='orange', linestyle='--', alpha=0.5, label='Moderate')
        ax3.set_title('Ultra-Sensitive Risk Score (Detects 2-13% Changes)', fontsize=14, fontweight='bold')
        ax3.set_ylabel('Risk Score')
        ax3.legend()
        ax3.grid(alpha=0.3, axis='y')
        
        ax4 = fig.add_subplot(gs[3, :])
        hourly_sample = test_df.iloc[:5000]
        colors_z = ['red' if abs(z) > 3 else 'orange' if abs(z) > 2 else 'steelblue' 
                   for z in hourly_sample['z_score']]
        ax4.scatter(hourly_sample['timestamp'], hourly_sample['z_score'], 
                   c=colors_z, s=3, alpha=0.6)
        ax4.axhline(y=3, color='red', linestyle='--', label='3 sigma (p<0.003)')
        ax4.axhline(y=-3, color='red', linestyle='--')
        ax4.axhline(y=2, color='orange', linestyle='--', alpha=0.5, label='2 sigma (p<0.05)')
        ax4.axhline(y=-2, color='orange', linestyle='--', alpha=0.5)
        ax4.set_title('Z-Score: Statistical Anomaly Detection (Hourly)', fontsize=14, fontweight='bold')
        ax4.set_ylabel('Z-Score (sigma)')
        ax4.legend()
        ax4.grid(alpha=0.3)
        
        ax5 = fig.add_subplot(gs[4, 0])
        ax5.hist(backtest_df['abs_error'], bins=30, color='coral', alpha=0.7, edgecolor='black')
        ax5.axvline(x=backtest_df['abs_error'].mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: ${backtest_df["abs_error"].mean():.5f}')
        ax5.set_title('Prediction Error Distribution', fontweight='bold')
        ax5.set_xlabel('Absolute Error (USD)')
        ax5.set_ylabel('Frequency')
        ax5.legend()
        ax5.grid(alpha=0.3, axis='y')
        
        ax6 = fig.add_subplot(gs[4, 1])
        ax6.hist(backtest_df['avg_risk'], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
        ax6.axvline(x=70, color='red', linestyle='--', linewidth=2, label='High Risk')
        ax6.axvline(x=40, color='orange', linestyle='--', linewidth=2, label='Moderate')
        ax6.set_title('Risk Score Distribution', fontweight='bold')
        ax6.set_xlabel('Risk Score')
        ax6.set_ylabel('Days')
        ax6.legend()
        ax6.grid(alpha=0.3, axis='y')
        
        ax7 = fig.add_subplot(gs[4, 2])
        ax7.scatter(backtest_df['actual_spot'], backtest_df['predicted_spot'], 
                   c=backtest_df['avg_risk'], cmap='RdYlGn_r', s=50, alpha=0.6, edgecolors='black')
        ax7.plot([backtest_df['actual_spot'].min(), backtest_df['actual_spot'].max()],
                [backtest_df['actual_spot'].min(), backtest_df['actual_spot'].max()],
                'k--', linewidth=2, label='Perfect Prediction')
        ax7.set_title('Predicted vs Actual (colored by risk)', fontweight='bold')
        ax7.set_xlabel('Actual Spot Price (USD)')
        ax7.set_ylabel('Predicted Spot Price (USD)')
        ax7.legend()
        ax7.grid(alpha=0.3)
        
        ax8 = fig.add_subplot(gs[5, :])
        ax8.axis('off')
        
        mae = backtest_df['abs_error'].mean()
        rmse = np.sqrt((backtest_df['abs_error']**2).mean())
        mape = backtest_df['pct_error'].mean()
        
        summary = f"""
WALK-FORWARD BACKTEST RESULTS (2025)
{'='*80}

MODEL TRAINING:
  Training Period: 2023-2024
  Test Period: Jan-Sep 2025 ({len(backtest_df)} days)
  Model: GradientBoosting (70%) + ElasticNet (30%)
  Baseline: mean={self.baseline_stats['mean']:.4f}, std={self.baseline_stats['std']:.6f}

PREDICTION PERFORMANCE:
  MAE: ${mae:.6f}
  RMSE: ${rmse:.6f}
  MAPE: {mape:.2f}%
  Best Day: ${backtest_df['abs_error'].min():.6f}
  Worst Day: ${backtest_df['abs_error'].max():.6f}

RISK ASSESSMENT:
  Average Risk: {backtest_df['avg_risk'].mean():.1f}/100
  Maximum Risk: {backtest_df['avg_risk'].max():.1f}/100
  High Risk Days (>70): {(backtest_df['avg_risk']>70).sum()}
  Moderate Risk Days (40-70): {((backtest_df['avg_risk']>=40) & (backtest_df['avg_risk']<70)).sum()}
  Low Risk Days (<40): {(backtest_df['avg_risk']<40).sum()}
  Max Z-Score: {backtest_df['max_z_score'].max():.1f} sigma

VALIDATION:
  No data leakage: Model trained on 2023-2024 ONLY
  Walk-forward: Each day predicted without future knowledge
  Statistical rigor: Z-scores, control charts, ML validation

BUSINESS IMPACT:
  Spot Usage Days: {(backtest_df['avg_risk']<40).sum()} ({(backtest_df['avg_risk']<40).sum()/len(backtest_df)*100:.1f}%)
  On-Demand Days: {(backtest_df['avg_risk']>=40).sum()} ({(backtest_df['avg_risk']>=40).sum()/len(backtest_df)*100:.1f}%)
  Expected Savings: ~{(backtest_df['avg_risk']<40).sum()/len(backtest_df)*70:.0f}% vs always On-Demand
"""
        
        ax8.text(0.05, 0.5, summary, fontsize=9, family='monospace',
                verticalalignment='center', fontweight='bold')
        
        plt.suptitle('Walk-Forward Backtest: Price Prediction + Ultra-Sensitive Risk Scoring',
                    fontsize=16, fontweight='bold', y=0.998)
        
        output_path = f'{OUTPUT_DIR}/complete_backtest.png'
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"Saved: {output_path}")
        plt.close()
    
    def save_outputs(self, backtest_df):
        """Save backtest results and summary report."""
        print("\n" + "="*80)
        print("SAVING OUTPUTS")
        print("="*80)
        
        backtest_df.to_csv(f'{OUTPUT_DIR}/backtest_results.csv', index=False)
        print(f"Saved: {OUTPUT_DIR}/backtest_results.csv")
        
        mae = backtest_df['abs_error'].mean()
        mape = backtest_df['pct_error'].mean()
        
        report = f"""COMPLETE BACKTEST REPORT
{'='*60}

Pool: {self.pool_instance} @ {self.pool_az}
Region: {self.region}

BACKTEST SETUP:
  Training: 2023-2024
  Testing: 2025 (Jan-Sep)
  Method: Walk-forward (day-by-day)
  Days tested: {len(backtest_df)}

PREDICTION PERFORMANCE:
  MAE: ${mae:.6f}
  MAPE: {mape:.2f}%

RISK ASSESSMENT:
  Avg: {backtest_df['avg_risk'].mean():.1f}/100
  Max: {backtest_df['avg_risk'].max():.1f}/100
  High risk days: {(backtest_df['avg_risk']>70).sum()}

VALIDATION:
  No data leakage
  Walk-forward backtest
  Real-world simulation

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
        
        with open(f'{OUTPUT_DIR}/backtest_report.txt', 'w') as f:
            f.write(report)
        print(f"Saved: {OUTPUT_DIR}/backtest_report.txt")
        
        print(f"\nAll outputs in: {OUTPUT_DIR}")


def main():
    """Main execution function."""
    print("\n" + "="*80)
    print("AWS SPOT PRICE PREDICTION MODEL v1.0.0")
    print("Walk-Forward Backtest + Ultra-Sensitive Risk Scoring")
    print("="*80)
    
    model = CompleteBacktestModel('ap-south-1')
    
    train_df, test_df, event_df = model.load_data(TRAINING_DATA, [TEST_Q1, TEST_Q2, TEST_Q3], EVENT_DATA)
    
    print("\n" + "="*80)
    print("FEATURE ENGINEERING")
    print("="*80)
    train_df, feature_cols = model.engineer_features(train_df)
    test_df, _ = model.engineer_features(test_df)
    print(f"Features created: {len(feature_cols)}")
    
    model.train_price_model(train_df, feature_cols)
    
    backtest_df = model.walk_forward_backtest(test_df)
    
    backtest_df, test_df = model.calculate_ultra_sensitive_risk(test_df, backtest_df)
    
    model.create_comprehensive_visualizations(backtest_df, test_df)
    
    model.save_outputs(backtest_df)
    
    print("\n" + "="*80)
    print("BACKTEST COMPLETE")
    print("="*80)


if __name__ == "__main__":
    main()


AWS SPOT PRICE PREDICTION MODEL v1.0.0
Walk-Forward Backtest + Ultra-Sensitive Risk Scoring

LOADING DATA
Selected Pool: c5.large @ aps1-az1
Train: 103,294 records (2023-01-01 to 2024-12-31)
Test: 39,249 records (2025-01-01 to 2025-09-30)
Events: 78
Data leakage check: 0 overlapping dates
Baseline: mean=0.4507, std=0.062509

FEATURE ENGINEERING
Features created: 29

TRAINING PRICE PREDICTION MODEL
Training samples: 103,293
Features: 29
Target: Next hour price_ratio
Training Gradient Boosting...
Training Elastic Net...
Validation MAE: 0.001016
Validation MAPE: 0.24%
Training complete

WALK-FORWARD BACKTESTING
Predicting day-by-day without future knowledge

Predicting 273 days...


Backtesting: 100%|███████████████████████████| 273/273 [00:02<00:00, 108.69it/s]



Days predicted: 272
MAE: $0.000273
RMSE: $0.000592
MAPE: 0.88%
Avg predicted ratio: 0.3912
Avg actual ratio: 0.3896

CALCULATING ULTRA-SENSITIVE RISK SCORES
Avg risk: 16.7/100
Max risk: 56.1/100
High risk days (>70): 0
Max Z-score: 2.5 sigma

CREATING VISUALIZATION
Saved: /Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs/complete_backtest.png

SAVING OUTPUTS
Saved: /Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs/backtest_results.csv
Saved: /Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs/backtest_report.txt

All outputs in: /Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs

BACKTEST COMPLETE


In [2]:
"""
AWS Spot Risk Classification Model (Model 2.1)
Version: 2.1.0
Date: November 2025

ML-based risk classifier trained on 2023-24 data, tested on 2025.
Predicts capacity stress and interruption risk using supervised learning.

Key Innovation: Trains on historical capacity tightening patterns, then
applies learned patterns to Model 1's forward-looking predictions.

Dependencies:
- pandas >= 1.3.0
- numpy >= 1.21.0
- scikit-learn >= 1.0.0
- matplotlib >= 3.4.0
- seaborn >= 0.11.0
- tqdm >= 4.62.0
- Model 1 predictions (backtest_results.csv)

Usage:
    python spot_risk_classifier_v2_1.py

Outputs:
    - risk_classifier_results.csv: Daily risk predictions with probabilities
    - risk_classifier_performance.png: Comprehensive validation dashboard
    - risk_classifier_report.txt: Summary with proper ML validation metrics
"""

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score, 
                            roc_curve, precision_recall_curve, f1_score, accuracy_score)
from sklearn.model_selection import TimeSeriesSplit
import os
from datetime import datetime
from tqdm import tqdm

sns.set_style("whitegrid")

# ============================================================================
# CONFIGURATION
# ============================================================================

# Training data (2023-2024)
TRAINING_DATA = '/Users/atharvapudale/Downloads/aws_2023_2024_complete_24months.csv'

# Test data (2025) - hourly
TEST_Q1 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(1-2-3-25).csv'
TEST_Q2 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(4-5-6-25).csv'
TEST_Q3 = '/Users/atharvapudale/Downloads/mumbai_spot_data_sorted_asc(7-8-9-25).csv'

# Model 1 predictions (for test period only)
MODEL1_PREDICTIONS = '/Users/atharvapudale/spot-risk-prediction/struc/singlepool/PricePrediction/outputs/backtest_results.csv'

# Event data
EVENT_DATA = '/Users/atharvapudale/Downloads/aws_stress_events_2023_2025.csv'

# Output directory
OUTPUT_DIR = '/Users/atharvapudale/spot-risk-prediction/struc/singlepool/RiskClassifier/outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Risk event definition (for labeling training data)
# Use multiple thresholds for better class balance
RISK_THRESHOLDS = {
    'price_increase': 0.01,      # 1% price increase
    'volatility': 0.015,          # 1.5% rolling volatility
    'discount_compression': 0.02  # 2% discount decrease
}


class SpotRiskClassifier:
    """
    Model 2.1: ML-based spot risk classifier.
    
    Training Flow:
    1. Train on 2023-24 historical data
    2. Learn patterns of capacity stress (price spikes, volatility, discount compression)
    3. Test on 2025 data using Model 1's forward predictions
    
    Risk Prediction:
    - Binary: High Risk (1) vs Low Risk (0)
    - Probability: 0-100% likelihood of capacity stress
    - Features: Forward tightening, realized tightening, velocity, anomaly, events
    """
    
    def __init__(self, region='ap-south-1'):
        self.region = region
        self.pool_instance = None
        self.pool_az = None
        
        # ML models
        self.rf_model = None
        self.gb_model = None
        self.lr_model = None
        self.scaler = StandardScaler()
        
        # Training baseline
        self.baseline_stats = {}
        
    def load_and_prepare_data(self, train_path, test_paths, event_path):
        """Load training and test data."""
        print("\n" + "="*80)
        print("MODEL 2.1: ML-BASED RISK CLASSIFIER")
        print("="*80)
        print("Training on 2023-24, Testing on 2025 (proper train/test split)")
        
        # Load training data (2023-2024)
        print("\n" + "="*80)
        print("LOADING TRAINING DATA (2023-2024)")
        print("="*80)
        
        train_df = pd.read_csv(train_path)
        train_df = self._standardize_columns(train_df)
        train_df = train_df[train_df['Region'] == self.region]
        
        # Select best pool
        pool_counts = train_df.groupby(['InstanceType', 'AZ']).size().sort_values(ascending=False)
        best_pool = pool_counts.idxmax()
        self.pool_instance = best_pool[0]
        self.pool_az = best_pool[1]
        
        print(f"Selected Pool: {self.pool_instance} @ {self.pool_az}")
        
        train_df = train_df[(train_df['InstanceType'] == self.pool_instance) & 
                            (train_df['AZ'] == self.pool_az)]
        
        # Calculate baseline from training data
        self.baseline_stats['mean'] = train_df['price_ratio'].mean()
        self.baseline_stats['std'] = train_df['price_ratio'].std()
        self.baseline_stats['median'] = train_df['price_ratio'].median()
        
        print(f"Training: {len(train_df):,} records ({train_df['timestamp'].min()} to {train_df['timestamp'].max()})")
        print(f"Baseline: mean={self.baseline_stats['mean']:.4f}, std={self.baseline_stats['std']:.6f}")
        
        # Load test data (2025)
        print("\n" + "="*80)
        print("LOADING TEST DATA (2025)")
        print("="*80)
        
        test_dfs = []
        for path in test_paths:
            df = pd.read_csv(path)
            df = self._standardize_columns(df)
            df = df[df['Region'] == self.region]
            df = df[(df['InstanceType'] == self.pool_instance) & (df['AZ'] == self.pool_az)]
            test_dfs.append(df)
        
        test_df = pd.concat(test_dfs, ignore_index=True).sort_values('timestamp')
        
        print(f"Test: {len(test_df):,} records ({test_df['timestamp'].min()} to {test_df['timestamp'].max()})")
        
        # Load events
        event_df = pd.read_csv(event_path)
        event_df = self._standardize_event_columns(event_df)
        
        print(f"Events: {len(event_df)}")
        
        # Verify no overlap
        train_dates = set(train_df['timestamp'].dt.date)
        test_dates = set(test_df['timestamp'].dt.date)
        overlap = train_dates & test_dates
        
        print(f"\n✓ Data leakage check: {len(overlap)} overlapping dates")
        if len(overlap) > 0:
            print(f"  WARNING: Found overlap! Removing from test set...")
            test_df = test_df[~test_df['timestamp'].dt.date.isin(overlap)]
        
        return train_df, test_df, event_df
    
    def _standardize_columns(self, df):
        """Standardize column names."""
        df.columns = df.columns.str.lower().str.strip()
        
        col_map = {}
        for col in df.columns:
            if 'time' in col or 'date' in col:
                col_map[col] = 'timestamp'
            elif 'spot' in col and 'price' in col:
                col_map[col] = 'SpotPrice'
            elif 'ondemand' in col or 'on_demand' in col:
                col_map[col] = 'OnDemandPrice'
            elif 'instance' in col and 'type' in col:
                col_map[col] = 'InstanceType'
            elif col in ['az', 'availability_zone']:
                col_map[col] = 'AZ'
            elif col in ['region']:
                col_map[col] = 'Region'
        
        df = df.rename(columns=col_map)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['SpotPrice'] = pd.to_numeric(df['SpotPrice'], errors='coerce')
        df['OnDemandPrice'] = pd.to_numeric(df['OnDemandPrice'], errors='coerce')
        
        if 'Region' not in df.columns or df['Region'].isna().all():
            if 'AZ' in df.columns:
                df['Region'] = df['AZ'].str.extract(r'^([a-z]+-[a-z]+-\d+)')[0]
        
        df = df.dropna(subset=['SpotPrice', 'timestamp']).sort_values('timestamp')
        df['price_ratio'] = (df['SpotPrice'] / df['OnDemandPrice']).clip(0, 10)
        df['discount'] = (1 - df['price_ratio']).clip(0, 1)
        
        return df
    
    def _standardize_event_columns(self, df):
        """Standardize event calendar."""
        df.columns = df.columns.str.lower().str.strip()
        
        date_col = next((c for c in df.columns if 'date' in c), None)
        name_col = next((c for c in df.columns if 'event' in c or 'name' in c), None)
        
        rename_map = {}
        if date_col:
            rename_map[date_col] = 'event_date'
        if name_col:
            rename_map[name_col] = 'event_name'
        
        df = df.rename(columns=rename_map)
        df['event_date'] = pd.to_datetime(df['event_date'])
        
        return df.dropna(subset=['event_date'])
    
    def engineer_features(self, df, is_training=True):
        """Engineer features for ML model."""
        df = df.copy()
        
        # Capacity signals
        df['discount_baseline'] = df['discount'].rolling(168, min_periods=24).median()
        df['tighten_now'] = (df['discount_baseline'] - df['discount']).clip(lower=0)
        
        # Price velocity
        df['price_change_pct'] = df['SpotPrice'].pct_change().abs() * 100
        df['velocity_24h'] = df['price_change_pct'].rolling(24, min_periods=1).mean()
        df['velocity_168h'] = df['price_change_pct'].rolling(168, min_periods=24).mean()
        
        # Statistical anomaly
        baseline_mean = df['price_ratio'].rolling(168, min_periods=24).mean()
        baseline_std = df['price_ratio'].rolling(168, min_periods=24).std()
        df['z_score'] = ((df['price_ratio'] - baseline_mean) / baseline_std).fillna(0)
        df['z_anomaly'] = df['z_score'].abs()
        
        # Rolling statistics
        df['price_std_24h'] = df['SpotPrice'].rolling(24, min_periods=1).std()
        df['price_max_24h'] = df['SpotPrice'].rolling(24, min_periods=1).max()
        df['price_min_24h'] = df['SpotPrice'].rolling(24, min_periods=1).min()
        df['price_range_24h'] = df['price_max_24h'] - df['price_min_24h']
        
        # Discount compression rate
        df['discount_change'] = df['discount'].diff()
        df['discount_velocity'] = df['discount_change'].rolling(24, min_periods=1).mean()
        
        # Temporal features
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['is_business_hours'] = ((df['hour'] >= 9) & (df['hour'] <= 17)).astype(int)
        
        # Fill NaN
        feature_cols = [col for col in df.columns if col not in 
                       ['timestamp', 'SpotPrice', 'OnDemandPrice', 'InstanceType', 'AZ', 'Region']]
        df[feature_cols] = df[feature_cols].fillna(method='bfill').fillna(0)
        
        return df
    
    def create_risk_labels(self, df):
        """
        Create risk labels for training.
        
        Risk event = any of:
        1. Next 24h price increase > 1%
        2. Next 24h volatility > 1.5%
        3. Discount compression > 2%
        """
        print("\n" + "="*80)
        print("CREATING RISK LABELS (Training Target)")
        print("="*80)
        
        df = df.copy()
        
        # Future price change (next 24h)
        df['future_price_max'] = df['SpotPrice'].shift(-24).rolling(24, min_periods=1).max()
        df['future_price_change'] = ((df['future_price_max'] / df['SpotPrice']) - 1) * 100
        
        # Future volatility (next 24h)
        df['future_volatility'] = df['SpotPrice'].shift(-24).rolling(24, min_periods=1).std() / df['SpotPrice']
        
        # Future discount compression (next 24h)
        df['future_discount'] = df['discount'].shift(-24)
        df['discount_compression'] = (df['discount'] - df['future_discount']).clip(lower=0)
        
        # Risk label (any condition triggers risk=1)
        df['risk_label'] = (
            (df['future_price_change'] > RISK_THRESHOLDS['price_increase'] * 100) |
            (df['future_volatility'] > RISK_THRESHOLDS['volatility']) |
            (df['discount_compression'] > RISK_THRESHOLDS['discount_compression'])
        ).astype(int)
        
        # Remove rows with NaN labels (last 24 hours)
        df = df[:-24]
        
        print(f"Total samples: {len(df):,}")
        print(f"Risk events (label=1): {df['risk_label'].sum():,} ({df['risk_label'].mean()*100:.1f}%)")
        print(f"Low risk (label=0): {(~df['risk_label'].astype(bool)).sum():,} ({(1-df['risk_label'].mean())*100:.1f}%)")
        
        return df
    
    def train_models(self, train_df):
        """Train ensemble of ML classifiers."""
        print("\n" + "="*80)
        print("TRAINING ML CLASSIFIERS")
        print("="*80)
        
        # Feature columns
        feature_cols = [
            'tighten_now', 'velocity_24h', 'velocity_168h', 'z_anomaly',
            'price_std_24h', 'price_range_24h', 'discount_velocity',
            'hour', 'day_of_week', 'is_weekend', 'is_business_hours'
        ]
        
        X_train = train_df[feature_cols].values
        y_train = train_df['risk_label'].values
        
        print(f"Training samples: {len(X_train):,}")
        print(f"Features: {len(feature_cols)}")
        print(f"Class distribution: {np.bincount(y_train)}")
        
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        
        # Train Random Forest
        print("\nTraining Random Forest...")
        self.rf_model = RandomForestClassifier(
            n_estimators=200,
            max_depth=12,
            min_samples_split=20,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )
        self.rf_model.fit(X_train_scaled, y_train)
        
        # Train Gradient Boosting
        print("Training Gradient Boosting...")
        self.gb_model = GradientBoostingClassifier(
            n_estimators=150,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            random_state=42
        )
        self.gb_model.fit(X_train_scaled, y_train)
        
        # Train Logistic Regression
        print("Training Logistic Regression...")
        self.lr_model = LogisticRegression(
            C=0.1,
            class_weight='balanced',
            random_state=42,
            max_iter=1000
        )
        self.lr_model.fit(X_train_scaled, y_train)
        
        # Training accuracy
        y_pred_rf = self.rf_model.predict(X_train_scaled)
        y_pred_gb = self.gb_model.predict(X_train_scaled)
        y_pred_lr = self.lr_model.predict(X_train_scaled)
        
        # Ensemble (voting)
        y_pred_ensemble = ((y_pred_rf.astype(int) + y_pred_gb.astype(int) + y_pred_lr.astype(int)) >= 2).astype(int)
        
        print(f"\nTraining Accuracy:")
        print(f"  Random Forest: {accuracy_score(y_train, y_pred_rf):.1%}")
        print(f"  Gradient Boosting: {accuracy_score(y_train, y_pred_gb):.1%}")
        print(f"  Logistic Regression: {accuracy_score(y_train, y_pred_lr):.1%}")
        print(f"  Ensemble (voting): {accuracy_score(y_train, y_pred_ensemble):.1%}")
        
        self.feature_cols = feature_cols
        
        print("\n✓ Training complete")
    
    def predict_risk(self, test_df, model1_predictions=None):
        """Predict risk on test data with optional Model 1 integration."""
        print("\n" + "="*80)
        print("PREDICTING RISK ON TEST DATA (2025)")
        print("="*80)
        
        test_df = test_df.copy()
        
        # Extract features
        X_test = test_df[self.feature_cols].values
        X_test_scaled = self.scaler.transform(X_test)
        
        # Get predictions from each model
        y_pred_rf = self.rf_model.predict(X_test_scaled)
        y_pred_gb = self.gb_model.predict(X_test_scaled)
        y_pred_lr = self.lr_model.predict(X_test_scaled)
        
        # Get probabilities
        y_prob_rf = self.rf_model.predict_proba(X_test_scaled)[:, 1]
        y_prob_gb = self.gb_model.predict_proba(X_test_scaled)[:, 1]
        y_prob_lr = self.lr_model.predict_proba(X_test_scaled)[:, 1]
        
        # Ensemble voting
        y_pred_ensemble = ((y_pred_rf.astype(int) + y_pred_gb.astype(int) + y_pred_lr.astype(int)) >= 2).astype(int)
        y_prob_ensemble = (y_prob_rf + y_prob_gb + y_prob_lr) / 3
        
        test_df['risk_pred'] = y_pred_ensemble
        test_df['risk_prob'] = y_prob_ensemble
        test_df['risk_score'] = (y_prob_ensemble * 100).clip(0, 100)
        
        # If Model 1 predictions available, boost risk score
        if model1_predictions is not None:
            print("\n✓ Integrating Model 1 forward predictions")
            test_df = self._integrate_model1(test_df, model1_predictions)
        
        print(f"\nPredictions complete: {len(test_df):,} samples")
        print(f"  Predicted high-risk: {y_pred_ensemble.sum():,} ({y_pred_ensemble.mean()*100:.1f}%)")
        print(f"  Avg risk probability: {y_prob_ensemble.mean()*100:.1f}%")
        
        return test_df
    
    def _integrate_model1(self, test_df, model1_pred):
        """Integrate Model 1 forward tightening as risk boost."""
        # Aggregate to daily
        daily_test = test_df.groupby(test_df['timestamp'].dt.date).agg({
            'risk_prob': 'mean',
            'risk_score': 'mean',
            'discount': 'mean'
        }).reset_index()
        daily_test.columns = ['date', 'risk_prob', 'risk_score', 'discount']
        daily_test['date'] = pd.to_datetime(daily_test['date'])
        
        # Load Model 1
        model1_pred['date'] = pd.to_datetime(model1_pred['date'])
        
        # Calculate forward tightening
        merged = daily_test.merge(model1_pred[['date', 'predicted_ratio']], on='date', how='left')
        merged['predicted_discount'] = 1 - merged['predicted_ratio']
        merged['forward_tightening'] = (merged['discount'] - merged['predicted_discount']).clip(lower=0)
        
        # Boost risk score based on forward tightening
        # If Model 1 predicts tightening, increase risk
        merged['risk_boost'] = (merged['forward_tightening'] / merged['forward_tightening'].max() * 20).fillna(0)
        merged['risk_score_boosted'] = (merged['risk_score'] + merged['risk_boost']).clip(0, 100)
        
        # Map back to hourly
        date_map = merged.set_index('date')['risk_score_boosted'].to_dict()
        test_df['date_only'] = test_df['timestamp'].dt.date
        test_df['date_only'] = pd.to_datetime(test_df['date_only'])
        test_df['risk_score'] = test_df['date_only'].map(date_map).fillna(test_df['risk_score'])
        test_df = test_df.drop('date_only', axis=1)
        
        print(f"  Avg forward tightening boost: {merged['risk_boost'].mean():.1f} points")
        
        return test_df
    
    def evaluate(self, test_df):
        """Evaluate model performance on test set."""
        print("\n" + "="*80)
        print("EVALUATING MODEL PERFORMANCE")
        print("="*80)
        
        # Create labels for test set
        test_df = self.create_risk_labels(test_df)
        
        y_true = test_df['risk_label'].values
        y_pred = test_df['risk_pred'].values
        y_prob = test_df['risk_prob'].values
        
        # Metrics
        accuracy = accuracy_score(y_true, y_pred)
        
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = f1_score(y_true, y_pred)
        far = fp / (fp + tn) if (fp + tn) > 0 else 0
        
        # ROC AUC (if we have both classes)
        if len(np.unique(y_true)) > 1:
            roc_auc = roc_auc_score(y_true, y_prob)
        else:
            roc_auc = 0.0
        
        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'false_alarm_rate': far,
            'roc_auc': roc_auc,
            'confusion_matrix': cm,
            'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
            'total_events': int(y_true.sum()),
            'predicted_high_risk': int(y_pred.sum())
        }
        
        print("\nConfusion Matrix:")
        print(f"                  Predicted Low  Predicted High")
        print(f"Actual Low Risk        {tn:6d}        {fp:6d}")
        print(f"Actual High Risk       {fn:6d}        {tp:6d}")
        
        print(f"\nPerformance Metrics:")
        print(f"  Accuracy: {accuracy:.1%}")
        print(f"  Precision: {precision:.1%}")
        print(f"  Recall: {recall:.1%}")
        print(f"  F1-Score: {f1:.2f}")
        print(f"  False Alarm Rate: {far:.1%}")
        if roc_auc > 0:
            print(f"  ROC AUC: {roc_auc:.3f}")
        
        print(f"\nEvent Counts:")
        print(f"  Actual risk events: {metrics['total_events']:,}")
        print(f"  Predicted high-risk: {metrics['predicted_high_risk']:,}")
        print(f"  True Positives: {tp:,}")
        print(f"  False Alarms: {fp:,}")
        
        self.metrics = metrics
        self.test_df = test_df
        
        return metrics
    
    def create_visualization(self):
        """Create comprehensive visualization."""
        print("\n" + "="*80)
        print("CREATING VISUALIZATION")
        print("="*80)
        
        # Aggregate to daily for visualization
        df = self.test_df.copy()
        daily_df = df.groupby(df['timestamp'].dt.date).agg({
            'risk_score': 'mean',
            'risk_label': 'max',
            'risk_pred': 'max',
            'SpotPrice': 'mean'
        }).reset_index()
        daily_df.columns = ['date', 'risk_score', 'actual_risk', 'predicted_risk', 'spot_price']
        
        fig = plt.figure(figsize=(24, 16))
        gs = GridSpec(4, 3, figure=fig, hspace=0.4, wspace=0.3)
        
        # 1. Risk Score Timeline
        ax1 = fig.add_subplot(gs[0, :])
        colors = ['green' if r < 30 else 'yellow' if r < 50 else 'orange' if r < 70 else 'red' 
                 for r in daily_df['risk_score']]
        ax1.bar(daily_df['date'], daily_df['risk_score'], color=colors, alpha=0.7, edgecolor='black', linewidth=0.5)
        ax1.axhline(y=30, color='green', linestyle='--', alpha=0.5, label='Low')
        ax1.axhline(y=50, color='yellow', linestyle='--', alpha=0.5, label='Moderate')
        ax1.axhline(y=70, color='orange', linestyle='--', alpha=0.5, label='High')
        ax1.set_title('Risk Score Timeline (ML-Based, 2025 Test)', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Risk Score (0-100)')
        ax1.legend()
        ax1.grid(alpha=0.3, axis='y')
        
        # 2. Predicted vs Actual Risk Events
        ax2 = fig.add_subplot(gs[1, :])
        ax2.scatter(daily_df['date'], daily_df['actual_risk'], marker='o', s=100, 
                   color='red', label='Actual Risk Events', alpha=0.7, edgecolors='black')
        ax2.scatter(daily_df['date'], daily_df['predicted_risk']*0.9, marker='s', s=80,
                   color='blue', label='Predicted Risk', alpha=0.6, edgecolors='black')
        ax2.set_title('Predicted vs Actual Risk Events', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Risk Event (1=Yes, 0=No)')
        ax2.set_ylim(-0.1, 1.1)
        ax2.legend()
        ax2.grid(alpha=0.3)
        
        # 3. Confusion Matrix
        ax3 = fig.add_subplot(gs[2, 0])
        cm = self.metrics['confusion_matrix']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax3,
                   xticklabels=['Pred Low', 'Pred High'],
                   yticklabels=['Actual Low', 'Actual High'])
        ax3.set_title('Confusion Matrix', fontweight='bold')
        
        # 4. ROC Curve
        ax4 = fig.add_subplot(gs[2, 1])
        if self.metrics['roc_auc'] > 0:
            fpr, tpr, _ = roc_curve(df['risk_label'], df['risk_prob'])
            ax4.plot(fpr, tpr, linewidth=2, label=f"ROC AUC = {self.metrics['roc_auc']:.3f}")
            ax4.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
            ax4.set_title('ROC Curve', fontweight='bold')
            ax4.set_xlabel('False Positive Rate')
            ax4.set_ylabel('True Positive Rate')
            ax4.legend()
            ax4.grid(alpha=0.3)
        else:
            ax4.text(0.5, 0.5, 'ROC curve unavailable\n(single class in test)', 
                    ha='center', va='center', fontsize=12)
            ax4.set_title('ROC Curve', fontweight='bold')
        
        # 5. Precision-Recall Curve
        ax5 = fig.add_subplot(gs[2, 2])
        if self.metrics['total_events'] > 0:
            precision_curve, recall_curve, _ = precision_recall_curve(df['risk_label'], df['risk_prob'])
            ax5.plot(recall_curve, precision_curve, linewidth=2)
            ax5.set_title('Precision-Recall Curve', fontweight='bold')
            ax5.set_xlabel('Recall')
            ax5.set_ylabel('Precision')
            ax5.grid(alpha=0.3)
        else:
            ax5.text(0.5, 0.5, 'P-R curve unavailable\n(no events in test)', 
                    ha='center', va='center', fontsize=12)
            ax5.set_title('Precision-Recall Curve', fontweight='bold')
        
        # 6. Risk Score Distribution
        ax6 = fig.add_subplot(gs[3, 0])
        ax6.hist(daily_df['risk_score'], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
        ax6.axvline(x=30, color='green', linestyle='--', linewidth=2, label='Low')
        ax6.axvline(x=50, color='yellow', linestyle='--', linewidth=2, label='Moderate')
        ax6.axvline(x=70, color='orange', linestyle='--', linewidth=2, label='High')
        ax6.set_title('Risk Score Distribution', fontweight='bold')
        ax6.set_xlabel('Risk Score')
        ax6.set_ylabel('Days')
        ax6.legend()
        ax6.grid(alpha=0.3, axis='y')
        
        # 7. Metrics Comparison
        ax7 = fig.add_subplot(gs[3, 1])
        metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
        metrics_values = [self.metrics['accuracy'], self.metrics['precision'], 
                         self.metrics['recall'], self.metrics['f1_score']]
        colors_bar = ['green' if v >= 0.7 else 'orange' if v >= 0.5 else 'red' for v in metrics_values]
        ax7.barh(metrics_names, metrics_values, color=colors_bar, alpha=0.7, edgecolor='black')
        ax7.axvline(x=0.7, color='green', linestyle='--', linewidth=2, alpha=0.5, label='Target (70%)')
        ax7.set_xlim(0, 1)
        ax7.set_title('Performance Metrics', fontweight='bold')
        ax7.set_xlabel('Score')
        ax7.legend()
        ax7.grid(alpha=0.3, axis='x')
        
        # 8. Summary
        ax8 = fig.add_subplot(gs[3, 2])
        ax8.axis('off')
        
        summary = f"""
MODEL 2.1: ML-BASED RISK CLASSIFIER

TRAINING: 2023-2024
TESTING: 2025
Model: RF + GB + LR Ensemble

PERFORMANCE METRICS:
  Accuracy: {self.metrics['accuracy']:.1%}
  Precision: {self.metrics['precision']:.1%}
  Recall: {self.metrics['recall']:.1%}
  F1-Score: {self.metrics['f1_score']:.2f}
  False Alarm Rate: {self.metrics['false_alarm_rate']:.1%}
  ROC AUC: {self.metrics['roc_auc']:.3f}

CONFUSION MATRIX:
  True Positives: {self.metrics['tp']:,}
  False Positives: {self.metrics['fp']:,}
  True Negatives: {self.metrics['tn']:,}
  False Negatives: {self.metrics['fn']:,}

RISK EVENTS:
  Actual: {self.metrics['total_events']:,}
  Predicted: {self.metrics['predicted_high_risk']:,}

PRODUCTION READY:
  {'✓ YES' if self.metrics['accuracy'] > 0.7 and self.metrics['f1_score'] > 0.5 else '✗ NO'}
"""
        
        ax8.text(0.05, 0.5, summary, fontsize=9, family='monospace',
                verticalalignment='center', fontweight='bold')
        
        plt.suptitle('Model 2.1: ML-Based Risk Classifier - Train 2023-24, Test 2025',
                    fontsize=16, fontweight='bold', y=0.998)
        
        output_path = f'{OUTPUT_DIR}/risk_classifier_performance.png'
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Saved: {output_path}")
        plt.close()
    
    def save_outputs(self):
        """Save results and report."""
        print("\n" + "="*80)
        print("SAVING OUTPUTS")
        print("="*80)
        
        # Save daily predictions
        daily_df = self.test_df.groupby(self.test_df['timestamp'].dt.date).agg({
            'risk_score': 'mean',
            'risk_prob': 'mean',
            'risk_pred': 'max',
            'risk_label': 'max',
            'SpotPrice': 'mean',
            'OnDemandPrice': 'mean'
        }).reset_index()
        daily_df.columns = ['date', 'risk_score', 'risk_probability', 'predicted_risk', 
                           'actual_risk', 'spot_price', 'ondemand_price']
        
        daily_df.to_csv(f'{OUTPUT_DIR}/risk_classifier_results.csv', index=False)
        print(f"✓ Saved: {OUTPUT_DIR}/risk_classifier_results.csv")
        
        # Save report
        m = self.metrics
        
        report = f"""MODEL 2.1: ML-BASED RISK CLASSIFIER
{'='*80}

PRODUCTION READINESS: {'YES' if m['accuracy'] > 0.7 and m['f1_score'] > 0.5 else 'NO'}

TRAINING/TEST SPLIT:
  Training: 2023-2024 (historical data)
  Testing: 2025 (walk-forward)
  No data leakage: Verified

MODEL ARCHITECTURE:
  Random Forest (200 trees)
  Gradient Boosting (150 estimators)
  Logistic Regression
  Ensemble: Majority voting

PERFORMANCE METRICS:
  Accuracy: {m['accuracy']:.1%}
  Precision: {m['precision']:.1%}
  Recall: {m['recall']:.1%}
  F1-Score: {m['f1_score']:.2f}
  False Alarm Rate: {m['false_alarm_rate']:.1%}
  ROC AUC: {m['roc_auc']:.3f}

CONFUSION MATRIX:
                    Predicted Low  Predicted High
  Actual Low Risk        {m['tn']:6d}        {m['fp']:6d}
  Actual High Risk       {m['fn']:6d}        {m['tp']:6d}

BUSINESS IMPACT:
  Total Risk Events: {m['total_events']:,}
  Successfully Predicted: {m['tp']:,}
  Missed Events: {m['fn']:,}
  False Alarms: {m['fp']:,}

PRODUCTION CHECKLIST:
  [{'✓' if m['accuracy'] > 0.7 else ' '}] Accuracy > 70%
  [{'✓' if m['precision'] > 0.6 else ' '}] Precision > 60%
  [{'✓' if m['recall'] > 0.5 else ' '}] Recall > 50%
  [{'✓' if m['f1_score'] > 0.5 else ' '}] F1-Score > 0.50
  [{'✓' if m['false_alarm_rate'] < 0.15 else ' '}] False Alarm Rate < 15%
  [✓] Train/Test Split (2023-24 / 2025)
  [✓] ML-based (not rule-based)

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
        
        with open(f'{OUTPUT_DIR}/risk_classifier_report.txt', 'w') as f:
            f.write(report)
        print(f"✓ Saved: {OUTPUT_DIR}/risk_classifier_report.txt")
        
        print(f"\n✓ All outputs saved to: {OUTPUT_DIR}")


def main():
    """Main execution."""
    print("\n" + "="*80)
    print("MODEL 2.1: ML-BASED SPOT RISK CLASSIFIER")
    print("Train on 2023-24, Test on 2025")
    print("="*80)
    
    # Initialize
    model = SpotRiskClassifier(region='ap-south-1')
    
    # Load data
    train_df, test_df, event_df = model.load_and_prepare_data(
        TRAINING_DATA,
        [TEST_Q1, TEST_Q2, TEST_Q3],
        EVENT_DATA
    )
    
    # Engineer features
    print("\n" + "="*80)
    print("ENGINEERING FEATURES")
    print("="*80)
    train_df = model.engineer_features(train_df, is_training=True)
    test_df = model.engineer_features(test_df, is_training=False)
    print(f"✓ Features engineered: {len(model.feature_cols) if hasattr(model, 'feature_cols') else 'TBD'}")
    
    # Create labels for training
    train_df = model.create_risk_labels(train_df)
    
    # Train models
    model.train_models(train_df)
    
    # Predict on test set
    test_df = model.predict_risk(test_df, model1_predictions=pd.read_csv(MODEL1_PREDICTIONS))
    
    # Evaluate
    metrics = model.evaluate(test_df)
    
    # Visualize
    model.create_visualization()
    
    # Save
    model.save_outputs()
    
    print("\n" + "="*80)
    print("MODEL 2.1 COMPLETE")
    print("="*80)
    print(f"\nProduction Ready: {'YES ✓' if metrics['accuracy'] > 0.7 and metrics['f1_score'] > 0.5 else 'NO ✗'}")
    print(f"Accuracy: {metrics['accuracy']:.1%}")
    print(f"F1-Score: {metrics['f1_score']:.2f}")
    print(f"\nAll outputs saved to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()


MODEL 2.1: ML-BASED SPOT RISK CLASSIFIER
Train on 2023-24, Test on 2025

MODEL 2.1: ML-BASED RISK CLASSIFIER
Training on 2023-24, Testing on 2025 (proper train/test split)

LOADING TRAINING DATA (2023-2024)
Selected Pool: c5.large @ aps1-az1
Training: 103,294 records (2023-01-01 00:00:00 to 2024-12-31 23:50:00)
Baseline: mean=0.4507, std=0.062509

LOADING TEST DATA (2025)
Test: 39,249 records (2025-01-01 00:00:00 to 2025-09-30 23:40:00)
Events: 78

✓ Data leakage check: 0 overlapping dates

ENGINEERING FEATURES
✓ Features engineered: TBD

CREATING RISK LABELS (Training Target)
Total samples: 103,270
Risk events (label=1): 1,066 (1.0%)
Low risk (label=0): 102,204 (99.0%)

TRAINING ML CLASSIFIERS
Training samples: 103,270
Features: 11
Class distribution: [102204   1066]

Training Random Forest...
Training Gradient Boosting...
Training Logistic Regression...

Training Accuracy:
  Random Forest: 98.6%
  Gradient Boosting: 99.9%
  Logistic Regression: 60.8%
  Ensemble (voting): 99.1%

✓ Tr