In [None]:
# ==============================================================================
# CELL A: Main Analysis Function Definition (Complete)
# ==============================================================================
!pip install arch
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from arch import arch_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os # For creating directory
import warnings
warnings.filterwarnings('ignore')

# Create the directory for convergence plots. exist_ok=True makes this a no-op
# when the directory is already present and avoids the check-then-create race
# when several runs start at the same time.
os.makedirs('convergence_plots', exist_ok=True)

def create_lstm_sequences_global(features_data, target_data, n_steps_in):
    """
    Build sliding-window samples for a sequence model.

    Each sample is the block of ``n_steps_in`` consecutive feature rows ending
    at row ``end - 1``; its label is ``target_data[end - 1]``, i.e. the target
    that shares an index with the last row of the window.

    Returns a pair ``(X, y)`` of NumPy arrays. When either input has fewer than
    ``n_steps_in`` rows, both arrays are empty.
    """
    n_feature_rows = len(features_data)
    n_target_rows = len(target_data)

    # Not enough rows to form even a single window.
    if n_target_rows < n_steps_in or n_feature_rows < n_steps_in:
        return np.array([]), np.array([])

    # A window may end no later than the shorter of the two inputs.
    last_window_end = min(n_feature_rows, n_target_rows)
    window_ends = range(n_steps_in, last_window_end + 1)

    windows = [features_data[end - n_steps_in:end, :] for end in window_ends]
    labels = [target_data[end - 1] for end in window_ends]
    return np.array(windows), np.array(labels)


def analyze_single_stock(ticker_symbol, start_date, end_date_main_analysis, end_date_validation):
    """
    Performs the full volatility modeling pipeline for a single stock.

    Steps: download prices, derive log returns and a 30-day annualized rolling
    realized volatility, then fit and score (on a chronological 80/20 split)
    GARCH-family models, standalone ML models (XGBoost, Random Forest, DFFNN,
    LSTM) and ML models augmented with one GARCH volatility feature at a time.

    Args:
        ticker_symbol: Ticker passed to ``yf.download``.
        start_date: Start of the download window.
        end_date_main_analysis: Last date used for model fitting/evaluation.
        end_date_validation: End of the download window; rows after
            ``end_date_main_analysis`` form the out-of-sample slice.

    Returns:
        Tuple ``(results_df, validation_details)``. ``results_df`` has the
        columns Model, RMSE, MAE, R2, Ticker (an empty DataFrame when nothing
        was evaluated). ``validation_details`` is a dict with the keys
        ``ticker``, ``best_model_for_validation`` and ``validation_RMSE``, or
        an empty dict when data preparation fails.
    """
    print(f"\n{'='*30} Analyzing {ticker_symbol} {'='*30}")
    TICKER = ticker_symbol

    current_stock_results_list = []

    # --- Cell 2 & 3: Data Fetching, Cleaning, Feature Engineering ---
    print(f"\n--- {TICKER}: Fetching and Preparing Data ---")
    try:
        data_full = yf.download(TICKER, start=start_date, end=end_date_validation, progress=False)
        if data_full.empty:
            print(f"No data for {TICKER}. Skipping.")
            return pd.DataFrame(), {}

        data_main = data_full[:end_date_main_analysis].copy()
        data_val = data_full[end_date_main_analysis:end_date_validation].copy()
        # Label-based slicing is inclusive on both ends, so drop any overlap.
        if not data_val.empty and not data_main.empty and data_val.index[0] <= data_main.index[-1]:
            data_val = data_val[data_val.index > data_main.index[-1]]

        if data_main.empty:
            print(f"No main analysis data for {TICKER} up to {end_date_main_analysis}. Skipping.")
            return pd.DataFrame(), {}

        # yfinance now defaults to auto_adjust=True, in which case there is no
        # 'Adj Close' column and 'Close' already holds adjusted prices.
        # get_level_values(0) works for both flat and MultiIndex columns.
        top_level_cols = data_main.columns.get_level_values(0)
        if 'Adj Close' in top_level_cols:
            price_col = 'Adj Close'
        elif 'Close' in top_level_cols:
            price_col = 'Close'
        else:
            print(f"No 'Adj Close' or 'Close' column for {TICKER}. Skipping.")
            return pd.DataFrame(), {}

        price_series_main = data_main[price_col].copy()
        if isinstance(price_series_main, pd.DataFrame):
            # MultiIndex columns (price field, ticker): keep the single ticker column.
            price_series_main = price_series_main.iloc[:, 0]
        price_series_main = price_series_main.dropna()
        if len(price_series_main) < 2:
            print(f"Too little price data for {TICKER}")
            return pd.DataFrame(), {}

        log_returns_main = (np.log(price_series_main) - np.log(price_series_main.shift(1))).dropna()
        log_returns_main.name = 'Log_Returns'
        VOL_WINDOW = 30
        if len(log_returns_main) < VOL_WINDOW:
            print(f"Too little return data for {TICKER} for VOL_WINDOW {VOL_WINDOW}")
            return pd.DataFrame(), {}

        # Annualized rolling standard deviation of daily log returns.
        realized_vol_main = log_returns_main.rolling(window=VOL_WINDOW).std() * np.sqrt(252)
        realized_vol_main.name = 'Realized_Vol'

        df_model = pd.DataFrame({'Log_Returns': log_returns_main, 'Realized_Vol': realized_vol_main}).dropna()
        if df_model.empty:
            print(f"df_model is empty for {TICKER} after processing main data. Skipping.")
            return pd.DataFrame(), {}
    except Exception as e:
        print(f"Error processing data for {TICKER}: {e}")
        return pd.DataFrame(), {}

    # --- Cell 4: Evaluation Function and Train/Test Split ---
    print(f"\n--- {TICKER}: Setting up Evaluation and Train/Test Split ---")

    def _evaluate_model_local(y_true, y_pred, model_name="Model"):
        """Return RMSE/MAE/R2 for one model, truncating to the common length."""
        y_true_eval = np.array(y_true).flatten()
        y_pred_eval = np.array(y_pred).flatten()
        min_len = min(len(y_true_eval), len(y_pred_eval))
        if min_len == 0:
            return {'Model': model_name, 'RMSE': np.nan, 'MAE': np.nan, 'R2': np.nan, 'Ticker': TICKER}
        y_true_eval, y_pred_eval = y_true_eval[:min_len], y_pred_eval[:min_len]

        rmse = np.sqrt(mean_squared_error(y_true_eval, y_pred_eval))
        mae = mean_absolute_error(y_true_eval, y_pred_eval)
        r2 = r2_score(y_true_eval, y_pred_eval)
        return {'Model': model_name, 'RMSE': rmse, 'MAE': mae, 'R2': r2, 'Ticker': TICKER}

    def _build_dffnn(n_features):
        """Build the dense feed-forward network for ``n_features`` inputs."""
        return Sequential([
            Dense(64, activation='relu', input_shape=(n_features,)),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dense(1),
        ])

    def _build_lstm(n_steps, n_features):
        """Build the LSTM network for inputs shaped (n_steps, n_features)."""
        return Sequential([
            LSTM(32, activation='relu', input_shape=(n_steps, n_features), return_sequences=False),
            Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(1),
        ])

    def _train_keras_model(model, X_train, y_train, plot_label):
        """Compile and fit with early stopping, then save the loss-curve plot."""
        model.compile(optimizer='adam', loss='mean_squared_error')
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=0)
        history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1,
                            callbacks=[early_stop], verbose=0)
        plt.figure(figsize=(7, 4))
        plt.plot(history.history['loss'], label='Train')
        plt.plot(history.history['val_loss'], label='Val')
        plt.title(f'{plot_label} Conv - {TICKER}')
        plt.legend()
        plt.savefig(f'convergence_plots/{TICKER}_{plot_label}_convergence.png')
        plt.close()
        return model

    def _fit_tree_model(name, model, X_train, y_train, X_test, y_test):
        """Fit a tree ensemble; XGBoost needs an eval_set for early stopping."""
        if name == "XGBoost":
            # NOTE(review): early stopping monitors the test split, so the
            # reported XGBoost test metrics are optimistic. Consider a
            # validation slice carved from the training data instead.
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        else:
            model.fit(X_train, y_train)
        return model

    test_size_fraction = 0.2
    if len(df_model) < 2:
        print(f"Cannot split {TICKER}")
        return pd.DataFrame(), {}
    # Clamp so both the train and the test side hold at least one row.
    split_idx = int(len(df_model) * (1 - test_size_fraction))
    split_idx = min(max(split_idx, 1), len(df_model) - 1)

    # Returns are scaled to percent for the GARCH fits.
    garch_train_returns = df_model['Log_Returns'][:split_idx].copy() * 100
    garch_test_returns = df_model['Log_Returns'][split_idx:].copy() * 100
    garch_test_realized_vol = df_model['Realized_Vol'][split_idx:].copy()

    ml_test_target = df_model['Realized_Vol'][split_idx:].copy()

    if garch_test_returns.empty or garch_test_realized_vol.empty:
        print(f"GARCH test set empty for {TICKER}. GARCH models will have no test data.")
        # Allow to continue for ML if ML data is fine

    # --- Cell 5: GARCH Models ---
    print(f"\n--- {TICKER}: Fitting GARCH-Family Models ---")
    garch_predictions_main = {}
    garch_results_objects_main = {}  # Fitted GARCH results; in-sample vol feeds the hybrids
    DISTRIBUTIONS = ['normal', 't', 'ged']
    HORIZON_LENGTH_MAIN = len(garch_test_returns) if not garch_test_returns.empty else 0

    garch_model_specs = {
        "GARCH(1,1)": {'vol': 'Garch', 'p': 1, 'o': 0, 'q': 1},
        "EGARCH(1,1)": {'vol': 'EGARCH', 'p': 1, 'o': 1, 'q': 1},
        "APARCH(1,1)": {'vol': 'APARCH', 'p': 1, 'o': 1, 'q': 1},
        "GJR-GARCH(1,1)": {'vol': 'Garch', 'p': 1, 'o': 1, 'q': 1}
    }

    if not garch_train_returns.empty and HORIZON_LENGTH_MAIN > 0:
        for model_base_name, params in garch_model_specs.items():
            for dist_name in DISTRIBUTIONS:
                model_label = f"{model_base_name}_{dist_name}"
                try:
                    garch_m = arch_model(garch_train_returns, dist=dist_name, **params).fit(disp='off', show_warning=False)
                    garch_results_objects_main[model_label] = garch_m

                    # EGARCH/APARCH multi-step forecasts are requested via simulation.
                    if params['vol'] in ['EGARCH', 'APARCH']:
                        fc = garch_m.forecast(horizon=HORIZON_LENGTH_MAIN, method='simulation', simulations=500, reindex=False)
                    else:
                        fc = garch_m.forecast(horizon=HORIZON_LENGTH_MAIN, reindex=False)

                    pred_var = fc.variance.values.flatten()
                    if len(pred_var) != HORIZON_LENGTH_MAIN:
                        pred_var = np.full(HORIZON_LENGTH_MAIN, np.nan)  # Fallback

                    # Undo the percent scaling, then annualize.
                    pred_vol = np.sqrt(pred_var) / 100 * np.sqrt(252)
                    garch_predictions_main[model_label] = pred_vol
                    current_stock_results_list.append(_evaluate_model_local(garch_test_realized_vol, pred_vol, model_label))
                except Exception as e:
                    # Report the failure instead of hiding it; the model is
                    # recorded as failed and the remaining models still run.
                    print(f"    Error GARCH {model_label} for {TICKER}: {e}")
                    garch_predictions_main[model_label] = np.full(HORIZON_LENGTH_MAIN, np.nan)
                    garch_results_objects_main[model_label] = None
    else:
        print(f"  Skipping GARCH fitting for {TICKER} due to empty train returns or zero horizon.")

    garch_preds_for_hybrid_main = pd.DataFrame(index=ml_test_target.index if not ml_test_target.empty else None)
    for model_name_key, preds_val in garch_predictions_main.items():
        col_hybrid_name = f'{model_name_key}_Pred_Vol'
        if isinstance(preds_val, np.ndarray) and preds_val.ndim == 1 and len(preds_val) == len(garch_preds_for_hybrid_main):
            garch_preds_for_hybrid_main[col_hybrid_name] = preds_val
        else:  # Ensure column exists even if preds failed or length mismatch
            garch_preds_for_hybrid_main[col_hybrid_name] = np.nan

    # --- Cell 6: ML Data Prep ---
    print(f"\n--- {TICKER}: Preparing Data for ML Models ---")
    N_LAGS = 10
    N_STEPS_LSTM = 5  # Must be defined before any LSTM model or sequence is built
    df_ml = df_model.copy()
    for lag in range(1, N_LAGS + 1):
        df_ml[f'Log_Returns_Lag_{lag}'] = df_ml['Log_Returns'].shift(lag)
        df_ml[f'Realized_Vol_Lag_{lag}'] = df_ml['Realized_Vol'].shift(lag)
    df_ml.dropna(inplace=True)

    SKIP_ML_THIS_STOCK = False
    if df_ml.empty or len(df_ml) < N_LAGS + 5:
        print(f"  Not enough data for ML for {TICKER} after lagging. Skipping ML & Hybrid models.")
        SKIP_ML_THIS_STOCK = True

    if not SKIP_ML_THIS_STOCK:
        y_ml = df_ml['Realized_Vol'].copy()
        X_ml = df_ml.drop(['Log_Returns', 'Realized_Vol'], axis=1).copy()
        if len(X_ml) < 2:
            SKIP_ML_THIS_STOCK = True
            print(f"Skipping ML for {TICKER} due to insufficient data for split after lag.")
        else:
            split_idx_ml = int(len(X_ml) * (1 - test_size_fraction))
            split_idx_ml = min(max(split_idx_ml, 1), len(X_ml) - 1)

    if not SKIP_ML_THIS_STOCK:
        X_train_ml = X_ml[:split_idx_ml].copy()
        X_test_ml = X_ml[split_idx_ml:].copy()
        y_train_ml = y_ml[:split_idx_ml].copy()
        y_test_ml = y_ml[split_idx_ml:].copy()

        if y_train_ml.empty or y_test_ml.empty:
            print(f"  ML train or test target is empty for {TICKER}. Skipping ML & Hybrid.")
            SKIP_ML_THIS_STOCK = True
        else:
            # The scaler is fit on the training rows only and reused for test.
            scaler_X = MinMaxScaler()
            X_train_ml_scaled = scaler_X.fit_transform(X_train_ml)
            X_test_ml_scaled = scaler_X.transform(X_test_ml)

            # --- Cells 7-10: Standalone ML Models ---
            print(f"\n--- {TICKER}: Fitting Standalone ML Models ---")
            # Tree ensembles are stored as estimators; the neural networks are
            # built per fit because their input shape depends on the features.
            ml_models_to_run = {
                "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, random_state=42, early_stopping_rounds=5),
                "Random Forest": RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
                "DFFNN": None,
                "LSTM": None,
            }

            for ml_name, ml_model_obj in ml_models_to_run.items():
                try:
                    if ml_name in ["XGBoost", "Random Forest"]:
                        _fit_tree_model(ml_name, ml_model_obj, X_train_ml, y_train_ml, X_test_ml, y_test_ml)
                        preds = ml_model_obj.predict(X_test_ml)
                        current_stock_results_list.append(_evaluate_model_local(y_test_ml, preds, ml_name))
                    elif ml_name == "DFFNN":
                        dffnn_m = _train_keras_model(_build_dffnn(X_train_ml_scaled.shape[1]), X_train_ml_scaled, y_train_ml, ml_name)
                        preds = dffnn_m.predict(X_test_ml_scaled).flatten()
                        current_stock_results_list.append(_evaluate_model_local(y_test_ml, preds, ml_name))
                    elif ml_name == "LSTM":
                        # Use the train-fitted scaling (no test rows leak into the
                        # scaler), consistent with the hybrid LSTM below.
                        X_lstm_seq_train, y_lstm_seq_train = create_lstm_sequences_global(X_train_ml_scaled, y_train_ml.values, N_STEPS_LSTM)
                        X_lstm_seq_test, y_lstm_seq_test = create_lstm_sequences_global(X_test_ml_scaled, y_test_ml.values, N_STEPS_LSTM)

                        if X_lstm_seq_train.size > 0 and X_lstm_seq_test.size > 0:
                            # Feature count comes from the actual sequence tensor.
                            lstm_m = _train_keras_model(_build_lstm(N_STEPS_LSTM, X_lstm_seq_train.shape[2]), X_lstm_seq_train, y_lstm_seq_train, ml_name)
                            preds = lstm_m.predict(X_lstm_seq_test).flatten()
                            current_stock_results_list.append(_evaluate_model_local(y_lstm_seq_test, preds, ml_name))
                        else:
                            print(f"    Skipping LSTM for {TICKER} due to insufficient sequence data.")
                except Exception as e:
                    print(f"    Error {ml_name} for {TICKER}: {e}")

            # --- Cell 11: Hybrid Models ---
            print(f"\n--- {TICKER}: Fitting Hybrid Models ---")
            # Training-side GARCH feature: in-sample conditional volatility.
            garch_features_for_train_hybrid_stock = pd.DataFrame(index=X_train_ml.index)
            if not garch_train_returns.empty:
                for model_lbl, garch_obj_fit in garch_results_objects_main.items():
                    col_name_hyb_train = f"{model_lbl}_Pred_Vol"
                    if garch_obj_fit is None:  # Model fitting failed earlier
                        garch_features_for_train_hybrid_stock[col_name_hyb_train] = 0
                        continue
                    try:
                        cond_vol = garch_obj_fit.conditional_volatility / 100 * np.sqrt(252)
                        # ML train rows can extend slightly past the GARCH train
                        # rows; fill those gaps from neighbouring values.
                        garch_features_for_train_hybrid_stock[col_name_hyb_train] = cond_vol.reindex(X_train_ml.index).bfill().ffill()
                    except Exception as e:
                        print(f"    Could not build train feature {col_name_hyb_train} for {TICKER}: {e}")
                        garch_features_for_train_hybrid_stock[col_name_hyb_train] = 0  # Fallback

            # Ensure all GARCH pred columns from test set exist in train features (even if as 0)
            for expected_garch_col in garch_preds_for_hybrid_main.columns:
                if expected_garch_col not in garch_features_for_train_hybrid_stock.columns:
                    garch_features_for_train_hybrid_stock[expected_garch_col] = 0

            X_train_hybrid_base_s = X_train_ml.join(garch_features_for_train_hybrid_stock, how='left').fillna(0)
            X_test_hybrid_base_s = X_test_ml.join(garch_preds_for_hybrid_main, how='left').fillna(0)

            # Column order: base ML features first, then GARCH features.
            common_hybrid_cols_s = list(X_test_ml.columns) + list(garch_preds_for_hybrid_main.columns)
            for col_s in common_hybrid_cols_s:
                if col_s not in X_train_hybrid_base_s.columns:
                    X_train_hybrid_base_s[col_s] = 0
                if col_s not in X_test_hybrid_base_s.columns:
                    X_test_hybrid_base_s[col_s] = 0

            X_train_hybrid_final_s = X_train_hybrid_base_s[common_hybrid_cols_s].copy()
            X_test_hybrid_final_s = X_test_hybrid_base_s[common_hybrid_cols_s].copy()

            base_ml_feature_cols_s = list(X_train_ml.columns)
            # One hybrid per (ML model, single GARCH feature) combination.
            for specific_garch_feat_col in garch_preds_for_hybrid_main.columns:
                current_hybrid_cols = base_ml_feature_cols_s + [specific_garch_feat_col]

                curr_X_train_hyb = X_train_hybrid_final_s[current_hybrid_cols]
                curr_X_test_hyb = X_test_hybrid_final_s[current_hybrid_cols]

                scaler_hyb = MinMaxScaler()
                curr_X_train_hyb_sc = scaler_hyb.fit_transform(curr_X_train_hyb)
                curr_X_test_hyb_sc = scaler_hyb.transform(curr_X_test_hyb)

                garch_comp_name_for_label = specific_garch_feat_col.replace('_Pred_Vol', '')

                for ml_name, ml_model_obj_hyb_template in ml_models_to_run.items():
                    model_name_hyb = f"{ml_name}_Hybrid_with_{garch_comp_name_for_label}"
                    try:
                        if ml_name in ["XGBoost", "Random Forest"]:
                            # The stored estimator is refit on the hybrid features.
                            _fit_tree_model(ml_name, ml_model_obj_hyb_template, curr_X_train_hyb, y_train_ml, curr_X_test_hyb, y_test_ml)
                            preds_hyb = ml_model_obj_hyb_template.predict(curr_X_test_hyb)
                            current_stock_results_list.append(_evaluate_model_local(y_test_ml, preds_hyb, model_name_hyb))
                        elif ml_name == "DFFNN":
                            dffnn_hyb_m = _train_keras_model(_build_dffnn(curr_X_train_hyb_sc.shape[1]), curr_X_train_hyb_sc, y_train_ml, model_name_hyb)
                            preds_hyb = dffnn_hyb_m.predict(curr_X_test_hyb_sc).flatten()
                            current_stock_results_list.append(_evaluate_model_local(y_test_ml, preds_hyb, model_name_hyb))
                        elif ml_name == "LSTM":
                            X_lstm_h_train_seq, y_lstm_h_train_seq = create_lstm_sequences_global(curr_X_train_hyb_sc, y_train_ml.values, N_STEPS_LSTM)
                            X_lstm_h_test_seq, y_lstm_h_test_seq = create_lstm_sequences_global(curr_X_test_hyb_sc, y_test_ml.values, N_STEPS_LSTM)
                            if X_lstm_h_train_seq.size > 0 and X_lstm_h_test_seq.size > 0:
                                lstm_hyb_m = _train_keras_model(_build_lstm(N_STEPS_LSTM, X_lstm_h_train_seq.shape[2]), X_lstm_h_train_seq, y_lstm_h_train_seq, model_name_hyb)
                                preds_hyb = lstm_hyb_m.predict(X_lstm_h_test_seq).flatten()
                                current_stock_results_list.append(_evaluate_model_local(y_lstm_h_test_seq, preds_hyb, model_name_hyb))
                            else:
                                print(f"    Skipping {model_name_hyb} for {TICKER} due to insufficient sequence data.")
                    except Exception as e:
                        print(f"    Error {model_name_hyb} for {TICKER}: {e}")
    else:  # if SKIP_ML_THIS_STOCK was True
        print(f"  ML and Hybrid models skipped for {TICKER} due to earlier data issues.")

    # --- Short-Term Out-of-Sample Validation (Placeholder for full logic) ---
    print(f"\n--- {TICKER}: Short-Term Validation (2 months) ---")
    best_model_details_for_validation = {'ticker': TICKER, 'best_model_for_validation': None, 'validation_RMSE': np.nan}
    if current_stock_results_list:
        temp_results_df_val = pd.DataFrame(current_stock_results_list)
        if not temp_results_df_val.empty and 'RMSE' in temp_results_df_val.columns and temp_results_df_val['RMSE'].notna().any():
            best_model_name_for_stock = temp_results_df_val.loc[temp_results_df_val['RMSE'].idxmin()]['Model']
            print(f"  Best model for {TICKER} (main analysis): {best_model_name_for_stock}")
            best_model_details_for_validation['best_model_for_validation'] = best_model_name_for_stock
            # Actual validation forecasting logic is not implemented here: it
            # would refit the best model on `df_model` and predict on features
            # derived from `data_val`. 'validation_RMSE' therefore stays NaN.
            if not data_val.empty:
                print(f"  (Validation on {len(data_val)} days of OOS data would be performed here if fully implemented)")
            else:
                print(f"  No OOS data in data_val for {TICKER}")
        else:
            print(f"  No valid RMSEs for {TICKER} to determine best model for validation.")
    else:
        print(f"  No results for {TICKER} to determine best model for validation.")

    final_df_for_stock = pd.DataFrame()
    if current_stock_results_list:
        final_df_for_stock = pd.DataFrame(current_stock_results_list)

    print(f"--- Finished {TICKER} ---")
    return final_df_for_stock, best_model_details_for_validation

# ==============================================================================
# CELL B: Stock Lists and Main Execution Script
# ==============================================================================

# Date windows: model fitting/evaluation, then a short out-of-sample tail.
MAIN_ANALYSIS_START_DATE = '2010-01-01'
MAIN_ANALYSIS_END_DATE = '2022-12-31'
VALIDATION_END_DATE = '2023-02-28'  # Approx 2 months after main analysis

# Top 20 US Stocks by Market Cap (example list, might change over time)
ALL_US_STOCKS_FULL_LIST = [
    'AAPL', 'MSFT', 'GOOGL', 'GOOG', 'AMZN', 'NVDA', 'TSLA', 'BRK-B', 'META', 'UNH',
    'XOM', 'JNJ', 'JPM', 'V', 'LLY', 'PG', 'HD', 'MA', 'CVX', 'MRK'
    # 'PEP', 'COST', 'AVGO', 'ABBV', 'ADBE' # More options
]
# Using first 20 for this example
ALL_US_STOCKS = ALL_US_STOCKS_FULL_LIST[:20]

# Split the ticker list into 4 contiguous chunks; the first `remain` chunks
# receive one extra ticker so every ticker is assigned exactly once.
num_stocks = len(ALL_US_STOCKS)
stocks_per_machine, remain = divmod(num_stocks, 4)
chunk_sizes = [stocks_per_machine + (1 if machine_no < remain else 0) for machine_no in range(4)]

machine_stock_lists = []
current_idx = 0
for chunk_size in chunk_sizes:
    next_idx = current_idx + chunk_size
    machine_stock_lists.append(ALL_US_STOCKS[current_idx:next_idx])
    current_idx = next_idx

# --- !!! THIS IS WHERE YOU SELECT WHICH MACHINE'S LIST TO RUN !!! ---
# --- !!!      FOR EACH ACTUAL MACHINE, SET THIS MANUALLY        !!! ---
MACHINE_TO_SIMULATE = 1  # CHANGE THIS FROM 1 to 4 for each "machine"
# ---------------------------------------------------------------------

if MACHINE_TO_SIMULATE < 1 or MACHINE_TO_SIMULATE > 4:
    print("Invalid MACHINE_TO_SIMULATE value. Set to 1, 2, 3, or 4.")
    STOCKS_TO_RUN_ON_THIS_MACHINE = []  # Avoid running if invalid
    MACHINE_ID_STR = "invalid"
else:
    STOCKS_TO_RUN_ON_THIS_MACHINE = machine_stock_lists[MACHINE_TO_SIMULATE - 1]
    MACHINE_ID_STR = str(MACHINE_TO_SIMULATE)


print(f"\n--- Machine {MACHINE_ID_STR} starting analysis for: {STOCKS_TO_RUN_ON_THIS_MACHINE} ---")

all_results_machine = []
all_validation_details_machine = []

if not STOCKS_TO_RUN_ON_THIS_MACHINE:
    print(f"No stocks assigned to Machine {MACHINE_ID_STR}. Check MACHINE_TO_SIMULATE setting or stock list.")
else:
    for ticker_val in STOCKS_TO_RUN_ON_THIS_MACHINE:
        stock_perf_df, stock_val_details_dict = analyze_single_stock(
            ticker_symbol=ticker_val,
            start_date=MAIN_ANALYSIS_START_DATE,
            end_date_main_analysis=MAIN_ANALYSIS_END_DATE,
            end_date_validation=VALIDATION_END_DATE
        )
        # Keep only per-stock outputs that actually carry results.
        if not stock_perf_df.empty:
            all_results_machine.append(stock_perf_df)
        has_best_model = bool(stock_val_details_dict) and stock_val_details_dict.get('best_model_for_validation') is not None
        if has_best_model:
            all_validation_details_machine.append(stock_val_details_dict)

    if not all_results_machine:
        print(f"\nNo results generated for Machine {MACHINE_ID_STR}.")
    else:
        combined_df_machine = pd.concat(all_results_machine, ignore_index=True)
        output_filename = f'machine_{MACHINE_ID_STR}_stock_model_performance.csv'
        combined_df_machine.to_csv(output_filename, index=False)
        print(f"\nResults for Machine {MACHINE_ID_STR} saved to {output_filename}")

    if all_validation_details_machine:
        validation_summary_df = pd.DataFrame(all_validation_details_machine)
        validation_summary_df.to_csv(f'machine_{MACHINE_ID_STR}_validation_summary.csv', index=False)

    print(f"--- Machine {MACHINE_ID_STR} finished ---")



Collecting arch
  Downloading arch-7.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading arch-7.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (985 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/985.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m985.3/985.3 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: arch
Successfully installed arch-7.2.0

--- Machine 1 starting analysis for: ['AAPL', 'MSFT', 'GOOGL', 'GOOG', 'AMZN'] ---


--- AAPL: Fetching and Preparing Data ---
YF.download() has changed argument auto_adjust default to True
Error processing data for AAPL: 'Adj Close'


--- MSFT: Fetching and Preparing Data ---
Error processing data for MSFT: 'Adj Close'


--- GOOGL: Fetching and Preparing Data ---
Error processing data for GOOGL: 'Adj Close'


--- GOOG: Fetching and Preparing Data ---
Error processing data for

In [None]:
# ==============================================================================
# CELL C: Aggregation, Summary Table Generation, and Highlighting
# ==============================================================================
import pandas as pd
import numpy as np # For NaN comparison if needed
import glob # For finding files
# For styled Excel output, you might need: pip install openpyxl
# For displaying styled tables in Jupyter:
from IPython.display import display, HTML

print("\n--- Aggregating Results and Generating Summary Tables ---")

# --- Step 1: Gather every per-machine results CSV into a single DataFrame ---
# The file list is sorted so the concatenation order is the same on every run.
all_machine_csv_files = glob.glob('machine_*_stock_model_performance.csv')
all_machine_csv_files.sort()

if all_machine_csv_files:
    print(f"Found result files: {all_machine_csv_files}")
    list_of_dfs = []
    for csv_path in all_machine_csv_files:
        # A single unreadable file is reported and skipped; it must not abort
        # the aggregation of the remaining files.
        try:
            machine_frame = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            print(f"Warning: File {csv_path} is empty or malformed.")
            continue
        except Exception as read_error:
            print(f"Error reading file {csv_path}: {read_error}")
            continue

        if machine_frame.empty:
            # Header-only file: nothing to contribute.
            print(f"Warning: File {csv_path} is empty.")
        else:
            list_of_dfs.append(machine_frame)

    if not list_of_dfs:
        print("No data loaded from any machine CSV files.")
        # Empty frame with the expected columns so later code can still run.
        full_results_df = pd.DataFrame(columns=['Model', 'RMSE', 'MAE', 'R2', 'Ticker'])
    else:
        full_results_df = pd.concat(list_of_dfs, ignore_index=True)
        print(f"\nSuccessfully aggregated {len(list_of_dfs)} result files.")
        print(f"Total results loaded: {len(full_results_df)} model runs.")
else:
    print("No machine CSV files found (e.g., 'machine_1_stock_model_performance.csv').")
    print("Please ensure the individual machine scripts have run successfully and produced output.")
    # Empty frame with the expected columns so later code can still run.
    full_results_df = pd.DataFrame(columns=['Model', 'RMSE', 'MAE', 'R2', 'Ticker'])


def highlight_min_in_row(s):
    """Return per-cell CSS that highlights the minimum value(s) of one row.

    Intended for ``Styler.apply(..., axis=1)``: ``s`` is one row of a pivot
    table (one ticker, one RMSE per model). Cells equal to the row minimum
    get a yellow, bold style; every other cell gets an empty style string.
    NaN cells never compare equal to the minimum, so they are not highlighted.

    Defined before any branching because both the GARCH-only table (Step 2)
    and the all-models table (Step 3) use it; Step 3 must work even when
    Step 2 finds no standalone GARCH rows.
    """
    is_min = s == s.min()
    return ['background-color: yellow; font-weight: bold;' if v else '' for v in is_min]


if full_results_df.empty:
    print("Aggregated results DataFrame is empty. Cannot generate summaries.")
else:
    print("\n--- Full Aggregated Results (Sample) ---")
    display(HTML(full_results_df.head().to_html(index=False))) # Nicer display in Jupyter

    # Drop rows where RMSE might be NaN (if a model failed entirely for a stock)
    full_results_df.dropna(subset=['RMSE'], inplace=True)
    if full_results_df.empty:
        print("No valid RMSE values found after dropping NaNs. Cannot proceed.")
    else:
        # --- Step 2: Find Best GARCH Model (including distribution) for each stock ---
        # The pattern is anchored at both ends so only standalone
        # "<variant>(1,1)_<distribution>" names match, not hybrid model names
        # that merely contain a GARCH component.
        # na=False: a missing Model name counts as "not a GARCH model" instead
        # of putting NaN into the boolean mask (which would make indexing fail).
        garch_models_only_df = full_results_df[
            full_results_df['Model'].str.match(
                r'^(GARCH\(1,1\)_|EGARCH\(1,1\)_|APARCH\(1,1\)_|GJR-GARCH\(1,1\)_)(normal|t|ged)$',
                na=False,
            )
        ].copy()

        if not garch_models_only_df.empty:
            # idxmin gives, per ticker, the row label of the lowest-RMSE GARCH variant.
            best_garch_per_stock = garch_models_only_df.loc[garch_models_only_df.groupby('Ticker')['RMSE'].idxmin()]
            best_garch_summary = best_garch_per_stock[['Ticker', 'Model', 'RMSE', 'MAE', 'R2']].rename(
                columns={'Model': 'Best_GARCH_Variant'}
            ).set_index('Ticker').sort_index()

            print("\n\n--- Best GARCH Model (incl. Distribution) per Stock (by RMSE) ---")
            display(HTML(best_garch_summary.to_html()))
            best_garch_summary.to_csv("summary_best_garch_model_per_stock.csv")
            print("Saved: summary_best_garch_model_per_stock.csv")

            # Create a pivot table for all GARCH models vs Tickers (RMSE)
            pivot_garch_rmse = garch_models_only_df.pivot_table(index='Ticker', columns='Model', values='RMSE')

            styled_pivot_garch_rmse = pivot_garch_rmse.style.apply(highlight_min_in_row, axis=1)\
                                                            .format(precision=4)\
                                                            .set_caption("GARCH Models RMSE Comparison (Best per stock highlighted)")
            print("\n\n--- Styled GARCH Models RMSE Table (Best per stock highlighted) ---")
            display(styled_pivot_garch_rmse)
            # Excel export is best-effort: openpyxl may not be installed.
            try:
                styled_pivot_garch_rmse.to_excel("styled_garch_rmse_table.xlsx", engine='openpyxl')
                print("Saved: styled_garch_rmse_table.xlsx")
            except Exception as e:
                print(f"Could not save GARCH styled table to Excel (ensure openpyxl is installed): {e}")
        else:
            print("\nNo standalone GARCH model results found to determine best GARCH per stock.")


        # --- Step 3: Find Best Overall Model (Standalone or Hybrid) for each stock ---
        best_overall_per_stock = full_results_df.loc[full_results_df.groupby('Ticker')['RMSE'].idxmin()]
        best_overall_summary = best_overall_per_stock[['Ticker', 'Model', 'RMSE', 'MAE', 'R2']].rename(
            columns={'Model': 'Best_Overall_Model'}
        ).set_index('Ticker').sort_index()

        print("\n\n--- Best Overall Model per Stock (by RMSE) ---")
        display(HTML(best_overall_summary.to_html()))
        best_overall_summary.to_csv("summary_best_overall_model_per_stock.csv")
        print("Saved: summary_best_overall_model_per_stock.csv")

        # Optional: Create a styled pivot table for ALL models (can be very wide)
        # Consider filtering to top N models or specific types if too large
        try:
            pivot_all_rmse = full_results_df.pivot_table(index='Ticker', columns='Model', values='RMSE')
            styled_pivot_all_rmse = pivot_all_rmse.style.apply(highlight_min_in_row, axis=1)\
                                                        .format(precision=4)\
                                                        .set_caption("All Models RMSE Comparison (Best per stock highlighted)")
            print("\n\n--- Styled All Models RMSE Table (Best per stock highlighted) ---")
            print("(This table might be very wide)")
            # display(styled_pivot_all_rmse) # Can be too large for direct display
            styled_pivot_all_rmse.to_excel("styled_all_models_rmse_table.xlsx", engine='openpyxl')
            print("Saved: styled_all_models_rmse_table.xlsx")
        except Exception as e:
            print(f"Could not create or save the full styled pivot table: {e}")
            print("This might be due to too many models making the table too wide for Excel or processing.")

print("\n--- Aggregation and Summary Script Finished ---")


--- Aggregating Results and Generating Summary Tables ---
No machine CSV files found (e.g., 'machine_1_stock_model_performance.csv').
Please ensure the individual machine scripts have run successfully and produced output.
Aggregated results DataFrame is empty. Cannot generate summaries.

--- Aggregation and Summary Script Finished ---
