# Human review/ correction (if there is a minor mistake from LLM output)

## If no, just neglect this part of code --> jump to alpha factors' calculation

In [2]:
!pip install asteval

Collecting asteval
  Downloading asteval-1.0.6-py3-none-any.whl.metadata (6.2 kB)
Downloading asteval-1.0.6-py3-none-any.whl (22 kB)
Installing collected packages: asteval
Successfully installed asteval-1.0.6


In [5]:
import pandas as pd
import os

# --- Configuration ---
input_csv_filename = "HK_final_comprehensive_result_1.csv"
# Save to a new file to be safe, you can change this to input_csv_filename to overwrite
output_csv_filename = "HK_final_comprehensive_result_1_corrected.csv"

# Dictionary mapping the row index to the corrected 'code' string
corrections = {
    4: '(SMA(CLOSE, 20) - CLOSE)',    # Changed MEAN to SMA
    7: 'ATR(HIGH, LOW, CLOSE, 14)', # Added required arguments
    14: 'RSI(CLOSE, 14)'             # Added required argument
}

# --- Main Logic ---
print(f"Attempting to correct formulas in: {input_csv_filename}")

if not os.path.exists(input_csv_filename):
    print(f"Error: Input file not found at '{input_csv_filename}'")
else:
    try:
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(input_csv_filename)
        print(f"Successfully loaded '{input_csv_filename}'. Shape: {df.shape}")

        if 'code' not in df.columns:
             raise KeyError("The required 'code' column was not found in the CSV.")

        corrected_count = 0
        # Iterate through the corrections dictionary
        for index, new_code in corrections.items():
            if index in df.index:
                original_code = df.loc[index, 'code']
                # Update the 'code' column at the specified index
                df.loc[index, 'code'] = new_code
                print(f"  Index {index}: Corrected '{original_code}' ---> '{new_code}'")
                corrected_count += 1
            else:
                print(f"  Warning: Index {index} specified for correction not found in the DataFrame. Skipping.")

        if corrected_count > 0:
            # Save the modified DataFrame to the output CSV file
            # index=False prevents pandas from writing the DataFrame index as a column
            df.to_csv(output_csv_filename, index=False)
            print(f"\nSuccessfully applied {corrected_count} corrections.")
            print(f"Corrected data saved to: {output_csv_filename}")
            if output_csv_filename != input_csv_filename:
                print(f"You can now use '{output_csv_filename}' as input for the next step.")
            else:
                 print(f"Original file '{input_csv_filename}' has been overwritten.")
        else:
            print("\nNo corrections were applied (perhaps indices were incorrect or already modified).")

    except pd.errors.EmptyDataError:
        print(f"Error: The input file '{input_csv_filename}' is empty.")
    except KeyError as e:
         print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Please check the CSV file format and content.")

Attempting to correct formulas in: HK_final_comprehensive_result_1.csv
Successfully loaded 'HK_final_comprehensive_result_1.csv'. Shape: (17, 3)
  Index 4: Corrected '(MEAN(CLOSE, 20) - CLOSE)' ---> '(SMA(CLOSE, 20) - CLOSE)'
  Index 7: Corrected '(ATR(14))' ---> 'ATR(HIGH, LOW, CLOSE, 14)'
  Index 14: Corrected 'RSI(14)' ---> 'RSI(CLOSE, 14)'

Successfully applied 3 corrections.
Corrected data saved to: HK_final_comprehensive_result_1_corrected.csv
You can now use 'HK_final_comprehensive_result_1_corrected.csv' as input for the next step.


In [1]:
# Block 1 REPLACEMENT: Dynamic LLM Factor Calculation
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta
from asteval import Interpreter # Import asteval
import traceback # For detailed error logging
from tqdm import tqdm # For progress bars

# --- Configuration ---
LLM_FACTORS_CSV = "HK_final_comprehensive_result_1_corrected.csv" # Input from Phase I
OUTPUT_CSV = "processed_llm_alpha_data_values.csv" # Output for Block 2
TICKERS_LIST = ["0001.HK", "0002.HK", "0003.HK", "0005.HK", "0006.HK", "0011.HK", '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0316.HK',
                  "0322.HK", '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK', '0881.HK', '0883.HK', '0939.HK', '0941.HK', '0960.HK', '0968.HK', '0981.HK',
                  "0992.HK", "1024.HK", '1038.HK', '1044.HK', '1088.HK', "1093.HK", '1099.HK', '1109.HK', '1113.HK', '1177.HK', '1209.HK', '1211.HK', '1299.HK', '1378.HK', '1398.HK', '1810.HK', '1876.HK', '1928.HK',
                  '1929.HK', '1997.HK', '2015.HK', '2020.HK', '2269.HK', '2313.HK', '2318.HK', '2319.HK', '2331.HK', '2359.HK', '2382.HK', '2388.HK', '2628.HK', '2688.HK', '2899.HK', '3690.HK', '3692.HK', '3968.HK',
                  '3988.HK', '6618.HK', '6690.HK', '6862.HK', '9618.HK', '9633.HK', '9888.HK', '9901.HK', '9961.HK', '9988.HK', '9999.HK'] # Your list of tickers
END_DATE_STR = "2025-03-24" # Use consistent end date with analysis block
START_DATE_STR = "2020-03-24" # Use consistent start date with analysis block
LOOKBACK_BUFFER_DAYS = 200 # Extra days for rolling calculations (adjust as needed)

# --- Define Calculation Functions for asteval ---
# These functions MUST operate on pandas Series and return a pandas Series

def safe_division(numerator, denominator):
    """Performs division, replacing division by zero or NaN denominator with NaN."""
    # Ensure denominator is a Series for .replace
    if not isinstance(denominator, pd.Series):
        denominator = pd.Series(denominator) # Convert if scalar
    denom_safe = denominator.replace(0, np.nan)
    return numerator / denom_safe

def rolling_mean(series, window):
    return series.rolling(window=int(window), min_periods=int(window*0.8)).mean() # Example min_periods

def rolling_std(series, window):
    return series.rolling(window=int(window), min_periods=int(window*0.8)).std()

def rolling_min(series, window):
    return series.rolling(window=int(window), min_periods=int(window*0.8)).min()

def rolling_max(series, window):
    return series.rolling(window=int(window), min_periods=int(window*0.8)).max()

def ewma(series, span):
    return series.ewm(span=int(span), adjust=False).mean()

def series_delay(series, periods):
    return series.shift(int(periods))

# Use your existing RSI/ATR helpers, ensure they handle Series input/output
def calculate_rsi(close_series, window=14):
    window = int(window) # Ensure window is int
    delta = close_series.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    # Use ewm directly for rolling average
    avg_gain = gain.ewm(com=window - 1, min_periods=window, adjust=False).mean()
    avg_loss = loss.ewm(com=window - 1, min_periods=window, adjust=False).mean()
    rs = safe_division(avg_gain, avg_loss) # Use safe division
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi

def calculate_atr(high_series, low_series, close_series, window=14):
    window = int(window) # Ensure window is int
    prev_close = series_delay(close_series, 1) # Use the delay function
    tr1 = high_series - low_series
    tr2 = abs(high_series - prev_close)
    tr3 = abs(low_series - prev_close)
    # Need to handle potential different indices if series come from different sources before concat
    # Assuming they come from the same df, indices should align
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1, skipna=False) # Handle NaNs during max
    atr = true_range.ewm(alpha=1.0/window, min_periods=window, adjust=False).mean() # Use ewm
    return atr

# --- Main Calculation Logic ---

# 1. Load LLM Formulas
print(f"Loading LLM generated formulas from: {LLM_FACTORS_CSV}")
try:
    llm_formulas_df = pd.read_csv(LLM_FACTORS_CSV)
    # Basic validation
    if not all(col in llm_formulas_df.columns for col in ['domain', 'name', 'code']):
        raise ValueError("CSV missing required columns: 'domain', 'name', 'code'")
    # Create a unique factor identifier (e.g., combining name and maybe hash of code)
    # Using index as proxy for now, but a more robust name is better
    llm_formulas_df['factor_id'] = llm_formulas_df.index.astype(str) + "_" + llm_formulas_df['name'].str.replace(r'\W+', '_', regex=True).str[:30] # Create a usable ID

    llm_formulas = llm_formulas_df[['factor_id', 'code']].to_dict('records')
    print(f"Loaded {len(llm_formulas)} factor formulas.")
except Exception as e:
    print(f"ERROR loading or processing {LLM_FACTORS_CSV}: {e}")
    exit()

# 2. Prepare Date Range and Tickers
start_date_dt = datetime.strptime(START_DATE_STR, '%Y-%m-%d')
end_date_dt = datetime.strptime(END_DATE_STR, '%Y-%m-%d')
fetch_start_date = (start_date_dt - timedelta(days=LOOKBACK_BUFFER_DAYS)).strftime('%Y-%m-%d')
fetch_end_date = (end_date_dt + timedelta(days=1)).strftime('%Y-%m-%d') # yf includes start, excludes end

all_factor_results_stacked = {} # Store final results: {factor_id: Series(Index=(date, asset))}
failed_factors = {} # Store errors: {factor_id: [error messages]}

# 3. Loop through Tickers
for ticker in tqdm(TICKERS_LIST, desc="Processing Tickers"):
    print(f"\n--- Processing ticker: {ticker} ---")
    # 3.1 Download Data
    try:
        df_raw = yf.download(ticker, start=fetch_start_date, end=fetch_end_date, progress=False, timeout=30)
        if df_raw.empty:
            print(f"    No data downloaded for {ticker}. Skipping.")
            continue

        # Rename columns for consistency (lowercase)
        new_columns = []
        for col in df_raw.columns:
            if isinstance(col, tuple):
                # If it's a tuple, assume the name is the first element
                col_name = str(col[0]) # Convert to string for safety
            else:
                # Otherwise, treat it as a potential string
                col_name = str(col) # Convert to string for safety
            # Apply lower() and replace() to the extracted string name
            new_columns.append(col_name.lower().replace('adj close', 'adj_close'))
        df_raw.columns = new_columns
        
        if 'adj_close' not in df_raw.columns:
            if 'close' in df_raw.columns:
                 df_raw['adj_close'] = df_raw['close'] # Use close if adj_close missing
            else:
                 print(f"    Missing 'close' or 'adj_close' for {ticker}. Skipping.")
                 continue

        # Select and ensure required columns
        required_cols = ['open', 'high', 'low', 'close', 'volume', 'adj_close']
        df_ticker = df_raw[[col for col in required_cols if col in df_raw.columns]].copy()
        if not all(col in df_ticker.columns for col in ['high', 'low', 'close', 'volume']): # Open is optional for many factors
             print(f"    Missing fundamental OHLCV columns for {ticker}. Skipping.")
             continue

        # Add aliases commonly used in formulas
        df_ticker['CLOSE'] = df_ticker['close']
        df_ticker['HIGH'] = df_ticker['high']
        df_ticker['LOW'] = df_ticker['low']
        df_ticker['VOLUME'] = df_ticker['volume']
        if 'open' in df_ticker.columns: df_ticker['OPEN'] = df_ticker['open']
        if 'adj_close' in df_ticker.columns: df_ticker['ADJ_CLOSE'] = df_ticker['adj_close']

        print(f"    Data downloaded. Shape: {df_ticker.shape}")

    except Exception as e_yf:
        print(f"    Error downloading data for {ticker}: {e_yf}")
        continue # Skip to next ticker

    # 3.2 Setup asteval Interpreter for this ticker
    aeval = Interpreter()
    # Add safe functions and data to the interpreter's symbol table
    aeval.symtable['SMA'] = rolling_mean
    aeval.symtable['EMA'] = ewma
    aeval.symtable['STD'] = rolling_std
    aeval.symtable['MIN'] = rolling_min
    aeval.symtable['MAX'] = rolling_max
    aeval.symtable['DELAY'] = series_delay
    aeval.symtable['RSI'] = calculate_rsi
    aeval.symtable['ATR'] = calculate_atr
    aeval.symtable['ABS'] = np.abs
    aeval.symtable['LOG'] = np.log
    aeval.symtable['LOG1P'] = np.log1p
    aeval.symtable['SQRT'] = np.sqrt
    aeval.symtable['SIGN'] = np.sign
    # Add the ticker's data columns
    for col in df_ticker.columns:
        # Use uppercase aliases if they exist, otherwise use original lowercase
        symbol_name = col.upper() if col.upper() in ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME', 'ADJ_CLOSE'] else col
        aeval.symtable[symbol_name] = df_ticker[col]
    # Add numpy for potential use within formulas (use carefully)
    aeval.symtable['np'] = np

    # 3.3 Loop through LLM formulas and evaluate
    print(f"    Evaluating {len(llm_formulas)} formulas...")
    for factor_info in llm_formulas:
        factor_id = factor_info['factor_id']
        formula_str = factor_info['code']
        # print(f"      Attempting: {factor_id} = {formula_str}") # Debug print

        try:
            # Evaluate the formula string safely
            result_series = aeval.eval(formula_str)

            # Validate result
            if not isinstance(result_series, pd.Series):
                raise TypeError(f"Calculation did not return a pandas Series (returned {type(result_series)})")

            if result_series.empty:
                 raise ValueError("Calculation returned an empty Series")

            if result_series.isna().all():
                 raise ValueError("Calculation returned a Series with all NaNs")

            # Ensure index matches the original data's index for this ticker
            result_series = result_series.reindex(df_ticker.index)

            # --- Store Successful Result ---
            # Filter result to the analysis period
            result_series_filtered = result_series.loc[start_date_dt:end_date_dt]

            # Add asset level to index
            result_series_filtered.index = pd.MultiIndex.from_product(
                [result_series_filtered.index, [ticker]],
                names=['date', 'asset']
            )

            # Append to the combined dictionary
            if factor_id not in all_factor_results_stacked:
                all_factor_results_stacked[factor_id] = []
            all_factor_results_stacked[factor_id].append(result_series_filtered)
            # print(f"      Success: {factor_id}") # Debug print

        except Exception as e_eval:
            error_msg = f"Ticker {ticker}: Failed evaluating '{formula_str}'. Error: {type(e_eval).__name__}: {e_eval}"
            # print(f"      ERROR: {error_msg}") # Debug print
            if factor_id not in failed_factors:
                failed_factors[factor_id] = []
            failed_factors[factor_id].append(error_msg)
            # Optional: Print traceback for complex errors
            # if not isinstance(e_eval, (SyntaxError, NameError, TypeError, ValueError, KeyError)):
            #      traceback.print_exc()


# 4. Combine results across all tickers
print("\n--- Combining results across all tickers ---")
final_factors_list = []
successfully_calculated_factors = set()

for factor_id, series_list in tqdm(all_factor_results_stacked.items(), desc="Combining Factors"):
    if series_list:
        try:
            combined_series = pd.concat(series_list).sort_index()
            # Optional: Drop duplicates just in case (shouldn't happen with MultiIndex)
            combined_series = combined_series[~combined_series.index.duplicated(keep='first')]
            combined_series.name = factor_id # Set Series name
            final_factors_list.append(combined_series)
            successfully_calculated_factors.add(factor_id)
        except Exception as e_concat:
            print(f"ERROR concatenating results for factor {factor_id}: {e_concat}")
            if factor_id not in failed_factors: failed_factors[factor_id] = []
            failed_factors[factor_id].append(f"Concatenation Error: {e_concat}")

# 5. Create Final DataFrame (Stacked Format)
final_stacked_df = pd.DataFrame()
if final_factors_list:
    try:
        final_stacked_df = pd.concat(final_factors_list, axis=1)
        # Ensure index names are correct
        final_stacked_df.index.names = ['date', 'asset']
        print(f"\nSuccessfully calculated and combined {len(final_stacked_df.columns)} factors.")
        print(f"Final DataFrame shape (stacked): {final_stacked_df.shape}")
    except Exception as e_final_concat:
        print(f"ERROR creating final DataFrame: {e_final_concat}")
else:
    print("\nNo factors were successfully calculated for any ticker.")

# 6. Report Failures
if failed_factors:
    print("\n--- Factor Evaluation Failures ---")
    num_failed = len(failed_factors)
    num_successful = len(successfully_calculated_factors)
    total_attempted = len(llm_formulas)
    print(f"Attempted: {total_attempted}, Succeeded: {num_successful}, Failed: {num_failed}")
    # Print details for a few failed factors
    for i, (factor_id, errors) in enumerate(failed_factors.items()):
        if i < 5 or num_failed < 10: # Print details for first 5 or all if < 10 failed
             original_formula = next((f['code'] for f in llm_formulas if f['factor_id'] == factor_id), "N/A")
             print(f"\nFactor: {factor_id} (Formula: {original_formula})")
             print(f"  Errors ({len(errors)} tickers):")
             # Print first few error messages
             for j, err in enumerate(errors):
                  if j < 3: print(f"    - {err}")
                  elif j == 3: print(f"    - ... ({len(errors) - 3} more errors)")
                  else: break
        elif i == 5:
             print(f"\n... and {num_failed - 5} more failed factors.")
             break


# 7. Save the results
if not final_stacked_df.empty:
    print(f"\nSaving calculated factor values to: {OUTPUT_CSV}")
    try:
        # Convert datetime index level to string before saving to CSV if needed,
        # or handle timezone removal if Block 2 expects naive datetimes.
        # Assuming Block 2 handles timezone conversion if needed based on its code.
        final_stacked_df.to_csv(OUTPUT_CSV)
        print("Save successful.")
    except Exception as e_save:
        print(f"ERROR saving results to {OUTPUT_CSV}: {e_save}")
else:
    print("\nNo data to save.")

print("\n--- Dynamic Factor Calculation (Block 1 Replacement) Finished ---")

# ==============================================================================
# Now you can proceed with Block 2, loading data from OUTPUT_CSV
# Make sure Block 2's loading section points to processed_llm_alpha_data_values.csv
# and expects the STACKED format (Index = ['date', 'asset'], Columns = factor_ids)
# =================================

Loading LLM generated formulas from: HK_final_comprehensive_result_1_corrected.csv
Loaded 17 factor formulas.


Processing Tickers:   0%|          | 0/83 [00:00<?, ?it/s]


--- Processing ticker: 0001.HK ---
YF.download() has changed argument auto_adjust default to True


Processing Tickers:   1%|          | 1/83 [00:03<04:53,  3.57s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0002.HK ---


Processing Tickers:   2%|▏         | 2/83 [00:04<02:30,  1.86s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0003.HK ---


Processing Tickers:   4%|▎         | 3/83 [00:04<01:39,  1.24s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0005.HK ---


Processing Tickers:   5%|▍         | 4/83 [00:05<01:13,  1.08it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0006.HK ---


Processing Tickers:   6%|▌         | 5/83 [00:05<00:58,  1.34it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0011.HK ---


Processing Tickers:   7%|▋         | 6/83 [00:06<00:48,  1.59it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0012.HK ---


Processing Tickers:   8%|▊         | 7/83 [00:06<00:45,  1.66it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0016.HK ---


Processing Tickers:  10%|▉         | 8/83 [00:07<00:46,  1.61it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0027.HK ---


Processing Tickers:  11%|█         | 9/83 [00:07<00:48,  1.53it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0066.HK ---


Processing Tickers:  12%|█▏        | 10/83 [00:08<00:44,  1.63it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0101.HK ---


Processing Tickers:  13%|█▎        | 11/83 [00:09<00:43,  1.67it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0175.HK ---


Processing Tickers:  14%|█▍        | 12/83 [00:09<00:41,  1.72it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0241.HK ---


Processing Tickers:  16%|█▌        | 13/83 [00:09<00:36,  1.89it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0267.HK ---


Processing Tickers:  17%|█▋        | 14/83 [00:10<00:36,  1.89it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0285.HK ---


Processing Tickers:  18%|█▊        | 15/83 [00:10<00:34,  1.97it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0288.HK ---


Processing Tickers:  19%|█▉        | 16/83 [00:11<00:33,  2.02it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0291.HK ---


Processing Tickers:  20%|██        | 17/83 [00:12<00:38,  1.72it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0316.HK ---


Processing Tickers:  22%|██▏       | 18/83 [00:12<00:35,  1.83it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0322.HK ---


Processing Tickers:  23%|██▎       | 19/83 [00:13<00:33,  1.92it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0386.HK ---


Processing Tickers:  24%|██▍       | 20/83 [00:13<00:32,  1.91it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0388.HK ---


Processing Tickers:  25%|██▌       | 21/83 [00:14<00:34,  1.77it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0669.HK ---


Processing Tickers:  27%|██▋       | 22/83 [00:14<00:33,  1.80it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0688.HK ---


Processing Tickers:  28%|██▊       | 23/83 [00:15<00:32,  1.82it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0700.HK ---


Processing Tickers:  29%|██▉       | 24/83 [00:15<00:31,  1.90it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0762.HK ---


Processing Tickers:  30%|███       | 25/83 [00:16<00:31,  1.86it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0823.HK ---


Processing Tickers:  31%|███▏      | 26/83 [00:17<00:31,  1.79it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0836.HK ---


Processing Tickers:  33%|███▎      | 27/83 [00:17<00:30,  1.82it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0857.HK ---


Processing Tickers:  34%|███▎      | 28/83 [00:18<00:29,  1.84it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0868.HK ---


Processing Tickers:  35%|███▍      | 29/83 [00:18<00:28,  1.87it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0881.HK ---


Processing Tickers:  36%|███▌      | 30/83 [00:19<00:26,  2.02it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0883.HK ---


Processing Tickers:  37%|███▋      | 31/83 [00:19<00:27,  1.92it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0939.HK ---


Processing Tickers:  39%|███▊      | 32/83 [00:20<00:27,  1.89it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0941.HK ---


Processing Tickers:  40%|███▉      | 33/83 [00:20<00:27,  1.83it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0960.HK ---


Processing Tickers:  41%|████      | 34/83 [00:21<00:25,  1.93it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0968.HK ---


Processing Tickers:  42%|████▏     | 35/83 [00:21<00:25,  1.88it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0981.HK ---


Processing Tickers:  43%|████▎     | 36/83 [00:22<00:22,  2.10it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 0992.HK ---


Processing Tickers:  45%|████▍     | 37/83 [00:22<00:22,  2.08it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1024.HK ---


Processing Tickers:  46%|████▌     | 38/83 [00:27<01:19,  1.77s/it]

    Data downloaded. Shape: (1013, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1038.HK ---


Processing Tickers:  47%|████▋     | 39/83 [00:30<01:38,  2.23s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1044.HK ---


Processing Tickers:  48%|████▊     | 40/83 [00:32<01:26,  2.01s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1088.HK ---


Processing Tickers:  49%|████▉     | 41/83 [00:32<01:04,  1.54s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1093.HK ---


Processing Tickers:  51%|█████     | 42/83 [00:33<00:49,  1.20s/it]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1099.HK ---


Processing Tickers:  52%|█████▏    | 43/83 [00:33<00:38,  1.03it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1109.HK ---


Processing Tickers:  53%|█████▎    | 44/83 [00:34<00:33,  1.15it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1113.HK ---


Processing Tickers:  54%|█████▍    | 45/83 [00:34<00:29,  1.30it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1177.HK ---


Processing Tickers:  55%|█████▌    | 46/83 [00:35<00:25,  1.48it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1209.HK ---


Processing Tickers:  57%|█████▋    | 47/83 [00:35<00:22,  1.58it/s]

    Data downloaded. Shape: (1053, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1211.HK ---


Processing Tickers:  58%|█████▊    | 48/83 [00:36<00:21,  1.66it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1299.HK ---


Processing Tickers:  59%|█████▉    | 49/83 [00:36<00:19,  1.72it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1378.HK ---


Processing Tickers:  60%|██████    | 50/83 [00:37<00:17,  1.84it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1398.HK ---


Processing Tickers:  61%|██████▏   | 51/83 [00:37<00:16,  1.97it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1810.HK ---


Processing Tickers:  63%|██████▎   | 52/83 [00:37<00:14,  2.07it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1876.HK ---


Processing Tickers:  64%|██████▍   | 53/83 [00:38<00:13,  2.18it/s]

    Data downloaded. Shape: (1348, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1928.HK ---


Processing Tickers:  65%|██████▌   | 54/83 [00:38<00:13,  2.08it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1929.HK ---


Processing Tickers:  66%|██████▋   | 55/83 [00:39<00:13,  2.03it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 1997.HK ---


Processing Tickers:  67%|██████▋   | 56/83 [00:39<00:13,  2.01it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2015.HK ---


Processing Tickers:  69%|██████▊   | 57/83 [00:40<00:12,  2.12it/s]

    Data downloaded. Shape: (887, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2020.HK ---


Processing Tickers:  70%|██████▉   | 58/83 [00:40<00:12,  2.03it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2269.HK ---


Processing Tickers:  71%|███████   | 59/83 [00:41<00:11,  2.10it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2313.HK ---


Processing Tickers:  72%|███████▏  | 60/83 [00:41<00:10,  2.19it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2318.HK ---


Processing Tickers:  73%|███████▎  | 61/83 [00:42<00:09,  2.24it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2319.HK ---


Processing Tickers:  75%|███████▍  | 62/83 [00:42<00:08,  2.37it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2331.HK ---


Processing Tickers:  76%|███████▌  | 63/83 [00:43<00:09,  2.09it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2359.HK ---


Processing Tickers:  77%|███████▋  | 64/83 [00:43<00:09,  2.03it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2382.HK ---


Processing Tickers:  78%|███████▊  | 65/83 [00:44<00:09,  1.94it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2388.HK ---


Processing Tickers:  80%|███████▉  | 66/83 [00:44<00:08,  1.90it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2628.HK ---


Processing Tickers:  81%|████████  | 67/83 [00:45<00:08,  1.92it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2688.HK ---


Processing Tickers:  82%|████████▏ | 68/83 [00:45<00:07,  1.98it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 2899.HK ---


Processing Tickers:  83%|████████▎ | 69/83 [00:46<00:06,  2.20it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 3690.HK ---


Processing Tickers:  84%|████████▍ | 70/83 [00:46<00:05,  2.44it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 3692.HK ---


Processing Tickers:  86%|████████▌ | 71/83 [00:46<00:04,  2.48it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 3968.HK ---


Processing Tickers:  87%|████████▋ | 72/83 [00:47<00:04,  2.45it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 3988.HK ---


Processing Tickers:  88%|████████▊ | 73/83 [00:47<00:03,  2.57it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 6618.HK ---


Processing Tickers:  89%|████████▉ | 74/83 [00:47<00:03,  2.54it/s]

    Data downloaded. Shape: (1054, 12)
    Evaluating 17 formulas...

--- Processing ticker: 6690.HK ---


Processing Tickers:  90%|█████████ | 75/83 [00:48<00:03,  2.16it/s]

    Data downloaded. Shape: (1043, 12)
    Evaluating 17 formulas...

--- Processing ticker: 6862.HK ---


Processing Tickers:  92%|█████████▏| 76/83 [00:49<00:03,  2.07it/s]

    Data downloaded. Shape: (1364, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9618.HK ---


Processing Tickers:  93%|█████████▎| 77/83 [00:49<00:02,  2.11it/s]

    Data downloaded. Shape: (1171, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9633.HK ---


Processing Tickers:  94%|█████████▍| 78/83 [00:49<00:02,  2.36it/s]

    Data downloaded. Shape: (1115, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9888.HK ---


Processing Tickers:  95%|█████████▌| 79/83 [00:50<00:01,  2.40it/s]

    Data downloaded. Shape: (983, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9901.HK ---


Processing Tickers:  96%|█████████▋| 80/83 [00:50<00:01,  2.20it/s]

    Data downloaded. Shape: (1075, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9961.HK ---


Processing Tickers:  98%|█████████▊| 81/83 [00:51<00:00,  2.22it/s]

    Data downloaded. Shape: (967, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9988.HK ---


Processing Tickers:  99%|█████████▉| 82/83 [00:51<00:00,  2.29it/s]

    Data downloaded. Shape: (1309, 12)
    Evaluating 17 formulas...

--- Processing ticker: 9999.HK ---


Processing Tickers: 100%|██████████| 83/83 [00:52<00:00,  1.59it/s]


    Data downloaded. Shape: (1176, 12)
    Evaluating 17 formulas...

--- Combining results across all tickers ---


Combining Factors: 100%|██████████| 17/17 [00:00<00:00, 89.38it/s]



Successfully calculated and combined 17 factors.
Final DataFrame shape (stacked): (100025, 17)

Saving calculated factor values to: processed_llm_alpha_data_values.csv
Save successful.

--- Dynamic Factor Calculation (Block 1 Replacement) Finished ---


In [2]:
final_stacked_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0_Price_Momentum_10_days_,1_Rate_of_Change_ROC_10_days_,2_Moving_Average_Crossover_10_vs,3_Volume_Momentum,4_Mean_Reversion_20_days_,5_Moving_Average_Reversion,6_Stochastic_Oscillator_K_14_day,7_Average_True_Range_ATR_14_day_,8_Daily_High_Low_Range,9_Normalized_Bollinger_Band_Widt,10_Volume_Rate_of_Change_VROC_10_,11_Trading_Volume,12_Moving_Average_MA_,13_Exponential_Moving_Average_MA_,14_Relative_Strength_Index_RSI_,15_Bollinger_Bands_20_day_,16_Stochastic_Oscillator_K_14_day
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-03-24,0001.HK,-0.261719,-0.261719,-11.243292,11133732.0,9.331732,9.331732,0.096280,1.825778,1.379356,0.574961,inf,15296458,45.539855,43.705973,23.699002,0.712520,0.096280
2020-03-24,0002.HK,-0.170147,-0.170147,-3.527573,5853606.0,7.791596,7.791596,0.141588,1.835635,2.383914,0.272050,0.752496,9264804,61.469395,59.999745,27.437552,0.863975,0.141588
2020-03-24,0003.HK,-0.183619,-0.183619,-0.660818,34027820.0,1.625504,1.625504,0.172222,0.360250,0.339107,0.254600,0.772976,48176115,10.737181,10.524233,24.209762,0.872700,0.172222
2020-03-24,0005.HK,-0.077710,-0.077710,-7.111191,16681279.0,2.910289,2.910289,0.306929,1.263020,0.596777,0.311176,-0.483654,35692754,38.796450,38.542959,31.921737,0.844412,0.306929
2020-03-24,0006.HK,-0.220159,-0.220159,-4.299421,3932800.0,6.136819,6.136819,0.161290,1.304744,1.422925,0.363965,0.268858,6153012,38.316811,37.216102,25.250537,0.818017,0.161290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-24,9888.HK,0.016848,0.016848,6.512000,8914676.0,-2.932502,-2.932502,0.456757,4.110813,2.800003,0.213538,-0.427283,13278795,90.617501,91.777740,53.872340,0.893231,0.456757
2025-03-24,9901.HK,-0.031250,-0.031250,-1.117000,15877780.0,0.920000,0.920000,0.211864,1.812617,3.900002,0.173413,3.489181,19056392,38.120000,39.011644,42.638680,0.913294,0.211864
2025-03-24,9961.HK,0.035742,0.035742,-14.091614,779344.0,-27.012465,-27.012465,0.817156,20.539906,18.799988,0.199746,-0.530138,2491259,483.487535,497.756730,53.140591,0.900127,0.817156
2025-03-24,9988.HK,-0.012639,-0.012639,22.201001,39744597.0,1.639997,1.639997,0.321244,6.116542,3.700012,0.132679,-0.493060,90778453,134.440000,131.956694,54.166481,0.933661,0.321244


# Start

In [1]:
hk_tickers = ["0001.HK", "0002.HK", "0003.HK", "0005.HK", "0006.HK", "0011.HK", '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0316.HK', 
                  "0322.HK", '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK', '0881.HK', '0883.HK', '0939.HK', '0941.HK', '0960.HK', '0968.HK', '0981.HK', 
                  "0992.HK", "1024.HK", '1038.HK', '1044.HK', '1088.HK', "1093.HK", '1099.HK', '1109.HK', '1113.HK', '1177.HK', '1209.HK', '1211.HK', '1299.HK', '1378.HK', '1398.HK', '1810.HK', '1876.HK', '1928.HK',
                  '1929.HK', '1997.HK', '2015.HK', '2020.HK', '2269.HK', '2313.HK', '2318.HK', '2319.HK', '2331.HK', '2359.HK', '2382.HK', '2388.HK', '2628.HK', '2688.HK', '2899.HK', '3690.HK', '3692.HK', '3968.HK', 
                  '3988.HK', '6618.HK', '6690.HK', '6862.HK', '9618.HK', '9633.HK', '9888.HK', '9901.HK', '9961.HK', '9988.HK', '9999.HK']
len(hk_tickers)

83

# Alpha factor calculation and importing relevant library for later use

In [1]:
# HK_final_comprehensive_result_1.csv
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta

# --- Helper Functions (keep as is) ---
def calculate_rsi(close_series, window=14):
    delta = close_series.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.ewm(com=window - 1, min_periods=window, adjust=False).mean()
    avg_loss = loss.ewm(com=window - 1, min_periods=window, adjust=False).mean()
    rs = avg_gain / avg_loss.replace(0, np.nan)
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_atr(high_series, low_series, close_series, window=14):
    prev_close = close_series.shift(1)
    tr1 = high_series - low_series
    tr2 = abs(high_series - prev_close)
    tr3 = abs(low_series - prev_close)
    true_range = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    atr = true_range.ewm(alpha=1/window, min_periods=window, adjust=False).mean()
    return atr


# --- Main Function to Calculate All Alpha Factors (REVISED FOR COLUMN INDEXING) ---
def calculate_all_factors_for_ticker(ticker, start_date, end_date_inclusive):
    """
    Downloads data and calculates all specified alpha factors for a single ticker.
    Handles potential MultiIndex columns from yfinance.
    """
    print(f"--- Processing ticker: {ticker} ---")
    #buffer_days = 75
    #start_buffer = (pd.to_datetime(start_date) - timedelta(days=buffer_days)).strftime('%Y-%m-%d')
    
    # Calculate the end_date for yfinance download (exclusive)
    yf_end_date = end_date_inclusive + timedelta(days=1)

    try:
        # Download data
        df_raw = yf.download(ticker, start=start_date, end=yf_end_date, progress=False)
        if df_raw.empty:
            print(f"No data downloaded for {ticker}. Skipping.")
            return None

        # --- DEBUG: Inspect Column Structure ---
        print(f"    Raw columns structure for {ticker}: {df_raw.columns}")
        # --- END DEBUG ---

        # --- Handle Potential MultiIndex Columns ---
        df = df_raw.copy() # Work on a copy
        if isinstance(df.columns, pd.MultiIndex):
            print(f"    Detected MultiIndex columns for {ticker}. Attempting to simplify.")
            # Flatten MultiIndex by taking the first level (usually OHLCV)
            # Adjust this logic if the structure is different (e.g., ('High', ''))
            try:
                df.columns = df.columns.get_level_values(0)
                 # Check for duplicate columns after flattening (e.g., if multiple levels existed)
                if df.columns.has_duplicates:
                    print(f"    Warning: Duplicate columns found after flattening MultiIndex for {ticker}. Keeping first occurrence.")
                    df = df.loc[:, ~df.columns.duplicated(keep='first')] # Keep first instance
                print(f"    Simplified columns: {df.columns}")
            except Exception as mi_err:
                print(f"    Error simplifying MultiIndex columns for {ticker}: {mi_err}. Skipping.")
                return None

        # --- Ensure standard columns exist after potential flattening ---
        required_cols = ['High', 'Low', 'Close', 'Volume']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"    Missing required columns after processing for {ticker}: {missing_cols}. Available: {df.columns}. Skipping.")
            return None

        # --- Now access columns, they should be Series ---
        # (The check from the previous version is less critical now, but can be kept as safety)
        for col in required_cols:
            if not isinstance(df[col], pd.Series):
                # This really shouldn't happen after the MultiIndex handling
                print(f"!!! CRITICAL WARNING: Column '{col}' is STILL not a Series for {ticker} after processing. Type: {type(df[col])} !!!")
                return None # Skip if data structure is fundamentally wrong

    except Exception as e:
        print(f"Error during data download or column processing for {ticker}: {e}")
        return None


    # --- 1. Calculate Base Indicators (using the processed `df`) ---
    # (Calculations remain the same, but now using the df with simplified columns)
    base_indicators = {}
    try:
        base_indicators['Close_Shift_10'] = df['Close'].shift(10)
        base_indicators['Volume_Shift_10'] = df['Volume'].shift(10)
        base_indicators['Volume_Shift_50'] = df['Volume'].shift(50)
        base_indicators['SMA_10'] = df['Close'].rolling(window=10).mean()
        base_indicators['SMA_20'] = df['Close'].rolling(window=20).mean()
        base_indicators['SMA_50'] = df['Close'].rolling(window=50).mean()
        base_indicators['EMA_20'] = df['Close'].ewm(span=20, adjust=False).mean()
        base_indicators['STD_20'] = df['Close'].rolling(window=20).std()
        base_indicators['MIN_14_LOW'] = df['Low'].rolling(window=14).min()
        base_indicators['MAX_14_HIGH'] = df['High'].rolling(window=14).max()
        base_indicators['RSI_14'] = calculate_rsi(df['Close'], window=14)
        base_indicators['ATR_14'] = calculate_atr(df['High'], df['Low'], df['Close'], window=14)
    except Exception as base_calc_err:
         print(f"   Error calculating base indicators for {ticker}: {base_calc_err}. Skipping.")
         return None

    # --- Combine original df and base indicators for easier reference ---
    # Use only the necessary columns from the processed df
    calc_df = pd.concat([df[['High', 'Low', 'Close', 'Volume']], pd.DataFrame(base_indicators)], axis=1)


    # --- 2. Calculate Alpha Factors and Validate Type (keep calculate_and_validate helper) ---
    factor_cols = {}

    def calculate_and_validate(factor_name, calculation_lambda):
        """Calculates a factor and ensures it's a Series."""
        try:
            result = calculation_lambda()
            if isinstance(result, pd.Series):
                factor_cols[factor_name] = result
                return True
            else:
                print(f"  *** Type Issue Calculating {factor_name} for {ticker}! Expected Series, got {type(result)} ***")
                return False
        except Exception as e:
            print(f"  *** Error Calculating {factor_name} for {ticker}: {type(e).__name__}: {e} ***")
            return False

    # (Calculations using calculate_and_validate remain the same as previous version)
    calculate_and_validate('Price_Momentum_10D', lambda: (calc_df['Close'] - calc_df['Close_Shift_10']) / calc_df['Close_Shift_10'].replace(0, np.nan))
    if 'Price_Momentum_10D' in factor_cols: calculate_and_validate('Rate_of_Change_10D', lambda: factor_cols['Price_Momentum_10D'])
    calculate_and_validate('MA_Crossover_10_50', lambda: calc_df['SMA_10'] - calc_df['SMA_50'])
    calculate_and_validate('Volume_Momentum_50D', lambda: calc_df['Volume'] - calc_df['Volume_Shift_50'])
    calculate_and_validate('Mean_Reversion_20D', lambda: calc_df['SMA_20'] - calc_df['Close'])
    if 'Mean_Reversion_20D' in factor_cols: calculate_and_validate('Moving_Avg_Reversion_20D', lambda: factor_cols['Mean_Reversion_20D'])
    calculate_and_validate('Stochastic_Oscillator_K_14D', lambda: ((calc_df['Close'] - calc_df['MIN_14_LOW']) / (calc_df['MAX_14_HIGH'] - calc_df['MIN_14_LOW']).replace(0, np.nan)) * 100)
    calculate_and_validate('ATR_14D', lambda: calc_df['ATR_14'])
    calculate_and_validate('Daily_High_Low_Range', lambda: calc_df['High'] - calc_df['Low'])
    calculate_and_validate('Norm_Bollinger_Width_20D', lambda: (4 * calc_df['STD_20']) / calc_df['SMA_20'].replace(0, np.nan))
    calculate_and_validate('Volume_ROC_10D', lambda: (calc_df['Volume'] / calc_df['Volume_Shift_10'].replace(0, np.nan)) - 1)
    calculate_and_validate('Trading_Volume', lambda: calc_df['Volume'])
    calculate_and_validate('Moving_Average_20D', lambda: calc_df['SMA_20'])
    calculate_and_validate('Exponential_MA_20D', lambda: calc_df['EMA_20'])
    calculate_and_validate('RSI_14D', lambda: calc_df['RSI_14'])
    calculate_and_validate('Bollinger_Ratio_LB_20D', lambda: (calc_df['SMA_20'] - 2 * calc_df['STD_20']) / calc_df['SMA_20'].replace(0, np.nan))
    if 'Stochastic_Oscillator_K_14D' in factor_cols: calculate_and_validate('Stochastic_Oscillator_D_14D', lambda: factor_cols['Stochastic_Oscillator_K_14D'])


    # --- 3. Combine *Validated* Factors into DataFrame ---
    # (This section remains the same as previous version)
    if not factor_cols:
        print(f"No factors could be successfully calculated as Series for {ticker}. Skipping DataFrame creation.")
        return None
    try:
        factors_df = pd.DataFrame(factor_cols)
        factors_df['stock_id'] = ticker
        factors_df = factors_df.loc[start_date:]
    except ValueError as ve:
         print(f"!!! ERROR creating final DataFrame for {ticker} even after validation: {ve} !!!")
         return None
    except Exception as final_e:
         print(f"!!! UNEXPECTED ERROR creating final DataFrame for {ticker}: {final_e} !!!")
         return None

    return factors_df

# --- Main Execution (keep as is) ---
if __name__ == "__main__":
    # Define tickers and time period
    hk_tickers = ["0001.HK", "0002.HK", "0003.HK", "0005.HK", "0006.HK", "0011.HK", '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0316.HK', 
                  "0322.HK", '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK', '0881.HK', '0883.HK', '0939.HK', '0941.HK', '0960.HK', '0968.HK', '0981.HK', 
                  "0992.HK", "1024.HK", '1038.HK', '1044.HK', '1088.HK', "1093.HK", '1099.HK', '1109.HK', '1113.HK', '1177.HK', '1209.HK', '1211.HK', '1299.HK', '1378.HK', '1398.HK', '1810.HK', '1876.HK', '1928.HK',
                  '1929.HK', '1997.HK', '2015.HK', '2020.HK', '2269.HK', '2313.HK', '2318.HK', '2319.HK', '2331.HK', '2359.HK', '2382.HK', '2388.HK', '2628.HK', '2688.HK', '2899.HK', '3690.HK', '3692.HK', '3968.HK', 
                  '3988.HK', '6618.HK', '6690.HK', '6862.HK', '9618.HK', '9633.HK', '9888.HK', '9901.HK', '9961.HK', '9988.HK', '9999.HK']

    end_date = datetime.strptime("2025-03-24",'%Y-%m-%d')
    start_date = datetime.strptime("2020-03-24",'%Y-%m-%d')

    all_results = []

    for ticker_id in hk_tickers:
        ticker_factors = calculate_all_factors_for_ticker(ticker_id, start_date, end_date)
        if ticker_factors is not None:
            all_results.append(ticker_factors)

    if all_results:
        final_df = pd.concat(all_results)
        final_df = final_df.reset_index()

        # Make sure 'Date' column is correct type after reset_index
        if 'Date' not in final_df.columns and 'index' in final_df.columns:
             final_df = final_df.rename(columns={'index': 'Date'})
        final_df['Date'] = pd.to_datetime(final_df['Date'])


        cols = ['Date', 'stock_id'] + [col for col in final_df.columns if col not in ['Date', 'stock_id']]
        final_df = final_df[cols]

        print("\n--- Final Combined Alpha Factors DataFrame ---")
        pd.set_option('display.max_rows', 100)
        pd.set_option('display.max_columns', 20)
        pd.set_option('display.width', 120)
        print(final_df.head())
        print("\n...")
        print(final_df.tail())
        print(f"\nDataFrame Shape: {final_df.shape}")

        try:
            output_csv = "manual_alpha_factors_5years.csv"
            final_df.to_csv(output_csv, index=False)
            print(f"\nFull results saved to {output_csv}")
        except Exception as e:
            print(f"\nError saving results to CSV: {e}")

    else:
        print("\nNo results were generated for any ticker.")


--- Processing ticker: 0001.HK ---
YF.download() has changed argument auto_adjust default to True
    Raw columns structure for 0001.HK: MultiIndex([( 'Close', '0001.HK'),
            (  'High', '0001.HK'),
            (   'Low', '0001.HK'),
            (  'Open', '0001.HK'),
            ('Volume', '0001.HK')],
           names=['Price', 'Ticker'])
    Detected MultiIndex columns for 0001.HK. Attempting to simplify.
    Simplified columns: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
--- Processing ticker: 0002.HK ---
    Raw columns structure for 0002.HK: MultiIndex([( 'Close', '0002.HK'),
            (  'High', '0002.HK'),
            (   'Low', '0002.HK'),
            (  'Open', '0002.HK'),
            ('Volume', '0002.HK')],
           names=['Price', 'Ticker'])
    Detected MultiIndex columns for 0002.HK. Attempting to simplify.
    Simplified columns: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
--- Processing 

In [2]:
final_df.to_csv("alpha_data_values.csv")

In [3]:
!pip install pandas numpy scipy matplotlib statsmodels yfinance tqdm



In [4]:
!pip install pandas_market_calendars



In [5]:
pip install --upgrade ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install openpyxl



# REAL

In [7]:
alpha_factors_df = pd.read_csv('alpha_data_values.csv')
alpha_factor_names = alpha_factors_df.drop(columns=['Date', 'stock_id'], errors='ignore').columns[1:].to_list()
alpha_factor_names

['Price_Momentum_10D',
 'Rate_of_Change_10D',
 'MA_Crossover_10_50',
 'Volume_Momentum_50D',
 'Mean_Reversion_20D',
 'Moving_Avg_Reversion_20D',
 'Stochastic_Oscillator_K_14D',
 'ATR_14D',
 'Daily_High_Low_Range',
 'Norm_Bollinger_Width_20D',
 'Volume_ROC_10D',
 'Trading_Volume',
 'Moving_Average_20D',
 'Exponential_MA_20D',
 'RSI_14D',
 'Bollinger_Ratio_LB_20D',
 'Stochastic_Oscillator_D_14D']

In [9]:
# Change it to MultiIndex
alpha_factors_df = pd.read_csv('alpha_data_values.csv')
alpha_factor_names = alpha_factors_df.drop(columns=['Date', 'stock_id'], errors='ignore').columns[1:].to_list()
alpha_factors_df = alpha_factors_df.rename(columns={'Date': 'date', 'stock_id': 'asset'})
alpha_factors = pd.DataFrame()

alpha_data = {}
for factor_name in alpha_factor_names:
    """
    pivot_df = alpha_factors_df.pivot_table(index='date', columns='asset', values=factor_name)
    pivot_df = pivot_df.reindex(dates)  # Reindex to ensure all dates are present
    
    # Create the MultiIndex and stack the pivoted DataFrame
    multi_index = pd.MultiIndex.from_product([pivot_df.index, pivot_df.columns], names=['date', 'asset'])
    stacked_series = pivot_df.stack().reindex(multi_index)
    
    alpha_factors[factor_name] = stacked_series"
    """
    pivot_df = alpha_factors_df.pivot_table(index='date', columns='asset', values=factor_name)

    #pivot_df = pivot_df.reindex(dates)
    #print(pivot_df)
    #break

    multi_index = pd.MultiIndex.from_product([pivot_df.index, pivot_df.columns], names=['date', 'asset'])

    stacked_series = pivot_df.stack().reindex(multi_index)

    alpha_factors[factor_name] = stacked_series

# alpha_factors # Print the first few rows to verify the result.

alpha_factors.to_csv("processed_alpha_data_values.csv")

In [10]:
alpha_factors

Unnamed: 0_level_0,Unnamed: 1_level_0,Price_Momentum_10D,Rate_of_Change_10D,MA_Crossover_10_50,Volume_Momentum_50D,Mean_Reversion_20D,Moving_Avg_Reversion_20D,Stochastic_Oscillator_K_14D,ATR_14D,Daily_High_Low_Range,Norm_Bollinger_Width_20D,Volume_ROC_10D,Trading_Volume,Moving_Average_20D,Exponential_MA_20D,RSI_14D,Bollinger_Ratio_LB_20D,Stochastic_Oscillator_D_14D
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-04-07,0001.HK,0.219048,0.219048,,,,,,,1.341042,,-0.111247,13594770.0,,39.130322,,,
2020-04-07,0002.HK,0.181347,0.181347,,,,,,,1.708473,,-0.289064,6586685.0,,56.771402,,,
2020-04-07,0003.HK,0.122977,0.122977,,,,,,,0.235901,,-0.303770,33541644.0,,9.416128,,,
2020-04-07,0005.HK,-0.109756,-0.109756,,,,,,,0.861280,,1.074696,74051605.0,,33.489280,,,
2020-04-07,0006.HK,0.142857,0.142857,,,,,,,0.985104,,-0.170215,5105675.0,,33.616435,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-24,9888.HK,0.016848,0.016848,6.512000,8914676.0,-2.932502,-2.932502,45.675700,4.110813,2.800003,0.213538,-0.427283,13278795.0,90.617501,91.777740,53.872340,0.893231,45.675700
2025-03-24,9901.HK,-0.031250,-0.031250,-1.117000,15877780.0,0.920000,0.920000,21.186449,1.812617,3.900002,0.173413,3.489181,19056392.0,38.120000,39.011644,42.638680,0.913294,21.186449
2025-03-24,9961.HK,0.035742,0.035742,-14.091614,779344.0,-27.012465,-27.012465,81.715599,20.539906,18.799988,0.199746,-0.530138,2491259.0,483.487535,497.756730,53.140591,0.900127,81.715599
2025-03-24,9988.HK,-0.012639,-0.012639,22.201001,39744597.0,1.639997,1.639997,32.124384,6.116542,3.700012,0.132679,-0.493060,90778453.0,134.440000,131.956694,54.166481,0.933661,32.124384


In [11]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [12]:
# --- Imports ---
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
# import matplotlib.pyplot as plt # Keep commented unless plotting is explicitly re-enabled
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS # Correct import for RollingOLS
from statsmodels.tools.sm_exceptions import MissingDataError # Import specific error
from numpy.linalg import LinAlgError # Import specific error
import yfinance as yf
from datetime import timedelta
import traceback # For detailed error reporting
from tqdm import tqdm # Use standard tqdm
import warnings # To suppress specific warnings if needed
import os # For path handling
import math # For sqrt
import openpyxl # Explicitly import for ExcelWriter engine check
import time # Can be useful for adding delays

# --- Suppress Warnings ---
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=RuntimeWarning)
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning) # Suppress PerformanceWarning if desired
pd.options.mode.chained_assignment = None # Suppress SettingWithCopyWarning ('warn' or None)

# --- Configuration ---
_start_date_str = '2020-03-24'
_end_date_str = '2025-03-24'
target_timezone = 'UTC'

# Analysis Parameters
benchmark_ticker = "^HSI"
analysis_periods_str = ['1D_fwd_ret', '3D_fwd_ret', '5D_fwd_ret']
fwd_ret_periods_int = tuple(int(p.split('D')[0]) for p in analysis_periods_str)
num_quantiles = 5
ic_method = 'spearman' # 'spearman' or 'pearson'
neutralization_lookback = 60 # For style factors & neutralization lookback
MAX_DECAY_LAG = 20
# Lookbacks for style factors (keep generic longest lookback calculation)
longest_lookback_generic = neutralization_lookback # Adjust if other lookbacks needed for style/data fetch

# Output Configuration
output_dir = "factor_analysis_output_combined"
os.makedirs(output_dir, exist_ok=True)
# Define the SINGLE output Excel file name
combined_output_filename = os.path.join(output_dir, "combined_factor_analysis_results.xlsx")

# --- Date Handling ---
try:
    start_date_naive = pd.to_datetime(_start_date_str)
    end_date_naive = pd.to_datetime(_end_date_str)
    start_date = start_date_naive.tz_localize(target_timezone)
    end_date = end_date_naive.tz_localize(target_timezone)
    print(f"INFO: Start date localized to {target_timezone}: {start_date}")
    print(f"INFO: End date localized to {target_timezone}: {end_date}")
except Exception as e:
    print(f"ERROR: Could not localize start/end dates to timezone '{target_timezone}'. Error: {e}")
    raise ValueError("Failed to set timezone for start/end dates") from e

# --- Placeholder for Dynamic Universe ---
def get_index_constituents_historical(index_ticker, date_str):
    print(f"Attempting to get constituents for {index_ticker} on {date_str}.")
    #print("         This is a placeholder - yfinance lacks this feature.")
    #print("         Returning a default hardcoded list for now.")
    default_assets = ["0001.HK", "0002.HK", "0003.HK", "0005.HK", "0006.HK", "0011.HK", '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0316.HK', 
                  "0322.HK", '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK', '0881.HK', '0883.HK', '0939.HK', '0941.HK', '0960.HK', '0968.HK', '0981.HK', 
                  "0992.HK", "1024.HK", '1038.HK', '1044.HK', '1088.HK', "1093.HK", '1099.HK', '1109.HK', '1113.HK', '1177.HK', '1209.HK', '1211.HK', '1299.HK', '1378.HK', '1398.HK', '1810.HK', '1876.HK', '1928.HK',
                  '1929.HK', '1997.HK', '2015.HK', '2020.HK', '2269.HK', '2313.HK', '2318.HK', '2319.HK', '2331.HK', '2359.HK', '2382.HK', '2388.HK', '2628.HK', '2688.HK', '2899.HK', '3690.HK', '3692.HK', '3968.HK', 
                  '3988.HK', '6618.HK', '6690.HK', '6862.HK', '9618.HK', '9633.HK', '9888.HK', '9901.HK', '9961.HK', '9988.HK', '9999.HK']
    # Add more realistic tickers if possible for better testing
    # default_assets.extend(["1299.HK", "2318.HK", "0011.HK", "0003.HK", "0016.HK", "0012.HK"])
    if index_ticker == "^HSI": return default_assets
    else: return []

target_index = "^HSI"
print(f"\n--- Attempting to define universe based on index: {target_index} ---")
assets = get_index_constituents_historical(target_index, _start_date_str)
if not assets:
    print(f"ERROR: Could not determine assets for index {target_index}. Falling back to hardcoded list.")
    assets =  ["0001.HK", "0002.HK", "0003.HK", "0005.HK", "0006.HK", "0011.HK", '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0316.HK', 
                "0322.HK", '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK', '0881.HK', '0883.HK', '0939.HK', '0941.HK', '0960.HK', '0968.HK', '0981.HK', 
                "0992.HK", "1024.HK", '1038.HK', '1044.HK', '1088.HK', "1093.HK", '1099.HK', '1109.HK', '1113.HK', '1177.HK', '1209.HK', '1211.HK', '1299.HK', '1378.HK', '1398.HK', '1810.HK', '1876.HK', '1928.HK',
                '1929.HK', '1997.HK', '2015.HK', '2020.HK', '2269.HK', '2313.HK', '2318.HK', '2319.HK', '2331.HK', '2359.HK', '2382.HK', '2388.HK', '2628.HK', '2688.HK', '2899.HK', '3690.HK', '3692.HK', '3968.HK', 
                '3988.HK', '6618.HK', '6690.HK', '6862.HK', '9618.HK', '9633.HK', '9888.HK', '9901.HK', '9961.HK', '9988.HK', '9999.HK']

assets = sorted(list(set(assets))) # Ensure unique and sorted
print(f"INFO: Using asset universe (Count: {len(assets)}): {assets[:10]}...")

# --- Create Target Business Day Index ---
try:
    import pandas_market_calendars as mcal
    hk_calendar = mcal.get_calendar('XHKG')
    max_fwd_buffer_days = max(fwd_ret_periods_int) + 10 if fwd_ret_periods_int else 10
    # Extend fetch range slightly more for lookbacks and forward returns
    calendar_start_naive = start_date_naive - timedelta(days=longest_lookback_generic + 40) # Slightly longer buffer
    calendar_end_naive = end_date_naive + timedelta(days=max(max_fwd_buffer_days, MAX_DECAY_LAG + 15)) # Slightly longer buffer
    schedule = hk_calendar.schedule(start_date=calendar_start_naive, end_date=calendar_end_naive)
    fetch_dates_index_raw = pd.to_datetime(schedule.index).tz_localize(schedule.index.tz)
    if fetch_dates_index_raw.tz is None: fetch_dates_index_raw = pd.to_datetime(schedule.index).tz_localize('UTC', ambiguous='infer', nonexistent='shift_forward')
    fetch_dates_index = fetch_dates_index_raw.tz_convert(target_timezone).drop_duplicates().sort_values() # Ensure unique & sorted early
    dates_index = fetch_dates_index[(fetch_dates_index >= start_date) & (fetch_dates_index <= end_date)].drop_duplicates().sort_values()
    print(f"Using pandas_market_calendars for HK business days. Full fetch range index length: {len(fetch_dates_index)}")
except ImportError:
    print("WARNING: pandas_market_calendars not found. Using pd.date_range(freq='B'). This might include holidays.")
    max_fwd_period = max(fwd_ret_periods_int) if fwd_ret_periods_int else 0
    fetch_start_dt_b = start_date - pd.Timedelta(days=longest_lookback_generic + 40) # Adjusted buffer
    fetch_end_dt_b = end_date + pd.Timedelta(days=max(max_fwd_period, MAX_DECAY_LAG) + 15) # Adjusted buffer
    fetch_dates_index = pd.date_range(start=fetch_start_dt_b, end=fetch_end_dt_b, freq='B', tz=target_timezone).drop_duplicates().sort_values()
    dates_index = fetch_dates_index[(fetch_dates_index >= start_date) & (fetch_dates_index <= end_date)].drop_duplicates().sort_values()

if dates_index.empty:
    raise ValueError(f"ERROR: Target dates_index is empty after filtering between {start_date} and {end_date}.")
print(f"Target Analysis Date Range: {dates_index.min()} to {dates_index.max()} ({len(dates_index)} analysis days)")

# --- Data Fetching (Prices and Total Volume) ---
print("\n--- Downloading Price and Total Volume Data ---")
prices_lookback = pd.DataFrame()
volumes_lookback = pd.DataFrame()
benchmark_prices_lookback = pd.Series(dtype=float)
try:
    fetch_start_str = fetch_dates_index.min().strftime('%Y-%m-%d')
    fetch_end_str = (fetch_dates_index.max() + pd.Timedelta(days=1)).strftime('%Y-%m-%d') # Add 1 day for yf end date convention
    print(f"Fetching data from {fetch_start_str} to {fetch_end_str} for {len(assets)} assets + benchmark {benchmark_ticker}...")

    # Fetch asset data
    data_assets = yf.download(assets, start=fetch_start_str, end=fetch_end_str, progress=True, timeout=180, group_by='ticker')

    # Fetch benchmark data
    data_benchmark = yf.download(benchmark_ticker, start=fetch_start_str, end=fetch_end_str, progress=False)

    # Process Asset Data
    prices_list = []
    volumes_list = []
    valid_assets = [] # Keep track of assets with successfully downloaded data
    if not data_assets.empty:
        # Check if data_assets index needs converting (can happen with yfinance sometimes)
        if not isinstance(data_assets.index, pd.DatetimeIndex):
             try: data_assets.index = pd.to_datetime(data_assets.index)
             except: print("WARN: Could not convert downloaded asset data index to DatetimeIndex.")

        for asset in assets:
            try:
                # Access asset data robustly
                if isinstance(data_assets.columns, pd.MultiIndex):
                    if asset in data_assets.columns.get_level_values(0):
                       asset_data = data_assets[asset]
                    else:
                       print(f"WARN: No data returned for {asset} in multi-index result.")
                       continue
                elif len(assets) == 1 and asset == assets[0]: # Handle case where only one asset was requested (no multi-index)
                     asset_data = data_assets
                else: # Should not happen if group_by='ticker' worked for multiple assets
                     print(f"WARN: Unexpected data structure for {asset}. Skipping.")
                     continue

                # Select price and volume
                adj_close_key = 'Adj Close' if 'Adj Close' in asset_data.columns else 'Close'
                if adj_close_key not in asset_data.columns or 'Volume' not in asset_data.columns:
                     print(f"WARN: Missing '{adj_close_key}' or 'Volume' for {asset}. Skipping.")
                     continue
                price_col = asset_data[adj_close_key]
                volume_col = asset_data['Volume']

                # Check for sufficient non-NaN data
                if not price_col.dropna().empty: # Check if not ALL NaN
                    prices_list.append(price_col.rename(asset))
                    volumes_list.append(volume_col.rename(asset))
                    valid_assets.append(asset)
                else:
                    print(f"WARN: Price data for {asset} is all NaN.")

            except KeyError:
                print(f"WARN: KeyError accessing data for {asset}. Ticker might be invalid or delisted for the period.")
            except Exception as e_asset:
                print(f"WARN: Could not process data for {asset}. Error: {e_asset}")

    if prices_list:
        prices_raw = pd.concat(prices_list, axis=1)
        volumes_raw = pd.concat(volumes_list, axis=1)

        # Convert index to datetime and localize if needed
        if not isinstance(prices_raw.index, pd.DatetimeIndex): prices_raw.index = pd.to_datetime(prices_raw.index)
        if prices_raw.index.tz is None: prices_raw.index = prices_raw.index.tz_localize('UTC', ambiguous='infer', nonexistent='shift_forward')
        prices_raw = prices_raw.tz_convert(target_timezone)

        if not isinstance(volumes_raw.index, pd.DatetimeIndex): volumes_raw.index = pd.to_datetime(volumes_raw.index)
        if volumes_raw.index.tz is None: volumes_raw.index = volumes_raw.index.tz_localize('UTC', ambiguous='infer', nonexistent='shift_forward')
        volumes_raw = volumes_raw.tz_convert(target_timezone)

        # Reindex to our full business day index and forward fill prices, fillna(0) volumes
        prices_lookback = prices_raw.reindex(fetch_dates_index).ffill()
        volumes_lookback = volumes_raw.reindex(fetch_dates_index).fillna(0)
        print(f"Asset price/volume data processed. Shape: {prices_lookback.shape}")

        # --- Crucial: Update asset list to only include those successfully downloaded ---
        original_asset_count = len(assets)
        assets = sorted(valid_assets) # Update the global assets list
        if len(assets) < original_asset_count:
            print(f"INFO: Asset list updated to {len(assets)} tickers with valid data.")
        if not assets: # Check if asset list became empty
             print("CRITICAL ERROR: No assets remaining after data download/validation. Exiting.")
             exit()
        # -------------------------------------------------------------------------------

    else:
        print("ERROR: No valid asset price data could be fetched or processed. Exiting.")
        exit() # Exit if no asset data

    # Process Benchmark Data
    if not data_benchmark.empty:
        if not isinstance(data_benchmark.index, pd.DatetimeIndex): data_benchmark.index = pd.to_datetime(data_benchmark.index)
        adj_close_key_bm = 'Adj Close' if 'Adj Close' in data_benchmark.columns else 'Close'
        if adj_close_key_bm not in data_benchmark.columns:
             print(f"ERROR: Benchmark price column ('{adj_close_key_bm}') not found.")
             # Create empty series as fallback
             benchmark_prices_lookback = pd.Series(dtype=float, index=fetch_dates_index, name=benchmark_ticker)
        else:
            benchmark_prices_raw = data_benchmark[adj_close_key_bm]
            if benchmark_prices_raw.index.tz is None: benchmark_prices_raw.index = benchmark_prices_raw.index.tz_localize('UTC', ambiguous='infer', nonexistent='shift_forward')
            benchmark_prices_raw = benchmark_prices_raw.tz_convert(target_timezone)
            benchmark_prices_lookback = benchmark_prices_raw.reindex(fetch_dates_index).ffill()
            print(f"Benchmark data processed. Length: {len(benchmark_prices_lookback)}")
    else:
        print("ERROR: Benchmark data could not be fetched.")
        # Create empty series if benchmark fetch failed
        benchmark_prices_lookback = pd.Series(dtype=float, index=fetch_dates_index, name=benchmark_ticker)

except Exception as e:
    print(f"\nERROR during data download: {e}"); traceback.print_exc()
    print("CRITICAL ERROR: Data download failed. Exiting.")
    exit()


# Create dataframes for the analysis period by slicing lookback data
# Ensure slicing uses the potentially updated 'assets' list
prices = prices_lookback.loc[dates_index, assets].copy()
volumes = volumes_lookback.loc[dates_index, assets].copy()
benchmark_prices = benchmark_prices_lookback.loc[dates_index].copy()

# --- Robust check for empty or all-NaN core data ---
prices_all_nan = False
if not prices.empty:
    prices_all_nan = prices.isna().all().all() # Check if ALL values are NaN

benchmark_all_nan = False
if not benchmark_prices.empty and isinstance(benchmark_prices, pd.Series): # Ensure it's a Series
    benchmark_all_nan = benchmark_prices.isna().all() # Check if ALL values are NaN

if prices.empty or benchmark_prices.empty or prices_all_nan or benchmark_all_nan:
     print("CRITICAL ERROR: Prices or Benchmark data is invalid (empty or all NaN) for the analysis period. Exiting.")
     exit()
# --- End of robust check ---


# --- Fetch Industry Data ---
print("\n--- Fetching Industry Classification Data ---")
def fetch_industry_data(tickers):
    industry_dict = {}
    missing_industries = []
    for ticker_str in tqdm(tickers, desc="Fetching Industries"):
        try:
            ticker_obj = yf.Ticker(ticker_str)
            # info_data = ticker_obj.fast_info # Potentially faster, fewer fields
            info_data = ticker_obj.info # Slower but more comprehensive
            industry = info_data.get('industry', 'Unknown')
            sector = info_data.get('sector', 'Unknown') # Get sector too
            # Prefer industry, fall back to sector, then Unknown
            if industry in [None, '', 'N/A', 'Unknown']:
                 industry = sector if sector not in [None, '', 'N/A', 'Unknown'] else 'Unknown'
            final_industry = industry if industry is not None else 'Unknown' # Ensure value is not None

            if final_industry == 'Unknown': missing_industries.append(ticker_str)
            industry_dict[ticker_str] = final_industry
            time.sleep(0.05) # Small delay to avoid potential rate limiting
        except Exception as e_ind:
            print(f"WARN: Error fetching industry for {ticker_str}: {e_ind}") # Show specific error
            industry_dict[ticker_str] = 'Unknown'
            missing_industries.append(ticker_str)

    if missing_industries: print(f"WARNING: Could not reliably fetch industry/sector for: {list(set(missing_industries))}")
    return pd.Series(industry_dict, name='industry')

if assets: # Only fetch if we have assets
    asset_industries = fetch_industry_data(assets)
    industry_dummies_static = pd.DataFrame()
    if not asset_industries.empty:
        # Create dummies, ensuring they align with the final 'assets' list
        industry_dummies_static = pd.get_dummies(asset_industries.reindex(assets).fillna('Unknown'), dummy_na=False, prefix='Ind').astype(int)
        # Drop 'Ind_Unknown' if other industries exist and it's all zero, or if only Unknown exists keep it.
        if 'Ind_Unknown' in industry_dummies_static.columns and len(industry_dummies_static.columns) > 1:
             if not industry_dummies_static['Ind_Unknown'].any():
                  industry_dummies_static = industry_dummies_static.drop('Ind_Unknown', axis=1)
        industry_dummies_static.index.name = 'asset'
        print(f"Created Static Industry Dummies shape: {industry_dummies_static.shape}")
        if industry_dummies_static.empty:
             print("WARN: Industry dummies became empty after processing (e.g., only 'Unknown' dropped). Creating default.")
             industry_dummies_static = pd.DataFrame({'Ind_NoIndustry': 1}, index=assets).astype(int)
    else:
        print("WARNING: Could not create industry dummies (fetch returned empty). Creating default.")
        industry_dummies_static = pd.DataFrame({'Ind_NoIndustry': 1}, index=assets).astype(int)
        industry_dummies_static.index.name = 'asset'
else:
    print("WARNING: No assets defined, skipping industry fetch.")
    asset_industries = pd.Series(dtype=str, name='industry')
    industry_dummies_static = pd.DataFrame(index=pd.Index([], name='asset')) # Ensure empty df has index


# ================================================================
# === LOAD OR DEFINE YOUR PRE-CALCULATED FACTORS HERE ===
# ================================================================
print("\n--- Loading/Defining Pre-calculated Factors ---")

# --- INPUT REQUIRED ---
# Option 1: Load from file (RECOMMENDED)
LOAD_FROM_FILE = True # SET TO TRUE TO LOAD FROM FILE
factors_file_path = "processed_alpha_data_values.csv" # OR .csv, .pkl etc.
# Expected format: See FORMAT 1 or FORMAT 2 descriptions below.

# Option 2: Define programmatically (like the dummy example)
CREATE_DUMMY_FACTORS = True # Set to False if loading from file

factors_input_df = pd.DataFrame() # Initialize

if LOAD_FROM_FILE:
    print(f"Attempting to load factors from: {factors_file_path}")
    if not os.path.exists(factors_file_path):
         print(f"ERROR: Factors file not found at {factors_file_path}")
    else:
        try:
            # Example loading parquet (adjust based on your file type)
            if factors_file_path.endswith(".parquet"):
                factors_input_df = pd.read_parquet(factors_file_path)
            elif factors_file_path.endswith(".csv"):
                # Adjust read_csv parameters as needed (e.g., index_col, parse_dates)
                # Assuming format 2 (stacked) for CSV example:
                factors_input_df = pd.read_csv(factors_file_path, index_col=[0, 1], parse_dates=[0])
                # Set index names if not read automatically
                if factors_input_df.index.names != ['date', 'asset']:
                     print("WARN: Setting loaded CSV index names to ['date', 'asset'].")
                     factors_input_df.index.names = ['date', 'asset']
            elif factors_file_path.endswith(".pkl"):
                factors_input_df = pd.read_pickle(factors_file_path)
            else:
                print(f"ERROR: Unsupported file format: {factors_file_path}")

            if not factors_input_df.empty:
                print(f"Successfully loaded factors from file. Initial shape: {factors_input_df.shape}")

                # --- Post-load processing based on format ---
                # Check if loaded data is Format 2 (stacked: Index=(date, asset), Columns=FactorNames)
                if isinstance(factors_input_df.index, pd.MultiIndex) and list(factors_input_df.index.names) == ['date', 'asset']:
                    print("INFO: Loaded data appears to be in Format 2 (Stacked). Unstacking...")
                    try:
                        factors_input_df_wide = factors_input_df.unstack(level='asset')
                        factors_input_df_wide.columns = pd.MultiIndex.from_tuples(
                            [(col_name, asset_name) for col_name, asset_name in factors_input_df_wide.columns],
                            names=['factor_name', 'asset']
                        )
                        factors_input_df = factors_input_df_wide # Overwrite with Format 1
                        print("Successfully unstacked factors to Format 1 (Wide).")
                    except Exception as e_unstack_load:
                        print(f"ERROR: Could not unstack the loaded factor DataFrame: {e_unstack_load}")
                        factors_input_df = pd.DataFrame() # Invalidate on error
                # Assume loaded data is already Format 1 (wide: Index=date, Columns=(factor_name, asset))
                elif isinstance(factors_input_df.index, pd.DatetimeIndex) and isinstance(factors_input_df.columns, pd.MultiIndex):
                    print("INFO: Loaded data appears to be in Format 1 (Wide).")
                    # Ensure column level names are correct
                    if list(factors_input_df.columns.names) != ['factor_name', 'asset']:
                        print("WARN: Renaming columns to ['factor_name', 'asset']. Please verify.")
                        factors_input_df.columns.names = ['factor_name', 'asset']
                else:
                    print("ERROR: Loaded DataFrame format is not recognized as Format 1 or Format 2.")
                    factors_input_df = pd.DataFrame() # Invalidate

        except Exception as e_load:
            print(f"ERROR: Failed to load or process factors file: {e_load}")
            traceback.print_exc()
            factors_input_df = pd.DataFrame()

elif CREATE_DUMMY_FACTORS:
    # Create dummy stacked data for demonstration:
    print("INFO: Creating dummy factor data for demonstration...")
    if not dates_index.empty and assets:
        multi_idx = pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset'])
        dummy_data = {
            'Factor_Dummy_1': np.random.randn(len(multi_idx)),
            'Factor_Dummy_2': np.random.rand(len(multi_idx)) - 0.5
        }
        factors_input_df_stacked = pd.DataFrame(dummy_data, index=multi_idx)
        print(f"Dummy stacked factors created. Shape: {factors_input_df_stacked.shape}")

        # --- Convert FORMAT 2 (Stacked) to FORMAT 1 (Wide - Preferred by the script) ---
        try:
            factors_input_df = factors_input_df_stacked.unstack(level='asset')
            factors_input_df.columns = pd.MultiIndex.from_tuples(
                [(col_name, asset_name) for col_name, asset_name in factors_input_df.columns],
                names=['factor_name', 'asset']
            )
            print("Successfully unstacked dummy factors to Format 1 (Wide).")
        except Exception as e_unstack_dummy:
            print(f"ERROR: Could not unstack the dummy factor DataFrame: {e_unstack_dummy}")
            factors_input_df = pd.DataFrame() # Assign empty df on error
    else:
        print("ERROR: Cannot create dummy factors - dates_index or assets are empty.")
        factors_input_df = pd.DataFrame()
else:
    print("INFO: No factor loading or creation specified.")


# --- Validation and Final Preparation ---
precalculated_factors_df = pd.DataFrame() # Initialize final df

if not factors_input_df.empty:
    # Ensure index is DatetimeIndex and has correct timezone
    if not isinstance(factors_input_df.index, pd.DatetimeIndex):
        try:
            factors_input_df.index = pd.to_datetime(factors_input_df.index)
        except Exception as e_conv:
            print(f"ERROR: Could not convert factor index to DatetimeIndex: {e_conv}. Invalidating factors.")
            factors_input_df = pd.DataFrame()

    if not factors_input_df.empty: # Check again after potential invalidation
        if factors_input_df.index.tz is None:
            try:
                print("INFO: Localizing factor index timezone...")
                factors_input_df.index = factors_input_df.index.tz_localize(target_timezone, ambiguous='infer', nonexistent='shift_forward')
            except TypeError: # Already localized
                 pass
            except Exception as e_tz:
                print(f"ERROR: Could not localize factor index timezone: {e_tz}. Invalidating factors.")
                factors_input_df = pd.DataFrame()

        if not factors_input_df.empty and factors_input_df.index.tz != target_timezone:
            try:
                print("INFO: Converting factor index timezone...")
                factors_input_df.index = factors_input_df.index.tz_convert(target_timezone)
            except Exception as e_tz_conv:
                print(f"ERROR: Could not convert factor index timezone: {e_tz_conv}. Invalidating factors.")
                factors_input_df = pd.DataFrame()

    # Ensure columns are MultiIndex ['factor_name', 'asset']
    if not factors_input_df.empty:
        if isinstance(factors_input_df.columns, pd.MultiIndex) and list(factors_input_df.columns.names) == ['factor_name', 'asset']:
             # Reindex to ensure all factors/assets/dates are present
             factor_names_present = factors_input_df.columns.get_level_values('factor_name').unique()
             # Ensure assets used for reindexing are the ones we have price data for
             target_multi_columns = pd.MultiIndex.from_product([factor_names_present, assets], names=['factor_name', 'asset'])

             print(f"Reindexing loaded factors to match analysis dates ({len(dates_index)}) and assets ({len(assets)})...")
             # Reindex BOTH index and columns to match the analysis scope
             precalculated_factors_df = factors_input_df.reindex(index=dates_index, columns=target_multi_columns)
             # Check for excessive NaNs after reindexing
             nan_frac = precalculated_factors_df.isna().mean().mean() if not precalculated_factors_df.empty else 1.0
             if precalculated_factors_df.isna().all().all():
                  print("CRITICAL WARN: Factor DataFrame is ALL NaNs after reindexing. Check date/asset alignment. Analysis will likely fail.")
             elif nan_frac > 0.9: # Example threshold
                  print(f"WARN: Factor DataFrame has >90% NaNs ({nan_frac:.1%}) after reindexing.")

             print(f"Final precalculated factors DataFrame ready. Shape: {precalculated_factors_df.shape}")
             available_factors = precalculated_factors_df.columns.get_level_values('factor_name').unique().tolist()
             print(f"Available factors: {available_factors}")
             if not available_factors:
                  print("ERROR: No factor names found after processing. Invalidating.")
                  precalculated_factors_df = pd.DataFrame()

        else:
             print("ERROR: Processed factor DataFrame columns are not MultiIndex named ['factor_name', 'asset']. Invalidating factors.")
             precalculated_factors_df = pd.DataFrame()

else:
    print("ERROR: No factor input data loaded or created.")
    precalculated_factors_df = pd.DataFrame()

# Final check before analysis loop
if precalculated_factors_df.empty:
     print("\nCRITICAL ERROR: The precalculated_factors_df is empty or invalid after loading/processing. Cannot proceed with analysis.")
     exit()
# ================================================================
# === END OF FACTOR LOADING SECTION ===
# ================================================================


# --- Calculate Style Factors (Do ONCE before loop) ---
print("\n--- Calculating Style Factors (Beta, Size Proxy, Liquidity Proxy, Residual Volatility) ---")
# Initialize with correct index names BUT NO DATA YET
style_factors = pd.DataFrame(index=pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset'])) # Base structure
style_factors_calculated = {} # Store components temporarily

min_periods_neut = max(10, neutralization_lookback // 2)
try:
    # Use lookback dataframes here
    print(f"Calculating style factors using {neutralization_lookback}-day lookback...")

    # 1. Size/Liquidity Proxy: Log of rolling average dollar volume
    if not prices_lookback.empty and not volumes_lookback.empty:
        dollar_volume_lb = prices_lookback.loc[:, assets] * volumes_lookback.loc[:, assets] # Ensure asset alignment
        rolling_dollar_vol_lb = dollar_volume_lb.rolling(neutralization_lookback, min_periods=min_periods_neut).mean()
        epsilon = 1e-9 # Smaller epsilon
        size_liquidity_proxy_df = np.log1p(rolling_dollar_vol_lb + epsilon)
        size_liquidity_proxy_df = size_liquidity_proxy_df.replace([np.inf, -np.inf], np.nan)
        # Slice to analysis dates *before* stacking
        size_liq_proxy_stacked = size_liquidity_proxy_df.loc[dates_index, assets].stack(future_stack=True).rename('size_liquidity_proxy')
        size_liq_proxy_stacked.index.names = ['date', 'asset'] # Set Index Names
        if not size_liq_proxy_stacked.dropna().empty: # Check if not all NaN
            style_factors_calculated['size_liquidity_proxy'] = size_liq_proxy_stacked
            print(" - Calculated Size/Liquidity Proxy.")
        else: print("WARN: Size/Liquidity Proxy resulted in empty or all-NaN series.")
    else: print("WARN: Skipping Size/Liquidity Proxy calc due to missing price/volume lookback data.")


    # 2. Beta: Rolling regression against benchmark
    if not prices_lookback.empty and not benchmark_prices_lookback.dropna().empty: # Check benchmark has data
        asset_returns_lb = prices_lookback.loc[:, assets].pct_change() # Ensure asset alignment
        benchmark_returns_lb = benchmark_prices_lookback.pct_change()

        # Ensure benchmark returns are not all NaN before proceeding
        if benchmark_returns_lb.dropna().empty:
             print("WARN: Benchmark returns are all NaN in lookback period. Skipping Beta calculation.")
        else:
            aligned_benchmark_ret_lb = benchmark_returns_lb.reindex(asset_returns_lb.index).ffill()
            X_beta_base = sm.add_constant(aligned_benchmark_ret_lb.dropna()) # Prepare RHS once
            betas = {} # Re-initialize dict for beta results specifically

            print(" - Calculating Beta (this may take a while)...")
            with tqdm(total=len(assets), desc="Calculating Beta", leave=False) as pbar:
                for asset in assets:
                    y_beta = asset_returns_lb[asset].dropna()

                    # --- Robust Beta Calculation Start ---
                    if y_beta.empty or X_beta_base.empty:
                        betas[asset] = pd.Series(np.nan, index=dates_index, name=asset) # Assign NaN series aligned with main index
                        pbar.update(1)
                        continue # Skip to next asset

                    common_idx_beta = X_beta_base.index.intersection(y_beta.index)

                    if len(common_idx_beta) >= neutralization_lookback: # Use >= lookback for min_nobs logic
                        X_beta_aligned = X_beta_base.loc[common_idx_beta]
                        y_beta_aligned = y_beta.loc[common_idx_beta]

                        if y_beta_aligned.empty or X_beta_aligned.empty:
                             betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                             pbar.update(1)
                             continue

                        try:
                            # Use imported RollingOLS directly
                            rols = RollingOLS(endog=y_beta_aligned, exog=X_beta_aligned,
                                             window=neutralization_lookback, min_nobs=min_periods_neut)
                            results = rols.fit()
                            # Check if params DataFrame is not empty and has enough columns
                            if not results.params.empty and results.params.shape[1] > 1:
                                 beta_series = results.params.iloc[:, 1] # Beta coeff index 1
                                 # Reindex to target dates_index AFTER calculation for this asset
                                 betas[asset] = beta_series.reindex(dates_index).ffill().bfill()
                            else:
                                 #print(f"WARN [{asset}]: RollingOLS params empty or misshaped.")
                                 betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)

                        except IndexError: # Catch index error if params structure unexpected
                            #print(f"WARN [{asset}]: RollingOLS IndexError (likely bad fit).")
                            betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                        except MissingDataError: # Catch if not enough observations for a window
                            #print(f"WARN [{asset}]: RollingOLS MissingDataError.")
                            betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                        except LinAlgError: # Catch linear algebra errors (e.g., singular matrix)
                            #print(f"WARN [{asset}]: RollingOLS LinAlgError.")
                            betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                        except ValueError as e_ols_val: # Catch potential value errors during fit
                            #print(f"WARN [{asset}]: RollingOLS ValueError: {e_ols_val}")
                            betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                        except Exception as e_beta_sm: # Catch other unexpected errors
                            #print(f"WARN [{asset}]: RollingOLS failed unexpectedly: {e_beta_sm}")
                            betas[asset] = pd.Series(np.nan, index=dates_index, name=asset)
                    else:
                        # Not enough common data points for reliable rolling beta
                        betas[asset] = pd.Series(np.nan, index=dates_index, name=asset) # Assign NaN series
                    pbar.update(1)
                    # --- Robust Beta Calculation End ---

            # --- Concatenate Beta results ---
            if betas: # Check if the betas dictionary is not empty
                try:
                    # Filter out any potential non-Series items just in case
                    valid_betas = {k: v for k, v in betas.items() if isinstance(v, pd.Series)}
                    if valid_betas:
                        beta_df = pd.concat(valid_betas.values(), axis=1, keys=valid_betas.keys()) # Use values and keys
                        beta_df.columns.name = 'asset' # Name the column index
                        # Stack the dataframe (already indexed by dates_index)
                        beta_stacked = beta_df.stack(future_stack=True).rename('beta')
                        beta_stacked.index.names = ['date', 'asset'] # Set Index Names
                        if not beta_stacked.dropna().empty: # Check if not all NaN
                            style_factors_calculated['beta'] = beta_stacked
                            print(" - Calculated Beta.")
                        else: print("WARN: Beta calculation resulted in empty or all-NaN series after stacking.")
                    else: print("WARN: No valid beta Series were generated.")
                except ValueError as e_concat_beta:
                     print(f"ERROR concatenating beta results: {e_concat_beta}")
                     print("WARN: Skipping Beta factor due to concatenation error.")
            else:
                print("WARN: No beta values could be calculated for any asset.")
    else: print("WARN: Skipping Beta calc due to missing price/benchmark lookback data.")


    # 3. Residual Volatility: Rolling std dev of returns
    if not prices_lookback.empty:
        if 'asset_returns_lb' not in locals(): # Calculate if not done for beta
             asset_returns_lb = prices_lookback.loc[:, assets].pct_change() # Ensure asset alignment
        rolling_std_ret = asset_returns_lb.rolling(neutralization_lookback, min_periods=min_periods_neut).std()
        # Slice to analysis dates *before* stacking
        res_vol_stacked = rolling_std_ret.loc[dates_index, assets].stack(future_stack=True).rename('residual_vol')
        res_vol_stacked.index.names = ['date', 'asset'] # Set Index Names
        if not res_vol_stacked.dropna().empty: # Check if not all NaN
             style_factors_calculated['residual_vol'] = res_vol_stacked
             print(" - Calculated Residual Volatility.")
        else: print("WARN: Residual Volatility calculation resulted in empty or all-NaN series.")
    else: print("WARN: Skipping Residual Volatility calc due to missing price lookback data.")


    # --- Combine all calculated factors at the end ---
    if style_factors_calculated:
         # Ensure all components are Series before concat
         valid_components = {k: v for k, v in style_factors_calculated.items() if isinstance(v, pd.Series)}
         if valid_components:
             style_factors = pd.concat(valid_components.values(), axis=1) # Combine valid Series into DF
             # Reindex just in case some date/asset combos were missing in all factors
             style_factors = style_factors.reindex(pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset']))
             print(f"Style factors calculation finished. Final Shape: {style_factors.shape}")
         else:
              print("WARN: No valid style factor components were calculated.")
              style_factors = pd.DataFrame(index=pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset']))
    else:
         print("WARN: No style factors were successfully calculated.")
         style_factors = pd.DataFrame(index=pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset']))


except Exception as e_style:
    print(f"ERROR calculating style factors: {e_style}")
    traceback.print_exc()
    style_factors = pd.DataFrame(index=pd.MultiIndex.from_product([dates_index, assets], names=['date', 'asset']))
    print("WARN: Style factors calculation failed. Proceeding without them for neutralization.")


# --- Analysis Function Definitions ---
print("\n--- Defining/Importing Analysis Functions ---")

def calculate_forward_returns(prices_df, periods):
    """Calculates forward returns for multiple periods. Corrected version 3."""
    # prices_df: Index=date, Columns=assets
    all_fwd_returns = {} # Store DataFrames for each period

    if prices_df.empty:
         print("ERROR [Fwd Ret]: Input prices_df is empty.")
         return pd.DataFrame(index=pd.MultiIndex([[],[]], [[],[]], names=['date','asset']), columns=analysis_periods_str)

    for p in periods:
        fwd_ret_col_name = f'{p}D_fwd_ret'
        # Calculate returns for all assets for this period 'p'
        shifted_price = prices_df.shift(-p)
        # Ensure alignment before division, handle potential NaNs gracefully
        returns_p = (shifted_price / prices_df - 1).replace([np.inf, -np.inf], np.nan)
        all_fwd_returns[fwd_ret_col_name] = returns_p

    if not all_fwd_returns:
        print("WARN [Fwd Ret]: No forward returns calculated.")
        return pd.DataFrame(index=pd.MultiIndex([[],[]], [[],[]], names=['date','asset']), columns=analysis_periods_str)

    # Concat creates MultiIndex columns: ('1D_fwd_ret', 'asset1'), ('3D_fwd_ret', 'asset1'), ...
    combined_fwd_returns_wide = pd.concat(all_fwd_returns, axis=1)
    combined_fwd_returns_wide.columns.names = ['period', 'asset'] # Name the column levels

    # Stack the 'asset' level from columns to index to get format:
    # Index = MultiIndex('date', 'asset'), Columns = Index(['1D_fwd_ret', '3D_fwd_ret', ...])
    fwd_returns_stacked = combined_fwd_returns_wide.stack(level='asset', future_stack=True)
    fwd_returns_stacked.index.names = ['date', 'asset'] # Ensure final index names are correct
    # Ensure columns are named correctly (should be the periods after stacking 'asset')
    fwd_returns_stacked.columns.name = 'period' # Name the columns index

    return fwd_returns_stacked

def get_quantile_assignments(factor_df, num_quantiles=5):
    """Assigns assets to quantiles based on factor values for each date."""
    # Input factor_df: Index=(date, asset), Column='factor'
    if factor_df.empty: return pd.DataFrame(columns=['quantile'], index=factor_df.index) # Handle empty input

    factor_col_name = 'factor'
    if factor_col_name not in factor_df.columns:
        if isinstance(factor_df, pd.Series) and factor_df.name == factor_col_name:
             factor_df = factor_df.to_frame()
        elif not factor_df.empty: # Try using the first column if name isn't 'factor'
             original_col = factor_df.columns[0]
             factor_df = factor_df[[original_col]].rename(columns={original_col: factor_col_name})
        else: # Cannot proceed if empty and no factor column
            return pd.DataFrame(columns=['quantile'], index=factor_df.index)

    # Use transform to handle broadcasting within groups safely
    quantiles = factor_df.groupby(level='date')[factor_col_name].transform(
        lambda x: pd.qcut(x, num_quantiles, labels=False, duplicates='drop')
    ) + 1 # Labels 1 to N
    quantiles = quantiles.rename('quantile')

    return quantiles.to_frame() # Return as DataFrame


def quantile_analysis(analysis_data, factor_display_name, num_quantiles=5, ret_col='1D_fwd_ret'):
    """Performs quantile return analysis."""
    # analysis_data: Index=(date, asset), Columns=['factor', ret_col]
    if ret_col not in analysis_data.columns:
        print(f"WARN [Quantile Analysis]: Return column '{ret_col}' not found.")
        return None, None
    if 'factor' not in analysis_data.columns:
        print(f"WARN [Quantile Analysis]: Factor column 'factor' not found.")
        return None, None
    if analysis_data.empty or analysis_data[['factor', ret_col]].isna().all().all():
         print(f"WARN [Quantile Analysis]: Input data empty or all NaN for {factor_display_name}/{ret_col}.")
         return None, None

    quantile_assignments = get_quantile_assignments(analysis_data[['factor']], num_quantiles)
    if quantile_assignments.empty or quantile_assignments['quantile'].isna().all():
         print(f"WARN [Quantile Analysis]: Could not assign quantiles for {factor_display_name}.")
         return None, None

    data_with_quantiles = analysis_data.join(quantile_assignments, how='inner').dropna(subset=['quantile'])
    if data_with_quantiles.empty: # Check after join/dropna
         print(f"WARN [Quantile Analysis]: Data empty after joining quantiles for {factor_display_name}.")
         return None, None

    # Mean return per quantile (averaged over time)
    mean_ret_by_quantile = data_with_quantiles.groupby('quantile')[ret_col].mean()

    # Cumulative return per quantile
    daily_mean_ret_by_q = data_with_quantiles.groupby(['date', 'quantile'])[ret_col].mean().unstack(level='quantile')
    # Fill missing daily quantile returns (e.g., if a quantile had no members) with 0 for cumulative calc
    daily_mean_ret_by_q = daily_mean_ret_by_q.fillna(0)
    # Calculate geometric cumulative returns
    cumulative_ret_by_q = (1 + daily_mean_ret_by_q).cumprod() - 1

    return mean_ret_by_quantile, cumulative_ret_by_q


def calculate_quantile_turnover(quantile_assignments, num_quantiles=5):
    """Calculates quantile turnover."""
    # quantile_assignments: Index=(date, asset), Column='quantile'
    if quantile_assignments.empty or quantile_assignments['quantile'].isna().all():
        print("WARN [Turnover]: Input quantile assignments are empty or all NaN.")
        return pd.DataFrame() # Return empty df

    turnover_results = {}
    quantiles_unstacked = quantile_assignments['quantile'].unstack(level='asset')

    # Ensure index is sorted for shift to work correctly
    quantiles_unstacked = quantiles_unstacked.sort_index()

    for q in range(1, num_quantiles + 1):
        quantile_members = (quantiles_unstacked == q)
        prev_members = quantile_members.shift(1)

        # Align and stack, keeping only days where both current and previous exist
        combined = pd.concat(
            [quantile_members.stack(future_stack=True).rename('current'),
             prev_members.stack(future_stack=True).rename('previous')],
            axis=1
        ).dropna() # Drop rows where either is NaN (i.e., first day, or if assets change)

        if combined.empty:
             # Handle case with only one day of data or no overlap
             daily_turnover_series = pd.Series(np.nan, index=quantiles_unstacked.index)
        else:
            def daily_turnover(group):
                # Check if group is empty or has wrong structure
                if group.empty or not all(c in group.columns for c in ['current', 'previous']):
                    return np.nan

                stayed = (group['current'] & group['previous']).sum()
                entered = (group['current'] & ~group['previous']).sum()
                exited = (~group['current'] & group['previous']).sum()
                total_current = group['current'].sum()
                total_previous = group['previous'].sum()

                avg_size = (total_current + total_previous) / 2.0
                if avg_size < 1e-6: return 0.0 # Handle near-zero avg size

                # Using: max(entered, exited) / avg_size
                traded = max(entered, exited)
                return traded / avg_size if avg_size > 0 else 0.0

            # Apply daily turnover calculation
            daily_turnover_series = combined.groupby(level='date').apply(daily_turnover)
            # Reindex to original dates index to include days with NaN turnover
            daily_turnover_series = daily_turnover_series.reindex(quantiles_unstacked.index)


        turnover_results[f'Q{q}_Turnover'] = daily_turnover_series

    turnover_df = pd.DataFrame(turnover_results)
    if not turnover_df.empty:
        turnover_df['Mean_Turnover'] = turnover_df.mean(axis=1)
    return turnover_df


def calculate_ic(analysis_data, factor_display_name, ret_col='1D_fwd_ret', method='spearman'):
    """Calculates Information Coefficient (IC)."""
    # analysis_data: Index=(date, asset), Columns=['factor', ret_col]
    if ret_col not in analysis_data.columns or 'factor' not in analysis_data.columns:
         print(f"WARN [IC]: Missing 'factor' or '{ret_col}' for {factor_display_name}")
         return None, None
    if analysis_data.empty or analysis_data[['factor', ret_col]].isna().all().all():
         print(f"WARN [IC]: Input data empty or all NaN for {factor_display_name}/{ret_col}.")
         return None, None

    def ic_calc(group):
        group_clean = group[['factor', ret_col]].dropna()
        if len(group_clean) < 3: return np.nan # Need >= 3 points for reliable correlation? Usually 2 is min.
        try:
            # Check for zero variance before calculating correlation
            factor_std_dev = group_clean['factor'].std()
            ret_std_dev = group_clean[ret_col].std()
            if pd.isna(factor_std_dev) or factor_std_dev < 1e-9 or pd.isna(ret_std_dev) or ret_std_dev < 1e-9:
                 return 0.0 # Treat constant series as zero correlation

            if method == 'spearman':
                coeff, p_val = spearmanr(group_clean['factor'], group_clean[ret_col])
                return coeff if pd.notna(coeff) else 0.0 # Return 0 if spearman returns NaN
            elif method == 'pearson':
                coeff = group_clean['factor'].corr(group_clean[ret_col], method='pearson')
                return coeff if pd.notna(coeff) else 0.0 # Return 0 if pearson returns NaN
            else: return np.nan
        except ValueError: # Handle other potential errors (e.g., from spearmanr)
             return np.nan

    daily_ic = analysis_data.groupby(level='date').apply(ic_calc)
    daily_ic.name = f'IC_{method}' # Rename the resulting Series

    # Summarize IC
    ic_mean = daily_ic.mean()
    ic_std = daily_ic.std()
    icir = ic_mean / ic_std if pd.notna(ic_std) and ic_std > 1e-9 else np.nan # Avoid div by zero/tiny std
    hit_rate = (daily_ic > 1e-9).mean() if not daily_ic.dropna().empty else np.nan # Use > small epsilon for hit rate
    obs_days = daily_ic.count() # Count non-NaN IC days

    ic_summary = pd.Series({
        'Mean IC': ic_mean,
        'Std Dev IC': ic_std,
        'ICIR': icir,
        'Hit Rate (>0)': hit_rate,
        'Observations (Days)': obs_days
    }, name=ret_col) # Use ret_col as the Series name

    return ic_summary, daily_ic.to_frame() # Return daily IC as DataFrame


def calculate_factor_returns(analysis_data, factor_display_name, ret_col='1D_fwd_ret'):
    """Calculates factor returns (e.g., long/short portfolio based on factor)."""
    # analysis_data: Index=(date, asset), Columns=['factor', ret_col]
    if ret_col not in analysis_data.columns or 'factor' not in analysis_data.columns:
        print(f"WARN [Factor Returns]: Missing 'factor' or '{ret_col}' for {factor_display_name}")
        return None, None, None, None
    if analysis_data.empty or analysis_data[['factor', ret_col]].isna().all().all():
         print(f"WARN [Factor Returns]: Input data empty or all NaN for {factor_display_name}/{ret_col}.")
         return None, None, None, None

    # 1. Standardize Factor (cross-sectionally)
    factor_std = analysis_data.groupby(level='date')['factor'].transform(
        lambda x: (x - x.mean()) / x.std() if pd.notna(x.std()) and x.std() > 1e-9 else (x - x.mean()) # Handle zero/tiny std dev
    ).fillna(0) # Fill NaNs after standardization (e.g., single asset days) with 0 weight

    # 2. Calculate Weighted Return for each day
    analysis_data_temp = analysis_data[[ret_col]].copy() # Only need return col
    analysis_data_temp['factor_std'] = factor_std
    analysis_data_temp['weighted_ret'] = analysis_data_temp['factor_std'] * analysis_data_temp[ret_col]

    # --- Daily Factor Return: Weighted average return ---
    # Sum of (weight * return) / Sum of abs(weights) <-- For dollar neutral L/S
    sum_weighted_ret = analysis_data_temp.groupby(level='date')['weighted_ret'].sum()
    sum_abs_weights = analysis_data_temp.groupby(level='date')['factor_std'].apply(lambda x: x.abs().sum())
    # Avoid division by zero/NaN if sum of abs weights is zero/NaN for a day
    daily_factor_return = (sum_weighted_ret / sum_abs_weights.replace(0, np.nan)).dropna()
    daily_factor_return.name = 'factor_daily_ret'


    # 3. Calculate Cumulative Return
    cumulative_factor_return = pd.Series(index=daily_factor_return.index, dtype=float)
    if not daily_factor_return.empty:
        cumulative_factor_return = (1 + daily_factor_return).cumprod() - 1
    cumulative_factor_return.name = 'factor_cum_ret'

    # 4. Calculate Annualized Statistics
    ann_factor = 252 # Assuming 252 trading days per year
    num_days = len(daily_factor_return)
    ann_ret, ann_vol, sharpe = np.nan, np.nan, np.nan # Defaults
    if num_days > 5: # Require min days for meaningful stats
         mean_daily_ret = daily_factor_return.mean()
         std_daily_ret = daily_factor_return.std()
         if pd.notna(mean_daily_ret): ann_ret = mean_daily_ret * ann_factor
         if pd.notna(std_daily_ret) and std_daily_ret > 1e-9: # Avoid div by tiny std
              ann_vol = std_daily_ret * np.sqrt(ann_factor)
              if pd.notna(ann_ret) and ann_vol > 1e-9 : sharpe = ann_ret / ann_vol # Ensure vol > 0

    ann_stats = pd.Series({
        'Annualized Return': ann_ret,
        'Annualized Volatility': ann_vol,
        'Sharpe Ratio': sharpe,
        'Observations (Days)': num_days
    }, name=ret_col) # Use ret_col as the Series name

    return daily_factor_return.to_frame(), cumulative_factor_return.to_frame(), None, ann_stats # Placeholder for drawdown

# --- FIX IS HERE ---
def calculate_forward_returns_for_decay(prices_df, max_lag):
    """Calculates forward returns for multiple lags up to max_lag."""
    fwd_rets_dict = {}
    print(f"Calculating fwd returns for decay (1 to {max_lag} days)...")
    if prices_df.empty:
         print("WARN [Decay FwdRets]: Input prices_df is empty.")
         return fwd_rets_dict

    shifted_prices = {lag: prices_df.shift(-lag) for lag in range(1, max_lag + 1)}
    with tqdm(total=max_lag, desc="Fwd Returns Decay", leave=False) as pbar: # Set leave=False
        for lag in range(1, max_lag + 1):
            ret_col_name = f'{lag}D_fwd_ret'
            fwd_ret_lag = (shifted_prices[lag] / prices_df - 1).replace([np.inf, -np.inf], np.nan)
            # Stack to get (date, asset) index
            fwd_ret_stacked = fwd_ret_lag.stack(future_stack=True).rename(ret_col_name)
            # <<< FIX: Set index names >>>
            fwd_ret_stacked.index.names = ['date', 'asset']
            # <<< END FIX >>>
            if not fwd_ret_stacked.dropna().empty: # Check not all NaN
                fwd_rets_dict[lag] = fwd_ret_stacked.dropna() # Store cleaned series
            pbar.update(1)
    print(f"Finished calculating {len(fwd_rets_dict)} forward returns for decay.")
    return fwd_rets_dict # Dict: {lag: Series(Index=(date,asset), Value=ret)}
# --- END FIX ---

def calculate_ic_decay(factor_series_clean, fwd_returns_for_decay_dict, max_lag, method='spearman'):
    """Calculates IC decay over multiple forward return periods."""
    # factor_series_clean: Series, Index=(date, asset), Name='factor'
    ic_decay_values = {}
    print("Calculating IC Decay...")
    if factor_series_clean.empty or not fwd_returns_for_decay_dict:
         print("WARN [IC Decay]: Factor series empty or no fwd returns provided.")
         return pd.Series(dtype=float, name=f'Mean_IC_{method}_Decay').rename_axis('Lag (Days)')

    with tqdm(total=max_lag, desc="IC Decay", leave=False) as pbar: # Set leave=False
        for lag in range(1, max_lag + 1):
            result_ic = np.nan # Default
            if lag in fwd_returns_for_decay_dict:
                fwd_ret_lag = fwd_returns_for_decay_dict[lag]
                if not fwd_ret_lag.empty:
                    # Ensure both series are frames for merge (safer)
                    factor_frame = factor_series_clean.to_frame()
                    ret_frame = fwd_ret_lag.to_frame()
                    # <<< Check index names before merge for debugging >>>
                    # print(f"DEBUG IC Decay Lag {lag}: Factor index names: {factor_frame.index.names}, Ret index names: {ret_frame.index.names}")
                    # <<< End Debug >>>
                    try:
                        aligned_decay = pd.merge(factor_frame, ret_frame,
                                                 left_index=True, right_index=True, how='inner')
                        aligned_decay = aligned_decay.dropna() # Drop rows with NaNs in either column

                        if len(aligned_decay) > 2: # Need enough points
                            # Calculate mean daily IC for this lag
                            def ic_calc_decay(group):
                                if len(group) < 3: return np.nan
                                try:
                                     # Check variance again
                                    factor_std_dev = group['factor'].std()
                                    ret_std_dev = group[fwd_ret_lag.name].std()
                                    if pd.isna(factor_std_dev) or factor_std_dev < 1e-9 or pd.isna(ret_std_dev) or ret_std_dev < 1e-9:
                                         return 0.0 # Treat constant series as zero correlation

                                    if method == 'spearman':
                                        coeff, p_val = spearmanr(group['factor'], group[fwd_ret_lag.name])
                                        return coeff if pd.notna(coeff) else 0.0
                                    elif method == 'pearson':
                                        coeff = group['factor'].corr(group[fwd_ret_lag.name], method='pearson')
                                        return coeff if pd.notna(coeff) else 0.0
                                    else: return np.nan
                                except ValueError: return np.nan # Handle other errors (e.g. spearmanr issue)

                            daily_ic_lag = aligned_decay.groupby(level='date').apply(ic_calc_decay)
                            result_ic = daily_ic_lag.mean() # Store the mean IC for this lag
                    except ValueError as e_merge_decay: # Catch specific merge errors
                         print(f"ERROR [IC Decay Lag {lag}]: Merge failed - {e_merge_decay}. Skipping lag.")
                         result_ic = np.nan # Ensure NaN if merge fails
                    except Exception as e_decay_calc: # Catch other errors during calculation
                         print(f"ERROR [IC Decay Lag {lag}]: Calculation failed - {e_decay_calc}. Skipping lag.")
                         result_ic = np.nan

            ic_decay_values[lag] = result_ic
            pbar.update(1)

    ic_decay_series = pd.Series(ic_decay_values, name=f'Mean_IC_{method}_Decay')
    ic_decay_series.index.name = 'Lag (Days)'
    return ic_decay_series

# --- END OF ANALYSIS FUNCTION DEFINITIONS ---


# --- Helper function to save results to Excel ---
# Defined once before the loop starts
def save_to_excel_combined(df_to_save, base_sheet_name, factor_disp_name, writer_obj):
     """Saves a dataframe to a sheet in the combined Excel file, handling naming and timezones."""
     sheet_name_raw = f"{factor_disp_name}_{base_sheet_name}"
     if len(sheet_name_raw) > 31:
          max_len, len_base, len_underscore = 31, len(base_sheet_name), 1
          available_for_factor = max_len - len_base - len_underscore
          if available_for_factor < 3: sheet_name = sheet_name_raw[:max_len] # Min 3 chars for factor part
          else: sheet_name = f"{factor_disp_name[:available_for_factor]}_{base_sheet_name}"
          print(f"WARN: Sheet name '{sheet_name_raw}' > 31 chars. Truncated to '{sheet_name}'.")
     else: sheet_name = sheet_name_raw

     if df_to_save is not None and not df_to_save.empty:
          try:
               df_copy = df_to_save.copy()
               # Remove timezone info for Excel compatibility
               if isinstance(df_copy.index, pd.DatetimeIndex): df_copy.index = df_copy.index.tz_localize(None)
               if isinstance(df_copy.columns, pd.DatetimeIndex): df_copy.columns = df_copy.columns.tz_localize(None)
               if isinstance(df_copy.index, pd.MultiIndex):
                   new_levels = [lvl.tz_localize(None) if isinstance(lvl, pd.DatetimeIndex) else lvl for lvl in df_copy.index.levels]
                   df_copy.index = df_copy.index.set_levels(new_levels)
               if isinstance(df_copy.columns, pd.MultiIndex):
                   new_levels = [lvl.tz_localize(None) if isinstance(lvl, pd.DatetimeIndex) else lvl for lvl in df_copy.columns.levels]
                   df_copy.columns = df_copy.columns.set_levels(new_levels)

               df_copy.to_excel(writer_obj, sheet_name=sheet_name)
               # print(f"DEBUG: Saved sheet '{sheet_name}'") # Optional debug
               return True # Indicate sheet was saved
          except Exception as e_save: print(f"ERROR saving sheet '{sheet_name}': {e_save}")
     else: print(f"INFO: No data to save for sheet '{sheet_name}'.")
     return False # Indicate sheet was not saved


# --- Calculate Forward Returns for Analysis & IC Decay (Do ONCE before loop) ---
if prices.empty or prices.isna().all().all():
    print("ERROR: Price data is empty or all NaN. Cannot calculate forward returns. Exiting.")
    exit()

print(f"\nCalculating forward returns for analysis periods: {fwd_ret_periods_int} days...")
forward_returns_df_stacked = calculate_forward_returns(prices.copy(), periods=fwd_ret_periods_int)

if forward_returns_df_stacked.empty:
    print("ERROR: Main forward returns calculation failed or resulted in empty data. Exiting.")
    exit()
# Check if expected column names exist
expected_ret_cols_present = all(col in forward_returns_df_stacked.columns for col in analysis_periods_str)
if not expected_ret_cols_present:
     print(f"ERROR: Missing expected forward return columns in calculated df. Expected: {analysis_periods_str}, Found: {forward_returns_df_stacked.columns.tolist()}")
     # exit() # Exit or proceed carefully

print(f"\nCalculating forward returns for IC decay (up to {MAX_DECAY_LAG} days)...")
fwd_returns_for_decay_dict = calculate_forward_returns_for_decay(prices.copy(), MAX_DECAY_LAG)
if not fwd_returns_for_decay_dict:
    print("WARN: Could not calculate forward returns for IC Decay. Decay analysis will be skipped.")


# =======================================================
# === Starting Factor Analysis Loop ===
# =======================================================
print(f"\nPreparing single Excel output file: {combined_output_filename}\n")

if precalculated_factors_df.empty:
    print("ERROR: No pre-calculated factors found or loaded. Skipping analysis loop.")
else:
    # --- Start Excel Writer context ---
    overall_success = False # Flag to track if ANY sheet gets written
    try:
        with pd.ExcelWriter(combined_output_filename, engine='openpyxl') as writer:
            unique_factor_names = precalculated_factors_df.columns.get_level_values('factor_name').unique()
            print(f"Analyzing {len(unique_factor_names)} factors found in the input DataFrame...")

            # --- Loop through each factor ---
            for factor_name in unique_factor_names:
                print(f"\n\n{'='*20} Processing Factor: {factor_name} {'='*20}")
                factor_timer_start = time.time() # Timer for each factor
                sheets_saved_this_factor = 0 # Count sheets for this factor

                # --- Extract Raw Factor ---
                try:
                    raw_factor_df = precalculated_factors_df.xs(factor_name, level='factor_name', axis=1).copy()
                    raw_factor_df.columns.name = 'asset'
                    raw_factor_df.index.name = 'date'
                except KeyError:
                     print(f"ERROR: Could not extract factor '{factor_name}' using xs. Skipping.")
                     continue
                except Exception as e_extract:
                     print(f"ERROR: Unexpected error extracting factor '{factor_name}': {e_extract}. Skipping.")
                     continue

                if raw_factor_df.empty or raw_factor_df.isna().all().all():
                    print(f"WARN: Raw factor data for {factor_name} is empty or all NaNs after extraction. Skipping.")
                    continue

                # --- Factor Neutralization ---
                print("\n--- Performing Factor Neutralization ---")
                neut_timer_start = time.time()
                neutralized_factor_df = pd.DataFrame(index=dates_index, columns=assets) # Reinitialize
                neutralization_succeeded = False

                # Check if any neutralization variables exist and align them
                has_industry = False
                industry_dummies_aligned = pd.DataFrame()
                if 'industry_dummies_static' in locals() and not industry_dummies_static.empty:
                     # Align index (assets) with the current final asset list
                     industry_dummies_aligned = industry_dummies_static.reindex(assets).fillna(0)
                     has_industry = not industry_dummies_aligned.empty

                has_style = False
                style_factors_aligned = pd.DataFrame()
                if 'style_factors' in locals() and not style_factors.empty and not style_factors.isna().all().all():
                     # Align style factors (which have MultiIndex date,asset) with raw_factor_df dates
                     # And ensure assets match the final 'assets' list
                     style_factors_aligned = style_factors.reindex(index=raw_factor_df.index, level='date')
                     # Filter style factors to only include current assets
                     valid_style_assets = style_factors_aligned.index.get_level_values('asset').unique().intersection(assets)
                     if not valid_style_assets.empty:
                         style_factors_aligned = style_factors_aligned[style_factors_aligned.index.get_level_values('asset').isin(valid_style_assets)]
                         has_style = not style_factors_aligned.dropna(how='all').empty
                     else: has_style = False


                if not has_industry and not has_style:
                    print("INFO: No neutralization variables available. Using raw factor.")
                    neutralized_factor_df = raw_factor_df.copy()
                    neutralization_succeeded = False
                else:
                    print("Running neutralization regression day by day...")
                    neutralized_residuals_list = []
                    # Use index from raw factor that has *some* data for iteration
                    valid_dates_for_neut = raw_factor_df.dropna(how='all').index

                    with tqdm(total=len(valid_dates_for_neut), desc=f"Neutralizing {factor_name}", leave=False) as pbar:
                        for date in valid_dates_for_neut:
                            factor_today = raw_factor_df.loc[date].dropna()
                            if factor_today.empty:
                                neutralized_residuals_list.append(pd.Series(np.nan, index=assets, name=date))
                                pbar.update(1); continue

                            X_list = []
                            valid_assets_today = factor_today.index

                            # Industry
                            if has_industry:
                                industry_today = industry_dummies_aligned.reindex(valid_assets_today).dropna(axis=1, how='all').fillna(0)
                                # Drop industry dummies that are constant (e.g., all zero after reindex)
                                industry_today = industry_today.loc[:, industry_today.nunique() > 1]
                                if not industry_today.empty: X_list.append(industry_today)

                            # Style Factors
                            style_today_aligned_assets = pd.DataFrame() # Init empty
                            if has_style and date in style_factors_aligned.index.get_level_values('date'):
                                try:
                                    style_today = style_factors_aligned.loc[pd.IndexSlice[date, :], :] # Use IndexSlice for robustness
                                    if not style_today.empty:
                                         # If only one style factor, it might be a Series, convert to frame
                                         if isinstance(style_today, pd.Series): style_today = style_today.to_frame()

                                         # Reindex style factors for today's valid assets and fill NaNs (e.g., with mean)
                                         style_fill_value = style_today.mean() # Calculate mean before reindexing
                                         style_today_aligned_assets = style_today.reindex(valid_assets_today, level='asset').fillna(style_fill_value)
                                         # Drop style factors that are all NaN after reindexing/filling
                                         style_today_aligned_assets = style_today_aligned_assets.dropna(axis=1, how='all') # Drop empty columns
                                         if not style_today_aligned_assets.empty:
                                             # Remove constant columns (important!) before adding model constant
                                             non_const_cols = style_today_aligned_assets.loc[:, style_today_aligned_assets.nunique() > 1]
                                             if not non_const_cols.empty: X_list.append(non_const_cols)
                                except KeyError: pass # Date might not exist in aligned style factors
                                except Exception as e_style_align:
                                     print(f"WARN: Error aligning style factors for {date}: {e_style_align}")


                            if not X_list:
                                residuals_today = factor_today
                            else:
                                try:
                                    X_today = pd.concat(X_list, axis=1).astype(float) # Ensure float type
                                    # Align Y (factor) and X (exposures) on common assets
                                    common_assets = factor_today.index.intersection(X_today.index)
                                    if common_assets.empty: # Handle case where no assets overlap after considering exposures
                                         residuals_today = factor_today # Fallback to raw
                                    else:
                                        Y_aligned = factor_today.loc[common_assets].astype(float)
                                        X_aligned = X_today.loc[common_assets]

                                        # Drop rows/cols with all NaNs AFTER alignment (robustness)
                                        X_aligned = X_aligned.dropna(axis=1, how='all').dropna(axis=0, how='all')
                                        Y_aligned = Y_aligned.loc[X_aligned.index] # Re-align Y

                                        # Check for sufficient data points vs predictors
                                        if Y_aligned.empty or X_aligned.empty or len(Y_aligned) <= X_aligned.shape[1]:
                                            residuals_today = factor_today # Fallback
                                        else:
                                            X_w_const = sm.add_constant(X_aligned, has_constant='add')
                                            model = sm.OLS(Y_aligned, X_w_const, missing='drop')
                                            results = model.fit()
                                            residuals_today = results.resid.reindex(Y_aligned.index).fillna(0) # Fill NaNs from regression with 0? Or keep NaN?

                                except LinAlgError: # Handle cases like singular matrix
                                     residuals_today = factor_today
                                except ValueError as e_ols_val: # Handle dimension mismatches etc.
                                     residuals_today = factor_today
                                except Exception as e_ols:
                                     print(f"WARN: OLS failed unexpectedly for {factor_name} on {date}: {e_ols}")
                                     residuals_today = factor_today # Fallback to raw on error

                            # Reindex residuals to full asset list, filling missing ones with NaN
                            neutralized_residuals_list.append(residuals_today.reindex(assets).fillna(np.nan))
                            pbar.update(1)

                    # --- Combine daily neutralized results ---
                    if neutralized_residuals_list:
                         neutralized_factor_df_temp = pd.concat(neutralized_residuals_list, axis=1).T
                         neutralized_factor_df_temp.index.name = 'date'
                         # Reindex to ensure all analysis dates are present (fills missing dates with NaN)
                         neutralized_factor_df = neutralized_factor_df_temp.reindex(dates_index)
                         neutralization_succeeded = True
                         print(f"Neutralization completed for {factor_name}. ({(time.time() - neut_timer_start):.2f}s)")
                    else:
                         print(f"WARN: Neutralization yielded no results for {factor_name}. Using raw factor.")
                         neutralized_factor_df = raw_factor_df.copy()
                         neutralization_succeeded = False


                # --- Analysis Execution ---
                print("\n--- Starting Factor Analysis ---")
                analysis_timer_start = time.time()
                factor_to_analyze_df = None
                factor_source = "None"

                # Decide which factor version to use for analysis
                if neutralization_succeeded and not neutralized_factor_df.isna().all().all():
                    factor_to_analyze_df = neutralized_factor_df.copy()
                    factor_source = "Neut" # Shortened for sheet names
                    print(f"INFO: Using NEUTRALIZED factor '{factor_name}' for analysis.")
                elif not raw_factor_df.isna().all().all():
                    factor_to_analyze_df = raw_factor_df.copy()
                    factor_source = "Raw"
                    print(f"INFO: Using RAW factor '{factor_name}' for analysis.")
                else:
                    print(f"ERROR: No valid factor data (Raw or Neutralized) found for {factor_name}. Skipping analysis.")
                    continue

                # --- Prepare for Analysis ---
                analysis_performed = False
                # Factor display name for sheet naming - keep it concise
                factor_display_name = f"{factor_name[:15]}_{factor_source}" # Max 15 chars for factor part

                # Stack the chosen factor (Index=date, Columns=assets) -> Series (Index=(date, asset))
                factor_to_analyze_df.index.name = 'date'
                factor_to_analyze_df.columns.name = 'asset'
                factor_series = factor_to_analyze_df.stack(future_stack=True) # Use future_stack, dropna removed
                factor_series.index.names = ['date', 'asset']
                factor_series.rename('factor', inplace=True) # Ensure Series name is 'factor' for functions
                factor_series_clean = factor_series.dropna() # Drop NaNs *after* stacking

                # Align factor with forward returns (already stacked)
                aligned_data = pd.DataFrame()
                if factor_series_clean.empty:
                    print(f"ERROR: Factor series for {factor_display_name} is empty after dropna(). Skipping.")
                    continue
                else:
                    # Ensure forward returns are uniquely indexed if merging
                    fwd_returns_unique = forward_returns_df_stacked[~forward_returns_df_stacked.index.duplicated(keep='first')]
                    try:
                        # Merge the factor Series (as frame) with the forward returns DataFrame
                        aligned_data = pd.merge(factor_series_clean.to_frame(), fwd_returns_unique,
                                                left_index=True, right_index=True, how='inner')
                    except Exception as merge_err:
                        print(f"ERROR aligning data for {factor_display_name}: {merge_err}")
                        continue

                # Final check on aligned data
                # Drop rows where factor OR *any* of the analysis return periods are NaN
                aligned_data_clean = aligned_data.dropna(subset=['factor'] + analysis_periods_str, how='any')

                if aligned_data_clean.empty:
                    print(f"INFO: Skipping analysis for {factor_display_name} - no overlapping data.")
                    continue
                else:
                    print(f"Clean aligned data ready for {factor_display_name}. Shape: {aligned_data_clean.shape}")
                    # Identify return columns actually available after merge/dropna
                    available_ret_cols = [col for col in analysis_periods_str if col in aligned_data_clean.columns and not aligned_data_clean[col].isna().all()]
                    if not available_ret_cols:
                        print(f"ERROR: No valid forward returns columns remain for {factor_display_name}. Skipping.")
                        continue

                    # --- Initialize result containers ---
                    all_ic_summaries, all_daily_ics = [], {}
                    all_quantile_mean_rets, all_quantile_cum_rets = {}, {}
                    factor_daily_returns_dict, cumulative_factor_returns_dict = {}, {}
                    factor_analysis_summary = []

                    # --- Calculate IC Decay ---
                    ic_decay_results = pd.Series(dtype=float)
                    if fwd_returns_for_decay_dict:
                         # Pass the clean factor series (before alignment with specific returns)
                         ic_decay_results = calculate_ic_decay(factor_series_clean.copy(), fwd_returns_for_decay_dict, MAX_DECAY_LAG, method=ic_method)

                    # --- Calculate Quantile Turnover ---
                    all_quantile_turnover = pd.DataFrame()
                    print("\n--- Calculating Quantile Turnover ---")
                    # Pass factor from aligned data, only need 'factor' column
                    quantile_assignments = get_quantile_assignments(aligned_data_clean[['factor']].copy(), num_quantiles=num_quantiles)
                    if not quantile_assignments.empty and not quantile_assignments['quantile'].isna().all():
                        all_quantile_turnover = calculate_quantile_turnover(quantile_assignments, num_quantiles=num_quantiles)
                    else: print("WARN: Could not calculate turnover due to empty/NaN quantile assignments.")


                    # --- Loop through Analysis Periods ---
                    for ret_col in available_ret_cols:
                        print(f"\n===== Analyzing {factor_display_name} vs {ret_col} =====")
                        # Subset data needed for this specific return period
                        analysis_data_subset = aligned_data_clean[['factor', ret_col]].dropna()
                        if analysis_data_subset.empty:
                            print(f"INFO: No valid data for {factor_display_name} vs {ret_col} after dropna.")
                            continue

                        # --- Run Analyses ---
                        try: # Add try-except around individual analyses
                            mean_ret_q, cum_ret_q = quantile_analysis(analysis_data_subset.copy(), factor_display_name, num_quantiles=num_quantiles, ret_col=ret_col)
                            if mean_ret_q is not None: all_quantile_mean_rets[ret_col] = mean_ret_q
                            if cum_ret_q is not None: all_quantile_cum_rets[ret_col] = cum_ret_q

                            ic_summary, daily_ic = calculate_ic(analysis_data_subset.copy(), factor_display_name, ret_col=ret_col, method=ic_method)
                            if ic_summary is not None: all_ic_summaries.append(ic_summary)
                            if daily_ic is not None: all_daily_ics[ret_col] = daily_ic

                            factor_daily_ret, factor_cum_ret, _, factor_ann_stats = calculate_factor_returns(
                                analysis_data_subset.copy(), factor_display_name, ret_col=ret_col
                            )
                            if factor_daily_ret is not None: factor_daily_returns_dict[ret_col] = factor_daily_ret
                            if factor_cum_ret is not None: cumulative_factor_returns_dict[ret_col] = factor_cum_ret
                            if factor_ann_stats is not None: factor_analysis_summary.append(factor_ann_stats)

                            analysis_performed = True # Mark that at least one analysis ran
                        except Exception as e_analyze_period:
                             print(f"ERROR during analysis of {factor_display_name} vs {ret_col}: {e_analyze_period}")
                             traceback.print_exc() # Print detailed error for this period

                print(f"Analysis calculations finished. ({(time.time() - analysis_timer_start):.2f}s)")

                # --- Save Results ---
                if analysis_performed:
                    print(f"\n--- Saving results for {factor_display_name} to Excel ---")
                    save_timer_start = time.time()
                    # --- Save each result type using the helper defined outside the loop ---
                    if all_ic_summaries: sheets_saved_this_factor += save_to_excel_combined(pd.concat(all_ic_summaries, axis=1), 'IC_Sum', factor_display_name, writer)
                    if all_daily_ics: sheets_saved_this_factor += save_to_excel_combined(pd.concat(all_daily_ics, axis=1), 'IC_Daily', factor_display_name, writer)
                    if 'ic_decay_results' in locals() and not ic_decay_results.empty: sheets_saved_this_factor += save_to_excel_combined(ic_decay_results.to_frame(), 'IC_Decay', factor_display_name, writer)
                    if all_quantile_mean_rets: sheets_saved_this_factor += save_to_excel_combined(pd.concat(all_quantile_mean_rets, axis=1, join='outer').rename_axis('Quantile'), 'Q_MeanRet', factor_display_name, writer)
                    if all_quantile_cum_rets:
                        all_dfs_cum = []
                        for ret_p, cum_df in all_quantile_cum_rets.items():
                            if cum_df is not None and not cum_df.empty:
                                cum_df.columns.name = 'Quantile'; cum_df.columns = pd.MultiIndex.from_product([[ret_p], cum_df.columns], names=['Return_Period', 'Quantile'])
                                all_dfs_cum.append(cum_df)
                        sheets_saved_this_factor += save_to_excel_combined(pd.concat(all_dfs_cum, axis=1, join='outer') if all_dfs_cum else pd.DataFrame(), 'Q_CumRet', factor_display_name, writer)
                    if 'all_quantile_turnover' in locals() and not all_quantile_turnover.empty: sheets_saved_this_factor += save_to_excel_combined(all_quantile_turnover, 'Q_Turnover', factor_display_name, writer)
                    if factor_analysis_summary: sheets_saved_this_factor += save_to_excel_combined(pd.concat(factor_analysis_summary, axis=1).rename_axis('Metric'), 'Fctr_Stats', factor_display_name, writer)
                    if factor_daily_returns_dict: sheets_saved_this_factor += save_to_excel_combined(pd.concat(factor_daily_returns_dict, axis=1), 'Fctr_Ret', factor_display_name, writer)
                    if cumulative_factor_returns_dict: sheets_saved_this_factor += save_to_excel_combined(pd.concat(cumulative_factor_returns_dict, axis=1), 'Fctr_CumRet', factor_display_name, writer)

                    if sheets_saved_this_factor > 0:
                        print(f"--- Results for {factor_display_name} saved ({sheets_saved_this_factor} sheets). ({(time.time() - save_timer_start):.2f}s)---")
                        overall_success = True # Mark that at least one sheet was saved overall
                    else:
                        print(f"--- No data frames were valid for saving for {factor_display_name}. ---")

                else:
                    print(f"\n--- No analysis performed for factor '{factor_display_name}'. No results saved. ---")

                print(f"--- Factor {factor_name} processing time: {(time.time() - factor_timer_start):.2f}s ---")
            # --- End of loop through factors ---

            if overall_success:
                 print(f"\nAll factors processed. Finalizing Excel file: {combined_output_filename}")
            else:
                 print(f"\nWARNING: All factors processed, but no analysis results were generated or saved.")

            # ExcelWriter context manager handles saving on exit IF overall_success is True implicitly

    # --- End of Excel Writer context ---
    except ImportError:
        print("\nERROR: Could not prepare Excel file. `openpyxl` library not found.")
        print("Please install it: pip install openpyxl")
    except Exception as e_main_loop:
        print(f"\nERROR occurred during factor analysis loop or Excel writing: {e_main_loop}")
        traceback.print_exc()

# --- End of Script ---
print("\n=============================================")
print("=== Combined Factor Analysis Script Finished ===")
print("=============================================")

INFO: Start date localized to UTC: 2020-03-24 00:00:00+00:00
INFO: End date localized to UTC: 2025-03-24 00:00:00+00:00

--- Attempting to define universe based on index: ^HSI ---
Attempting to get constituents for ^HSI on 2020-03-24.
INFO: Using asset universe (Count: 83): ['0001.HK', '0002.HK', '0003.HK', '0005.HK', '0006.HK', '0011.HK', '0012.HK', '0016.HK', '0027.HK', '0066.HK']...
Using pandas_market_calendars for HK business days. Full fetch range index length: 1319
Target Analysis Date Range: 2020-03-24 00:00:00+00:00 to 2025-03-24 00:00:00+00:00 (1231 analysis days)

--- Downloading Price and Total Volume Data ---
Fetching data from 2019-12-16 to 2025-04-29 for 83 assets + benchmark ^HSI...


[*********************100%***********************]  83 of 83 completed


Asset price/volume data processed. Shape: (1319, 83)
Benchmark data processed. Length: 1319

--- Fetching Industry Classification Data ---


Fetching Industries: 100%|██████████| 83/83 [01:21<00:00,  1.02it/s]


Created Static Industry Dummies shape: (83, 50)

--- Loading/Defining Pre-calculated Factors ---
Attempting to load factors from: processed_alpha_data_values.csv
Successfully loaded factors from file. Initial shape: (101177, 17)
INFO: Loaded data appears to be in Format 2 (Stacked). Unstacking...
Successfully unstacked factors to Format 1 (Wide).
INFO: Localizing factor index timezone...
INFO: Converting factor index timezone...
Reindexing loaded factors to match analysis dates (1231) and assets (83)...
Final precalculated factors DataFrame ready. Shape: (1231, 1411)
Available factors: ['Price_Momentum_10D', 'Rate_of_Change_10D', 'MA_Crossover_10_50', 'Volume_Momentum_50D', 'Mean_Reversion_20D', 'Moving_Avg_Reversion_20D', 'Stochastic_Oscillator_K_14D', 'ATR_14D', 'Daily_High_Low_Range', 'Norm_Bollinger_Width_20D', 'Volume_ROC_10D', 'Trading_Volume', 'Moving_Average_20D', 'Exponential_MA_20D', 'RSI_14D', 'Bollinger_Ratio_LB_20D', 'Stochastic_Oscillator_D_14D']

--- Calculating Style Fa

                                                                 

 - Calculated Beta.
 - Calculated Residual Volatility.
Style factors calculation finished. Final Shape: (102173, 3)

--- Defining/Importing Analysis Functions ---

Calculating forward returns for analysis periods: (1, 3, 5) days...

Calculating forward returns for IC decay (up to 20 days)...
Calculating fwd returns for decay (1 to 20 days)...


                                                                   

Finished calculating 20 forward returns for decay.

Preparing single Excel output file: factor_analysis_output_combined/combined_factor_analysis_results.xlsx

Analyzing 17 factors found in the input DataFrame...



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for Price_Momentum_10D. (13.26s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Price_Momentum_10D' for analysis.
Clean aligned data ready for Price_Momentum__Raw. Shape: (98780, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Price_Momentum__Raw vs 1D_fwd_ret =====

===== Analyzing Price_Momentum__Raw vs 3D_fwd_ret =====

===== Analyzing Price_Momentum__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.15s)

--- Saving results for Price_Momentum__Raw to Excel ---
--- Results for Price_Momentum__Raw saved (9 sheets). (0.27s)---
--- Factor Price_Momentum_10D processing time: 28.68s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for Rate_of_Change_10D. (11.99s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Rate_of_Change_10D' for analysis.
Clean aligned data ready for Rate_of_Change__Raw. Shape: (98780, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Rate_of_Change__Raw vs 1D_fwd_ret =====

===== Analyzing Rate_of_Change__Raw vs 3D_fwd_ret =====

===== Analyzing Rate_of_Change__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.30s)

--- Saving results for Rate_of_Change__Raw to Excel ---
--- Results for Rate_of_Change__Raw saved (9 sheets). (0.27s)---
--- Factor Rate_of_Change_10D processing time: 27.56s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for MA_Crossover_10_50. (11.66s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'MA_Crossover_10_50' for analysis.
Clean aligned data ready for MA_Crossover_10_Raw. Shape: (95543, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing MA_Crossover_10_Raw vs 1D_fwd_ret =====

===== Analyzing MA_Crossover_10_Raw vs 3D_fwd_ret =====

===== Analyzing MA_Crossover_10_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.62s)

--- Saving results for MA_Crossover_10_Raw to Excel ---
--- Results for MA_Crossover_10_Raw saved (9 sheets). (0.24s)---
--- Factor MA_Crossover_10_50 processing time: 27.51s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                      

Neutralization completed for Volume_Momentum_50D. (12.53s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Volume_Momentum_50D' for analysis.
Clean aligned data ready for Volume_Momentum_Raw. Shape: (95460, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Volume_Momentum_Raw vs 1D_fwd_ret =====

===== Analyzing Volume_Momentum_Raw vs 3D_fwd_ret =====

===== Analyzing Volume_Momentum_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.26s)

--- Saving results for Volume_Momentum_Raw to Excel ---
--- Results for Volume_Momentum_Raw saved (9 sheets). (0.23s)---
--- Factor Volume_Momentum_50D processing time: 28.03s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for Mean_Reversion_20D. (13.00s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Mean_Reversion_20D' for analysis.
Clean aligned data ready for Mean_Reversion__Raw. Shape: (98033, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Mean_Reversion__Raw vs 1D_fwd_ret =====

===== Analyzing Mean_Reversion__Raw vs 3D_fwd_ret =====

===== Analyzing Mean_Reversion__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.66s)

--- Saving results for Mean_Reversion__Raw to Excel ---
--- Results for Mean_Reversion__Raw saved (9 sheets). (0.24s)---
--- Factor Mean_Reversion_20D processing time: 28.90s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                          

Neutralization completed for Moving_Avg_Reversion_20D. (15.88s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Moving_Avg_Reversion_20D' for analysis.
Clean aligned data ready for Moving_Avg_Reve_Raw. Shape: (98033, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Moving_Avg_Reve_Raw vs 1D_fwd_ret =====

===== Analyzing Moving_Avg_Reve_Raw vs 3D_fwd_ret =====

===== Analyzing Moving_Avg_Reve_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (16.47s)

--- Saving results for Moving_Avg_Reve_Raw to Excel ---
--- Results for Moving_Avg_Reve_Raw saved (9 sheets). (0.31s)---
--- Factor Moving_Avg_Reversion_20D processing time: 32.67s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                              

Neutralization completed for Stochastic_Oscillator_K_14D. (12.90s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Stochastic_Oscillator_K_14D' for analysis.
Clean aligned data ready for Stochastic_Osci_Raw. Shape: (98531, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Stochastic_Osci_Raw vs 1D_fwd_ret =====

===== Analyzing Stochastic_Osci_Raw vs 3D_fwd_ret =====

===== Analyzing Stochastic_Osci_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.63s)

--- Saving results for Stochastic_Osci_Raw to Excel ---
--- Results for Stochastic_Osci_Raw saved (9 sheets). (0.31s)---
--- Factor Stochastic_Oscillator_K_14D processing time: 28.84s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                          

Neutralization completed for ATR_14D. (12.28s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'ATR_14D' for analysis.
Clean aligned data ready for ATR_14D_Raw. Shape: (98531, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing ATR_14D_Raw vs 1D_fwd_ret =====

===== Analyzing ATR_14D_Raw vs 3D_fwd_ret =====

===== Analyzing ATR_14D_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.55s)

--- Saving results for ATR_14D_Raw to Excel ---
--- Results for ATR_14D_Raw saved (9 sheets). (0.23s)---
--- Factor ATR_14D processing time: 28.08s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                       

Neutralization completed for Daily_High_Low_Range. (12.08s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Daily_High_Low_Range' for analysis.
Clean aligned data ready for Daily_High_Low__Raw. Shape: (98890, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Daily_High_Low__Raw vs 1D_fwd_ret =====

===== Analyzing Daily_High_Low__Raw vs 3D_fwd_ret =====

===== Analyzing Daily_High_Low__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.82s)

--- Saving results for Daily_High_Low__Raw to Excel ---
--- Results for Daily_High_Low__Raw saved (9 sheets). (0.37s)---
--- Factor Daily_High_Low_Range processing time: 28.27s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                           

Neutralization completed for Norm_Bollinger_Width_20D. (11.77s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Norm_Bollinger_Width_20D' for analysis.
Clean aligned data ready for Norm_Bollinger__Raw. Shape: (98033, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Norm_Bollinger__Raw vs 1D_fwd_ret =====

===== Analyzing Norm_Bollinger__Raw vs 3D_fwd_ret =====

===== Analyzing Norm_Bollinger__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.80s)

--- Saving results for Norm_Bollinger__Raw to Excel ---
--- Results for Norm_Bollinger__Raw saved (9 sheets). (0.24s)---
--- Factor Norm_Bollinger_Width_20D processing time: 27.80s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                 

Neutralization completed for Volume_ROC_10D. (12.49s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Volume_ROC_10D' for analysis.
Clean aligned data ready for Volume_ROC_10D_Raw. Shape: (98058, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Volume_ROC_10D_Raw vs 1D_fwd_ret =====

===== Analyzing Volume_ROC_10D_Raw vs 3D_fwd_ret =====

===== Analyzing Volume_ROC_10D_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.82s)

--- Saving results for Volume_ROC_10D_Raw to Excel ---
--- Results for Volume_ROC_10D_Raw saved (9 sheets). (0.25s)---
--- Factor Volume_ROC_10D processing time: 28.56s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                

Neutralization completed for Trading_Volume. (13.91s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Trading_Volume' for analysis.
Clean aligned data ready for Trading_Volume_Raw. Shape: (98890, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Trading_Volume_Raw vs 1D_fwd_ret =====

===== Analyzing Trading_Volume_Raw vs 3D_fwd_ret =====

===== Analyzing Trading_Volume_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.83s)

--- Saving results for Trading_Volume_Raw to Excel ---
--- Results for Trading_Volume_Raw saved (9 sheets). (0.34s)---
--- Factor Trading_Volume processing time: 30.08s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for Moving_Average_20D. (13.73s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Moving_Average_20D' for analysis.
Clean aligned data ready for Moving_Average__Raw. Shape: (98033, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Moving_Average__Raw vs 1D_fwd_ret =====

===== Analyzing Moving_Average__Raw vs 3D_fwd_ret =====

===== Analyzing Moving_Average__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.74s)

--- Saving results for Moving_Average__Raw to Excel ---
--- Results for Moving_Average__Raw saved (9 sheets). (0.24s)---
--- Factor Moving_Average_20D processing time: 29.72s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                     

Neutralization completed for Exponential_MA_20D. (13.92s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Exponential_MA_20D' for analysis.
Clean aligned data ready for Exponential_MA__Raw. Shape: (98890, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Exponential_MA__Raw vs 1D_fwd_ret =====

===== Analyzing Exponential_MA__Raw vs 3D_fwd_ret =====

===== Analyzing Exponential_MA__Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.94s)

--- Saving results for Exponential_MA__Raw to Excel ---
--- Results for Exponential_MA__Raw saved (9 sheets). (0.24s)---
--- Factor Exponential_MA_20D processing time: 30.10s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                          

Neutralization completed for RSI_14D. (13.37s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'RSI_14D' for analysis.
Clean aligned data ready for RSI_14D_Raw. Shape: (98531, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing RSI_14D_Raw vs 1D_fwd_ret =====

===== Analyzing RSI_14D_Raw vs 3D_fwd_ret =====

===== Analyzing RSI_14D_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.88s)

--- Saving results for RSI_14D_Raw to Excel ---
--- Results for RSI_14D_Raw saved (9 sheets). (0.26s)---
--- Factor RSI_14D processing time: 29.51s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                        

Neutralization completed for Bollinger_Ratio_LB_20D. (13.30s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Bollinger_Ratio_LB_20D' for analysis.
Clean aligned data ready for Bollinger_Ratio_Raw. Shape: (98033, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Bollinger_Ratio_Raw vs 1D_fwd_ret =====

===== Analyzing Bollinger_Ratio_Raw vs 3D_fwd_ret =====

===== Analyzing Bollinger_Ratio_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.57s)

--- Saving results for Bollinger_Ratio_Raw to Excel ---
--- Results for Bollinger_Ratio_Raw saved (9 sheets). (0.25s)---
--- Factor Bollinger_Ratio_LB_20D processing time: 29.13s ---



--- Performing Factor Neutralization ---
Running neutralization regression day by day...


                                                                                              

Neutralization completed for Stochastic_Oscillator_D_14D. (12.64s)

--- Starting Factor Analysis ---
INFO: Using RAW factor 'Stochastic_Oscillator_D_14D' for analysis.
Clean aligned data ready for Stochastic_Osci_Raw. Shape: (98531, 4)
Calculating IC Decay...


                                                         


--- Calculating Quantile Turnover ---

===== Analyzing Stochastic_Osci_Raw vs 1D_fwd_ret =====

===== Analyzing Stochastic_Osci_Raw vs 3D_fwd_ret =====

===== Analyzing Stochastic_Osci_Raw vs 5D_fwd_ret =====
Analysis calculations finished. (15.83s)

--- Saving results for Stochastic_Osci_Raw to Excel ---
--- Results for Stochastic_Osci_Raw saved (9 sheets). (0.25s)---
--- Factor Stochastic_Oscillator_D_14D processing time: 28.72s ---

All factors processed. Finalizing Excel file: factor_analysis_output_combined/combined_factor_analysis_results.xlsx

=== Combined Factor Analysis Script Finished ===
