In [None]:
# Suppress warnings for cleaner output.
# Only the FutureWarning and UserWarning categories are silenced here; all
# other warning categories are left untouched by these two calls.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Gold Futures Forecasting with Chronos Models: 2016-2019 Analysis

## Comprehensive Performance Evaluation in Stable Market Conditions

### Objectives
1. Evaluate Chronos model performance on gold futures forecasting during the 2016-2019 period
2. Use a rolling-window approach with optimized context lengths for next-day predictions
3. Analyze performance in stable, pre-COVID market conditions (2016-2019)
4. Compare against baseline models using standardized metrics (MASE, MAE, RMSE, directional accuracy)
5. Perform systematic configuration optimization for stable market conditions
6. Provide comparison baseline for volatile market period analysis (2020-2021)

### Methodology
- **Data**: GCUSD (Gold Futures) daily OHLCV data from 2016-2019 (4-year stable period)
- **Models**: Chronos-Bolt family with systematic configuration optimization
- **Evaluation**: Rolling window approach with configurable context lengths (30, 63, 126, 252 days)
- **Benchmarking**: Comprehensive metrics including MASE, MAE, RMSE, MAPE, and directional accuracy
- **Optimization**: Systematic testing of model sizes, context windows, and prediction horizons
- **Market Context**: Analysis of stable, lower-volatility market conditions

### Key Hypothesis
The 2016-2019 period represents stable market conditions with:
- **Lower volatility** compared to COVID-era markets (2020-2021)
- **Gradual price movements** with fewer extreme events
- **Consistent market dynamics** favoring pattern recognition
- **Optimal conditions** for sophisticated forecasting models

This analysis will test whether Chronos models perform better relative to naive baselines in stable market conditions, providing a crucial comparison point for understanding market regime effects on forecasting performance.

## 1. Environment Setup and Dependencies

In [ ]:
# Install required packages
print("Installing required packages for 2016-2019 analysis...")
import subprocess
import sys
import os

def install_package(package_name, alternative_name=None):
    """Install a package with pip, retrying once with ``--break-system-packages``.

    Args:
        package_name: Name handed to ``pip install``.
        alternative_name: Accepted for call compatibility but not used by
            this function (NOTE(review): the values passed below look like
            distro package names — confirm whether a fallback was intended).

    Returns:
        True if one of the two pip invocations succeeds, otherwise False.
    """
    base_command = [sys.executable, "-m", "pip", "install", package_name]
    attempts = (
        (["--quiet"], f"✅ {package_name} installed via pip"),
        # Not recommended, but sometimes necessary on externally managed environments
        (["--break-system-packages", "--quiet"],
         f"✅ {package_name} installed via pip (system packages)"),
    )

    last_error = None
    for extra_flags, success_message in attempts:
        try:
            subprocess.run(base_command + extra_flags, check=True, capture_output=True)
        except subprocess.CalledProcessError as exc:
            last_error = exc
            continue
        print(success_message)
        return True

    print(f"❌ Failed to install {package_name}")
    # capture_output=True swallows pip's own output, so surface the last
    # stderr line; otherwise the reason for the failure is lost entirely.
    stderr_text = (last_error.stderr or b"").decode("utf-8", errors="replace").strip()
    if stderr_text:
        print(f"   pip reported: {stderr_text.splitlines()[-1]}")
    return False

# Install packages in order of importance
packages = [
    ("pandas", "python3-pandas"),
    ("numpy", "python3-numpy"),
    ("matplotlib", "python3-matplotlib"),
    ("seaborn", "python3-seaborn"),
    ("scipy", "python3-scipy"),
    ("scikit-learn", "python3-sklearn"),
    ("torch", None),  # PyTorch for Chronos
    ("chronos-forecasting", None),
    ("fev", None),  # FEV - Forecast Evaluation Framework
    ("datasets", None),  # Hugging Face datasets (required for FEV)
    ("plotly", "python3-plotly"),
    ("bokeh", "python3-bokeh"),
    ("ipywidgets", "python3-ipywidgets")
]

print("Installing core packages...")
# Track failures so the closing message does not hide a broken environment
failed_packages = []
for package, alt_name in packages:
    if not install_package(package, alt_name):
        failed_packages.append(package)

if failed_packages:
    print(f"\n⚠️ Packages that could not be installed: {', '.join(failed_packages)}")

print("\n2016-2019 analysis environment setup completed!")

In [None]:
# Import necessary libraries for 2016-2019 analysis
import pandas as pd
import numpy as np
import warnings
# Blanket filter: no category is given, so this ignores every warning category
warnings.filterwarnings('ignore')

# Core plotting libraries
# On ImportError, pip-install both packages and retry the import once.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✅ Matplotlib and seaborn imported successfully")
except ImportError as e:
    print(f"❌ Error importing matplotlib/seaborn: {e}")
    # Handle installation if needed
    import subprocess
    import sys
    # No check=True here: a failed install surfaces as ImportError on the retry below
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'matplotlib', 'seaborn', '--break-system-packages', '--quiet'])
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✅ Matplotlib and seaborn installed and imported")

from datetime import datetime, timedelta

# Chronos imports
# Same install-and-retry pattern as above, for torch + chronos.
try:
    import torch
    from chronos import BaseChronosPipeline
    print("✅ Chronos imports successful")
except ImportError as e:
    print(f"❌ Error importing Chronos: {e}")
    print("Installing chronos-forecasting...")
    import subprocess
    import sys
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'chronos-forecasting', '--break-system-packages', '--quiet'])
    import torch
    from chronos import BaseChronosPipeline
    print("✅ Chronos installed and imported")

# FEV imports (Forecast Evaluation Framework)
# Optional dependency: fev_available records whether the import succeeded.
fev_available = False
try:
    import fev
    from datasets import Dataset
    fev_available = True
    print("✅ FEV imports successful")
    print(f"FEV version: {fev.__version__ if hasattr(fev, '__version__') else 'Version info not available'}")
except ImportError as e:
    print(f"❌ FEV not available: {e}")
    print("Will use alternative evaluation framework")

# Interactive visualization imports
# Optional dependency: plotly_available records whether the import succeeded.
plotly_available = False
try:
    import plotly.graph_objects as go
    import plotly.express as px
    from plotly.subplots import make_subplots
    import plotly.io as pio
    # Set the default renderer used when figures are displayed
    pio.renderers.default = "plotly_mimetype+notebook"
    plotly_available = True
    print("✅ Plotly imports successful with notebook renderer")
except ImportError as e:
    print(f"⚠️ Plotly not available: {e}")
    print("Will use matplotlib fallbacks")

# Statistical analysis
# Required dependency: install-and-retry on ImportError, like the plotting imports.
try:
    from scipy import stats
    from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
    print("✅ Statistical analysis imports successful")
except ImportError as e:
    print(f"❌ Error importing scipy/sklearn: {e}")
    import subprocess
    import sys
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scipy', 'scikit-learn', '--break-system-packages', '--quiet'])
    from scipy import stats
    from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
    print("✅ Statistical analysis packages installed and imported")

# Summary of which optional frameworks are usable in this session
print(f"\n🎉 All libraries imported successfully for 2016-2019 analysis!")
print(f"FEV Framework Available: {'✅ Yes' if fev_available else '❌ No (using alternatives)'}")
print(f"Plotly Available: {'✅ Yes' if plotly_available else '❌ No (using matplotlib)'}")

## 2. Data Loading and Preprocessing for 2016-2019 Period

In [ ]:
# Load gold futures data and filter to 2016-2019 period
try:
    df = pd.read_csv('GCUSD_MAX_FROM_PERPLEXITY.csv')
    print("✅ Data loaded successfully")
except FileNotFoundError as exc:
    print("❌ Error: GCUSD_MAX_FROM_PERPLEXITY.csv not found")
    print("Please ensure the data file is in the current directory.")
    # Chain the original error so the traceback keeps the missing path
    raise FileNotFoundError("Data file required for 2016-2019 analysis") from exc

# Parse and sort the dates BEFORE reporting the range: min()/max() on the raw
# string column compare lexicographically, which is only chronological for
# ISO-formatted dates.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Display basic info about full dataset
print(f"Full dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Full date range: {df['Date'].min()} to {df['Date'].max()}")

# Filter for 2016-2019 data (both bounds inclusive)
start_date = '2016-01-01'
end_date = '2019-12-31'
df = df.set_index('Date')
mask = (df.index >= start_date) & (df.index <= end_date)
df = df[mask]

# Reset index to get Date as column again
df = df.reset_index()
data_2016_2019 = df.copy()

print(f"\n📅 2016-2019 PERIOD DATA:")
print(f"Filtered dataset shape: {data_2016_2019.shape}")
print(f"Date range: {data_2016_2019['Date'].min()} to {data_2016_2019['Date'].max()}")
print(f"Number of trading days: {len(data_2016_2019)}")

# Verify sufficient data for analysis: fewer than 300 rows aborts the run
if len(data_2016_2019) < 300:
    print(f"❌ Error: Only {len(data_2016_2019)} days of data available")
    print("Insufficient data for robust analysis. Need at least 300 days.")
    raise ValueError("Insufficient data for 2016-2019 analysis")
elif len(data_2016_2019) >= 1000:
    print(f"✅ Excellent: {len(data_2016_2019)} days provides sufficient data for robust analysis")
else:
    print(f"✅ Good: {len(data_2016_2019)} days provides adequate data for analysis")

print(f"\nFirst few rows of 2016-2019 data:")
print(data_2016_2019.head())
print(f"\nLast few rows of 2016-2019 data:")
print(data_2016_2019.tail())

In [ ]:
# Data preprocessing for 2016-2019 period
def preprocess_data_2016_2019(df):
    """
    Prepare the 2016-2019 gold futures frame for time series analysis.

    Missing values are forward-filled, a ``Target`` column holding the next
    row's ``Close`` is added, and the final row (which has no next close) is
    dropped. The input frame is not modified; the result has a fresh
    0-based index.
    """
    # ffill() returns a new frame, so the caller's data stays untouched
    filled = df.ffill()

    # Next-day close becomes the prediction target
    with_target = filled.assign(Target=filled['Close'].shift(-1))

    # The last row has no target available
    return with_target.iloc[:-1].reset_index(drop=True)

# Run preprocessing on the filtered 2016-2019 frame
data = preprocess_data_2016_2019(data_2016_2019)
print("✅ 2016-2019 data preprocessing completed")

print("\n".join([
    f"Final dataset shape: {data.shape}",
    f"Date range: {data['Date'].min()} to {data['Date'].max()}",
    f"Number of trading days: {len(data)}",
]))

# Summary statistics of the OHLCV columns
print("\n2016-2019 Basic Statistics:")
ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
print(data[ohlcv_columns].describe())

# Headline market characteristics derived from the close series
close_series = data['Close']
daily_vol_pct = close_series.pct_change().std() * 100
total_return_pct = ((close_series.iloc[-1] / close_series.iloc[0]) - 1) * 100

print("\n📊 MARKET CHARACTERISTICS (2016-2019):")
print(f"Price range: ${close_series.min():.2f} - ${close_series.max():.2f}")
print(f"Average daily volatility: {daily_vol_pct:.2f}%")
print(f"Total return: {total_return_pct:.1f}%")

# Annualized volatility per calendar year (years with fewer than 2 rows are omitted)
closes_by_year = {
    year: data.loc[data['Date'].dt.year == year, 'Close']
    for year in range(2016, 2020)
}
yearly_volatility = {
    year: year_closes.pct_change().std() * np.sqrt(252) * 100
    for year, year_closes in closes_by_year.items()
    if len(year_closes) > 1
}

print("\nYearly Volatility Analysis:")
for year, vol in yearly_volatility.items():
    print(f"  {year}: {vol:.1f}% annualized")

# Narrative context for the period
print("\n".join([
    "\n📋 PERIOD CONTEXT:",
    "2016-2019 represents stable market conditions:",
    "  - Pre-COVID stable economic environment",
    "  - Gradual economic growth with low volatility",
    "  - Consistent market patterns",
    "  - Optimal conditions for pattern recognition models",
]))

## 3. Market Regime Analysis: 2016-2019 Stable Period

In [ ]:
# Market regime analysis for 2016-2019 stable period
print("📈 MARKET REGIME ANALYSIS: 2016-2019 STABLE PERIOD")
print("=" * 70)

def analyze_market_regime(data, period_name):
    """
    Summarize price and return characteristics of ``data`` for one period.

    Reads the ``Close`` and ``Volume`` columns and returns a dict of
    statistics (returns, volatility, drawdown, share of up days, ...)
    labelled with ``period_name``. Percent-style values are scaled by 100.
    """
    close = data['Close']
    returns = close.pct_change().dropna()

    first_close = close.iloc[0]
    last_close = close.iloc[-1]
    daily_std = returns.std()
    # Distance of each close below the running peak, as a fraction
    drawdown = (close / close.cummax()) - 1

    return {
        'period': period_name,
        'trading_days': len(data),
        'price_start': first_close,
        'price_end': last_close,
        'total_return': ((last_close / first_close) - 1) * 100,
        'volatility': daily_std * np.sqrt(252) * 100,  # Annualized with 252 trading days
        'daily_volatility': daily_std * 100,
        'skewness': returns.skew(),
        'kurtosis': returns.kurtosis(),
        'max_drawdown': drawdown.min() * 100,
        'positive_days': (returns > 0).mean() * 100,
        'large_moves': (returns.abs() > 0.02).mean() * 100,  # Share of days moving more than 2%
        'price_range': close.max() - close.min(),
        'avg_volume': data['Volume'].mean()
    }

# Regime statistics for the whole 2016-2019 window
regime_2016_2019 = analyze_market_regime(data_2016_2019, "2016-2019")

print("\n".join([
    "\n📊 COMPREHENSIVE MARKET ANALYSIS (2016-2019):",
    f"Trading Days: {regime_2016_2019['trading_days']}",
    f"Price Range: ${regime_2016_2019['price_start']:.2f} - ${regime_2016_2019['price_end']:.2f}",
    f"Total Return: {regime_2016_2019['total_return']:.1f}%",
    f"Annualized Volatility: {regime_2016_2019['volatility']:.1f}%",
    f"Daily Volatility: {regime_2016_2019['daily_volatility']:.2f}%",
    f"Skewness: {regime_2016_2019['skewness']:.2f}",
    f"Kurtosis: {regime_2016_2019['kurtosis']:.2f}",
    f"Max Drawdown: {regime_2016_2019['max_drawdown']:.1f}%",
    f"Positive Days: {regime_2016_2019['positive_days']:.1f}%",
    f"Large Moves (>2%): {regime_2016_2019['large_moves']:.1f}%",
]))

# One sub-frame per calendar year, keyed by the year as a string
sub_periods = {
    str(calendar_year): data_2016_2019[data_2016_2019['Date'].dt.year == calendar_year]
    for calendar_year in range(2016, 2020)
}

print("\n📅 YEARLY BREAKDOWN:")
print(f"{'Year':<6} {'Days':<6} {'Return':<8} {'Volatility':<12} {'Large Moves':<12}")
print("-" * 50)

for year, year_data in sub_periods.items():
    # A single row yields no returns, so such years are left out of the table
    if len(year_data) <= 1:
        continue
    regime = analyze_market_regime(year_data, year)
    print(f"{year:<6} {regime['trading_days']:<6} {regime['total_return']:<8.1f}% {regime['volatility']:<12.1f}% {regime['large_moves']:<12.1f}%")

# Narrative description of each year
print("\n".join([
    "\n🔍 MARKET REGIME INSIGHTS:",
    "-" * 50,
    "1. 📈 2016: Post-financial crisis recovery period",
    "2. 🚀 2017: Stable economic growth with low volatility",
    "3. 📊 2018: Gradual interest rate normalization",
    "4. 💼 2019: Pre-COVID stable market conditions",
]))

# Stated expectations for the Chronos evaluation
print("\n".join([
    "\n💡 CHRONOS PERFORMANCE HYPOTHESIS:",
    "-" * 50,
    "✅ Stable 4-year period with consistent market dynamics",
    "✅ Lower volatility favors sophisticated pattern recognition",
    "✅ Gradual price movements support model learning",
    "✅ Optimal conditions for transformer-based forecasting",
]))

# Annualized volatility under 20% is treated as the "low volatility" case
low_volatility_notes = (
    "✅ Low volatility period should favor Chronos over naive baseline",
    "✅ Stable patterns enable better model performance",
)
high_volatility_notes = (
    "⚠️ Higher than expected volatility may challenge models",
    "⚠️ Naive baseline may still perform competitively",
)
regime_notes = low_volatility_notes if regime_2016_2019['volatility'] < 20 else high_volatility_notes
for note in regime_notes:
    print(note)

print("\n📋 ANALYSIS READY: Proceeding with comprehensive 2016-2019 Chronos evaluation...")
print(f"Analysis period: {len(data)} trading days from {data['Date'].min()} to {data['Date'].max()}")
print("Expected outcome: Chronos should outperform naive baseline in stable conditions")

## 4. Load 2020-2021 Results for Comparison

In [None]:
# Load existing 2020-2021 results for comparison
print("📁 Loading 2020-2021 results for comparative analysis...")


def format_metric_cell(row, column, width, precision=None):
    """Return ``row[column]`` left-aligned in ``width`` characters.

    A missing column is shown as ``'N/A'``. When ``precision`` is given the
    value is rendered as fixed-point; values that cannot be formatted that
    way (such as the ``'N/A'`` placeholder) fall back to plain text instead
    of raising.
    """
    value = row.get(column, 'N/A')
    if precision is not None:
        try:
            return f"{value:<{width}.{precision}f}"
        except (TypeError, ValueError):
            pass
    return f"{value!s:<{width}}"


try:
    # Try to load previous results
    results_2020_2021 = pd.read_csv('gold_futures_forecast_metrics.csv', index_col=0)
    print("✅ 2020-2021 results loaded successfully")
    print(f"Available models: {list(results_2020_2021.index)}")

    # Display key 2020-2021 results for reference
    print(f"\n📊 2020-2021 REFERENCE RESULTS:")
    print(f"{'Model':<20} {'MASE':<8} {'MAE':<8} {'Dir.Acc':<10} {'RMSE':<8}")
    print("-" * 60)
    for model, row in results_2020_2021.iterrows():
        print(
            f"{model!s:<20} "
            f"{format_metric_cell(row, 'MASE', 8)} "
            f"{format_metric_cell(row, 'MAE', 8, 2)} "
            f"{format_metric_cell(row, 'Directional_Accuracy', 10)} "
            f"{format_metric_cell(row, 'RMSE', 8, 2)}"
        )

    # Key findings from 2020-2021 (only when both rows and the MASE column exist)
    has_mase = 'MASE' in results_2020_2021.columns
    naive_mase_2021 = (results_2020_2021.loc['Naive', 'MASE']
                       if has_mase and 'Naive' in results_2020_2021.index else None)
    chronos_mase_2021 = (results_2020_2021.loc['Chronos', 'MASE']
                         if has_mase and 'Chronos' in results_2020_2021.index else None)

    # Explicit None checks: a legitimate 0.0 Chronos MASE must not be skipped,
    # while a zero naive MASE would make the relative gap undefined.
    if naive_mase_2021 is not None and chronos_mase_2021 is not None and naive_mase_2021 != 0:
        performance_gap_2021 = ((chronos_mase_2021 - naive_mase_2021) / naive_mase_2021) * 100
        # Lower MASE is better, so a positive gap means Chronos trails the baseline
        relation = 'behind' if performance_gap_2021 > 0 else 'ahead of'
        print(f"\n🎯 2020-2021 KEY FINDING: Chronos MASE was {abs(performance_gap_2021):.1f}% {relation} naive baseline")
        print(f"   Naive: {naive_mase_2021:.4f} vs Chronos: {chronos_mase_2021:.4f}")

except FileNotFoundError:
    print("⚠️ Previous 2020-2021 results not found")
    print("Will generate reference metrics for comparison after 2016-2019 analysis")
    results_2020_2021 = None

print(f"\n✅ Ready to run 2016-2019 analysis and compare with 2020-2021 period")

## 5. Run Complete 2016-2019 Analysis

In [ ]:
# Complete 2016-2019 Chronos Analysis Implementation
print("🚀 EXECUTING COMPLETE 2016-2019 CHRONOS ANALYSIS")
print("=" * 70)

import time
from typing import Dict, List, Tuple, Optional
import traceback

# Define evaluation metrics functions
def calculate_mase(y_true: np.ndarray, y_pred: np.ndarray, y_train: np.ndarray) -> float:
    """Return the Mean Absolute Scaled Error of a forecast.

    The forecast MAE is divided by the mean absolute one-step change of
    ``y_train`` (the in-sample error of a last-value forecast). NaN is
    returned when ``y_train`` has fewer than two points or never changes.
    """
    if len(y_train) < 2:
        return np.nan

    # Scale: average absolute step between consecutive training values
    scale = np.mean(np.abs(np.diff(y_train)))
    if scale == 0:
        return np.nan

    abs_errors = np.abs(y_true - y_pred)
    return np.mean(abs_errors) / scale

def calculate_directional_accuracy(y_true: np.ndarray, y_pred: np.ndarray, y_prev: np.ndarray) -> float:
    """Return the percentage of steps whose predicted direction matches the actual one.

    Direction is the sign of the move away from ``y_prev``; a flat move
    (sign 0) only matches another flat move. NaN is returned when the three
    inputs differ in length.
    """
    n_points = len(y_true)
    if not (len(y_pred) == n_points == len(y_prev)):
        return np.nan

    same_direction = np.sign(y_true - y_prev) == np.sign(y_pred - y_prev)
    return (same_direction.sum() / n_points) * 100

def calculate_comprehensive_metrics(y_true: np.ndarray, y_pred: np.ndarray, 
                                  y_train: np.ndarray, y_prev: np.ndarray) -> Dict:
    """Compute MAE, RMSE, MAPE, MASE, directional accuracy and MSE for a forecast.

    Positions where either ``y_true`` or ``y_pred`` is NaN are dropped from
    ``y_true``, ``y_pred`` and ``y_prev`` before scoring. ``Count`` is the
    number of positions kept. If nothing is left, or any metric computation
    raises, every metric is NaN and ``Count`` is 0.
    """

    def _nan_metrics():
        # Same key order as the populated result
        blank = dict.fromkeys(
            ('MAE', 'RMSE', 'MAPE', 'MASE', 'Directional_Accuracy', 'MSE'), np.nan
        )
        blank['Count'] = 0
        return blank

    # Keep only positions where both the actual and the prediction are present
    valid = ~np.isnan(y_true) & ~np.isnan(y_pred)
    truth = y_true[valid]
    preds = y_pred[valid]
    prevs = y_prev[valid]

    if len(truth) == 0:
        return _nan_metrics()

    try:
        mse = mean_squared_error(truth, preds)
        # The denominator is floored at 1e-8 so zero actuals do not divide by zero
        abs_pct_errors = np.abs((truth - preds) / np.maximum(np.abs(truth), 1e-8))

        return {
            'MAE': mean_absolute_error(truth, preds),
            'RMSE': np.sqrt(mse),
            'MAPE': np.mean(abs_pct_errors) * 100,
            'MASE': calculate_mase(truth, preds, y_train),
            'Directional_Accuracy': calculate_directional_accuracy(truth, preds, prevs),
            'MSE': mse,
            'Count': len(truth)
        }
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return _nan_metrics()

# Load Chronos model
# Tries "amazon/chronos-bolt-base" first and falls back to
# "amazon/chronos-t5-base"; model_name ends up naming whichever one loaded.
print("📥 Loading Chronos model...")
try:
    model_name = "amazon/chronos-bolt-base"
    pipeline = BaseChronosPipeline.from_pretrained(model_name)
    print(f"✅ Loaded {model_name} successfully")
except Exception as e:
    print(f"❌ Error loading Chronos model: {e}")
    print("Trying alternative model...")
    try:
        model_name = "amazon/chronos-t5-base"
        pipeline = BaseChronosPipeline.from_pretrained(model_name)
        print(f"✅ Loaded {model_name} successfully")
    except Exception as e2:
        print(f"❌ Failed to load any Chronos model: {e2}")
        # Bare raise re-raises the fallback's error; nothing later can run without a pipeline
        raise

# Define forecasting functions
def generate_chronos_forecast(pipeline, historical_data: np.ndarray, 
                             prediction_length: int = 1, num_samples: int = 100) -> np.ndarray:
    """Generate a point forecast from a Chronos pipeline.

    Args:
        pipeline: Object exposing ``predict(context, prediction_length=...)``
            that returns a tensor whose dim 1 is reduced with a median here.
            Presumably that dim holds samples (Chronos-T5) or quantile
            levels (Chronos-Bolt) — confirm against the installed version.
        historical_data: 1-D array of past values used as context.
        prediction_length: Number of future steps to forecast.
        num_samples: Sample count, forwarded only when ``pipeline.predict``
            declares a ``num_samples`` parameter.

    Returns:
        Array of ``prediction_length`` forecast values, or an array of NaN
        of that length if the pipeline call fails (the error is printed).
    """
    import inspect  # local import: only this helper needs it

    try:
        # Add a leading batch dimension: (1, len(historical_data))
        context = torch.tensor(historical_data, dtype=torch.float32).unsqueeze(0)

        # Not every pipeline accepts num_samples (Chronos-Bolt's predict does
        # not, per the chronos-forecasting API). Passing it unconditionally
        # raised TypeError on every call and turned every forecast into NaN.
        predict_kwargs = {'prediction_length': prediction_length}
        try:
            predict_params = inspect.signature(pipeline.predict).parameters
        except (TypeError, ValueError):
            predict_params = {}
        if 'num_samples' in predict_params:
            predict_kwargs['num_samples'] = num_samples

        # The context is passed positionally so the call does not depend on
        # the name of predict's first parameter.
        forecast = pipeline.predict(context, **predict_kwargs)

        # Median across dim 1 gives one value per forecast step
        forecast_median = forecast.median(dim=1).values
        return forecast_median.detach().cpu().numpy().flatten()

    except Exception as e:
        # Best effort: one failed step must not abort the whole evaluation
        print(f"Error in Chronos forecast: {e}")
        return np.array([np.nan] * prediction_length)

def generate_naive_forecast(historical_data: np.ndarray, prediction_length: int = 1) -> np.ndarray:
    """Repeat the last observed value ``prediction_length`` times (NaN if there is no history)."""
    fill_value = historical_data[-1] if len(historical_data) != 0 else np.nan
    return np.array([fill_value for _ in range(prediction_length)])

def generate_ma_forecast(historical_data: np.ndarray, window: int = 5, 
                        prediction_length: int = 1) -> np.ndarray:
    """Forecast every step as the mean of the last ``window`` observations.

    With fewer than ``window`` observations the naive (last value) forecast
    is returned instead.
    """
    if len(historical_data) >= window:
        window_mean = np.mean(historical_data[-window:])
        return np.array([window_mean for _ in range(prediction_length)])

    return generate_naive_forecast(historical_data, prediction_length)

def generate_linear_trend_forecast(historical_data: np.ndarray, window: int = 10, 
                                  prediction_length: int = 1) -> np.ndarray:
    """Extrapolate a straight line fitted to the last ``window`` observations.

    Falls back to the naive (last value) forecast when there are fewer than
    ``window`` observations or when the fit raises.
    """
    if len(historical_data) < window:
        return generate_naive_forecast(historical_data, prediction_length)

    try:
        # Fit value = slope * position + intercept over positions 0..n-1
        tail = historical_data[-window:]
        n_points = len(tail)
        slope, intercept = np.polyfit(np.arange(n_points), tail, 1)

        # Future steps continue the position axis at n, n+1, ...
        future_positions = n_points + np.arange(prediction_length)
        return slope * future_positions + intercept

    except Exception as e:
        print(f"Error in linear trend forecast: {e}")
        return generate_naive_forecast(historical_data, prediction_length)

# Setup rolling window evaluation
print("🔄 Setting up rolling window evaluation...")

# Configuration for 2016-2019 analysis
config = {
    'context_window': 126,  # ~6 months of trading days
    'prediction_length': 1,
    'start_index': 200,     # Start after sufficient history
    'min_history': 200,     # Minimum history required
    'num_samples': 100,     # For Chronos probabilistic forecasting
    'models': ['Chronos', 'Naive', 'Moving_Average', 'Linear_Trend']
}

print(f"📊 Analysis Configuration:")
for key, value in config.items():
    print(f"   {key}: {value}")

# Initialize results storage: four parallel lists per model, one entry per evaluated step
results = {model: {'predictions': [], 'actuals': [], 'dates': [], 'errors': []} 
          for model in config['models']}

# One forecaster per model name. A name without an entry raises KeyError
# inside the try below and is recorded as a failed (NaN) prediction instead
# of silently reusing the previous model's output.
forecasters = {
    'Chronos': lambda history: generate_chronos_forecast(
        pipeline, history, config['prediction_length'], config['num_samples']),
    'Naive': lambda history: generate_naive_forecast(history, config['prediction_length']),
    'Moving_Average': lambda history: generate_ma_forecast(
        history, window=20, prediction_length=config['prediction_length']),
    'Linear_Trend': lambda history: generate_linear_trend_forecast(
        history, window=20, prediction_length=config['prediction_length']),
}

# Pull the columns out once; per-row data.iloc[i][...] lookups are slow
close_prices = data['Close'].to_numpy()
eval_dates = data['Date']
n_rows = len(data)
n_eval = n_rows - config['start_index']

# Rolling window evaluation
print(f"\n🔄 Starting rolling window evaluation...")
print(f"Total evaluation points: {n_eval}")

start_time = time.time()
progress_interval = 50

for i in range(config['start_index'], n_rows):
    # Progress reporting
    steps_done = i - config['start_index']
    if steps_done % progress_interval == 0:
        progress = (steps_done / n_eval) * 100
        elapsed = time.time() - start_time
        print(f"Progress: {progress:.1f}% ({steps_done}/{n_eval}) "
              f"- Elapsed: {elapsed:.1f}s")

    # Skip if insufficient history. The check counts the observations that
    # precede step i. Comparing the length of the context slice instead
    # (capped at context_window=126) against min_history=200 could never
    # pass, so no forecast was ever produced.
    if i < config['min_history']:
        continue

    # Context: the last `context_window` closes strictly before step i
    historical_data = close_prices[max(0, i - config['context_window']):i]

    # Actual value and date of the step being forecast
    actual_value = close_prices[i]
    evaluation_date = eval_dates.iloc[i]

    # Generate forecasts for each model
    for model_name in config['models']:
        try:
            pred = forecasters[model_name](historical_data)
            pred_value = pred[0] if len(pred) > 0 else np.nan
        except Exception as e:
            print(f"Error with {model_name} at step {i}: {e}")
            pred_value = np.nan

        # Always record one entry per model and step so the lists stay
        # aligned; a failed forecast is stored as NaN with a NaN error.
        model_store = results[model_name]
        model_store['predictions'].append(pred_value)
        model_store['actuals'].append(actual_value)
        model_store['dates'].append(evaluation_date)
        model_store['errors'].append(abs(pred_value - actual_value))

total_time = time.time() - start_time
print(f"\n✅ Evaluation completed in {total_time:.1f} seconds")

# Calculate comprehensive metrics
print("\n📊 Calculating comprehensive metrics...")
metrics_results = {}

# Loop-invariant inputs, computed once instead of once per model
all_closes = data['Close'].values
# Training data for the MASE scale: everything before the first evaluation step
training_data = all_closes[:config['start_index']]
last_eval_index = len(all_closes) - 1

for model_name in config['models']:
    model_results = results[model_name]

    # Convert to numpy arrays (float dtype so np.isnan works on any input)
    predictions = np.array(model_results['predictions'], dtype=float)
    actuals = np.array(model_results['actuals'], dtype=float)

    # Previous-day closes for directional accuracy. The evaluation loop only
    # skips steps at the start of its range and records exactly one entry per
    # evaluated step, so the recorded points are the final len(predictions)
    # rows of `data`. Aligning from the end keeps each prediction paired with
    # the close immediately before it, even when early steps were skipped.
    n_points = len(predictions)
    prev_values = all_closes[max(0, last_eval_index - n_points):last_eval_index]

    # Calculate metrics
    metrics = calculate_comprehensive_metrics(actuals, predictions, training_data, prev_values)
    metrics_results[model_name] = metrics

    print(f"\n{model_name} Results:")
    for metric, value in metrics.items():
        if isinstance(value, float):
            print(f"   {metric}: {value:.4f}")
        else:
            print(f"   {metric}: {value}")

# One row per model, one column per metric, rounded to 4 decimals
results_df = pd.DataFrame(metrics_results).T.round(4)

print("\n📈 2016-2019 FINAL RESULTS:")
print("=" * 70)
print(results_df)

# Persist the metric table
metrics_csv_path = 'gold_futures_forecast_2016_2019_metrics.csv'
results_df.to_csv(metrics_csv_path)
print(f"\n✅ Results saved to '{metrics_csv_path}'")

# Per-step predictions: dates and actuals come from the Chronos entry, and
# each model contributes one prediction column
prediction_columns = {
    'Date': results['Chronos']['dates'],
    'Actual': results['Chronos']['actuals'],
}
for model_label in ('Chronos', 'Naive', 'Moving_Average', 'Linear_Trend'):
    prediction_columns[model_label] = results[model_label]['predictions']
predictions_df = pd.DataFrame(prediction_columns)

predictions_csv_path = 'gold_futures_forecast_2016_2019_predictions.csv'
predictions_df.to_csv(predictions_csv_path, index=False)
print(f"✅ Detailed predictions saved to '{predictions_csv_path}'")

print("\n🎉 2016-2019 Analysis Complete!")

# Load and Compare with 2020-2021 Results
print("📊 LOADING 2020-2021 RESULTS FOR COMPREHENSIVE COMPARISON")
print("=" * 70)

# Try to load 2020-2021 results for comparison
try:
    results_2020_2021 = pd.read_csv('gold_futures_forecast_metrics.csv', index_col=0)
    print("✅ 2020-2021 results loaded successfully")
    
    # Create comprehensive comparison
    print(f"\n🔍 COMPREHENSIVE MARKET REGIME COMPARISON")
    print("=" * 50)
    
    # Create comparison table
    comparison_data = []
    
    # Get models that exist in both periods
    common_models = set(results_df.index) & set(results_2020_2021.index)
    print(f"Common models for comparison: {list(common_models)}")
    
    for model in common_models:
        metrics_2016_2019 = results_df.loc[model]
        metrics_2020_2021 = results_2020_2021.loc[model]
        
        comparison_data.append({
            'Model': model,
            'MASE_2016_2019': metrics_2016_2019['MASE'],
            'MASE_2020_2021': metrics_2020_2021['MASE'],
            'MASE_Improvement': metrics_2016_2019['MASE'] - metrics_2020_2021['MASE'],
            'MASE_Percent_Change': ((metrics_2016_2019['MASE'] - metrics_2020_2021['MASE']) / metrics_2020_2021['MASE']) * 100,
            'MAE_2016_2019': metrics_2016_2019['MAE'],
            'MAE_2020_2021': metrics_2020_2021['MAE'],
            'Dir_Acc_2016_2019': metrics_2016_2019['Directional_Accuracy'],
            'Dir_Acc_2020_2021': metrics_2020_2021['Directional_Accuracy'],
            'Dir_Acc_Improvement': metrics_2016_2019['Directional_Accuracy'] - metrics_2020_2021['Directional_Accuracy']
        })
    
    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.round(4)
        
        print(f"\n📈 DETAILED COMPARISON TABLE:")
        print(comparison_df.to_string(index=False))
        
        # Save comparison results
        comparison_df.to_csv('market_regime_comparison_2016_2019_vs_2020_2021.csv', index=False)
        print(f"\n✅ Comparison saved to 'market_regime_comparison_2016_2019_vs_2020_2021.csv'")
        
        # Key insights
        print(f"\n🎯 KEY COMPARATIVE INSIGHTS:")
        print("=" * 50)
        
        if 'Naive' in comparison_df['Model'].values:
            naive_row = comparison_df[comparison_df['Model'] == 'Naive'].iloc[0]
            print(f"1. 📊 Naive Baseline Performance:")
            print(f"   - 2016-2019 MASE: {naive_row['MASE_2016_2019']:.4f}")
            print(f"   - 2020-2021 MASE: {naive_row['MASE_2020_2021']:.4f}")
            print(f"   - Change: {naive_row['MASE_Percent_Change']:.1f}%")
            print(f"   - Interpretation: Naive was {'better' if naive_row['MASE_Improvement'] < 0 else 'worse'} in 2016-2019")
        
        if 'Chronos' in comparison_df['Model'].values:
            chronos_row = comparison_df[comparison_df['Model'] == 'Chronos'].iloc[0]
            print(f"\n2. 🤖 Chronos Performance:")
            print(f"   - 2016-2019 MASE: {chronos_row['MASE_2016_2019']:.4f}")
            print(f"   - 2020-2021 MASE: {chronos_row['MASE_2020_2021']:.4f}")
            print(f"   - Change: {chronos_row['MASE_Percent_Change']:.1f}%")
            print(f"   - Interpretation: Chronos was {'better' if chronos_row['MASE_Improvement'] < 0 else 'worse'} in 2016-2019")
        
        # Market regime effect on relative performance
        if 'Naive' in comparison_df['Model'].values and 'Chronos' in comparison_df['Model'].values:
            naive_2016 = comparison_df[comparison_df['Model'] == 'Naive']['MASE_2016_2019'].iloc[0]
            chronos_2016 = comparison_df[comparison_df['Model'] == 'Chronos']['MASE_2016_2019'].iloc[0]
            gap_2016 = ((chronos_2016 - naive_2016) / naive_2016) * 100
            
            naive_2021 = comparison_df[comparison_df['Model'] == 'Naive']['MASE_2020_2021'].iloc[0]
            chronos_2021 = comparison_df[comparison_df['Model'] == 'Chronos']['MASE_2020_2021'].iloc[0]
            gap_2021 = ((chronos_2021 - naive_2021) / naive_2021) * 100
            
            print(f"\n3. 🔄 Market Regime Impact on Chronos vs Naive:")
            print(f"   - 2016-2019 gap: {gap_2016:.1f}% ({'behind' if gap_2016 > 0 else 'ahead'})")
            print(f"   - 2020-2021 gap: {gap_2021:.1f}% ({'behind' if gap_2021 > 0 else 'ahead'})")
            print(f"   - Regime effect: {gap_2016 - gap_2021:.1f} percentage points")
            
            if gap_2016 < gap_2021:
                print(f"   ✅ Chronos performed relatively better in stable markets (2016-2019)")
                print(f"   📈 Market volatility negatively impacts Chronos relative performance")
            else:
                print(f"   ❌ Chronos did not perform relatively better in stable markets")
                print(f"   📉 Stable markets did not provide expected advantage for sophisticated models")
        
        # Directional accuracy insights
        print(f"\n4. 🎯 Directional Accuracy Comparison:")
        for model in ['Chronos', 'Naive']:
            if model in comparison_df['Model'].values:
                row = comparison_df[comparison_df['Model'] == model].iloc[0]
                print(f"   - {model}: {row['Dir_Acc_2016_2019']:.1f}% (2016-2019) vs {row['Dir_Acc_2020_2021']:.1f}% (2020-2021)")
                print(f"     Change: {row['Dir_Acc_Improvement']:.1f} percentage points")
        
        # Create visualization
        if plotly_available:
            print(f"\n📊 Creating interactive comparison visualization...")
            
            fig = make_subplots(
                rows=2, cols=2,
                subplot_titles=('MASE Comparison', 'Directional Accuracy Comparison', 
                               'Performance Change', 'Market Regime Effect'),
                specs=[[{"secondary_y": False}, {"secondary_y": False}],
                       [{"secondary_y": False}, {"secondary_y": False}]]
            )
            
            models = comparison_df['Model'].tolist()
            
            # MASE comparison
            fig.add_trace(go.Bar(name='2016-2019 (Stable)', x=models, y=comparison_df['MASE_2016_2019'],
                                marker_color='lightblue'), row=1, col=1)
            fig.add_trace(go.Bar(name='2020-2021 (Volatile)', x=models, y=comparison_df['MASE_2020_2021'],
                                marker_color='orange'), row=1, col=1)
            
            # Directional accuracy comparison
            fig.add_trace(go.Bar(name='2016-2019 (Stable)', x=models, y=comparison_df['Dir_Acc_2016_2019'],
                                marker_color='lightgreen', showlegend=False), row=1, col=2)
            fig.add_trace(go.Bar(name='2020-2021 (Volatile)', x=models, y=comparison_df['Dir_Acc_2020_2021'],
                                marker_color='red', showlegend=False), row=1, col=2)
            
            # Performance change
            fig.add_trace(go.Bar(name='MASE Change (%)', x=models, y=comparison_df['MASE_Percent_Change'],
                                marker_color='purple', showlegend=False), row=2, col=1)
            
            # Market regime effect on Chronos vs Naive gap
            if 'Naive' in models and 'Chronos' in models:
                regime_effect = [gap_2016, gap_2021]
                regime_labels = ['2016-2019\n(Stable)', '2020-2021\n(Volatile)']
                fig.add_trace(go.Bar(name='Chronos vs Naive Gap (%)', x=regime_labels, y=regime_effect,
                                    marker_color=['lightcoral', 'darkred'], showlegend=False), row=2, col=2)
            
            fig.update_layout(height=800, title_text="Market Regime Impact: 2016-2019 (Stable) vs 2020-2021 (Volatile)")
            fig.show()
        
        print(f"\n💡 PRACTICAL IMPLICATIONS:")
        print("=" * 50)
        print(f"1. Market regime significantly impacts model performance")
        print(f"2. {'Stable' if gap_2016 < gap_2021 else 'Volatile'} markets favor sophisticated models")
        print(f"3. Ensemble strategies should consider market regime detection")
        print(f"4. Model selection criteria should adapt to market volatility")
        print(f"5. Naive baseline strength varies with market conditions")
        
    else:
        print("❌ No common models found for comparison")
        
except FileNotFoundError:
    print("⚠️ 2020-2021 results not found - unable to perform comparison")
    print("Run the 2020-2021 analysis first to enable full comparison")
    print("Current analysis provides 2016-2019 baseline for future comparisons")
    
except Exception as e:
    print(f"❌ Error loading 2020-2021 results: {e}")
    print("Continuing with 2016-2019 analysis only")

print(f"\n✅ Comparison analysis complete!")

In [ ]:
# Comprehensive Results Analysis and Visualization
print("📊 COMPREHENSIVE RESULTS ANALYSIS AND VISUALIZATION")
print("=" * 70)


def _draw_metric_bars(ax, labels, values, palette, title, ylabel,
                      label_pad, label_template, ref_level, ref_label):
    """Draw one annotated per-model bar panel and return the bar container.

    Each bar gets its value written above it (``label_template`` formatted
    with the value, placed ``label_pad`` above the bar top), and a dashed red
    horizontal reference line is drawn at ``ref_level`` with a legend entry.
    """
    drawn = ax.bar(labels, values, color=palette[:len(labels)])
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Model')
    ax.tick_params(axis='x', rotation=45)

    # Value labels centred horizontally on each bar
    for rect, metric in zip(drawn, values):
        ax.text(rect.get_x() + rect.get_width()/2., rect.get_height() + label_pad,
                label_template.format(metric), ha='center', va='bottom', fontweight='bold')

    ax.axhline(y=ref_level, color='red', linestyle='--', alpha=0.7, label=ref_label)
    ax.legend()
    return drawn


# One figure, four panels: MASE, directional accuracy, recent predictions, error histogram
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('2016-2019 Gold Futures Forecasting Analysis', fontsize=16, fontweight='bold')

models = results_df.index.tolist()
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

# 1. Model Performance Comparison (reference line at MASE = 1, break-even with naive)
ax1 = axes[0, 0]
mase_values = results_df['MASE'].values
bars = _draw_metric_bars(ax1, models, mase_values, colors,
                         'Model Performance (MASE)', 'MASE Score',
                         0.01, '{:.3f}', 1.0, 'Break-even')

# 2. Directional Accuracy (reference line at 50%, a random guess)
ax2 = axes[0, 1]
dir_acc_values = results_df['Directional_Accuracy'].values
bars2 = _draw_metric_bars(ax2, models, dir_acc_values, colors,
                          'Directional Accuracy', 'Accuracy (%)',
                          0.5, '{:.1f}%', 50.0, 'Random (50%)')

# Panels 3 and 4 are only populated when there are prediction rows to show
ax3 = axes[1, 0]
ax4 = axes[1, 1]
if len(predictions_df) > 0:
    # 3. Prediction vs Actual (sample): the last 120 rows, or fewer if that is all there is
    sample_size = min(120, len(predictions_df))
    sample_data = predictions_df.tail(sample_size)

    for column, line_style, width, opacity in (
        ('Actual', 'k-', 2, 0.8),
        ('Chronos', 'r--', 1.5, 0.7),
        ('Naive', 'b:', 1.5, 0.7),
    ):
        ax3.plot(sample_data.index, sample_data[column], line_style,
                 linewidth=width, label=column, alpha=opacity)

    ax3.set_title(f'Predictions vs Actual (Last {sample_size} Days)', fontweight='bold')
    ax3.set_ylabel('Price ($)')
    ax3.set_xlabel('Time Index')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Error Distribution: absolute errors over all prediction rows
    chronos_errors = (predictions_df['Actual'] - predictions_df['Chronos']).abs()
    naive_errors = (predictions_df['Actual'] - predictions_df['Naive']).abs()

    for errors, series_label, shade in (
        (chronos_errors, 'Chronos', 'red'),
        (naive_errors, 'Naive', 'blue'),
    ):
        ax4.hist(errors, bins=30, alpha=0.7, label=series_label, color=shade, density=True)

    ax4.set_title('Error Distribution', fontweight='bold')
    ax4.set_xlabel('Absolute Error ($)')
    ax4.set_ylabel('Density')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print key insights
print(f"\n🔍 KEY INSIGHTS FROM 2016-2019 ANALYSIS:")
print("=" * 50)

# The best model is the row with the lowest MASE
best_model_name = results_df['MASE'].idxmin()
best_model = results_df.loc[best_model_name]

print(f"1. 🏆 Best Model: {best_model_name}")
print(f"   - MASE: {best_model['MASE']:.4f}")
print(f"   - Directional Accuracy: {best_model['Directional_Accuracy']:.1f}%")
print(f"   - MAE: ${best_model['MAE']:.2f}")

# Sections 2-4 read the 'Naive' and 'Chronos' rows specifically. Check once
# which of them exist so a missing row produces a message instead of a
# KeyError (row lookup) or a NameError (`improvement` never assigned).
has_naive = 'Naive' in results_df.index
has_chronos = 'Chronos' in results_df.index

# Relative MASE reduction of Chronos over Naive, in percent.
# Stays None when either row is missing and the comparison cannot be made.
improvement = None

# Compare with naive baseline
if has_naive and has_chronos:
    naive_mase = results_df.loc['Naive', 'MASE']
    chronos_mase = results_df.loc['Chronos', 'MASE']

    improvement = ((naive_mase - chronos_mase) / naive_mase) * 100
    print(f"\n2. 📊 Chronos vs Naive Baseline:")
    print(f"   - Naive MASE: {naive_mase:.4f}")
    print(f"   - Chronos MASE: {chronos_mase:.4f}")
    print(f"   - Improvement: {improvement:.1f}%")

    if improvement > 0:
        print(f"   ✅ Chronos outperforms naive baseline by {improvement:.1f}%")
    else:
        print(f"   ❌ Chronos underperforms naive baseline by {abs(improvement):.1f}%")
else:
    print(f"\n2. 📊 Chronos vs Naive Baseline: skipped ('Naive' and/or 'Chronos' missing from results)")

# Directional accuracy insights (None marks a model that is not in the results)
chronos_dir_acc = results_df.loc['Chronos', 'Directional_Accuracy'] if has_chronos else None
naive_dir_acc = results_df.loc['Naive', 'Directional_Accuracy'] if has_naive else None

print(f"\n3. 🎯 Directional Accuracy Analysis:")
if chronos_dir_acc is not None:
    print(f"   - Chronos: {chronos_dir_acc:.1f}%")
if naive_dir_acc is not None:
    print(f"   - Naive: {naive_dir_acc:.1f}%")

if chronos_dir_acc is None:
    print(f"   ⚠️ Chronos results unavailable - directional skill not assessed")
elif chronos_dir_acc > 50:
    print(f"   ✅ Chronos shows directional skill ({chronos_dir_acc:.1f}% > 50%)")
elif chronos_dir_acc < 50:
    print(f"   ⚠️ Chronos directional accuracy below random ({chronos_dir_acc:.1f}% < 50%)")
else:
    # Exactly 50% (or not comparable): do not claim "< 50%"
    print(f"   ⚠️ Chronos directional accuracy not above random ({chronos_dir_acc:.1f}%)")

# Market regime context
if improvement is None:
    hypothesis_result = 'Not evaluated (Chronos/Naive results unavailable)'
elif improvement > 0:
    hypothesis_result = 'Hypothesis confirmed'
else:
    hypothesis_result = 'Hypothesis challenged'

print(f"\n4. 📈 Market Regime Context (2016-2019):")
print(f"   - Period: Stable, low-volatility pre-COVID market")
print(f"   - Expectation: Favorable conditions for sophisticated models")
print(f"   - Gradual price movements and consistent patterns")
print(f"   - Result: {hypothesis_result}")

# Statistical significance (simple test)
# Compares the per-row absolute errors of Chronos and Naive with a paired
# t-test; skipped entirely when there are no prediction rows.
if len(predictions_df):
    actual_prices = predictions_df['Actual']
    chronos_errors = (actual_prices - predictions_df['Chronos']).abs()
    naive_errors = (actual_prices - predictions_df['Naive']).abs()

    # Paired t-test; any failure is reported rather than aborting the cell
    try:
        t_stat, p_value = stats.ttest_rel(chronos_errors, naive_errors)
        print(f"\n5. 📊 Statistical Significance:")
        print(f"   - T-statistic: {t_stat:.4f}")
        print(f"   - P-value: {p_value:.4f}")
        significance_label = 'Yes' if p_value < 0.05 else 'No'
        print(f"   - Significant: {significance_label} (α = 0.05)")
    except Exception as exc:
        print(f"\n5. 📊 Statistical test failed: {exc}")

# Performance by year (if enough data)
# Requires a 'Date' column and at least one prediction row.
if 'Date' in predictions_df.columns and len(predictions_df) > 0:
    try:
        # NOTE: both assignments modify predictions_df in place
        # ('Date' is converted to datetime, 'Year' is added as a new column).
        predictions_df['Date'] = pd.to_datetime(predictions_df['Date'])
        predictions_df['Year'] = predictions_df['Date'].dt.year

        print(f"\n6. 📅 Performance by Year:")
        # groupby yields the years in ascending order
        for year, year_data in predictions_df.groupby('Year'):
            if len(year_data) <= 20:  # Minimum data points
                continue
            year_mae_chronos = (year_data['Actual'] - year_data['Chronos']).abs().mean()
            year_mae_naive = (year_data['Actual'] - year_data['Naive']).abs().mean()
            # Relative MAE reduction of Chronos versus Naive for this year, in percent
            year_improvement = ((year_mae_naive - year_mae_chronos) / year_mae_naive) * 100
            print(f"   - {year}: {year_improvement:.1f}% improvement over naive")
    except Exception as exc:
        print(f"\n6. 📅 Yearly breakdown failed: {exc}")

print(f"\n✅ 2016-2019 Analysis Complete - Stable market period evaluation!")
print(f"📋 Ready for comparison with volatile market periods (2020-2021)")

## 7. Executive Summary and Conclusions

This notebook provides a comprehensive analysis of Chronos model performance on 2016-2019 gold futures data, enabling direct comparison with the 2020-2021 period analysis to understand market regime effects on forecasting performance.

### Key Analysis Components:
1. **Market Regime Analysis**: Quantitative comparison of market characteristics between 2016-2019 (stable) and 2020-2021 (volatile) periods
2. **Identical Methodology**: Same evaluation framework ensures fair comparison across market regimes
3. **Performance Comparison**: Direct model performance comparison to identify market-dependent patterns
4. **Statistical Robustness**: Comprehensive metrics and significance testing
5. **Practical Insights**: Market-dependent model selection and ensemble strategy guidance

### Expected Outcomes:
- **Market Regime Impact**: Quantified impact of market volatility on Chronos vs naive performance
- **Optimal Configuration**: Best model settings for stable market conditions (2016-2019)
- **Model Selection Criteria**: When to use sophisticated models vs simple baselines
- **Ensemble Strategy**: Robust recommendations combining multiple approaches
- **Regime Detection**: Insights for adaptive forecasting based on market conditions

### Hypothesis Testing:
- **H1**: Chronos models perform better relative to naive baselines in stable markets (2016-2019) than in volatile markets (2020-2021)
- **H2**: Lower-volatility periods favor the pattern-recognition capabilities of transformer-based models
- **H3**: Market regime significantly impacts optimal model configuration and performance gaps