# 01 - Data Exploration

This notebook explores the synthetic dataset used in the volatility path-states analysis: the market series, the factor returns, and the realized-volatility measures.

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.synthetic_data import SyntheticDataGenerator
from src.visualization.styles import set_publication_style

# Apply the project's plotting style before any figure is created.
# NOTE(review): presumably this configures matplotlib defaults — confirm in
# src/visualization/styles.py.
set_publication_style()

# Seed NumPy's global random state so any np.random call in this notebook
# gives the same result on every run.
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

## 1. Load Data

In [None]:
# Build the synthetic dataset from a seeded generator (n_months=732).
generator = SyntheticDataGenerator(seed=42)
data = generator.generate(n_months=732)

# Report the shape of each component; entries that have no `.shape`
# attribute are left out of the listing.
print("Data components:")
shaped_components = {name: obj for name, obj in data.items() if hasattr(obj, 'shape')}
for name, obj in shaped_components.items():
    print(f"  {name}: {obj.shape}")

In [None]:
# Pull the three components out of the generated dict in one step.
market, factors, volatility = (
    data[name] for name in ('market', 'factors', 'volatility')
)

# Show the first and last index labels of the factor table and list its columns.
first_date, last_date = factors.index[0], factors.index[-1]
print(f"Date range: {first_date} to {last_date}")
print(f"\nFactor columns: {list(factors.columns)}")

## 2. Summary Statistics

In [None]:
# Per-factor summary table (one row per factor column).
# Annualisation treats the observations as monthly: means scale by 12 and
# standard deviations by sqrt(12).
months_per_year = 12
monthly_mean = factors.mean()
monthly_std = factors.std()

summary = pd.DataFrame({
    'Mean (ann %)': monthly_mean * months_per_year * 100,
    'Std (ann %)': monthly_std * np.sqrt(months_per_year) * 100,
    # Ratio of mean to standard deviation, annualised; no risk-free rate is subtracted.
    'Sharpe': monthly_mean / monthly_std * np.sqrt(months_per_year),
    'Skewness': factors.skew(),
    # pandas reports Fisher (excess) kurtosis, so a normal distribution scores 0.
    'Kurtosis': factors.kurtosis(),
    'Min (%)': factors.min() * 100,
    'Max (%)': factors.max() * 100,
})

# The rounded frame is the cell's last expression, so Jupyter renders it as a table.
print("Factor Summary Statistics:")
summary.round(2)

In [None]:
# Volatility summary: descriptive statistics for every volatility column.
# The table is left as the cell's last expression rather than wrapped in
# print(), so Jupyter renders it as a rich table in the same way as the
# factor summary statistics cell.
print("Volatility Summary:")
volatility.describe()

## 3. Visualizations

In [None]:
# Cumulative factor returns, one line per factor column.
fig, ax = plt.subplots(figsize=(12, 5))

for factor_name in factors.columns:
    # Running sum of the period returns (additive, not compounded), in percent.
    running_pct = factors[factor_name].cumsum() * 100
    ax.plot(running_pct.index, running_pct.values, label=factor_name, linewidth=1.5)

ax.set(
    xlabel='Date',
    ylabel='Cumulative Return (%)',
    title='Cumulative Factor Returns',
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Plot return distributions: one histogram per factor on a two-column grid.
# The grid is sized from the number of factor columns, because zip() stops at
# the shorter input: a fixed 2x2 grid would silently drop a fifth factor and
# would leave blank panels for fewer than four. With exactly four factors this
# is the same 2x2 figure at 10x8 inches.
n_factors = len(factors.columns)
n_cols = 2
n_rows = max(1, int(np.ceil(n_factors / n_cols)))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 4 * n_rows), squeeze=False)
panels = axes.flatten()

for ax, col in zip(panels, factors.columns):
    factors[col].hist(ax=ax, bins=50, alpha=0.7, edgecolor='white')
    # Dashed red line marks a zero return.
    ax.axvline(x=0, color='red', linestyle='--', linewidth=1)
    ax.set_title(col)
    ax.set_xlabel('Monthly Return')

# Hide any panels left over after the last factor (e.g. an odd factor count).
for unused_panel in panels[n_factors:]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the factor columns, drawn as an annotated heatmap.
corr = factors.corr()

fig, ax = plt.subplots(figsize=(8, 6))

# Diverging palette centred on zero, with the colour scale pinned to [-1, 1]
# and each cell annotated to two decimals.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1,
    vmax=1,
)
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Factor Correlation Matrix')
plt.tight_layout()
plt.show()

## 4. Volatility Analysis

In [None]:
# Two stacked panels sharing the x-axis: volatility level on top, ratio below.
fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True)
level_ax, ratio_ax = axes

# Top panel: the 'sigma_1m' column, multiplied by 100 for a percent axis.
level_ax.plot(volatility.index, volatility['sigma_1m'] * 100, 'b-', linewidth=1)
level_ax.set_ylabel('1-Month Volatility (%)')
level_ax.set_title('Realized Volatility Over Time')

# Bottom panel: the 'rho_sigma' column with horizontal guides at 1.0 (dashed)
# and at 0.8 and 1.5 (dotted).
ratio_ax.plot(volatility.index, volatility['rho_sigma'], 'r-', linewidth=1)
for guide_level, guide_dash in ((1.0, '--'), (0.8, ':'), (1.5, ':')):
    ratio_ax.axhline(y=guide_level, color='gray', linestyle=guide_dash)
ratio_ax.set_ylabel('Volatility Ratio')
ratio_ax.set_xlabel('Date')

plt.tight_layout()
plt.show()

In [None]:
# State-space scatter: 1-month volatility (x, in percent) against the
# volatility ratio (y).
fig, ax = plt.subplots(figsize=(8, 6))

sigma_1m = volatility['sigma_1m']
ax.scatter(sigma_1m * 100, volatility['rho_sigma'],
           alpha=0.5, s=20, edgecolors='none')

# Reference lines: the 33rd and 67th percentiles of the non-missing 1-month
# volatility (vertical) and the ratio levels 0.8 and 1.5 (horizontal).
vol_33, vol_67 = np.percentile(sigma_1m.dropna(), [33, 67]) * 100
guide_style = dict(color='gray', linestyle='--', alpha=0.7)

for x_cut in (vol_33, vol_67):
    ax.axvline(x=x_cut, **guide_style)
for y_cut in (0.8, 1.5):
    ax.axhline(y=y_cut, **guide_style)

ax.set_xlabel('1-Month Realized Volatility (%)')
ax.set_ylabel('Volatility Ratio')
ax.set_title('State Space: Volatility Level vs. Dynamics')

plt.tight_layout()
plt.show()

## 5. Key Observations

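1. **Factor returns** have different characteristics: momentum is more volatile and negatively skewed (see the summary statistics in Section 2).
2. **Correlations** between factors are generally low, providing diversification benefits (see the correlation matrix in Section 3).
3. **Volatility** clusters over time and exhibits mean reversion (see the time-series plot in Section 4).
4. **The state space** defined by volatility level (1-month realized volatility) and volatility dynamics (the volatility ratio) provides a basis for regime classification.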