# Fase 1: Information Theory - Validación Híbrida Ventanas

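**Objetivo**: Calcular la información mutua (Mutual Information) entre las features diarias agregadas de las DIB bars y el retorno futuro a 3 días desde el evento (`ret_3d`), para identificar qué días relativos al evento contienen información predictiva.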

**Método**: Information Theory (model-agnostic)
- Mutual Information I(X_t; y) por día relativo
- Filtrado rápido: descarta días sin señal
- Solo usa columnas básicas de DIB bars

**Output**: `phase1_results.pkl` con `info_results` (MI por día relativo de cada evento), `suggested_windows`, `wl_expanded`, `events_available` y `config`

**Tiempo estimado**: 10-20 min (con sample_size=200)

## 0. Setup

In [None]:
import pickle
import warnings
from datetime import date, datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import mutual_info_score
# Silence every warning category so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Input / output locations, relative to the notebook's working directory
BARS_ROOT = Path('../../../../processed/dib_bars/pilot50_validation')
WATCHLIST = Path('../../../../processed/universe/pilot50_validation/daily')
OUTPUT_DIR = Path('.')

# Report whether the inputs are reachable before any heavy work starts
bars_found = BARS_ROOT.exists()
watchlist_found = WATCHLIST.exists()
print(f"DIB bars dir exists: {bars_found}")
print(f"Watchlist exists: {watchlist_found}")
print(f"Output dir: {OUTPUT_DIR.absolute()}")

## 1. Cargar Watchlist con Eventos

In [None]:
# Load every date-partitioned watchlist file.
# The paths are sorted because rglob() yields them in filesystem order; the
# row order of `wl` feeds the seeded `sample()` used later, so a stable file
# order is needed for the sampling to be reproducible across machines.
watchlist_files = sorted(WATCHLIST.rglob('watchlist.parquet'))
print(f"Encontrados {len(watchlist_files):,} watchlist files")

if not watchlist_files:
    # Fail with an explicit message instead of an opaque error from pl.concat([]).
    raise FileNotFoundError(
        f"No watchlist.parquet files found under {WATCHLIST.absolute()}"
    )

wl_parts = []
for wl_file in watchlist_files:
    # The partition directory is expected to be named date=YYYY-MM-DD
    _, sep, date_str = wl_file.parent.name.partition('=')
    if not sep:
        raise ValueError(
            f"Unexpected partition directory (expected date=YYYY-MM-DD): {wl_file.parent}"
        )
    df = pl.read_parquet(wl_file)
    df = df.with_columns([pl.lit(date_str).alias('date')])
    wl_parts.append(df)

wl = pl.concat(wl_parts)
print(f"Total watchlist rows: {wl.height:,}")

# Convert the partition date from string to pl.Date
wl = wl.with_columns([
    pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d')
])

# One row per event occurrence. Exploding an empty or null `events` list
# produces a null event_code; those rows carry no event and would make the
# sorted() call below fail when comparing None with str, so they are dropped.
wl_expanded = (
    wl.explode('events')
    .rename({'events': 'event_code'})
    .drop_nulls(subset='event_code')
)
print(f"Total event occurrences: {wl_expanded.height:,}")

# Distinct event codes present in the watchlist
events_available = sorted(wl_expanded['event_code'].unique().to_list())
print(f"\nEventos disponibles: {events_available}")

wl_expanded.head()

## 2. Funciones de Información Mutua

In [None]:
def load_dib_bars_day(ticker: str, day: date) -> "pl.DataFrame | None":
    """Load the dollar-imbalance bars of one ticker for one calendar day.

    Args:
        ticker: Ticker symbol; used as the directory name under ``BARS_ROOT``.
        day: Session date. A ``datetime`` is also accepted and truncated to
            its date so the ``date=YYYY-MM-DD`` partition name still matches.

    Returns:
        The contents of ``dollar_imbalance.parquet`` for that partition, or
        ``None`` when the file does not exist.
    """
    # datetime is a subclass of date and its isoformat() includes the time
    # part, which would never match a partition directory name.
    if isinstance(day, datetime):
        day = day.date()

    bars_file = BARS_ROOT / ticker / f"date={day.isoformat()}" / "dollar_imbalance.parquet"
    if not bars_file.exists():
        return None
    return pl.read_parquet(bars_file)


def aggregate_day_features(df_bars: pl.DataFrame) -> dict:
    """Collapse one day of intraday DIB bars into a single row of daily features.

    Only the basic bar columns are read: o, h, l, c, v, n, dollar and
    imbalance_score.

    Returns:
        A dict with keys ``ret_day``, ``range_day``, ``vol_day``,
        ``dollar_day``, ``imb_day`` and ``n_bars`` (in that order), or
        ``None`` when ``df_bars`` is ``None`` or has no rows.
    """
    if df_bars is None or df_bars.height == 0:
        return None

    # Per-bar quantities, each expressed relative to the bar's open
    bar_return = (pl.col('c') - pl.col('o')) / pl.col('o')
    bar_range = (pl.col('h') - pl.col('l')) / pl.col('o')

    daily_exprs = [
        bar_return.mean().alias('ret_day'),
        bar_range.mean().alias('range_day'),
        pl.col('v').sum().alias('vol_day'),
        pl.col('dollar').sum().alias('dollar_day'),
        pl.col('imbalance_score').mean().alias('imb_day'),
        # Despite the name, this is the sum of the per-bar `n` column
        pl.col('n').sum().alias('n_bars'),
    ]
    daily = df_bars.select(daily_exprs)

    if daily.height > 0:
        return daily.to_dicts()[0]
    return None


def _equal_width_bin_codes(values: np.ndarray, bins: int):
    """Return equal-width bin codes for *values* (NaN where not finite), or None if nothing is finite."""
    finite = np.isfinite(values)
    if not finite.any():
        return None
    # pd.cut with an integer `bins` raises on +/-inf, so non-finite entries
    # are turned into NaN, which pd.cut leaves unbinned (NaN code).
    cleaned = np.where(finite, values, np.nan)
    return pd.cut(cleaned, bins=bins, labels=False, duplicates='drop')


def calculate_mutual_information_discretized(
    X: np.ndarray,
    y: np.ndarray,
    bins: int = 10
) -> float:
    """Return the mean discretized mutual information between each column of X and y.

    Every feature column and the target are cut into ``bins`` equal-width
    bins; ``mutual_info_score`` is computed per column on the rows where both
    bin codes are defined, and the per-column scores are averaged.

    Args:
        X: Feature matrix of shape (n_samples, n_features). A 1-D array is
            treated as a single feature column.
        y: Target vector with one value per row of X.
        bins: Number of equal-width bins used for both features and target.

    Returns:
        The average score over the columns that have more than 10 usable
        rows, or ``0.0`` when no column qualifies (including empty input or
        a target with no finite value).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    if y.size == 0:
        return 0.0
    y_binned = _equal_width_bin_codes(y, bins)
    if y_binned is None:
        return 0.0

    mi_scores = []
    for col_idx in range(X.shape[1]):
        x_binned = _equal_width_bin_codes(X[:, col_idx], bins)
        if x_binned is None:
            continue

        # Keep only rows where both the feature and the target fell in a bin
        valid_mask = ~(pd.isna(x_binned) | pd.isna(y_binned))
        if valid_mask.sum() > 10:
            mi = mutual_info_score(x_binned[valid_mask], y_binned[valid_mask])
            mi_scores.append(mi)

    return float(np.mean(mi_scores)) if mi_scores else 0.0


# Confirmation message printed once the cell's function definitions have run.
print("✓ Funciones de información mutua definidas")

## 3. Calcular MI por Día Relativo

In [None]:
def _event_forward_return(ticker, t0, horizon_days=3):
    """Return the last-close return from t0 to t0 + horizon_days (calendar days), or None if unavailable."""
    bars_t0 = load_dib_bars_day(ticker, t0)
    bars_tn = load_dib_bars_day(ticker, t0 + timedelta(days=horizon_days))

    if bars_t0 is None or bars_tn is None:
        return None
    if bars_t0.height == 0 or bars_tn.height == 0:
        return None

    p0 = bars_t0['c'][-1]
    pn = bars_tn['c'][-1]
    # A null or zero base price cannot produce a return
    if p0 is None or pn is None or p0 == 0:
        return None
    return (pn - p0) / p0


def analyze_information_by_relative_day(
    event_code: str,
    max_pre: int = 7,
    max_post: int = 7,
    sample_size: int = 500
) -> dict:
    """Compute I(X_t; y) for every day t relative to occurrences of one event.

    X_t are the aggregated daily features of day ``t0 + t`` and y is the
    return between the last bar close of the event day t0 and the last bar
    close of t0 + 3 days.

    Args:
        event_code: Event to analyse; matched against ``wl_expanded['event_code']``.
        max_pre: Number of days before the event to evaluate.
        max_post: Number of days after the event to evaluate.
        sample_size: Maximum number of occurrences used; larger subsets are
            down-sampled with a fixed seed.

    Returns:
        ``{rel_day: mutual_information_score}`` for every rel_day in
        ``[-max_pre, max_post]``. Days with fewer than 50 usable samples are
        reported as ``0.0``.

    NOTE(review): all offsets are calendar days, so any occurrence whose
    t0 + offset has no bars file is dropped for that day; a weekday-by-
    weekday breakdown would require a trading calendar.
    NOTE(review): for rel_day > 0 the feature day lies inside the target
    window [t0, t0 + 3], so MI on those days is not purely predictive.
    """
    min_samples = 50

    # Keep only occurrences of this event type
    subset = wl_expanded.filter(pl.col('event_code') == event_code)

    # Optional down-sampling to bound the runtime
    if subset.height > sample_size:
        subset = subset.sample(sample_size, seed=42)

    print(f"\nAnalizando {event_code}: {subset.height} ocurrencias")

    # The target depends only on (ticker, t0), never on rel_day, so it is
    # computed once per occurrence instead of once per relative day.
    labelled = []
    for row in subset.iter_rows(named=True):
        ret_3d = _event_forward_return(row['ticker'], row['date'])
        if ret_3d is not None:
            labelled.append((row['ticker'], row['date'], ret_3d))

    rel_days = range(-max_pre, max_post + 1)
    if len(labelled) < min_samples:
        # No relative day can reach the sample floor; skip the feature I/O.
        return {rel_day: 0.0 for rel_day in rel_days}

    data_by_day = {}
    for rel_day in rel_days:
        features_list = []
        targets_list = []

        for ticker, t0, ret_3d in labelled:
            bars = load_dib_bars_day(ticker, t0 + timedelta(days=rel_day))
            # aggregate_day_features returns None for missing or empty bars
            feat = aggregate_day_features(bars)
            if feat is None:
                continue

            features_list.append(list(feat.values()))
            targets_list.append(ret_3d)

        if len(features_list) < min_samples:
            data_by_day[rel_day] = 0.0
            continue

        # dtype=float maps null aggregates (None) to NaN instead of
        # producing an object array.
        X = np.array(features_list, dtype=float)
        y = np.array(targets_list, dtype=float)

        mi = calculate_mutual_information_discretized(X, y, bins=10)
        data_by_day[rel_day] = mi

        print(f"  t={rel_day:+d}: MI={mi:.4f} (n={len(features_list)})")

    return data_by_day


# Confirmation message printed once the cell's function definition has run.
print("✓ Función de análisis por día relativo definida")

## 4. Ejecutar Análisis Information Theory

**NOTA**: Ajusta `EVENTS_TO_TEST` según necesites:
- `[:3]` → Prueba rápida (3 eventos, ~10-15 min)
- Sin slice → Análisis completo (11 eventos, ~40-60 min)

In [None]:
# Run configuration: which events to analyse and how wide the window is
EVENTS_TO_TEST = events_available[:3]  # use events_available (no slice) for the full analysis
MAX_PRE_DAYS = 3
MAX_POST_DAYS = 3
SAMPLE_SIZE = 200  # lower to 100 for more speed, raise to 500 for more precision

print(f"Analizando {len(EVENTS_TO_TEST)} eventos con ventana [{-MAX_PRE_DAYS}, {MAX_POST_DAYS}]")
print(f"Sample size: {SAMPLE_SIZE} ocurrencias por evento\n")

# Same window and sample settings for every event
run_kwargs = dict(
    max_pre=MAX_PRE_DAYS,
    max_post=MAX_POST_DAYS,
    sample_size=SAMPLE_SIZE,
)

# event_code -> {rel_day: MI}; filled one event at a time so partial results
# survive if a later event fails.
info_results = {}
for event_code in EVENTS_TO_TEST:
    info_results[event_code] = analyze_information_by_relative_day(event_code, **run_kwargs)

separator = "=" * 60
print("\n" + separator)
print("✓ Análisis Information Theory completado")
print(f"Eventos analizados: {len(info_results)}")
print(separator)

## 5. Visualizar Información por Día

In [None]:
# A day counts as informative when its MI is at least this fraction of the
# event's peak MI (this is the 'Threshold 10%' line drawn on each panel).
# NOTE: scores are normalized by the per-event peak, so the peak day always
# passes the threshold whenever any MI is non-zero.
MI_REL_THRESHOLD = 0.1

if not info_results:
    # plt.subplots(0, 1) raises ValueError, so skip plotting when nothing was analysed.
    print("⚠ No hay resultados de MI para graficar")
else:
    n_events = len(info_results)
    # squeeze=False always returns a 2-D axes array, so a single event needs
    # no special case.
    fig, axes = plt.subplots(n_events, 1, figsize=(12, 4 * n_events), squeeze=False)

    for ax, (event, info_by_day) in zip(axes[:, 0], info_results.items()):
        days = sorted(info_by_day.keys())
        mi_scores = [info_by_day[d] for d in days]

        # Normalize by the peak; fall back to 1.0 so all-zero scores stay zero
        peak_mi = max(mi_scores, default=0.0)
        max_mi = peak_mi if peak_mi > 0 else 1.0
        mi_norm = [m / max_mi for m in mi_scores]

        ax.bar(days, mi_norm, alpha=0.7, color='steelblue')
        ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Día Evento (t=0)')
        ax.axhline(y=MI_REL_THRESHOLD, color='orange', linestyle=':', label='Threshold 10%')

        # Shade the span from the first to the last informative day
        significant_days = [d for d, mi in zip(days, mi_norm) if mi >= MI_REL_THRESHOLD]
        if significant_days:
            t_start, t_end = min(significant_days), max(significant_days)
            ax.axvspan(t_start - 0.5, t_end + 0.5, alpha=0.2, color='green',
                       label=f'Ventana sugerida: [{t_start}, {t_end}]')

        ax.set_xlabel('Días Relativos al Evento')
        ax.set_ylabel('Mutual Information (normalizado)')
        ax.set_title(f'{event}: Información por Día Relativo')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('information_by_day_phase1.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✓ Gráfico guardado: information_by_day_phase1.png")

## 6. Resumen: Ventanas Sugeridas por MI

In [None]:
rule = "=" * 80
print("\n" + rule)
print("VENTANAS SUGERIDAS POR MUTUAL INFORMATION (threshold 10%)")
print(rule)

# event_code -> (first, last) informative relative day, or None when no day qualifies
suggested_windows = {}

for event_name, mi_by_day in info_results.items():
    ordered_days = sorted(mi_by_day)
    raw_scores = [mi_by_day[d] for d in ordered_days]

    # Scale by the event's peak MI; an all-zero profile is left at zero
    peak = max(raw_scores)
    scale = peak if peak > 0 else 1.0

    # Days whose scaled MI reaches the 10% threshold, still in ascending order
    kept_days = [d for d, score in zip(ordered_days, raw_scores) if score / scale >= 0.1]

    if not kept_days:
        suggested_windows[event_name] = None
        print(f"  {event_name:<25} → Sin ventana clara (MI muy bajo)")
        continue

    # kept_days is sorted, so its ends are the minimum and maximum
    first_day, last_day = kept_days[0], kept_days[-1]
    suggested_windows[event_name] = (first_day, last_day)
    print(f"  {event_name:<25} → [{first_day:+d}, {last_day:+d}]")

print(rule)

## 7. Guardar Resultados Fase 1

**Output**: `phase1_results.pkl` con todos los datos necesarios para Fase 2

In [None]:
# Settings this run was produced with, stored next to the results
run_config = {
    'max_pre_days': MAX_PRE_DAYS,
    'max_post_days': MAX_POST_DAYS,
    'sample_size': SAMPLE_SIZE,
    'events_tested': EVENTS_TO_TEST
}

# Bundle everything into a single dict for the next phase
results_phase1 = {
    'info_results': info_results,
    'wl_expanded': wl_expanded,
    'events_available': events_available,
    'suggested_windows': suggested_windows,
    'config': run_config
}

# Persist to disk
output_file = OUTPUT_DIR / 'phase1_results.pkl'
with output_file.open('wb') as fh:
    pickle.dump(results_phase1, fh)

# Events for which a non-empty window was found
n_windows = sum(1 for w in suggested_windows.values() if w)

rule = "=" * 80
print("\n" + rule)
print("✓ FASE 1 COMPLETADA")
print(rule)
print(f"Resultados guardados en: {output_file.absolute()}")
print("\nContenido:")
print(f"  - info_results: {len(info_results)} eventos con MI por día relativo")
print(f"  - wl_expanded: {wl_expanded.height:,} event occurrences")
print(f"  - events_available: {len(events_available)} eventos totales")
print(f"  - suggested_windows: {n_windows} ventanas sugeridas")
print("\nPróximo paso: Ejecutar phase2_model_performance.ipynb")
print(rule)