# Backtest Framework: Event Combinations Win Rates

**Fecha**: 2025-10-29  
**Objetivo**: Evaluar win rates de combinaciones de eventos E1-E11

---

## Metodología

1. **Signal Generation**: Usar eventos como señales de entrada
2. **Forward Returns**: Calcular returns N días hacia adelante (1d, 3d, 5d, 10d)
3. **Win Rate**: % de trades con return positivo
4. **Expected Return**: Return promedio de todos los trades
5. **Sharpe Ratio**: Risk-adjusted return (return promedio / desviación estándar de los returns por señal, sin anualizar)
6. **Combinaciones**: Analizar eventos individuales + combinaciones de 2-3 eventos

---

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

# Default seaborn grid style and default figure size for subsequent plots.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (16, 10)})

# Confirm the imports succeeded and record the Polars version in the output.
print('Libraries loaded successfully')
print(f'Polars version: {pl.__version__}')

## 1. Load Data: Daily OHLCV + Events + Watchlist

In [None]:
# Resolve input locations relative to the current working directory.
# The project root is four directory levels above the working directory.
project_root = Path.cwd()
for _ in range(4):
    project_root = project_root.parent

processed_dir = project_root / 'processed'
daily_dir = processed_dir / 'daily_ohlcv'
watchlist_file = processed_dir / 'watchlist_E1_E11.parquet'

# Echo the resolved paths so a wrong working directory is easy to spot.
print(f'Project root: {project_root}')
print(f'Daily OHLCV dir: {daily_dir}')
print(f'Watchlist file exists: {watchlist_file.exists()}')
print()

In [None]:
# Load daily OHLCV data: one `daily.parquet` per sub-directory of `daily_dir`.
print('Loading daily OHLCV data...')
# Sorted so the concatenation order is the same on every run.
daily_files = sorted(daily_dir.glob('*/daily.parquet'))
print(f'Found {len(daily_files):,} ticker files')

# Read each file and keep only non-empty frames. Unreadable files are still
# skipped (best effort), but they are recorded and reported instead of being
# silently ignored.
dfs_daily = []
failed_files = []
for i, pf in enumerate(daily_files):
    if i % 1000 == 0:
        print(f'  Loading {i:,}/{len(daily_files):,}...')
    try:
        df = pl.read_parquet(pf)
    except Exception as e:
        failed_files.append((pf, e))
        continue
    if len(df) > 0:
        dfs_daily.append(df)

if failed_files:
    print(f'[WARN] Skipped {len(failed_files):,} unreadable files (showing up to 5):')
    for pf, e in failed_files[:5]:
        print(f'  {pf}: {type(e).__name__}: {e}')

# Fail with an explicit message rather than the opaque error raised by
# concatenating an empty list.
if not dfs_daily:
    raise RuntimeError(f'No daily OHLCV data could be loaded from {daily_dir}')

df_daily = pl.concat(dfs_daily)
print(f'Loaded {len(df_daily):,} daily records for {df_daily["ticker"].n_unique():,} tickers')
print()
print('Sample:')
print(df_daily.head(5))

In [None]:
# Read the watchlist parquet (ticker-date rows with their events).
print('Loading watchlist...')
df_watchlist = pl.read_parquet(watchlist_file)
print(f'Watchlist: {len(df_watchlist):,} ticker-date combinations')

# Show the schema and the first rows, each section preceded by a blank line.
for heading, content in (
    ('Schema:', df_watchlist.schema),
    ('Sample:', df_watchlist.head(5)),
):
    print()
    print(heading)
    print(content)

## 2. Calculate Forward Returns (1d, 3d, 5d, 10d)

In [None]:
# Forward returns: for each ticker, compare today's close `c` with the close
# N rows later (rows are ordered by date within each ticker).
print('Calculating forward returns...')

forward_days = (1, 3, 5, 10)
close = pl.col('c')

# Close price N rows ahead within the same ticker.
future_close_exprs = [
    close.shift(-n).over('ticker').alias(f'c_{n}d')
    for n in forward_days
]

# Simple return from today's close to the future close.
forward_return_exprs = [
    ((pl.col(f'c_{n}d') - close) / close).alias(f'ret_{n}d')
    for n in forward_days
]

df_sorted = df_daily.sort(['ticker', 'date'])
df_returns = (
    df_sorted
    .with_columns(future_close_exprs)
    .with_columns(forward_return_exprs)
    .select(['ticker', 'date', 'c'] + [f'ret_{n}d' for n in forward_days])
)

print(f'Returns calculated for {len(df_returns):,} records')
print()
print('Sample:')
print(df_returns.head(10))

## 3. Join Watchlist + Returns

In [None]:
# Attach forward returns to each watchlist row. The inner join keeps only
# ticker-date pairs present in both frames.
print('Joining watchlist with returns...')

join_keys = ['ticker', 'date']
df_backtest = df_watchlist.join(df_returns, on=join_keys, how='inner')

n_backtest_tickers = df_backtest["ticker"].n_unique()
print(f'Backtest dataset: {len(df_backtest):,} signals')
print(f'Tickers: {n_backtest_tickers:,}')
print()
print('Sample:')
print(df_backtest.head(5))

## 4. Explode Events: One Row per Event

In [None]:
# Turn the per-row `events` list into one row per individual event.
print('Exploding events...')

df_exploded = df_backtest.explode('events')
n_event_signals = df_exploded.height

print(f'Exploded dataset: {n_event_signals:,} event signals')
print()
print('Sample:')
print(df_exploded.head(10))

## 5. Single Event Performance: Win Rate + Expected Return

In [None]:
# Single-event performance: announce the step before the metrics are computed.
print('Calculating single event performance...')
print()

def calculate_metrics(df, group_col='events',
                      horizons=('ret_1d', 'ret_3d', 'ret_5d', 'ret_10d')):
    """Compute per-group win rate, expected return and Sharpe ratio.

    For every forward-return column in ``horizons``, rows with a null return
    are dropped and the remaining rows are aggregated by ``group_col``.

    Args:
        df: Polars DataFrame containing ``group_col`` and every column named
            in ``horizons``.
        group_col: Column to group by.
        horizons: Names of the forward-return columns to evaluate.

    Returns:
        A Polars DataFrame with one row per (group, horizon) and the columns
        ``group_col``, ``n_signals``, ``n_wins``, ``mean_ret``, ``std_ret``,
        ``min_ret``, ``max_ret``, ``win_rate``, ``sharpe`` and ``horizon``.
        ``sharpe`` is ``mean_ret / std_ret`` and is 0.0 when ``std_ret`` is
        zero or undefined (for example a group with a single signal).

    Raises:
        ValueError: If ``horizons`` is empty.
    """
    if not horizons:
        raise ValueError('horizons must contain at least one return column')

    metrics = []

    for horizon in horizons:
        ret = pl.col(horizon)
        df_stats = (
            df
            .filter(ret.is_not_null())
            .group_by(group_col)
            .agg([
                pl.count().alias('n_signals'),
                (ret > 0).sum().alias('n_wins'),
                ret.mean().alias('mean_ret'),
                ret.std().alias('std_ret'),
                ret.min().alias('min_ret'),
                ret.max().alias('max_ret'),
            ])
            .with_columns([
                (pl.col('n_wins') / pl.col('n_signals')).alias('win_rate'),
                # Guard the division: a zero or null std would otherwise give
                # inf/NaN/null. 0.0 matches the fallback used for the
                # 2-event combination metrics.
                pl.when(pl.col('std_ret') > 0)
                .then(pl.col('mean_ret') / pl.col('std_ret'))
                .otherwise(0.0)
                .alias('sharpe'),
                pl.lit(horizon).alias('horizon'),
            ])
        )
        metrics.append(df_stats)

    return pl.concat(metrics)

# Compute the metrics per individual event across all horizons.
df_single_performance = calculate_metrics(df_exploded, group_col='events')

# Display grouped by horizon, with the best win rate first inside each horizon.
print('Single event performance:')
ranked_single = df_single_performance.sort(
    ['horizon', 'win_rate'],
    descending=[False, True],
)
print(ranked_single)
print()

# Persist the unsorted metrics table next to the notebook.
single_csv_path = 'single_event_performance.csv'
df_single_performance.write_csv(single_csv_path)
print('[OK] Saved: single_event_performance.csv')

## 6. Visualize Single Event Performance

In [None]:
# Figure: win rate per event, one panel per forward horizon (2x2 grid).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
horizons = ['ret_1d', 'ret_3d', 'ret_5d', 'ret_10d']
titles = ['1-Day Forward', '3-Day Forward', '5-Day Forward', '10-Day Forward']

for panel, (horizon, title) in zip(axes.ravel(), zip(horizons, titles)):
    horizon_rows = df_single_performance.filter(pl.col('horizon') == horizon)
    panel_df = horizon_rows.to_pandas().sort_values('win_rate', ascending=True)
    win_rates = panel_df['win_rate']

    # Red below the 50% reference line, green otherwise.
    bar_colors = ['red' if wr < 0.5 else 'green' for wr in win_rates]

    panel.barh(panel_df['events'], win_rates, color=bar_colors, alpha=0.7)
    panel.axvline(0.5, color='black', linestyle='--', linewidth=1, label='Random (50%)')
    panel.set_xlabel('Win Rate', fontsize=11)
    panel.set_ylabel('Event Type', fontsize=11)
    panel.set_title(f'Win Rate - {title}', fontsize=13, fontweight='bold')
    panel.set_xlim(0, 1)
    panel.grid(axis='x', alpha=0.3)
    panel.legend()

    # Numeric label just to the right of each bar end.
    for y_pos, wr in enumerate(win_rates):
        panel.text(wr + 0.01, y_pos, f'{wr:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('win_rate_by_event_horizon.png', dpi=300, bbox_inches='tight')
plt.show()

print('[OK] Plot saved: win_rate_by_event_horizon.png')

In [None]:
# Plot: Expected return by event and horizon (one panel per horizon).
# `horizons` and `titles` are the lists defined in the win-rate plot cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for ax, horizon, title in zip(axes.flat, horizons, titles):
    df_plot = df_single_performance.filter(pl.col('horizon') == horizon).to_pandas()
    df_plot = df_plot.sort_values('mean_ret', ascending=True)
    ret_pct = df_plot['mean_ret'] * 100

    # Red for negative expected return, green otherwise.
    colors = ['red' if mr < 0 else 'green' for mr in df_plot['mean_ret']]

    ax.barh(df_plot['events'], ret_pct, color=colors, alpha=0.7)
    ax.axvline(0, color='black', linestyle='--', linewidth=1)
    ax.set_xlabel('Expected Return (%)', fontsize=11)
    ax.set_ylabel('Event Type', fontsize=11)
    ax.set_title(f'Expected Return - {title}', fontsize=13, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    # Extra horizontal room so the value labels stay inside the axes.
    ax.margins(x=0.15)

    # Value labels go beyond the bar end: to the right of positive bars and
    # to the left of negative bars, so the text never overlaps the bar.
    for i, pct in enumerate(ret_pct):
        if pct < 0:
            ax.text(pct - 0.1, i, f'{pct:.2f}%', va='center', ha='right', fontsize=9)
        else:
            ax.text(pct + 0.1, i, f'{pct:.2f}%', va='center', ha='left', fontsize=9)

plt.tight_layout()
plt.savefig('expected_return_by_event_horizon.png', dpi=300, bbox_inches='tight')
plt.show()

print('[OK] Plot saved: expected_return_by_event_horizon.png')

## 7. Event Combinations: Analyze 2-Event Combos

In [None]:
# Create binary matrix for event combinations: one `has_<event>` 0/1 column
# per distinct event, computed on the un-exploded backtest frame.
print('Creating event combination matrix...')

# Distinct events, sorted so that column order and the pair order used for
# combinations are identical on every run (`unique()` alone gives no ordering
# guarantee). Nulls, which can appear when an empty events list is exploded,
# are dropped.
all_events = df_exploded['events'].drop_nulls().unique().sort().to_list()
print(f'Unique events: {len(all_events)}')
print(all_events)
print()

# Add all indicator columns in a single pass; `with_columns` returns a new
# frame, so `df_backtest` itself is left untouched.
df_combo = df_backtest.with_columns([
    pl.col('events').list.contains(event).cast(pl.Int32).alias(f'has_{event}')
    for event in all_events
])

print('Binary event matrix created')
print()
print('Sample:')
print(df_combo.select(['ticker', 'date'] + [f'has_{e}' for e in all_events[:5]]).head(5))

In [None]:
# Analyze 2-event combinations: for every pair of events that co-occur on the
# same ticker-date row, compute win rate, mean/std return and Sharpe ratio for
# each forward horizon.
print('Analyzing 2-event combinations...')
print()

combo_horizons = ['ret_1d', 'ret_3d', 'ret_5d', 'ret_10d']
min_combo_signals = 10  # pairs/horizons with fewer samples are skipped

# Explicit schema so an empty result still has the columns used below.
# Event names are presumably strings; the schema is only used when there are
# no results at all.
combo_schema = {
    'combo': pl.Utf8,
    'event1': pl.Utf8,
    'event2': pl.Utf8,
    'horizon': pl.Utf8,
    'n_signals': pl.Int64,
    'win_rate': pl.Float64,
    'mean_ret': pl.Float64,
    'std_ret': pl.Float64,
    'sharpe': pl.Float64,
}

combo_results = []

for e1, e2 in combinations(all_events, 2):
    # Rows where both events occur
    df_both = df_combo.filter(
        (pl.col(f'has_{e1}') == 1) & (pl.col(f'has_{e2}') == 1)
    )

    if len(df_both) < min_combo_signals:
        continue

    for horizon in combo_horizons:
        # Only rows whose forward return is known for this horizon
        rets = df_both[horizon].drop_nulls()
        n_signals = len(rets)

        if n_signals < min_combo_signals:
            continue

        n_wins = int((rets > 0).sum())
        mean_ret = rets.mean()
        std_ret = rets.std()
        # Float fallback keeps the column dtype uniform; also guards a
        # missing std.
        sharpe = mean_ret / std_ret if std_ret is not None and std_ret > 0 else 0.0

        combo_results.append({
            'combo': f'{e1} + {e2}',
            'event1': e1,
            'event2': e2,
            'horizon': horizon,
            'n_signals': n_signals,
            'win_rate': n_wins / n_signals,
            'mean_ret': mean_ret,
            'std_ret': std_ret,
            'sharpe': sharpe,
        })

if combo_results:
    df_combo_performance = pl.DataFrame(combo_results)
else:
    df_combo_performance = pl.DataFrame(schema=combo_schema)

# Count distinct pairs: the table has one row per (pair, horizon), so its
# length would overstate the number of combinations.
n_combos = df_combo_performance['combo'].n_unique()
print(f'Analyzed {n_combos:,} 2-event combinations')
print()

# Top 10 combinations by win rate (10d horizon)
print('Top 10 combinations by win rate (10-day horizon):')
top10_combos = (
    df_combo_performance
    .filter(pl.col('horizon') == 'ret_10d')
    .sort('win_rate', descending=True)
    .head(10)
)
print(top10_combos)
print()

# Save
df_combo_performance.write_csv('combo_2event_performance.csv')
print('[OK] Saved: combo_2event_performance.csv')

## 8. Visualize Top Event Combinations

In [None]:
# Plot: Top 20 combinations by win rate (10d horizon)
fig, ax = plt.subplots(figsize=(14, 10))

df_plot = (
    df_combo_performance
    .filter(pl.col('horizon') == 'ret_10d')
    .filter(pl.col('n_signals') >= 20)  # Require at least 20 signals
    .sort('win_rate', descending=True)
    .head(20)
    .to_pandas()
)

# Green above the 50% reference line, red otherwise.
colors = ['green' if wr > 0.5 else 'red' for wr in df_plot['win_rate']]

ax.barh(df_plot['combo'], df_plot['win_rate'], color=colors, alpha=0.7)
# barh draws the first row at the bottom; flip the axis so the best
# combination is at the top, as in the single-event plots.
ax.invert_yaxis()
ax.axvline(0.5, color='black', linestyle='--', linewidth=1, label='Random (50%)')
ax.set_xlabel('Win Rate', fontsize=12)
ax.set_ylabel('Event Combination', fontsize=12)
ax.set_title('Top 20 Event Combinations by Win Rate (10-Day Forward)', fontsize=14, fontweight='bold')
ax.set_xlim(0, 1)
ax.grid(axis='x', alpha=0.3)
ax.legend()

# Add value labels with n_signals
for i, (wr, n) in enumerate(zip(df_plot['win_rate'], df_plot['n_signals'])):
    ax.text(wr + 0.01, i, f'{wr:.3f} (n={n})', va='center', fontsize=8)

plt.tight_layout()
plt.savefig('top20_combo_win_rate.png', dpi=300, bbox_inches='tight')
plt.show()

print('[OK] Plot saved: top20_combo_win_rate.png')

## 9. Summary Report: Best Performers

In [None]:
def _fmt_metric(value, spec, scale=1):
    """Format ``value * scale`` with ``spec``; return 'n/a' when value is None."""
    if value is None:
        return 'n/a'
    return format(value * scale, spec)


def _print_top_performers(df_top, label_col):
    """Print win rate, expected return, Sharpe and signal count for each row."""
    for row in df_top.iter_rows(named=True):
        print(f"  {row[label_col]}:")
        print(f"    Win Rate: {_fmt_metric(row['win_rate'], '.3f')}")
        print(f"    Expected Return: {_fmt_metric(row['mean_ret'], '.2f', 100)}%")
        # A metric can be null (e.g. Sharpe when std is undefined); formatting
        # None with ':.3f' would raise TypeError.
        print(f"    Sharpe Ratio: {_fmt_metric(row['sharpe'], '.3f')}")
        print(f"    Signals: {row['n_signals']:,}")
        print()


print('=' * 80)
print('BACKTEST SUMMARY REPORT')
print('=' * 80)
print()

# Single events - top 5 by win rate at the 10-day horizon
print('SINGLE EVENT PERFORMANCE (10-Day Horizon):')
print()
best_single = (
    df_single_performance
    .filter(pl.col('horizon') == 'ret_10d')
    .sort('win_rate', descending=True)
    .head(5)
)
_print_top_performers(best_single, 'events')

# Combo events - top 5 by win rate at the 10-day horizon, min. 20 signals
print('=' * 80)
print('2-EVENT COMBO PERFORMANCE (10-Day Horizon):')
print()
best_combo = (
    df_combo_performance
    .filter(pl.col('horizon') == 'ret_10d')
    .filter(pl.col('n_signals') >= 20)
    .sort('win_rate', descending=True)
    .head(5)
)
_print_top_performers(best_combo, 'combo')

print('=' * 80)
print('[OK] BACKTEST COMPLETED')
print('=' * 80)