# Validación E1-E11: Event Detection Complete

**Fecha**: 2025-10-29  
**Objetivo**: Validar la detección completa de eventos E1-E11 con 3,459,349 eventos totales

---

## Scope

Validar:
1. Conteo de eventos por tipo (E1-E11)
2. Distribución temporal de eventos
3. Top tickers por evento
4. Validación del flag `intraday_confirmed=False` en E3/E9
5. Schema consistency across all event types
6. Overlap analysis: cuántos días tienen múltiples eventos

---

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd

# Global plotting defaults used by the figures in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Print the library versions in use for this run.
for lib_name, lib_version in (
    ('Polars', pl.__version__),
    ('Matplotlib', plt.matplotlib.__version__),
):
    print(f'{lib_name} version:', lib_version)

## 1. Load All Events E1-E11

In [None]:
# Directory holding one parquet file per event type (events_e1.parquet, ...).
events_dir = Path('processed/events')

# Expected row counts from the detection run, keyed by event type.
expected_counts = {
    'E1': 164_941,
    'E2': 73_170,
    'E3': 144_062,
    'E4': 197_716,
    'E5': 412_902,
    'E6': 1_543_990,
    'E7': 16_919,
    'E8': 19_924,
    'E9': 24_074,
    'E10': 814_068,
    'E11': 47_583,
}

# Load every event file that exists; missing files are reported and skipped,
# so `dfs` may end up with fewer keys than `expected_counts`.
dfs = {}
for event_type, expected in expected_counts.items():
    event_file = events_dir / f'events_{event_type.lower()}.parquet'
    if not event_file.exists():
        print(f'{event_type}: FILE NOT FOUND')
        continue
    dfs[event_type] = pl.read_parquet(event_file)
    print(f'{event_type}: {len(dfs[event_type]):,} events (expected: {expected:,})')

# Grand totals: loaded rows versus the expected figure.
total_loaded = sum(map(len, dfs.values()))
total_expected = sum(expected_counts.values())
print()
print(f'Total events loaded: {total_loaded:,}')
print(f'Expected total: {total_expected:,}')

## 2. Validate Schema: E3 and E9 Have `intraday_confirmed` Flag

In [None]:
print('=== SCHEMA VALIDATION ===')
print()

# Event types whose rows are all expected to carry intraday_confirmed=False
# (scope item 4 of this notebook).
radar_events = ('E3', 'E9')

for event_type, df in dfs.items():
    # Dump the full schema of each event table for visual comparison.
    print(f'{event_type} Schema:')
    for col, dtype in zip(df.columns, df.dtypes):
        print(f'  {col}: {dtype}')

    if event_type in radar_events:
        if 'intraday_confirmed' not in df.columns:
            print('  ❌ intraday_confirmed flag MISSING')
        else:
            # Check every row, not just the first one: the flag only passes
            # when False is the sole value present (no True, no nulls, and the
            # table is not empty).
            flag_values = df['intraday_confirmed'].unique().to_list()
            if flag_values == [False]:
                print('  ✅ intraday_confirmed flag present: False')
            else:
                print(f'  ❌ intraday_confirmed flag present but not all False: {flag_values}')

    print()

## 3. Event Count Summary Table

In [None]:
# Human-readable label for each event type, used in the summary table.
event_descriptions = {
    'E1': 'Volume Explosion (RVOL > 5x)',
    'E2': 'Gap Up (+10%)',
    'E3': 'Price Spike Intraday (+20%) [RADAR]',
    'E4': 'Parabolic Move (+50% in ≤5d)',
    'E5': 'Breakout ATH (52w high)',
    'E6': 'Multiple Green Days (3+ consec)',
    'E7': 'First Red Day (FRD)',
    'E8': 'Gap Down Violent (-15%)',
    'E9': 'Crash Intraday (-30%) [RADAR]',
    'E10': 'First Green Bounce (after 3+ red)',
    'E11': 'Volume Bounce (RVOL>3x + bounce)',
}

# One summary row per loaded event table: size, ticker coverage, date span.
summary_data = []
for event_type, df in dfs.items():
    n_events = len(df)
    n_tickers = df['ticker'].n_unique()
    first_date = df['date'].min()
    last_date = df['date'].max()
    summary_data.append({
        'Event': event_type,
        'Description': event_descriptions[event_type],
        'Count': n_events,
        'Unique Tickers': n_tickers,
        'Date Range': f"{first_date} to {last_date}",
        'Avg Events/Ticker': n_events / n_tickers,
    })

df_summary = pl.DataFrame(summary_data)
print(df_summary)

# Persist the table next to the notebook.
df_summary.write_csv('event_summary_E1_E11.csv')
print()
print('✅ Summary saved to event_summary_E1_E11.csv')

## 4. Visualize Event Distribution

In [None]:
# Bar chart: number of events per event type.
from matplotlib.patches import Patch

# Event types drawn in red (labelled RADAR / intraday_confirmed=False in the legend).
radar_events = {'E3', 'E9'}

event_counts = {k: len(v) for k, v in dfs.items()}
events = list(event_counts)
counts = list(event_counts.values())
# Total taken from the loaded data so the title cannot drift from reality.
total_events = sum(counts)

colors = ['red' if e in radar_events else 'steelblue' for e in events]

fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(events, counts, color=colors, alpha=0.7)
ax.set_xlabel('Event Type', fontsize=12)
ax.set_ylabel('Event Count', fontsize=12)
ax.set_title(f'Event Distribution E1-E11 (Total: {total_events:,})', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Count labels above the bars; the offset scales with the tallest bar so the
# labels stay readable whatever the magnitude of the counts.
label_offset = 0.01 * max(counts, default=0)
for i, count in enumerate(counts):
    ax.text(i, count + label_offset, f'{count:,}', ha='center', fontsize=9)

# Legend explaining the two bar colours.
legend_elements = [
    Patch(facecolor='steelblue', label='Daily OHLCV (confirmed)'),
    Patch(facecolor='red', label='Daily OHLCV (RADAR - intraday_confirmed=False)')
]
ax.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.savefig('event_distribution_E1_E11.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Plot saved to event_distribution_E1_E11.png')

## 5. Temporal Distribution: Events Over Time

In [None]:
# Stack all event tables into one long frame, keeping only the
# ticker / date / event_type columns from each.
key_columns = ['ticker', 'date', 'event_type']
all_events = [df.select(key_columns) for df in dfs.values()]

df_all = pl.concat(all_events)
print(f'Total events concatenated: {len(df_all):,}')
print()

# Yearly event counts per event type, ordered by year then type.
year_of_event = pl.col('date').dt.year().alias('year')
df_temporal = (
    df_all
    .with_columns(year_of_event)
    .group_by(['year', 'event_type'])
    .agg(pl.count().alias('count'))
    .sort(['year', 'event_type'])
)

print('Events by year and type (sample):')
print(df_temporal.head(20))

In [None]:
# Plot: events over time (stacked area chart, one band per event type).

# Pivot to one row per year and one column per event type.
df_pivot = df_temporal.pivot(index='year', columns='event_type', values='count').fill_null(0)
df_pivot_pd = df_pivot.to_pandas().set_index('year').sort_index()

# Preferred stacking order of the event-type columns.
event_order = ['E1_VolExplosion', 'E2_GapUp', 'E3_PriceSpikeIntraday', 'E4_ParabolicMove',
               'E5_BreakoutATH', 'E6_MultipleGreenDays', 'E7_FirstRedDay', 'E8_GapDownViolent',
               'E9_CrashIntraday', 'E10_FirstGreenBounce', 'E11_VolumeBounce']

# Only reorder by the columns actually present: indexing with a label that is
# absent from the data would raise KeyError. Unlisted columns are kept at the
# end so no data is silently dropped from the plot.
present_cols = [c for c in event_order if c in df_pivot_pd.columns]
missing_cols = [c for c in event_order if c not in df_pivot_pd.columns]
extra_cols = [c for c in df_pivot_pd.columns if c not in event_order]
if missing_cols:
    print(f'⚠️ Event types absent from the data (not plotted): {missing_cols}')
if extra_cols:
    print(f'⚠️ Event types not in event_order (plotted last): {extra_cols}')
df_pivot_pd = df_pivot_pd[present_cols + extra_cols]

# Derive the year span for the title from the data instead of hard-coding it.
years = df_pivot_pd.index.dropna()
year_span = f'{int(years.min())}-{int(years.max())}' if len(years) else 'no data'

# Create the figure only after the data preparation above has succeeded.
fig, ax = plt.subplots(figsize=(16, 8))
df_pivot_pd.plot(kind='area', stacked=True, ax=ax, alpha=0.7, colormap='tab20')
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Event Count', fontsize=12)
ax.set_title(f'Event Distribution Over Time ({year_span})', fontsize=14, fontweight='bold')
ax.legend(loc='upper left', fontsize=8)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('event_temporal_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Plot saved to event_temporal_distribution.png')

## 6. Top 20 Tickers by Total Event Count

In [None]:
# Total number of event rows per ticker, across all event types.
events_per_ticker = df_all.group_by('ticker').agg(pl.count().alias('total_events'))

# Keep the 20 tickers with the most events, largest first.
df_ticker_counts = events_per_ticker.sort('total_events', descending=True).head(20)

print('Top 20 tickers by total event count:')
print(df_ticker_counts)
print()

# Persist the ranking next to the notebook.
df_ticker_counts.write_csv('top20_tickers_events.csv')
print('✅ Top 20 tickers saved to top20_tickers_events.csv')

In [None]:
# Horizontal bar chart of the top-20 ticker ranking.
df_top20_pd = df_ticker_counts.to_pandas()
ticker_labels = df_top20_pd['ticker']
ticker_totals = df_top20_pd['total_events']

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(ticker_labels, ticker_totals, color='teal', alpha=0.7)

axis_label_kw = {'fontsize': 12}
ax.set_xlabel('Total Events', **axis_label_kw)
ax.set_ylabel('Ticker', **axis_label_kw)
ax.set_title('Top 20 Tickers by Total Event Count (E1-E11)', fontsize=14, fontweight='bold')
# Flip the y axis so the first (largest) row is drawn at the top.
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('top20_tickers_events.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Plot saved to top20_tickers_events.png')

## 7. Multi-Event Days Analysis

In [None]:
# Per (ticker, date): number of event rows and the distinct event types seen.
# NOTE: event_count counts rows, so a repeated event type on the same
# ticker-date would be counted more than once.
per_ticker_day_aggs = [
    pl.count().alias('event_count'),
    pl.col('event_type').unique().alias('event_types'),
]
df_multi_event = (
    df_all
    .group_by(['ticker', 'date'])
    .agg(per_ticker_day_aggs)
    .sort('event_count', descending=True)
)

print('Multi-event days statistics:')
print()

# How many ticker-dates have 1, 2, 3, ... events.
event_count_dist = (
    df_multi_event
    .group_by('event_count')
    .agg(pl.count().alias('days'))
    .sort('event_count')
)

print(event_count_dist)
print()

# df_multi_event is sorted by event_count descending, so head() gives the busiest ones.
print('Top 10 days with most events:')
print(df_multi_event.head(10))

In [None]:
# Bar chart of the events-per-day distribution computed above.
df_dist_pd = event_count_dist.to_pandas()
events_per_day = df_dist_pd['event_count']
day_totals = df_dist_pd['days']

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(events_per_day, day_totals, color='coral', alpha=0.7)

axis_label_kw = {'fontsize': 12}
ax.set_xlabel('Number of Events per Day', **axis_label_kw)
ax.set_ylabel('Number of Days', **axis_label_kw)
ax.set_title('Distribution: How Many Events Occur on Same Day?', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('multi_event_days_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Plot saved to multi_event_days_distribution.png')

## 8. Sample Data: E3 and E9 with `intraday_confirmed=False`

In [None]:
# Show a sample of each RADAR event table (E3, E9) and verify that every row
# carries intraday_confirmed=False.
radar_samples = {
    'E3': 'Price Spike Intraday',
    'E9': 'Crash Intraday',
}

for position, (event_type, label) in enumerate(radar_samples.items()):
    if position:
        print()  # blank line between the two sections
    print(f'=== {event_type} ({label}) Sample ===')
    print()

    # The file may not have been loaded; report it instead of raising KeyError.
    df_event = dfs.get(event_type)
    if df_event is None:
        print(f'❌ {event_type} events not loaded')
        continue

    print(df_event.head(10))
    print()

    # Verify flag
    if 'intraday_confirmed' in df_event.columns:
        flag_values = df_event['intraday_confirmed'].unique()
        print(f'{event_type} intraday_confirmed unique values: {flag_values}')
        # Compare as a plain list to get a single True/False answer; comparing
        # the Series to a list yields an element-wise Series, not a boolean.
        all_false = flag_values.to_list() == [False]
        print(f'All {event_type} events have intraday_confirmed=False: {all_false}')
    else:
        print(f'❌ {event_type} intraday_confirmed column MISSING')

## 9. Event Co-occurrence Matrix

In [None]:
# Create binary matrix: one row per (ticker, date), one 0/1 column per event_type.
df_pivot_events = (
    df_all
    # Drop repeated (ticker, date, event_type) rows first: the pivot has no
    # aggregation function, so duplicates would either error or produce
    # non-binary cells. maintain_order keeps the column order of the pivot
    # the same as it would be without this step.
    .unique(subset=['ticker', 'date', 'event_type'], maintain_order=True)
    .with_columns([
        pl.lit(1).alias('present')
    ])
    .pivot(index=['ticker', 'date'], columns='event_type', values='present')
    # Event types absent on a given ticker-date come out as null -> 0.
    .fill_null(0)
)

print('Binary event matrix (sample):')
print(df_pivot_events.head(10))

In [None]:
# Every column except the (ticker, date) key is an event indicator.
key_cols = {'ticker', 'date'}
event_cols = [name for name in df_pivot_events.columns if name not in key_cols]

# Pairwise correlation between the event indicator columns, computed in pandas
# with DataFrame.corr() defaults.
df_events_pd = df_pivot_events.select(event_cols).to_pandas()
corr_matrix = df_events_pd.corr()

print('Event co-occurrence correlation matrix:')
print(corr_matrix)

In [None]:
# Annotated heatmap of the event correlation matrix, colour scale centred on 0.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Event Co-occurrence Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('event_cooccurrence_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print('✅ Heatmap saved to event_cooccurrence_matrix.png')

## 10. Summary Report

In [None]:
# Final text report: totals, per-event count checks, flag checks, date span,
# ticker coverage and multi-event statistics.
banner = '=' * 80
print(banner)
print('VALIDATION SUMMARY: EVENT DETECTION E1-E11')
print(banner)
print()

print(f'Total events detected: {sum(len(df) for df in dfs.values()):,}')
print(f'Expected total: {sum(expected_counts.values()):,}')
print()

# Iterate the expected event types (not just the loaded ones) so that a
# missing file shows up as a failure instead of being silently omitted.
print('Event breakdown:')
for event_type, expected in expected_counts.items():
    df = dfs.get(event_type)
    if df is None:
        print(f'  ❌ {event_type}: NOT LOADED (expected: {expected:,})')
        continue
    actual = len(df)
    match = '✅' if actual == expected else '❌'
    print(f'  {match} {event_type}: {actual:,} (expected: {expected:,})')

print()
print('Schema validation:')
for event_type in ['E3', 'E9']:
    df = dfs.get(event_type)
    if df is None:
        print(f'  ❌ {event_type}: events NOT LOADED')
    elif 'intraday_confirmed' not in df.columns:
        print(f'  ❌ {event_type}: intraday_confirmed MISSING')
    else:
        # Pass only when False is the sole value in the column; looking at the
        # first row alone would miss True/null values further down.
        flag_values = df['intraday_confirmed'].unique().to_list()
        if flag_values == [False]:
            print(f'  ✅ {event_type}: intraday_confirmed=False')
        else:
            print(f'  ❌ {event_type}: intraday_confirmed not all False (values: {flag_values})')

print()
print('Date range:')
# Skip empty tables: their min/max are null and cannot be compared to dates.
date_bounds = [(df['date'].min(), df['date'].max()) for df in dfs.values() if len(df)]
if date_bounds:
    min_date = min(lo for lo, _ in date_bounds)
    max_date = max(hi for _, hi in date_bounds)
    print(f'  {min_date} to {max_date}')
else:
    print('  (no events loaded)')

print()
print('Unique tickers with events:')
all_tickers = set()
for df in dfs.values():
    all_tickers.update(df['ticker'].unique().to_list())
print(f'  {len(all_tickers):,} tickers')

print()
print('Multi-event days:')
# df_multi_event has one row per (ticker, date) pair.
multi_event_days = df_multi_event.filter(pl.col('event_count') > 1)
print(f'  {len(multi_event_days):,} days have 2+ events')
max_events = df_multi_event['event_count'].max()
print(f'  Max events on single day: {max_events}')

print()
print(banner)
print('✅ VALIDATION COMPLETED')
print(banner)