# Data Exploration - Baseball Prediction Models

This notebook explores the downloaded MLB Statcast data for both models:
1. **Contact Prediction Model**: Pitch-level data
2. **Hit Outcome Model**: Batted ball data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# Use seaborn's 'husl' palette as the default color cycle.
# NOTE(review): this runs after the style sheet is applied, presumably so the
# style sheet cannot override the palette — confirm before reordering.
sns.set_palette('husl')

# Paths
def find_project_root(start: Path, marker: Path = Path('data') / 'raw') -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook may be launched from the project root or from a sub-folder
    (e.g. ``notebooks/``), so the data directory is located by walking upward
    instead of assuming a fixed working directory.

    Args:
        start: Directory where the search begins.
        marker: Relative path that must exist as a directory under the root.

    Returns:
        The first matching ancestor (``start`` itself included). If nothing
        matches, ``start`` is returned so paths still resolve relative to the
        working directory, as they did before.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start


# The previous `Path('.').parent` evaluated to `Path('.')` (the parent of '.' is
# '.'), so it never moved up a level; search upward from the working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
PITCH_DATA = PROJECT_ROOT / 'data' / 'raw' / 'pitch_data'
BATTED_DATA = PROJECT_ROOT / 'data' / 'raw' / 'batted_ball_data'

## 1. Load Data

In [None]:
# Read the pitch-level CSV for each season from PITCH_DATA
pitches_2023, pitches_2024 = (
    pd.read_csv(PITCH_DATA / csv_name)
    for csv_name in ('2023_full_pitches.csv', '2024_full_pitches.csv')
)

# Stack both seasons into one frame with a fresh 0..n-1 index
pitches = pd.concat([pitches_2023, pitches_2024], ignore_index=True)
n_pitch_rows, n_pitch_cols = pitches.shape
print(f"Total pitches: {n_pitch_rows:,}")
print(f"Columns: {n_pitch_cols}")

In [None]:
# Read the batted-ball CSV for each season from BATTED_DATA
batted_2023, batted_2024 = (
    pd.read_csv(BATTED_DATA / csv_name)
    for csv_name in ('2023_batted_balls.csv', '2024_batted_balls.csv')
)

# Stack both seasons into one frame with a fresh 0..n-1 index
batted_balls = pd.concat([batted_2023, batted_2024], ignore_index=True)
n_batted_rows = len(batted_balls)
print(f"Total batted balls: {n_batted_rows:,}")

## 2. Contact Prediction Analysis

### 2.1 Pitch Location Heatmaps

In [None]:
# Binary target: 1 when the pitch description is 'hit_into_play', otherwise 0
is_in_play = pitches['description'].eq('hit_into_play')
pitches['contact'] = is_in_play.astype(int)

# Keep only pitches whose plate location lies in the plotting window
# (|plate_x| <= 2 and 0 <= plate_z <= 5); rows with a missing coordinate
# fail `between` and are dropped as well.
x_in_window = pitches['plate_x'].between(-2, 2)
z_in_window = pitches['plate_z'].between(0, 5)
pitch_subset = pitches.loc[x_in_window & z_in_window].copy()

In [None]:
# Side-by-side hexbin density maps of pitch location:
#   left  - every pitch in the plotting window
#   right - only pitches put in play (raw counts, not a rate)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

contact_pitches = pitch_subset[pitch_subset['contact'] == 1]
panels = [
    (pitch_subset, 'YlOrRd', 'Pitch Location Distribution'),
    (contact_pitches, 'Greens', 'Contact Locations'),
]

for ax, (frame, cmap_name, panel_title) in zip(axes, panels):
    hb = ax.hexbin(frame['plate_x'], frame['plate_z'],
                   gridsize=30, cmap=cmap_name, mincnt=1)
    ax.set_xlabel('Horizontal Position (ft)')
    ax.set_ylabel('Vertical Position (ft)')
    ax.set_title(panel_title)
    # Reference rectangle for the strike zone, drawn identically on both panels
    ax.add_patch(plt.Rectangle((-0.83, 1.5), 1.66, 2, fill=False, edgecolor='blue', linewidth=2))
    plt.colorbar(hb, ax=ax, label='Count')

plt.tight_layout()
plt.show()

### 2.2 Contact Rate by Zone

In [None]:
# Contact rate by Statcast zone (1-9 inside the strike zone, 11-14 outside;
# there is no zone 10). Pitches with a missing zone are dropped by groupby.
zone_contact = pitches.groupby('zone').agg(
    total=('contact', 'count'),
    contacts=('contact', 'sum')
).reset_index()
zone_contact['rate'] = zone_contact['contacts'] / zone_contact['total']

# `zone` is parsed as float whenever the column contains missing values, which
# would label the bars "1.0", "2.0", ...; cast to int for clean tick labels.
zone_labels = zone_contact['zone'].astype(int).astype(str)

# Green for strike-zone cells (1-9), red for everything else
bar_colors = np.where(zone_contact['zone'].between(1, 9), '#22c55e', '#ef4444').tolist()

# Overall contact rate across all pitches, used for the reference line
overall_rate = pitches['contact'].mean()

# Plot
fig, ax = plt.subplots(figsize=(12, 5))
bars = ax.bar(zone_labels, zone_contact['rate'])

# Color by in-zone vs out-of-zone
for bar, bar_color in zip(bars, bar_colors):
    bar.set_color(bar_color)

ax.set_xlabel('Zone')
ax.set_ylabel('Contact Rate')
ax.set_title('Contact Rate by Zone (Green=Strike Zone, Red=Out of Zone)')
ax.axhline(y=overall_rate, color='black', linestyle='--', label=f'Average: {overall_rate:.2%}')
ax.legend()
plt.show()

### 2.3 Velocity by Pitch Type

In [None]:
# Release-speed box plot restricted to a fixed list of pitch-type codes;
# the same list also fixes the left-to-right order of the boxes.
main_pitch_types = ['FF', 'SI', 'FC', 'SL', 'CU', 'CH', 'FS']
is_main_type = pitches['pitch_type'].isin(main_pitch_types)
pitch_type_data = pitches[is_main_type]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    x='pitch_type',
    y='release_speed',
    data=pitch_type_data,
    order=main_pitch_types,
    ax=ax,
)
ax.set_xlabel('Pitch Type')
ax.set_ylabel('Velocity (mph)')
ax.set_title('Velocity Distribution by Pitch Type')
plt.show()

### 2.4 Count Effects on Contact

In [None]:
# Contact rate for every (balls, strikes) combination
contact_by_count = pitches.groupby(['balls', 'strikes'])['contact']
count_contact = contact_by_count.agg(total='count', contacts='sum').reset_index()
count_contact['rate'] = count_contact['contacts'] / count_contact['total']

# Reshape to a strikes x balls grid so it can be drawn as a heatmap
count_pivot = count_contact.pivot(index='strikes', columns='balls', values='rate')

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(count_pivot, annot=True, fmt='.2%', cmap='RdYlGn', ax=ax)
ax.set_xlabel('Balls')
ax.set_ylabel('Strikes')
ax.set_title('Contact Rate by Count')
plt.show()

### 2.5 Class Imbalance Check

In [None]:
# Share of pitches put in play, plus the ten most frequent pitch descriptions
contact_dist = pitches['description'].value_counts()
contact_rate = pitches['description'].eq('hit_into_play').mean()

report_lines = (
    f"Contact Rate: {contact_rate:.2%}",
    "Expected: ~17% (balls put in play)",
    "\nPitch outcome distribution:",
    contact_dist.head(10),
)
for line in report_lines:
    print(line)

## 3. Hit Outcome Analysis

### 3.1 Exit Velocity vs Launch Angle

In [None]:
# Collapse raw `events` values into five plotting classes: each hit type maps
# to itself and every listed out-type maps to 'out'. Events not listed here
# map to NaN and are excluded from `batted_valid`.
hit_events = ('single', 'double', 'triple', 'home_run')
out_events = (
    'field_out',
    'force_out',
    'grounded_into_double_play',
    'double_play',
    'fielders_choice_out',
    'sac_fly',
    'sac_bunt',
)
event_mapping = {
    **{hit: hit for hit in hit_events},
    **dict.fromkeys(out_events, 'out'),
}
batted_balls['outcome'] = batted_balls['events'].map(event_mapping)
has_outcome = batted_balls['outcome'].notna()
batted_valid = batted_balls[has_outcome].copy()

In [None]:
# EV vs LA scatter plot, one color per outcome class
fig, ax = plt.subplots(figsize=(12, 8))

# Outcome -> color lookup; later cells reuse this dict for consistent coloring
colors = {
    'out': '#6b7280',
    'single': '#60a5fa',
    'double': '#3b82f6',
    'triple': '#8b5cf6',
    'home_run': '#f97316'
}

# Sample for visualization. A fixed seed keeps the figure identical across
# Restart & Run All; without it every run draws a different subset.
sample = batted_valid.sample(min(50000, len(batted_valid)), random_state=42)

for outcome in ['out', 'single', 'double', 'triple', 'home_run']:
    data = sample[sample['outcome'] == outcome]
    ax.scatter(data['launch_angle'], data['launch_speed'],
               c=colors[outcome], label=outcome.title(), alpha=0.3, s=10)

# Barrel zone guides: 98 mph exit velocity and the 26-30 degree launch-angle band
ax.axhline(y=98, color='red', linestyle='--', alpha=0.7)
ax.axvline(x=26, color='red', linestyle='--', alpha=0.7)
ax.axvline(x=30, color='red', linestyle='--', alpha=0.7)
ax.text(28, 105, 'BARREL\nZONE', ha='center', color='red', fontweight='bold')

ax.set_xlabel('Launch Angle (degrees)')
ax.set_ylabel('Exit Velocity (mph)')
ax.set_title('Exit Velocity vs Launch Angle by Outcome')
ax.legend()
ax.set_xlim(-50, 70)
ax.set_ylim(40, 120)
plt.show()

### 3.2 Outcome Distribution

In [None]:
# Outcome distribution: counts and percentage share of each outcome class
outcome_dist = batted_valid['outcome'].value_counts()
outcome_pct = (outcome_dist / outcome_dist.sum() * 100).round(2)

print("Outcome Distribution:")
for outcome, count in outcome_dist.items():
    print(f"  {outcome}: {count:,} ({outcome_pct[outcome]:.1f}%)")

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
# The fallback must be a valid matplotlib color; the previous '#gray' is not a
# hex code and would raise if an outcome were ever missing from `colors`.
bar_colors = [colors.get(o, 'gray') for o in outcome_dist.index]
bars = ax.bar(outcome_dist.index, outcome_dist.values, color=bar_colors)
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
ax.set_title('Hit Outcome Distribution (Note: Triples are very rare)')

# Add percentages just above each bar. The offset is given in points, so the
# gap looks the same regardless of how many rows were loaded (the previous
# fixed +1000 in data units only suited one data size).
for bar, pct in zip(bars, outcome_pct.values):
    ax.annotate(f'{pct:.1f}%',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom', fontsize=10)

plt.show()

### 3.3 Batted Ball Types

In [None]:
# Batted ball type distribution (left) and outcome mix within each type (right)
bb_type_dist = batted_balls['bb_type'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Type distribution
ax = axes[0]
bb_type_dist.plot(kind='bar', ax=ax, color=['#22c55e', '#3b82f6', '#f97316', '#ef4444'])
ax.set_xlabel('Batted Ball Type')
ax.set_ylabel('Count')
ax.set_title('Batted Ball Type Distribution')
ax.tick_params(axis='x', rotation=45)

# Outcome by type, as a row-normalised percentage
ax = axes[1]
outcome_order = ['out', 'single', 'double', 'triple', 'home_run']
outcome_by_type = batted_valid.groupby(['bb_type', 'outcome']).size().unstack(fill_value=0)
outcome_by_type_pct = outcome_by_type.div(outcome_by_type.sum(axis=1), axis=0) * 100
# reindex instead of plain column selection: an outcome class that is absent
# from the data (e.g. no triples in a small subset) becomes a 0% column rather
# than raising KeyError.
outcome_by_type_pct.reindex(columns=outcome_order, fill_value=0).plot(
    kind='bar', stacked=True, ax=ax,
    color=[colors[c] for c in outcome_order]
)
ax.set_xlabel('Batted Ball Type')
ax.set_ylabel('Percentage')
ax.set_title('Outcome Distribution by Batted Ball Type')
ax.legend(title='Outcome', bbox_to_anchor=(1.02, 1))
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 3.4 Spray Angle Analysis

In [None]:
# Calculate spray angle from hit coordinates.
# Statcast hc_x / hc_y are image-style coordinates: home plate sits at roughly
# (125.42, 198.27) and hc_y DEcreases as the ball travels away from the plate.
# The angle must therefore be measured from home plate with the y axis flipped.
# The previous version used (hc_x - 125.42, hc_y), i.e. an origin at y = 0 and
# no flip, which distorts every angle.
HOME_PLATE_X = 125.42
HOME_PLATE_Y = 198.27
center_x = HOME_PLATE_X  # kept under its original name for any later cells

# 0 degrees = straight up the middle; negative = left-field side,
# positive = right-field side.
batted_valid['spray_angle'] = np.degrees(
    np.arctan2(
        batted_valid['hc_x'] - HOME_PLATE_X,
        HOME_PLATE_Y - batted_valid['hc_y'],
    )
)

# Spray angle distribution by outcome
fig, ax = plt.subplots(figsize=(12, 6))
for outcome in ['out', 'single', 'double', 'triple', 'home_run']:
    data = batted_valid[batted_valid['outcome'] == outcome]['spray_angle']
    ax.hist(data, bins=50, alpha=0.5, label=outcome.title(), color=colors[outcome])

ax.set_xlabel('Spray Angle (degrees)')
ax.set_ylabel('Count')
# Pull/opposite depends on batter handedness, which is not applied here, so the
# label names field sides instead.
ax.set_title('Spray Angle Distribution by Outcome\n(Negative = Left Field, Positive = Right Field)')
ax.legend()
plt.show()

## 4. Feature Correlations

In [None]:
# Pairwise correlations between the contact-model inputs and the binary
# `contact` target
location_cols = ['plate_x', 'plate_z']
pitch_cols = ['release_speed', 'pfx_x', 'pfx_z']
count_cols = ['balls', 'strikes']
contact_features = [*location_cols, *pitch_cols, *count_cols, 'contact']
contact_corr = pitches.loc[:, contact_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(contact_corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Feature Correlations (Contact Model)')
plt.show()

In [None]:
# Pairwise correlations between batted-ball features and an ordinal encoding
# of the outcome (out=0, single=1, double=2, triple=3, home_run=4)
outcome_features = ['launch_speed', 'launch_angle', 'spray_angle',
                    'hc_x', 'hc_y']
outcome_map = {
    name: rank
    for rank, name in enumerate(['out', 'single', 'double', 'triple', 'home_run'])
}
batted_valid['outcome_num'] = batted_valid['outcome'].map(outcome_map)

corr_columns = [*outcome_features, 'outcome_num']
outcome_corr = batted_valid.loc[:, corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(outcome_corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Feature Correlations (Outcome Model)')
plt.show()

## 5. Key Insights Summary

### Contact Prediction Model
1. **Contact rate** is approximately 17% (balls put in play vs all pitch outcomes)
2. **Zone matters**: Strike zone pitches have higher contact rates
3. **Count effects**: Hitter's counts (3-1, 2-0) show different contact patterns
4. **Pitch type**: Fastballs have different contact rates than breaking balls

### Hit Outcome Model
1. **Class imbalance**: Outs (~70%), Singles (~15%), Doubles (~8%), HRs (~5%), Triples (<2%)
2. **Exit velocity is crucial**: 98+ mph defines "barrel" territory
3. **Launch angle sweet spot**: 26-30° for home runs (the barrel window at 98 mph, which widens as exit velocity increases), 8-32° for all good contact
4. **Triples are rare**: Will need class weighting or SMOTE
5. **Batted ball type**: Line drives have best hit outcomes

In [None]:
# Final record counts for both datasets
n_pitch_records = len(pitches)
n_batted_records = len(batted_balls)

print("EDA Complete!")
print(f"\nPitch Data: {n_pitch_records:,} records")
print(f"Batted Ball Data: {n_batted_records:,} records")