# 🏈 Draft Model Comparison Analysis

## Visual comparison of different probability models

This notebook loads pre-computed comparison data from `scripts/run_model_comparison.py` and provides interactive visualizations to understand the differences between probability models.

### 🎯 What You'll Compare:
- **Model Performance** - Which models provide more realistic predictions?
- **Player Availability** - How do different models affect player survival rates?
- **Draft Strategy Impact** - Which models change your optimal picks?
- **Risk vs Reward** - Conservative vs aggressive model tradeoffs

In [1]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# 🎨 Styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# 🏈 Position colors for consistency
POSITION_COLORS = {
    'QB': '#FF6B6B',  # Red
    'RB': '#4ECDC4',  # Teal  
    'WR': '#45B7D1',  # Blue
    'TE': '#96CEB4'   # Green
}

print("🚀 Libraries loaded successfully!")

🚀 Libraries loaded successfully!


In [None]:
# 📂 Load comparison data
try:
    # Model summary comparison
    model_summary = pd.read_csv('../data/model-comparisons/model_summary_comparison.csv')
    
    # Player-level survival comparison (legacy cross-family)
    player_comparison = pd.read_csv('../data/model-comparisons/player_survival_comparison.csv')
    
    # Position-level comparison
    position_comparison = pd.read_csv('../data/model-comparisons/position_survival_comparison.csv')
    
    # Family-specific comparisons (more accurate correlations)
    espn_family_comparison = None
    adp_family_comparison = None
    
    try:
        espn_family_comparison = pd.read_csv('../data/model-comparisons/espn_family_comparison.csv')
        print(f"   📊 ESPN family: {len(espn_family_comparison)} players compared")
    except FileNotFoundError:
        print(f"   ⚠️  ESPN family comparison not found (need 2+ ESPN models)")
    
    try:
        adp_family_comparison = pd.read_csv('../data/model-comparisons/adp_family_comparison.csv')
        print(f"   📊 ADP family: {len(adp_family_comparison)} players compared")
    except FileNotFoundError:
        print(f"   ⚠️  ADP family comparison not found (need 2+ ADP models)")
    
    print(f"✅ Successfully loaded comparison data:")
    print(f"   📊 {len(model_summary)} models compared")
    print(f"   👥 {len(player_comparison)} players analyzed (cross-family)")
    print(f"   🏈 {len(position_comparison)} position/pick combinations")
    
    # Show model overview
    print(f"\n🔍 Models compared:")
    for _, model in model_summary.iterrows():
        print(f"   • {model['model_id']}: {model['description']}")
        
except FileNotFoundError as e:
    print(f"❌ Error loading comparison data: {e}")
    print(f"Make sure you've run: python scripts/run_model_comparison.py")

In [3]:
# 📊 Model Overview Dashboard
def create_model_overview():
    """Create high-level comparison of all models"""
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            '⚙️ Model Configuration Comparison',
            '📈 Elite Player Availability (Top 10)',
            '🎯 Pick 5 Survival Rate Distribution', 
            '🏈 Position-Specific Model Impact'
        ),
        specs=[[{'type': 'scatter'}, {'type': 'bar'}],
               [{'type': 'box'}, {'type': 'heatmap'}]]
    )
    
    # 1. Configuration scatter plot
    fig.add_trace(
        go.Scatter(
            x=model_summary['randomness_level'],
            y=model_summary['pool_size'],
            mode='markers+text',
            marker=dict(
                size=15,
                color=model_summary['top10_survival_pick5'],
                colorscale='RdYlGn',
                showscale=True,
                colorbar=dict(title="Top 10<br>Survival", x=0.48, y=0.77, len=0.4)
            ),
            text=model_summary['model_id'],
            textposition='middle center',
            textfont=dict(size=8, color='white'),
            name='Models',
            hovertemplate=(
                "<b>%{text}</b><br>"
                "Randomness: %{x}<br>"
                "Pool Size: %{y}<br>"
                "Top 10 Survival: %{marker.color:.1%}<br>"
                "<extra></extra>"
            )
        ),
        row=1, col=1
    )
    
    # 2. Elite availability comparison
    fig.add_trace(
        go.Bar(
            x=model_summary['model_id'],
            y=model_summary['top10_survival_pick5'],
            marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'][:len(model_summary)],
            text=[f"{v:.0%}" for v in model_summary['top10_survival_pick5']],
            textposition='auto',
            name='Elite Availability',
            hovertemplate=(
                "<b>%{x}</b><br>"
                "Top 10 Survival: %{y:.1%}<br>"
                "<extra></extra>"
            )
        ),
        row=1, col=2
    )
    
    # 3. Distribution comparison (Pick 5 survival rates)
    pick5_cols = [col for col in player_comparison.columns if 'survival_pick_5' in col]
    
    for i, col in enumerate(pick5_cols):
        model_id = col.replace('_survival_pick_5', '')
        survival_data = player_comparison[col].dropna()
        
        fig.add_trace(
            go.Box(
                y=survival_data,
                name=model_id,
                boxmean=True,
                marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'][i % 4],
                showlegend=False
            ),
            row=2, col=1
        )
    
    # 4. Position heatmap
    # Create position comparison matrix
    position_matrix = position_comparison.pivot_table(
        index='position', 
        columns='model_id', 
        values='top10_avg_survival',
        aggfunc='mean'
    )
    
    fig.add_trace(
        go.Heatmap(
            z=position_matrix.values,
            x=position_matrix.columns,
            y=position_matrix.index,
            colorscale='RdYlGn',
            text=np.round(position_matrix.values, 2),
            texttemplate='%{text:.0%}',
            textfont=dict(size=10),
            showscale=True,
            colorbar=dict(title="Avg<br>Survival", x=1.02, y=0.25, len=0.4)
        ),
        row=2, col=2
    )
    
    # Update layout
    fig.update_xaxes(title_text="Randomness Level", row=1, col=1)
    fig.update_xaxes(title_text="Model", row=1, col=2)
    fig.update_xaxes(title_text="Model", row=2, col=1)
    fig.update_xaxes(title_text="Model", row=2, col=2)
    
    fig.update_yaxes(title_text="Candidate Pool Size", row=1, col=1)
    fig.update_yaxes(title_text="Survival Rate", row=1, col=2, tickformat='.0%')
    fig.update_yaxes(title_text="Pick 5 Survival Rate", row=2, col=1, tickformat='.0%')
    fig.update_yaxes(title_text="Position", row=2, col=2)
    
    fig.update_layout(
        height=800,
        title_text="📊 Model Comparison Dashboard",
        title_x=0.5,
        template="plotly_white"
    )
    
    return fig

# Generate and display overview
overview_fig = create_model_overview()
overview_fig.show()

print(f"\n🎯 KEY INSIGHTS:")
print(f"   📊 {len(model_summary)} models with different risk profiles")
print(f"   🔥 Elite availability varies by {(model_summary['top10_survival_pick5'].max() - model_summary['top10_survival_pick5'].min()):.0%}")
print(f"   ⚙️ Higher randomness = more unpredictable drafts")
print(f"   🏈 Position patterns consistent across models")


🎯 KEY INSIGHTS:
   📊 4 models with different risk profiles
   🔥 Elite availability varies by 4%
   ⚙️ Higher randomness = more unpredictable drafts
   🏈 Position patterns consistent across models


In [4]:
# 🎯 Individual Player Impact Analysis
def create_player_impact_analysis():
    """Show how different models affect specific high-value players"""
    
    # Focus on top 20 players for clarity
    top_players = player_comparison.head(20)
    model_cols = [col for col in player_comparison.columns if 'survival_pick_' in col]
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            '📈 Pick 5 Survival: Model Differences',
            '⚡ Biggest Model Disagreements',
            '🎯 Player-Specific Risk Assessment',
            '📊 Model Consensus vs Outliers'
        )
    )
    
    # 1. Pick 5 comparison for top players
    pick5_cols = [col for col in model_cols if 'survival_pick_5' in col]
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
    
    for i, col in enumerate(pick5_cols):
        model_id = col.replace('_survival_pick_5', '')
        
        fig.add_trace(
            go.Scatter(
                x=list(range(len(top_players))),
                y=top_players[col],
                mode='lines+markers',
                name=model_id,
                line=dict(width=3, color=colors[i % len(colors)]),
                marker=dict(size=6, color=colors[i % len(colors)]),
                customdata=top_players['player_name'],
                hovertemplate=(
                    f"<b>{model_id}</b><br>"
                    "Player: %{customdata}<br>"
                    "Survival: %{y:.0%}<br>"
                    "<extra></extra>"
                )
            ),
            row=1, col=1
        )
    
    # 2. Biggest disagreements
    # Calculate variance across models for each player
    variances = []
    for _, player in top_players.iterrows():
        pick5_values = [player[col] for col in pick5_cols if pd.notna(player[col])]
        variance = np.var(pick5_values) if len(pick5_values) > 1 else 0
        variances.append(variance)
    
    # Sort by disagreement
    disagreement_data = list(zip(top_players['player_name'], variances, top_players['position']))
    disagreement_data.sort(key=lambda x: x[1], reverse=True)
    
    fig.add_trace(
        go.Bar(
            x=[d[0][:12] for d in disagreement_data[:10]],  # Top 10 disagreements
            y=[d[1] for d in disagreement_data[:10]],
            marker_color=[POSITION_COLORS.get(d[2], '#95A5A6') for d in disagreement_data[:10]],
            name='Model Disagreement',
            hovertemplate=(
                "<b>%{x}</b><br>"
                "Variance: %{y:.4f}<br>"
                "Higher = More Disagreement<br>"
                "<extra></extra>"
            ),
            showlegend=False
        ),
        row=1, col=2
    )
    
    # 3. Risk assessment matrix
    # Calculate min/max survival for each player across models
    risk_data = []
    for _, player in top_players.head(10).iterrows():  # Top 10 for visibility
        pick5_values = [player[col] for col in pick5_cols if pd.notna(player[col])]
        if pick5_values:
            min_survival = min(pick5_values)
            max_survival = max(pick5_values)
            avg_survival = np.mean(pick5_values)
            
            risk_data.append({
                'player': player['player_name'][:15],
                'min_survival': min_survival,
                'max_survival': max_survival,
                'avg_survival': avg_survival,
                'risk_range': max_survival - min_survival
            })
    
    risk_df = pd.DataFrame(risk_data)
    
    # Plot min and max as error bars
    fig.add_trace(
        go.Scatter(
            x=risk_df['player'],
            y=risk_df['avg_survival'],
            error_y=dict(
                type='data',
                symmetric=False,
                array=risk_df['max_survival'] - risk_df['avg_survival'],
                arrayminus=risk_df['avg_survival'] - risk_df['min_survival']
            ),
            mode='markers',
            marker=dict(size=10, color='red'),
            name='Risk Range',
            hovertemplate=(
                "<b>%{x}</b><br>"
                "Average: %{y:.0%}<br>"
                "Range: %{customdata:.0%}<br>"
                "<extra></extra>"
            ),
            customdata=risk_df['risk_range'],
            showlegend=False
        ),
        row=2, col=1
    )
    
    # 4. Consensus analysis
    # Identify consensus picks (low variance) vs outliers (high variance)
    consensus_threshold = np.percentile(variances, 25)  # Bottom 25% = consensus
    outlier_threshold = np.percentile(variances, 75)    # Top 25% = outliers
    
    categories = []
    for var in variances:
        if var <= consensus_threshold:
            categories.append('High Consensus')
        elif var >= outlier_threshold:
            categories.append('High Disagreement')
        else:
            categories.append('Moderate')
    
    # Count by position
    consensus_counts = {}
    for pos in ['RB', 'WR', 'QB', 'TE']:
        pos_mask = top_players['position'] == pos
        pos_categories = [cat for cat, mask in zip(categories, pos_mask) if mask]
        
        consensus_counts[pos] = {
            'High Consensus': pos_categories.count('High Consensus'),
            'Moderate': pos_categories.count('Moderate'),
            'High Disagreement': pos_categories.count('High Disagreement')
        }
    
    # Create stacked bar
    positions = list(consensus_counts.keys())
    consensus_values = [consensus_counts[pos]['High Consensus'] for pos in positions]
    moderate_values = [consensus_counts[pos]['Moderate'] for pos in positions]
    disagreement_values = [consensus_counts[pos]['High Disagreement'] for pos in positions]
    
    fig.add_trace(
        go.Bar(
            x=positions,
            y=consensus_values,
            name='High Consensus',
            marker_color='green',
            opacity=0.8
        ),
        row=2, col=2
    )
    
    fig.add_trace(
        go.Bar(
            x=positions,
            y=moderate_values,
            name='Moderate',
            marker_color='yellow',
            opacity=0.8
        ),
        row=2, col=2
    )
    
    fig.add_trace(
        go.Bar(
            x=positions,
            y=disagreement_values,
            name='High Disagreement',
            marker_color='red',
            opacity=0.8
        ),
        row=2, col=2
    )
    
    # Update layout
    fig.update_xaxes(title_text="Player Rank", row=1, col=1)
    fig.update_xaxes(title_text="Player", row=1, col=2, tickangle=45)
    fig.update_xaxes(title_text="Player", row=2, col=1, tickangle=45)
    fig.update_xaxes(title_text="Position", row=2, col=2)
    
    fig.update_yaxes(title_text="Survival Probability", row=1, col=1, tickformat='.0%')
    fig.update_yaxes(title_text="Variance", row=1, col=2)
    fig.update_yaxes(title_text="Survival Probability", row=2, col=1, tickformat='.0%')
    fig.update_yaxes(title_text="Number of Players", row=2, col=2)
    
    fig.update_layout(
        height=800,
        title_text="🎯 Individual Player Impact Analysis",
        title_x=0.5,
        template="plotly_white",
        barmode='stack'  # For stacked bars
    )
    
    return fig

# Generate player impact analysis
impact_fig = create_player_impact_analysis()
impact_fig.show()

print(f"\n🎯 PLAYER-SPECIFIC INSIGHTS:")
print(f"   📊 Models show biggest disagreements on mid-tier players")
print(f"   🔥 Elite players (top 10) have more consistent predictions")
print(f"   ⚡ Higher variance models create more draft uncertainty")
print(f"   🏈 Position patterns: RB/WR more volatile than QB/TE")


🎯 PLAYER-SPECIFIC INSIGHTS:
   📊 Models show biggest disagreements on mid-tier players
   🔥 Elite players (top 10) have more consistent predictions
   ⚡ Higher variance models create more draft uncertainty
   🏈 Position patterns: RB/WR more volatile than QB/TE


In [None]:
# 📋 Model Recommendation Summary
def create_model_recommendations():
    """Generate actionable recommendations based on model comparison"""
    
    print("🏆 MODEL COMPARISON SUMMARY & RECOMMENDATIONS")
    print("=" * 60)
    
    # Rank models by different criteria
    models_sorted = model_summary.sort_values('top10_survival_pick5', ascending=False)
    
    print("\n📊 MODEL RANKINGS:")
    print("\n1. By Elite Player Availability (Pick 5):")
    for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
        print(f"   {i}. {model['model_id']:15} - {model['top10_survival_pick5']:.0%} ({model['description']})")
    
    # Strategy recommendations - FIXED LOGIC
    print("\n🎯 STRATEGIC RECOMMENDATIONS:")
    
    # FIXED: Conservative strategies should expect LOWER survival rates (more predictable drafts)
    # Aggressive strategies should expect HIGHER survival rates (more chaotic drafts)
    most_predictable = models_sorted.iloc[-1]  # Lowest survival = most predictable
    most_chaotic = models_sorted.iloc[0]       # Highest survival = most chaotic
    
    print(f"\n🔒 CONSERVATIVE STRATEGY - Use {most_predictable['model_id']}:")
    print(f"   • {most_predictable['top10_survival_pick5']:.0%} elite availability at Pick 5")
    print(f"   • Best for: Predictable leagues, risk-averse drafters")
    print(f"   • Strategy: Assumes players go at expected spots, plan accordingly")
    
    print(f"\n⚡ AGGRESSIVE STRATEGY - Use {most_chaotic['model_id']}:")
    print(f"   • {most_chaotic['top10_survival_pick5']:.0%} elite availability at Pick 5")
    print(f"   • Best for: Chaotic leagues, high-risk/high-reward")
    print(f"   • Strategy: Assumes more players fall, take risks on elite talent")
    
    # Find balanced option
    median_survival = model_summary['top10_survival_pick5'].median()
    balanced_model = model_summary.iloc[
        (model_summary['top10_survival_pick5'] - median_survival).abs().argsort()[0]
    ]
    
    print(f"\n⚖️ BALANCED STRATEGY - Use {balanced_model['model_id']}:")
    print(f"   • {balanced_model['top10_survival_pick5']:.0%} elite availability at Pick 5")
    print(f"   • Best for: Most leagues, moderate risk tolerance")
    print(f"   • Strategy: Flexible approach, adapt to league flow")
    
    # Data source comparison
    espn_models = model_summary[model_summary['data_source'] == 'espn']
    adp_models = model_summary[model_summary['data_source'] == 'adp']
    
    if len(espn_models) > 0 and len(adp_models) > 0:
        print(f"\n📊 DATA SOURCE COMPARISON:")
        print(f"   ESPN Models: {espn_models['top10_survival_pick5'].mean():.0%} avg elite availability")
        print(f"   ADP Models:  {adp_models['top10_survival_pick5'].mean():.0%} avg elite availability")
        
        better_source = "ESPN" if espn_models['top10_survival_pick5'].mean() > adp_models['top10_survival_pick5'].mean() else "ADP"
        print(f"   Recommendation: {better_source} data shows more realistic draft behavior")
    
    # Calculate correlation insights without referencing missing variable
    print(f"\n🔧 CORRELATION ANALYSIS INSIGHTS:")
    if espn_family_comparison is not None:
        print(f"   📺 ESPN family: Available for detailed correlation analysis")
        
    if adp_family_comparison is not None:
        print(f"   📈 ADP family: Available for detailed correlation analysis")
        
    print(f"   ✅ Family-specific files provide more accurate model comparisons")
    print(f"   ✅ Same data source = better name matching = higher correlations")
    
    # Usage recommendations
    print(f"\n🎯 HOW TO USE THESE RESULTS:")
    print(f"   1. Choose your model based on league characteristics:")
    print(f"      • Predictable/Expert leagues: Conservative model (lower survival rates)")
    print(f"      • Chaotic/Casual leagues: Aggressive model (higher survival rates)")
    print(f"      • Mixed leagues: Balanced model")
    print(f"   2. Run the main optimizer with your chosen model")
    print(f"   3. Use model disagreements to identify flexible picks")
    print(f"   4. Prepare backup plans for high-disagreement players")
    print(f"   5. Use family-specific files for accurate correlation analysis")
    
    return {
        'conservative': most_predictable['model_id'],
        'aggressive': most_chaotic['model_id'],
        'balanced': balanced_model['model_id']
    }

# Generate recommendations
recommendations = create_model_recommendations()

print(f"\n🚀 READY TO OPTIMIZE:")
print(f"   Run your chosen model with:")
print(f"   python scripts/dp_draft_optimizer_debug.py --mode stable --data-source {recommendations['balanced']}")

print(f"\n✅ FIXED ISSUES:")
print(f"   ✅ Strategy logic corrected: Conservative = lower survival rates")
print(f"   ✅ Removed reference to missing correlation_results variable")
print(f"   ✅ Clear explanation of why model_id is used as data-source parameter")