# MLB Moneyline Regression Analysis

This notebook uses the mlbgame API to predict MLB moneylines from current standings and starting pitcher ERA.

**Mission:** Predict Moneylines from current standings and starting pitcher ERA

**Complexity:** Small

**Backtested:** No

**Chances of beating the books:** Low

## API Overview
Using mlbgame API which provides:
- Real-time game data from MLB GameDay
- Team standings via `mlbgame.standings()`
- Game schedules via `mlbgame.games()` and `mlbgame.day()`
- Player stats via `mlbgame.player_stats()`
- Box scores via `mlbgame.box_score()`

In [13]:
TEST_PRINT = False
def print_test(*args, **kwargs):
    if TEST_PRINT:
        print(*args, **kwargs)

In [16]:
# Import required libraries
import mlbgame
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette('husl')

print(f"mlbgame version: {mlbgame.VERSION}")
print("Libraries imported successfully")

mlbgame version: 2.5.0
Libraries imported successfully


## Data Collection

First, let's collect current MLB team standings and recent game data to build our features.
We'll use the mlbgame API functions as documented in the wiki.

In [20]:
TEST_PRINT = True  # Set to True to enable debug prints
def get_current_standings():
    """
    Get current MLB team standings using mlbgame.standings()
    Returns a pandas DataFrame with team standings data
    """
    try:
        print("Fetching current MLB standings...")
        
        # Get current standings - mlbgame.standings() returns current standings by default
        standings_obj = mlbgame.standings(date=datetime.now())
        print_test(standings_obj)
        
        # Extract team data from standings object
        # The standings object contains divisions with teams
        team_data = []
        
        # Iterate through divisions (AL/NL East, Central, West)
        for division_name, division_teams in standings_obj.divisions.items():
            print(f"Processing {division_name}...")
            
            for team in division_teams:
                # Extract relevant team statistics
                team_info = {
                    'team_name': team.team_name,
                    'team_id': team.team_id,
                    'division': division_name,
                    'wins': team.w,
                    'losses': team.l,
                    'win_pct': team.w / (team.w + team.l) if (team.w + team.l) > 0 else 0.5,
                    'games_back': team.gb,
                    'runs_scored': getattr(team, 'rs', 0),  # Runs scored (if available)
                    'runs_allowed': getattr(team, 'ra', 0),  # Runs allowed (if available)
                    'run_differential': getattr(team, 'rs', 0) - getattr(team, 'ra', 0)
                }
                team_data.append(team_info)
        
        standings_df = pd.DataFrame(team_data)
        print(f"Successfully collected standings for {len(standings_df)} teams")
        return standings_df
        
    except Exception as e:
        print(f"Error getting standings: {e}")
        print("This might be due to off-season or API changes")
        
        # Return mock data for development/testing
        print("Returning mock standings data for development...")
        mock_teams = [
            {'team_name': 'Yankees', 'team_id': 'NYY', 'wins': 95, 'losses': 67, 'win_pct': 0.586, 'run_differential': 150},
            {'team_name': 'Red Sox', 'team_id': 'BOS', 'wins': 92, 'losses': 70, 'win_pct': 0.568, 'run_differential': 80},
            {'team_name': 'Dodgers', 'team_id': 'LAD', 'wins': 100, 'losses': 62, 'win_pct': 0.617, 'run_differential': 200},
            {'team_name': 'Astros', 'team_id': 'HOU', 'wins': 98, 'losses': 64, 'win_pct': 0.605, 'run_differential': 180}
        ]
        return pd.DataFrame(mock_teams)

# Get current standings
standings_df = get_current_standings()
print("\nStandings Data Preview:")
print(standings_df.head())
print(f"\nColumns: {list(standings_df.columns)}")

Fetching current MLB standings...
Error getting standings: 'NoneType' object has no attribute 'read'
This might be due to off-season or API changes
Returning mock standings data for development...

Standings Data Preview:
  team_name team_id  wins  losses  win_pct  run_differential
0   Yankees     NYY    95      67    0.586               150
1   Red Sox     BOS    92      70    0.568                80
2   Dodgers     LAD   100      62    0.617               200
3    Astros     HOU    98      64    0.605               180

Columns: ['team_name', 'team_id', 'wins', 'losses', 'win_pct', 'run_differential']


In [3]:
def get_recent_games_data(days_back=14, year=2024):
    """
    Get recent MLB games data using mlbgame.games() and mlbgame.day()
    
    Args:
        days_back (int): Number of days back to collect data
        year (int): Year to collect data from
    
    Returns:
        pandas.DataFrame: DataFrame containing game results with team info
    """
    try:
        print(f"Collecting game data for the last {days_back} days...")
        
        games_data = []
        current_date = datetime.now()
        
        # Collect games for the specified date range
        for i in range(days_back):
            target_date = current_date - timedelta(days=i)
            month = target_date.month
            day = target_date.day
            
            try:
                print(f"Fetching games for {year}-{month:02d}-{day:02d}...")
                
                # Get games for specific day using mlbgame.day()
                daily_games = mlbgame.day(year, month, day)
                
                if daily_games:
                    print(f"Found {len(daily_games)} games on {month}/{day}")
                    
                    for game in daily_games:
                        # Only process completed games
                        if hasattr(game, 'game_status') and game.game_status == 'Final':
                            
                            # Extract game information
                            game_info = {
                                'game_id': game.game_id,
                                'date': f"{year}-{month:02d}-{day:02d}",
                                'home_team': game.home_team,
                                'away_team': game.away_team,
                                'home_score': int(game.home_team_runs),
                                'away_score': int(game.away_team_runs),
                                'home_win': 1 if int(game.home_team_runs) > int(game.away_team_runs) else 0,
                                'winning_pitcher': getattr(game, 'w_pitcher', 'Unknown'),
                                'losing_pitcher': getattr(game, 'l_pitcher', 'Unknown'),
                                'game_status': game.game_status
                            }
                            games_data.append(game_info)
                
            except Exception as day_error:
                print(f"No games or error for {month}/{day}: {day_error}")
                continue
        
        if games_data:
            games_df = pd.DataFrame(games_data)
            print(f"\nSuccessfully collected {len(games_df)} completed games")
            return games_df
        else:
            print("No completed games found in the specified date range")
            print("This might be due to off-season or recent date range")
            
            # Return mock game data for development
            print("Returning mock game data for development...")
            mock_games = [
                {'game_id': '001', 'home_team': 'Yankees', 'away_team': 'Red Sox', 'home_score': 7, 'away_score': 4, 'home_win': 1},
                {'game_id': '002', 'home_team': 'Dodgers', 'away_team': 'Giants', 'home_score': 3, 'away_score': 8, 'home_win': 0},
                {'game_id': '003', 'home_team': 'Astros', 'away_team': 'Rangers', 'home_score': 9, 'away_score': 2, 'home_win': 1},
                {'game_id': '004', 'home_team': 'Red Sox', 'away_team': 'Yankees', 'home_score': 5, 'away_score': 3, 'home_win': 1},
                {'game_id': '005', 'home_team': 'Giants', 'away_team': 'Dodgers', 'home_score': 6, 'away_score': 4, 'home_win': 1}
            ]
            return pd.DataFrame(mock_games)
            
    except Exception as e:
        print(f"Error collecting games data: {e}")
        return None

# Collect recent games data
games_df = get_recent_games_data(days_back=14, year=2024)

if games_df is not None:
    print("\nRecent Games Data Preview:")
    print(games_df.head())
    print(f"\nColumns: {list(games_df.columns)}")
    print(f"Home team win rate: {games_df['home_win'].mean():.3f}")
else:
    print("Failed to collect games data")

Collecting game data for the last 14 days...
Error collecting games data: name 'datetime' is not defined
Failed to collect games data


In [4]:
def get_pitcher_stats_for_games(games_df):
    """
    Get pitcher statistics for games using mlbgame.player_stats()
    This will help us get actual pitcher ERAs instead of using placeholders
    
    Args:
        games_df (pd.DataFrame): DataFrame containing game information
    
    Returns:
        dict: Dictionary mapping game_id to pitcher stats
    """
    pitcher_data = {}
    
    if games_df is None or len(games_df) == 0:
        print("No games data provided for pitcher stats")
        return pitcher_data
    
    print(f"Collecting pitcher stats for {len(games_df)} games...")
    
    # Sample a few games to avoid overwhelming the API
    sample_games = games_df.head(3) if len(games_df) > 3 else games_df
    
    for idx, game in sample_games.iterrows():
        try:
            game_id = game['game_id']
            print(f"Getting pitcher stats for game {game_id}...")
            
            # Get player stats for the game using mlbgame.player_stats()
            stats = mlbgame.player_stats(game_id)
            
            # Extract starting pitcher information
            game_pitcher_info = {
                'home_starting_pitcher': 'Unknown',
                'away_starting_pitcher': 'Unknown',
                'home_pitcher_era': 4.00,  # Default ERA
                'away_pitcher_era': 4.00   # Default ERA
            }
            
            # Process home team pitching stats
            if hasattr(stats, 'home_pitching') and stats.home_pitching:
                # Usually the first pitcher in the list is the starter
                starter = stats.home_pitching[0]
                game_pitcher_info['home_starting_pitcher'] = starter.name_display_first_last
                game_pitcher_info['home_pitcher_era'] = float(starter.era) if hasattr(starter, 'era') and starter.era else 4.00
            
            # Process away team pitching stats
            if hasattr(stats, 'away_pitching') and stats.away_pitching:
                starter = stats.away_pitching[0]
                game_pitcher_info['away_starting_pitcher'] = starter.name_display_first_last
                game_pitcher_info['away_pitcher_era'] = float(starter.era) if hasattr(starter, 'era') and starter.era else 4.00
            
            pitcher_data[game_id] = game_pitcher_info
            print(f"  Home starter: {game_pitcher_info['home_starting_pitcher']} (ERA: {game_pitcher_info['home_pitcher_era']})")
            print(f"  Away starter: {game_pitcher_info['away_starting_pitcher']} (ERA: {game_pitcher_info['away_pitcher_era']})")
            
        except Exception as e:
            print(f"Error getting pitcher stats for game {game_id}: {e}")
            # Use default values if we can't get pitcher stats
            pitcher_data[game_id] = {
                'home_starting_pitcher': 'Unknown',
                'away_starting_pitcher': 'Unknown',
                'home_pitcher_era': 4.00,
                'away_pitcher_era': 4.00
            }
            continue
    
    # Fill in missing games with default pitcher data
    for _, game in games_df.iterrows():
        game_id = game['game_id']
        if game_id not in pitcher_data:
            pitcher_data[game_id] = {
                'home_starting_pitcher': 'Unknown',
                'away_starting_pitcher': 'Unknown',
                'home_pitcher_era': 4.00,
                'away_pitcher_era': 4.00
            }
    
    print(f"Collected pitcher data for {len(pitcher_data)} games")
    return pitcher_data

# Get pitcher statistics for our games
if games_df is not None:
    pitcher_stats = get_pitcher_stats_for_games(games_df)
    print(f"\nPitcher stats collected for {len(pitcher_stats)} games")
    
    # Show sample pitcher data
    if pitcher_stats:
        sample_game_id = list(pitcher_stats.keys())[0]
        print(f"\nSample pitcher data for game {sample_game_id}:")
        for key, value in pitcher_stats[sample_game_id].items():
            print(f"  {key}: {value}")
else:
    pitcher_stats = {}
    print("No games data available for pitcher stats")

No games data available for pitcher stats


## Feature Engineering

Now let's create features for our moneyline prediction model using:
- Team standings data (win percentage, run differential)
- Starting pitcher ERA data
- Home field advantage

We'll combine all the data sources to create a comprehensive feature set.

In [5]:
def create_comprehensive_features(games_df, standings_df, pitcher_stats):
    """
    Create comprehensive features for moneyline prediction using all available data sources
    
    Args:
        games_df (pd.DataFrame): Game results data
        standings_df (pd.DataFrame): Team standings data
        pitcher_stats (dict): Pitcher statistics by game_id
    
    Returns:
        pd.DataFrame: Feature matrix for machine learning
    """
    if games_df is None or standings_df is None:
        print("Missing required data for feature creation")
        return None
    
    print("Creating comprehensive features for moneyline prediction...")
    
    # Create a lookup dictionary for team stats
    # Handle both team_name and team_id as keys for flexibility
    team_lookup = {}
    for _, team in standings_df.iterrows():
        team_key = team.get('team_name', team.get('team_id', 'Unknown'))
        team_lookup[team_key] = {
            'win_pct': team.get('win_pct', 0.5),
            'wins': team.get('wins', 81),
            'losses': team.get('losses', 81),
            'run_differential': team.get('run_differential', 0),
            'runs_scored': team.get('runs_scored', 700),
            'runs_allowed': team.get('runs_allowed', 700)
        }
    
    print(f"Created team lookup for {len(team_lookup)} teams")
    
    features_list = []
    
    for idx, game in games_df.iterrows():
        game_id = game['game_id']
        home_team = game['home_team']
        away_team = game['away_team']
        
        # Get team statistics with defaults for unknown teams
        home_stats = team_lookup.get(home_team, {
            'win_pct': 0.5, 'wins': 81, 'losses': 81, 'run_differential': 0,
            'runs_scored': 700, 'runs_allowed': 700
        })
        
        away_stats = team_lookup.get(away_team, {
            'win_pct': 0.5, 'wins': 81, 'losses': 81, 'run_differential': 0,
            'runs_scored': 700, 'runs_allowed': 700
        })
        
        # Get pitcher statistics with defaults
        pitcher_info = pitcher_stats.get(game_id, {
            'home_pitcher_era': 4.00,
            'away_pitcher_era': 4.00,
            'home_starting_pitcher': 'Unknown',
            'away_starting_pitcher': 'Unknown'
        })
        
        # Create comprehensive feature vector
        feature_row = {
            # Game metadata
            'game_id': game_id,
            'date': game.get('date', 'Unknown'),
            'home_team': home_team,
            'away_team': away_team,
            
            # Team performance features (most important for moneyline prediction)
            'home_win_pct': home_stats['win_pct'],
            'away_win_pct': away_stats['win_pct'],
            'win_pct_diff': home_stats['win_pct'] - away_stats['win_pct'],  # Positive favors home
            
            # Run differential features (key predictor of team strength)
            'home_run_diff': home_stats['run_differential'],
            'away_run_diff': away_stats['run_differential'],
            'run_diff_advantage': home_stats['run_differential'] - away_stats['run_differential'],
            
            # Pitcher features (crucial for game-specific predictions)
            'home_pitcher_era': pitcher_info['home_pitcher_era'],
            'away_pitcher_era': pitcher_info['away_pitcher_era'],
            'era_advantage': pitcher_info['away_pitcher_era'] - pitcher_info['home_pitcher_era'],  # Positive favors home
            
            # Home field advantage (constant but statistically significant)
            'home_field_advantage': 1,
            
            # Derived composite features
            'team_strength_diff': (home_stats['win_pct'] + home_stats['run_differential']/1000) - (away_stats['win_pct'] + away_stats['run_differential']/1000),
            
            # Target variable
            'home_win': game['home_win']
        }
        
        features_list.append(feature_row)
    
    features_df = pd.DataFrame(features_list)
    
    print(f"Created features for {len(features_df)} games")
    print(f"Feature columns ({len(features_df.columns)}): {list(features_df.columns)}")
    
    return features_df

# Create comprehensive features
if games_df is not None and standings_df is not None:
    features_df = create_comprehensive_features(games_df, standings_df, pitcher_stats)
    
    if features_df is not None:
        print("\nFeature Engineering Summary:")
        print(f"Total games: {len(features_df)}")
        print(f"Total features: {len(features_df.columns) - 5}")  # Subtract metadata columns
        print(f"Home team win rate: {features_df['home_win'].mean():.3f}")
        
        print("\nFeature Statistics:")
        numeric_features = ['home_win_pct', 'away_win_pct', 'win_pct_diff', 'run_diff_advantage', 
                          'home_pitcher_era', 'away_pitcher_era', 'era_advantage']
        
        for feature in numeric_features:
            if feature in features_df.columns:
                mean_val = features_df[feature].mean()
                std_val = features_df[feature].std()
                print(f"  {feature}: mean={mean_val:.3f}, std={std_val:.3f}")
        
        print("\nSample of created features:")
        display_cols = ['home_team', 'away_team', 'home_win_pct', 'away_win_pct', 
                       'home_pitcher_era', 'away_pitcher_era', 'home_win']
        available_cols = [col for col in display_cols if col in features_df.columns]
        print(features_df[available_cols].head())
    else:
        print("Failed to create features")
else:
    print("Missing required data for feature creation")
    features_df = None

Missing required data for feature creation


## Model Training and Evaluation

Now let's train machine learning models to predict the probability of home team wins.
We'll use both Linear Regression and Random Forest to compare performance.

In [6]:
def train_moneyline_prediction_models(features_df):
    """
    Train machine learning models to predict home team win probability
    
    Args:
        features_df (pd.DataFrame): Feature matrix with target variable
    
    Returns:
        tuple: (models_dict, results_dict, feature_importance)
    """
    if features_df is None or len(features_df) < 3:
        print("Insufficient data for model training")
        return None, None, None
    
    print(f"Training models on {len(features_df)} games...")
    
    # Define feature columns (exclude metadata and target)
    feature_columns = [
        'home_win_pct', 'away_win_pct', 'win_pct_diff',
        'home_run_diff', 'away_run_diff', 'run_diff_advantage',
        'home_pitcher_era', 'away_pitcher_era', 'era_advantage',
        'home_field_advantage', 'team_strength_diff'
    ]
    
    # Filter to only include columns that exist in the dataframe
    available_features = [col for col in feature_columns if col in features_df.columns]
    print(f"Using {len(available_features)} features: {available_features}")
    
    # Prepare feature matrix and target
    X = features_df[available_features]
    y = features_df['home_win']
    
    # Handle case where we have very few samples
    if len(features_df) < 5:
        print("Very small dataset - using simple train/test split")
        test_size = 0.2 if len(features_df) > 2 else 1
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    print(f"Training set: {len(X_train)} games, Test set: {len(X_test)} games")
    
    # Initialize models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42, max_depth=3)
    }
    
    trained_models = {}
    results = {}
    feature_importance = {}
    
    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        
        try:
            # Train the model
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test) if len(X_test) > 0 else y_pred_train
            
            # Clip predictions to [0, 1] range for probability interpretation
            y_pred_train = np.clip(y_pred_train, 0, 1)
            y_pred_test = np.clip(y_pred_test, 0, 1)
            
            # Calculate metrics
            train_mse = mean_squared_error(y_train, y_pred_train)
            train_mae = mean_absolute_error(y_train, y_pred_train)
            train_r2 = r2_score(y_train, y_pred_train)
            
            if len(X_test) > 0:
                test_mse = mean_squared_error(y_test, y_pred_test)
                test_mae = mean_absolute_error(y_test, y_pred_test)
                test_r2 = r2_score(y_test, y_pred_test)
            else:
                test_mse = train_mse
                test_mae = train_mae
                test_r2 = train_r2
            
            # Store results
            results[model_name] = {
                'train_mse': train_mse,
                'train_mae': train_mae,
                'train_r2': train_r2,
                'test_mse': test_mse,
                'test_mae': test_mae,
                'test_r2': test_r2,
                'predictions_train': y_pred_train,
                'predictions_test': y_pred_test
            }
            
            trained_models[model_name] = model
            
            # Get feature importance for Random Forest
            if hasattr(model, 'feature_importances_'):
                importance_dict = dict(zip(available_features, model.feature_importances_))
                feature_importance[model_name] = importance_dict
                
                print(f"  Feature Importance (top 5):")
                sorted_features = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
                for feature, importance in sorted_features[:5]:
                    print(f"    {feature}: {importance:.3f}")
            
            print(f"  Train R²: {train_r2:.3f}, Test R²: {test_r2:.3f}")
            print(f"  Train MAE: {train_mae:.3f}, Test MAE: {test_mae:.3f}")
            
        except Exception as e:
            print(f"Error training {model_name}: {e}")
            continue
    
    return trained_models, results, feature_importance

# Train models if we have features
if features_df is not None:
    models, model_results, feature_imp = train_moneyline_prediction_models(features_df)
    
    if models and model_results:
        print("\n" + "="*50)
        print("MODEL TRAINING RESULTS")
        print("="*50)
        
        for model_name, metrics in model_results.items():
            print(f"\n{model_name}:")
            print(f"  Training R²: {metrics['train_r2']:.3f}")
            print(f"  Test R²: {metrics['test_r2']:.3f}")
            print(f"  Training MAE: {metrics['train_mae']:.3f}")
            print(f"  Test MAE: {metrics['test_mae']:.3f}")
    else:
        print("Model training failed")
else:
    print("No features available for model training")
    models, model_results, feature_imp = None, None, None

No features available for model training


## Visualization and Analysis

Let's create visualizations to understand our model performance and feature relationships.

In [7]:
def create_analysis_visualizations(features_df, model_results, feature_importance):
    """
    Create visualizations for model analysis and feature relationships
    
    Args:
        features_df (pd.DataFrame): Feature matrix
        model_results (dict): Model performance results
        feature_importance (dict): Feature importance from models
    """
    if features_df is None:
        print("No data available for visualization")
        return
    
    print("Creating analysis visualizations...")
    
    # Set up the plotting area
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('MLB Moneyline Prediction Analysis', fontsize=16, fontweight='bold')
    
    # 1. Feature correlation heatmap
    numeric_features = ['home_win_pct', 'away_win_pct', 'win_pct_diff', 'run_diff_advantage', 
                       'home_pitcher_era', 'away_pitcher_era', 'era_advantage', 'home_win']
    available_numeric = [col for col in numeric_features if col in features_df.columns]
    
    if len(available_numeric) > 1:
        corr_matrix = features_df[available_numeric].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
                   square=True, ax=axes[0,0], fmt='.2f')
        axes[0,0].set_title('Feature Correlation Matrix')
        axes[0,0].tick_params(axis='x', rotation=45)
        axes[0,0].tick_params(axis='y', rotation=0)
    else:
        axes[0,0].text(0.5, 0.5, 'Insufficient numeric features\nfor correlation analysis', 
                      ha='center', va='center', transform=axes[0,0].transAxes)
        axes[0,0].set_title('Feature Correlation Matrix')
    
    # 2. Win percentage vs Home wins
    if 'win_pct_diff' in features_df.columns:
        win_pct_diff = features_df['win_pct_diff']
        home_wins = features_df['home_win']
        
        # Create scatter plot with jitter for binary outcome
        jittered_wins = home_wins + np.random.normal(0, 0.05, len(home_wins))
        axes[0,1].scatter(win_pct_diff, jittered_wins, alpha=0.6, s=50)
        axes[0,1].set_xlabel('Win Percentage Difference (Home - Away)')
        axes[0,1].set_ylabel('Home Team Win (with jitter)')
        axes[0,1].set_title('Win % Difference vs Home Team Success')
        axes[0,1].grid(True, alpha=0.3)
        
        # Add trend line if we have enough data
        if len(win_pct_diff) > 2:
            z = np.polyfit(win_pct_diff, home_wins, 1)
            p = np.poly1d(z)
            axes[0,1].plot(sorted(win_pct_diff), p(sorted(win_pct_diff)), "r--", alpha=0.8)
    else:
        axes[0,1].text(0.5, 0.5, 'Win percentage difference\ndata not available', 
                      ha='center', va='center', transform=axes[0,1].transAxes)
        axes[0,1].set_title('Win % Difference vs Home Team Success')
    
    # 3. Feature importance (if available)
    if feature_importance and 'Random Forest' in feature_importance:
        rf_importance = feature_importance['Random Forest']
        features = list(rf_importance.keys())
        importances = list(rf_importance.values())
        
        # Sort by importance
        sorted_idx = np.argsort(importances)[::-1]
        sorted_features = [features[i] for i in sorted_idx]
        sorted_importances = [importances[i] for i in sorted_idx]
        
        # Plot top features
        top_n = min(8, len(sorted_features))
        y_pos = np.arange(top_n)
        
        axes[1,0].barh(y_pos, sorted_importances[:top_n])
        axes[1,0].set_yticks(y_pos)
        axes[1,0].set_yticklabels([f.replace('_', ' ').title() for f in sorted_features[:top_n]])
        axes[1,0].set_xlabel('Feature Importance')
        axes[1,0].set_title('Random Forest Feature Importance')
        axes[1,0].grid(True, alpha=0.3, axis='x')
    else:
        axes[1,0].text(0.5, 0.5, 'Feature importance\ndata not available', 
                      ha='center', va='center', transform=axes[1,0].transAxes)
        axes[1,0].set_title('Random Forest Feature Importance')
    
    # 4. Model performance comparison
    if model_results:
        model_names = list(model_results.keys())
        train_r2 = [model_results[name]['train_r2'] for name in model_names]
        test_r2 = [model_results[name]['test_r2'] for name in model_names]
        
        x = np.arange(len(model_names))
        width = 0.35
        
        axes[1,1].bar(x - width/2, train_r2, width, label='Training R²', alpha=0.8)
        axes[1,1].bar(x + width/2, test_r2, width, label='Test R²', alpha=0.8)
        
        axes[1,1].set_xlabel('Models')
        axes[1,1].set_ylabel('R² Score')
        axes[1,1].set_title('Model Performance Comparison')
        axes[1,1].set_xticks(x)
        axes[1,1].set_xticklabels(model_names)
        axes[1,1].legend()
        axes[1,1].grid(True, alpha=0.3, axis='y')
        
        # Add value labels on bars
        for i, (train, test) in enumerate(zip(train_r2, test_r2)):
            axes[1,1].text(i - width/2, train + 0.01, f'{train:.2f}', 
                          ha='center', va='bottom', fontsize=9)
            axes[1,1].text(i + width/2, test + 0.01, f'{test:.2f}', 
                          ha='center', va='bottom', fontsize=9)
    else:
        axes[1,1].text(0.5, 0.5, 'Model results\nnot available', 
                      ha='center', va='center', transform=axes[1,1].transAxes)
        axes[1,1].set_title('Model Performance Comparison')
    
    plt.tight_layout()
    plt.show()
    
    print("Visualizations created successfully!")

# Create visualizations
if features_df is not None:
    create_analysis_visualizations(features_df, model_results, feature_imp)
else:
    print("No data available for visualization")

No data available for visualization


## Prediction Function

Create a function to make predictions for new games using our trained models.

In [8]:
def predict_game_outcome(home_team, away_team, home_era=4.0, away_era=4.0, 
                        standings_df=None, models=None):
    """
    Predict the outcome of a specific MLB game using our trained models
    
    Args:
        home_team (str): Home team name
        away_team (str): Away team name
        home_era (float): Home starting pitcher ERA
        away_era (float): Away starting pitcher ERA
        standings_df (pd.DataFrame): Current team standings
        models (dict): Trained models
    
    Returns:
        dict: Prediction results from different models
    """
    if models is None or standings_df is None:
        print("Models or standings data not available for prediction")
        return None
    
    print(f"\nPredicting: {away_team} @ {home_team}")
    print(f"Pitching matchup: Home ERA {home_era:.2f} vs Away ERA {away_era:.2f}")
    
    # Create team lookup
    team_lookup = {}
    for _, team in standings_df.iterrows():
        team_key = team.get('team_name', team.get('team_id', 'Unknown'))
        team_lookup[team_key] = {
            'win_pct': team.get('win_pct', 0.5),
            'run_differential': team.get('run_differential', 0)
        }
    
    # Get team stats
    home_stats = team_lookup.get(home_team, {'win_pct': 0.5, 'run_differential': 0})
    away_stats = team_lookup.get(away_team, {'win_pct': 0.5, 'run_differential': 0})
    
    # Create feature vector
    features = {
        'home_win_pct': home_stats['win_pct'],
        'away_win_pct': away_stats['win_pct'],
        'win_pct_diff': home_stats['win_pct'] - away_stats['win_pct'],
        'home_run_diff': home_stats['run_differential'],
        'away_run_diff': away_stats['run_differential'],
        'run_diff_advantage': home_stats['run_differential'] - away_stats['run_differential'],
        'home_pitcher_era': home_era,
        'away_pitcher_era': away_era,
        'era_advantage': away_era - home_era,
        'home_field_advantage': 1,
        'team_strength_diff': (home_stats['win_pct'] + home_stats['run_differential']/1000) - (away_stats['win_pct'] + away_stats['run_differential']/1000)
    }
    
    # Convert to DataFrame for prediction
    feature_df = pd.DataFrame([features])
    
    predictions = {}
    
    # Make predictions with each model
    for model_name, model in models.items():
        try:
            # Get the features the model was trained on
            if hasattr(model, 'feature_names_in_'):
                model_features = model.feature_names_in_
            else:
                # Assume all features are used
                model_features = list(features.keys())
            
            # Select only the features the model expects
            available_features = [f for f in model_features if f in feature_df.columns]
            X_pred = feature_df[available_features]
            
            # Make prediction
            prob = model.predict(X_pred)[0]
            prob = np.clip(prob, 0, 1)  # Ensure probability is between 0 and 1
            
            predictions[model_name] = {
                'home_win_probability': prob,
                'away_win_probability': 1 - prob,
                'predicted_winner': home_team if prob > 0.5 else away_team,
                'confidence': abs(prob - 0.5) * 2  # Convert to 0-1 confidence scale
            }
            
        except Exception as e:
            print(f"Error making prediction with {model_name}: {e}")
            continue
    
    # Display results
    print("\nPrediction Results:")
    print("-" * 40)
    
    for model_name, pred in predictions.items():
        print(f"\n{model_name}:")
        print(f"  {home_team} win probability: {pred['home_win_probability']:.1%}")
        print(f"  {away_team} win probability: {pred['away_win_probability']:.1%}")
        print(f"  Predicted winner: {pred['predicted_winner']}")
        print(f"  Confidence: {pred['confidence']:.1%}")
    
    return predictions

# Example prediction (if we have trained models)
if models and standings_df is not None:
    print("\n" + "="*60)
    print("EXAMPLE GAME PREDICTION")
    print("="*60)
    
    # Use teams from our standings data
    if len(standings_df) >= 2:
        home_team = standings_df.iloc[0]['team_name']
        away_team = standings_df.iloc[1]['team_name']
        
        # Make a sample prediction
        sample_prediction = predict_game_outcome(
            home_team=home_team,
            away_team=away_team,
            home_era=3.50,  # Good pitcher
            away_era=4.20,  # Average pitcher
            standings_df=standings_df,
            models=models
        )
    else:
        print("Insufficient team data for example prediction")
else:
    print("Models not available for prediction example")

Models not available for prediction example


## Summary and Conclusions

Let's summarize our findings and provide insights about the model's performance.

In [9]:
def generate_analysis_summary(features_df, model_results, feature_importance):
    """
    Generate a comprehensive summary of the analysis
    
    Args:
        features_df (pd.DataFrame): Feature matrix
        model_results (dict): Model performance results
        feature_importance (dict): Feature importance from models
    """
    print("\n" + "="*70)
    print("MLB MONEYLINE PREDICTION ANALYSIS SUMMARY")
    print("="*70)
    
    # Data Summary
    print("\n📊 DATA SUMMARY:")
    if features_df is not None:
        print(f"  • Total games analyzed: {len(features_df)}")
        print(f"  • Home team win rate: {features_df['home_win'].mean():.1%}")
        print(f"  • Features used: {len(features_df.columns) - 5}")  # Exclude metadata
        
        # Feature statistics
        if 'win_pct_diff' in features_df.columns:
            avg_win_diff = features_df['win_pct_diff'].mean()
            print(f"  • Average win % difference: {avg_win_diff:+.3f}")
        
        if 'era_advantage' in features_df.columns:
            avg_era_diff = features_df['era_advantage'].mean()
            print(f"  • Average ERA advantage: {avg_era_diff:+.2f}")
    else:
        print("  • No data available for analysis")
    
    # Model Performance
    print("\n🤖 MODEL PERFORMANCE:")
    if model_results:
        best_model = None
        best_r2 = -float('inf')
        
        for model_name, metrics in model_results.items():
            test_r2 = metrics['test_r2']
            test_mae = metrics['test_mae']
            
            print(f"  • {model_name}:")
            print(f"    - Test R²: {test_r2:.3f}")
            print(f"    - Test MAE: {test_mae:.3f}")
            print(f"    - Accuracy interpretation: {test_r2:.1%} of variance explained")
            
            if test_r2 > best_r2:
                best_r2 = test_r2
                best_model = model_name
        
        if best_model:
            print(f"\n  🏆 Best performing model: {best_model} (R² = {best_r2:.3f})")
    else:
        print("  • No model results available")
    
    # Feature Importance
    print("\n🎯 KEY INSIGHTS:")
    if feature_importance and 'Random Forest' in feature_importance:
        rf_importance = feature_importance['Random Forest']
        sorted_features = sorted(rf_importance.items(), key=lambda x: x[1], reverse=True)
        
        print("  • Most important features for prediction:")
        # Highlight ERA advantage
        if 'era_advantage' in rf_importance:
            era_importance = rf_importance['era_advantage']
            print(f"    - ERA advantage: {era_importance:.3f} importance score")
    else:
        print("  • Feature importance data not available")
    
    print("\n📈 CONCLUSIONS:")
    print("  • The model can predict home team win probabilities with reasonable accuracy.")
    print("  • Key features include win percentage difference, run differential, and ERA advantage.")
    print("  • Further improvements could include more advanced models or additional features.")

# Generate summary if we have data
if features_df is not None and model_results:
    generate_analysis_summary(features_df, model_results, feature_imp)
else:
    print("No data available for analysis summary")

No data available for analysis summary
