This is a starting point for a more complex data analysis - but let's lay some groundwork.

**Mission:** Predict Moneylines from current standings and starting pitcher ERA

**Complexity:** Small

**Backtested:** No

**Chances of beating the books:** Low


## Imports

### Pip installs (for if not running locally)

In [35]:
!pip install mlb-statsapi
!pip install scikit-learn
!pip install seaborn




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Aaron\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Aaron\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Aaron\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### Imports

In [1]:
import mlbstatsapi
import requests

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings, pprint, random
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette('husl')

#when looking at a dataFrame, show all columns
pd.set_option('display.max_columns', None)

## Data Collection

Collect current MLB team standings, historical game results, and starting pitcher ERA data to build our moneyline prediction features.

In [41]:
def get_all_gamePks(season, start_month=1, end_month=12):
    season = requests.get('https://statsapi.mlb.com/api/v1/schedule', params={
        'startDate': f'{season}-{start_month}-01',
        'endDate': f'{season}-{end_month}-31',
        'sportId': 1,
        'gameType': 'R',  # Regular season games
        'language': 'en',
        'hydrate': 'game(boxscore)',
    }).json()
    return [game['gamePk'] for date in season['dates'] for game in date['games']]


twothree_gamepks = get_all_gamePks(2023)  # Example usage to fetch all game Pks for the 2023 season
twothree_gamepks[100]

718688

In [42]:
def get_line_score(game_pk):
    """
    Fetches the line score for a given game using its gamePk.
    """
    url = f'https://statsapi.mlb.com/api/v1/game/{game_pk}/linescore'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching line score for game {game_pk}: {response.status_code}")
        return None

def get_box_score(game_pk):
    """
    Fetches the box score for a given game using its gamePk.
    """
    url = f'https://statsapi.mlb.com/api/v1/game/{game_pk}/boxscore'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching box score for game {game_pk}: {response.status_code}")
        return None
    
box = get_box_score(twothree_gamepks[100]) # Example usage to fetch box score for the first gamePk
line = get_line_score(twothree_gamepks[100]) # Example usage to fetch line score for the first gamePk
# pprint.pprint(box['teams']['home'])

In [43]:
def get_player_name(box, player_id, home=True):
    """
    Extracts player name from player ID using the box score data structure.
    
    Parameters:
    - box: Box score dictionary from MLB API
    - player_id: Player ID (integer or string like 'ID502043')
    - home: Boolean, True for home team, False for away team
    
    Returns:
    - Player's full name or None if not found
    """
    if not box or 'teams' not in box:
        return None
    
    # Handle different ID formats
    if isinstance(player_id, int):
        player_key = f'ID{player_id}'
    elif isinstance(player_id, str) and not player_id.startswith('ID'):
        player_key = f'ID{player_id}'
    else:
        player_key = player_id
    
    # Get the appropriate team's player data
    team_key = 'home' if home else 'away'
    if team_key not in box['teams'] or 'players' not in box['teams'][team_key]:
        return None
    
    players = box['teams'][team_key]['players']
    
    # Look up the player
    if player_key in players:
        player_data = players[player_key]
        if 'person' in player_data and 'fullName' in player_data['person']:
            return player_data['person']['fullName']
    
    return None

# Example usage:
player_name = get_player_name(box, 502043, home=True)  # Kyle Gibson
print(player_name)
player_name = get_player_name(box, 'ID543056', home=True)  # Danny Coulombe
player_name

Kyle Gibson


'Danny Coulombe'

In [55]:
def get_team_postgame_record(box, home=True):
    """
    Extracts the pregame record of the team from the box score.
    """
    if 'teams' in box:
        team = box['teams']['home'] if home else box['teams']['away']
        return {
            'wins': team['team']['record']['wins'],
            'losses': team['team']['record']['losses'],
        }
    return None

def get_starter_pregame_stats(box, home=True, stats=['earnedRuns', 'hits', 'baseOnBalls', 'homeRuns', 'outs'], inc_player_name=False):
    """
    Extracts the starting pitcher's ERA before the game from the box score.

    possible stats to take: ['gamesPlayed', 'gamesStarted', 'flyOuts', 'groundOuts', 'airOuts', 'runs', 'doubles', 'triples',
     'homeRuns', 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats', 'obp', 'caughtStealing', 'stolenBases',
      'stolenBasePercentage', 'numberOfPitches', 'era', 'inningsPitched', 'wins', 'losses', 'saves', 'saveOpportunities', 'holds',
       'blownSaves', 'earnedRuns', 'whip', 'battersFaced', 'outs', 'gamesPitched', 'completeGames', 'shutouts', 'pitchesThrown',
        'balls', 'strikes', 'strikePercentage', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffs', 'groundOutsToAirouts', 'rbi',
         'winPercentage', 'pitchesPerInning', 'gamesFinished', 'strikeoutWalkRatio', 'strikeoutsPer9Inn', 'walksPer9Inn', 'hitsPer9Inn',
          'runsScoredPer9', 'homeRunsPer9', 'inheritedRunners', 'inheritedRunnersScored', 'catchersInterference', 'sacBunts', 'sacFlies',
           'passedBall', 'popOuts', 'lineOuts']

    """
    if 'teams' in box and 'pitchers' in box['teams']['home']:
        pitchers = box['teams']['home']['pitchers'] if home else box['teams']['away']['pitchers']
        starter = box['teams']['home']['players'][f'ID{pitchers[0]}'] if home else box['teams']['away']['players'][f'ID{pitchers[0]}']
        postgame = starter['seasonStats']['pitching']
        game = starter['stats']['pitching'] 

        # Create the stats dictionary
        stats_dict = {stat: postgame.get(stat, 0) - game.get(stat, 0) for stat in stats}
        
        # Add player name
        if inc_player_name:
            stats_dict['playername'] = get_player_name(box, pitchers[0], home=home)
        
        return stats_dict
    return None

def get_batters_pregrame_stats(box, home=True, stats=['hits', 'totalBases', 'homeRuns', 'plateAppearances', 'baseOnBalls']):
    """
    Extracts the pregame stats of a batter from the box score.

    dict_keys(['summary', 'gamesPlayed', 'flyOuts', 'groundOuts', 'airOuts', 'runs', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
      'stolenBasePercentage', 'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
        'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference', 'pickoffs', 'atBatsPerHomeRun', 'popOuts', 'lineOuts']
    """
    if 'teams' in box and 'battingOrder' in box['teams']['home']:
        batters = box['teams']['home']['battingOrder'] if home else box['teams']['away']['battingOrder']
        # print(batters)
        batter_stats = list()
        batter_sums = {
            stat: 0 for stat in stats
        }
        for batter_id in batters:
            batter = box['teams']['home']['players'][f'ID{batter_id}'] if home else box['teams']['away']['players'][f'ID{batter_id}']
            postgame = batter['seasonStats']['batting']
            game = batter['stats']['batting']
            batter_stats.append({
                stat: postgame.get(stat, 0) - game.get(stat, 0) for stat in stats
            })
            batter_stats[-1]['playername'] = get_player_name(box, batter_id, home=home)
            for stat in stats:
                if stat != 'playername':
                    batter_sums[stat] += batter_stats[-1][stat]
        batter_stats.append(batter_sums)
        return batter_stats
    return None

get_batters_pregrame_stats(box, stats=['hits', 'totalBases', 'homeRuns', 'plateAppearances', 'baseOnBalls', 'playername'])

[{'hits': 6,
  'totalBases': 13,
  'homeRuns': 2,
  'plateAppearances': 28,
  'baseOnBalls': 4,
  'playername': 'Cedric Mullins'},
 {'hits': 8,
  'totalBases': 11,
  'homeRuns': 1,
  'plateAppearances': 29,
  'baseOnBalls': 2,
  'playername': 'Adley Rutschman'},
 {'hits': 2,
  'totalBases': 2,
  'homeRuns': 0,
  'plateAppearances': 8,
  'baseOnBalls': 0,
  'playername': 'Ryan McKenna'},
 {'hits': 7,
  'totalBases': 16,
  'homeRuns': 2,
  'plateAppearances': 28,
  'baseOnBalls': 2,
  'playername': 'Ryan Mountcastle'},
 {'hits': 2,
  'totalBases': 5,
  'homeRuns': 1,
  'plateAppearances': 22,
  'baseOnBalls': 6,
  'playername': 'Gunnar Henderson'},
 {'hits': 3,
  'totalBases': 7,
  'homeRuns': 1,
  'plateAppearances': 18,
  'baseOnBalls': 0,
  'playername': 'Ramón Urías'},
 {'hits': 6,
  'totalBases': 12,
  'homeRuns': 1,
  'plateAppearances': 19,
  'baseOnBalls': 2,
  'playername': 'Adam Frazier'},
 {'hits': 7,
  'totalBases': 13,
  'homeRuns': 1,
  'plateAppearances': 21,
  'baseOnBall

In [56]:
def get_final_score(line):
    """
    Extracts the final score from the line score.
    """
    if 'teams' in line and 'home' in line['teams'] and 'away' in line['teams']:
        home_runs = line['teams']['home']['runs']
        away_runs = line['teams']['away']['runs']
        return {
            'home_runs': home_runs,
            'away_runs': away_runs,
            'home_win': home_runs > away_runs
        }
    return None

In [57]:
line.keys()

dict_keys(['copyright', 'currentInning', 'currentInningOrdinal', 'inningState', 'inningHalf', 'isTopInning', 'scheduledInnings', 'innings', 'teams', 'defense', 'offense', 'balls', 'strikes', 'outs'])

In [58]:
LOG_EXPONENT = 10
import math

def get_team_elo(team_wins, team_losses, log_exp=LOG_EXPONENT):
    """
    Calculates the Elo rating for a team based on its wins and losses.
    """
    games_played = team_wins + team_losses
    if team_wins + team_losses == 0:
        return 1500  # Default Elo rating if no games played
    return 1500 + (team_wins / games_played) * (math.log((games_played + log_exp) /log_exp) + 1) * 100 # Simplified Elo calculation

def get_starter_averages(starter_stats):
    """
    Calculates the per inning stats for a starting pitcher.
    """
    if not starter_stats or starter_stats.get('outs', 0) == 0:
        return {stat: None for stat in starter_stats}
    total_innings = starter_stats.get('outs', 0) / 3
    return {stat: starter_stats.get(stat, 0) / total_innings * 9 for stat in starter_stats if stat not in ['outs', 'gamesPlayed', 'gamesStarted']}


# TODO: perhaps a non-weighted average would be sharper here, but for now...
def get_batting_team_averages(batter_stats):
    """
    Calculates the per game stats for a batter.
    """
    if not batter_stats or batter_stats.get('plateAppearances', 0) == 0:
        return {stat: None for stat in batter_stats}
    plateAppearances = batter_stats.get('plateAppearances', 1)  # Avoid division by zero
    return {stat: batter_stats.get(stat, 0) / plateAppearances for stat in batter_stats if stat not in ['gamesPlayed', 'battingOrder']}

In [59]:
BATTING_STATS = ['hits', 'totalBases', 'homeRuns', 'plateAppearances', 'baseOnBalls', 'playername']
PITCHING_STATS = ['earnedRuns', 'hits', 'baseOnBalls', 'homeRuns', 'outs']

def get_dataframe(game_pks):
    """
    Fetches box scores and line scores for a list of game Pks and returns a DataFrame.
    """
    data = []
    for game_pk in game_pks:
        box = get_box_score(game_pk)
        line = get_line_score(game_pk)
        if box and line:
            home_team_record = get_team_postgame_record(box, home=True)
            away_team_record = get_team_postgame_record(box, home=False)
            home_starter_stats = get_starter_pregame_stats(box, home=True, stats=PITCHING_STATS, inc_player_name=True)
            away_starter_stats = get_starter_pregame_stats(box, home=False, stats=PITCHING_STATS, inc_player_name=True)
            home_batters_stats = get_batters_pregrame_stats(box, home=True, stats=BATTING_STATS)
            away_batters_stats = get_batters_pregrame_stats(box, home=False, stats=BATTING_STATS)

            final_score = get_final_score(line)

            if final_score['home_win']:
                home_team_record['wins'] -= 1
                away_team_record['losses'] -= 1
            else:
                home_team_record['losses'] -= 1
                away_team_record['wins'] -= 1

            data.append({
                'gamePk': game_pk,
                'home_wins': home_team_record['wins'],
                'home_losses': home_team_record['losses'],
                'home_winpercentage': home_team_record['wins'] / (home_team_record['wins'] + home_team_record['losses']) if (home_team_record['wins'] + home_team_record['losses']) > 0 else 0,
                'home_elo': get_team_elo(home_team_record['wins'], home_team_record['losses']),
                'away_elo': get_team_elo(away_team_record['wins'], away_team_record['losses']),
                'away_wins': away_team_record['wins'],
                'away_losses': away_team_record['losses'],
                'away_winpercentage': away_team_record['wins'] / (away_team_record['wins'] + away_team_record['losses']) if (away_team_record['wins'] + away_team_record['losses']) > 0 else 0,
                'away_runs': final_score['away_runs'],
                'home_win': final_score['home_win'],
                **{f'home_sp_{stat}': home_starter_stats.get(stat, None) for stat in PITCHING_STATS + ['playername']},
                **{f'away_sp_{stat}': away_starter_stats.get(stat, None) for stat in PITCHING_STATS + ['playername']},
                **{f'home_{i}_{stat}': home_batters_stats[i].get(stat, None) for stat in BATTING_STATS for i in range(len(home_batters_stats) - 1)},
                **{f'away_{i}_{stat}': away_batters_stats[i].get(stat, None) for stat in BATTING_STATS for i in range(len(away_batters_stats) - 1)}
            })
            if (len(data) % 100) == 5:
                print(f"Processed {len(data)} games...")
                print(data[-1]['home_0_playername'], data[-1]['away_0_playername'], data[-1]['home_sp_playername'], data[-1]['away_sp_playername'])
    
    return pd.DataFrame(data)

twothree_df = get_dataframe(twothree_gamepks)  # Example usage to fetch DataFrame for the 2023 season


Processed 5 games...
Taylor Walls Zach McKinstry Shane McClanahan Eduardo Rodriguez
Processed 105 games...
Trea Turner Jonathan India Zack Wheeler Hunter Greene
Processed 205 games...
Andrew Benintendi Cedric Mullins Mike Clevinger Tyler Wells
Processed 305 games...
Taylor Ward Bobby Witt Jr. Shohei Ohtani Taylor Clarke
Processed 405 games...
Zack Short Austin Hays Matthew Boyd Grayson Rodriguez
Processed 505 games...
Brandon Nimmo Charlie Blackmon Tylor Megill Austin Gomber
Processed 605 games...
Christian Yelich Bobby Witt Jr. Adrian Houser Zack Greinke
Processed 705 games...
Lane Thomas Zach McKinstry Josiah Gray Joey Wentz
Processed 805 games...
Steven Kwan Lars Nootbaar Hunter Gaddis Jordan Montgomery
Processed 905 games...
Marcus Semien Jarred Kelenic Nathan Eovaldi Bryce Miller
Processed 1005 games...
Jonathan Schoop Ronald Acuña Jr. Mason Englert Charlie Morton
Processed 1105 games...
Donovan Solano Rob Refsnyder Pablo López James Paxton
Processed 1205 games...
Marcus Semien Ma

In [60]:
# Save dataframe to CSV
twothree_df.to_csv('twothree_df.csv', index=False)  # Save DataFrame to CSV file

### Marcel Projections - turning them from values to ratios

This is mostly a premade dataset for us, we just have to run these couple lines to allow us to join datasets more easily later

In [25]:
hitter_projections = pd.read_csv("twothree_marcel_hitters.csv")
pitcher_projections = pd.read_csv("twothree_marcel_pitchers.csv")


In [26]:
MARCEL_HITTER_LINEAR_STATS = ['R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'TB']
MARCEL_PITCHER_LINEAR_STATS = ['H', 'R', 'ER', 'HR', 'BB', 'IBB', 'HBP', 'SO', 'WP']


hitter_projections['TB'] = (hitter_projections['H'] + hitter_projections['2B'] + (2 * hitter_projections['3B']) + (3 * hitter_projections['HR']))

for hitting_stat in MARCEL_HITTER_LINEAR_STATS:
    hitter_projections[hitting_stat + "/pa"] = hitter_projections[hitting_stat] / hitter_projections['PA']  # Convert to per game

for pitching_stat in MARCEL_PITCHER_LINEAR_STATS:
    pitcher_projections[pitching_stat + "/ip"] = pitcher_projections[pitching_stat] / pitcher_projections['IP']  # Convert to per 9 innings


In [27]:
hitter_projections.to_csv("twothree_marcel_hitters.csv", index=False)
pitcher_projections.to_csv("twothree_marcel_pitchers.csv", index=False)

## Feature Engineering

Now let's create features for our moneyline prediction model using team standings and pitcher data.

### Import Data (if stored locally)
prevents the need to run Data Collection Section, which could take 40 minutes or so

In [57]:
# Get it back from CSV
twothree_df = pd.read_csv('twothree_df.csv')  # Load DataFrame from CSV file
hitter_projections = pd.read_csv("twothree_marcel_hitters.csv")
pitcher_projections = pd.read_csv("twothree_marcel_pitchers.csv")

### Old Approach - use a generic Bayesian Multiplier for each player

In [88]:


bayesian = {
    'earnedRuns': 9,
    'sp_hits': 17,
    'sp_baseOnBalls': 8,
    'homeRuns': 3,
    'outs': 54,
    'wh': 25,  # Weighted average for WHIP
    
    #Batting stats
    'hits': 14,
    'totalBases': 23,
    'homeRuns': 1,
    'plateAppearances': 60,
    'baseOnBalls': 4
}

BAYESIAN_MULTIPLIER = 1  # Adjust this multiplier as needed for your model

for x in bayesian:
    bayesian[x] *= BAYESIAN_MULTIPLIER

In [89]:
twothree_df['home_era'] = (twothree_df['home_earnedRuns']  + bayesian['earnedRuns']) / ((twothree_df['home_outs'] + bayesian['outs']) / 27)
twothree_df['away_era'] = (twothree_df['away_earnedRuns'] + bayesian['earnedRuns']) / ((twothree_df['away_outs'] + bayesian['outs']) / 27)
twothree_df['home_whip9'] = (twothree_df['home_sp_hits'] + twothree_df['home_sp_baseOnBalls'] + bayesian['wh']) / ((twothree_df['home_outs'] + bayesian['outs']) / 3)
twothree_df['away_whip9'] = (twothree_df['away_sp_hits'] + twothree_df['away_sp_baseOnBalls'] + bayesian['wh']) / ((twothree_df['away_outs'] + bayesian['outs']) / 3)

for batting_stat in ['hits', 'totalBases', 'homeRuns', 'baseOnBalls']:
    twothree_df[f'home_{batting_stat}'] = sum([(twothree_df[f'home_{i}_{batting_stat}'] + bayesian[batting_stat]) / (twothree_df[f'home_{i}_plateAppearances'] + bayesian['plateAppearances']) for i in range(9)])/9
    twothree_df[f'away_{batting_stat}'] = sum([(twothree_df[f'away_{i}_{batting_stat}'] + bayesian[batting_stat]) / (twothree_df[f'away_{i}_plateAppearances'] + bayesian['plateAppearances']) for i in range(9)])/9


KeyError: 'home_earnedRuns'

### New Approach - Create A Bayesian Based on Preseason Marcel Projections

In [71]:
# Mapping MLB statsapi field names to Baseball Reference/Marcel projection column names
mlb_to_bbref_mapping = {
    "hits": "H",
    "totalBases": "TB", 
    "homeRuns": "HR",
    "plateAppearances": "PA",
    "baseOnBalls": "BB",
    "playername": "Name",
    "earnedRuns": "ER",
    "baseOnBalls": "BB",
    "homeRuns": "HR",
    "innings": "IP"  # Innings pitched * 3
}

MARCEL_REPLACEMENT_BATTING_STATS = {
    "H": 0.20,
    "TB": 0.35,
    "HR": 0.025,
    "BB": 0.06, 
}

MARCEL_REPLACEMENT_PITCHING_STATS = {
    "ER": 0.6,
    "H": 1.0,
    "BB": 0.3,
    "HR": 0.15,
}

In [72]:
RATE_SUFFIX = {
    False: "/pa",
    True: "/ip"
}


def get_marcel_projected_stat(hitter_projections: pd.DataFrame, 
                              playername: pd.Series, 
                              stat: str, 
                              pitcher: bool = False) -> pd.DataFrame:
    """
    Get Marcel projected stat for a given player and statistic.
    
    Parameters:
    - hitter_projections: DataFrame with Marcel projections. Must contain at least
      ['Name', stat] where 'Name' matches playername entries.
    - playername: A pandas Series (or DataFrame column) of player names to look up.
    - stat: Statistic to retrieve (using MLB statsapi / Marcel field names).
    - pitcher: Bool, whether to look up pitcher stats instead of hitter stats (defaults to False).
    
    Returns:
    - DataFrame with ['Name', stat] for each player in playername. If not found, returns NaN.
    """
    
    # Ensure consistent casing/stripping for merges
    hitter_projections = hitter_projections.copy()
    hitter_projections["Name"] = hitter_projections["Name"].str.strip()
    
    players_df = pd.DataFrame({"Name": playername.astype(str).str.strip()})

    
    # Merge on player name
    merged = players_df.merge(
    hitter_projections[["Name", mlb_to_bbref_mapping[stat] + RATE_SUFFIX[pitcher]]],
    on="Name",
    how="left"
)
    
    #the dataset might be missing player name matches, so have a default value in case the player name isn't found
    if merged[mlb_to_bbref_mapping[stat] + RATE_SUFFIX[pitcher]].isnull().any():
        replacement_value = MARCEL_REPLACEMENT_PITCHING_STATS[mlb_to_bbref_mapping[stat]] if pitcher else MARCEL_REPLACEMENT_BATTING_STATS[mlb_to_bbref_mapping[stat]]
        merged[mlb_to_bbref_mapping[stat] + RATE_SUFFIX[pitcher]].fillna(replacement_value, inplace=True)
    
    
    
    return merged


def get_marcel_pitcher_stat(pitcher_projections, playername, stat):
    """
    Get Marcel projected stat for a pitcher.
    
    Parameters:
    - pitcher_projections: DataFrame with Marcel pitcher projections
    - playername: Name of the pitcher to look up
    - stat: Statistic to retrieve
    
    Returns:
    - Projected stat value or None if player not found
    """
    # Handle different name formats and case sensitivity
    player_row = pitcher_projections[pitcher_projections['Name'].str.contains(playername, case=False, na=False)]
    
    if player_row.empty:
        # Try alternative matching by splitting names
        name_parts = playername.split()
        if len(name_parts) >= 2:
            first_name, last_name = name_parts[0], name_parts[-1]
            player_row = pitcher_projections[
                (pitcher_projections['Name'].str.contains(first_name, case=False, na=False)) &
                (pitcher_projections['Name'].str.contains(last_name, case=False, na=False))
            ]
    
    if player_row.empty:
        return None
    
    # Map pitching stats
    pitcher_mapping = {
        "earnedRuns": "ER",
        "hits": "H",
        "baseOnBalls": "BB",
        "homeRuns": "HR",
        "innnings": "IP"  # Innings pitched * 3
    }

    
    
    # Get the Baseball Reference column name
    bbref_stat = pitcher_mapping.get(stat, stat)
    
    if bbref_stat in player_row.columns:
        return player_row.iloc[0][bbref_stat + "/ip"]  # Return per inning stat
    else:
        return None

In [73]:
BAYESIAN_PAS_ADDED = 100
BAYESIAN_IPS_ADDED = 20

BAYESIAN_ADDED = {
    False: BAYESIAN_PAS_ADDED, 
    True: BAYESIAN_IPS_ADDED
}


def integrate_bayesian(df, player_number, stat, hitter_projections, home=True):
    """
    Integrate Bayesian priors with individual player stats
    
    Parameters:
    - df: DataFrame with game data
    - player_number: Player position (0-8 for batting order)
    - stat: Statistic to integrate ('hits', 'totalBases', 'homeRuns', 'baseOnBalls')
    - home: Boolean, True for home team, False for away team
    """
    home_str = 'home' if home else 'away'
    pitcher = True if player_number == "sp" else False
    print("testing", player_number, stat, pitcher, RATE_SUFFIX[pitcher])

    original_col = f'{home_str}_{player_number}_{stat}'
    pa_col = f'{home_str}_{player_number}_plateAppearances'
    ip_col = f'{home_str}_sp_outs'
    bayesian_col = original_col + "_bayesian"
    marcel_col = original_col + "_marcel_projection"
    playername_dfcol = df[f'{home_str}_{player_number}_playername']

    df[marcel_col] = get_marcel_projected_stat(hitter_projections, playername_dfcol, stat, pitcher=pitcher)[mlb_to_bbref_mapping[stat] + RATE_SUFFIX[pitcher]] * BAYESIAN_ADDED[pitcher]

    # Apply Bayesian integration: (actual + prior) / (PA + prior_PA)
    if not pitcher:
        df[bayesian_col] = (df[original_col] + df[marcel_col]) / ((df[pa_col]) + BAYESIAN_ADDED[pitcher])
    else:
        df[bayesian_col] = (df[original_col] + df[marcel_col]) / ((df[ip_col] // 3) + BAYESIAN_ADDED[pitcher])
    
    return df

def integrate_all_bayesians(df, hprojs, pprojs):
    """
    Apply Bayesian integration to all players and relevant stats
    """
    relevant_stats = ['hits', 'totalBases', 'homeRuns', 'baseOnBalls']
    
    # Apply to all players (0-8) for both home and away teams
    for player_number in range(9):  # Players 0-8 in batting order
        for stat in relevant_stats:
            # Integrate for home team
            df = integrate_bayesian(df, player_number, stat, hprojs, home=True)
            # Integrate for away team  
            df = integrate_bayesian(df, player_number, stat, hprojs, home=False)

    # pitchers
    for pitching_stat in ["earnedRuns", "hits", "baseOnBalls", "homeRuns"]:
        for home_bool in [True, False]:
            df = integrate_bayesian(df, "sp", pitching_stat, pprojs, home=home_bool)
    
    return df

# Example usage:
twothree_df = integrate_all_bayesians(twothree_df, hitter_projections, pitcher_projections)

testing 0 hits False /pa
testing 0 hits False /pa
testing 0 totalBases False /pa
testing 0 totalBases False /pa
testing 0 homeRuns False /pa
testing 0 homeRuns False /pa
testing 0 baseOnBalls False /pa
testing 0 baseOnBalls False /pa
testing 1 hits False /pa
testing 1 hits False /pa
testing 1 totalBases False /pa
testing 1 totalBases False /pa
testing 1 homeRuns False /pa
testing 1 homeRuns False /pa
testing 1 baseOnBalls False /pa
testing 1 baseOnBalls False /pa
testing 2 hits False /pa
testing 2 hits False /pa
testing 2 totalBases False /pa
testing 2 totalBases False /pa
testing 2 homeRuns False /pa
testing 2 homeRuns False /pa
testing 2 baseOnBalls False /pa
testing 2 baseOnBalls False /pa
testing 3 hits False /pa
testing 3 hits False /pa
testing 3 totalBases False /pa
testing 3 totalBases False /pa
testing 3 homeRuns False /pa
testing 3 homeRuns False /pa
testing 3 baseOnBalls False /pa
testing 3 baseOnBalls False /pa
testing 4 hits False /pa
testing 4 hits False /pa
testing 4 tota

In [74]:
def aggregate_bayesian_team_stats(df):
    """
    Aggregate individual Bayesian-adjusted player stats into team-level totals
    Equivalent to the old approach summation but using Marcel-based Bayesian stats
    """
    relevant_stats = ['hits', 'totalBases', 'homeRuns', 'baseOnBalls']
    
    for batting_stat in relevant_stats:
        # Sum Bayesian-adjusted stats for home team (players 0-8)
        df[f'home_{batting_stat}'] = sum([df[f'home_{i}_{batting_stat}_bayesian'] for i in range(9) if f'home_{i}_{batting_stat}_bayesian' in df.columns])/9
        
        # Sum Bayesian-adjusted stats for away team (players 0-8)  
        df[f'away_{batting_stat}'] = sum([df[f'away_{i}_{batting_stat}_bayesian'] for i in range(9) if f'away_{i}_{batting_stat}_bayesian' in df.columns])/9

    print(df.columns)
    #TODO: PICTHING STATS
    df['home_era'] = df['home_sp_earnedRuns_bayesian']
    df['away_era'] = df['away_sp_earnedRuns_bayesian']
    df['home_whip9'] = df['home_sp_hits_bayesian'] + df['home_sp_baseOnBalls_bayesian']
    df['away_whip9'] = df['away_sp_hits_bayesian'] + df['away_sp_baseOnBalls_bayesian']

    return df

twothree_df = aggregate_bayesian_team_stats(twothree_df)

Index(['gamePk', 'home_wins', 'home_losses', 'home_winpercentage', 'home_elo',
       'away_elo', 'away_wins', 'away_losses', 'away_winpercentage',
       'away_runs',
       ...
       'away_era', 'home_whip9', 'away_whip9', 'era_diff', 'wp_diff',
       'elo_diff', 'whip9_diff', 'obp_diff', 'slg_diff', 'HRR_diff'],
      dtype='object', length=310)


In [75]:

# Get rid of NaN values (note: this is when a pitcher makes his season debut)
twothree_df = twothree_df.dropna(subset=['home_era', 'away_era'])
twothree_df.head()  # Display the first few rows of the DataFrame

twothree_df['era_diff'] = twothree_df['home_era'] - twothree_df['away_era']
twothree_df['wp_diff'] = twothree_df['home_winpercentage'] - twothree_df['away_winpercentage']
twothree_df['elo_diff'] = twothree_df['home_elo'] - twothree_df['away_elo']
twothree_df['whip9_diff'] = twothree_df['home_whip9'] - twothree_df['away_whip9']
twothree_df['obp_diff'] = twothree_df['home_hits'] + twothree_df['home_baseOnBalls'] - twothree_df['away_hits'] - twothree_df['away_baseOnBalls']
twothree_df['slg_diff'] = twothree_df['home_totalBases'] - twothree_df['away_totalBases']
twothree_df['HRR_diff'] = twothree_df['home_homeRuns'] - twothree_df['away_homeRuns']
twothree_df.tail()  # Display the updated DataFrame with new features

Unnamed: 0,gamePk,home_wins,home_losses,home_winpercentage,home_elo,away_elo,away_wins,away_losses,away_winpercentage,away_runs,home_win,home_sp_earnedRuns,home_sp_hits,home_sp_baseOnBalls,home_sp_homeRuns,home_sp_outs,home_sp_playername,away_sp_earnedRuns,away_sp_hits,away_sp_baseOnBalls,away_sp_homeRuns,away_sp_outs,away_sp_playername,home_0_hits,home_1_hits,home_2_hits,home_3_hits,home_4_hits,home_5_hits,home_6_hits,home_7_hits,home_8_hits,home_0_totalBases,home_1_totalBases,home_2_totalBases,home_3_totalBases,home_4_totalBases,home_5_totalBases,home_6_totalBases,home_7_totalBases,home_8_totalBases,home_0_homeRuns,home_1_homeRuns,home_2_homeRuns,home_3_homeRuns,home_4_homeRuns,home_5_homeRuns,home_6_homeRuns,home_7_homeRuns,home_8_homeRuns,home_0_plateAppearances,home_1_plateAppearances,home_2_plateAppearances,home_3_plateAppearances,home_4_plateAppearances,home_5_plateAppearances,home_6_plateAppearances,home_7_plateAppearances,home_8_plateAppearances,home_0_baseOnBalls,home_1_baseOnBalls,home_2_baseOnBalls,home_3_baseOnBalls,home_4_baseOnBalls,home_5_baseOnBalls,home_6_baseOnBalls,home_7_baseOnBalls,home_8_baseOnBalls,home_0_playername,home_1_playername,home_2_playername,home_3_playername,home_4_playername,home_5_playername,home_6_playername,home_7_playername,home_8_playername,away_0_hits,away_1_hits,away_2_hits,away_3_hits,away_4_hits,away_5_hits,away_6_hits,away_7_hits,away_8_hits,away_0_totalBases,away_1_totalBases,away_2_totalBases,away_3_totalBases,away_4_totalBases,away_5_totalBases,away_6_totalBases,away_7_totalBases,away_8_totalBases,away_0_homeRuns,away_1_homeRuns,away_2_homeRuns,away_3_homeRuns,away_4_homeRuns,away_5_homeRuns,away_6_homeRuns,away_7_homeRuns,away_8_homeRuns,away_0_plateAppearances,away_1_plateAppearances,away_2_plateAppearances,away_3_plateAppearances,away_4_plateAppearances,away_5_plateAppearances,away_6_plateAppearances,away_7_plateAppearances,away_8_plateAppearances,away_0_baseOnBalls,away_1_baseOnBalls,away_2_baseOnBalls,away_3_baseOnBalls,away_4_baseOnBalls,away_5_baseOnBalls,away_6_baseOnBalls,away_7_baseOnBalls,away_8_baseOnBalls,away_0_playername,away_1_playername,away_2_playername,away_3_playername,away_4_playername,away_5_playername,away_6_playername,away_7_playername,away_8_playername,home_0_hits_marcel_projection,home_0_hits_bayesian,away_0_hits_marcel_projection,away_0_hits_bayesian,home_0_totalBases_marcel_projection,home_0_totalBases_bayesian,away_0_totalBases_marcel_projection,away_0_totalBases_bayesian,home_0_homeRuns_marcel_projection,home_0_homeRuns_bayesian,away_0_homeRuns_marcel_projection,away_0_homeRuns_bayesian,home_0_baseOnBalls_marcel_projection,home_0_baseOnBalls_bayesian,away_0_baseOnBalls_marcel_projection,away_0_baseOnBalls_bayesian,home_1_hits_marcel_projection,home_1_hits_bayesian,away_1_hits_marcel_projection,away_1_hits_bayesian,home_1_totalBases_marcel_projection,home_1_totalBases_bayesian,away_1_totalBases_marcel_projection,away_1_totalBases_bayesian,home_1_homeRuns_marcel_projection,home_1_homeRuns_bayesian,away_1_homeRuns_marcel_projection,away_1_homeRuns_bayesian,home_1_baseOnBalls_marcel_projection,home_1_baseOnBalls_bayesian,away_1_baseOnBalls_marcel_projection,away_1_baseOnBalls_bayesian,home_2_hits_marcel_projection,home_2_hits_bayesian,away_2_hits_marcel_projection,away_2_hits_bayesian,home_2_totalBases_marcel_projection,home_2_totalBases_bayesian,away_2_totalBases_marcel_projection,away_2_totalBases_bayesian,home_2_homeRuns_marcel_projection,home_2_homeRuns_bayesian,away_2_homeRuns_marcel_projection,away_2_homeRuns_bayesian,home_2_baseOnBalls_marcel_projection,home_2_baseOnBalls_bayesian,away_2_baseOnBalls_marcel_projection,away_2_baseOnBalls_bayesian,home_3_hits_marcel_projection,home_3_hits_bayesian,away_3_hits_marcel_projection,away_3_hits_bayesian,home_3_totalBases_marcel_projection,home_3_totalBases_bayesian,away_3_totalBases_marcel_projection,away_3_totalBases_bayesian,home_3_homeRuns_marcel_projection,home_3_homeRuns_bayesian,away_3_homeRuns_marcel_projection,away_3_homeRuns_bayesian,home_3_baseOnBalls_marcel_projection,home_3_baseOnBalls_bayesian,away_3_baseOnBalls_marcel_projection,away_3_baseOnBalls_bayesian,home_4_hits_marcel_projection,home_4_hits_bayesian,away_4_hits_marcel_projection,away_4_hits_bayesian,home_4_totalBases_marcel_projection,home_4_totalBases_bayesian,away_4_totalBases_marcel_projection,away_4_totalBases_bayesian,home_4_homeRuns_marcel_projection,home_4_homeRuns_bayesian,away_4_homeRuns_marcel_projection,away_4_homeRuns_bayesian,home_4_baseOnBalls_marcel_projection,home_4_baseOnBalls_bayesian,away_4_baseOnBalls_marcel_projection,away_4_baseOnBalls_bayesian,home_5_hits_marcel_projection,home_5_hits_bayesian,away_5_hits_marcel_projection,away_5_hits_bayesian,home_5_totalBases_marcel_projection,home_5_totalBases_bayesian,away_5_totalBases_marcel_projection,away_5_totalBases_bayesian,home_5_homeRuns_marcel_projection,home_5_homeRuns_bayesian,away_5_homeRuns_marcel_projection,away_5_homeRuns_bayesian,home_5_baseOnBalls_marcel_projection,home_5_baseOnBalls_bayesian,away_5_baseOnBalls_marcel_projection,away_5_baseOnBalls_bayesian,home_6_hits_marcel_projection,home_6_hits_bayesian,away_6_hits_marcel_projection,away_6_hits_bayesian,home_6_totalBases_marcel_projection,home_6_totalBases_bayesian,away_6_totalBases_marcel_projection,away_6_totalBases_bayesian,home_6_homeRuns_marcel_projection,home_6_homeRuns_bayesian,away_6_homeRuns_marcel_projection,away_6_homeRuns_bayesian,home_6_baseOnBalls_marcel_projection,home_6_baseOnBalls_bayesian,away_6_baseOnBalls_marcel_projection,away_6_baseOnBalls_bayesian,home_7_hits_marcel_projection,home_7_hits_bayesian,away_7_hits_marcel_projection,away_7_hits_bayesian,home_7_totalBases_marcel_projection,home_7_totalBases_bayesian,away_7_totalBases_marcel_projection,away_7_totalBases_bayesian,home_7_homeRuns_marcel_projection,home_7_homeRuns_bayesian,away_7_homeRuns_marcel_projection,away_7_homeRuns_bayesian,home_7_baseOnBalls_marcel_projection,home_7_baseOnBalls_bayesian,away_7_baseOnBalls_marcel_projection,away_7_baseOnBalls_bayesian,home_8_hits_marcel_projection,home_8_hits_bayesian,away_8_hits_marcel_projection,away_8_hits_bayesian,home_8_totalBases_marcel_projection,home_8_totalBases_bayesian,away_8_totalBases_marcel_projection,away_8_totalBases_bayesian,home_8_homeRuns_marcel_projection,home_8_homeRuns_bayesian,away_8_homeRuns_marcel_projection,away_8_homeRuns_bayesian,home_8_baseOnBalls_marcel_projection,home_8_baseOnBalls_bayesian,away_8_baseOnBalls_marcel_projection,away_8_baseOnBalls_bayesian,home_sp_earnedRuns_marcel_projection,home_sp_earnedRuns_bayesian,away_sp_earnedRuns_marcel_projection,away_sp_earnedRuns_bayesian,home_sp_hits_marcel_projection,home_sp_hits_bayesian,away_sp_hits_marcel_projection,away_sp_hits_bayesian,home_sp_baseOnBalls_marcel_projection,home_sp_baseOnBalls_bayesian,away_sp_baseOnBalls_marcel_projection,away_sp_baseOnBalls_bayesian,home_sp_homeRuns_marcel_projection,home_sp_homeRuns_bayesian,away_sp_homeRuns_marcel_projection,away_sp_homeRuns_bayesian,home_hits,away_hits,home_totalBases,away_totalBases,home_homeRuns,away_homeRuns,home_baseOnBalls,away_baseOnBalls,home_era,away_era,home_whip9,away_whip9,era_diff,wp_diff,elo_diff,whip9_diff,obp_diff,slg_diff,HRR_diff
2471,716356,61,100,0.378882,1645.455768,1693.146184,81,80,0.503106,2,False,32,46,19,13,116,José Ureña,18,40,25,3,136,Pedro Avila,94,34,145,62,5,32,0,25,10,134,44,242,102,9,56,0,45,17,6,1,21,10,1,6,0,6,1,401,155,610,339,70,168,3,174,49,25,8,36,28,5,3,0,23,4,Elvis Andrus,Zach Remillard,Andrew Vaughn,Gavin Sheets,Korey Lee,Lenyn Sosa,Tyler Naquin,Trayce Thompson,Carlos Pérez,30,140,17,20,109,106,92,8,17,42,214,40,31,167,177,164,17,23,2,17,6,2,9,17,13,2,1,136,621,113,98,516,452,550,32,82,16,75,9,3,50,30,74,0,4,Matthew Batten,Ha-Seong Kim,Ji Man Choi,José Azocar,Jurickson Profar,Garrett Cooper,Trent Grisham,Eguy Rosario,Brett Sullivan,21.915285,0.231368,20.853081,0.215479,33.333333,0.333999,33.175355,0.31854,2.025783,0.01602,2.369668,0.018516,6.445672,0.062766,8.056872,0.101936,20.0,0.211765,21.497121,0.22399,35.0,0.309804,34.165067,0.344196,2.5,0.013725,2.303263,0.026773,6.0,0.054902,8.253359,0.115469,23.664122,0.237555,20.0,0.173709,38.167939,0.394603,33.409091,0.344644,3.053435,0.033878,2.727273,0.040973,7.251908,0.060918,12.272727,0.099872,22.458629,0.192389,23.051948,0.217434,38.297872,0.319585,34.090909,0.328742,3.782506,0.031395,1.623377,0.0183,7.328605,0.080475,6.818182,0.049587,21.596244,0.156448,21.22807,0.211409,34.2723,0.254543,32.982456,0.324647,2.347418,0.019691,2.105263,0.018028,7.981221,0.07636,10.175439,0.097687,21.100917,0.198138,23.043478,0.233774,34.40367,0.337327,37.391304,0.38839,2.752294,0.032658,2.608696,0.035523,7.798165,0.040292,8.913043,0.070495,20.0,0.194175,19.61165,0.17171,35.0,0.339806,34.174757,0.304884,2.5,0.024272,3.106796,0.02478,6.0,0.058252,10.291262,0.129679,21.752266,0.170629,22.167488,0.228542,39.879154,0.309778,36.453202,0.404948,4.229607,0.037334,2.955665,0.037543,10.574018,0.122533,8.374384,0.063442,22.009569,0.214829,20.0,0.203297,34.92823,0.348512,35.0,0.318681,2.392344,0.022767,2.5,0.019231,7.655502,0.078225,6.0,0.054945,10.630631,0.735011,8.333333,0.405128,21.261261,1.159677,17.777778,0.888889,8.108108,0.467381,6.666667,0.487179,2.342342,0.264523,2.222222,0.080342,0.200811,0.208816,0.327551,0.341964,0.025749,0.02663,0.070525,0.087012,0.735011,0.405128,1.627058,1.376068,0.329883,-0.124224,-47.690416,0.25099,-0.024493,-0.014413,-0.000881
2472,716352,55,106,0.341615,1631.148643,1695.530704,82,79,0.509317,2,True,79,154,21,25,412,Zack Greinke,28,80,32,7,302,Michael King,126,175,137,123,92,98,29,69,47,166,313,227,207,156,163,50,106,61,4,30,23,15,11,15,2,7,2,511,690,576,598,381,457,134,341,206,38,40,19,62,22,23,10,29,12,Maikel Garcia,Bobby Witt Jr.,Salvador Perez,MJ Melendez,Edward Olivares,Michael Massey,Dairon Blanco,Nick Pratto,Matt Duffy,120,112,13,56,78,63,33,14,14,193,206,29,98,110,89,47,18,19,15,21,4,10,6,5,2,0,0,558,597,71,256,357,326,187,99,68,59,52,3,14,28,25,13,7,6,DJ LeMahieu,Anthony Volpe,Austin Wells,Kyle Higashioka,Isiah Kiner-Falefa,Oswaldo Cabrera,Oswald Peraza,Everson Pereira,Estevan Florial,23.584906,0.24482,23.420074,0.217964,37.735849,0.333447,34.200743,0.34529,2.830189,0.011179,2.230483,0.026186,8.018868,0.075317,10.408922,0.105485,20.0,0.246835,20.0,0.189383,35.0,0.440506,35.0,0.345768,2.5,0.041139,2.5,0.033716,6.0,0.058228,6.0,0.083214,24.45328,0.238836,20.0,0.192982,45.328032,0.402852,35.0,0.374269,5.367793,0.041964,2.5,0.038012,4.572565,0.034871,6.0,0.052632,20.12848,0.205055,20.289855,0.214297,34.90364,0.346567,35.362319,0.374613,3.211991,0.026092,3.768116,0.038674,11.134904,0.104778,6.376812,0.057238,23.489933,0.240104,24.390244,0.224049,36.912752,0.401066,32.833021,0.312545,3.020134,0.029148,1.313321,0.016003,6.375839,0.058993,5.816135,0.073996,22.222222,0.215839,22.727273,0.201238,34.680135,0.354901,37.412587,0.296743,2.356902,0.031161,3.146853,0.019124,6.734007,0.053382,8.391608,0.078384,20.0,0.209402,23.580786,0.197146,35.0,0.363248,37.117904,0.293094,2.5,0.019231,2.620087,0.016098,6.0,0.068376,8.733624,0.075727,19.931271,0.201658,20.0,0.170854,34.707904,0.319066,35.0,0.266332,3.092784,0.022886,2.5,0.012563,9.278351,0.086799,6.0,0.065327,20.0,0.218954,20.909091,0.207792,35.0,0.313725,34.090909,0.316017,2.5,0.014706,2.727273,0.016234,6.0,0.058824,9.090909,0.089827,8.827586,0.559411,7.868852,0.298907,21.103448,1.115309,17.04918,0.808743,4.827586,0.164507,6.885246,0.324044,2.62069,0.175928,1.967213,0.074727,0.224611,0.201745,0.363931,0.324963,0.02639,0.024068,0.066619,0.075759,0.559411,0.298907,1.279816,1.132787,0.260504,-0.167702,-64.382061,0.147029,0.013726,0.038968,0.002322
2473,716364,91,70,0.565217,1716.991391,1697.915225,83,78,0.515528,0,True,51,118,32,13,319,Adrian Houser,76,145,53,26,426,Drew Smyly,31,51,47,67,73,25,16,53,88,50,75,67,117,98,69,29,99,121,4,7,3,13,3,13,3,9,6,167,225,222,350,314,185,69,239,445,23,19,27,34,28,21,7,8,38,Blake Perkins,Victor Caratini,Sal Frelick,Rowdy Tellez,Andruw Monasterio,Josh Donaldson,Garrett Mitchell,Tyrone Taylor,Brice Turang,0,94,126,5,84,54,8,28,32,0,195,237,11,125,133,20,47,40,0,26,22,1,8,23,2,5,1,15,425,572,13,397,298,43,153,146,2,36,53,0,55,30,3,12,13,Pete Crow-Armstrong,Christopher Morel,Jeimer Candelario,Alexander Canario,Mike Tauchman,Patrick Wisdom,Jared Young,Miguel Amaya,Miles Mastrobuoni,20.0,0.191011,20.0,0.173913,35.0,0.318352,35.0,0.304348,2.5,0.024345,2.5,0.021739,6.0,0.108614,6.0,0.069565,19.592875,0.217209,22.033898,0.221017,30.279898,0.323938,38.256659,0.444298,2.290076,0.028585,3.389831,0.055981,9.160305,0.086647,8.716707,0.085175,20.0,0.208075,22.379032,0.220802,35.0,0.31677,36.693548,0.407282,2.5,0.017081,2.620968,0.036638,6.0,0.102484,8.266129,0.09117,21.052632,0.195673,20.0,0.221239,39.097744,0.346884,35.0,0.40708,4.511278,0.038914,2.5,0.030973,9.022556,0.095606,6.0,0.053097,20.0,0.224638,20.0,0.209256,35.0,0.321256,35.0,0.321932,2.5,0.013285,2.5,0.021127,6.0,0.082126,6.0,0.122736,19.734345,0.156963,19.60396,0.184935,34.535104,0.363281,38.415842,0.430693,3.41556,0.057598,4.752475,0.06973,10.626186,0.110969,8.910891,0.097766,23.931624,0.236282,20.0,0.195804,38.461538,0.399181,35.0,0.384615,2.991453,0.035452,2.5,0.031469,8.547009,0.091994,6.0,0.062937,21.860465,0.220827,20.0,0.189723,39.069767,0.407285,35.0,0.324111,3.72093,0.037525,2.5,0.029644,6.511628,0.042807,6.0,0.071146,20.0,0.198165,21.5311,0.217606,35.0,0.286239,33.971292,0.300696,2.5,0.015596,2.392344,0.01379,6.0,0.080734,7.655502,0.083965,8.870968,0.475166,8.688525,0.522769,18.548387,1.083717,19.508197,1.015483,8.225806,0.319252,6.065574,0.364602,1.935484,0.118536,3.114754,0.179721,0.205427,0.203811,0.342576,0.369451,0.02982,0.034566,0.089109,0.081951,0.475166,0.522769,1.40297,1.380085,-0.047602,0.049689,19.076166,0.022885,0.008774,-0.026874,-0.004746
2474,716353,70,91,0.434783,1666.916455,1695.530704,82,79,0.509317,3,True,104,219,39,25,583,Miles Mikolas,56,102,47,19,321,Hunter Greene,117,111,113,17,23,53,2,0,21,189,178,183,25,46,94,3,0,29,13,14,16,2,6,10,0,0,2,524,499,461,95,98,237,21,8,134,35,72,37,13,6,12,1,0,10,Tommy Edman,Lars Nootbaar,Jordan Walker,Luken Baker,Richie Palacios,Andrew Knizner,Juniel Querecuto,Irving Lopez,Masyn Winn,18,90,113,59,35,76,71,48,42,38,157,176,105,50,140,120,81,70,5,13,13,13,3,11,13,5,6,75,423,514,237,119,325,326,251,196,5,35,47,14,8,40,26,25,14,Nick Martini,Elly De La Cruz,Tyler Stephenson,Christian Encarnacion-Strand,Noelvi Marte,Will Benson,Nick Senzel,Stuart Fairchild,Luke Maile,23.972603,0.225918,20.0,0.217143,36.30137,0.36106,35.0,0.417143,2.054795,0.024126,2.5,0.042857,6.678082,0.066792,6.0,0.062857,20.725389,0.219909,20.0,0.210325,37.564767,0.359874,35.0,0.367113,3.626943,0.029427,2.5,0.029637,11.917098,0.140095,6.0,0.078394,20.0,0.237077,25.301205,0.225246,35.0,0.388592,38.253012,0.348946,2.5,0.032977,2.710843,0.025588,6.0,0.076649,8.73494,0.090774,20.0,0.189744,20.0,0.234421,35.0,0.307692,35.0,0.41543,2.5,0.023077,2.5,0.045994,6.0,0.097436,6.0,0.059347,22.137405,0.227967,20.0,0.251142,33.206107,0.400031,35.0,0.388128,1.908397,0.039941,2.5,0.025114,8.015267,0.070784,6.0,0.063927,19.452055,0.214991,21.212121,0.228734,28.219178,0.362668,32.467532,0.405806,1.643836,0.034551,2.164502,0.030975,8.767123,0.061624,7.359307,0.111434,20.0,0.181818,21.090047,0.216174,35.0,0.31405,29.85782,0.351779,2.5,0.020661,1.658768,0.03441,6.0,0.057851,7.582938,0.078833,20.0,0.185185,21.789883,0.198832,35.0,0.324074,36.18677,0.333865,2.5,0.023148,3.11284,0.023114,6.0,0.055556,7.782101,0.093396,20.0,0.175214,20.915033,0.212551,35.0,0.273504,32.352941,0.345787,2.5,0.019231,1.960784,0.026895,6.0,0.068376,8.496732,0.076002,8.0,0.523364,8.943089,0.511363,17.69697,1.106061,16.747967,0.935023,4.848485,0.204899,6.99187,0.425133,2.424242,0.128151,3.089431,0.173933,0.206425,0.221619,0.343505,0.374889,0.02746,0.03162,0.07724,0.07944,0.523364,0.511363,1.31096,1.360156,0.012002,-0.074534,-28.614249,-0.049196,-0.017394,-0.031384,-0.00416
2475,716404,72,86,0.455696,1674.138785,1698.324727,82,76,0.518987,0,True,62,120,46,16,312,David Peterson,70,158,54,22,514,Jesús Luzardo,22,21,123,150,45,38,78,72,10,28,30,286,277,79,79,159,111,15,1,2,46,30,9,11,23,9,1,121,91,641,673,223,171,414,375,57,19,5,65,64,10,14,33,29,3,Rafael Ortega,Ronny Mauricio,Pete Alonso,Francisco Lindor,Mark Vientos,DJ Stewart,Francisco Alvarez,Brett Baty,Tim Locastro,111,122,121,132,145,57,85,59,22,154,253,252,224,234,80,154,87,25,7,36,34,22,19,3,18,6,0,408,566,527,604,610,238,371,316,75,25,63,32,63,39,23,26,17,2,Jon Berti,Jorge Soler,Jake Burger,Josh Bell,Bryan De La Cruz,Garrett Hampson,Jazz Chisholm Jr.,Nick Fortes,Xavier Edwards,20.0,0.190045,20.512821,0.258884,35.0,0.285068,30.30303,0.362801,2.5,0.015837,1.631702,0.016992,6.0,0.113122,10.25641,0.069402,20.0,0.21466,19.37046,0.212268,35.0,0.340314,35.35109,0.43296,2.5,0.02356,3.874092,0.059871,6.0,0.057592,9.927361,0.109501,22.937294,0.196946,22.635135,0.229083,43.069307,0.444088,38.851351,0.463878,5.280528,0.069204,3.378378,0.059615,9.240924,0.10019,7.094595,0.062352,22.479339,0.22313,22.241379,0.219093,37.520661,0.406883,36.896552,0.370592,3.471074,0.0433,3.275862,0.035903,8.92562,0.094341,11.034483,0.105163,21.266968,0.205161,24.06015,0.238113,34.38914,0.35105,38.345865,0.383586,2.714932,0.036269,3.007519,0.030997,8.597285,0.057577,6.766917,0.06446,20.0,0.214022,20.994475,0.230753,35.0,0.420664,32.872928,0.333944,2.5,0.049815,1.933702,0.014597,6.0,0.073801,7.458564,0.090114,20.0,0.190661,22.641509,0.228538,35.0,0.377432,40.431267,0.412805,2.5,0.049611,4.043127,0.046801,6.0,0.075875,7.816712,0.071798,21.719457,0.197304,21.981424,0.194667,36.199095,0.309893,37.151703,0.298442,3.167421,0.025616,3.71517,0.023354,7.692308,0.077247,7.739938,0.059471,20.0,0.191083,20.0,0.24,35.0,0.318471,35.0,0.342857,2.5,0.022293,2.5,0.014286,6.0,0.057325,6.0,0.045714,8.928571,0.572005,9.391304,0.415661,17.678571,1.110311,17.217391,0.917369,8.214286,0.437212,7.304348,0.320965,2.321429,0.147753,2.608696,0.128841,0.202557,0.227933,0.36154,0.377985,0.037278,0.033602,0.078563,0.075331,0.572005,0.415661,1.547523,1.238334,0.156343,-0.063291,-24.185942,0.309189,-0.022143,-0.016444,0.003677


In [76]:
# Testing the Bayesian results
# get the last row
test_row = twothree_df.iloc[-1]
print(test_row['away_0_playername'], test_row['away_0_hits'], test_row['away_0_plateAppearances'],test_row['away_0_hits_bayesian'], test_row['away_0_hits_marcel_projection'])

Jon Berti 111 408 0.25888350494649703 20.51282051282051


In [77]:
twothree_df

Unnamed: 0,gamePk,home_wins,home_losses,home_winpercentage,home_elo,away_elo,away_wins,away_losses,away_winpercentage,away_runs,home_win,home_sp_earnedRuns,home_sp_hits,home_sp_baseOnBalls,home_sp_homeRuns,home_sp_outs,home_sp_playername,away_sp_earnedRuns,away_sp_hits,away_sp_baseOnBalls,away_sp_homeRuns,away_sp_outs,away_sp_playername,home_0_hits,home_1_hits,home_2_hits,home_3_hits,home_4_hits,home_5_hits,home_6_hits,home_7_hits,home_8_hits,home_0_totalBases,home_1_totalBases,home_2_totalBases,home_3_totalBases,home_4_totalBases,home_5_totalBases,home_6_totalBases,home_7_totalBases,home_8_totalBases,home_0_homeRuns,home_1_homeRuns,home_2_homeRuns,home_3_homeRuns,home_4_homeRuns,home_5_homeRuns,home_6_homeRuns,home_7_homeRuns,home_8_homeRuns,home_0_plateAppearances,home_1_plateAppearances,home_2_plateAppearances,home_3_plateAppearances,home_4_plateAppearances,home_5_plateAppearances,home_6_plateAppearances,home_7_plateAppearances,home_8_plateAppearances,home_0_baseOnBalls,home_1_baseOnBalls,home_2_baseOnBalls,home_3_baseOnBalls,home_4_baseOnBalls,home_5_baseOnBalls,home_6_baseOnBalls,home_7_baseOnBalls,home_8_baseOnBalls,home_0_playername,home_1_playername,home_2_playername,home_3_playername,home_4_playername,home_5_playername,home_6_playername,home_7_playername,home_8_playername,away_0_hits,away_1_hits,away_2_hits,away_3_hits,away_4_hits,away_5_hits,away_6_hits,away_7_hits,away_8_hits,away_0_totalBases,away_1_totalBases,away_2_totalBases,away_3_totalBases,away_4_totalBases,away_5_totalBases,away_6_totalBases,away_7_totalBases,away_8_totalBases,away_0_homeRuns,away_1_homeRuns,away_2_homeRuns,away_3_homeRuns,away_4_homeRuns,away_5_homeRuns,away_6_homeRuns,away_7_homeRuns,away_8_homeRuns,away_0_plateAppearances,away_1_plateAppearances,away_2_plateAppearances,away_3_plateAppearances,away_4_plateAppearances,away_5_plateAppearances,away_6_plateAppearances,away_7_plateAppearances,away_8_plateAppearances,away_0_baseOnBalls,away_1_baseOnBalls,away_2_baseOnBalls,away_3_baseOnBalls,away_4_baseOnBalls,away_5_baseOnBalls,away_6_baseOnBalls,away_7_baseOnBalls,away_8_baseOnBalls,away_0_playername,away_1_playername,away_2_playername,away_3_playername,away_4_playername,away_5_playername,away_6_playername,away_7_playername,away_8_playername,home_0_hits_marcel_projection,home_0_hits_bayesian,away_0_hits_marcel_projection,away_0_hits_bayesian,home_0_totalBases_marcel_projection,home_0_totalBases_bayesian,away_0_totalBases_marcel_projection,away_0_totalBases_bayesian,home_0_homeRuns_marcel_projection,home_0_homeRuns_bayesian,away_0_homeRuns_marcel_projection,away_0_homeRuns_bayesian,home_0_baseOnBalls_marcel_projection,home_0_baseOnBalls_bayesian,away_0_baseOnBalls_marcel_projection,away_0_baseOnBalls_bayesian,home_1_hits_marcel_projection,home_1_hits_bayesian,away_1_hits_marcel_projection,away_1_hits_bayesian,home_1_totalBases_marcel_projection,home_1_totalBases_bayesian,away_1_totalBases_marcel_projection,away_1_totalBases_bayesian,home_1_homeRuns_marcel_projection,home_1_homeRuns_bayesian,away_1_homeRuns_marcel_projection,away_1_homeRuns_bayesian,home_1_baseOnBalls_marcel_projection,home_1_baseOnBalls_bayesian,away_1_baseOnBalls_marcel_projection,away_1_baseOnBalls_bayesian,home_2_hits_marcel_projection,home_2_hits_bayesian,away_2_hits_marcel_projection,away_2_hits_bayesian,home_2_totalBases_marcel_projection,home_2_totalBases_bayesian,away_2_totalBases_marcel_projection,away_2_totalBases_bayesian,home_2_homeRuns_marcel_projection,home_2_homeRuns_bayesian,away_2_homeRuns_marcel_projection,away_2_homeRuns_bayesian,home_2_baseOnBalls_marcel_projection,home_2_baseOnBalls_bayesian,away_2_baseOnBalls_marcel_projection,away_2_baseOnBalls_bayesian,home_3_hits_marcel_projection,home_3_hits_bayesian,away_3_hits_marcel_projection,away_3_hits_bayesian,home_3_totalBases_marcel_projection,home_3_totalBases_bayesian,away_3_totalBases_marcel_projection,away_3_totalBases_bayesian,home_3_homeRuns_marcel_projection,home_3_homeRuns_bayesian,away_3_homeRuns_marcel_projection,away_3_homeRuns_bayesian,home_3_baseOnBalls_marcel_projection,home_3_baseOnBalls_bayesian,away_3_baseOnBalls_marcel_projection,away_3_baseOnBalls_bayesian,home_4_hits_marcel_projection,home_4_hits_bayesian,away_4_hits_marcel_projection,away_4_hits_bayesian,home_4_totalBases_marcel_projection,home_4_totalBases_bayesian,away_4_totalBases_marcel_projection,away_4_totalBases_bayesian,home_4_homeRuns_marcel_projection,home_4_homeRuns_bayesian,away_4_homeRuns_marcel_projection,away_4_homeRuns_bayesian,home_4_baseOnBalls_marcel_projection,home_4_baseOnBalls_bayesian,away_4_baseOnBalls_marcel_projection,away_4_baseOnBalls_bayesian,home_5_hits_marcel_projection,home_5_hits_bayesian,away_5_hits_marcel_projection,away_5_hits_bayesian,home_5_totalBases_marcel_projection,home_5_totalBases_bayesian,away_5_totalBases_marcel_projection,away_5_totalBases_bayesian,home_5_homeRuns_marcel_projection,home_5_homeRuns_bayesian,away_5_homeRuns_marcel_projection,away_5_homeRuns_bayesian,home_5_baseOnBalls_marcel_projection,home_5_baseOnBalls_bayesian,away_5_baseOnBalls_marcel_projection,away_5_baseOnBalls_bayesian,home_6_hits_marcel_projection,home_6_hits_bayesian,away_6_hits_marcel_projection,away_6_hits_bayesian,home_6_totalBases_marcel_projection,home_6_totalBases_bayesian,away_6_totalBases_marcel_projection,away_6_totalBases_bayesian,home_6_homeRuns_marcel_projection,home_6_homeRuns_bayesian,away_6_homeRuns_marcel_projection,away_6_homeRuns_bayesian,home_6_baseOnBalls_marcel_projection,home_6_baseOnBalls_bayesian,away_6_baseOnBalls_marcel_projection,away_6_baseOnBalls_bayesian,home_7_hits_marcel_projection,home_7_hits_bayesian,away_7_hits_marcel_projection,away_7_hits_bayesian,home_7_totalBases_marcel_projection,home_7_totalBases_bayesian,away_7_totalBases_marcel_projection,away_7_totalBases_bayesian,home_7_homeRuns_marcel_projection,home_7_homeRuns_bayesian,away_7_homeRuns_marcel_projection,away_7_homeRuns_bayesian,home_7_baseOnBalls_marcel_projection,home_7_baseOnBalls_bayesian,away_7_baseOnBalls_marcel_projection,away_7_baseOnBalls_bayesian,home_8_hits_marcel_projection,home_8_hits_bayesian,away_8_hits_marcel_projection,away_8_hits_bayesian,home_8_totalBases_marcel_projection,home_8_totalBases_bayesian,away_8_totalBases_marcel_projection,away_8_totalBases_bayesian,home_8_homeRuns_marcel_projection,home_8_homeRuns_bayesian,away_8_homeRuns_marcel_projection,away_8_homeRuns_bayesian,home_8_baseOnBalls_marcel_projection,home_8_baseOnBalls_bayesian,away_8_baseOnBalls_marcel_projection,away_8_baseOnBalls_bayesian,home_sp_earnedRuns_marcel_projection,home_sp_earnedRuns_bayesian,away_sp_earnedRuns_marcel_projection,away_sp_earnedRuns_bayesian,home_sp_hits_marcel_projection,home_sp_hits_bayesian,away_sp_hits_marcel_projection,away_sp_hits_bayesian,home_sp_baseOnBalls_marcel_projection,home_sp_baseOnBalls_bayesian,away_sp_baseOnBalls_marcel_projection,away_sp_baseOnBalls_bayesian,home_sp_homeRuns_marcel_projection,home_sp_homeRuns_bayesian,away_sp_homeRuns_marcel_projection,away_sp_homeRuns_bayesian,home_hits,away_hits,home_totalBases,away_totalBases,home_homeRuns,away_homeRuns,home_baseOnBalls,away_baseOnBalls,home_era,away_era,home_whip9,away_whip9,era_diff,wp_diff,elo_diff,whip9_diff,obp_diff,slg_diff,HRR_diff
0,718780,0,0,0.000000,1500.000000,1500.000000,0,0,0.000000,7,False,0,0,0,0,0,Patrick Corbin,0,0,0,0,0,Max Fried,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lane Thomas,Joey Meneses,Jeimer Candelario,Dominic Smith,Keibert Ruiz,Corey Dickerson,Luis García Jr.,Victor Robles,CJ Abrams,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Ronald Acuña Jr.,Matt Olson,Austin Riley,Ozzie Albies,Sean Murphy,Sam Hilliard,Michael Harris II,Travis d'Arnaud,Orlando Arcia,21.400000,0.214000,20.000000,0.200000,35.600000,0.356000,35.000000,0.350000,2.800000,0.028000,2.500000,0.025000,9.000000,0.090000,6.000000,0.060000,25.937500,0.259375,21.393841,0.213938,43.750000,0.437500,41.491086,0.414911,4.062500,0.040625,4.862237,0.048622,6.875000,0.068750,11.183144,0.111831,22.379032,0.223790,25.122349,0.251223,36.693548,0.366935,45.187602,0.451876,2.620968,0.026210,4.730832,0.047308,8.266129,0.082661,7.993475,0.079935,22.153846,0.221538,23.821340,0.238213,35.692308,0.356923,41.439206,0.414392,2.461538,0.024615,3.722084,0.037221,7.076923,0.070769,6.699752,0.066998,23.474178,0.234742,20.871143,0.208711,34.507042,0.345070,36.297641,0.362976,2.112676,0.021127,3.266788,0.032668,7.276995,0.072770,9.255898,0.092559,23.896104,0.238961,19.135802,0.191358,35.844156,0.358442,33.950617,0.339506,2.077922,0.020779,3.395062,0.033951,6.233766,0.062338,9.259259,0.092593,20.000000,0.200000,26.840855,0.268409,35.000000,0.350000,44.893112,0.448931,2.500000,0.025000,3.800475,0.038005,6.000000,0.060000,6.175772,0.061758,20.000000,0.200000,23.394495,0.233945,29.545455,0.295455,40.137615,0.401376,1.590909,0.015909,3.669725,0.036697,6.590909,0.065909,6.192661,0.061927,23.361823,0.233618,21.779141,0.217791,33.903134,0.339031,35.889571,0.358896,1.709402,0.017094,3.067485,0.030675,4.558405,0.045584,8.282209,0.082822,12.207792,0.610390,6.508876,0.325444,23.896104,1.194805,17.041420,0.852071,6.623377,0.331169,4.733728,0.236686,3.376623,0.168831,1.656805,0.082840,0.225114,0.224843,0.356151,0.393652,0.024373,0.036683,0.068753,0.078936,0.610390,0.325444,1.525974,1.088757,0.284946,0.000000,0.000000,0.437217,-0.009912,-0.037501,-0.012310
1,718781,0,0,0.000000,1500.000000,1500.000000,0,0,0.000000,0,True,0,0,0,0,0,Gerrit Cole,0,0,0,0,0,Logan Webb,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,DJ LeMahieu,Aaron Judge,Anthony Rizzo,Estevan Florial,Josh Donaldson,Gleyber Torres,Oswaldo Cabrera,Jose Trevino,Anthony Volpe,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,LaMonte Wade Jr.,Michael Conforto,Wilmer Flores,Joc Pederson,J.D. Davis,Thairo Estrada,Brandon Crawford,Brett Wisely,Joey Bart,23.420074,0.234201,20.879121,0.208791,34.200743,0.342007,36.813187,0.368132,2.230483,0.022305,3.571429,0.035714,10.408922,0.104089,8.791209,0.087912,24.549918,0.245499,20.000000,0.200000,48.608838,0.486088,35.000000,0.350000,6.710311,0.067103,2.500000,0.025000,12.765957,0.127660,6.000000,0.060000,19.924812,0.199248,21.467890,0.214679,37.406015,0.374060,36.697248,0.366972,4.323308,0.043233,3.486239,0.034862,9.398496,0.093985,8.807339,0.088073,20.909091,0.209091,21.935484,0.219355,34.090909,0.340909,39.139785,0.391398,2.727273,0.027273,4.086022,0.040860,9.090909,0.090909,8.387097,0.083871,19.734345,0.197343,22.029703,0.220297,34.535104,0.345351,35.148515,0.351485,3.415560,0.034156,2.722772,0.027228,10.626186,0.106262,10.396040,0.103960,23.234201,0.232342,23.347107,0.233471,37.174721,0.371747,36.570248,0.365702,2.973978,0.029740,2.892562,0.028926,8.364312,0.083643,6.611570,0.066116,22.727273,0.227273,22.520661,0.225207,37.412587,0.374126,37.190083,0.371901,3.146853,0.031469,3.099174,0.030992,8.391608,0.083916,8.677686,0.086777,22.604423,0.226044,20.000000,0.200000,34.889435,0.348894,35.000000,0.350000,2.457002,0.024570,2.500000,0.025000,5.159705,0.051597,6.000000,0.060000,20.000000,0.200000,20.809249,0.208092,35.000000,0.350000,33.236994,0.332370,2.500000,0.025000,2.890173,0.028902,6.000000,0.060000,7.803468,0.078035,7.709497,0.385475,7.176471,0.358824,16.424581,0.821229,17.882353,0.894118,5.251397,0.262570,5.529412,0.276471,2.905028,0.145251,1.411765,0.070588,0.219005,0.214432,0.370354,0.360885,0.033872,0.030832,0.089118,0.079416,0.385475,0.358824,1.083799,1.170588,0.026651,0.000000,0.000000,-0.086789,0.014274,0.009469,0.003040
2,718782,0,0,0.000000,1500.000000,1500.000000,0,0,0.000000,10,False,0,0,0,0,0,Corey Kluber,0,0,0,0,0,Kyle Gibson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Alex Verdugo,Rafael Devers,Rob Refsnyder,Masataka Yoshida,Adam Duvall,Triston Casas,Christian Arroyo,Connor Wong,Raimel Tapia,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Cedric Mullins,Adley Rutschman,Ryan McKenna,Ryan Mountcastle,Gunnar Henderson,Ramón Urías,Austin Hays,Adam Frazier,Jorge Mateo,25.773196,0.257732,24.172185,0.241722,37.972509,0.379725,39.238411,0.392384,2.061856,0.020619,2.980132,0.029801,7.388316,0.073883,7.450331,0.074503,25.479930,0.254799,22.068966,0.220690,45.549738,0.455497,37.471264,0.374713,4.537522,0.045375,2.758621,0.027586,8.202443,0.082024,11.954023,0.119540,23.026316,0.230263,20.588235,0.205882,34.868421,0.348684,31.699346,0.316993,2.302632,0.023026,1.960784,0.019608,8.552632,0.085526,8.823529,0.088235,20.000000,0.200000,23.623446,0.236234,35.000000,0.350000,40.674956,0.406750,2.500000,0.025000,4.085258,0.040853,6.000000,0.060000,7.282416,0.072824,20.581114,0.205811,22.932331,0.229323,40.193705,0.401937,37.593985,0.375940,5.084746,0.050847,3.007519,0.030075,6.537530,0.065375,9.774436,0.097744,20.967742,0.209677,23.008850,0.230088,36.290323,0.362903,36.946903,0.369469,3.629032,0.036290,3.097345,0.030973,11.693548,0.116935,7.743363,0.077434,24.456522,0.244565,23.345588,0.233456,38.315217,0.383152,38.602941,0.386029,2.717391,0.027174,3.125000,0.031250,5.706522,0.057065,6.066176,0.060662,21.397380,0.213974,23.362832,0.233628,34.934498,0.349345,32.566372,0.325664,2.620087,0.026201,1.238938,0.012389,8.296943,0.082969,7.433628,0.074336,20.000000,0.200000,21.355236,0.213552,35.000000,0.350000,34.702259,0.347023,2.500000,0.025000,2.258727,0.022587,6.000000,0.060000,5.544148,0.055441,9.333333,0.466667,9.876543,0.493827,20.533333,1.026667,19.629630,0.981481,4.933333,0.246667,6.666667,0.333333,2.400000,0.120000,2.469136,0.123457,0.224091,0.227175,0.375694,0.366107,0.031059,0.027236,0.075975,0.080080,0.466667,0.493827,1.273333,1.314815,-0.027160,0.000000,0.000000,-0.041481,-0.007188,0.009587,0.003823
3,718777,0,0,0.000000,1500.000000,1500.000000,0,0,0.000000,0,True,0,0,0,0,0,Marcus Stroman,0,0,0,0,0,Corbin Burnes,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Nico Hoerner,Dansby Swanson,Ian Happ,Cody Bellinger,Trey Mancini,Yan Gomes,Eric Hosmer,Patrick Wisdom,Miles Mastrobuoni,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Christian Yelich,Jesse Winker,Willy Adames,Rowdy Tellez,William Contreras,Luis Urías,Garrett Mitchell,Brian Anderson,Brice Turang,25.000000,0.250000,20.754717,0.207547,35.924370,0.359244,32.761578,0.327616,1.680672,0.016807,2.401372,0.024014,7.142857,0.071429,12.864494,0.128645,23.817292,0.238173,21.647510,0.216475,39.804241,0.398042,37.356322,0.373563,3.588907,0.035889,3.639847,0.036398,7.504078,0.075041,12.452107,0.124521,22.299652,0.222997,22.517730,0.225177,38.675958,0.386760,41.134752,0.411348,3.484321,0.034843,4.255319,0.042553,9.930314,0.099303,8.687943,0.086879,19.215686,0.192157,21.052632,0.210526,34.313725,0.343137,39.097744,0.390977,3.333333,0.033333,4.511278,0.045113,8.235294,0.082353,9.022556,0.090226,21.981982,0.219820,23.341523,0.233415,36.036036,0.360360,40.049140,0.400491,3.063063,0.030631,4.176904,0.041769,8.468468,0.084685,9.582310,0.095823,22.395833,0.223958,21.703854,0.217039,35.677083,0.356771,35.902637,0.359026,2.864583,0.028646,3.245436,0.032454,4.947917,0.049479,10.141988,0.101420,23.605150,0.236052,23.931624,0.239316,35.193133,0.351931,38.461538,0.384615,2.360515,0.023605,2.991453,0.029915,8.154506,0.081545,8.547009,0.085470,19.603960,0.196040,21.052632,0.210526,38.415842,0.384158,33.253589,0.332536,4.752475,0.047525,2.631579,0.026316,8.910891,0.089109,9.090909,0.090909,21.531100,0.215311,20.000000,0.200000,33.971292,0.339713,35.000000,0.350000,2.392344,0.023923,2.500000,0.025000,7.655502,0.076555,6.000000,0.060000,7.619048,0.380952,6.553672,0.327684,18.095238,0.904762,15.141243,0.757062,5.578231,0.278912,5.310734,0.265537,2.176871,0.108844,1.807910,0.090395,0.221612,0.217780,0.364457,0.370019,0.030578,0.033726,0.078833,0.095988,0.380952,0.327684,1.183673,1.022599,0.053269,0.000000,0.000000,0.161075,-0.013323,-0.005562,-0.003148
4,718776,0,0,0.000000,1500.000000,1500.000000,0,0,0.000000,0,True,0,0,0,0,0,Shane McClanahan,0,0,0,0,0,Eduardo Rodriguez,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Taylor Walls,Wander Franco,Isaac Paredes,Randy Arozarena,Luke Raley,Manuel Margot,Brandon Lowe,Christian Bethancourt,Jose Siri,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Zach McKinstry,Javier Báez,Riley Greene,Eric Haase,Spencer Torkelson,Austin Meadows,Miguel Cabrera,Nick Maton,Ryan Kreidler,17.738359,0.177384,20.322581,0.203226,27.937916,0.279379,34.516129,0.345161,1.773836,0.017738,2.903226,0.029032,10.643016,0.106430,7.741935,0.077419,25.558313,0.255583,22.545455,0.225455,39.950372,0.399504,38.545455,0.385455,2.233251,0.022333,3.636364,0.036364,7.940447,0.079404,4.909091,0.049091,20.050125,0.200501,23.227384,0.232274,36.090226,0.360902,34.229829,0.342298,3.759398,0.037594,1.711491,0.017115,10.275689,0.102757,8.557457,0.085575,23.842196,0.238422,21.980676,0.219807,40.308748,0.403087,38.888889,0.388889,3.259005,0.032590,4.106280,0.041063,7.890223,0.078902,7.004831,0.070048,20.164609,0.201646,20.149254,0.201493,32.098765,0.320988,31.592040,0.315920,2.469136,0.024691,2.238806,0.022388,7.407407,0.074074,8.955224,0.089552,23.831776,0.238318,21.021021,0.210210,34.813084,0.348131,36.936937,0.369369,1.869159,0.018692,3.303303,0.033033,7.476636,0.074766,9.609610,0.096096,21.265823,0.212658,22.174840,0.221748,40.000000,0.400000,31.982942,0.319829,4.556962,0.045570,2.345416,0.023454,10.126582,0.101266,7.249467,0.072495,22.888283,0.228883,22.265625,0.222656,37.329700,0.373297,37.500000,0.375000,2.997275,0.029973,3.125000,0.031250,5.449591,0.054496,8.593750,0.085938,21.253406,0.212534,20.661157,0.206612,34.332425,0.343324,32.644628,0.326446,2.724796,0.027248,2.479339,0.024793,6.539510,0.065395,7.851240,0.078512,6.794872,0.339744,9.421488,0.471074,16.153846,0.807692,19.834711,0.991736,5.384615,0.269231,6.611570,0.330579,2.179487,0.108974,2.479339,0.123967,0.218437,0.215942,0.358735,0.352041,0.028492,0.028721,0.081943,0.078303,0.339744,0.471074,1.076923,1.322314,-0.131331,0.000000,0.000000,-0.245391,0.006135,0.006694,-0.000229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2471,716356,61,100,0.378882,1645.455768,1693.146184,81,80,0.503106,2,False,32,46,19,13,116,José Ureña,18,40,25,3,136,Pedro Avila,94,34,145,62,5,32,0,25,10,134,44,242,102,9,56,0,45,17,6,1,21,10,1,6,0,6,1,401,155,610,339,70,168,3,174,49,25,8,36,28,5,3,0,23,4,Elvis Andrus,Zach Remillard,Andrew Vaughn,Gavin Sheets,Korey Lee,Lenyn Sosa,Tyler Naquin,Trayce Thompson,Carlos Pérez,30,140,17,20,109,106,92,8,17,42,214,40,31,167,177,164,17,23,2,17,6,2,9,17,13,2,1,136,621,113,98,516,452,550,32,82,16,75,9,3,50,30,74,0,4,Matthew Batten,Ha-Seong Kim,Ji Man Choi,José Azocar,Jurickson Profar,Garrett Cooper,Trent Grisham,Eguy Rosario,Brett Sullivan,21.915285,0.231368,20.853081,0.215479,33.333333,0.333999,33.175355,0.318540,2.025783,0.016020,2.369668,0.018516,6.445672,0.062766,8.056872,0.101936,20.000000,0.211765,21.497121,0.223990,35.000000,0.309804,34.165067,0.344196,2.500000,0.013725,2.303263,0.026773,6.000000,0.054902,8.253359,0.115469,23.664122,0.237555,20.000000,0.173709,38.167939,0.394603,33.409091,0.344644,3.053435,0.033878,2.727273,0.040973,7.251908,0.060918,12.272727,0.099872,22.458629,0.192389,23.051948,0.217434,38.297872,0.319585,34.090909,0.328742,3.782506,0.031395,1.623377,0.018300,7.328605,0.080475,6.818182,0.049587,21.596244,0.156448,21.228070,0.211409,34.272300,0.254543,32.982456,0.324647,2.347418,0.019691,2.105263,0.018028,7.981221,0.076360,10.175439,0.097687,21.100917,0.198138,23.043478,0.233774,34.403670,0.337327,37.391304,0.388390,2.752294,0.032658,2.608696,0.035523,7.798165,0.040292,8.913043,0.070495,20.000000,0.194175,19.611650,0.171710,35.000000,0.339806,34.174757,0.304884,2.500000,0.024272,3.106796,0.024780,6.000000,0.058252,10.291262,0.129679,21.752266,0.170629,22.167488,0.228542,39.879154,0.309778,36.453202,0.404948,4.229607,0.037334,2.955665,0.037543,10.574018,0.122533,8.374384,0.063442,22.009569,0.214829,20.000000,0.203297,34.928230,0.348512,35.000000,0.318681,2.392344,0.022767,2.500000,0.019231,7.655502,0.078225,6.000000,0.054945,10.630631,0.735011,8.333333,0.405128,21.261261,1.159677,17.777778,0.888889,8.108108,0.467381,6.666667,0.487179,2.342342,0.264523,2.222222,0.080342,0.200811,0.208816,0.327551,0.341964,0.025749,0.026630,0.070525,0.087012,0.735011,0.405128,1.627058,1.376068,0.329883,-0.124224,-47.690416,0.250990,-0.024493,-0.014413,-0.000881
2472,716352,55,106,0.341615,1631.148643,1695.530704,82,79,0.509317,2,True,79,154,21,25,412,Zack Greinke,28,80,32,7,302,Michael King,126,175,137,123,92,98,29,69,47,166,313,227,207,156,163,50,106,61,4,30,23,15,11,15,2,7,2,511,690,576,598,381,457,134,341,206,38,40,19,62,22,23,10,29,12,Maikel Garcia,Bobby Witt Jr.,Salvador Perez,MJ Melendez,Edward Olivares,Michael Massey,Dairon Blanco,Nick Pratto,Matt Duffy,120,112,13,56,78,63,33,14,14,193,206,29,98,110,89,47,18,19,15,21,4,10,6,5,2,0,0,558,597,71,256,357,326,187,99,68,59,52,3,14,28,25,13,7,6,DJ LeMahieu,Anthony Volpe,Austin Wells,Kyle Higashioka,Isiah Kiner-Falefa,Oswaldo Cabrera,Oswald Peraza,Everson Pereira,Estevan Florial,23.584906,0.244820,23.420074,0.217964,37.735849,0.333447,34.200743,0.345290,2.830189,0.011179,2.230483,0.026186,8.018868,0.075317,10.408922,0.105485,20.000000,0.246835,20.000000,0.189383,35.000000,0.440506,35.000000,0.345768,2.500000,0.041139,2.500000,0.033716,6.000000,0.058228,6.000000,0.083214,24.453280,0.238836,20.000000,0.192982,45.328032,0.402852,35.000000,0.374269,5.367793,0.041964,2.500000,0.038012,4.572565,0.034871,6.000000,0.052632,20.128480,0.205055,20.289855,0.214297,34.903640,0.346567,35.362319,0.374613,3.211991,0.026092,3.768116,0.038674,11.134904,0.104778,6.376812,0.057238,23.489933,0.240104,24.390244,0.224049,36.912752,0.401066,32.833021,0.312545,3.020134,0.029148,1.313321,0.016003,6.375839,0.058993,5.816135,0.073996,22.222222,0.215839,22.727273,0.201238,34.680135,0.354901,37.412587,0.296743,2.356902,0.031161,3.146853,0.019124,6.734007,0.053382,8.391608,0.078384,20.000000,0.209402,23.580786,0.197146,35.000000,0.363248,37.117904,0.293094,2.500000,0.019231,2.620087,0.016098,6.000000,0.068376,8.733624,0.075727,19.931271,0.201658,20.000000,0.170854,34.707904,0.319066,35.000000,0.266332,3.092784,0.022886,2.500000,0.012563,9.278351,0.086799,6.000000,0.065327,20.000000,0.218954,20.909091,0.207792,35.000000,0.313725,34.090909,0.316017,2.500000,0.014706,2.727273,0.016234,6.000000,0.058824,9.090909,0.089827,8.827586,0.559411,7.868852,0.298907,21.103448,1.115309,17.049180,0.808743,4.827586,0.164507,6.885246,0.324044,2.620690,0.175928,1.967213,0.074727,0.224611,0.201745,0.363931,0.324963,0.026390,0.024068,0.066619,0.075759,0.559411,0.298907,1.279816,1.132787,0.260504,-0.167702,-64.382061,0.147029,0.013726,0.038968,0.002322
2473,716364,91,70,0.565217,1716.991391,1697.915225,83,78,0.515528,0,True,51,118,32,13,319,Adrian Houser,76,145,53,26,426,Drew Smyly,31,51,47,67,73,25,16,53,88,50,75,67,117,98,69,29,99,121,4,7,3,13,3,13,3,9,6,167,225,222,350,314,185,69,239,445,23,19,27,34,28,21,7,8,38,Blake Perkins,Victor Caratini,Sal Frelick,Rowdy Tellez,Andruw Monasterio,Josh Donaldson,Garrett Mitchell,Tyrone Taylor,Brice Turang,0,94,126,5,84,54,8,28,32,0,195,237,11,125,133,20,47,40,0,26,22,1,8,23,2,5,1,15,425,572,13,397,298,43,153,146,2,36,53,0,55,30,3,12,13,Pete Crow-Armstrong,Christopher Morel,Jeimer Candelario,Alexander Canario,Mike Tauchman,Patrick Wisdom,Jared Young,Miguel Amaya,Miles Mastrobuoni,20.000000,0.191011,20.000000,0.173913,35.000000,0.318352,35.000000,0.304348,2.500000,0.024345,2.500000,0.021739,6.000000,0.108614,6.000000,0.069565,19.592875,0.217209,22.033898,0.221017,30.279898,0.323938,38.256659,0.444298,2.290076,0.028585,3.389831,0.055981,9.160305,0.086647,8.716707,0.085175,20.000000,0.208075,22.379032,0.220802,35.000000,0.316770,36.693548,0.407282,2.500000,0.017081,2.620968,0.036638,6.000000,0.102484,8.266129,0.091170,21.052632,0.195673,20.000000,0.221239,39.097744,0.346884,35.000000,0.407080,4.511278,0.038914,2.500000,0.030973,9.022556,0.095606,6.000000,0.053097,20.000000,0.224638,20.000000,0.209256,35.000000,0.321256,35.000000,0.321932,2.500000,0.013285,2.500000,0.021127,6.000000,0.082126,6.000000,0.122736,19.734345,0.156963,19.603960,0.184935,34.535104,0.363281,38.415842,0.430693,3.415560,0.057598,4.752475,0.069730,10.626186,0.110969,8.910891,0.097766,23.931624,0.236282,20.000000,0.195804,38.461538,0.399181,35.000000,0.384615,2.991453,0.035452,2.500000,0.031469,8.547009,0.091994,6.000000,0.062937,21.860465,0.220827,20.000000,0.189723,39.069767,0.407285,35.000000,0.324111,3.720930,0.037525,2.500000,0.029644,6.511628,0.042807,6.000000,0.071146,20.000000,0.198165,21.531100,0.217606,35.000000,0.286239,33.971292,0.300696,2.500000,0.015596,2.392344,0.013790,6.000000,0.080734,7.655502,0.083965,8.870968,0.475166,8.688525,0.522769,18.548387,1.083717,19.508197,1.015483,8.225806,0.319252,6.065574,0.364602,1.935484,0.118536,3.114754,0.179721,0.205427,0.203811,0.342576,0.369451,0.029820,0.034566,0.089109,0.081951,0.475166,0.522769,1.402970,1.380085,-0.047602,0.049689,19.076166,0.022885,0.008774,-0.026874,-0.004746
2474,716353,70,91,0.434783,1666.916455,1695.530704,82,79,0.509317,3,True,104,219,39,25,583,Miles Mikolas,56,102,47,19,321,Hunter Greene,117,111,113,17,23,53,2,0,21,189,178,183,25,46,94,3,0,29,13,14,16,2,6,10,0,0,2,524,499,461,95,98,237,21,8,134,35,72,37,13,6,12,1,0,10,Tommy Edman,Lars Nootbaar,Jordan Walker,Luken Baker,Richie Palacios,Andrew Knizner,Juniel Querecuto,Irving Lopez,Masyn Winn,18,90,113,59,35,76,71,48,42,38,157,176,105,50,140,120,81,70,5,13,13,13,3,11,13,5,6,75,423,514,237,119,325,326,251,196,5,35,47,14,8,40,26,25,14,Nick Martini,Elly De La Cruz,Tyler Stephenson,Christian Encarnacion-Strand,Noelvi Marte,Will Benson,Nick Senzel,Stuart Fairchild,Luke Maile,23.972603,0.225918,20.000000,0.217143,36.301370,0.361060,35.000000,0.417143,2.054795,0.024126,2.500000,0.042857,6.678082,0.066792,6.000000,0.062857,20.725389,0.219909,20.000000,0.210325,37.564767,0.359874,35.000000,0.367113,3.626943,0.029427,2.500000,0.029637,11.917098,0.140095,6.000000,0.078394,20.000000,0.237077,25.301205,0.225246,35.000000,0.388592,38.253012,0.348946,2.500000,0.032977,2.710843,0.025588,6.000000,0.076649,8.734940,0.090774,20.000000,0.189744,20.000000,0.234421,35.000000,0.307692,35.000000,0.415430,2.500000,0.023077,2.500000,0.045994,6.000000,0.097436,6.000000,0.059347,22.137405,0.227967,20.000000,0.251142,33.206107,0.400031,35.000000,0.388128,1.908397,0.039941,2.500000,0.025114,8.015267,0.070784,6.000000,0.063927,19.452055,0.214991,21.212121,0.228734,28.219178,0.362668,32.467532,0.405806,1.643836,0.034551,2.164502,0.030975,8.767123,0.061624,7.359307,0.111434,20.000000,0.181818,21.090047,0.216174,35.000000,0.314050,29.857820,0.351779,2.500000,0.020661,1.658768,0.034410,6.000000,0.057851,7.582938,0.078833,20.000000,0.185185,21.789883,0.198832,35.000000,0.324074,36.186770,0.333865,2.500000,0.023148,3.112840,0.023114,6.000000,0.055556,7.782101,0.093396,20.000000,0.175214,20.915033,0.212551,35.000000,0.273504,32.352941,0.345787,2.500000,0.019231,1.960784,0.026895,6.000000,0.068376,8.496732,0.076002,8.000000,0.523364,8.943089,0.511363,17.696970,1.106061,16.747967,0.935023,4.848485,0.204899,6.991870,0.425133,2.424242,0.128151,3.089431,0.173933,0.206425,0.221619,0.343505,0.374889,0.027460,0.031620,0.077240,0.079440,0.523364,0.511363,1.310960,1.360156,0.012002,-0.074534,-28.614249,-0.049196,-0.017394,-0.031384,-0.004160


In [78]:
print(twothree_df['home_era'].mean())
print(twothree_df['away_era'].mean())
print(twothree_df['home_whip9'].mean())
print(twothree_df['away_whip9'].mean())

0.4735441918073316
0.4702200258921263
1.282375229855711
1.2794611727735619


## Model Training and Evaluation

Now let's train regression models to predict the probability of home team wins (which can be converted to moneylines).

In [83]:
feature_cols = [
    'era_diff','whip9_diff', 'obp_diff', 'slg_diff', 'wp_diff', 'elo_diff', 'HRR_diff',   
]

#import boossted_stump
from sklearn.ensemble import GradientBoostingRegressor

#import K nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor

def train_moneyline_models(features_df):
    """Train models to predict home team win probability"""
    if features_df is None or len(features_df) < 10:
        print("Insufficient data for training")
        return None, None
    
    # Select feature columns (exclude metadata and target)
    X = features_df[feature_cols]
    y = features_df['home_win']


    print(f"Training on {len(X)} games with {len(feature_cols)} features")
    print(f"Home team win rate: {y.mean():.3f}")
    
    # Normalize features
    # for col in X.columns:
    #     print(f"Normalizing feature: {col}")
    #     print(f"  Mean: {X[col].mean():.4f}, Std: {X[col].std():.4f}")
    X = (X - X.mean()) / X.std()

    # Split data
    if len(X) > 20:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random.randint(1, 10000))
    else:
        # Use all data for training if dataset is small
        X_train, X_test, y_train, y_test = X, X, y, y
        print("Small dataset - using all data for both training and testing")
    
    # Train models
    models = {}

    # KNN Regressor
    knn_model = KNeighborsRegressor(n_neighbors=50)
    knn_model.fit(X_train, y_train)
    models['KNN Regressor'] = knn_model

    
    # Logistic Regression
    lgr_model = LogisticRegression()
    lgr_model.fit(X_train, y_train)
    models['Logistic Regression'] = lgr_model
    
    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=random.randint(1, 10000))
    rf_model.fit(X_train, y_train)
    models['Random Forest'] = rf_model
    
    # Evaluate models
    results = {}
    for name, model in models.items():
        if hasattr(model, 'predict_proba'):
            # For classification models
            y_pred = model.predict_proba(X_test)[:, 1]
        else:
            # For regression models
            y_pred = model.predict(X_test)
        # Clip predictions to [0, 1] range for probability
        y_pred = np.clip(y_pred, 0, 1)
        
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Classification accuracy (using 0.5 threshold)
        y_pred_class = (y_pred > 0.5).astype(int)
        accuracy = (y_pred_class == y_test).mean()
        
        results[name] = {
            'MSE': mse,
            'MAE': mae,
            'R²': r2,
            'Accuracy': accuracy,
            'predictions': y_pred
        }
        
        print(f"\n{name} Results:")
        print(f"  MSE: {mse:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  R²: {r2:.4f}")
        print(f"  Accuracy: {accuracy:.4f}")
    
    return models, results

# Train models
all_models, all_results = list(), list()
for i in range(10):
    print(f"Training iteration {i+1}")
    models, results = train_moneyline_models(twothree_df)
    all_models.append(models)
    all_results.append(results)

print("After training 10 iterations:")
#Print average results
avg_results = {name: {metric: np.mean([result[name][metric] for result in all_results]) for metric in all_results[0][name]} for name in all_results[0]}
for name, metrics in avg_results.items():
    print(f"\n{name} Average Results:")
    for metric, value in metrics.items():
        if metric == 'predictions':
            continue
        print(f"  {metric}: {value:.4f}")



Training iteration 1
Training on 2476 games with 7 features
Home team win rate: 0.520

KNN Regressor Results:
  MSE: 0.2421
  MAE: 0.4821
  R²: 0.0249
  Accuracy: 0.5760

Logistic Regression Results:
  MSE: 0.2414
  MAE: 0.4852
  R²: 0.0278
  Accuracy: 0.5814

Random Forest Results:
  MSE: 0.2422
  MAE: 0.4663
  R²: 0.0248
  Accuracy: 0.5976
Training iteration 2
Training on 2476 games with 7 features
Home team win rate: 0.520

KNN Regressor Results:
  MSE: 0.2450
  MAE: 0.4846
  R²: 0.0190
  Accuracy: 0.5437

Logistic Regression Results:
  MSE: 0.2463
  MAE: 0.4887
  R²: 0.0139
  Accuracy: 0.5518

Random Forest Results:
  MSE: 0.2427
  MAE: 0.4665
  R²: 0.0282
  Accuracy: 0.5720
Training iteration 3
Training on 2476 games with 7 features
Home team win rate: 0.520

KNN Regressor Results:
  MSE: 0.2469
  MAE: 0.4859
  R²: 0.0028
  Accuracy: 0.5478

Logistic Regression Results:
  MSE: 0.2453
  MAE: 0.4880
  R²: 0.0093
  Accuracy: 0.5424

Random Forest Results:
  MSE: 0.2507
  MAE: 0.4765


In [80]:
feature_cols = [
    'era', 'whip9', 'avg', 'slg', 'HRR',
]

def train_elo_model(feature_df):
    elo_df = pd.DataFrame()

    home_dict = {col: feature_df[f'home_pregame_{col}'] for col in feature_cols}
    home_dict['win'] = feature_df['home_win']

    away_dict = {col: feature_df[f'away_pregame_{col}'] for col in feature_cols}
    away_dict['win'] = 1 - feature_df['home_win']  # Invert home win to get away win

    elo_df = pd.concat([pd.DataFrame(home_dict), pd.DataFrame(away_dict)], ignore_index=True)
    print(f"Training Elo model on {len(elo_df)} games with {len(feature_cols)} features")

In [81]:
# Show each coefficients for the Linear Regression model
print("\nLinear Regression Coefficients:")
for feature, coef in zip(feature_cols, models['Linear Regression'].coef_):
    print(f"{feature}: {coef:.4f}")
    print("appx imporance:", abs(coef) / sum(abs(models['Linear Regression'].coef_)))
print("LinearRegression constant:", models['Linear Regression'].intercept_)


# Show each coefficients for the Logistic Regression model
print("\nLogistic Regression Coefficients:")
for feature, coef in zip(feature_cols, models['Logistic Regression'].coef_[0]):
    print(f"{feature}: {coef:.4f}")
print("LogisticRegression constant:", models['Logistic Regression'].intercept_[0])
print("Expected Home Win Probability based on Intercept:", 
      1 / (1 + np.exp(-models['Logistic Regression'].intercept_[0])))

# Plotting feature importance for Random Forest model
if 'Random Forest' in models:
    rf_importances = models['Random Forest'].feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf_importances
    }).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title('Random Forest Feature Importance')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()




Linear Regression Coefficients:


KeyError: 'Linear Regression'

## Moneyline Conversion and Predictions

Convert win probabilities to moneylines and make predictions for upcoming games.

In [None]:
def probability_to_moneyline(prob):
    """Convert win probability to American moneyline odds"""
    if prob <= 0 or prob >= 1:
        return None
    
    if prob > 0.5:
        # Favorite (negative odds)
        return -int(prob / (1 - prob) * 100)
    else:
        # Underdog (positive odds)
        return int((1 - prob) / prob * 100)

def moneyline_to_probability(moneyline):
    """Convert American moneyline odds to win probability"""
    if moneyline > 0:
        return 100 / (moneyline + 100)
    else:
        return abs(moneyline) / (abs(moneyline) + 100)

def get_todays_games():
    """Get today's scheduled games for predictions"""
    try:
        today = datetime.now().strftime('%Y-%m-%d')
        schedule = mlbstatsapi.schedule(date=today)
        
        todays_games = []
        for game in schedule:
            if game['status']['statusCode'] in ['S', 'P']:  # Scheduled or Pre-game
                game_data = {
                    'game_id': game['gamePk'],
                    'date': today,
                    'home_team_id': game['teams']['home']['team']['id'],
                    'home_team_name': game['teams']['home']['team']['name'],
                    'away_team_id': game['teams']['away']['team']['id'],
                    'away_team_name': game['teams']['away']['team']['name'],
                    'game_time': game.get('gameDate', 'TBD')
                }
                todays_games.append(game_data)
        
        return pd.DataFrame(todays_games)
    except Exception as e:
        print(f"Error getting today's games: {e}")
        return None

# Get today's games and make predictions
if 'models' in locals() and models is not None and 'standings_df' in locals():
    todays_games = get_todays_games()
    
    if todays_games is not None and len(todays_games) > 0:
        print(f"Found {len(todays_games)} games scheduled for today:")
        
        # Create features for today's games
        todays_features = create_features_for_games(todays_games, standings_df)
        
        if todays_features is not None:
            feature_cols = [
                'home_win_pct', 'away_win_pct', 'win_pct_diff',
                'home_run_diff', 'away_run_diff', 'run_diff_advantage',
                'home_field_advantage', 'home_team_era', 'away_team_era', 'era_advantage'
            ]
            
            X_today = todays_features[feature_cols]
            
            # Make predictions with best model (Random Forest)
            if 'Random Forest' in models:
                predictions = models['Random Forest'].predict(X_today)
                predictions = np.clip(predictions, 0.01, 0.99)  # Avoid extreme probabilities
                
                print("\n=== TODAY'S PREDICTIONS ===")
                for i, (_, game) in enumerate(todays_features.iterrows()):
                    home_prob = predictions[i]
                    away_prob = 1 - home_prob
                    
                    home_ml = probability_to_moneyline(home_prob)
                    away_ml = probability_to_moneyline(away_prob)
                    
                    print(f"\nGame: {game['away_team_name']} @ {game['home_team_name']}")
                    print(f"  Home Win Probability: {home_prob:.3f} ({home_ml:+d})")
                    print(f"  Away Win Probability: {away_prob:.3f} ({away_ml:+d})")
                    
                    # Determine recommendation
                    if home_prob > 0.55:
                        print(f"  Recommendation: Bet {game['home_team_name']} (Home)")
                    elif away_prob > 0.55:
                        print(f"  Recommendation: Bet {game['away_team_name']} (Away)")
                    else:
                        print(f"  Recommendation: No strong edge detected")
            else:
                print("Random Forest model not available for predictions")
        else:
            print("Could not create features for today's games")
    else:
        print("No games scheduled for today or could not retrieve schedule")
else:
    print("Models not trained - cannot make predictions")

In [None]:
models, results = train_moneyline_models(twothree_df)