In [537]:
import pandas as pd
import numpy as np
from datetime import datetime


import haversine as hs   
from haversine import Unit

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

import plotly.graph_objects as go



# Load Data


In [2]:
# Game-level data, read from a local CSV (a Colab copy used to live at
# r'/content/datathon_2024_dataset.csv'). The first CSV column is a saved
# row index, so it is sliced off right after reading.
path = r'/Users/willmiraglia/Desktop/Datathon 2024/datathon_2024_dataset_corrected.csv'
baseball_df = pd.read_csv(path).iloc[:, 1:]

# Per-city lookup table, joined onto the games through the 'city' column.
city_path = r'/Users/willmiraglia/Desktop/Datathon 2024/city_cords.csv'
city_cords = pd.read_csv(city_path)

# Left join keeps every game even when its city has no match in the lookup.
# reset_index() turns the row position into an 'index' column, which the
# travel cells further down use as a per-game identifier.
baseball_df = baseball_df.merge(city_cords, on='city', how='left').reset_index()
baseball_df.head(5)

Unnamed: 0,index,game_date,home_team,away_team,is_day_game,home_score,away_score,venue,venue_name,city,...,home_2b,home_3b,home_hr,home_fo,home_so,home_bb,home_hbp,lat,lng,time_zone
0,0,20000329,NYN,CHN,False,3,5,TOK01,Tokyo Dome,Tokyo,...,1,0,1,24,4,3,0,35.6764,139.65,9
1,1,20000330,CHN,NYN,False,1,5,TOK01,Tokyo Dome,Tokyo,...,0,0,0,24,9,6,0,35.6764,139.65,9
2,2,20000403,ATL,COL,True,2,0,ATL02,Turner Field,Atlanta,...,0,0,2,17,6,1,1,33.7488,-84.3877,-5
3,3,20000403,CIN,MIL,True,3,3,CIN08,Cinergy Field,Cincinnati,...,1,0,1,10,4,1,0,39.1031,-84.512,-5
4,4,20000403,FLO,SFN,False,6,4,MIA01,Sun Life Stadium,Miami,...,3,0,0,17,7,1,1,25.7617,-80.1918,-5


# Add Travel Data

## Distance Functions

In [3]:
def _haversine_km(row, from_lat_col, from_lng_col):
    """Great-circle distance in kilometres from a previous location to this game.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'lat'`` / ``'lng'`` for the current game and the two
        columns named by ``from_lat_col`` / ``from_lng_col`` for the origin.
    from_lat_col, from_lng_col : str
        Names of the columns holding the origin latitude and longitude.

    Returns
    -------
    Whatever ``hs.haversine`` returns for the two (lat, lng) pairs with
    ``unit=hs.Unit.KILOMETERS``.
    """
    origin = (row[from_lat_col], row[from_lng_col])
    destination = (row['lat'], row['lng'])
    return hs.haversine(origin, destination, unit=hs.Unit.KILOMETERS)


def get_haversine_distance_game(row):
    """Distance (km) between the previous game's location and this game's location.

    Reads ``last_lat_game`` / ``last_lng_game`` and ``lat`` / ``lng`` from ``row``.
    Intended for ``DataFrame.apply(..., axis=1)``.
    """
    return _haversine_km(row, 'last_lat_game', 'last_lng_game')


def get_haversine_distance_series(row):
    """Distance (km) between the previous series' location and this game's location.

    Reads ``last_lat_series`` / ``last_lng_series`` and ``lat`` / ``lng`` from ``row``.
    Intended for ``DataFrame.apply(..., axis=1)``.
    """
    return _haversine_km(row, 'last_lat_series', 'last_lng_series')

## Clean Original Data

In [7]:
def clean_data(baseball_df):
    """Normalise the raw game table in place and return it.

    Steps:
      * parse ``game_date`` (``YYYYMMDD``) into datetimes and derive ``year``;
      * add ``home_win`` (1 when ``home_score > away_score``, else 0; ties
        and missing scores count as 0);
      * translate the data set's team codes into the cleaned abbreviations.

    Team codes that are not keys of the translation table are left untouched
    rather than being turned into NaN. This keeps the function safe to run
    twice on the same frame: already-cleaned codes such as ``'NYM'`` or
    ``'KC'`` are not keys of the table and would otherwise be wiped out.

    Parameters
    ----------
    baseball_df : pandas.DataFrame
        Needs ``game_date``, ``home_team``, ``away_team``, ``home_score`` and
        ``away_score`` columns. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # convert to datetime formatting (yyyy-mm-dd)
    baseball_df['game_date'] = pd.to_datetime(baseball_df['game_date'], format="%Y%m%d")
    baseball_df['year'] = baseball_df['game_date'].dt.year

    # add home win column: 1 if the home team won, 0 otherwise
    baseball_df['home_win'] = (baseball_df['home_score'] > baseball_df['away_score']).astype(int)

    # clean the abbreviations
    clean_abrev_dict = {
        'NYN': 'NYM', # mets
        'CHN': 'CHC', # cubs
        'ATL': 'ATL', # braves
        'CIN': 'CIN', # reds
        'FLO': 'MIA', # marlins
        'MON': 'MON', # expos
        'SLN': 'STL', # cardinals
        'ANA': 'LAA', # angels
        'BAL': 'BAL', # orioles
        'MIN': 'MIN', # twins
        'OAK': 'OAK', # a's
        'TEX': 'TEX', # rangers
        'TOR': 'TOR', # blue jays
        'ARI':  'AZ',  # d-backs
        'PIT': 'PIT', # pirates
        'SEA': 'SEA', # mariners
        'HOU': 'HOU', # astros
        'KCA': 'KC',  # royals
        'TBA': 'TB',  # rays
        'COL': 'COL', # rockies
        'MIL': 'MIL', # brewers
        'PHI': 'PHI', # phillies
        'SDN': 'SD',  # padres
        'SFN': 'SF',  # giants
        'BOS': 'BOS', # red sox
        'DET': 'DET', # tigers
        'NYA': 'NYY', # yankees
        'LAN': 'LAD', # dodgers
        'CHA': 'CWS', # white sox
        'CLE': 'CLE', # guardians
        'WAS': 'WAS', # nationals
        'MIA': 'MIA', # marlins
    }

    # replace the data abbreviations with the correct abbreviations;
    # .get(code, code) passes unknown / already-clean codes through unchanged
    for team_col in ('home_team', 'away_team'):
        baseball_df[team_col] = baseball_df[team_col].map(
            lambda code: clean_abrev_dict.get(code, code)
        )

    return baseball_df

## Create Travel Dictionary by Team

In [2]:
# Flip to True to rebuild the per-team travel tables from `baseball_df`.
# Needs `baseball_df` with a datetime 'game_date' (the subtraction below
# relies on it) plus the 'lat' / 'lng' / 'time_zone' columns.
runTeamDict = False

unique_teams = pd.unique(pd.concat([baseball_df['home_team'], baseball_df['away_team']]))
if runTeamDict:
    # team -> DataFrame of that team's games (home and away) with travel columns
    team_schedules = {}
    for team in unique_teams:

        all_team_games = baseball_df[(baseball_df['home_team'] == team) | (baseball_df['away_team'] == team)]
        all_team_games = all_team_games.sort_values(by='game_date', ascending=True).reset_index(drop=True)
        if all_team_games.empty:
            # e.g. a NaN team code matches no rows; nothing to compute
            continue

        # --- game-to-game travel -------------------------------------------
        all_team_games['time_between_games'] = all_team_games['game_date'] - all_team_games['game_date'].shift(1)

        all_team_games['last_lat_game'] = all_team_games['lat'].shift(1)
        all_team_games['last_lng_game'] = all_team_games['lng'].shift(1)

        all_team_games['distance_travelled_game'] = all_team_games.apply(get_haversine_distance_game, axis=1)

        # --- series-to-series travel ---------------------------------------
        all_team_games['last_lat_series'] = np.nan
        all_team_games['last_lng_series'] = np.nan

        last_series_date = all_team_games.iloc[0]['game_date']
        # Reset per team. Without this the value is unbound when the team's
        # second game continues its first series, or silently carries over
        # from the previous team in the outer loop.
        last_time_zone = np.nan
        for i in range(1, len(all_team_games)):
            curr_game = all_team_games.iloc[i]

            curr_home = curr_game['home_team']
            curr_away = curr_game['away_team']
            curr_venue = curr_game['venue_name']

            last_game = all_team_games.iloc[i-1]

            last_home = last_game['home_team']
            last_away = last_game['away_team']
            last_venue = last_game['venue_name']

            if (curr_home == last_home) and (curr_away == last_away) and (curr_venue == last_venue): # same series
                # carry forward the origin already recorded on the previous row
                last_lat = last_game['last_lat_series']
                last_lng = last_game['last_lng_series']
            else: # new series
                # the previous game closed the previous series: remember where/when it was
                last_series_date = last_game['game_date']
                last_time_zone = last_game['time_zone']
                last_lat = last_game['lat']
                last_lng = last_game['lng']


            all_team_games.loc[i, 'last_series_time_zone'] = last_time_zone

            days_since_last_series = curr_game['game_date'] - last_series_date
            all_team_games.loc[i, 'time_since_last_series'] = days_since_last_series

            all_team_games.loc[i, 'last_lat_series'] = last_lat
            all_team_games.loc[i, 'last_lng_series'] = last_lng

        all_team_games['distance_travelled_series'] = all_team_games.apply(get_haversine_distance_series, axis=1)

        team_schedules[team] = all_team_games
        

NameError: name 'baseball_df' is not defined

## Merge Travel Data to Games

In [4]:
# Flip to True to attach each team's travel features (from `team_schedules`)
# to the game rows. Games are matched through the shared 'index' column.
runGameJoin = False


if runGameJoin:
    # Only seasons after 2000 are kept.
    baseball_sub = baseball_df[baseball_df['year'] > 2000].copy()

    # One accumulator per output column, filled in game order.
    home_last_time_zone_list = []
    home_time_between_games_list = []
    home_time_since_last_series_list = []
    home_distance_travelled_game_list = []
    home_distance_travelled_series_list = []

    away_last_time_zone_list = []
    away_time_between_games_list = []
    away_time_since_last_series_list = []
    away_distance_travelled_game_list = []
    away_distance_travelled_series_list = []

    for row in tqdm(range(len(baseball_sub))):
        game = baseball_sub.iloc[row]
        home_team = game['home_team']
        away_team = game['away_team']
        game_index = game['index']

        # Home side: locate this game once in the home team's schedule.
        home_schedule = team_schedules[home_team]
        home_game = home_schedule[home_schedule['index'] == game_index]
        home_last_time_zone_list.append(home_game['last_series_time_zone'].iloc[0])
        # the two time columns hold timedeltas; keep only the whole-day part
        home_time_between_games_list.append(home_game['time_between_games'].iloc[0].days)
        home_time_since_last_series_list.append(home_game['time_since_last_series'].iloc[0].days)
        home_distance_travelled_game_list.append(home_game['distance_travelled_game'].iloc[0])
        home_distance_travelled_series_list.append(home_game['distance_travelled_series'].iloc[0])

        # Away side: same lookup against the away team's schedule.
        away_schedule = team_schedules[away_team]
        away_game = away_schedule[away_schedule['index'] == game_index]
        away_last_time_zone_list.append(away_game['last_series_time_zone'].iloc[0])
        away_time_between_games_list.append(away_game['time_between_games'].iloc[0].days)
        away_time_since_last_series_list.append(away_game['time_since_last_series'].iloc[0].days)
        away_distance_travelled_game_list.append(away_game['distance_travelled_game'].iloc[0])
        away_distance_travelled_series_list.append(away_game['distance_travelled_series'].iloc[0])

    # Attach the collected values, home/away pair by pair.
    baseball_sub['home_last_time_zone'] = home_last_time_zone_list
    baseball_sub['away_last_time_zone'] = away_last_time_zone_list

    baseball_sub['home_time_between_games'] = home_time_between_games_list
    baseball_sub['away_time_between_games'] = away_time_between_games_list

    baseball_sub['home_time_since_last_series'] = home_time_since_last_series_list
    baseball_sub['away_time_since_last_series'] = away_time_since_last_series_list

    baseball_sub['home_distance_travelled_game'] = home_distance_travelled_game_list
    baseball_sub['away_distance_travelled_game'] = away_distance_travelled_game_list

    baseball_sub['home_distance_travelled_series'] = home_distance_travelled_series_list
    baseball_sub['away_distance_travelled_series'] = away_distance_travelled_series_list

# Load Necessary Data

## Reload Travel Data

In [10]:
# Read the game table that already carries the travel columns from a local CSV.
# This rebinds `baseball_df`, replacing whatever earlier cells stored under that name.
# NOTE(review): presumably this file was written from `baseball_sub` in the
# "Merge Travel Data to Games" cell — confirm; the export step is not in this notebook.
baseball_path = r'/Users/willmiraglia/Desktop/Datathon 2024/datathon2024_data_w_dist.csv'
baseball_df = pd.read_csv(baseball_path)
baseball_df

Unnamed: 0,index,game_date,home_team,away_team,is_day_game,home_score,away_score,venue,venue_name,city,...,home_last_time_zone,away_last_time_zone,home_time_between_games,away_time_between_games,home_time_since_last_series,away_time_since_last_series,home_distance_travelled_game,away_distance_travelled_game,home_distance_travelled_series,away_distance_travelled_series
0,2429,2001-04-01,TOR,TEX,True,8,1,SJU01,Estadio Hiram Bithorn,San Juan,...,-5.0,-8.0,182,182.0,182,182.0,2957.191537,5815.094146,2957.191537,5815.094146
1,2430,2001-04-02,BAL,BOS,True,2,1,BAL12,Oriole Park at Camden Yards,Baltimore,...,-5.0,-5.0,183,183.0,183,183.0,0.000000,1396.336454,0.000000,1396.336454
2,2431,2001-04-02,CLE,CWS,True,4,7,CLE08,Progressive Field,Cleveland,...,-5.0,-6.0,183,183.0,183,183.0,0.000000,494.028306,0.000000,494.028306
3,2432,2001-04-02,NYY,KC,True,7,3,NYC16,Yankee Stadium I,New York,...,-5.0,-6.0,183,183.0,183,183.0,272.552421,1143.759125,272.552421,1143.759125
4,2433,2001-04-02,SEA,OAK,False,5,4,SEA03,Safeco Field,Seattle,...,-8.0,-8.0,183,183.0,183,183.0,1575.035287,1089.912340,1575.035287,1089.912340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54341,56770,2023-10-01,CWS,SD,True,1,2,CHI12,Guaranteed Rate Field;U.S. Cellular Field,Chicago,...,-6.0,-8.0,1,1.0,3,4.0,0.000000,0.000000,0.000000,2986.457169
54342,56771,2023-10-01,DET,CLE,True,5,2,DET05,Comerica Park,Detroit,...,-5.0,-5.0,1,1.0,3,4.0,0.000000,0.000000,0.000000,145.132345
54343,56772,2023-10-01,KC,NYY,True,5,2,KAN06,Kauffman Stadium,Kansas City,...,-5.0,-5.0,1,1.0,3,3.0,0.000000,0.000000,1042.655277,1369.898836
54344,56773,2023-10-01,SEA,TEX,True,1,0,SEA03,Safeco Field,Seattle,...,-8.0,-8.0,1,1.0,4,4.0,0.000000,0.000000,0.000000,1575.035287


## Add Odds Data and Join with Travel Data

## Add Odds Data

In [532]:
# Betting odds, read from a local CSV.
odds_path = r'/Users/willmiraglia/Desktop/Datathon 2024/Odds/Full_Odds.csv'
odds_df = pd.read_csv(odds_path)

# Inner join on date, teams and final score, then discard every row that
# still has a missing value in any column.
full_datathon_df = baseball_df.merge(
    odds_df,
    on=['game_date', 'home_team', 'away_team', 'home_score', 'away_score'],
)
full_datathon_df = full_datathon_df.dropna()

# Snapshot the joined table to disk before trimming columns.
full_datathon_df.to_csv(r'/Users/willmiraglia/Desktop/Datathon 2024/datathon2024_FULL.csv')

# Remove the two strikeout columns from the in-memory frame (the saved CSV keeps them).
full_datathon_df = full_datathon_df.drop(columns=['home_so', 'away_so'])

### Quick Linear Regression Showing that Odds perform extremely well

In [549]:
# Regress the home-win indicator on the home odds alone.
odds_reg_df = full_datathon_df[['home_odds', 'home_win']]
X = odds_reg_df[['home_odds']]
y = odds_reg_df['home_win']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)

print("Coefficient: ", round(model.coef_[0], 4))

# Side-by-side view of the input odds and the fitted values on the held-out rows
y_pred = model.predict(X_test)
pd.DataFrame({'Odds': X_test['home_odds'].round(3), 'Predictions': y_pred.round(3)})



Coefficient:  1.0299


Unnamed: 0,Odds,Predictions
6611,0.564,0.564
18719,0.513,0.511
3433,0.558,0.558
953,0.511,0.509
3271,0.515,0.513
...,...,...
10560,0.450,0.447
22029,0.632,0.634
24824,0.661,0.664
9643,0.580,0.580


## Add Pitcher Data

In [355]:
# Starting-pitcher information, read from a local CSV; the first column is a
# saved row index and is sliced off.
pitcher_path = r'/Users/willmiraglia/Desktop/Datathon 2024/pitcher_info.csv'
pitcher_df = pd.read_csv(pitcher_path).iloc[:,1:]

# Keep only the join keys and the pitcher columns. The explicit .copy() makes
# this an independent frame, so the assignments below do not raise
# SettingWithCopyWarning. (A 'year' column used to be derived here, but it was
# discarded by this very selection, so it is no longer computed.)
pitcher_df = pitcher_df[['game_date', 'home_team', 'away_team', 'home_pitcher', 'away_pitcher',
                          'home_earned_runs', 'away_earned_runs', 'home_so', 'away_so']].copy()

# A pitcher value of 0 is treated as missing so the dropna() below removes those games.
pitcher_df['home_pitcher'] = pitcher_df['home_pitcher'].replace(0, np.nan)
pitcher_df['away_pitcher'] = pitcher_df['away_pitcher'].replace(0, np.nan)

# NOTE(review): the join key is (date, home, away). If two games share that key
# (e.g. a doubleheader) the merge pairs every combination — confirm the data has none.
# NOTE(review): this cell rebinds `full_datathon_df`; running it twice merges the
# pitcher columns a second time and produces suffixed duplicates.
full_datathon_df = pd.merge(full_datathon_df, pitcher_df, on=['game_date', 'home_team', 'away_team'], how='left').dropna()

### Pitcher Stats Callable Dictionary

In [356]:
# pitcher -> year -> {'avg_ER', 'avg_SO'}: each pitcher's mean earned runs and
# strikeouts over the games dated on or before June 30 of that year.
# 2020 is skipped. Years in which the pitcher has no such game get NaN for both.
unique_pitchers = pd.unique(pd.concat([full_datathon_df['home_pitcher'], full_datathon_df['away_pitcher']]))
pitcher_schedules = {}
print(len(unique_pitchers))

for pitcher in unique_pitchers:
    all_pitcher_games = full_datathon_df[(full_datathon_df['home_pitcher'] == pitcher) | (full_datathon_df['away_pitcher'] == pitcher)]

    # chronological order, so the years are visited (and stored) oldest first
    all_pitcher_games = all_pitcher_games.sort_values(by='game_date', ascending=True).reset_index(drop=True)

    # Parse the dates once per pitcher; normalize() drops any time-of-day part.
    game_days = pd.to_datetime(all_pitcher_games['game_date']).dt.normalize()

    # Take the pitcher's own line from whichever side he is listed on.
    # Where both sides match, the home columns win (same as an if/elif on home first).
    pitched_at_home = all_pitcher_games['home_pitcher'] == pitcher
    earned_runs = all_pitcher_games['home_earned_runs'].where(pitched_at_home, all_pitcher_games['away_earned_runs'])
    strikeouts = all_pitcher_games['home_so'].where(pitched_at_home, all_pitcher_games['away_so'])

    unique_years = pd.unique(all_pitcher_games['year'])
    year_to_games = {}
    for year in unique_years:
        if year == 2020:
            continue

        in_first_half = (all_pitcher_games['year'] == year) & (game_days <= pd.Timestamp(f'{year}-06-30'))

        # .mean() of an empty selection is NaN, which covers the "no first-half games" case
        pitch_stat_dict = {
            'avg_ER': earned_runs[in_first_half].mean(),
            'avg_SO': strikeouts[in_first_half].mean()
        }

        year_to_games[year] = pitch_stat_dict

    pitcher_schedules[pitcher] = year_to_games

1419


### Make dictionary of team to season and season to schedule

In [357]:
# team -> year -> that team's games (home and away) for the year, in date order.
# 2020 is left out. Note that this rebinds `team_schedules`.
team_schedules = {}
unique_teams = pd.unique(pd.concat([full_datathon_df['home_team'], full_datathon_df['away_team']]))

for team in unique_teams:
    plays_in_game = (full_datathon_df['home_team'] == team) | (full_datathon_df['away_team'] == team)
    all_team_games = (
        full_datathon_df[plays_in_game]
        .sort_values(by='game_date', ascending=True)
        .reset_index(drop=True)
    )

    unique_years = pd.unique(all_team_games['year'])
    team_schedules[team] = {
        year: all_team_games[all_team_games['year'] == year]
        for year in unique_years
        if year != 2020
    }


### Create Year to Season Callable Dictionary

In [484]:
# year -> season table restricted to the columns the regressions need (2020 left out)
season_columns = [
    'game_date', 'home_team', 'away_team',
    'home_pitcher', 'away_pitcher',
    'home_score', 'away_score',
    'time_zone', 'home_last_time_zone', 'away_last_time_zone',
    'home_time_between_games', 'away_time_between_games',
    'home_time_since_last_series', 'away_time_since_last_series',
    'home_distance_travelled_series', 'away_distance_travelled_series',
    'home_distance_travelled_game', 'away_distance_travelled_game',
    'home_odds', 'away_odds', 'home_win',
]

year_to_season = {}

unique_years = pd.unique(full_datathon_df['year'])
for year in unique_years:
    if year != 2020:
        games_in_year = full_datathon_df[full_datathon_df['year'] == year]
        season_df = games_in_year[season_columns]
        year_to_season[year] = season_df


### Full Team Summary Stats Callable Dictionary (team to year to stats)

In [359]:
# team -> year -> first-half summary stats (games dated on or before June 30).
# Every game is re-expressed from the team's point of view ("team_*" is what
# the team did, "allowed_*" is what its opponent did), then averaged per game.
# A team with no first-half games in a year gets NaN for every stat instead of
# raising from an empty concat.
team_summary_stats = {}

# Box-score columns that come as home_<stat> / away_<stat> pairs.
box_score_stats = ['pa', '1b', '2b', '3b', 'hr', 'bb']

for team in team_schedules.keys():

    year_to_half1 = {}

    for year in team_schedules[team].keys():
        team_schedule = team_schedules[team][year].reset_index(drop=True)

        # normalize() drops any time-of-day part before the cutoff comparison
        game_days = pd.to_datetime(team_schedule['game_date']).dt.normalize()
        team_half1 = team_schedule[game_days <= pd.Timestamp(f'{year}-06-30')]

        # True where the team is the home side; all other rows are its away games
        is_home = team_half1['home_team'] == team

        half1_game_summaries = pd.DataFrame({
            'runs_scored': team_half1['home_score'].where(is_home, team_half1['away_score']),
            'runs_allowed': team_half1['away_score'].where(is_home, team_half1['home_score']),
        })
        for stat in box_score_stats:
            half1_game_summaries[f'team_{stat}'] = team_half1[f'home_{stat}'].where(is_home, team_half1[f'away_{stat}'])
            half1_game_summaries[f'allowed_{stat}'] = team_half1[f'away_{stat}'].where(is_home, team_half1[f'home_{stat}'])

        half1_game_summaries['allowed_hits'] = (
            half1_game_summaries['allowed_1b'] +
            half1_game_summaries['allowed_2b'] +
            half1_game_summaries['allowed_3b'] +
            half1_game_summaries['allowed_hr']
        )

        half1_game_summaries['team_hits'] = (
            half1_game_summaries['team_1b'] +
            half1_game_summaries['team_2b'] +
            half1_game_summaries['team_3b'] +
            half1_game_summaries['team_hr']
        )

        # Per-game batting average, using (plate appearances - walks) as the denominator.
        half1_game_summaries['allowed_batting_avg'] = half1_game_summaries['allowed_hits'] / (half1_game_summaries['allowed_pa'] - half1_game_summaries['allowed_bb'])
        half1_game_summaries['team_batting_avg'] = half1_game_summaries['team_hits'] / (half1_game_summaries['team_pa'] - half1_game_summaries['team_bb'])

        # Means over games; each is NaN when there are no first-half games.
        half1_dict = {
            'team_BA': half1_game_summaries['team_batting_avg'].mean(),
            'allowed_BA': half1_game_summaries['allowed_batting_avg'].mean(),
            'team_avg_HR': half1_game_summaries['team_hr'].mean(),
            'allowed_avg_HR': half1_game_summaries['allowed_hr'].mean(),
            'avg_runs_scored': half1_game_summaries['runs_scored'].mean(),
            'avg_runs_allowed': half1_game_summaries['runs_allowed'].mean()
        }
        year_to_half1[year] = half1_dict

    team_summary_stats[team] = year_to_half1


### Create Year to Regression Dataframes

In [486]:
# year -> regression frame: the games dated after June 30, each with the
# first-half summary stats of both teams and both listed pitchers attached.
reg_half2_dict = {}

# Keys of the per-team / per-pitcher stat dicts; the output column for each is
# '<side>_<key>' (e.g. 'home_team_BA', 'away_avg_SO').
half2_team_stat_keys = ['team_BA', 'allowed_BA', 'team_avg_HR', 'allowed_avg_HR',
                        'avg_runs_scored', 'avg_runs_allowed']
half2_pitcher_stat_keys = ['avg_ER', 'avg_SO']

for year in year_to_season.keys():

    season_df = year_to_season[year]

    # Work on a copy: the frames stored in `year_to_season` are left untouched,
    # which also avoids SettingWithCopyWarning. 'game_date' holds datetime.date
    # objects in the result.
    game_dates = pd.to_datetime(season_df['game_date']).dt.date
    second_half = game_dates > pd.to_datetime(f'{year}-06-30').date()
    season_half2 = season_df.assign(game_date=game_dates)[second_half].reset_index(drop=True)

    for side in ('home', 'away'):
        # one stats dict per game, in row order
        team_stats = [team_summary_stats[team][year] for team in season_half2[f'{side}_team']]
        pitcher_stats = [pitcher_schedules[pitcher][year] for pitcher in season_half2[f'{side}_pitcher']]

        for key in half2_team_stat_keys:
            season_half2[f'{side}_{key}'] = [stats[key] for stats in team_stats]
        for key in half2_pitcher_stat_keys:
            season_half2[f'{side}_{key}'] = [stats[key] for stats in pitcher_stats]

    reg_half2_dict[year] = season_half2




# Predict Odds

## Predicting Percent Chance of Win using Baseball Stats

### Feature Importance

In [504]:
def run_random_forest_feature_importance(year, regressors):
    """Fit a random forest on one year's regression frame and report feature importances.

    The frame for ``year`` is taken from ``reg_half2_dict``, rows with missing
    values are dropped, and four derived travel/time-zone columns are added so
    they can be named in ``regressors``. The target is ``home_odds``.

    Parameters
    ----------
    year : key of ``reg_half2_dict``
    regressors : list of str
        Column names used as model inputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance_<year>``, one row per regressor.
    """
    reg_data = reg_half2_dict[year].dropna().copy().assign(
        home_time_zone_change=lambda d: (d['home_last_time_zone'] - d['time_zone']).abs(),
        away_time_zone_change=lambda d: (d['away_last_time_zone'] - d['time_zone']).abs(),
        home_time_zone_advantage=lambda d: d['home_time_zone_change'] - d['away_time_zone_change'],
        home_travel_advantage=lambda d: d['home_distance_travelled_series'] - d['away_distance_travelled_series'],
    )

    X = reg_data[regressors]
    y = reg_data['home_odds']

    # 80/20 split with a fixed seed; only the training part is needed here
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    rf_model = RandomForestRegressor(n_estimators=50, random_state=1)
    rf_model.fit(X_train, y_train)

    importance_column = f'Importance_{year}'
    return pd.DataFrame({
        'Feature': X.columns,
        importance_column: rf_model.feature_importances_
    })


In [505]:
def get_mean_feature_importance(all_cols):
    """Average each feature's random-forest importance over all years.

    Runs ``run_random_forest_feature_importance`` once per key of
    ``reg_half2_dict`` and aligns the yearly results on the feature name.

    Parameters
    ----------
    all_cols : list of str
        Regressor column names passed through to the per-year model.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance`` (mean over years, rounded to
        four decimals), sorted from most to least important. Empty, with the
        same two columns, when ``reg_half2_dict`` has no years.
    """
    yearly_importances = [
        run_random_forest_feature_importance(year, all_cols).set_index('Feature')
        for year in reg_half2_dict.keys()
    ]

    if not yearly_importances:
        return pd.DataFrame({'Feature': [], 'Importance': []})

    # One column per year, rows aligned on the feature name
    all_feature_importances = pd.concat(yearly_importances, axis=1)

    # Calculate the mean importance across the different years
    mean_importance = (
        all_feature_importances.mean(axis=1)
        .round(4)
        .rename('Importance')
        .sort_values(ascending=False)
        .rename_axis('Feature')
        .reset_index()
    )

    return mean_importance

In [506]:
# Candidate regressors. The order is kept exactly as before
# (it determines the column order of the model matrix).

# Travel / schedule features: for each quantity, home first, then away.
travel_cols = [
    f'{side}_{quantity}'
    for quantity in ('last_time_zone', 'time_between_games', 'time_since_last_series',
                     'distance_travelled_series', 'distance_travelled_game')
    for side in ('home', 'away')
]

# First-half performance features: all home columns, then all away columns.
performance_cols = [
    f'{side}_{stat}'
    for side in ('home', 'away')
    for stat in ('team_BA', 'allowed_BA', 'team_avg_HR', 'allowed_avg_HR',
                 'avg_runs_scored', 'avg_runs_allowed', 'avg_ER', 'avg_SO')
]

all_cols = travel_cols + performance_cols

get_mean_feature_importance(all_cols)

Unnamed: 0,Feature,Importance
0,away_avg_runs_allowed,0.1434
1,home_avg_runs_allowed,0.1138
2,home_allowed_BA,0.0849
3,away_allowed_BA,0.0713
4,home_avg_runs_scored,0.0603
5,home_avg_ER,0.052
6,away_avg_ER,0.0491
7,away_team_avg_HR,0.049
8,home_team_avg_HR,0.048
9,away_avg_runs_scored,0.048


### Random Forest Model Performance

In [528]:
def _fit_random_forest_for_year(year, regressors):
    """Prepare one year's data, fit the random forest, and return the held-out part.

    The frame for ``year`` comes from ``reg_half2_dict``; rows with missing
    values are dropped and four derived travel/time-zone columns are added so
    they can be named in ``regressors``. The target is ``home_odds``. The data
    is split 80/20 with a fixed seed and a 50-tree forest (fixed seed) is fit
    on the training part.

    Returns
    -------
    tuple
        ``(rf_model, X_test, y_test)``.
    """
    reg_data = reg_half2_dict[year].dropna().copy()

    reg_data['home_time_zone_change'] = abs(reg_data['home_last_time_zone'] - reg_data['time_zone'])
    reg_data['away_time_zone_change'] = abs(reg_data['away_last_time_zone'] - reg_data['time_zone'])
    reg_data['home_time_zone_advantage'] = reg_data['home_time_zone_change'] - reg_data['away_time_zone_change']
    reg_data['home_travel_advantage'] = reg_data['home_distance_travelled_series'] - reg_data['away_distance_travelled_series']

    X = reg_data[regressors]
    y = reg_data['home_odds']

    # split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # random forest model
    rf_model = RandomForestRegressor(n_estimators=50, random_state=1)
    rf_model.fit(X_train, y_train)

    return rf_model, X_test, y_test


def run_random_forest_by_year(year, regressors):
    """Return the test-set mean absolute error of the random forest for ``year``.

    Parameters
    ----------
    year : key of ``reg_half2_dict``
    regressors : list of str
        Column names used as model inputs.

    Returns
    -------
    float
        Mean absolute error between predicted and observed ``home_odds`` on
        the held-out 20% of rows.
    """
    rf_model, X_test, y_test = _fit_random_forest_for_year(year, regressors)

    # make predictions
    y_pred = rf_model.predict(X_test)

    # calculate MAE
    mae = mean_absolute_error(y_test, y_pred)

    return mae


def run_random_forest_by_year_and_plot(year, regressors):
    """Fit the random forest for ``year`` and show predicted vs observed ``home_odds``.

    Draws a scatter of the held-out rows plus a ``y = x`` reference line that
    spans the range of the predictions. Displays the figure; returns nothing.

    Parameters
    ----------
    year : key of ``reg_half2_dict``
    regressors : list of str
        Column names used as model inputs.
    """
    rf_model, X_test, y_test = _fit_random_forest_for_year(year, regressors)

    # make predictions
    y_pred = rf_model.predict(X_test)

    # plot y_pred vs y_actual
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predictions'))
    fig.add_trace(go.Scatter(x=[min(y_pred), max(y_pred)], y=[min(y_pred), max(y_pred)], name='y=x'))

    fig.update_layout(
        title=f'Predictions vs Observed Values: {year}',
        xaxis_title='Observed Values',
        yaxis_title='Predicted Values',
        xaxis_range=[0.25,0.75],
        yaxis_range=[0.25,0.75],
        width = 800, height = 600
    )

    fig.show()




#### Mean Absolute Error by Year

In [529]:
# Regressors for the per-year models: the top 4 features by mean importance.
# Defined once, outside the loop, so the name exists even when there are no
# years to iterate over (the plotting cell below reuses it).
my_regressors = ['home_avg_runs_allowed', 'away_avg_runs_allowed', 'home_allowed_BA', 'away_allowed_BA'] # top 4 feature importance

# One test-set MAE per year, in the order of reg_half2_dict's keys
maes = [run_random_forest_by_year(year, my_regressors) for year in reg_half2_dict.keys()]

mae_df = pd.DataFrame({'Year': list(reg_half2_dict.keys()), 'Mean-Absolute-Error': maes})
mae_df

Unnamed: 0,Year,Mean-Absolute-Error
0,2008,0.049275
1,2009,0.057098
2,2010,0.041469
3,2011,0.040452
4,2012,0.043394
5,2013,0.03842
6,2014,0.043898
7,2015,0.047415
8,2016,0.048612
9,2017,0.049086


#### Plot 2023 Results using top 4 Feature Importances

In [530]:
# Predicted vs observed home odds for 2023.
# `my_regressors` is the list defined in the "Mean Absolute Error by Year" cell,
# so that cell must have been run first.
run_random_forest_by_year_and_plot(2023, my_regressors)