Baseball Prediction: 3b - Augment DataFrame with Odds Data

In the previous notebook, we got historical odds data from oddsshark.com and saved it as a set of CSV files, one per team and season, named `oddsshark_<TEAM>_<SEASON>.csv`.

In this notebook we will load that data and augment our primary (game-level) data frame so that it includes this odds data - specifically, the implied probabilities and the over/under, for each game.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lxml
import html5lib
%matplotlib inline
from urllib.request import urlopen
import time
import structureboost as stb
import ml_insights as mli
# Let pandas show up to 1000 columns and 1000 rows before truncating output.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, 1000)

In [None]:
# Load the main game-level dataframe. low_memory=False makes pandas read the
# file in one pass instead of in chunks, so column dtypes are inferred from
# the whole column.
main_csv = 'df_bp1.csv'
df = pd.read_csv(main_csv, low_memory=False)

Plan of attack

- Create a dictionary structure to enable us to easily go to a specific game for a particular team and season.
- Iterate through the rows of our main dataframe and look up each game in the odds tables of both the home team and the visiting team. (Note: each game therefore needs two separate lookups, one per team.)

In [None]:
# Same oddsshark-id -> team-code mapping as in the previous notebook.
# (It would be more elegant to save it to a file and load it here.)
oddsshark_num_to_team_dict = {
    26995: 'PHI',
    26996: 'SDN',
    26997: 'SFN',
    26998: 'ANA',
    26999: 'DET',
    27000: 'CIN',
    27001: 'NYA',
    27002: 'TEX',
    27003: 'TBA',
    27004: 'COL',
    27005: 'MIN',
    27006: 'KCA',
    27007: 'ARI',
    27008: 'BAL',
    27009: 'ATL',
    27010: 'TOR',
    27011: 'SEA',
    27012: 'MIL',
    27013: 'PIT',
    27014: 'NYN',
    27015: 'LAN',
    27016: 'OAK',
    27017: 'WAS',
    27018: 'CHA',
    27019: 'SLN',
    27020: 'CHN',
    27021: 'BOS',
    27022: 'MIA',
    27023: 'HOU',
    27024: 'CLE',
}
 
# Load the saved odds files into a nested lookup:
#     df_odds_dict[team_code][season] -> odds DataFrame indexed by date_dblhead
#
# ODDS_DIR is a machine-specific absolute path. It is kept in one named
# constant so it can be changed in a single place when running elsewhere.
ODDS_DIR = '/Users/antiprotons/Desktop/DA/oddshark/'
ODDS_SEASONS = range(2019, 2023)  # seasons 2019 through 2022 inclusive

df_odds_dict = {}
# Iterate the mapping itself rather than a hard-coded id range, so the set of
# teams loaded always matches the mapping.
for team_name in oddsshark_num_to_team_dict.values():
    df_odds_dict[team_name] = {}
    print(team_name)
    for season in ODDS_SEASONS:
        fname = 'oddsshark_' + team_name + '_' + str(season) + '.csv'
        df_temp = pd.read_csv(ODDS_DIR + fname)
        # Lookup key: the digits of date_numeric followed by the digits of
        # dblheader_num, parsed back into a single integer.
        df_temp['date_dblhead'] = (
            df_temp.date_numeric.astype(str) + df_temp.dblheader_num.astype(str)
        ).astype(int)
        df_odds_dict[team_name][season] = df_temp.set_index('date_dblhead')

Augment our main dataframe

In [None]:
# Walk the main dataframe one game at a time and pull that game's odds out of
# df_odds_dict (team -> season -> odds frame indexed by date_dblhead).
# Each game is looked up twice: once in the home team's frame (implied
# probability, over/under line, over/under result) and once in the visiting
# team's frame (implied probability only).
#
# Games before 2019, and games whose lookup raises KeyError, keep the
# placeholder values set here: 0 in the numeric arrays and '' in ou_result.

n_games = df.shape[0]
implied_prob_h = np.zeros(n_games)
implied_prob_v = np.zeros(n_games)
over_under = np.zeros(n_games)
ou_result = np.full(n_games, '', dtype=object)

# `pos` is the row's ordinal position, which is what the numpy arrays must be
# indexed by. Using the dataframe's index label instead is only correct while
# the index is exactly 0..n-1.
for pos, (_, row) in enumerate(df.iterrows()):
    if (pos % 1000) == 0:
        print(pos)  # coarse progress indicator
    if row.season < 2019:
        continue

    season = row['season']
    home_team = row['team_h']
    visit_team = row['team_v']
    date_dblh = row['date_dblhead']

    # Home-team lookup. A KeyError from any level (team, season, date key or
    # column) is reported and leaves the remaining placeholders in place.
    try:
        home_odds = df_odds_dict[home_team][season]
        implied_prob_h[pos] = home_odds.loc[date_dblh, 'prob_implied']
        over_under[pos] = home_odds.loc[date_dblh, 'Total']
        ou_result[pos] = home_odds.loc[date_dblh, 'OU']
    except KeyError:
        print(f'Game not found wrt home_team:{home_team} vs {visit_team} date_dbl {date_dblh}')

    # Visiting-team lookup, handled independently so that one side failing
    # does not discard the other side's value.
    try:
        implied_prob_v[pos] = df_odds_dict[visit_team][season].loc[date_dblh, 'prob_implied']
    except KeyError:
        print(f'Game not found wrt visit_team:{visit_team} vs {home_team} date_dbl {date_dblh}')

In [None]:
# Attach the odds arrays as new columns on the main dataframe.
df['implied_prob_h'] = implied_prob_h
df['implied_prob_v'] = implied_prob_v

# Mid estimate of the home win probability: the average of the home team's
# implied probability and the complement of the visiting team's.
mid_estimate = (implied_prob_h + (1 - implied_prob_v)) / 2

# When the home lookup succeeded but the visiting lookup failed,
# implied_prob_v is still the 0 placeholder and the formula above would give
# (h + 1) / 2. Those rows are not removed by the later implied_prob_h == 0
# filter, so fall back to the home probability alone for them. All other rows
# keep the original formula.
visitor_missing = (implied_prob_h > 0) & (implied_prob_v == 0)
df['implied_prob_h_mid'] = np.where(visitor_missing, implied_prob_h, mid_estimate)

df['over_under_line'] = over_under
df['over_under_result'] = ou_result

Drop games from 2019 onward for which no home-team odds were found (their `implied_prob_h` is still 0).

In [None]:
# Show the 2019+ games whose home implied probability is still 0.
no_odds_mask = (df['season'] >= 2019) & (df['implied_prob_h'] == 0)
df.loc[no_odds_mask]

In [None]:
# Index labels of the 2019+ games whose home implied probability is still 0.
is_2019_onward = df['season'] >= 2019
lacks_home_odds = df['implied_prob_h'] == 0
indicies_to_drop = df.loc[is_2019_onward & lacks_home_odds].index
indicies_to_drop

In [None]:
# (rows, columns) of the dataframe at this point, for comparison with later cells
df.shape

In [None]:
# Remove the rows selected in indicies_to_drop, then display the new shape.
df = df.drop(index=indicies_to_drop)
df.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels; the shape is
# displayed again afterwards.
df = df.reset_index(drop=True)
df.shape

In [None]:
# Write the augmented dataframe to disk, without the index column.
output_csv = 'df_bp3.csv'
df.to_csv(output_csv, index=False)