In [1]:
# baseball-predictor.ipynb based on hello-world-predictor
# Objective: Read the dataset, do a simple prediction for games on a given date using data prior to that date, then
#            and record some stats on how well it did.
# 
#
# 12/21/19, Alexis: Currently this "helloworld" predictor is very simple. It does the following:
#  1. Reads in the 2010 season as the baseline games dataset. It does some simple selection
#     of columns and generates a few calculated fields.
#  2. It prompts the user to specify 'game day' which are the games it is to predict.
#  3. It calculates the avg net number of runs for the home team (when playing at home) and
#     the avg net number of runs for the visiting team (when playing away).
#  4. For a given game, the predicted winner is the team with the greatest avg net runs at
#     home or away (as applicable).
#  5. The prediction is compared with the actual results that day, with % correct reported.
#
#  Proposed next steps:
#  1. Integrate Venkat's "concat" capability to combine datasets across year blocks to result
#     in a much larger dataset. Note that for development we may want to keep the dataset
#     smaller size so that it doesn't take a long time to run.
#  2. Move this out of jupyter notebook into standard Python and embed the prediction logic
#     into a function that can be called repeatedly with different dates.
#  3. Run trials with large numbers of dates to product large numbers of predictions and
#     results.
#  4. Write results to a file. Possibly generate some plot of results as a function of
#     training set window size.
#  5. Start experimenting with diffent prediction functions, initially across individual
#     factors, and then with multiple factors.
#  6. Consider a statistically meaningful regresssion analysis to select factors and training
#     set window size, by factor.
#  7. If someone has energy, consider using a web API to hit a website with current day
#     game schedule so we can predict games more recent than the dataset.
#
#
# 12/21/19, Venkat : Modified the code and included the below changes
#  1. Included the logic to combine all data files into one data frame
#  2. Included cleanup logic to clean missing data rows if any such rows exists
#
# 1/4/20, Alexis: Extend from "hello-world" to a useful predictor harness.
#  1. Implement a window to constrain the lookback period to n gamedays.
#  2. Add support for predicting a range of dates, not just one day.


In [2]:
# Dataset origin:
#      The information used here was obtained free of
#      charge from and is copyrighted by Retrosheet.  Interested
#      parties may contact Retrosheet at "www.retrosheet.org".

In [3]:
# Modules
import os
import csv
import pprint
import pandas as pd
import glob
import pprint
import datetime

files = glob.glob("../datasets/Final_Data_Files/GL*.csv")
files.sort()

def reader(f):
    df = pd.read_csv(f, index_col=False, header=None)  
    df.columns = [("Col_"+str(i)) for i in range(1,df.shape[1]+1)]       

    return df

#season_df = pd.concat([reader(f) for f in files], keys=files)
season_df = pd.concat([reader(f) for f in files])

# season_df

In [4]:
season_df.shape # Useful to make sure we don't loose rows when adding column headers.

(19437, 161)

In [5]:
season_df = season_df.rename(columns={'Col_1':'Date',
                                     'Col_4':'Visiting Team',
                                     'Col_5':'Visiting League',
                                     'Col_7':'Home Team',
                                     'Col_8':'Home League',
                                     'Col_10':'Visiting Score',
                                     'Col_11':'Home Score'})
season_df = season_df[['Date', 'Visiting Team', 'Visiting League', 'Home Team', 'Home League',
                       'Visiting Score','Home Score']]
# season_df

In [6]:
# Identify incomplete rows
season_df.count()

Date               19437
Visiting Team      19437
Visiting League    19437
Home Team          19437
Home League        19437
Visiting Score     19437
Home Score         19437
dtype: int64

In [7]:
# Create a dataframe of unique game days with the game ID for the first game

# season_df.head()
gamedays = pd.Series(season_df['Date'].unique())
# print(gamedays.head())
# print(gamedays.tail())
# print(type(gamedays.iloc[0]))

def gamedays_offset(base_date, n): # returns new game date offset by n
    if base_date not in gamedays.values:
        raise ValueError(f"Looking for date, {base_date}, not in the the gamedays series.")
    base_date_index = gamedays[gamedays==base_date].index[0]
    if ((n + base_date_index) < 0) or n + base_date_index >= len(gamedays):
        raise ValueError(f"Attempting to caluclate a game date outside the range of the dataset.")
        return(0)  # Out of range
    else:
        new_index = base_date_index + n
#         print(f"new_index={new_index}")
        return gamedays.iloc[new_index]

# This is a manual test of the gamedays_offset() function.
# print(gamedays_offset(20120406, -100))
    

In [8]:
# Drop all rows with missing information
season_df = season_df.dropna(how='any')
len(season_df)

19437

In [9]:
# Create a column 
season_df['Home Winner'] = season_df['Home Score'] > season_df['Visiting Score']
season_df['V NetRuns'] = season_df['Visiting Score'] - season_df['Home Score']
season_df['H NetRuns'] = - season_df['V NetRuns']
# season_df['V AvgNetRuns'] = ""
# season_df['H AvgNetRuns'] = ""
# season_df['Predict Home Wins?'] = ""
# season_df['Prediction Correct?'] = ""
# season_df.head()

In [10]:
# Ask the user how big of a window of prior game days should we evaluate.
print(len(gamedays))
good_value = False
while not good_value:
    lookback_n = int(input(f"How many gamedays back do you want the predictor to look? "))
    good_value = lookback_n > 0 and lookback_n < len(gamedays)

1438
How many gamedays back do you want the predictor to look? 1000


In [11]:
# Get key dates for the analysis from the user:
#  * Start and End dates for the gamedays to predict
#  * Number of days to look back in order to predict a given game day.
# Prune the dataset accordingly.

def date_str(date: int):
    s = str(date)
    return f"{s[0:4]}-{s[4:6]}-{s[6:]}"

dataset_start_date = int(season_df['Date'].min())
start_date = gamedays_offset(dataset_start_date, lookback_n)
end_date = season_df['Date'].max()
print(f"dataset_start_date: {date_str(dataset_start_date)}")
print(f"start_date: {date_str(start_date)}")
print(f"end_date: {date_str(end_date)}")

good_date = False
while not good_date:
    first_game_day = int(input(f"What is the first game day (YYYYMMDD) you want us to predict (between {start_date} and {end_date}: "))
#    game_day = 20100715
    last_game_day = int(input(f"What is the last game day (YYYYMMDD) you want us to predict (between {start_date} and {end_date}: "))

    if first_game_day < start_date or first_game_day > last_game_day or last_game_day > end_date:
        print("Sorry, the date you selected is outside the range of our dataset or the last day preceeds the first day.")
        # game_day_df = season_df.loc[(season_df['Date'] == (game_day))]
    elif first_game_day not in gamedays.values:
        print(f"Sorry, no game on {first_game_day}.")
    elif last_game_day not in gamedays.values:
        print(f"Sorry, no game on {last_game_day}.")
    else:
        lookback_start_day = gamedays_offset(first_game_day, -lookback_n)
        lookback_end_day = gamedays_offset(first_game_day, -1)
        train_df = season_df.loc[((season_df['Date'] <= (last_game_day)) & (season_df['Date'] >= (lookback_start_day)))]
        print(f"Nice! We're ready to run the predictor for games from {date_str(first_game_day)} through {date_str(last_game_day)}.")
        print(f"      We'll look back {lookback_n} days prior to each gameday to build our prediction.")
        print(f"      The first game day's lookback window is {date_str(lookback_start_day)} through {date_str(lookback_end_day)}.")
        good_date = True

# train_df.head()

dataset_start_date: 2010-04-04
start_date: 2015-07-17
end_date: 2017-10-01
What is the first game day (YYYYMMDD) you want us to predict (between 20150717 and 20171001: 20160401
What is the last game day (YYYYMMDD) you want us to predict (between 20150717 and 20171001: 20161001
Sorry, no game on 20160401.
What is the first game day (YYYYMMDD) you want us to predict (between 20150717 and 20171001: 20160501
What is the last game day (YYYYMMDD) you want us to predict (between 20150717 and 20171001: 20161001
Nice! We're ready to run the predictor for games from 2016-05-01 through 2016-10-01.
      We'll look back 1000 days prior to each gameday to build our prediction.
      The first game day's lookback window is 2010-07-24 through 2016-04-30.


In [12]:
# Create a DF to hold team stats starting at the beginning of the lookback window.
# Consider adding logic to combine codes for teams that rebranded,
#   example: The Florida Marlins (FLO) became Miami Marlis (MIA) after
#   after the 2011 season.

start_processing = datetime.datetime.now()
print(f"{start_processing}: Starting build of net runs rolling average tables.")

# Update gamedays to show only the dates we care about (prediction range and lookback)
gamedays = gamedays.loc[((gamedays >= lookback_start_day) & (gamedays <= last_game_day))]

# Create home and visitor tables to hold net points, by team and gameday.
# Start with the visiting teams.
v_teams = train_df['Visiting Team'].unique()
v_teams.sort()
v_np_df = pd.DataFrame(columns=v_teams, index=gamedays) # Visiting teams net points on a givne day
v_ng_df = pd.DataFrame(columns=v_teams, index=gamedays) # Visiting teams number of games on a givne day

# Then the home teams.
h_teams = train_df['Home Team'].unique()
h_teams.sort()
h_np_df = pd.DataFrame(columns=h_teams, index=gamedays)
h_ng_df = pd.DataFrame(columns=h_teams, index=gamedays)

# Now populate the visiting and home team net points by game day dataframe
for day in v_np_df.index:
    for team in v_np_df.columns: # Visiting Team Data
        net_runs = train_df.loc[((train_df['Visiting Team']==team)&(train_df['Date']==day)),:]['V NetRuns'].sum()
        num_games = train_df.loc[((train_df['Visiting Team']==team)&(train_df['Date']==day)),:]['V NetRuns'].count()
        v_np_df.at[day, team] = net_runs
        v_ng_df.at[day, team] = num_games
        
    for team in h_np_df.columns: # Home Team Data
        net_runs = train_df.loc[((train_df['Home Team']==team)&(train_df['Date']==day)),:]['H NetRuns'].sum()
        num_games = train_df.loc[((train_df['Home Team']==team)&(train_df['Date']==day)),:]['H NetRuns'].count()
        h_np_df.at[day, team] = net_runs
        h_ng_df.at[day, team] = num_games
        
# Now populate the visiting team net points rolling averages dataframe
# Avg = sum of net points divided by # of games
v_ra_np_df = v_np_df.rolling(lookback_n).sum() / v_ng_df.rolling(lookback_n).sum()  # Rolling average of mean net points
h_ra_np_df = h_np_df.rolling(lookback_n).sum() / h_ng_df.rolling(lookback_n).sum()  # Rolling average of mean net points

end_processing = datetime.datetime.now()
duration = end_processing - start_processing
print(f'{end_processing}: Net point tables calculated in {duration} hr/min/sec.')
# Consider calculating the number of games behind each value.
# print(v_np_df.head(40))
# print(v_np_df.head(40))
# print(v_ra_np_df.head(30))
# print(h_ra_np_df.head(30))

    

2020-01-05 18:47:04.786812: Starting build of net runs rolling average tables.
2020-01-05 18:51:58.293221: Net point tables calculated in 0:04:53.506409 hr/min/sec.


In [13]:
# Now calculate the predictions and results on a rolling basis
start_processing = datetime.datetime.now()

# Go through all the games and make the predictions
results_df = train_df.loc[(train_df['Date'] >= first_game_day)].copy(deep=True)
results_df['Predict Home Wins?'] = ""
# results_df['Prediction Correct?'] = ""
print(len(results_df))

for game in results_df.index:
    game_day = results_df.loc[game,'Date']
    v_team = results_df.loc[game, "Visiting Team"]
    h_team = results_df.loc[game, "Home Team"]
    v_avg_np = v_ra_np_df.loc[game_day, v_team]
    h_avg_np = h_ra_np_df.loc[game_day, v_team]
    results_df.at[game, 'Predict Home Wins?'] = h_avg_np >= v_avg_np # Tie goes to home because home bats last

# results_df.loc['Prediction Correct?'] = results_df['Predict Home Wins?'] == results_df['Home Winner']
results_df['Prediction Correct?'] = results_df['Predict Home Wins?'] == results_df['Home Winner']

# print(train_df.tail())

end_processing = datetime.datetime.now()
duration = end_processing - start_processing
print(f'{end_processing}: Predictions calculated in {duration} hr/min/sec.')

# results_df


2059
2020-01-05 18:52:07.447055: Predictions calculated in 0:00:00.105658 hr/min/sec.


In [14]:
# Run stats on the prediction outcomes.
num_games = len(results_df)
num_correct = results_df['Prediction Correct?'].values.sum()

print(f"{num_games} games with {num_correct} predicted correctly.")
print(f"{round((num_correct/num_games*100.),1)}%")


2059 games with 1053 predicted correctly.
51.1%


In [15]:
results_df

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,Predict Home Wins?,Prediction Correct?
354,20160501,CHA,AL,BAL,AL,7,1,False,6,-6,False,True
355,20160501,NYA,AL,BOS,AL,7,8,True,-1,1,True,True
356,20160501,DET,AL,MIN,AL,6,5,False,1,-1,True,False
357,20160501,HOU,AL,OAK,AL,2,1,False,1,-1,True,False
358,20160501,KCA,AL,SEA,AL,4,1,False,3,-3,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2408,20161001,MIL,NL,COL,NL,4,3,False,1,-1,True,False
2409,20161001,NYN,NL,PHI,NL,5,3,False,2,-2,False,True
2410,20161001,LAN,NL,SFN,NL,0,3,True,-3,3,True,True
2411,20161001,PIT,NL,SLN,NL,3,4,True,-1,1,True,True
