In [1]:
# baseball-predictor.ipynb based on hello-world-predictor
# Objective: Read the dataset, do a simple prediction for games on a given date using data prior to that date, then
#            and record some stats on how well it did.
# 
#
# 12/21/19, Alexis: Currently this "helloworld" predictor is very simple. It does the following:
#  1. Reads in the 2010 season as the baseline games dataset. It does some simple selection
#     of columns and generates a few calculated fields.
#  2. It prompts the user to specify 'game day' which are the games it is to predict.
#  3. It calculates the avg net number of runs for the home team (when playing at home) and
#     the avg net number of runs for the visiting team (when playing away).
#  4. For a given game, the predicted winner is the team with the greatest avg net runs at
#     home or away (as applicable).
#  5. The prediction is compared with the actual results that day, with % correct reported.
#
#  Proposed next steps:
#  1. Integrate Venkat's "concat" capability to combine datasets across year blocks to result
#     in a much larger dataset. Note that for development we may want to keep the dataset
#     smaller size so that it doesn't take a long time to run.
#  2. Move this out of jupyter notebook into standard Python and embed the prediction logic
#     into a function that can be called repeatedly with different dates.
#  3. Run trials with large numbers of dates to product large numbers of predictions and
#     results.
#  4. Write results to a file. Possibly generate some plot of results as a function of
#     training set window size.
#  5. Start experimenting with diffent prediction functions, initially across individual
#     factors, and then with multiple factors.
#  6. Consider a statistically meaningful regresssion analysis to select factors and training
#     set window size, by factor.
#  7. If someone has energy, consider using a web API to hit a website with current day
#     game schedule so we can predict games more recent than the dataset.
#
#
# 12/21/19, Venkat : Modified the code and included the below changes
#  1. Included the logic to combine all data files into one data frame
#  2. Included cleanup logic to clean missing data rows if any such rows exists
#
# 1/4/20, Alexis: Extend from "hello-world" to a useful predictor harness.
#  1. Implement a window to constrain the lookback period to n gamedays.
#
#
# 1/5/20, Venkat: Building the new prediction algorithm.
#                 Calculate Avg Net runs only for head-head teams, and see if it improves the forecst accuracy 




In [2]:
# Modules
import os
import csv
import pprint
import pandas as pd
import glob
import pprint

files = glob.glob("../datasets/Final_Data_Files/GL*.csv")
files.sort()

def reader(f):
    df = pd.read_csv(f, index_col=False, header=None)  
    df.columns = [("Col_"+str(i)) for i in range(1,df.shape[1]+1)]       

    return df

#season_df = pd.concat([reader(f) for f in files], keys=files)
season_df = pd.concat([reader(f) for f in files])



In [3]:
season_df = season_df.rename(columns={'Col_1':'Date',
                                     'Col_4':'Visiting Team',
                                     'Col_5':'Visiting League',
                                     'Col_7':'Home Team',
                                     'Col_8':'Home League',
                                     'Col_10':'Visiting Score',
                                     'Col_11':'Home Score',
                                     'Col_102':'V Start Pitcher ID',
                                     'Col_104':'H Start Pitcher ID'})

season_df = season_df[['Date', 'Visiting Team', 'Visiting League', 'Home Team', 'Home League',
                      'Visiting Score','Home Score','V Start Pitcher ID','H Start Pitcher ID']]
season_df

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID
0,20100404,NYA,AL,BOS,AL,7,9,sabac001,beckj002
1,20100405,MIN,AL,ANA,AL,3,6,bakes002,weavj003
2,20100405,CLE,AL,CHA,AL,0,6,westj001,buehm001
3,20100405,DET,AL,KCA,AL,8,4,verlj001,greiz001
4,20100405,SEA,AL,OAK,AL,5,3,hernf002,sheeb001
...,...,...,...,...,...,...,...,...,...
2425,20171001,ARI,NL,KCA,AL,14,2,ray-r002,vargj001
2426,20171001,DET,AL,MIN,AL,1,5,sanca004,colob001
2427,20171001,TOR,AL,NYA,AL,2,1,andeb004,montj004
2428,20171001,BAL,AL,TBA,AL,0,6,gausk001,snelb001


In [4]:
# Create a dataframe of unique game days with the game ID for the first game

# season_df.head()
gamedays = pd.Series(season_df['Date'].unique())
print(gamedays.head())
print(gamedays.tail())
print(type(gamedays.iloc[0]))

def gamedays_offset(base_date, n): # returns new game date offset by n
    base_date_index = gamedays[gamedays==base_date].index[0]
    if ((n + base_date_index) < 0) or n + base_date_index >= len(gamedays):
        raise ValueError(f"Attempting to caluclate a game date outside the range of the dataset.")
        return(0)  # Out of range
    else:
        new_index = base_date_index + n
#         print(f"new_index={new_index}")
        return gamedays.iloc[new_index]

# This is a manual test of the gamedays_offset() function.
print(gamedays_offset(20120406, -100))
    

0    20100404
1    20100405
2    20100406
3    20100407
4    20100408
dtype: int64
1433    20170927
1434    20170928
1435    20170929
1436    20170930
1437    20171001
dtype: int64
<class 'numpy.int64'>
20110622


In [5]:
# Create a column 
season_df['Home Winner'] = season_df['Home Score'] > season_df['Visiting Score']


In [6]:
# Ask the user how big of a window of prior game days should we evaluate.
good_value = False
while not good_value:
    lookback_n = int(input(f"How many gamedays back do you want the predictor to look? "))
    good_value = lookback_n > 0 and lookback_n < len(gamedays)

How many gamedays back do you want the predictor to look? 50


In [7]:
dataset_start_date = int(season_df['Date'].min())
start_date = gamedays_offset(dataset_start_date, lookback_n)
end_date = season_df['Date'].max()
print(f"dataset_start_date: {dataset_start_date}")
print(f"start_date: {start_date}")
print(f"end_date: {end_date}")

good_date = False
while not good_date:
    game_day = int(input(f"What is the game day (YYYYMMDD) you want us to predict (between {start_date} and {end_date}: "))
#    game_day = 20100715
    if game_day >= start_date and game_day <= end_date:
        game_day_df = season_df.loc[(season_df['Date'] == (game_day))]
        if len(game_day_df)==0:
            print("Sorry, no games are scheduled for that day")
        else:
            lookback_start_day = gamedays_offset(game_day, -lookback_n)
            lookback_end_day = gamedays_offset(game_day, -1)
            print(f"Lookback window starts {lookback_start_day} - {lookback_end_day}")
            train_df = season_df.loc[((season_df['Date'] < (game_day)) & (season_df['Date'] >= (lookback_start_day)))]
            gds = str(game_day)
            print(f"Nice! There are {len(game_day_df)} games on {gds[0:4]}-{gds[4:6]}-{gds[6:]}.")
            print(f"  We also have {len(train_df)} games in our training set, which should be plenty!")
            good_date = True
    else:
        print("Sorry, the date you selected is outside the range of our dataset.")

train_df.head()

dataset_start_date: 20100404
start_date: 20100524
end_date: 20171001
What is the game day (YYYYMMDD) you want us to predict (between 20100524 and 20171001: 20170403
Lookback window starts 20160815 - 20170402
Nice! There are 11 games on 2017-04-03.
  We also have 676 games in our training set, which should be plenty!


Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner
1755,20160815,SEA,AL,ANA,AL,3,2,hernf002,nolar001,False
1756,20160815,BOS,AL,CLE,AL,3,2,pomed001,tomlj001,False
1757,20160815,KCA,AL,DET,AL,3,1,kenni001,norrd002,False
1758,20160815,TOR,AL,NYA,AL,0,1,dickr001,greec003,True
1759,20160815,SDN,NL,TBA,AL,2,8,perdl002,smyld001,True


In [8]:
# We now have the set of games we want to predict, and the training set defined.
# game_day_df2 = game_day_df.copy(deep=True)
game_day_df2 = game_day_df.copy(deep=True)
game_day_df2.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner
3,20170403,PHI,NL,CIN,NL,4,3,hellj001,felds001,False
4,20170403,SDN,NL,LAN,NL,3,14,chacj001,kersc001,True
5,20170403,COL,NL,MIL,NL,7,5,grayj003,guerj003,False
6,20170403,ATL,NL,NYN,NL,0,6,tehej001,syndn001,True
7,20170403,MIA,NL,WAS,NL,2,4,volqe001,stras001,True


In [9]:
v_pitchers = game_day_df2['V Start Pitcher ID'].to_list()
v_pitcher_net_score = []
for pitcher in v_pitchers:
    v_pitcher_net_score.append(train_df.loc[(train_df['V Start Pitcher ID']==pitcher),:]['Home Score'].mean())
game_day_df2['V Avg Pitcher Runs given'] = v_pitcher_net_score

h_pitchers = game_day_df2['H Start Pitcher ID'].to_list()
h_pitcher_net_score = []
for pitcher in h_pitchers:
    h_pitcher_net_score.append(train_df.loc[(train_df['H Start Pitcher ID']==pitcher),:]['Visiting Score'].mean())
game_day_df2['H Avg Pitcher Runs given'] = h_pitcher_net_score

game_day_df2['Predict Home Pitcher Wins?'] = game_day_df2['H Avg Pitcher Runs given'] < game_day_df2['V Avg Pitcher Runs given']

game_day_df2


Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner,V Avg Pitcher Runs given,H Avg Pitcher Runs given,Predict Home Pitcher Wins?
3,20170403,PHI,NL,CIN,NL,4,3,hellj001,felds001,False,8.25,,False
4,20170403,SDN,NL,LAN,NL,3,14,chacj001,kersc001,True,3.5,1.0,True
5,20170403,COL,NL,MIL,NL,7,5,grayj003,guerj003,False,6.0,,False
6,20170403,ATL,NL,NYN,NL,0,6,tehej001,syndn001,True,4.666667,4.0,True
7,20170403,MIA,NL,WAS,NL,2,4,volqe001,stras001,True,4.5,4.0,True
8,20170403,TOR,AL,BAL,AL,2,3,estrm001,gausk001,True,4.2,2.75,True
9,20170403,PIT,NL,BOS,AL,3,5,coleg001,porcr001,True,6.0,2.666667,True
10,20170403,SEA,AL,HOU,AL,0,3,hernf002,keucd001,True,5.6,5.0,True
11,20170403,KCA,AL,MIN,AL,1,7,duffd001,sante001,True,5.75,6.333333,False
12,20170403,ANA,AL,OAK,AL,2,4,nolar001,gravk001,True,2.333333,4.5,False


In [10]:
game_day_df2.fillna(999)

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner,V Avg Pitcher Runs given,H Avg Pitcher Runs given,Predict Home Pitcher Wins?
3,20170403,PHI,NL,CIN,NL,4,3,hellj001,felds001,False,8.25,999.0,False
4,20170403,SDN,NL,LAN,NL,3,14,chacj001,kersc001,True,3.5,1.0,True
5,20170403,COL,NL,MIL,NL,7,5,grayj003,guerj003,False,6.0,999.0,False
6,20170403,ATL,NL,NYN,NL,0,6,tehej001,syndn001,True,4.666667,4.0,True
7,20170403,MIA,NL,WAS,NL,2,4,volqe001,stras001,True,4.5,4.0,True
8,20170403,TOR,AL,BAL,AL,2,3,estrm001,gausk001,True,4.2,2.75,True
9,20170403,PIT,NL,BOS,AL,3,5,coleg001,porcr001,True,6.0,2.666667,True
10,20170403,SEA,AL,HOU,AL,0,3,hernf002,keucd001,True,5.6,5.0,True
11,20170403,KCA,AL,MIN,AL,1,7,duffd001,sante001,True,5.75,6.333333,False
12,20170403,ANA,AL,OAK,AL,2,4,nolar001,gravk001,True,2.333333,4.5,False


In [11]:
# Evaluate the predictions
game_day_df2['Prediction Correct?'] = game_day_df2['Predict Home Pitcher Wins?'] == game_day_df2['Home Winner']
game_day_df2_correct = game_day_df2.loc[(game_day_df2['Prediction Correct?']),:]
print(f"{game_day}: {len(game_day_df2)} games with {len(game_day_df2_correct)} predicted correctly.")
print(f"{round((len(game_day_df2_correct)/len(game_day_df2)*100.),1)}%")

20170403: 11 games with 9 predicted correctly.
81.8%
