In [80]:
# hello-world-predictor.ipynb
# Objective: Read the dataset, do a simple prediction for games on a given date using data prior to that date, then
#            and record some stats on how well it did.
# 
#
# 12/21/19, Alexis: Currently this "helloworld" predictor is very simple. It does the following:
#  1. Reads in the 2010 season as the baseline games dataset. It does some simple selection
#     of columns and generates a few calculated fields.
#  2. It prompts the user to specify 'game day' which are the games it is to predict.
#  3. It calculates the avg net number of runs for the home team (when playing at home) and
#     the avg net number of runs for the visiting team (when playing away).
#  4. For a given game, the predicted winner is the team with the greatest avg net runs at
#     home or away (as applicable).
#  5. The prediction is compared with the actual results that day, with % correct reported.
#
#  Proposed next steps:
#  1. Integrate Venkat's "concat" capability to combine datasets across year blocks to result
#     in a much larger dataset. Note that for development we may want to keep the dataset
#     smaller size so that it doesn't take a long time to run.
#  2. Move this out of jupyter notebook into standard Python and embed the prediction logic
#     into a function that can be called repeatedly with different dates.
#  3. Run trials with large numbers of dates to product large numbers of predictions and
#     results.
#  4. Write results to a file. Possibly generate some plot of results as a function of
#     training set window size.
#  5. Start experimenting with diffent prediction functions, initially across individual
#     factors, and then with multiple factors.
#  6. Consider a statistically meaningful regresssion analysis to select factors and training
#     set window size, by factor.
#  7. If someone has energy, consider using a web API to hit a website with current day
#     game schedule so we can predict games more recent than the dataset.
#
#
#12/21/19, Venkat : Modified the code and included the below changes
#  1. Included the logic to combine all data files into one data frame
#  2. Included cleanup logic to clean missing data rows if any such rows exists
#
#


In [1]:
# Modules
import os
import csv
import pprint
import pandas as pd
import glob

files = glob.glob("../datasets/Final_Data_Files/GL*.csv")

def reader(f):
    df = pd.read_csv(f, index_col=False, header=None)  
    df.columns = [("Col_"+str(i)) for i in range(1,df.shape[1]+1)]       

    return df

#season_df = pd.concat([reader(f) for f in files], keys=files)
season_df = pd.concat([reader(f) for f in files])

season_df


Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,...,Col_152,Col_153,Col_154,Col_155,Col_156,Col_157,Col_158,Col_159,Col_160,Col_161
0,20100404,0,Sun,NYA,AL,1,BOS,AL,1,7,...,J.D. Drew,9,camem001,Mike Cameron,8,scutm001,Marco Scutaro,6,,Y
1,20100405,0,Mon,MIN,AL,1,ANA,AL,1,3,...,Howie Kendrick,4,woodb003,Brandon Wood,5,mathj001,Jeff Mathis,2,,Y
2,20100405,0,Mon,CLE,AL,1,CHA,AL,1,0,...,A.J. Pierzynski,2,teahm001,Mark Teahen,5,ramia003,Alexei Ramirez,6,,Y
3,20100405,0,Mon,DET,AL,1,KCA,AL,1,8,...,Yuniesky Betancourt,6,kendj001,Jason Kendall,2,getzc001,Chris Getz,4,,Y
4,20100405,0,Mon,SEA,AL,1,OAK,AL,1,5,...,Mark Ellis,4,buckt001,Travis Buck,7,pennc001,Cliff Pennington,6,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2425,20171001,0,Sun,ARI,NL,162,KCA,AL,162,14,...,Alex Gordon,7,buted001,Drew Butera,2,cainl001,Lorenzo Cain,8,,Y
2426,20171001,0,Sun,DET,AL,162,MIN,AL,162,1,...,Byron Buxton,8,castj006,Jason Castro,2,grosr001,Robbie Grossman,9,,Y
2427,20171001,0,Sun,TOR,AL,162,NYA,AL,162,2,...,Austin Romine,2,frazc001,Clint Frazier,7,austt001,Tyler Austin,9,,Y
2428,20171001,0,Sun,BAL,AL,162,TBA,AL,162,0,...,Curt Casali,2,smitm007,Mallex Smith,8,robed004,Daniel Robertson,6,,Y


In [94]:
season_df.shape # Useful to make sure we don't loose rows when adding column headers.

(19437, 161)

In [95]:
season_df = season_df.rename(columns={'Col_1':'Date',
                                     'Col_4':'Visiting Team',
                                     'Col_5':'Visiting League',
                                     'Col_7':'Home Team',
                                     'Col_8':'Home League',
                                     'Col_10':'Visiting Score',
                                     'Col_11':'Home Score'})
season_df = season_df[['Date', 'Visiting Team', 'Visiting League', 'Home Team', 'Home League',
                       'Visiting Score','Home Score']]
season_df

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score
0,20100404,NYA,AL,BOS,AL,7,9
1,20100405,MIN,AL,ANA,AL,3,6
2,20100405,CLE,AL,CHA,AL,0,6
3,20100405,DET,AL,KCA,AL,8,4
4,20100405,SEA,AL,OAK,AL,5,3
...,...,...,...,...,...,...,...
2425,20171001,ARI,NL,KCA,AL,14,2
2426,20171001,DET,AL,MIN,AL,1,5
2427,20171001,TOR,AL,NYA,AL,2,1
2428,20171001,BAL,AL,TBA,AL,0,6


In [96]:
# Identify incomplete rows
season_df.count()

Date               19437
Visiting Team      19437
Visiting League    19437
Home Team          19437
Home League        19437
Visiting Score     19437
Home Score         19437
dtype: int64

In [97]:
# Drop all rows with missing information
season_df = season_df.dropna(how='any')
season_df.count()

Date               19437
Visiting Team      19437
Visiting League    19437
Home Team          19437
Home League        19437
Visiting Score     19437
Home Score         19437
dtype: int64

In [98]:
# Create a column 
season_df['Home Winner'] = season_df['Home Score'] > season_df['Visiting Score']
season_df['V NetRuns'] = season_df['Visiting Score'] - season_df['Home Score']
season_df['H NetRuns'] = - season_df['V NetRuns']
season_df['V AvgNetRuns'] = ""
season_df['H AvgNetRuns'] = ""
season_df['Predict Home Wins?'] = ""
season_df['Prediction Correct?'] = ""
season_df.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
0,20100404,NYA,AL,BOS,AL,7,9,True,-2,2,,,,
1,20100405,MIN,AL,ANA,AL,3,6,True,-3,3,,,,
2,20100405,CLE,AL,CHA,AL,0,6,True,-6,6,,,,
3,20100405,DET,AL,KCA,AL,8,4,False,4,-4,,,,
4,20100405,SEA,AL,OAK,AL,5,3,False,2,-2,,,,


In [89]:
start_date = int(season_df['Date'].min())
end_date = season_df['Date'].max()

good_date = False
while not good_date:
    game_day = int(input(f"What is the game day (YYYYMMDD) you want us to predict (between {start_date} and {end_date}: "))
#    game_day = 20100715
    if game_day > start_date and game_day <= end_date:
        game_day_df = season_df.loc[(season_df['Date'] == (game_day))]
        if len(game_day_df)==0:
            print("Sorry, no games are scheduled for that day")
        else:
            train_df = season_df.loc[(season_df['Date'] < (game_day))]
            gds = str(game_day)
            print(f"Nice! There are {len(game_day_df)} games on {gds[0:4]}-{gds[4:6]}-{gds[6:]}.")
            print(f"  We also have {len(train_df)} games in our training set, which should be plenty!")
            good_date = True
    else:
        print("Sorry, the date you selected is outside the range of our dataset.")


What is the game day (YYYYMMDD) you want us to predict (between 20100404 and 20171001: 20170403
Nice! There are 11 games on 2017-04-03.
  We also have 17010 games in our training set, which should be plenty!


In [90]:
# We now have the set of games we want to predict, and the training set defined.
game_day_df2 = game_day_df.copy(deep=True)
#game_day_df2 = game_day_df2.reset_index()
game_day_df2.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
3,20170403,PHI,NL,CIN,NL,4,3,False,1,-1,,,,
4,20170403,SDN,NL,LAN,NL,3,14,True,-11,11,,,,
5,20170403,COL,NL,MIL,NL,7,5,False,2,-2,,,,
6,20170403,ATL,NL,NYN,NL,0,6,True,-6,6,,,,
7,20170403,MIA,NL,WAS,NL,2,4,True,-2,2,,,,


In [91]:
v_teams = game_day_df2['Visiting Team'].to_list()
v_team_net_score = []
for team in v_teams:
    v_team_net_score.append(train_df.loc[(train_df['Visiting Team']==team),:]['V NetRuns'].mean())
game_day_df2['V AvgNetRuns'] = v_team_net_score

h_teams = game_day_df2['Home Team'].to_list()
h_team_net_score = []
for team in h_teams:
    h_team_net_score.append(train_df.loc[(train_df['Home Team']==team),:]['H NetRuns'].mean())
game_day_df2['H AvgNetRuns'] = h_team_net_score

# prediction = [] # True means we predict home wins, False is visitor
# for game in game_day_df2:
#     print(type(game))
#     #prediction.append(game['H AvgNetRuns'] > game['V AvgNetRuns'])
# game_day_df2['Predict Home Wins?'] = prediction

game_day_df2['Predict Home Wins?'] = game_day_df2['H AvgNetRuns'] > game_day_df2['V AvgNetRuns']

game_day_df2.head()

Unnamed: 0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,Home Winner,V NetRuns,H NetRuns,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
3,20170403,PHI,NL,CIN,NL,4,3,False,1,-1,-0.416226,0.148148,True,
4,20170403,SDN,NL,LAN,NL,3,14,True,-11,11,-0.45679,0.342152,True,
5,20170403,COL,NL,MIL,NL,7,5,False,2,-2,-0.938272,-0.001764,True,
6,20170403,ATL,NL,NYN,NL,0,6,True,-6,6,-0.19258,-0.097002,True,
7,20170403,MIA,NL,WAS,NL,2,4,True,-2,2,-0.720988,0.501767,True,


In [77]:
# Evaluate the predictions
game_day_df2['Prediction Correct?'] = game_day_df2['Predict Home Wins?'] == game_day_df2['Home Winner']
game_day_df2_correct = game_day_df2.loc[(game_day_df2['Prediction Correct?']),:]
print(f"{game_day}: {len(game_day_df2)} games with {len(game_day_df2_correct)} predicted correctly.")
print(f"{round((len(game_day_df2_correct)/len(game_day_df2)*100.),1)}%")
game_day_df2[['Date', 'Visiting Team', 'Home Team', 'Visiting Score', 'Home Score', 'Home Winner',
              'V AvgNetRuns', 'H AvgNetRuns', 'Predict Home Wins?', 'Prediction Correct?']]

20100715: 7 games with 4 predicted correctly.
57.1%


Unnamed: 0,Date,Visiting Team,Home Team,Visiting Score,Home Score,Home Winner,V AvgNetRuns,H AvgNetRuns,Predict Home Wins?,Prediction Correct?
1324,20100715,SEA,ANA,3,8,True,-1.44186,-0.113636,True,True
1325,20100715,TEX,BOS,7,2,False,-0.026316,0.869565,True,False
1326,20100715,CHA,MIN,8,7,False,0.097561,1.395349,True,False
1327,20100715,MIL,ATL,1,2,True,0.372093,1.425,True,True
1328,20100715,PHI,CHN,6,12,True,0.2,-0.651163,False,False
1329,20100715,NYN,SFN,0,2,True,-0.5,1.047619,True,True
1330,20100715,LAN,SLN,1,7,True,0.595238,1.357143,True,True
