In [1]:
# baseball-logisitic-regression.ipynb
# Alexis Perumal, Venkat Pinnika, Young You, 1/11/2020
#
# Objective:
# . Build a predictor for the winner of baseball games (visitor or home)
#   with > 50% prediction accuracy, with statistical accuracy.
# . Analysis will be done by season, looking at all the regular season
#   games of a given year with each days' game predicted based on info
#   up to, but not including that day.
#
# Strategy:
# . Build a dataframe of games (rows) and factors to build a regression
#   model for visiting net points, and by extension, predict the winner
#   of each game (visitor or home).
# . Explore candidate factors with regression analysis (T-stat, F-stat,
#   p-value), build and optimize a simple LSR model using a training set
#   through the 2017 Season.
# . Plot the results.
# . Ultimately, apply the model to the 2018, then 2019 seasons.
#
# 1/9/20 - Updating to add pitcher-based prediction
# 1/10/20 - Coding different values as columns and then running a linear regression on them.
# 1/11/20 - Cleaned up a defect where it was using the wrong model outputs to calc.
#         - replaced manual prediction with call to .predict()

In [2]:
# Modules
import os
import csv
import pprint
import pandas as pd
import glob
import pprint
import datetime
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
%matplotlib inline
!pwd

/Users/aperumal/OneDrive/Personal/UCSD_Data_Science_Bootcamp/Homework/2019-12-18_Proj1_Baseball-Predictor/baseball-predictor/alexis-regression


In [3]:
# Support functions
def date_str(date: int):
    s = str(date)
    return f"{s[0:4]}-{s[4:6]}-{s[6:]}"


# Passed the gamedays series (YYYYMMDD) and season (year) return with the first
# and last game dates.
def date_range(gamedays, season):
    # Converts 'YYYY' string to an integer start date, YYYY0101 and end date YYYY1231
    def season_to_date(season): 
#         return (int(season)*10000 + 101, int(season)*10000+1231)
        # Hack to shorten the season for dev purposes (faster analysis)
        return (int(season)*10000 + 101, int(season)*10000+430)
    
    first_of_year, last_of_year = season_to_date(season)
    season_gamedays = gamedays.loc[((gamedays >= first_of_year) &
                                    (gamedays <= last_of_year))]
    return (season_gamedays.iloc[0], season_gamedays.iloc[-1])
    

# returns new game date offset by n. Passed in the gamedays series.
def gamedays_offset(gamedays, base_date, n): 
    if base_date not in gamedays.values:
        raise ValueError(f"{base_date}, not in the the gamedays series.")
    base_date_index = gamedays[gamedays==base_date].index[0]
    if ((n + base_date_index) < 0) or n + base_date_index >= len(gamedays):
        raise ValueError(f"Attempting to calculate a game date outside the dataset.")
        return(0)  # Out of range
    else:
        new_index = base_date_index + n
        return gamedays.iloc[new_index]
    
def derive_metrics(results_df):
    num_games = len(results_df)
    num_correct = results_df['Prediction Correct?'].values.sum()
    percent_correct = num_correct/num_games*100.
    return (num_games, num_correct, percent_correct)

In [4]:
def read_source_data():
    def reader(f):
        df = pd.read_csv(f, index_col=False, header=None)  
        df.columns = [("Col_"+str(i)) for i in range(1,df.shape[1]+1)]       
        return df

    files = glob.glob("../datasets/Final_Data_Files/GL*.csv")
    files.sort()
    df = pd.concat([reader(f) for f in files])
    old_df_len = len(df)
    
    # Insert column headers
    df = df.rename(columns={'Col_1':'Date',
                            'Col_4':'Visiting Team',
                            'Col_5':'Visiting League',
                            'Col_7':'Home Team',
                            'Col_8':'Home League',
                            'Col_10':'Visiting Score',
                            'Col_11':'Home Score',
                            'Col_102':'V Start Pitcher ID',
                            'Col_104':'H Start Pitcher ID'
                           }).reset_index()
    df = df[['Date', 'Visiting Team', 'Visiting League', 'Home Team', 'Home League',
            'Visiting Score','Home Score', 'V Start Pitcher ID','H Start Pitcher ID']]
    
    df = df.replace('FLO','MIA') # After the 2011 season, the Florida Marlins
                                 # rebranded themselves the Miami Marlins. This
                                 # search and replace makes the two the same.
    
    # Drop all rows with missing information
    # print(df.head())
    df = df.dropna(how='any')
    if len(df) < old_df_len:
        print(f"Dropped {old_df_len-len(df)} rows due to missing data.")
    
    # Create new columns we'll need. 
#     df['Key'] = df['Date'].astype(str) + df['Visiting Team']
    df['Home Winner'] = df['Home Score'] > df['Visiting Score']
    df['V NetRuns'] = df['Visiting Score'] - df['Home Score']
    df['H NetRuns'] = - df['V NetRuns']
    
    #     print(df.shape)
    print(f"Dataset loaded with {df.shape[0]} games, ", end='')
    print(f"{df.shape[1]} columns, {date_str(df.iloc[0, 0])} - ", end='')
    print(f"{date_str(df.iloc[-1,0])}")
    return df

# read_source_data()

In [5]:
# Add columns to the dataset that will be used for regression analysis.

# Get the source dataset.
g = read_source_data()
g.index.set_names('games_idx', inplace=True)  # Name the row index column so we can sort by it

# Calculate baseline stats on home wins.
num_games = len(g)
home_wins = g['Home Winner'].sum()
home_wins_percent = home_wins / num_games * 100.
print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 2)}%')

# Calculate rolling average of runs allowed by starting pitcher excluding the current row (shift=prior_exclude+1)
lookback_n = 10
g = g.sort_values(by = ['Visiting Team','games_idx'])
g['V Runs Avg'] = g.groupby('Visiting Team')['Visiting Score'].transform(lambda x: x.shift(periods=1).rolling(window=lookback_n).mean())
g = g.sort_values(by = ['Home Team','games_idx'])
g['H Runs Avg'] = g.groupby('Home Team')['Home Score'].transform(lambda x: x.shift(periods=1).rolling(window=lookback_n).mean())
g = g.sort_index()

# Calculate rolling average of runs allowed by starting pitcher excluding the current row (shift=prior_exclude+1)
lookback_n = 5
g = g.sort_values(by = ['V Start Pitcher ID', 'games_idx'])
g['V Runs Allowed'] = g.groupby('V Start Pitcher ID')['Home Score'].transform(lambda x: x.shift(periods=1).rolling(window=lookback_n).mean())
g = g.sort_values(by = ['H Start Pitcher ID', 'games_idx'])
g['H Runs Allowed'] = g.groupby('H Start Pitcher ID')['Visiting Score'].transform(lambda x: x.shift(periods=1).rolling(window=lookback_n).mean())
g = g.sort_index()

g

Dataset loaded with 19437 games, 12 columns, 2010-04-04 - 2017-10-01
Baseline results: 19437 games, 10438 home wins = 53.7%


Unnamed: 0_level_0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner,V NetRuns,H NetRuns,V Runs Avg,H Runs Avg,V Runs Allowed,H Runs Allowed
games_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,20100404,NYA,AL,BOS,AL,7,9,sabac001,beckj002,True,-2,2,,,,
1,20100405,MIN,AL,ANA,AL,3,6,bakes002,weavj003,True,-3,3,,,,
2,20100405,CLE,AL,CHA,AL,0,6,westj001,buehm001,True,-6,6,,,,
3,20100405,DET,AL,KCA,AL,8,4,verlj001,greiz001,False,4,-4,,,,
4,20100405,SEA,AL,OAK,AL,5,3,hernf002,sheeb001,False,2,-2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19432,20171001,ARI,NL,KCA,AL,14,2,ray-r002,vargj001,False,12,-12,3.4,4.2,2.4,4.8
19433,20171001,DET,AL,MIN,AL,1,5,sanca004,colob001,True,-4,4,2.4,6.9,7.4,5.0
19434,20171001,TOR,AL,NYA,AL,2,1,andeb004,montj004,False,1,-1,4.4,5.7,3.4,4.2
19435,20171001,BAL,AL,TBA,AL,0,6,gausk001,snelb001,True,-6,6,2.7,3.5,3.8,2.6


# Logistic Regression Analysis

In [6]:
len(g)

19437

In [7]:
# Perform regression analysis. First drop blanks.
num_before_drop = len(g)
g = g.dropna(how='any')
num_after_drop = len(g)
print(f"Dropped {num_before_drop-num_after_drop} rows due to missing data.")

Dropped 5060 rows due to missing data.


In [12]:
# Build the logistic model
y = g["Home Winner"]
X = g[['V Runs Avg', 'H Runs Avg', 'H Runs Allowed', 'V Runs Allowed']]
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.687788
         Iterations 4


0,1,2,3
Dep. Variable:,Home Winner,No. Observations:,14377.0
Model:,Logit,Df Residuals:,14373.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 11 Jan 2020",Pseudo R-squ.:,0.002638
Time:,08:21:23,Log-Likelihood:,-9888.3
converged:,True,LL-Null:,-9914.5
Covariance Type:,nonrobust,LLR p-value:,2.562e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
V Runs Avg,-0.0311,0.013,-2.367,0.018,-0.057,-0.005
H Runs Avg,0.0632,0.013,4.879,0.000,0.038,0.089
H Runs Allowed,-0.0484,0.011,-4.513,0.000,-0.069,-0.027
V Runs Allowed,0.0519,0.011,4.801,0.000,0.031,0.073


In [14]:
# Build predictions based on the new logistic model
# Generate predictions
g2 = g.copy(deep=True)
g2['Predict Home Wins Score'] = result.predict()
g2.head()

Unnamed: 0_level_0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner,V NetRuns,H NetRuns,V Runs Avg,H Runs Avg,V Runs Allowed,H Runs Allowed,Predict Home Wins Score
games_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
611,20100520,BAL,AL,TEX,AL,7,13,matub001,felds001,True,-6,6,3.3,6.0,4.8,5.0,0.570502
614,20100520,COL,NL,HOU,NL,4,0,jimeu001,oswar001,False,4,-4,3.7,2.6,1.4,2.6,0.499038
660,20100523,NYA,AL,NYN,NL,4,6,sabac001,santj003,True,-2,2,4.8,5.3,3.2,2.4,0.558649
674,20100525,DET,AL,SEA,AL,3,5,verlj001,fistd001,True,-2,2,4.2,3.7,3.4,2.4,0.540818
684,20100526,TOR,AL,ANA,AL,5,6,morrb001,pinej001,True,-1,1,4.8,4.4,6.2,4.2,0.561574


In [17]:
# Calculate baseline stats on home wins.
# Calculate baseline stats on home wins.
num_games = len(g2)
home_wins = g2['Home Winner'].sum()
home_wins_percent = home_wins / num_games * 100.
print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 2)}%')
threshold = 1 - (g2['Home Winner'].sum()/num_games)
threshold

Baseline results: 14377 games, 7793 home wins = 54.2%


0.45795367601029424

In [19]:
g2['Predict Home Wins?'] = g2['Predict Home Wins Score'] >= threshold
g2['Prediction Correct'] = g2['Predict Home Wins?'] == g2['Home Winner']

# Calculate baseline stats on home wins.
num_games_now = len(g2)
num_correct = g2['Prediction Correct'].sum()
num_correct_percent = num_correct / num_games_now * 100.
print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 4)}%')
print(f'Prediction results: {num_games_now} games, {num_correct} predicted correctly = {round(num_correct_percent, 4)}%')


Baseline results: 14377 games, 7793 home wins = 54.2046%
Prediction results: 14377 games, 7798 predicted correctly = 54.2394%


# Multiple Regression Analysis
Ignore everything below this line.

In [8]:
# Perform regression analysis.
num_before_drop = len(g)
g = g.dropna(how='any')
num_after_drop = len(g)
print(f"Dropped {num_after_drop-num_before_drop} rows due to missing data.")

# The data table is built, now let's do the regression or raw points for Visitors
v_y = g['Visiting Score']
v_x = g[['V Runs Avg', 'H Runs Allowed']]
v_x = sm.add_constant(v_x)
v_est = sm.OLS(v_y,v_x).fit()
# g['V Score Predicted'] = v_est.params[0] + v_est.params[1]*g['V Runs Avg'] + v_est.params[2]*g['H Runs Allowed']
# g['V Score Predicted'] = v_est.predict()
print(v_est.params)
v_est.summary()


Dropped 0 rows due to missing data.
const             2.967805
V Runs Avg        0.107252
H Runs Allowed    0.187474
dtype: float64


  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Visiting Score,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,70.17
Date:,"Sat, 11 Jan 2020",Prob (F-statistic):,4.73e-31
Time:,08:16:10,Log-Likelihood:,-36474.0
No. Observations:,14377,AIC:,72950.0
Df Residuals:,14374,BIC:,72980.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9678,0.126,23.477,0.000,2.720,3.216
V Runs Avg,0.1073,0.024,4.424,0.000,0.060,0.155
H Runs Allowed,0.1875,0.017,10.840,0.000,0.154,0.221

0,1,2,3
Omnibus:,2183.501,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3612.06
Skew:,1.024,Prob(JB):,0.0
Kurtosis:,4.354,Cond. No.,31.0


In [9]:
# The data table is built, now let's do the regression or raw points for HOME
h_y = g['Home Score']
h_x = g[['H Runs Avg', 'V Runs Allowed']]
h_x = sm.add_constant(h_x)
h_est = sm.OLS(h_y,h_x).fit()
# g['H Score Predicted'] = h_est.params[0] + h_est.params[1]*g['H Runs Avg'] + h_est.params[2]*g['V Runs Allowed']
# g['H Score Predicted'] = h_est.predict()
print(h_est.params)
h_est.summary()

const             2.462087
H Runs Avg        0.300328
V Runs Allowed    0.137150
dtype: float64


0,1,2,3
Dep. Variable:,Home Score,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,124.0
Date:,"Sat, 11 Jan 2020",Prob (F-statistic):,3.8599999999999994e-54
Time:,08:16:45,Log-Likelihood:,-36058.0
No. Observations:,14377,AIC:,72120.0
Df Residuals:,14374,BIC:,72140.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4621,0.125,19.750,0.000,2.218,2.706
H Runs Avg,0.3003,0.022,13.571,0.000,0.257,0.344
V Runs Allowed,0.1372,0.018,7.716,0.000,0.102,0.172

0,1,2,3
Omnibus:,2014.431,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3283.842
Skew:,0.961,Prob(JB):,0.0
Kurtosis:,4.338,Cond. No.,32.3


In [10]:
# Generate predictions
g2 = g.copy(deep=True)
g2['V Score Predicted'] = v_est.predict()
g2['H Score Predicted'] = h_est.predict()
g2.head()

Unnamed: 0_level_0,Date,Visiting Team,Visiting League,Home Team,Home League,Visiting Score,Home Score,V Start Pitcher ID,H Start Pitcher ID,Home Winner,V NetRuns,H NetRuns,V Runs Avg,H Runs Avg,V Runs Allowed,H Runs Allowed,V Score Predicted,H Score Predicted
games_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
611,20100520,BAL,AL,TEX,AL,7,13,matub001,felds001,True,-6,6,3.3,6.0,4.8,5.0,4.259103,4.922373
614,20100520,COL,NL,HOU,NL,4,0,jimeu001,oswar001,False,4,-4,3.7,2.6,1.4,2.6,3.852067,3.434949
660,20100523,NYA,AL,NYN,NL,4,6,sabac001,santj003,True,-2,2,4.8,5.3,3.2,2.4,3.932549,4.492704
674,20100525,DET,AL,SEA,AL,3,5,verlj001,fistd001,True,-2,2,4.2,3.7,3.4,2.4,3.868198,4.03961
684,20100526,TOR,AL,ANA,AL,5,6,morrb001,pinej001,True,-1,1,4.8,4.4,6.2,4.2,4.270002,4.633859


In [11]:
g2['Predict Home Wins?'] = g2['H Score Predicted'] >= g2['V Score Predicted']
g2['Prediction Correct'] = g2['Predict Home Wins?'] == g2['Home Winner']

# Calculate baseline stats on home wins.
num_games_now = len(g2)
num_correct = g2['Prediction Correct'].sum()
num_correct_percent = num_correct / num_games_now * 100.
print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 2)}%')
print(f'Prediction results: {num_games_now} games, {num_correct} predicted correctly = {round(num_correct_percent, 2)}%')


Baseline results: 19437 games, 10438 home wins = 53.7%
Baseline results: 14377 games, 7637 predicted correctly = 53.12%


In [None]:
print(f"Across the dataset: Visiting avg = {g['Visiting Score'].mean()}, Home avg = {g['Home Score'].mean()}")

# Individual Regression Analysis
Ignore everything from here down. It is older code no longer needed, but kept for our reference.

In [None]:
# Perform regression analysis.
num_before_drop = len(g)
g = g.dropna(how='any')
num_after_drop = len(g)
print(f"Dropped {num_after_drop-num_before_drop} rows due to missing data.")

# The data table is built, now let's do the regression or raw points for Visitors
x = g['V Runs Avg']
y = g['Visiting Score']
slope, y_int, r, p, se = stats.linregress(x, y)

print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")


In [None]:
# The data table is built, now let's do the regression or raw points for Home
x = g['H Runs Avg']
y = g['Home Score']
slope, y_int, r, p, se = stats.linregress(x, y)

print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")


In [None]:
# Do a regression on runs given up
x = g['V Runs Allowed']
y = g['Home Score']
slope, y_int, r, p, se = stats.linregress(x, y)

print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")


In [None]:
# Do a regression on runs given up
x = g['H Runs Allowed']
y = g['Visiting Score']
slope, y_int, r, p, se = stats.linregress(x, y)

print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")


In [None]:
# Do a regression on runs given up, doesn't make sense.
x = g['V Runs Allowed']
y = g['Visiting Score']
slope, y_int, r, p, se = stats.linregress(x, y)

print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")


In [None]:
# Do multiple regression


# Legacy code below for point predictors. Recommend don't run anything below this point.


In [None]:
# Given a games df with predictions and prediction resuls by game already captured,
# generate an outcomes dataframe with stats by season.
def append_outcomes(outcomes_df, g, ra_d): # g is the games dataframe with predictions
                                             # d is the dictionary with run attributes to log
    # 1. Capture results spanning the entire dataset (all seasons)
    num_games = len(g)
    num_correct = g['Prediction Correct?'].values.sum()
    home_win_percent = g['Home Winner'].values.sum() / num_games * 100
    home_predict_percent = g['Home Wins?'].values.sum() / num_games * 100
    percent_correct = num_correct/num_games*100.
    
    new_row = pd.Series({'Predictor':'Net or Raw Points (see Net)',
                         'Season':0, # Indicates all seasons in the dataset.
                         'Lookback':ra_d['Lookback'],
                         'Net':ra_d['Net'],
                         'Num Games':num_games,
                         'Home Win %':home_win_percent,
                         'Home Predict %':home_predict_percent,
                         'Num Correct':num_correct,
                         '% Correct':percent_correct,
                         'Comment':('All Seasons' + ra_d['Comment'])})
    outcomes_df = outcomes_df.append(new_row, ignore_index=True)
    
    # 2. Caculate individual seasons represented in the dataset
    seasons = g['Date'].transform(lambda x: int(x/10000)).unique().tolist()
#     pprint.pprint(seasons)
    
    for season in seasons:
        start_date = int(season)*10000 + 101  # YYYYMMDD, Jan 01 of the Year
        end_date = int(season)*10000 + 1231   # YYYYMMDD, DEC 31 of the Year
        season_df = g.loc[((g['Date']>= start_date) &(g['Date']<= end_date)), :]
        
        num_games = len(season_df)
        num_correct = season_df['Prediction Correct?'].values.sum()
        home_win_percent = season_df['Home Winner'].values.sum() / num_games * 100
        home_predict_percent = season_df['Home Wins?'].values.sum() / num_games * 100
        percent_correct = num_correct/num_games*100.
        
        new_row = pd.Series({'Predictor':ra_d['Predictor'],
                     'Season':season,
                     'Lookback':ra_d['Lookback'],
                     'Net':ra_d['Net'],
                     'Num Games':num_games,
                     'Home Win %':home_win_percent,
                     'Home Predict %':home_predict_percent,
                     'Num Correct':num_correct,
                     '% Correct':percent_correct,
                     'Comment':('All Seasons' + ra_d['Comment'])})
        
        outcomes_df = outcomes_df.append(new_row, ignore_index=True)
        
    return outcomes_df

In [None]:
# Predict games based on net points or raw points over a lookback window, visitor vs home.
def net_point_predictor(outcomes_df, lookback_n, prior_exclude=0, net=True):
    # Let's get started. Read the dataset.
    games_df = read_source_data()
    
    num_games = len(games_df)
    home_wins = games_df['Home Winner'].sum()
    home_wins_percent = home_wins / num_games * 100.
    print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 2)}%')
    
    games_df.index.set_names('games_idx', inplace=True)
    
    # Calculate rolling average of net points excluding the current row (shift=prior_exclude+1)
    if net:
        games_df = games_df.sort_values(by = ['Visiting Team','games_idx'])
        games_df['V NP Avg'] = games_df.groupby('Visiting Team')['V NetRuns'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())

        games_df = games_df.sort_values(by = ['Home Team','games_idx'])
        games_df['H NP Avg'] = games_df.groupby('Home Team')['H NetRuns'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
    else:
        games_df = games_df.sort_values(by = ['Visiting Team','games_idx'])
        games_df['V NP Avg'] = games_df.groupby('Visiting Team')['Visiting Score'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())

        games_df = games_df.sort_values(by = ['Home Team','games_idx'])
        games_df['H NP Avg'] = games_df.groupby('Home Team')['Home Score'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
       
    games_df = games_df.sort_index()

    print(f"  # of games before dropping lookback window: {len(games_df)}")
    games_df.to_csv("output/result_before_dropna.csv")
    games_df = games_df.dropna(how='any')
    print(f"  # of games after dropping lookback window: {len(games_df)}")

    games_df['Home Wins?'] = games_df['H NP Avg'] >= games_df['V NP Avg']
    games_df['Prediction Correct?'] = games_df['Home Wins?'] == games_df['Home Winner']
    
    # Setup run attributes so they can be included in the outcomes_df generated.
#     run_attributes = {}
#     run_attributes['Predictor'] = 'Net or Raw Points (see Net)'
#     run_attributes['Lookback'] = n
#     run_attributes['Net'] = net
#     run_attributes['Comment'] = ''
    run_attributes = {'Predictor':'Points', 'Lookback':n, 'Net':False, 'Comment':''}
    
    return append_outcomes(outcomes_df, games_df, run_attributes)


In [None]:
# Predict games based on pitcher performance over a lookback window, visitor vs home.
def pitcher_predictor(outcomes_df, lookback_n):
    # Let's get started. Read the dataset.
    g = read_source_data()
    
    num_games = len(g)
    home_wins = g['Home Winner'].sum()
    home_wins_percent = home_wins / num_games * 100.
    print(f'Baseline results: {num_games} games, {home_wins} home wins = {round(home_wins_percent, 2)}%')
    
    g.index.set_names('games_idx', inplace=True)
    
    
    # Calculate rolling average of net points excluding the current row (shift=prior_exclude+1)
    g = g.sort_values(by = ['V Start Pitcher ID', 'games_idx'])
    g['V Runs Allowed'] = g.groupby('V Start Pitcher ID')['Home Score'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
#     print(g.head())

    g = g.sort_values(by = ['H Start Pitcher ID', 'games_idx'])
    g['H Runs Allowed'] = g.groupby('H Start Pitcher ID')['Visiting Score'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
#     print(g.head())
    
    g = g.sort_index()

    # Todo: For na's due to a lack of pitcher data, we should substitute data, not remove.
    print(f"  # of games before dropping lookback window: {len(g)}")
    num_na = g['V Runs Allowed'].isna().sum()
    print(f'num_na = {num_na}')
    
    
    g.to_csv("output/result_before_dropna.csv")
    # g = g.dropna(how='any')
    g = g.fillna(999)
    print(f"  # of games after dropping lookback window: {len(g)}")
    # print("  Warning, na's dropped that should be substituted instead.")

    g['Home Wins?'] = g['H Runs Allowed'] <= g['V Runs Allowed']
    g['Prediction Correct?'] = g['Home Wins?'] == g['Home Winner']
    
    # Setup run attributes so they can be included in the outcomes_df generated.
#     run_attributes = {}
#     run_attributes['Predictor'] = 'Pitcher'
#     run_attributes['Lookback'] = n
#     run_attributes['Net'] = False
#     run_attributes['Comment'] = ''
    run_attributes = {'Predictor':'Pitcher', 'Lookback':n, 'Net':False, 'Comment':''}
    
#     base_path = str(datetime.datetime.now()).replace(':', '-').replace(' ', '_')
#     data_output_path = "debug-pitcher-output/" + base_path + "_output.csv"
#     g.to_csv(path_or_buf=data_output_path)
    
    return append_outcomes(outcomes_df, g, run_attributes)


In [None]:
# Top level (main) code, read in the dataset, add the columns we care about.
outcomes_df = pd.DataFrame(columns=['Season', 'Lookback', 'Net', 'Num Games',
                                  'Num Correct', '% Correct', 'Comment'])
# outcomes2_df = pd.DataFrame(columns=['Season', 'Lookback', 'Net', 'Num Games',
#                                   'Num Correct', '% Correct', 'Comment'])

# for n in [1, 2, 5, 10, 15, 20, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500]:
# for n in [50, 60, 70, 80, 90, 100]:
# for n in [5, 10, 25, 50, 75, 100, 150, 200]:
for n in [5, 10, 20, 40, 60, 80, 100]:
# for n in [5]:
    prior_exclude = 0

    if True:
        net=False
        outcomes_df = net_point_predictor(outcomes_df, n, prior_exclude, net=net)
        print(f"  Net Point Predictor, lookback={n}, prior exclude={prior_exclude}, net={net} done.")
        print("")
    else:
        outcomes_df = pitcher_predictor(outcomes_df, n)

outcomes_df


In [None]:
seasons = outcomes_df['Season'].unique().tolist()
seasons

In [None]:
season_gb = outcomes_df.groupby('Season')
type(season_gb)
y = season_gb.get_group(0)['% Correct']
print(y)
x = season_gb.get_group(0)['Lookback']
print(x)

In [None]:
fig = plt.figure(figsize=(12,8))

axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

axes.set_title('Prediction performance by lookback window length')
axes.set_xlabel('Lookback window size')
axes.set_ylabel('% Correct')

# Todo: Need to represent the predictor type in the plot text!

# for season in range(2012, 2018):
for season in outcomes_df['Season'].unique().tolist():
    x = season_gb.get_group(season)['Lookback']
    y = season_gb.get_group(season)['% Correct']
    axes.plot(x, y, label=str(season), marker='o')
    
axes.legend(loc='upper right')
axes.grid()
plt.show()

In [None]:
if True:
    base_path = str(datetime.datetime.now()).replace(':', '-').replace(' ', '_')
    data_output_path = "output/" + base_path + "_output.csv"
    outcomes_df.to_csv(path_or_buf=data_output_path)
    plot_output_path = "plot/" + base_path + "_plot.png"
    fig.savefig(plot_output_path)

In [None]:
# Regression Analysis
def net_points_regression(lookback_n, prior_exclude=0):
    # Let's get started. Read the dataset.
    games_df = read_source_data()
    
    # Caculate seasons represented in the dataset
    seasons = games_df['Date'].transform(lambda x: int(x/10000)).unique().tolist()
    pprint.pprint(seasons)
    

    # Calculate rolling average of net points excluding the current row (shift=prior_exclude+1)
    games_df = games_df.sort_values(by = ['Visiting Team','Date'])
    games_df['V NP Avg'] = games_df.groupby('Visiting Team')['V NetRuns'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
    games_df['V P Avg'] = games_df.groupby('Visiting Team')['Visiting Score'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())
    
    games_df = games_df.sort_values(by = ['Home Team','Date'])
    games_df['H NP Avg'] = games_df.groupby('Home Team')['H NetRuns'].transform(lambda x: x.shift(periods=prior_exclude+1).rolling(window = lookback_n).mean())

    games_df = games_df.sort_index()

    print(f"  # of games before dropping lookback window: {len(games_df)}")
    games_df.to_csv("output/result_before_dropna.csv")
    games_df = games_df.dropna(how='any')
    print(f"  # of games after dropping lookback window: {len(games_df)}")

    
    # The data table is built, now let's do the regression
    x = games_df['V NP Avg']
#     y = games_df['V NetRuns']
    y = games_df['V P Avg']
    slope, y_int, r, p, se = stats.linregress(x, y)
    
    print(f"slope: {slope}, int: {y_int}, r: {r}, r2: {r**2}, p: {p}, std error of the mean: {se}")
    
    
    
#     games_df['Pred V NP']
    
    
#     games_df['Home Wins?'] = games_df['H NP Avg'] >= games_df['V NP Avg']
#     games_df['Prediction Correct?'] = games_df['Home Wins?'] == games_df['Home Winner']

# This calls the above code
if False:
    net_points_regression(80)

In [None]:
# From Venkat, on pitchers
# v_pitchers = game_day_df2['V Start Pitcher ID'].to_list()
# v_pitcher_net_score = []
# for pitcher in v_pitchers:
#     v_pitcher_net_score.append(train_df.loc[(train_df['V Start Pitcher ID']==pitcher),:]['Home Score'].mean())
# game_day_df2['V Avg Pitcher Runs given'] = v_pitcher_net_score
# h_pitchers = game_day_df2['H Start Pitcher ID'].to_list()
# h_pitcher_net_score = []
# for pitcher in h_pitchers:
#     h_pitcher_net_score.append(train_df.loc[(train_df['H Start Pitcher ID']==pitcher),:]['Visiting Score'].mean())
# game_day_df2['H Avg Pitcher Runs given'] = h_pitcher_net_score
# game_day_df2['Predict Home Pitcher Wins?'] = game_day_df2['H Avg Pitcher Runs given'] < game_day_df2['V Avg Pitcher Runs given']
# game_day_df2