In [11]:
from pybaseball import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import *

from sklearn.metrics import (
    auc,
    brier_score_loss,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_curve,
)

In [12]:
# Team codes to download, grouped by division.
# NOTE(review): both 'MIA' and 'FLA' are listed, so presumably some (team, season)
# combinations are expected to fail -- confirm against the data source.
teams = ['MIN','CLE','KCR','CHW','DET', #AL Central
         'TOR','BAL','BOS','TBR','NYY', #AL East
         'OAK','HOU','SEA','LAA','TEX', #AL West
         'ATL','PHI','NYM','WSN','MIA','FLA', #NL East
         'PIT','CIN','CHC','MIL','STL', #NL Central
         'ARI','SDP','SFG','COL','LAD', #NL West
        ]

# Seasons 2010 through 2020 inclusive.
years = np.arange(2010,2021,1)

# Per-(team, season) frames are collected in lists and concatenated once after
# the loop; growing a DataFrame with pd.concat on every iteration re-copies all
# previously loaded rows each time.
schedule_frames = []
batting_frames = []
pitching_frames = []

# (team, season, error) for every combination whose download raised, so that
# skipped data is visible instead of being silently dropped.
import_failures = []

for team in teams:
    print('Importing', team)
    for year in years:
        try:
            data = schedule_and_record(year, team)
            data['season'] = year
            schedule_frames.append(data)

            # game logs for each team's hitters
            batting_logs = team_game_logs(year, team)
            batting_logs['season'] = year
            batting_logs['Tm'] = team
            batting_frames.append(batting_logs)

            # game logs for each team's pitching
            # should eventually be replaced by starter level data
            pitching_logs = team_game_logs(year, team, 'pitching')
            pitching_logs['season'] = year
            pitching_logs['Tm'] = team
            pitching_frames.append(pitching_logs)
        except Exception as err:
            # Best-effort import: skip this team-season but remember why.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
            # Frames fetched before the failure stay in their lists, matching the
            # previous behaviour of keeping partial results for a team-season.
            import_failures.append((team, int(year), repr(err)))
            continue

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing was downloaded.
full_games = pd.concat(schedule_frames) if schedule_frames else pd.DataFrame()
batting_df = pd.concat(batting_frames) if batting_frames else pd.DataFrame()
pitching_df = pd.concat(pitching_frames) if pitching_frames else pd.DataFrame()

print('\nAll Teams Imported')
if import_failures:
    print(len(import_failures), 'team-season imports were skipped; see import_failures')

Importing MIN
Importing CLE
Importing KCR
Importing CHW
Importing DET
Importing TOR
Importing BAL
Importing BOS
Importing TBR
Importing NYY
Importing OAK
Importing HOU
Importing SEA
Importing LAA
Importing TEX
Importing ATL
Importing PHI
Importing NYM
Importing WSN
Importing MIA
Importing FLA
Importing PIT
Importing CIN
Importing CHC
Importing MIL
Importing STL
Importing ARI
Importing SDP
Importing SFG
Importing COL
Importing LAD

All Teams Imported


In [13]:
# Combine the historic data: start from the schedule rows in full_games and
# left-join the batting logs, then the pitching logs, on the shared keys.
merge_keys = ['Date','Tm','season']

schedule_with_batting = full_games.merge(batting_df, how='left', on=merge_keys)
comb_df = schedule_with_batting.merge(pitching_df, how='left', on=merge_keys)

In [14]:
# Display the column labels of the batting game-log frame.
batting_df.columns

Index(['Game', 'Date', 'Home', 'Opp', 'Rslt', 'PA', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS', 'LOB', 'NumPlayers', 'Thr', 'OppStart',
       'season', 'Tm'],
      dtype='object')

In [15]:
# NOTE(review): this import belongs in the notebook's top import cell; it is
# kept here so the cell still runs on its own.
import os

# NOTE(review): hardcoded, machine-specific absolute path -- consider a
# configurable data directory. The working directory is still switched here
# because later cells may rely on it for relative paths.
os.chdir('/Users/andrew.green/Downloads/')

# One odds CSV per season, named mlb_odds_<season>.csv, except for the seasons
# listed in the override table.
ODDS_FILE_OVERRIDES = {2019: 'mlb_odds_2019_2.csv'}
odds_frames = [
    pd.read_csv(ODDS_FILE_OVERRIDES.get(season, 'mlb_odds_{}.csv'.format(season)))
    for season in range(2010, 2021)
]

# Stack the seasons in chronological order; each file's own row labels are kept.
game_odds = pd.concat(odds_frames, axis=0)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the concatenated frame (avoids SettingWithCopyWarning).
game_odds = game_odds[['Date','Team','Close']].copy()

# converting the odds to win probabilities:
#   negative close -> |close| / (|close| + 100)
#   otherwise      -> 100 / (close + 100)
close = game_odds['Close']
negative_close_prob = close.abs() / (close.abs() + 100)
non_negative_close_prob = 100 / (close + 100)
game_odds['implied_odds'] = negative_close_prob.where(close < 0, non_negative_close_prob)

#cleaning csv data to have the same team codes as the team data
#there is 1 game that gets lost in this process
team_code_fixes = {
    'SFO': 'SFG',
    'TAM': 'TBR',
    'WAS': 'WSN',
    'KAN': 'KCR',
    'CUB': 'CHC',
    'SDG': 'SDP',
    'CWS': 'CHW',
    'LOS': 'LAD',
}
game_odds['Team'] = game_odds['Team'].replace(team_code_fixes)
game_odds

Unnamed: 0,Date,Team,Close,implied_odds
0,4/4/2010,NYY,-103.0,0.507389
1,4/4/2010,BOS,-117.0,0.539171
2,4/5/2010,PHI,-200.0,0.666667
3,4/5/2010,WSN,170.0,0.370370
4,4/5/2010,MIA,-112.0,0.528302
...,...,...,...,...
1895,10/24/2020,TBR,148.0,0.403226
1896,10/25/2020,LAD,-160.0,0.615385
1897,10/25/2020,TBR,150.0,0.400000
1898,10/27/2020,TBR,111.0,0.473934
