In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
sys.path.append('../code')
import features

In [34]:
import queue
import threading

# Load Data

In [14]:
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()

In [15]:
print("start hour vs. is_night_game")
print("night games seem to start from 5PM to 10PM")
games.groupby(games.start_time.str[:-3])['is_night_game'].agg(['mean','count'])

start hour vs. is_night_game
night games seem to start from 5PM to 10PM


Unnamed: 0_level_0,mean,count
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.000213,4691
10,0.997934,484
11,0.052632,19
12,0.0,1069
2,0.0,558
3,0.001776,563
4,0.001037,964
5,0.980263,304
6,0.99568,1852
7,0.998595,12100


# Features

## Create dataset

In [16]:
df = games[['game_id','home_team_abbr','away_team_abbr','date','is_night_game']]
df['home_team_win'] = games.home_team_runs.astype('int')>games.away_team_runs

In [17]:
home_pitchers = pitchers[['name','game_id']].where((pitchers.is_home_team)&(pitchers.is_starting_pitcher)).dropna()
home_pitchers['home_pitcher'] = home_pitchers['name']
home_pitchers = home_pitchers.groupby('game_id')['home_pitcher'].first()
df = pd.merge(left=df, right=home_pitchers, on='game_id', how='left')

In [18]:
away_pitchers = pitchers[['name','game_id']].where((~pitchers.is_home_team)&(pitchers.is_starting_pitcher)).dropna()
away_pitchers['away_pitcher'] = away_pitchers['name']
away_pitchers = away_pitchers.groupby('game_id')['away_pitcher'].first()
df = pd.merge(left=df, right=away_pitchers, on='game_id', how='left')

In [19]:
df = df.sort_values(by='date').reset_index(drop=True)

## Add Small Feats

In [20]:
df['dh_game_no'] = df['game_id'][-1:]

In [21]:
#datetime
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month']=date.dt.month
df['week_num'] = date.dt.week
df['dow']=date.dt.weekday.astype('int')

In [22]:
df['matchup_season'] = df.apply(lambda x: sorted([x.away_team_abbr, x.home_team_abbr])[0] + '_' + 
                         sorted([x.away_team_abbr, x.home_team_abbr])[1] + '_' +
                         str(x.season),
                         axis=1)
df['game_num'] = df.groupby(by=['season','matchup_season'])['home_team_abbr'].cumcount()

In [23]:
%%time
from trueskill import rate_1vs1, Rating
def add_trueskill_ratings(df):
    #https://www.microsoft.com/en-us/research/project/trueskill-ranking-system/
    ratings = {}
    for x in df.home_team_abbr.unique():
        ratings[x]=25
    for x in df.away_team_abbr.unique():
        ratings[x]=25

    home_trueskill_pre = []
    away_trueskill_pre = []
    for i, r in df.iterrows():
        # get pre-match trueskill ratings from dict
        home_trueskill_pre.append(ratings[r.home_team_abbr])
        away_trueskill_pre.append(ratings[r.away_team_abbr])

        if r.date < df.date.max():
            #doubleheaders get screwed up if we do this on current day
            # update ratings dictionary with post-match ratings
            ts1 = Rating(ratings[r.home_team_abbr])
            ts2 = Rating(ratings[r.away_team_abbr])
            if r.home_team_win==1:
                ts1, ts2 = rate_1vs1(ts1, ts2)
            else:
                ts2, ts1 = rate_1vs1(ts2, ts1)
            ratings[r.home_team_abbr] = ts1.mu
            ratings[r.away_team_abbr] = ts2.mu

    df['home_trueskill_pre']= home_trueskill_pre
    df['away_trueskill_pre']= away_trueskill_pre
    df['ts_diff'] = df.home_trueskill_pre-df.away_trueskill_pre
    
    df.replace({np.inf: 0}, inplace=True)
    return df
df = add_trueskill_ratings(df)

CPU times: user 41.4 s, sys: 28.1 ms, total: 41.4 s
Wall time: 41.4 s


In [24]:
%%time
def add_rest_durations(df):
    # time between games for teams and pitchers
    df.date = pd.to_datetime(df.date)
    
    #initalize rest dictionary
    rest = {}
    for x in df.home_team_abbr.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.away_team_abbr.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.home_pitcher.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.away_pitcher.unique():
        rest[x]=pd.to_datetime('12-31-2009')

    # lists to temporairily hold results
    home_team_rest = []
    away_team_rest = []
    home_pitch_rest = []
    away_pitch_rest = []

    for i, r in df.iterrows():
        # get pre-match trueskill ratings from dict
        home_team_rest.append(r.date - rest[r.home_team_abbr])
        away_team_rest.append(r.date - rest[r.away_team_abbr])
        home_pitch_rest.append(r.date - rest[r.home_pitcher])
        away_pitch_rest.append(r.date - rest[r.away_pitcher])

        # update ratings dictionary with post-match ratings
        if r.date < df.date.max():
            #doubleheaders get screwed up if we do this on current day
            rest[r.home_team_abbr] = r.date
            rest[r.away_team_abbr] = r.date
            rest[r.home_pitcher] = r.date
            rest[r.away_pitcher] = r.date

    # add results to df
    df['home_team_rest']= home_team_rest
    df['away_team_rest']= away_team_rest
    df['home_pitcher_rest']= home_pitch_rest
    df['away_pitcher_rest']= away_pitch_rest

    for x in ['home_team_rest','away_team_rest','home_pitcher_rest','away_pitcher_rest']:
        df[x] = df[x].dt.days
        df[x] = df[x].clip(1,30)   # rest doesn't matter for large values

    # match comparisons
    df['team_rest_diff'] = df.home_team_rest - df.away_team_rest
    df['pitcher_rest_diff'] = df.home_pitcher_rest - df.away_pitcher_rest

    return df
df = add_rest_durations(df)

CPU times: user 16.9 s, sys: 23 ms, total: 16.9 s
Wall time: 16.9 s


In [25]:
df.shape

(23888, 24)

In [26]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23878,23879,23880,23881,23882,23883,23884,23885,23886,23887
game_id,BOS201004040,HOU201004050,CHA201004050,NYN201004050,WAS201004050,TEX201004050,ARI201004050,CIN201004050,ATL201004050,KCA201004050,...,MIL201908130,MIA201908130,CLE201908130,DET201908130,DET201908130,NYA201908130,MIA201908130,SFN201908130,COL201908130,KCA201908130
home_team_abbr,BOS,HOU,CHW,NYM,WSN,TEX,ARI,CIN,ATL,KCR,...,MIL,MIA,CLE,DET,DET,NYY,MIA,SFG,COL,KCR
away_team_abbr,NYY,SFG,CLE,FLA,PHI,TOR,SDP,STL,CHC,DET,...,MIN,LAD,BOS,SEA,SEA,BAL,LAD,OAK,ARI,STL
date,2010-04-04 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,...,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00,2019-08-13 00:00:00
is_night_game,True,True,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
home_team_win,True,False,True,True,False,True,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False
home_pitcher,beckejo02,oswalro01,buehrma01,santajo02,lannajo01,feldmsc01,harenda01,haranaa01,lowede01,greinza01,...,anderch01,yamamjo01,clevimi01,boydma01,boydma01,germado01,yamamjo01,bumgama01,hoffmje02,sparkgl01
away_pitcher,sabatc.01,linceti01,westbja01,johnsjo09,hallaro01,marcush01,garlajo01,carpech01,zambrca01,verlaju01,...,perezma02,maydu01,salech01,kikucyu01,kikucyu01,meansjo01,maydu01,anderbr04,galleza01,flaheja01
dh_game_no,,,,,,,,,,,...,,,,,,,,,,KCA201908130
season,2010,2010,2010,2010,2010,2010,2010,2010,2010,2010,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2019


## Add Stats

### Game Stats

In [None]:
# errors, spread

In [31]:
games.columns

Index(['away_team_abbr', 'away_team_errors', 'away_team_hits',
       'away_team_runs', 'date', 'game_id', 'home_team_abbr',
       'home_team_errors', 'home_team_hits', 'home_team_runs', 'start_time',
       'venue', 'is_night_game', 'is_grass', 'spread'],
      dtype='object')

### Batting Stats

In [None]:
q = queue.Queue()
batting_stats = ['A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO', 'batting_avg',
             'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 
             're24_bat', 'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 
             'wpa_bat_pos']
for b in batting_stats: q.put(b)

num_threads = 7
result_q = queue.Queue()

#start the workers
for i in range(num_threads):
    worker = threading.Thread(target=features.calc_stat_worker,
                              args=(q,batting,df,result_q))
    worker.setDaemon(True)
    worker.start()
        
#wait for workers to finish
q.join()

In [29]:
while not result_q.empty():
    key, result = result_q.get()
    df['batting'+key]=result
    df['batting'+key].fillna(0, inplace=True)
#     print(len(result), key)
    result_q.task_done()
df.shape

(23888, 164)

In [30]:
df.to_csv("../data/df_feats_thru_batting.csv", index=False)