In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
sys.path.append('../code')
import features

In [4]:
import queue
import threading
from scipy.stats import skew

# Load Data

In [5]:
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()

In [6]:
print("start hour vs. is_night_game")
print("night games seem to start from 5PM to 10PM")
games.groupby(games.start_time.str[:-3])['is_night_game'].agg(['mean','count'])

start hour vs. is_night_game
night games seem to start from 5PM to 10PM


Unnamed: 0_level_0,mean,count
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.000213,4697
10,0.997908,478
11,0.052632,19
12,0.0,1069
2,0.0,562
3,0.001773,564
4,0.001035,966
5,0.980263,304
6,0.99568,1852
7,0.99851,12081


# Features

## Create dataset

In [7]:
df = games[['game_id','home_team_abbr','away_team_abbr','date','is_night_game']]
df['home_team_win'] = games.home_team_runs.astype('int')>games.away_team_runs

In [8]:
home_pitchers = pitchers[['name','game_id']].where((pitchers.is_home_team)&(pitchers.is_starting_pitcher)).dropna()
home_pitchers['home_pitcher'] = home_pitchers['name']
home_pitchers = home_pitchers.groupby('game_id')['home_pitcher'].first()
df = pd.merge(left=df, right=home_pitchers, on='game_id', how='left')

In [9]:
away_pitchers = pitchers[['name','game_id']].where((~pitchers.is_home_team)&(pitchers.is_starting_pitcher)).dropna()
away_pitchers['away_pitcher'] = away_pitchers['name']
away_pitchers = away_pitchers.groupby('game_id')['away_pitcher'].first()
df = pd.merge(left=df, right=away_pitchers, on='game_id', how='left')

In [10]:
df = df.sort_values(by='date').reset_index(drop=True)

## Add Small Feats

In [11]:
df['dh_game_no'] = df['game_id'][-1:]

In [12]:
#datetime
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month']=date.dt.month
df['week_num'] = date.dt.week
df['dow']=date.dt.weekday.astype('int')

In [13]:
df['matchup_season'] = df.apply(lambda x: sorted([x.away_team_abbr, x.home_team_abbr])[0] + '_' + 
                         sorted([x.away_team_abbr, x.home_team_abbr])[1] + '_' +
                         str(x.season),
                         axis=1)
df['game_num'] = df.groupby(by=['season','matchup_season'])['home_team_abbr'].cumcount()

In [14]:
%%time
from trueskill import rate_1vs1, Rating
def add_trueskill_ratings(df):
    #https://www.microsoft.com/en-us/research/project/trueskill-ranking-system/
    ratings = {}
    for x in df.home_team_abbr.unique():
        ratings[x]=25
    for x in df.away_team_abbr.unique():
        ratings[x]=25

    home_trueskill_pre = []
    away_trueskill_pre = []
    for i, r in df.iterrows():
        # get pre-match trueskill ratings from dict
        home_trueskill_pre.append(ratings[r.home_team_abbr])
        away_trueskill_pre.append(ratings[r.away_team_abbr])

        if r.date < df.date.max():
            #doubleheaders get screwed up if we do this on current day
            # update ratings dictionary with post-match ratings
            ts1 = Rating(ratings[r.home_team_abbr])
            ts2 = Rating(ratings[r.away_team_abbr])
            if r.home_team_win==1:
                ts1, ts2 = rate_1vs1(ts1, ts2)
            else:
                ts2, ts1 = rate_1vs1(ts2, ts1)
            ratings[r.home_team_abbr] = ts1.mu
            ratings[r.away_team_abbr] = ts2.mu

    df['home_trueskill_pre']= home_trueskill_pre
    df['away_trueskill_pre']= away_trueskill_pre
    df['ts_diff'] = df.home_trueskill_pre-df.away_trueskill_pre
    
    df.replace({np.inf: 0}, inplace=True)
    return df
df = add_trueskill_ratings(df)

CPU times: user 36.8 s, sys: 2.8 ms, total: 36.8 s
Wall time: 36.8 s


In [15]:
%%time
def add_rest_durations(df):
    # time between games for teams and pitchers
    df.date = pd.to_datetime(df.date)
    
    #initalize rest dictionary
    rest = {}
    for x in df.home_team_abbr.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.away_team_abbr.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.home_pitcher.unique():
        rest[x]=pd.to_datetime('12-31-2009')
    for x in df.away_pitcher.unique():
        rest[x]=pd.to_datetime('12-31-2009')

    # lists to temporairily hold results
    home_team_rest = []
    away_team_rest = []
    home_pitch_rest = []
    away_pitch_rest = []

    for i, r in df.iterrows():
        # get pre-match trueskill ratings from dict
        home_team_rest.append(r.date - rest[r.home_team_abbr])
        away_team_rest.append(r.date - rest[r.away_team_abbr])
        home_pitch_rest.append(r.date - rest[r.home_pitcher])
        away_pitch_rest.append(r.date - rest[r.away_pitcher])

        # update ratings dictionary with post-match ratings
        if r.date < df.date.max():
            #doubleheaders get screwed up if we do this on current day
            rest[r.home_team_abbr] = r.date
            rest[r.away_team_abbr] = r.date
            rest[r.home_pitcher] = r.date
            rest[r.away_pitcher] = r.date

    # add results to df
    df['home_team_rest']= home_team_rest
    df['away_team_rest']= away_team_rest
    df['home_pitcher_rest']= home_pitch_rest
    df['away_pitcher_rest']= away_pitch_rest

    for x in ['home_team_rest','away_team_rest','home_pitcher_rest','away_pitcher_rest']:
        df[x] = df[x].dt.days
        df[x] = df[x].clip(1,30)   # rest doesn't matter for large values

    # match comparisons
    df['team_rest_diff'] = df.home_team_rest - df.away_team_rest
    df['pitcher_rest_diff'] = df.home_pitcher_rest - df.away_pitcher_rest

    return df
df = add_rest_durations(df)

CPU times: user 13.2 s, sys: 15.9 ms, total: 13.2 s
Wall time: 13.2 s


In [16]:
df.shape

(23867, 24)

In [17]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23857,23858,23859,23860,23861,23862,23863,23864,23865,23866
game_id,BOS201004040,CIN201004050,ATL201004050,WAS201004050,PIT201004050,KCA201004050,NYN201004050,MIL201004050,ANA201004050,HOU201004050,...,TOR201908110,MIA201908110,SEA201908110,LAN201908110,CIN201908110,CHA201908110,MIL201908110,SLN201908110,MIN201908110,SFN201908110
home_team_abbr,BOS,CIN,ATL,WSN,PIT,KCR,NYM,MIL,LAA,HOU,...,TOR,MIA,SEA,LAD,CIN,CHW,MIL,STL,MIN,SFG
away_team_abbr,NYY,STL,CHC,PHI,LAD,DET,FLA,COL,MIN,SFG,...,NYY,ATL,TBR,ARI,CHC,OAK,TEX,PIT,CLE,PHI
date,2010-04-04 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,2010-04-05 00:00:00,...,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00,2019-08-11 00:00:00
is_night_game,True,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
home_team_win,True,False,True,False,True,False,True,False,True,False,...,False,False,False,True,False,False,False,True,False,True
home_pitcher,beckejo02,haranaa01,lowede01,lannajo01,dukeza01,greinza01,santajo02,gallayo01,weaveje02,oswalro01,...,thorntr01,noesihe01,tuivasa01,ryuhy01,castilu02,giolilu01,lylesjo01,mikolmi01,berrijo01,menezco01
away_pitcher,sabatc.01,carpech01,zambrca01,hallaro01,padilvi01,verlaju01,johnsjo09,jimenub01,bakersc02,linceti01,...,tanakma01,foltymi01,yarbrry01,leakemi01,lestejo01,bassich01,minormi01,braulst01,civalaa01,arrieja01
dh_game_no,,,,,,,,,,,...,,,,,,,,,,SFN201908110
season,2010,2010,2010,2010,2010,2010,2010,2010,2010,2010,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2019


## Add Stats

In [18]:
# errors

### Batting Stats

In [39]:
def calc_stat(stat, batting_df, games_df, result_q):
    hmean,amean = [],[]
    hstdev,astdev = [],[]
    hskew,askew = [],[]
    
    df_len = len(games_df)
    # merge in home team stat to df
    b = batting[['game_id',stat]][batting['is_home_team']==True].groupby('game_id').first().reset_index()
    games_df = pd.merge(left=games_df, right=b,on='game_id', how='left')
    games_df['home_'+stat] = games_df[stat]
    games_df.drop(columns=stat, inplace=True)
    
    #now moerge in away team stat
    b = batting[['game_id',stat]][batting['is_home_team']==False].groupby('game_id').first().reset_index()
    games_df = pd.merge(left=games_df, right=b, on='game_id', how='left')
    games_df['away_'+stat] = games_df[stat]
    games_df.drop(columns=stat, inplace=True)
    
    assert df_len == len(games_df)
    
    stats = {}
    for t in games_df.home_team_abbr.unique():stats[t]=[]
    for t in games_df.away_team_abbr.unique():stats[t]=[]
    
    for i, r in games_df.iterrows():
        
        #get distributions
        h = np.array(stats[r.home_team_abbr])
        a = np.array(stats[r.away_team_abbr])
        
        #calc stat  and append to dict
        hmean.append(h.mean())
        amean.append(a.mean())

        hstdev.append(h.std())
        astdev.append(a.std())

        hskew.append(skew(h))
        askew.append(skew(a))
        
        #update stats
        stats[r.home_team_abbr].append(r['home_'+stat])
        stats[r.away_team_abbr].append(r['away_'+stat])
    diff = np.array(hmean) - np.array(amean)
    
    names = ['home_batting_'+stat+'_mean', 'away_batting_'+stat+'_mean',
            'home_batting_'+stat+'_stdev', 'away_batting_'+stat+'_stdev',
            'home_batting_'+stat+'_skew', 'away_batting_'+stat+'_skew',
            'batting_'+stat+'_diff']
    lists = [hmean,amean,hstdev,astdev,hskew,askew,diff]
    for i in range(len(names)):
        result_q.put((names[i],lists[i]))

def do_work(q,batting,df,result_q):
    #start working through the queue
    while not q.empty():
        stat = q.get()
        calc_stat(stat,batting,df,result_q)
        print(stat,'Done!')
        q.task_done()

In [40]:
q = queue.Queue()
batting_stats = ['A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO', 'batting_avg',
             'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 
             're24_bat', 'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 
             'wpa_bat_pos']
for b in batting_stats: q.put(b)

num_threads = 7
result_q = queue.Queue()

#start the workers
for i in range(num_threads):
    worker = threading.Thread(target=do_work, args=(q,batting,df,result_q))
    worker.setDaemon(True)
    worker.start()
        
#wait for workers to finish
q.join()

AB Done!
PA Done!
BB Done!
H Done!
PO Done!
A Done!
R Done!
leverage_index_avg Done!
batting_avg Done!
onbase_plus_slugging Done!
onbase_perc Done!
re24_bat Done!
slugging_perc Done!
wpa_bat Done!
wpa_bat_neg Done!
wpa_bat_pos Done!
RBI Done!
SO Done!
pitches Done!
strikes_total Done!


In [41]:
while not result_q.empty():
    key, result = result_q.get()
    df[key]=result
    df[key].fillna(0, inplace=True)
#     print(len(result), key)
    result_q.task_done()
df.shape

(23867, 164)

In [42]:
df.to_csv("../data/df_feats_thru_batting.csv", index=False)