In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline and hot-reload edited project modules
# before each cell execution.
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): the blanket 'ignore' filter already covers FutureWarning, so
# the second call is redundant; it also hides pandas warnings such as
# SettingWithCopyWarning that can point at real bugs.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
# Put the sibling ../code directory on the import path; presumably this is
# where the project-local modules imported below live (path is relative to the
# notebook's working directory).
sys.path.append('../code')
import scrape, features
import dataframe_utilities as util

# Update DB & Get Today's Games

In [4]:
# Find the most recent game date already stored locally; scraping resumes on
# the following day.
summaries = pd.read_csv('../data/game_summaries.csv')
last_day = pd.to_datetime(summaries['date']).max()
get_day = last_day + pd.Timedelta(days=1)
get_day

Timestamp('2019-08-20 00:00:00')

In [5]:
# Backfill every completed day between the last stored game and yesterday:
# fetch that day's game links and process each one, then advance one day.
# `pd.Timestamp.today()` replaces `pd.datetime.today()`; the `pd.datetime`
# alias is deprecated since pandas 1.0 and removed in pandas 2.0.
while get_day.date() < pd.Timestamp.today().date():
    for link in scrape.get_game_links(get_day):
        scrape.process_link(link)
    get_day += pd.Timedelta(days=1)

ARI201908190 done
BAL201908190 done
CIN201908190 done
HOU201908190 done
MIN201908190 done
PIT201908190 done
SLN201908190 done
TBA201908190 done
TEX201908190 done


In [6]:
# Fetch today's scheduled games (the rows to predict). Games without a listed
# pitcher are reported on stdout as "no pitcher {...}" by the scraper.
test_df = scrape.get_today_games()

no pitcher {'away_team_abbr': 'SEA', 'home_team_abbr': 'TBR', 'time': '1:10PM', 'away_pitcher': 'mortoch02'}
no pitcher {'away_team_abbr': 'SFG', 'home_team_abbr': 'CHC', 'time': '8:05PM'}


In [7]:
# Display today's slate for a visual check (pitchers, teams, start time, date).
test_df

Unnamed: 0,away_pitcher,away_team_abbr,home_pitcher,home_team_abbr,time,date
0,strahma01,SDP,castilu02,CIN,12:35PM,2019-08-21
1,mortoch02,SEA,,TBR,1:10PM,2019-08-21
2,giolilu01,CHW,odorija01,MIN,1:10PM,2019-08-21
3,grayjo02,COL,leakemi01,ARI,3:40PM,2019-08-21
4,corbipa01,WSN,musgrjo01,PIT,7:05PM,2019-08-21
5,sandopa02,LAA,minormi01,TEX,7:05PM,2019-08-21
6,montgmi01,KCR,brookaa01,BAL,7:05PM,2019-08-21
7,smylydr01,PHI,porceri01,BOS,7:10PM,2019-08-21
8,plutkad01,CLE,stromma01,NYM,7:10PM,2019-08-21
9,smithca03,MIA,teherju01,ATL,7:20PM,2019-08-21


# Process Stats for Today's Games

## Merge test and train dfs

In [8]:
# Derive the day/night flag from the start-time string (e.g. '7:05PM'), then
# drop the raw time column.
# Rules (unchanged from the original cell):
#   * a start whose first digit is below 5 is a day game ...
#   * ... unless the second character is '0' or '1' (10 or 11 o'clock starts),
#     which are night games.
# NOTE(review): the AM/PM suffix is never inspected, so a 10 or 11 AM start
# would also be flagged as a night game.
# The flag is built as a single boolean expression instead of chained indexing
# (`df[col][mask] = value`), which assigns through a temporary and is not
# guaranteed to update the frame.
start_time = test_df['time']
starts_before_five = start_time.str[:1].astype('int') < 5
starts_at_ten_or_eleven = start_time.str[1:2].isin(['0', '1'])
test_df['is_night_game'] = ~starts_before_five | starts_at_ten_or_eleven
test_df = test_df.drop(columns='time')

In [9]:
# Tag today's games as prediction rows: no known outcome yet, plus a game_id
# of the form <home team><YYYYMMDD>0.
test_df['is_test'] = True
test_df['home_team_win'] = np.nan
compact_date = test_df['date'].astype('str').str.replace('-', '')
test_df['game_id'] = test_df['home_team_abbr'] + compact_date + '0'
test_df.shape

(15, 9)

In [10]:
# Load the historical games and flag every row as a non-prediction row.
df = features.get_game_df()
df.loc[:, 'is_test'] = False
df.shape

(24090, 9)

In [11]:
# Stack historical and today's games into one frame ordered by date, with a
# fresh 0..n-1 index.
# Note: this cell rebinds `df` from itself, so re-running it appends the test
# rows a second time.
df = (
    pd.concat([df, test_df])
    .sort_values(by='date')
    .reset_index(drop=True)
)
df.shape

(24105, 9)

## Add Features

In [12]:
# Apply the two frame-level feature builders in sequence: TrueSkill ratings
# first, then rest durations.
df = (
    df
    .pipe(features.add_trueskill_ratings)
    .pipe(features.add_rest_durations)
)
df.shape

(24105, 18)

In [13]:
# Calendar features derived from the game date.
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month'] = date.dt.month
# `Series.dt.week` is deprecated and removed in pandas 2.0. Taking the ISO week
# from each Timestamp yields the same ISO-8601 week number and works on every
# pandas version.
# NOTE(review): assumes no missing dates in `df['date']` — confirm.
df['week_num'] = date.map(lambda ts: ts.isocalendar()[1]).astype('int')
df['dow'] = date.dt.weekday.astype('int')

In [14]:
# Numeric value of the last character of game_id (presumably the doubleheader
# game number); anything non-numeric becomes NaN.
game_id_suffix = df['game_id'].str[-1:]
df['dh_game_no'] = pd.to_numeric(game_id_suffix, errors='coerce')

# Replace the date with whole seconds since the Unix epoch.
# Note: this overwrites `date` in place, so the cell is not safe to re-run.
unix_epoch = pd.Timestamp("1970-01-01")
since_epoch = pd.to_datetime(df['date']) - unix_epoch
df['date'] = since_epoch // pd.Timedelta('1s') #epoch time

### Add Stats

In [15]:
# Load the stat tables consumed by the rolling, game and season feature
# builders in the cells below.
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()

#### Rolling 10 Day Stats

In [16]:
# Rolling features for the batting table, limited to these rate stats.
# NOTE(review): the meaning of the positional `True` flag is defined in
# features.add_10RA_rolling — confirm there.
b_stats = [
    'batting_avg',
    'leverage_index_avg',
    'onbase_perc',
    'onbase_plus_slugging',
]
df = features.add_10RA_rolling(batting, df, b_stats, True, 'batting')

In [17]:
# Per-batter-faced rates for strikeouts, hits and walks on the team pitching
# table (added as new columns in place).
for stat in ('SO', 'H', 'BB'):
    pitching[stat + '_batters_faced'] = pitching[stat] / pitching['batters_faced']

# create rolling stat
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitching, df, b_stats, True, 'team_pitching')

In [18]:
# Same per-batter-faced rates as the team pitching table, computed on the
# `pitchers` table (added as new columns in place).
for stat in ('SO', 'H', 'BB'):
    pitchers[stat + '_batters_faced'] = pitchers[stat] / pitchers['batters_faced']

# create rolling stat
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitchers, df, b_stats, False, 'pitcher')

#### Games Stats

In [19]:
# Add the features that features.game_stats derives from the `games` table.
df = features.game_stats(games,df)

In [20]:
# Show (rows, columns) after the rolling and game-level features were added.
df.shape

(24105, 82)

#### Season Stats

In [21]:
# Batting columns passed to the season-level feature builder.
batting_stats = [
    'A', 'AB', 'BB', 'H', 'PA',
    'PO', 'R', 'RBI', 'SO', 'batting_avg',
    'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 're24_bat',
    'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 'wpa_bat_pos',
]
df = features.add_season_rolling(batting, df, batting_stats, True, 'batting')
df.shape

(24105, 222)

In [22]:
# Pitching columns passed to the season-level feature builder. The same list
# is reused for the individual-pitcher table in the next cell.
pitching_stats = [
    'BB', 'ER', 'H', 'HR', 'IP', 'R', 'SO',
    'batters_faced', 'earned_run_avg', 'game_score',
    'inherited_runners', 'inherited_score',
    'inplay_fb_total', 'inplay_gb_total', 'inplay_ld', 'inplay_unk',
    'leverage_index_avg', 'pitches', 're24_def',
    'strikes_contact', 'strikes_looking', 'strikes_swinging', 'strikes_total',
    'wpa_def',
    'SO_batters_faced', 'H_batters_faced', 'BB_batters_faced',
]
df = features.add_season_rolling(pitching, df, pitching_stats, True, 'team_pitching')
df.shape

(24105, 411)

In [23]:
# Same season-level features as the previous cell, computed from the
# `pitchers` table with the same `pitching_stats` column list.
# NOTE(review): the positional flag is False here but True for the team
# tables; its meaning is defined in features.add_season_rolling — confirm.
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
df.shape

(24105, 600)

## Cleanup

In [24]:
# Clean up missing values via the project helper.
# NOTE(review): the meaning of the `False` flag is defined in
# dataframe_utilities.fix_na — confirm there.
df = util.fix_na(df, False)

# Generate Predictions

In [25]:
# Keep only today's rows and align them to the exact column set and order of
# the saved training features.
X_test = df[df.is_test].drop(columns=['is_test'])
# Only the header row is needed, so nrows=0 avoids parsing the whole CSV.
cols = pd.read_csv("../data/df_w_features.csv", nrows=0).columns
# Drop the target and the identifier. This is a non-inplace drop because
# X_test[cols] is itself a selection from another frame.
X_test = X_test[cols].drop(columns=['home_team_win', 'game_id'])

In [26]:
import pickle

# Load the fitted (encoder, model) pair saved at training time.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted source.
# The context manager closes the file handle, which the original
# `pickle.load(open(...))` left open.
with open('../data/encoder_model.pk', 'rb') as model_file:
    encoder, model = pickle.load(model_file)
encoder, model

(CatBoostEncoder(cols=['home_team_abbr', 'away_team_abbr', 'home_pitcher',
                       'away_pitcher', 'home_team_season', 'away_team_season'],
                 drop_invariant=False, handle_missing='value',
                 handle_unknown='value', random_state=13, return_df=True,
                 sigma=None, verbose=0),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=None, colsample_bytree=0.65, gamma=0,
               learning_rate=0.3, max_delta_step=0, max_depth=26,
               min_child_weight=18.0, missing=nan, n_estimators=100, n_jobs=1,
               nthread=None, num_boost_round=100, objective='binary:logistic',
               random_state=13, reg_alpha=0.057490000000000006, reg_lambda=1,
               scale_pos_weight=1, seed=None, silent=True,
               subsample=0.6000000000000001, verbosity=None))

In [27]:
# Encode the categorical columns with the fitted encoder.
# Note: this rebinds X_test from itself, so the cell is not safe to re-run
# without re-running the cell that builds X_test.
X_test = encoder.transform(X_test)

test_df = test_df.sort_values(by=['date','game_id']).reset_index(drop=True)

# Identifying columns for today's games. These rows come from the same
# `df[df.is_test]` selection as X_test, so they line up row for row.
# `.copy()` makes pred_df an independent frame, so the column assignments
# below do not write into a slice of `df`.
id_cols = ['away_pitcher', 'away_team_abbr', 'home_pitcher', 'home_team_abbr']
pred_df = df.loc[df.is_test, id_cols].copy()

# Probability of the positive class (second column of predict_proba).
pred_df['xgb_probability'] = model.predict_proba(X_test)[:, 1]

# Predicted winner: home team where the model predicts the positive class,
# away team otherwise. Casting to bool handles both boolean and 0/1 integer
# predictions; the original `~model.predict(...)` mask only works for booleans.
# np.where also replaces the chained-indexing assignment.
home_wins = np.asarray(model.predict(X_test)).astype(bool)
pred_df['xgb_winner'] = np.where(
    home_wins, pred_df['home_team_abbr'], pred_df['away_team_abbr']
)

pred_df

Unnamed: 0,away_pitcher,away_team_abbr,home_pitcher,home_team_abbr,xgb_probability,xgb_winner
24090,giolilu01,CHW,odorija01,MIN,0.540249,MIN
24091,housead01,MIL,wainwad01,STL,0.733115,STL
24092,corbipa01,WSN,musgrjo01,PIT,0.428208,WSN
24093,happja01,NYY,fiersmi01,OAK,0.69222,OAK
24094,plutkad01,CLE,stromma01,NYM,0.516305,NYM
24095,fontwi01,TOR,buehlwa01,LAD,0.65545,LAD
24096,mortoch02,SEA,Unknown,TBR,0.479512,SEA
24097,strahma01,SDP,castilu02,CIN,0.645518,CIN
24098,Unknown,SFG,Unknown,CHC,0.577858,CHC
24099,smylydr01,PHI,porceri01,BOS,0.624853,BOS
