In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
sys.path.append('../code')
import scrape, features
import dataframe_utilities as util

# Update DB & Get Today's Games

In [4]:
# Find the most recent date already present in the stored game summaries;
# scraping resumes from the day after it.
summary_dates = pd.read_csv('../data/game_summaries.csv')['date']
last_day = pd.to_datetime(summary_dates).max()
one_day = pd.Timedelta(days=1)
get_day = last_day + one_day
get_day

Timestamp('2019-08-19 00:00:00')

In [5]:
# Backfill every day from `get_day` up to (but not including) today.
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so today's
# date is taken from pd.Timestamp instead. It is read once so the stopping
# point cannot shift if the loop happens to run across midnight.
today = pd.Timestamp.today().date()
while get_day.date() < today:
    # Each link returned for the day is handed to scrape.process_link.
    for link in scrape.get_game_links(get_day):
        scrape.process_link(link)
    get_day += pd.Timedelta(days=1)

In [6]:
# Fetch today's scheduled games from the scrape helper. Per the saved output
# of the next cell, the frame has one row per game with both starting
# pitchers, both team abbreviations, the start time and the date.
test_df = scrape.get_today_games()

In [7]:
# Display today's games as a visual sanity check before feature processing.
test_df

Unnamed: 0,away_pitcher,away_team_abbr,home_pitcher,home_team_abbr,time,date
0,lopezjo02,KCR,meansjo01,BAL,7:05PM,2019-08-19
1,rossjo01,WSN,willitr01,PIT,7:05PM,2019-08-19
2,lauerer01,SDP,bauertr01,CIN,7:10PM,2019-08-19
3,gonzama02,SEA,mckaybr01,TBR,7:10PM,2019-08-19
4,davieza02,MIL,hudsoda02,STL,7:45PM,2019-08-19
5,peterdi01,LAA,allarko01,TEX,8:05PM,2019-08-19
6,jacksed01,DET,mileywa01,HOU,8:10PM,2019-08-19
7,novaiv01,CHW,gibsoky01,MIN,8:10PM,2019-08-19
8,gonzach01,COL,galleza01,ARI,9:40PM,2019-08-19


# Process Stats for Today's Games

## Merge test and train DataFrames

In [8]:
# Derive a day/night flag from the start-time string (e.g. '7:05PM').
# Rule: start hours whose first digit is below 5 are day games, except
# two-digit hours beginning '10' or '11', which are flipped back to night.
# NOTE(review): the AM/PM suffix is never inspected, so an '11:05AM' start
# would be flagged as a night game -- confirm this is acceptable.
hour_first_digit = test_df['time'].str[:1].astype('int')
hour_second_char = test_df['time'].str[1:2]

test_df['is_night_game'] = True
# Use .loc rather than chained indexing (df[col][mask] = ...): the chained
# form raises SettingWithCopyWarning and does not write through to the frame
# when pandas copy-on-write is enabled.
test_df.loc[hour_first_digit < 5, 'is_night_game'] = False
test_df.loc[hour_second_char.isin(['0', '1']), 'is_night_game'] = True  # for 10, 11 PM games

# The raw time string is no longer needed once the flag exists.
test_df = test_df.drop(columns='time')

In [9]:
# Date with the dashes stripped out, used as the middle part of the game id.
date_digits = test_df['date'].astype('str').str.replace('-', '')

# Mark these rows as the prediction set; the outcome is not known yet.
test_df['is_test'] = True
test_df['home_team_win'] = np.nan
# game_id = home team abbreviation + date digits + a trailing '0'.
test_df['game_id'] = test_df['home_team_abbr'] + date_digits + '0'
test_df.shape

(9, 9)

In [10]:
# Load the historical games frame from the features module and flag every row
# as non-test, so these rows can be told apart from today's games once the two
# frames are concatenated.
df = features.get_game_df()
df['is_test'] = False
df.shape

(24079, 9)

In [11]:
# Stack today's games under the historical games, order the result by date,
# and renumber the index from zero.
# Re-running this cell appends the test rows again; rerun from the cell that
# builds `df` first.
df = (
    pd.concat([df, test_df])
    .sort_values(by='date')
    .reset_index(drop=True)
)
df.shape

(24088, 9)

## Add Features

In [12]:
# Run the frame through the two feature builders in order: TrueSkill ratings
# first, then rest durations. Each takes the frame and returns the new one.
df = (
    df
    .pipe(features.add_trueskill_ratings)
    .pipe(features.add_rest_durations)
)
df.shape

(24088, 18)

In [13]:
# Calendar features derived from the game date.
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month'] = date.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
# dt.isocalendar().week gives the same ISO week number; it comes back as
# UInt32, so cast to a plain integer to keep the previous dtype.
df['week_num'] = date.dt.isocalendar().week.astype('int')
df['dow'] = date.dt.weekday.astype('int')

In [14]:
# Last character of game_id as a number (NaN when it is not numeric).
# Presumably this is the doubleheader game number -- confirm against the id scheme.
game_id_suffix = df['game_id'].str.slice(start=-1)
df['dh_game_no'] = pd.to_numeric(game_id_suffix, errors='coerce')

# Replace the date column with whole seconds elapsed since 1970-01-01.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
df['date'] = (pd.to_datetime(df['date']) - unix_epoch) // one_second #epoch time

### Add Stats

In [15]:
# Load the four source tables used by the stat features below.
# The loaders are called in the same order as before.
games, batting, pitching, pitchers = (
    features.get_games(),
    features.get_batting(),
    features.get_pitching(),
    features.get_pitchers(),
)

#### Rolling 10 Day Stats

In [16]:
# Batting columns to turn into rolling features on `df`.
# NOTE(review): add_10RA_rolling lives in the features module and is not
# visible here. The boolean is True for this call and the team pitching call
# and False for the individual-pitcher call, so it presumably selects
# team-level vs. player-level aggregation -- confirm. The last argument looks
# like a label for the generated columns -- confirm.
b_stats = ['batting_avg','leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging']
df = features.add_10RA_rolling(batting, df, b_stats, True, 'batting')

In [17]:
# Per-batter-faced rates: each count column divided by batters_faced.
# Maps the new rate column to the count column it is derived from.
team_rate_sources = {
    'SO_batters_faced': 'SO',
    'H_batters_faced': 'H',
    'BB_batters_faced': 'BB',
}
for rate_col, count_col in team_rate_sources.items():
    pitching[rate_col] = pitching[count_col].div(pitching['batters_faced'])

# create rolling stat
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitching, df, b_stats, True, 'team_pitching')

In [18]:
# Per-batter-faced rates for the pitchers table: each count column divided
# by batters_faced. Maps the new rate column to its source count column.
pitcher_rate_sources = {
    'SO_batters_faced': 'SO',
    'H_batters_faced': 'H',
    'BB_batters_faced': 'BB',
}
for rate_col, count_col in pitcher_rate_sources.items():
    pitchers[rate_col] = pitchers[count_col].div(pitchers['batters_faced'])

# create rolling stat
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitchers, df, b_stats, False, 'pitcher')

#### Game Stats

In [19]:
# Extend `df` with features built from the `games` table. The logic is in
# features.game_stats and is not visible in this notebook.
df = features.game_stats(games,df)

In [20]:
# Sanity check: the row count should be unchanged (24088 in the saved run),
# with only the column count growing as features are added.
df.shape

(24088, 82)

#### Season Stats

In [21]:
# Batting columns fed to the season-level feature builder, listed in the
# same order as before: raw counts first, then rate and advanced columns.
batting_counts = ['A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO']
batting_advanced = [
    'batting_avg',
    'leverage_index_avg',
    'onbase_perc',
    'onbase_plus_slugging',
    'pitches',
    're24_bat',
    'slugging_perc',
    'strikes_total',
    'wpa_bat',
    'wpa_bat_neg',
    'wpa_bat_pos',
]
batting_stats = batting_counts + batting_advanced
df = features.add_season_rolling(batting, df, batting_stats, True, 'batting')
df.shape

(24088, 222)

In [22]:
# Pitching columns fed to the season-level feature builder, in the same order
# as before: columns from the source table, then the three per-batter-faced
# rates computed earlier in this notebook. The next cell reuses
# `pitching_stats` for the individual-pitcher table.
pitching_source_cols = [
    'BB', 'ER', 'H', 'HR', 'IP', 'R', 'SO',
    'batters_faced',
    'earned_run_avg',
    'game_score',
    'inherited_runners',
    'inherited_score',
    'inplay_fb_total',
    'inplay_gb_total',
    'inplay_ld',
    'inplay_unk',
    'leverage_index_avg',
    'pitches',
    're24_def',
    'strikes_contact',
    'strikes_looking',
    'strikes_swinging',
    'strikes_total',
    'wpa_def',
]
pitching_rate_cols = ['SO_batters_faced', 'H_batters_faced', 'BB_batters_faced']
pitching_stats = pitching_source_cols + pitching_rate_cols
df = features.add_season_rolling(pitching, df, pitching_stats, True, 'team_pitching')
df.shape

(24088, 411)

In [23]:
# Same season-level features, this time built from the individual `pitchers`
# table, reusing the `pitching_stats` column list defined in the previous cell.
# NOTE(review): the boolean is False here but True for the batting and team
# pitching calls; presumably it switches between team-level and player-level
# aggregation -- confirm in features.add_season_rolling.
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
df.shape

(24088, 600)

## Cleanup

In [24]:
# Clean up missing values via the shared dataframe utility.
# NOTE(review): the meaning of the second argument (False) is not visible
# from this notebook -- confirm against dataframe_utilities.fix_na.
df = util.fix_na(df, False)

# Generate Predictions

In [25]:
# Keep only today's games; the is_test flag itself is not a model feature.
X_test = df[df.is_test].drop(columns=['is_test'])

# Align the columns (names and order) with the saved feature file.
# Only the header row is needed, so nrows=0 avoids parsing the whole CSV.
cols = pd.read_csv("../data/df_w_features.csv", nrows=0).columns

# Drop the target and the identifier. A non-inplace drop is used so that no
# frame derived from a slice is mutated (avoids SettingWithCopyWarning).
X_test = X_test[cols].drop(columns=['home_team_win', 'game_id'])

In [26]:
import pickle

# Load the saved (encoder, model) pair. The context manager closes the file
# handle, which the previous bare open() call leaked.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load artifacts produced by a trusted training run.
with open('../data/encoder_model.pk', 'rb') as model_file:
    encoder, model = pickle.load(model_file)
encoder, model

(CatBoostEncoder(cols=['home_team_abbr', 'away_team_abbr', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season'],
         drop_invariant=False, handle_missing='value',
         handle_unknown='value', random_state=13, return_df=True,
         sigma=None, verbose=0),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.75, gamma=0, learning_rate=0.01,
        max_delta_step=0, max_depth=28, min_child_weight=14.0, missing=nan,
        n_estimators=100, n_jobs=1, nthread=None, num_boost_round=100,
        objective='binary:logistic', random_state=-1, reg_alpha=0.42431,
        reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
        subsample=0.2))

In [29]:
# Encode the categorical columns with the encoder that was saved alongside
# the model. Re-running this cell would encode already-encoded data; rerun
# the cell that builds X_test first.
X_test = encoder.transform(X_test)

test_df = test_df.sort_values(by=['date','game_id']).reset_index(drop=True)

# Matchup columns for today's games. .copy() gives an independent frame, so
# the columns added below are not written into a slice of `df`.
matchup_cols = ['away_pitcher', 'away_team_abbr', 'home_pitcher', 'home_team_abbr']
pred_df = df.loc[df.is_test, matchup_cols].copy()

# Probability of the positive class (column 1); the target during feature
# preparation is `home_team_win`, so this is the home-win probability.
pred_df['xgb_probability'] = model.predict_proba(X_test)[:, 1]

# Cast the predicted labels to booleans before using them as a selector.
# The previous `~model.predict(...)` is a logical NOT only for boolean labels;
# on 0/1 integer labels it becomes a bitwise NOT (-1/-2) and misindexes.
home_predicted = np.asarray(model.predict(X_test)).astype(bool)
# Pick the winner in one vectorised step instead of a chained-indexing
# assignment, which pandas may apply to a temporary copy.
pred_df['xgb_winner'] = np.where(
    home_predicted, pred_df['home_team_abbr'], pred_df['away_team_abbr']
)

pred_df

Unnamed: 0,away_pitcher,away_team_abbr,home_pitcher,home_team_abbr,xgb_probability,xgb_winner
24079,gonzama02,SEA,mckaybr01,TBR,0.512003,TBR
24080,gonzach01,COL,galleza01,ARI,0.521917,ARI
24081,lopezjo02,KCR,meansjo01,BAL,0.500788,BAL
24082,lauerer01,SDP,bauertr01,CIN,0.510078,CIN
24083,jacksed01,DET,mileywa01,HOU,0.517218,HOU
24084,novaiv01,CHW,gibsoky01,MIN,0.507571,MIN
24085,rossjo01,WSN,willitr01,PIT,0.491929,WSN
24086,davieza02,MIL,hudsoda02,STL,0.49687,MIL
24087,peterdi01,LAA,allarko01,TEX,0.502692,TEX
