In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Blanket filter: suppresses every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment /
# deprecation notices) raised by later cells -- consider narrowing it.
warnings.filterwarnings('ignore')
# Redundant with the blanket filter above (FutureWarning is already ignored);
# kept so FutureWarnings stay silenced if the blanket filter is ever removed.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
sys.path.append('../code')
import scrape, features
import dataframe_utilities as util

# Update DB & Get Today's Games

In [4]:
# Find the most recent date already stored, then start scraping from the day after.
# Only the `date` column is needed, so skip parsing the rest of the file.
game_dates = pd.read_csv('../data/game_summaries.csv', usecols=['date'])['date']
last_day = pd.to_datetime(game_dates).max()
if pd.isna(last_day):
    # An empty file would otherwise propagate NaT into the scraping loop below.
    raise ValueError("../data/game_summaries.csv contains no game dates")
get_day = last_day + pd.Timedelta(days=1)
get_day

Timestamp('2019-08-26 00:00:00')

In [5]:
# Backfill every day from `get_day` up to (but not including) today.
# `pd.datetime` was removed from pandas; `pd.Timestamp.today()` works on old and new versions.
today = pd.Timestamp.today().date()
while get_day.date() < today:
    for link in scrape.get_game_links(get_day):
        scrape.process_link(link)
    get_day += pd.Timedelta(days=1)

COL201908260 done
KCA201908260 done
MIA201908260 done
MIL201908260 done
PHI201908260 done
SDN201908260 done
SEA201908260 done
SFN201908260 done


In [6]:
# Fetch today's scheduled games via the project's scrape module; the result
# is used as the "test" frame for the rest of the notebook.
# NOTE(review): the "no pitcher" lines printed below appear to come from this
# call for games without a listed home starter -- confirm in scrape.py.
test_df = scrape.get_today_games()

no pitcher {'away_team_abbr': 'ATL', 'home_team_abbr': 'TOR', 'time': '7:07PM', 'away_pitcher': 'sorokmi01'}
no pitcher {'away_team_abbr': 'BOS', 'home_team_abbr': 'COL', 'time': '8:40PM', 'away_pitcher': 'porceri01'}


In [7]:
# Display today's games as returned by the scraper (before any feature columns are added).
test_df

Unnamed: 0,away_pitcher,away_team_abbr,home_pitcher,home_team_abbr,time,date
0,brookaa01,BAL,corbipa01,WSN,7:05PM,2019-08-27
1,braulst01,PIT,smylydr01,PHI,7:05PM,2019-08-27
2,sorokmi01,ATL,,TOR,7:07PM,2019-08-27
3,plutkad01,CLE,turnbsp01,DET,7:10PM,2019-08-27
4,darviyu01,CHC,stromma01,NYM,7:10PM,2019-08-27
5,castilu02,CIN,smithca03,MIA,7:10PM,2019-08-27
6,mikolmi01,STL,housead01,MIL,7:40PM,2019-08-27
7,mortoch02,TBR,verlaju01,HOU,8:10PM,2019-08-27
8,pinedmi01,MIN,giolilu01,CHW,8:10PM,2019-08-27
9,fiersmi01,OAK,montgmi01,KCR,8:15PM,2019-08-27


# Process Stats for Today's Games

## Merge test and train dfs

In [8]:
# Classify each game as day/night from its start-time string (e.g. '7:05PM').
# Built as a single boolean expression instead of chained assignment
# (`df[col][mask] = ...`), which is a silent no-op under pandas Copy-on-Write.
start_time = test_df['time']
first_digit = start_time.str[:1].astype('int')
# '10:xx' and '11:xx' begin with '1', so the second character is needed to
# tell them apart from 1 o'clock starts.
ten_or_eleven = start_time.str[1:2].isin(['0', '1'])
# Morning starts (e.g. '11:05AM') must never be flagged as night games.
morning = start_time.str.upper().str.contains('AM', na=False)
test_df['is_night_game'] = ((first_digit >= 5) | ten_or_eleven) & ~morning
test_df = test_df.drop(columns='time')

In [9]:
# Flag today's games as test rows; their outcome is unknown, hence NaN.
test_df = test_df.assign(is_test=True, home_team_win=np.nan)

# game_id = home abbreviation + date with the dashes removed + a trailing '0'.
compact_date = test_df['date'].astype('str').str.replace('-','')
test_df['game_id'] = test_df['home_team_abbr'] + compact_date + '0'
test_df.shape

(15, 9)

In [10]:
# Historical games from the features module, flagged as non-test rows.
df = features.get_game_df().assign(is_test=False)
df.shape

(24183, 9)

In [11]:
# Stack historical and today's games, then order chronologically with a fresh index.
df = (
    pd.concat([df, test_df])
    .sort_values(by='date')
    .reset_index(drop=True)
)
df.shape

(24198, 9)

## Add Features

In [12]:
# Apply the two feature builders in sequence: TrueSkill ratings, then rest durations.
df = (
    df
    .pipe(features.add_trueskill_ratings)
    .pipe(features.add_rest_durations)
)
df.shape

(24198, 18)

In [13]:
# Calendar features derived from the game date.
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month'] = date.dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` gives the same ISO week number; fall back to the old
# accessor on pandas versions that predate `isocalendar`.
if hasattr(date.dt, 'isocalendar'):
    df['week_num'] = date.dt.isocalendar().week.astype('int64')
else:
    df['week_num'] = date.dt.week
df['dow'] = date.dt.weekday.astype('int')

In [14]:
# Final character of game_id as a number (NaN when it is not numeric).
# Presumably the doubleheader game index, judging by the column name -- TODO confirm.
last_char = df['game_id'].str.slice(-1)
df['dh_game_no'] = pd.to_numeric(last_char, errors='coerce')

# Replace the date with whole seconds since the Unix epoch.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
df['date'] = (pd.to_datetime(df['date']) - unix_epoch) // one_second #epoch time

### Add Stats

In [15]:
# Load the four raw stat tables used by the rolling / season feature builders below.
games, batting, pitching, pitchers = (
    features.get_games(),
    features.get_batting(),
    features.get_pitching(),
    features.get_pitchers(),
)

#### Rolling 10 Day Stats

In [16]:
# Batting stats to feed into the rolling-window feature builder.
b_stats = [
    'batting_avg',
    'leverage_index_avg',
    'onbase_perc',
    'onbase_plus_slugging',
]
df = features.add_10RA_rolling(
    batting, df, b_stats, True, 'batting'
)

In [17]:
# Per-batter-faced rates: strikeouts, hits and walks divided by batters faced.
rate_sources = {
    'SO_batters_faced': 'SO',
    'H_batters_faced': 'H',
    'BB_batters_faced': 'BB',
}
for rate_col, count_col in rate_sources.items():
    pitching[rate_col] = pitching[count_col] / pitching['batters_faced']

# Rolling features for team pitching.
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(
    pitching, df, b_stats, True, 'team_pitching'
)

In [18]:
# Per-batter-faced rates for the pitchers table: strikeouts, hits and walks.
rate_sources = {
    'SO_batters_faced': 'SO',
    'H_batters_faced': 'H',
    'BB_batters_faced': 'BB',
}
for rate_col, count_col in rate_sources.items():
    pitchers[rate_col] = pitchers[count_col] / pitchers['batters_faced']

# Rolling features for the pitchers table.
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(
    pitchers, df, b_stats, False, 'pitcher'
)

#### Games Stats

In [19]:
# Add game-level features computed from the `games` table loaded above.
# NOTE: this cell reads the `games` table from features.get_games(); the odds
# cell further down rebinds the name `games`, so re-running this cell after
# that one requires reloading the table first.
df = features.game_stats(games,df)

In [20]:
# Sanity check: the row count should be unchanged; only feature columns are added.
df.shape

(24198, 82)

#### Season Stats

In [21]:
# Batting columns to feed into the season-rolling feature builder.
batting_stats = [
    'A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO',
    'batting_avg', 'leverage_index_avg',
    'onbase_perc', 'onbase_plus_slugging',
    'pitches', 're24_bat', 'slugging_perc', 'strikes_total',
    'wpa_bat', 'wpa_bat_neg', 'wpa_bat_pos',
]
df = features.add_season_rolling(
    batting, df, batting_stats, True, 'batting'
)
df.shape

(24198, 222)

In [22]:
# Pitching columns to feed into the season-rolling feature builder. The three
# *_batters_faced ratios are the ones derived in the rolling-stats cells above.
pitching_stats = [
    'BB', 'ER', 'H', 'HR', 'IP', 'R', 'SO', 'batters_faced',
    'earned_run_avg', 'game_score',
    'inherited_runners', 'inherited_score',
    'inplay_fb_total', 'inplay_gb_total', 'inplay_ld', 'inplay_unk',
    'leverage_index_avg', 'pitches', 're24_def',
    'strikes_contact', 'strikes_looking', 'strikes_swinging', 'strikes_total',
    'wpa_def',
    'SO_batters_faced', 'H_batters_faced', 'BB_batters_faced',
]
df = features.add_season_rolling(
    pitching, df, pitching_stats, True, 'team_pitching'
)
df.shape

(24198, 411)

In [23]:
# Same season-rolling features, built from the pitchers table; reuses the
# `pitching_stats` list defined in the previous cell and passes False where
# the team-level calls pass True.
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
df.shape

(24198, 600)

## Cleanup

In [24]:
# Handle missing values via the project's dataframe_utilities helper.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible from this notebook -- confirm against util.fix_na.
df = util.fix_na(df, False)

# Generate Predictions

In [33]:
# Build the prediction matrix from today's (test) rows.

# make columns match training data
# Only the header is needed, so read zero data rows instead of parsing the whole training CSV.
cols = pd.read_csv("../data/df_w_features.csv", nrows=0).columns

# Chained without `inplace=True`, so no derived slice is mutated in place and
# the cell can be re-run safely.
X_test = (
    df[df.is_test]
    .drop(columns=['is_test'])[cols]
    .drop(columns=['home_team_win', 'game_id'])
)

In [34]:
import pickle

# SECURITY: pickle.load can execute arbitrary code -- only load model files
# produced by this project, never ones from an untrusted source.
# The context manager closes the file handle (the original left it open).
with open('../data/encoder_model.pk', 'rb') as model_file:
    encoder, model = pickle.load(model_file)

# Keep X_test untouched so re-running this cell does not encode already-encoded data.
X_test_encoded = encoder.transform(X_test)
proba = model.predict_proba(X_test_encoded)[:,1]  # second predict_proba column (presumably the home-win class)
pred = model.predict(X_test_encoded)

In [35]:
test_df = test_df.sort_values(by=['date','game_id']).reset_index(drop=True)

# Start the prediction table from the test rows of `df`, which are in the same
# order as `proba` / `pred`. `rename` returns a new frame, so the assignments
# below never write into a slice of `df`.
pred_df = (
    df.loc[df.is_test, ['away_pitcher', 'home_pitcher', 'home_team_abbr', 'away_team_abbr']]
    .rename(columns={'home_team_abbr': 'home', 'away_team_abbr': 'away'})
)
pred_df['xgb_proba'] = proba

# Cast to bool explicitly: `~pred` on integer labels is a bitwise NOT
# (~1 == -2), which would not select the away-win rows.
home_wins = np.asarray(pred).astype(bool)
pred_df['xgb_winner'] = np.where(home_wins, pred_df['home'], pred_df['away'])

In [36]:
# get daily odds from covers.com
import requests
from bs4 import BeautifulSoup as bs

# covers.com short names that differ from the abbreviations used in pred_df['home']
COVERS_TO_ABBR = {'SD': 'SDP', 'KC': 'KCR', 'SF': 'SFG', 'WAS': 'WSN'}

# Timeout so the notebook cannot hang forever; fail loudly on an HTTP error
# instead of silently parsing an error page.
response = requests.get('https://www.covers.com/sports/mlb/matchups', timeout=30)
response.raise_for_status()
html = response.text
# Name the parser explicitly so results do not depend on which parsers are installed.
soup = bs(html, 'html.parser')

# Collected under a dedicated name so the `games` stats table loaded earlier is not overwritten.
odds_rows = []
for box in soup.find_all('div', {'class': 'cmg_matchup_game_box cmg_game_data'}):
    home = box.get('data-home-team-shortname-search')
    if home is None:
        continue  # malformed matchup box: nothing to join on
    odds_rows.append({
        'home': COVERS_TO_ABBR.get(home, home),
        'home_odds': box.get('data-game-odd'),
    })

# Explicit columns keep the merge working even when no matchup boxes were found.
# One row per home team: a duplicate (e.g. doubleheader) would otherwise multiply rows in the merge.
odds = (
    pd.DataFrame(odds_rows, columns=['home', 'home_odds'])
    .drop_duplicates(subset='home', keep='first')
)

# merge in the odds
# Drop odds left over from a previous run so re-running does not create home_odds_x/_y.
pred_df = pred_df.drop(columns=['home_odds'], errors='ignore')
pred_df = pd.merge(left=pred_df, right=odds, on='home', how='left')
pred_df['home_odds'] = pd.to_numeric(pred_df['home_odds'], errors='coerce')

In [37]:
# online proba
#https://www.bettingexpert.com/en-au/learn/understanding-betting-odds/how-to-convert-odds
# Moneyline -> implied probability for the home team:
#   positive odds: 100 / (odds + 100);  otherwise: -odds / (-odds + 100)
# np.where replaces the chained assignment, which is a silent no-op under
# pandas Copy-on-Write. Missing odds stay NaN.
home_odds = pred_df['home_odds']
pred_df['online_proba'] = np.where(
    home_odds > 0,
    100/(home_odds+100),
    -home_odds/(-home_odds+100),
)

# Confidence: distance of the favoured side's probability from 0.5, mapped to [0.5, 1].
pred_df['confidence'] = np.abs(pred_df['xgb_proba']-0.5)+.5
online_conf = np.abs(pred_df['online_proba']-0.5)+.5

# Rows where the model and the bookmaker favour opposite teams.
model_home = pred_df['xgb_proba'] > .5
model_away = pred_df['xgb_proba'] < .5
book_home = pred_df['online_proba'] > .5
book_away = pred_df['online_proba'] < .5
contrary = (model_home & book_away) | (model_away & book_home)

# The column mixes floats with the 'Contrary' label, so make it object dtype
# up front rather than writing a string into a float column.
conf_diff = (pred_df['confidence'] - online_conf).astype(object)
conf_diff.loc[contrary] = 'Contrary'
pred_df['conf_diff'] = conf_diff

In [38]:
# Look up the predicted winner's team name from its abbreviation, then drop
# the lookup columns that are no longer needed.
teams = pd.read_csv("../data/teams.csv")
pred_df = (
    pred_df
    .merge(teams, left_on='xgb_winner', right_on='Abbr', how='left')
    .assign(pred_winner=lambda merged: merged['Team'])
    .drop(columns=['xgb_winner', 'Abbr', 'Team'])
)

In [39]:
from IPython.display import HTML

# Show floats with three decimals in the rendered table.
pd.set_option('display.float_format', '{:.3f}'.format)

print("70% success rate for bets with confidence >0.6")

# Most confident predictions first, rendered without the index column.
ranked = pred_df.sort_values(by='confidence', ascending=False)
HTML(ranked.to_html(index=False))


70% success rate for bets with confidence >0.6


away_pitcher,home_pitcher,home,away,xgb_proba,home_odds,online_proba,confidence,conf_diff,pred_winner
plutkad01,turnbsp01,DET,CLE,0.238,140.0,0.417,0.762,0.179,Indians
mortoch02,verlaju01,HOU,TBR,0.712,-194.0,0.66,0.712,0.052,Astros
brookaa01,corbipa01,WSN,BAL,0.691,-345.0,0.775,0.691,-0.084,Nationals
mikolmi01,housead01,MIL,STL,0.31,-145.0,0.592,0.69,Contrary,Cardinals
minormi01,heanean01,LAA,TEX,0.677,-151.0,0.602,0.677,0.076,Angels
braulst01,smylydr01,PHI,PIT,0.668,-143.0,0.588,0.668,0.079,Phillies
porceri01,Unknown,COL,BOS,0.362,113.0,0.469,0.638,0.108,Red Sox
buehlwa01,quantca01,SDP,LAD,0.413,170.0,0.37,0.587,-0.042,Dodgers
castilu02,smithca03,MIA,CIN,0.425,140.0,0.417,0.575,-0.008,Reds
sorokmi01,Unknown,TOR,ATL,0.452,,,0.548,,Braves
