In [54]:
from concurrent.futures import ProcessPoolExecutor, Future
import pandas as pd
import numpy as np
import glob
import re
import json
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from recursive_selection import FeatureSelector

# Real World Gambling Test

The attempt to model baseball games with high accuracy has been a total failure so far. However, this may not mean there is not value to be discovered here. The methodology here will be simple: I will generate a model using a the dataset and building on the work of previous modeling notebook (see __01classification_win_loss.ipynb__) but instead of trying to accurately predict every game, I will look for games that have a higher probability of falling into a class (winning) with good value (a __+__ moneyline) and see if the model would have generated profit for a given season. The lesson here may be that there is no real value in trying to predict every game, but picking ones spots may be highly beneficial. 

# Preparing Data

The first challenge is taking historical odds data, which is publicly available at SportsbookReviews.com, and converting into a usable form for this analysis. I have designed a loop below that iterates through the .XSLX files and extracts the data I will need and returns Pandas DataFrames that are ready to merge with the primary dataframe.

In [3]:
#get file paths for MLB betting data
all_files = glob.glob('./historical_odds/*.xlsx')

#create dictionary to store dataframes 
odds_by_season = {}

#iterate over all .XSLX files
for file in all_files:
    #extract year from file name using regex
    match_pattern = '([\d]{4})'
    year = re.findall(match_pattern, file)[0]
    #years before 2014 do not have the run line, will worry about those ones later
    if int(year) < 2014:
        continue
    season_odds = pd.read_excel(file)
    #rename juice columns
    season_odds = season_odds.rename(columns = {'Unnamed: 18' : 'runline_juice',
                                       'Unnamed: 20' : 'total_juice_open',
                                       'Unnamed: 22' : 'total_juice_close'})
    #convert date to str, add in year which is missing, convert date column to datetime object
    season_odds['Date'] = season_odds['Date'].astype('str')
    season_odds['Date'] = season_odds['Date'] + year
    season_odds['Date'] = pd.to_datetime(season_odds['Date'], format = '%m%d%Y')
    #extract meaningful columns
    col_selected = season_odds[['Date', 'VH', 'Team', 'Open', 'Close', 'Run Line', 'runline_juice', 'Open OU',
                               'total_juice_open', 'Close OU', 'total_juice_close']]
    #split frame into neutral, road, and home games
    neutral_site = col_selected[col_selected.VH == 'N'].reset_index(drop = True)
    home_teams = col_selected[col_selected.VH == 'H'].reset_index(drop = True)
    road_teams = col_selected[col_selected.VH == 'V'].reset_index(drop = True)
    
    #rename home and road dataframe feature sets
    home_cols = ['date', 'home_flag', 'team1', 'home_opening', 'home_closing', 'home_runline', 
                'home_runline_juice', 'home_open_ou', 'home_ou_juice_open', 'home_close_ou', 'home_ou_juice_close']
    road_cols = ['date', 'home_flag', 'team2', 'road_opening', 'road_closing', 'road_runline', 
                'road_runline_juice', 'road_open_ou', 'road_ou_juice_open', 'road_close_ou', 'road_ou_juice_close']
    home_teams.columns = home_cols
    road_teams.columns = road_cols
    
    #drop redundant columns (from both), drop date from road frame 
    home_teams = home_teams.drop(columns = ['home_flag'])
    road_teams = road_teams.drop(columns = ['date', 'home_flag'])
    
    #merge home and road datafames
    full_season = pd.concat([home_teams, road_teams], axis = 1)
    
    #add feature for second game of double headers (this is necessary for merging later)
    full_season = full_season.assign(is_doubleheader = 0)
    game_counts = full_season.groupby('team1').date.value_counts()
    double_headers = game_counts[game_counts == 2]

    all_double_headers = []
    for j in double_headers.index:
        all_double_headers.append(j)

    for index in all_double_headers:
        game_indices = full_season[(full_season.team1 == index[0]) & (full_season.date == index[1])].index
        if len(game_indices) > 1:
            full_season.at[game_indices[1], 'is_doubleheader'] = 1
        else:
            print(index)
    #add to dataframe dict
    odds_by_season[year] = full_season

Now that the DataFrames are read in and in the format I want them to be in, I need to prepare the team codes are uniform and will be able to merge with the Elo DataFrame. To do this, I will create a baseline list of team codes used from the MLB odds archive dataframe, and ensure that only those team codes appear in each dataframe. 

In [4]:
#create baseline list of teams
base_teams = odds_by_season['2014'].team1.unique()

#create list to store tuples of problem team codes
problem_teams = []
#iterate through all keys and check for problem team codes (mistyped, changed name, etc.)
for key in odds_by_season.keys():
    home_teams = list(odds_by_season[key].team1.unique())
    road_teams = list(odds_by_season[key].team2.unique())
    for team in home_teams:
        if team not in base_teams:
            problem_teams.append((key, team))
    for team in road_teams:
        if team not in base_teams:
            problem_teams.append((key, team))
#view problem teams
problem_teams

[('2018', 'LAD'),
 ('2018', 'LAD'),
 ('2019', 'LAD'),
 ('2019', 'HOW'),
 ('2019', 'LAD'),
 ('2016', 'LAD')]

It appears that they switched from 'LOS' to 'LAD' for the Dodgers code, and a quick lookup shows that 'HOW' was a mistaken key and was supposed to be 'HOU'. Now, I will replace the problem teams and prepare to compare with the Elo frame. 

In [5]:
#create dict
trouble_dict = {'LOS' : 'LAD',
               'HOW' : 'HOU'}
#iterate through all dataframes
for key in odds_by_season.keys():
    odds_by_season[key] = odds_by_season[key].replace({'team1' : trouble_dict,
                                                      'team2' : trouble_dict})
    print(len(odds_by_season[key].team1.unique()))

30
30
30
30
30
30


Now that the team codes among the new odds DataFrames are consistent, now it is necessary to locate discrepancies between the team codes used in the primary dataframe and the new team codes. In order to do that, I will now read in the primary dataframe, and compare the team codes used in that frame against the new frames.

In [6]:
#read in primary dataframe 
primary_df = pd.read_csv('./all_features.csv.gz', compression = 'gzip')

#prepare primary dataframe 
primary_df['date'] = pd.to_datetime(primary_df['date'], format = '%Y-%m-%d')
primary_df['year'] = pd.DatetimeIndex(primary_df['date']).year
elo_codes = list(primary_df.team1.unique())

In [7]:
#empty list for trouble_elo codes
elo_trouble = []

#iterate through dataframes
for key in odds_by_season.keys():
    team_codes = list(odds_by_season[key].team1.unique())
    for team in team_codes:
        if team not in elo_codes and team not in elo_trouble:
            elo_trouble.append(team)
#view results
elo_trouble

['MIA', 'SDG', 'TAM', 'KAN', 'LAA', 'SFO', 'WAS', 'CWS', 'CUB']

In [8]:
#create dictionary mapping to elo team codes
elo_map = {'WAS' : 'WSN',
         'SDG' : 'SDP',
         'MIA' : 'FLA',
         'TAM' : 'TBD',
         'KAN' : 'KCR',
         'LAA' : 'ANA',
         'SFO' : 'SFG',
         'CWS' : 'CHW',
         'CUB' : 'CHC'}
#iterate through all dataframes
for key in odds_by_season.keys():
    odds_by_season[key] = odds_by_season[key].replace({'team1' : elo_map,
                                                      'team2' : elo_map})
    print(len(odds_by_season[key].team1.unique()))

30
30
30
30
30
30


Now, with the DataFrames fully prepared to merge, I will merge the odds data with the primary dataset. 

In [9]:
#create empty list to store dataframes
completed_frames = []

#create list of merge keys
merge_keys = ['date', 'team1', 'team2', 'is_doubleheader']
#iterate through df dict
for key in odds_by_season.keys():
    season_elo = primary_df[primary_df.year == int(key)].reset_index(drop = True)
    completed = season_elo.merge(odds_by_season[key], how = 'left', left_on = merge_keys, right_on = merge_keys)
    completed_frames.append(completed)
    print(len(completed[completed.home_opening.isnull()]))

0
5
0
4
2
0


The dataframe merging went smoothly, with only 11 games missed. After further observation, these $11$ games were neutral site games, and afer concatenating the frames, I will be dropping these observations. 

In [10]:
#concatenate frames
all_features_with_odds = pd.concat(completed_frames, axis = 0)
#drop rows with null values
all_features_with_odds = all_features_with_odds.dropna(axis = 0, how = 'any')
#sort by date
all_features_with_odds = all_features_with_odds.sort_values(by = ['date']).reset_index(drop = True)
#view frame
all_features_with_odds.head()


Unnamed: 0,date,is_doubleheader,is_tripleheader,home_OBPS,home_AVG_RUNS,home_AVG_H,home_BULLPEN_ERA,home_BULLPEN_WHIP,home_BULLPEN_AVG_INNINGS,home_total_OBPS,...,home_close_ou,home_ou_juice_close,road_opening,road_closing,road_runline,road_runline_juice,road_open_ou,road_ou_juice_open,road_close_ou,road_ou_juice_close
0,2014-03-30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.5,-115.0,-105,-115.0,-1.5,150.0,6.5,-120.0,6.5,-105.0
1,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.5,-120.0,-165,-179.0,-1.5,-105.0,6.5,-115.0,6.5,100.0
2,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,-106.0,120,130.0,1.5,-175.0,7.0,-105.0,7.0,-114.0
3,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.5,-105.0,-115,-105.0,1.5,-215.0,8.0,-110.0,7.5,-115.0
4,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,-125.0,-115,-115.0,-1.5,140.0,8.5,-115.0,9.0,105.0


In [20]:
#isolate vegas features from team based features 
odds_features = list(full_.columns[105:122])
odds_frame = all_features_with_odds[odds_features]
#view datatypes
odds_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14550 entries, 0 to 14549
Data columns (total 16 columns):
home_opening           14550 non-null object
home_closing           14550 non-null float64
home_runline           14550 non-null float64
home_runline_juice     14550 non-null float64
home_open_ou           14550 non-null float64
home_ou_juice_open     14550 non-null float64
home_close_ou          14550 non-null float64
home_ou_juice_close    14550 non-null float64
road_opening           14550 non-null object
road_closing           14550 non-null float64
road_runline           14550 non-null float64
road_runline_juice     14550 non-null float64
road_open_ou           14550 non-null float64
road_ou_juice_open     14550 non-null float64
road_close_ou          14550 non-null float64
road_ou_juice_close    14550 non-null float64
dtypes: float64(14), object(2)
memory usage: 1.8+ MB


For some reason, home opening and road opening lines have been stored as objects. I need to investigate what is happening with these features.

In [21]:
try:
    odds_frame['home_opening'] = odds_frame['home_opening'].astype('int64')
except Exception as e:
    print(e)

invalid literal for int() with base 10: 'NL'


In [22]:
#investigate problem observations
problem_lines = all_features_with_odds[(all_features_with_odds.home_opening == 'NL') |
                                      (all_features_with_odds.road_opening == 'NL')]
#view problem observations
problem_lines

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_OBPS,home_AVG_RUNS,home_AVG_H,home_BULLPEN_ERA,home_BULLPEN_WHIP,home_BULLPEN_AVG_INNINGS,home_total_OBPS,...,home_close_ou,home_ou_juice_close,road_opening,road_closing,road_runline,road_runline_juice,road_open_ou,road_ou_juice_open,road_close_ou,road_ou_juice_close
12329,2019-04-13,0,0,0.757488,4.666667,8.5,11.17434,2.327988,3.221667,0.797698,...,9.0,-120.0,NL,109.0,1.5,-195.0,8.5,-110.0,9.0,100.0
12403,2019-04-18,0,0,0.686178,3.777778,7.444444,0.710526,0.710526,4.222222,0.798646,...,8.5,-105.0,NL,220.0,1.5,106.0,8.5,-110.0,8.5,-115.0
12428,2019-04-20,0,0,0.594466,4.166667,6.333333,2.612482,1.112724,3.445,0.603827,...,8.5,-120.0,NL,124.0,1.5,-180.0,7.5,-110.0,8.5,100.0
12525,2019-04-28,0,0,0.819577,4.9,9.9,3.96,1.46,5.0,0.761468,...,8.0,-115.0,NL,-120.0,-1.5,140.0,8.5,-110.0,8.0,-105.0
12568,2019-05-01,0,0,0.748242,5.1875,8.625,5.883339,1.609118,3.72875,0.736517,...,8.0,-110.0,NL,-220.0,-1.5,-145.0,8.5,-110.0,8.0,-110.0


The opening lines are missing from these games. However, the closing lines are not. Instead of removing the observations, I am going to substitute in the value of the closing line for the opening line.

In [23]:
problem_indices = all_features_with_odds[(all_features_with_odds.home_opening == 'NL') |
                                      (all_features_with_odds.road_opening == 'NL')].index
for index in problem_indices:
    all_features_with_odds.at[index, 'home_opening'] = all_features_with_odds.iloc[index]['home_closing']
    all_features_with_odds.at[index, 'road_opening'] = all_features_with_odds.iloc[index]['road_closing']

In [24]:
#save dataframe
all_features_with_odds.to_csv('./all_features_with_odds.csv.gz', compression = 'gzip', index = False)

# end data prep

In [4]:
#read in prepared DataFrame 
full_ = pd.read_csv('./all_features_with_odds.csv.gz', compression = 'gzip')
#convert date to datetime object
full_['date'] = pd.to_datetime(full_['date'], format = '%Y-%m-%d')

In [5]:
#change pandas option to view all features
pd.set_option('max.columns', 125)
#view frame
full_.head()

Unnamed: 0,date,is_doubleheader,is_tripleheader,home_OBPS,home_AVG_RUNS,home_AVG_H,home_BULLPEN_ERA,home_BULLPEN_WHIP,home_BULLPEN_AVG_INNINGS,home_total_OBPS,home_total_AVG_RUNS,home_total_AVG_H,home_total_BULLPEN_ERA,home_total_BULLPEN_WHIP,home_total_BULLPEN_AVG_INNINGS,road_OBPS,road_AVG_RUNS,road_AVG_H,road_BULLPEN_ERA,road_BULLPEN_WHIP,road_BULLPEN_AVG_INNINGS,road_total_OBPS,road_total_AVG_RUNS,road_total_AVG_H,road_total_BULLPEN_ERA,road_total_BULLPEN_WHIP,road_total_BULLPEN_AVG_INNINGS,home_starter,home_career_ERA,home_career_WHIP,home_career_AVGIP,home_career_ERA_AH,home_career_WHIP_AH,home_career_AVGIP_AH,home_season_ERA,home_season_WHIP,home_season_AVGIP,home_season_ERA_AH,home_season_WHIP_AH,home_season_AVGIP_AH,road_starter,road_career_ERA,road_career_WHIP,road_career_AVGIP,road_career_ERA_OR,road_career_WHIP_OR,road_career_AVGIP_OR,road_season_ERA,road_season_WHIP,road_season_AVGIP,road_season_ERA_OR,road_season_WHIP_OR,road_season_AVGIP_OR,team1,team2,home_loss,elo1_pre,elo2_pre,elo_prob1,elo_prob2,rating1_pre,rating2_pre,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,score1,score2,pitching_park_factor,batting_park_factor,TMAX,PRCP,SNOW,SNWD,attendance,current_streak_hm_tm,current_streak_hm_at_hm,home_record_hm,run_differential_hm,avg_margin_hm,distance_traveled,current_streak_rd_tm,current_streak_rd_tm_on_rd,rd_record_rd,run_differential_rd,avg_margin_rd,roof_closed,is_retractable,is_wildcard,is_divisional,is_championship,is_world_series,is_daygame,is_march,is_april,is_may,is_june,is_july,is_august,is_september,is_october,is_november,year,home_opening,home_closing,home_runline,home_runline_juice,home_open_ou,home_ou_juice_open,home_close_ou,home_ou_juice_close,road_opening,road_closing,road_runline,road_runline_juice,road_open_ou,road_ou_juice_open,road_close_ou,road_ou_juice_close
0,2014-03-30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.769012,5.0,9.0,7.109005,2.21169,3.165,0.769012,5.0,9.0,7.109005,2.21169,3.165,casha001,3.215468,1.103347,5.947813,2.52809,1.0,6.357143,0.0,0.0,0.0,0.0,0.0,0.0,ryu-h001,2.924006,1.187878,6.354516,3.510351,1.20012,6.249375,0.0,0.6,5.0,0.0,0.6,5.0,SDP,LAD,0,1492.867,1537.084,0.470938,0.529062,1491.997,1533.625,53.721,52.503,24.4776,0.1316,0.509215,0.490785,3.0,1.0,92.0,92.0,200.0,0.0,0.0,0.0,45567.0,0.0,0.0,0.0,0.0,0.0,182.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,-105.0,105.0,1.5,-170.0,6.5,100.0,6.5,-115.0,-105.0,-115.0,-1.5,150.0,6.5,-120.0,6.5,-105.0
1,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,gee-d001,3.843994,1.297373,6.15679,3.23166,1.235521,6.317073,0.0,0.0,0.0,0.0,0.0,0.0,stras001,2.963253,1.072942,5.790933,3.425824,1.171992,5.704571,0.0,0.0,0.0,0.0,0.0,0.0,NYM,WSN,1,1486.573,1520.297,0.48601,0.51399,1488.144,1520.107,51.389,55.551,5.358,19.4768,0.474005,0.525995,7.0,9.0,94.0,94.0,133.0,53.0,0.0,0.0,42442.0,0.0,0.0,0.0,0.0,0.0,342.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,145.0,159.0,1.5,-115.0,6.5,-105.0,6.5,-120.0,-165.0,-179.0,-1.5,-105.0,6.5,-115.0,6.5,100.0
2,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,grays001,2.85,1.133333,6.0,1.99164,0.93435,6.778333,0.0,0.0,0.0,0.0,0.0,0.0,mastj001,4.131346,1.378198,6.1578,4.996183,1.48729,5.922329,0.0,0.0,0.0,0.0,0.0,0.0,OAK,CLE,1,1539.804,1518.841,0.564348,0.435652,1542.484,1520.415,51.444,54.744,0.1833,17.0328,0.547669,0.452331,0.0,2.0,97.0,98.0,139.0,145.0,0.0,0.0,36067.0,0.0,0.0,0.0,0.0,0.0,3466.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,-135.0,-145.0,-1.5,155.0,7.0,-115.0,7.0,-106.0,120.0,130.0,1.5,-175.0,7.0,-105.0,7.0,-114.0
3,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schet001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lee-c003,3.517918,1.188584,6.654952,3.687615,1.202648,6.548696,0.0,0.0,0.0,0.0,0.0,0.0,TEX,PHI,1,1526.809,1477.388,0.604116,0.395884,1528.187,1475.318,47.4,59.728,-19.7635,49.6837,0.514708,0.485292,10.0,14.0,104.0,104.0,272.0,0.0,0.0,0.0,49031.0,0.0,0.0,0.0,0.0,0.0,2113.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,105.0,-105.0,-1.5,185.0,8.0,-110.0,7.5,-105.0,-115.0,-105.0,1.5,-215.0,8.0,-110.0,7.5,-115.0
4,2014-03-31,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tillc001,4.261886,1.325497,5.63131,4.250521,1.301961,5.732439,0.0,0.0,0.0,0.0,0.0,0.0,lestj001,3.769387,1.305179,6.251318,3.738755,1.276514,6.213504,0.0,0.0,0.0,0.0,0.0,0.0,BAL,BOS,0,1516.659,1558.985,0.473651,0.526349,1514.563,1558.601,54.138,57.534,21.0466,26.4422,0.466616,0.533384,2.0,1.0,103.0,103.0,183.0,0.0,0.0,0.0,30155.0,0.0,0.0,0.0,0.0,0.0,576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,105.0,105.0,1.5,-160.0,8.5,-105.0,9.0,-125.0,-115.0,-115.0,-1.5,140.0,8.5,-115.0,9.0,105.0


In [6]:
#isolate vegas features from team based features 
odds_features = list(full_.columns[105:122])

In [7]:
odds_features

['home_opening',
 'home_closing',
 'home_runline',
 'home_runline_juice',
 'home_open_ou',
 'home_ou_juice_open',
 'home_close_ou',
 'home_ou_juice_close',
 'road_opening',
 'road_closing',
 'road_runline',
 'road_runline_juice',
 'road_open_ou',
 'road_ou_juice_open',
 'road_close_ou',
 'road_ou_juice_close']

# Experiment with predicting power using only odds, no team statistics whatsoever

In [8]:
#isolate odds 
odds_frame = full_[odds_features]

In [9]:
#view potential features
odds_frame.head()

Unnamed: 0,home_opening,home_closing,home_runline,home_runline_juice,home_open_ou,home_ou_juice_open,home_close_ou,home_ou_juice_close,road_opening,road_closing,road_runline,road_runline_juice,road_open_ou,road_ou_juice_open,road_close_ou,road_ou_juice_close
0,-105.0,105.0,1.5,-170.0,6.5,100.0,6.5,-115.0,-105.0,-115.0,-1.5,150.0,6.5,-120.0,6.5,-105.0
1,145.0,159.0,1.5,-115.0,6.5,-105.0,6.5,-120.0,-165.0,-179.0,-1.5,-105.0,6.5,-115.0,6.5,100.0
2,-135.0,-145.0,-1.5,155.0,7.0,-115.0,7.0,-106.0,120.0,130.0,1.5,-175.0,7.0,-105.0,7.0,-114.0
3,105.0,-105.0,-1.5,185.0,8.0,-110.0,7.5,-105.0,-115.0,-105.0,1.5,-215.0,8.0,-110.0,7.5,-115.0
4,105.0,105.0,1.5,-160.0,8.5,-105.0,9.0,-125.0,-115.0,-115.0,-1.5,140.0,8.5,-115.0,9.0,105.0


In [10]:
#get target features from full dataframe, concatentate them to my dataframe 
targets = full_[['home_loss', 'score1', 'score2']]
odds_frame = pd.concat([odds_frame, targets], axis = 1)

In [11]:
#create score differential and total feature to look into modeling with runline
odds_frame['score_differential'] = odds_frame['score1'] - odds_frame['score2']
odds_frame['game_total'] = odds_frame['score1'] + odds_frame['score2']
#get rid of score1 and score2 column
odds_frame = odds_frame.drop(columns = ['score1', 'score2'])

In [12]:
odds_frame.head()

Unnamed: 0,home_opening,home_closing,home_runline,home_runline_juice,home_open_ou,home_ou_juice_open,home_close_ou,home_ou_juice_close,road_opening,road_closing,road_runline,road_runline_juice,road_open_ou,road_ou_juice_open,road_close_ou,road_ou_juice_close,home_loss,score_differential,game_total
0,-105.0,105.0,1.5,-170.0,6.5,100.0,6.5,-115.0,-105.0,-115.0,-1.5,150.0,6.5,-120.0,6.5,-105.0,0,2.0,4.0
1,145.0,159.0,1.5,-115.0,6.5,-105.0,6.5,-120.0,-165.0,-179.0,-1.5,-105.0,6.5,-115.0,6.5,100.0,1,-2.0,16.0
2,-135.0,-145.0,-1.5,155.0,7.0,-115.0,7.0,-106.0,120.0,130.0,1.5,-175.0,7.0,-105.0,7.0,-114.0,1,-2.0,2.0
3,105.0,-105.0,-1.5,185.0,8.0,-110.0,7.5,-105.0,-115.0,-105.0,1.5,-215.0,8.0,-110.0,7.5,-115.0,1,-4.0,24.0
4,105.0,105.0,1.5,-160.0,8.5,-105.0,9.0,-125.0,-115.0,-115.0,-1.5,140.0,8.5,-115.0,9.0,105.0,0,1.0,3.0


Now that I have score differential and game total, before generating regression models on this data, I will start with binary classification to determine if the home team covered and if the total went over.

In [13]:
class CoverCalculator():
    def __init__(self, df):
        self.df = df
        self.home_runline_cover = []
        self.total_over = []
        
    def runline_cover(self):
        for k in range(len(self.df)):
            if self.df.iloc[k]['home_runline'] < 0:
                if self.df.iloc[k]['score_differential'] >= (-1 * self.df.iloc[k]['home_runline']):
                    self.home_runline_cover.append(1)
                else:
                    self.home_runline_cover.append(0)
            else:
                if self.df.iloc[k]['home_loss'] == 0:
                    self.home_runline_cover.append(1)
                elif (-1 * self.df.iloc[k]['score_differential']) < self.df.iloc[k]['home_runline']:
                    self.home_runline_cover.append(1)
                else:
                    self.home_runline_cover.append(0)
        return(self.home_runline_cover)
    
    def over(self):
        for k in range(len(self.df)):
            if self.df.iloc[k]['home_close_ou'] < self.df.iloc[k]['game_total']:
                self.total_over.append(1)
            else:
                self.total_over.append(0)
        return(self.total_over)

In [14]:
#iterate through dataframe and calculate binary target variables
calc = CoverCalculator(odds_frame)
runline_covers = calc.runline_cover()
totals = calc.over()

In [15]:
#create series objects and concatenate them to dataframe 
runline_covers = pd.Series(runline_covers)
totals = pd.Series(totals)
odds_frame = pd.concat([odds_frame, runline_covers, totals], axis = 1)

In [16]:
#rename new columns
odds_frame = odds_frame.rename(columns = {0 : 'home_runline_cover', 1 : 'over_hit'})

In [17]:
#drop redundant columns
odds_frame = odds_frame.drop(columns = ['road_open_ou', 'road_ou_juice_open', 'road_close_ou',
                                       'road_ou_juice_close'])
odds_frame = odds_frame.rename(columns = {'home_open_ou' : 'opening_total', 'home_ou_juice_open' : 'total_juice_open',
                                         'home_close_ou' : 'closing_total', 'home_ou_juice_close' : 'total_juice_close'})

In [18]:
#drop regression targets 
odds_frame = odds_frame.drop(columns = ['score_differential', 'game_total'])

In [19]:
#extract modeling features from potential targets
X = odds_frame.drop(columns = ['home_loss', 'home_runline_cover', 'over_hit'])
y1 = odds_frame['home_loss']
y2 = odds_frame['home_runline_cover']
y3 = odds_frame['over_hit']

## Baseline Modeling

In [20]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y1, test_size = 0.25, random_state = 43)
lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
lr.fit(xtrain, ytrain)
y_preds = lr.predict(xtest)
print('Baseline win/loss accuracy {}'.format(accuracy_score(ytest, y_preds)))

Baseline win/loss accuracy 0.5678944474986256


In [21]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y2, test_size = 0.25, random_state = 43)
lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
lr.fit(xtrain, ytrain)
y_preds = lr.predict(xtest)
print('Baseline win/loss accuracy {}'.format(accuracy_score(ytest, y_preds)))

Baseline win/loss accuracy 0.5907091808686091


In [22]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y3, test_size = 0.25, random_state = 43)
lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
lr.fit(xtrain, ytrain)
y_preds = lr.predict(xtest)
print('Baseline win/loss accuracy {}'.format(accuracy_score(ytest, y_preds)))

Baseline win/loss accuracy 0.5277625068719076


In [23]:
#check feature subsets
selector = FeatureSelector(X, y2, algorithm = LGBMClassifier(), params = {'max_depth' : 5, 'num_leaves' : 50},
                          verbose = 2)
selector.recursive_selection()

2 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration
1 features have been dropped, moving to next iteration


'Cannot reduce feature frame anymore. Reduce drop size if desired'

In [24]:
selector.best_subset

Index(['home_opening', 'home_closing', 'home_runline_juice'], dtype='object')

This did not yield any more or less predictive power.

Detour to determine which statistics (if any) are relevant in predicting win/loss, covering runline, etc. 

In [25]:
non_odds_features = list(full_.columns[:105])
no_odds_frame = full_[non_odds_features]

In [26]:
no_odds_frame = no_odds_frame.drop(columns = ['date', 'is_doubleheader', 'is_tripleheader', 'home_starter',
                                             'road_starter', 'team1', 'team2', 'home_loss', 'score1', 'score2',
                                             'roof_closed', 'is_retractable', 'is_wildcard', 'is_divisional',
                                             'is_championship', 'is_world_series', 'is_daygame', 'is_march',
                                             'is_april', 'is_may', 'is_june', 'is_july', 'is_august', 'is_september',
                                             'is_october', 'is_november', 'year'])
no_odds_frame.head()

Unnamed: 0,home_OBPS,home_AVG_RUNS,home_AVG_H,home_BULLPEN_ERA,home_BULLPEN_WHIP,home_BULLPEN_AVG_INNINGS,home_total_OBPS,home_total_AVG_RUNS,home_total_AVG_H,home_total_BULLPEN_ERA,home_total_BULLPEN_WHIP,home_total_BULLPEN_AVG_INNINGS,road_OBPS,road_AVG_RUNS,road_AVG_H,road_BULLPEN_ERA,road_BULLPEN_WHIP,road_BULLPEN_AVG_INNINGS,road_total_OBPS,road_total_AVG_RUNS,road_total_AVG_H,road_total_BULLPEN_ERA,road_total_BULLPEN_WHIP,road_total_BULLPEN_AVG_INNINGS,home_career_ERA,home_career_WHIP,home_career_AVGIP,home_career_ERA_AH,home_career_WHIP_AH,home_career_AVGIP_AH,home_season_ERA,home_season_WHIP,home_season_AVGIP,home_season_ERA_AH,home_season_WHIP_AH,home_season_AVGIP_AH,road_career_ERA,road_career_WHIP,road_career_AVGIP,road_career_ERA_OR,road_career_WHIP_OR,road_career_AVGIP_OR,road_season_ERA,road_season_WHIP,road_season_AVGIP,road_season_ERA_OR,road_season_WHIP_OR,road_season_AVGIP_OR,elo1_pre,elo2_pre,elo_prob1,elo_prob2,rating1_pre,rating2_pre,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,pitching_park_factor,batting_park_factor,TMAX,PRCP,SNOW,SNWD,attendance,current_streak_hm_tm,current_streak_hm_at_hm,home_record_hm,run_differential_hm,avg_margin_hm,distance_traveled,current_streak_rd_tm,current_streak_rd_tm_on_rd,rd_record_rd,run_differential_rd,avg_margin_rd
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.769012,5.0,9.0,7.109005,2.21169,3.165,0.769012,5.0,9.0,7.109005,2.21169,3.165,3.215468,1.103347,5.947813,2.52809,1.0,6.357143,0.0,0.0,0.0,0.0,0.0,0.0,2.924006,1.187878,6.354516,3.510351,1.20012,6.249375,0.0,0.6,5.0,0.0,0.6,5.0,1492.867,1537.084,0.470938,0.529062,1491.997,1533.625,53.721,52.503,24.4776,0.1316,0.509215,0.490785,92.0,92.0,200.0,0.0,0.0,0.0,45567.0,0.0,0.0,0.0,0.0,0.0,182.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.843994,1.297373,6.15679,3.23166,1.235521,6.317073,0.0,0.0,0.0,0.0,0.0,0.0,2.963253,1.072942,5.790933,3.425824,1.171992,5.704571,0.0,0.0,0.0,0.0,0.0,0.0,1486.573,1520.297,0.48601,0.51399,1488.144,1520.107,51.389,55.551,5.358,19.4768,0.474005,0.525995,94.0,94.0,133.0,53.0,0.0,0.0,42442.0,0.0,0.0,0.0,0.0,0.0,342.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.85,1.133333,6.0,1.99164,0.93435,6.778333,0.0,0.0,0.0,0.0,0.0,0.0,4.131346,1.378198,6.1578,4.996183,1.48729,5.922329,0.0,0.0,0.0,0.0,0.0,0.0,1539.804,1518.841,0.564348,0.435652,1542.484,1520.415,51.444,54.744,0.1833,17.0328,0.547669,0.452331,97.0,98.0,139.0,145.0,0.0,0.0,36067.0,0.0,0.0,0.0,0.0,0.0,3466.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.517918,1.188584,6.654952,3.687615,1.202648,6.548696,0.0,0.0,0.0,0.0,0.0,0.0,1526.809,1477.388,0.604116,0.395884,1528.187,1475.318,47.4,59.728,-19.7635,49.6837,0.514708,0.485292,104.0,104.0,272.0,0.0,0.0,0.0,49031.0,0.0,0.0,0.0,0.0,0.0,2113.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.261886,1.325497,5.63131,4.250521,1.301961,5.732439,0.0,0.0,0.0,0.0,0.0,0.0,3.769387,1.305179,6.251318,3.738755,1.276514,6.213504,0.0,0.0,0.0,0.0,0.0,0.0,1516.659,1558.985,0.473651,0.526349,1514.563,1558.601,54.138,57.534,21.0466,26.4422,0.466616,0.533384,103.0,103.0,183.0,0.0,0.0,0.0,30155.0,0.0,0.0,0.0,0.0,0.0,576.0,0.0,0.0,0.0,0.0,0.0


The greatest challenge of this work is selecting which, if any features, have a good predictive power. Thus far, I have not been able to isolate a feature set that is able to establish a decision boundary in Win/Loss classification, Score Differential regression, Runline Cover classification, or Total Over classification. My methodology has been certained on feature importance, with little success. I have isolated features into categories of feature to determine if any subset of these features will yield a usable decision boundary, and then I will begin adding subsets of features together to determine if there is interaction between subsets that can yield predictive power.

In [27]:
stat_cols = pd.Series(no_odds_frame.columns)
OBPS = list(stat_cols[stat_cols.str.contains('OBPS')].values)
RUNS = list(stat_cols[stat_cols.str.contains('RUNS')].values)
HITS = list(stat_cols[stat_cols.str.contains('AVG_H')].values)
STARTER_ERA = list(stat_cols[(stat_cols.str.contains('ERA')) & ~(stat_cols.str.contains('BULLPEN'))].values)
STARTER_WHIP = list(stat_cols[(stat_cols.str.contains("WHIP")) & ~(stat_cols.str.contains("BULLPEN"))].values)
STARTER_AVGIP = list(stat_cols[stat_cols.str.contains("AVGIP")].values)
BULLPEN_ERA = list(stat_cols[(stat_cols.str.contains('ERA')) & (stat_cols.str.contains('BULLPEN'))].values)
BULLPEN_WHIP = list(stat_cols[(stat_cols.str.contains("WHIP")) & (stat_cols.str.contains("BULLPEN"))].values)
BULLPEN_AVGIP = list(stat_cols[stat_cols.str.contains("AVG_INNINGS")].values)
ELORatings = ['elo1_pre', 'elo2_pre']
ADJRatings = ['rating1_pre', 'rating2_pre']
PITCHERratings = ['pitcher1_rgs', 'pitcher2_rgs', 'pitcher1_adj', 'pitcher2_adj']
ELOProbs = ['elo_prob1', 'elo_prob2']
ADJProbs = ['rating_prob1', 'rating_prob2']
EXTERNALS = list(stat_cols.iloc[60:67].values)
MOMENTUM = list(stat_cols.iloc[67:].values)

eval_dict = {'OBPS' : OBPS,
            'RUNS' : RUNS,
            'HITS' : HITS,
            'STARTER_ERA' : STARTER_ERA,
            'STARTER_WHIP' : STARTER_WHIP,
            'STARTER_AVGIP' : STARTER_AVGIP,
            'BULLPEN_ERA' : BULLPEN_ERA,
            'BULLPEN_WHIP' : BULLPEN_WHIP,
            'BULLPEN_AVGIP' : BULLPEN_AVGIP,
            'ELORatings' : ELORatings,
            'ADJRatings' : ADJRatings,
            'PITCHERratings' : PITCHERratings,
            'ELOProbs' : ELOProbs,
            'ADJProbs' : ADJProbs,
            'EXTERNALS' : EXTERNALS,
            'MOMENTUM' : MOMENTUM}

In [61]:
class SubsetSampler():
    def __init__(self, X, y, all_subsets, algorithm = LogisticRegression(), eval_type = 'roc', params = None,
                 n_jobs = None, cv = 5, VIF = False, vif_tolerance = 5.0):
        self.X = X
        self.y = y
        self.all_subsets = all_subsets
        self.algorithm = algorithm
        self.eval_type = eval_type
        self.params = params
        self.n_jobs = n_jobs
        self.cv = cv
        self.vif_tolerance = vif_tolerance
        self.VIF = VIF
        self.evals = []
        self.current_eval = 0.0
        self.best_eval = 0.0
        self.current_feature_set = None
        self.best_feature_set = None
        self.results_list = []
        self.results_frame = None
    
    def sample(self):
        for category in self.all_subsets.keys():
            self.current_feature_set = self.all_subsets[category]
            self.cross_validation()
            if self.current_eval >= self.best_eval:
                self.best_eval = self.current_eval
                self.best_feature_set = self.current_feature_set
            self.results_list.append(self.current_eval)
        self.results_frame = pd.DataFrame({'category' : list(self.all_subsets.keys()), 'score' : self.results_list})
    
    def cross_validation(self):
        X_ = self.X[self.current_feature_set]
        if self.VIF:
            to_drop = self.calculate_vif(X_, self.vif_tolerance)
            X_ = X_.drop(columns = to_drop)
        kf = KFold(n_splits = self.cv)
        results = []
        with ProcessPoolExecutor(max_workers = self.n_jobs) as executor:
            for train, test in kf.split(X_):
                xtrain, xtest = X_.iloc[train], X_.iloc[test]
                ytrain, ytest = self.y.iloc[train], self.y.iloc[test]
                score = self.train_test_eval(xtrain, xtest, ytrain, ytest, self.algorithm,
                                             self.eval_type, self.params)
                results.append(score / self.cv)
        self.current_eval = sum(results)
            
    @staticmethod
    def train_test_eval(xtrain, xtest, ytrain, ytest, algorithm, eval_type, params):
        if params:
            algo = algorithm.set_params(**params)
        else:
            algo = algorithm
        algo.fit(xtrain, ytrain)
        ypreds = algo.predict(xtest)
        if eval_type == 'roc':
            score = roc_auc_score(ytest, ypreds)
        elif eval_type == 'accuracy':
            score = accuracy_score(ytest, ypreds)
        elif eval_type == 'precision':
            score = precision_score(ytest, ypreds)
        return(score)
        
    @staticmethod
    def calculate_vif(X, tol):
        dropped_list = []
        while True:
            X = X.assign(constant = 1)
            X_cols = list(X.columns)
            npX = np.array(X)
            vif = [variance_inflation_factor(npX, i) for i in np.arange(npX.shape[1])]
            vif_ = pd.Series(vif, index = X_cols)
            vif_ = vif_.drop('constant')
            max_vif = vif_.idxmax()
            if vif_.max() > tol:
                X = X.drop(columns = [max_vif, 'constant'])
                dropped_list.append(max_vif)
            else:
                return(dropped_list)
        

In [62]:
all_results = []
for target in(y1, y2, y3):
    sampler = SubsetSampler(no_odds_frame, target, eval_dict, params = {'solver' : 'lbfgs', 'max_iter' : 1000},
                           VIF = True)
    sampler.sample()
    all_results.append(sampler.results_frame)
frame = pd.concat(all_results, axis = 1)
frame.columns = ['category', 'win_loss_score', 'DROP1', 'runline_score', 'DROP2', 'total_score']
frame = frame.drop(columns = ['DROP1', 'DROP2'])
frame

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Unnamed: 0,category,win_loss_score,runline_score,total_score
0,OBPS,0.513234,0.499649,0.49895
1,RUNS,0.517848,0.499094,0.498941
2,HITS,0.502409,0.499503,0.499968
3,STARTER_ERA,0.502093,0.499793,0.499969
4,STARTER_WHIP,0.501922,0.499715,0.499903
5,STARTER_AVGIP,0.509161,0.500102,0.502802
6,BULLPEN_ERA,0.510143,0.499755,0.499492
7,BULLPEN_WHIP,0.513517,0.500131,0.499266
8,BULLPEN_AVGIP,0.506181,0.501834,0.50106
9,ELORatings,0.55086,0.500312,0.5


In [None]:
b

In [10]:
#remove odds 
without_odds = full_.drop(columns = odds_features)
#generate score differential Series, will be used in future model
score_differential = full_.score1 - full_.score2
#prepare dataframe for modeling by removing merge keys 
drop_cols = ['date', 'is_doubleheader', 'is_tripleheader', 'home_starter', 'road_starter', 'team1', 'team2', 
            'elo_prob2', 'rating_prob2', 'home_loss', 'score1', 'score2', 'year']
#generate feature frame 
X = without_odds.drop(columns = drop_cols)
#generate target vector
y = without_odds['home_loss']

In [13]:
#baseline Logistic Regression model 
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 43)
#instantiate Logistic Regression model
lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
#fit Logistic Regression model
lr.fit(x_train, y_train)
#generate validation predictions on hold out set
y_preds = lr.predict(x_test)
base_accuracy = accuracy_score(y_test, y_preds)
print('The baseline accuracy is {}'.format(base_accuracy))

The baseline accuracy is 0.5726397800183318


The baseline accuracy is exactly what I expected after the prior research in the __01classificiation_win_loss.ipynb__ notebook. I will run through feature selection and parameter tuning on this reduced dataset, and I will also try the large scale parameters generated in the previous notebook. 

In [15]:
#instantiate FeatureSelector object from recursive_selection.py file
selector = FeatureSelector(X, y, algorithm = LGBMClassifier(), params = {'num_leaves' : 50,
                                                                        'max_depth' : 3}, drop_size = 10)
#iterate through feature selection 
selector.recursive_selection()

'Cannot reduce feature frame anymore. Reduce drop size if desired'

In [18]:
#instantiate FeatureSelector with different parameters
selector = FeatureSelector(X, y, algorithm = LGBMClassifier(), params = {'num_leaves' : 50,
                                                                        'max_depth' : 3}, VIF = True, VIF_tol = 5.0,
                          drop_size = 10)
selector.recursive_selection()

'Cannot reduce feature frame anymore. Reduce drop size if desired'

The performance is no better (in fact it is worse) than the full dataframe. Now I will begin the experimentation with Feature Engineering and adding gambling data.

In [21]:
#generate list of all features
all_features = pd.Series(X.columns)
#split into list of home/road features, collected from retrosheet
home_team_retro = all_features[all_features.str.contains('home')]
road_team_retro = all_features[all_features.str.contains('road')]
#split into list of home/road features, collected from elo dataset
home_team_elo = all_features[all_features.str.contains('1')]
road_team_elo = all_features[all_features.str.contains('2')]

In [22]:
#check lengths of features
len(home_team_retro), len(road_team_retro), len(home_team_elo), len(road_team_elo)

(25, 24, 6, 4)

In [23]:
#view the series that is too long and determine which feature should not be there
home_team_retro

0                          home_OBPS
1                      home_AVG_RUNS
2                         home_AVG_H
3                   home_BULLPEN_ERA
4                  home_BULLPEN_WHIP
5           home_BULLPEN_AVG_INNINGS
6                    home_total_OBPS
7                home_total_AVG_RUNS
8                   home_total_AVG_H
9             home_total_BULLPEN_ERA
10           home_total_BULLPEN_WHIP
11    home_total_BULLPEN_AVG_INNINGS
24                   home_career_ERA
25                  home_career_WHIP
26                 home_career_AVGIP
27                home_career_ERA_AH
28               home_career_WHIP_AH
29              home_career_AVGIP_AH
30                   home_season_ERA
31                  home_season_WHIP
32                 home_season_AVGIP
33                home_season_ERA_AH
34               home_season_WHIP_AH
35              home_season_AVGIP_AH
67                    home_record_hm
dtype: object

In [24]:
#home_record is showing up in home_team_features, dropping from axis 
home_team_retro = home_team_retro.drop(67, axis = 0)

In [25]:
#check why home_elo and road_elo are different lengths
home_team_elo

48        elo1_pre
50       elo_prob1
51     rating1_pre
53    pitcher1_rgs
55    pitcher1_adj
57    rating_prob1
dtype: object

In [26]:
#drop prob1 ratings (as prob2 was dropped, no need for a differential)
home_team_elo = home_team_elo.drop([50, 57], axis = 0)

In [27]:
#reset indices, combine like series into dataframe to look at feature names together
home_team_retro = home_team_retro.reset_index(drop = True)
road_team_retro = road_team_retro.reset_index(drop = True)
home_team_elo = home_team_elo.reset_index(drop = True)
road_team_elo = road_team_elo.reset_index(drop = True)

#concat retro features together to examine side by side
pd.concat([home_team_retro, road_team_retro, home_team_elo, road_team_elo], axis = 1)

Unnamed: 0,0,1,2,3
0,home_OBPS,road_OBPS,elo1_pre,elo2_pre
1,home_AVG_RUNS,road_AVG_RUNS,rating1_pre,rating2_pre
2,home_AVG_H,road_AVG_H,pitcher1_rgs,pitcher2_rgs
3,home_BULLPEN_ERA,road_BULLPEN_ERA,pitcher1_adj,pitcher2_adj
4,home_BULLPEN_WHIP,road_BULLPEN_WHIP,,
5,home_BULLPEN_AVG_INNINGS,road_BULLPEN_AVG_INNINGS,,
6,home_total_OBPS,road_total_OBPS,,
7,home_total_AVG_RUNS,road_total_AVG_RUNS,,
8,home_total_AVG_H,road_total_AVG_H,,
9,home_total_BULLPEN_ERA,road_total_BULLPEN_ERA,,


In [28]:
#create empty list to store series of stat differences (retrosheet)
all_series = []
#col names (retro)
column_names = []

#iterate through feature series, storing differential in all_series
#to keep consistency with target variable, it will be (home value - road value), as target is (home_score - road_score)
for j in range(len(home_team_retro)):
    home_stat = home_team_retro.iloc[j]
    #some stat names are formatted differently between home and road, account for those 
    if home_stat.split('_')[-1] != 'AH':
        stat_category = home_stat.split('home')[1]
        road_stat = '{}{}'.format('road', stat_category)
        difference = X[home_stat] - X[road_stat]
    else:
        stat_category = home_stat.split('home')[1].split('AH')[0]
        road_stat = '{}{}{}'.format('road', stat_category, 'OR')
        difference = X[home_stat] - X[road_stat]
    all_series.append(difference)
    column_names.append(stat_category)
#iterate through elo feature series
for j in range(len(home_team_elo)):
    home_stat = home_team_elo.iloc[j]
    if home_stat.split('1')[1] != '':
        stat_category = home_stat.split('1')[0] + '{}' + home_stat.split('1')[1]
    else:
        stat_category = home_stat.split('1')[0] + '{}'
    road_stat = stat_category.format('2')
    difference = X[home_stat] - X[road_stat]
    stat_category = stat_category.replace('{}', '')
    all_series.append(difference)
    column_names.append(stat_category)

#aggregate all series into dataframe     
difference_frame = pd.concat(all_series, axis = 1)
difference_frame.columns = column_names
#view dataframe
difference_frame.head()

Unnamed: 0,_OBPS,_AVG_RUNS,_AVG_H,_BULLPEN_ERA,_BULLPEN_WHIP,_BULLPEN_AVG_INNINGS,_total_OBPS,_total_AVG_RUNS,_total_AVG_H,_total_BULLPEN_ERA,_total_BULLPEN_WHIP,_total_BULLPEN_AVG_INNINGS,_career_ERA,_career_WHIP,_career_AVGIP,_career_ERA_,_career_WHIP_,_career_AVGIP_,_season_ERA,_season_WHIP,_season_AVGIP,_season_ERA_,_season_WHIP_,_season_AVGIP_,elo_pre,rating_pre,pitcher_rgs,pitcher_adj
0,-0.769012,-5.0,-9.0,-7.109005,-2.21169,-3.165,-0.769012,-5.0,-9.0,-7.109005,-2.21169,-3.165,0.291462,-0.084531,-0.406704,-0.982261,-0.20012,0.107768,0.0,-0.6,-5.0,0.0,-0.6,-5.0,-44.217,-41.628,1.218,24.346
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.880741,0.224432,0.365857,-0.194164,0.063529,0.612502,0.0,0.0,0.0,0.0,0.0,0.0,-33.724,-31.963,-4.162,-14.1188
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.281346,-0.244865,-0.1578,-3.004543,-0.55294,0.856005,0.0,0.0,0.0,0.0,0.0,0.0,20.963,22.069,-3.3,-16.8495
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.517918,-1.188584,-6.654952,-3.687615,-1.202648,-6.548696,0.0,0.0,0.0,0.0,0.0,0.0,49.421,52.869,-12.328,-69.4472
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.492499,0.020318,-0.620009,0.511766,0.025448,-0.481065,0.0,0.0,0.0,0.0,0.0,0.0,-42.326,-44.038,-3.396,-5.3956


In [29]:
#generate list of all features from original frame not included in updated frame 
not_head2head = list(all_features[~(all_features.str.contains('home')) & ~(all_features.str.contains('road')) &
                                  ~(all_features.str.contains('1')) & ~(all_features.str.contains('2'))].values) \
+ ['elo_prob1', 'rating_prob1']

#generate partial frame from original frame using selected columns 
add_on = X[not_head2head]

#insure frames are same length
len(difference_frame), len(add_on)

(14545, 14545)

In [30]:
#concatenate both frames into a single frame 
updated_X = pd.concat([difference_frame, add_on], axis = 1)

In [38]:
#bring in best feature set from previous notebook
with open('best_features.json', 'r') as f:
    best_features = json.load(f)
#prepare frame with best features
best_X = updated_X[best_features]

In [44]:
#generate gambling features to add 
gambling_features = ['home_closing', 'home_runline', 'road_closing', 'road_runline']
#concatenate these features to primary dataframe 
best_X = pd.concat([best_X, full_[gambling_features]], axis = 1)
#view info
best_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14545 entries, 0 to 14544
Data columns (total 26 columns):
_OBPS                         14545 non-null float64
_AVG_RUNS                     14545 non-null float64
_BULLPEN_WHIP                 14545 non-null float64
_total_OBPS                   14545 non-null float64
_total_AVG_RUNS               14545 non-null float64
_total_AVG_H                  14545 non-null float64
_total_BULLPEN_WHIP           14545 non-null float64
_total_BULLPEN_AVG_INNINGS    14545 non-null float64
_career_ERA_                  14545 non-null float64
_season_ERA                   14545 non-null float64
_season_AVGIP                 14545 non-null float64
_season_AVGIP_                14545 non-null float64
elo_pre                       14545 non-null float64
rating_pre                    14545 non-null float64
pitcher_rgs                   14545 non-null float64
pitcher_adj                   14545 non-null float64
pitching_park_factor          14545 non-nul

In [50]:
#run FeatureSelector with new gambling features
selector = FeatureSelector(best_X, y, algorithm = LGBMClassifier(), params = {'num_leaves' : 50,
                                                                             'max_depth' : 3},
                          drop_size = 10)
selector.recursive_selection()
#print best evaluation
print('The best evaluation was {}'.format(selector.best_eval))

The best evaluation was 0.5782743210725335


As this evaluation is the best to date, I will proceed with the subset generated from this selection.

In [64]:
#generate final frame 
X_final = best_X[list(selector.best_subset)]
#split into training and test data
xtrain, xtest, ytrain, ytest = train_test_split(X_final, y, test_size = 0.3, random_state = 43)
#load in parameters from parameter tuning research
with open('best_LGB_params.json', 'r') as f:
    best_params = json.load(f)
#instantiate LGBMClassifier with best parameters generated via Hyperopt search
lgb = LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 150, num_leaves = 50)
#fit the model
lgb.fit(xtrain, ytrain)
#generate predictions using the model 
ypreds = lgb.predict(xtest)
#view accuracy score
lgb_acc = accuracy_score(ytest, ypreds)
print('The accuracy of the LGBMClassifier on the holdout set was {}'.format(lgb_acc))

The accuracy of the LGBMClassifier on the holdout set was 0.576076993583868


There is one more feature I would like to consider and that is the difference between the opening lines and closing lines. This could have predictive power. 

In [67]:
#generate opening/closing differentials 
home_line_diff = full_.home_closing - full_.home_opening
road_line_diff = full_.road_closing - full_.road_opening
#create new dataframe 
X_final = pd.concat([best_X, home_line_diff, road_line_diff], axis = 1)
#split into training and test data
xtrain, xtest, ytrain, ytest = train_test_split(X_final, y, test_size = 0.3, random_state = 43)
#load in parameters from parameter tuning research
with open('best_LGB_params.json', 'r') as f:
    best_params = json.load(f)
#instantiate LGBMClassifier with best parameters generated via Hyperopt search
lgb = LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 150, num_leaves = 50)
#fit the model
lgb.fit(xtrain, ytrain)
#generate predictions using the model 
ypreds = lgb.predict(xtest)
#view accuracy score
lgb_acc = accuracy_score(ytest, ypreds)
print('The accuracy of the LGBMClassifier on the holdout set was {}'.format(lgb_acc))

The accuracy of the LGBMClassifier on the holdout set was 0.571494042163153


As the performance was worse, I will proceed with the previous subset. I am going to write an object that performs model stacking and determine if stacking the models improves the performance. 

In [91]:
class ModelStacker():
    def __init__(self, X, y, algorithm = LGBMClassifier(), params = {'learning_rate' : 0.1,
                                                                    'max_depth' : 30,
                                                                    'max_depth' : 3,
                                                                    'num_leaves' : 50,
                                                                    'random_state' : 43},
                num_models = 5, early_stopping = False, early_stopping_threshold = 3, metric = 'accuracy'):
        
        self.X = X
        self.y = y
        self.algorithm = algorithm
        self.params = params
        self.num_models = num_models
        self.early_stopping = early_stopping
        self.early_stopping_threshold = early_stopping_threshold
        self.metric = metric
        self.last_preds = None
        self.last_eval = 0
        self.best_eval = 0
        self.stack_size = 0
        self.best_eval_stack_size = 0
    
    def stack(self):
        no_improvement = 0
        for k in range(self.num_models):
            self.split_fit_test()
            self.stack_size += 1
            if self.last_eval > self.best_eval:
                self.best_eval = self.last_eval
                self.best_eval_stack_size = self.stack_size
            else:
                no_improvement += 1
            if self.early_stopping and no_improvement >= self.early_stopping_threshold:
                return('Improvement has stopped')
            current_columns = list(self.X.columns)
            self.X = pd.concat([self.X, self.last_preds], axis = 1)
            current_columns += ['preds{}'.format(str(k))]
            if len(current_columns) != len(self.X.columns):
                raise ValueError('The columns are not working')
            else:
                self.X.columns = current_columns

    def split_fit_test(self):
        xtrain, xtest, ytrain, ytest = train_test_split(self.X, self.y, test_size = 0.3, random_state = 43)
        algo = self.algorithm.set_params(**self.params)
        algo.fit(xtrain, ytrain)
        self.last_preds = pd.Series(algo.predict(xtest))
        if self.metric == 'accuracy':
            self.last_eval = accuracy_score(ytest, self.last_preds)
        else:
            raise ValueError('Passed unsupported metric')

In [92]:
large_stack = ModelStacker(X_final, y, num_models = 10, early_stopping = True)
large_stack.stack()

'Improvement has stopped'

In [95]:
large_stack.best_eval_stack_size

2

In [96]:
large_stack.best_eval

0.576076993583868

This did not yield any improvement whatsoever. The next step is looking at what can be done with runline and score differential. 

# 2019 Moneyline Experiment

I have been dealing with modeling using $100$ years of baseball data and randomly selected training and test sets throughout those $100$ years of data. Baseball has changed drastically over the last $100$ years, and my initial theory for how to account for this was to add binary dummy variables for different eras: Deadball, Liveball, Steriod, etc. Perhaps this was not the right way to go about this. I am going to try a much more target experiment: using $5$ years of data to predict a single year, then I will trying using $5$ years of data and the prediction_probability method offered by scikit-learn compatible algorithms to test what the maximum profit that could have been generated from moneyline bets during the $2019$ season.

In [6]:
#create feature frame without odds
X = full_[full_.year < 2019].drop(columns = ['date', 'is_doubleheader', 'is_tripleheader', 'home_starter', 
                                             'road_starter','team1', 'team2', 'elo_prob2', 'rating_prob2', 
                                             'home_loss', 'score1', 'score2', 'year'])
#I will begin with moneyline, which only concerns with wins and losses, then move to run line
y = full_[full_.year < 2019].home_loss

In [7]:
#isolate team features and moneylines
reduced = list(X.columns[:57])
lines = list(X.columns[92:95])
reduced += lines
#take these features from main dataframe
X_reduced = X[reduced]

In [8]:
from recursive_selection import FeatureSelector

In [9]:
selector = FeatureSelector(X_reduced, y, algorithm = LGBMClassifier(), params = {'num_leaves' : 50, 'max_depth' : 3},
                          VIF = True, VIF_tol = 5.0, drop_size = 10)
selector.recursive_selection()

'Cannot reduce feature frame anymore. Reduce drop size if desired'

In [10]:
#isolate features that yielded the best evaluation
features = selector.best_subset

In [11]:
#initalize LGBMClassifier using best parameters (discovered in notebook 01)
LGBM = LGBMClassifier(n_estimators = 150, max_depth = 3, num_leaves = 50)
#fit using previous years of data
LGBM.fit(X[features], y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=3,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=150, n_jobs=-1, num_leaves=50, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [12]:
#get 2019 data to test predicictions on
all_2019 = full_[full_.year == 2019].reset_index(drop = True)
#drop unneeded features
all_2019X = all_2019[features]
#isolate target variable 
all_2019y = all_2019['home_loss']

In [13]:
#get win probabilities
wp_2019 = LGBM.predict_proba(all_2019X)
#take single dimensional array of home win probabilities
home_wp = pd.Series([i[1] for i in wp_2019])
#actual_predictions
preds_2019 = pd.Series(LGBM.predict(all_2019X))

In [14]:
#add probabilities to frame 
with_prob2019 = pd.concat([all_2019, home_wp, preds_2019], axis = 1).rename(columns = {0 : 'road_win_probability',
                                                                                      1 : 'model_prediction'})
#retain only necessary features
with_prob2019 = with_prob2019[['team1', 'team2', 'home_loss', 'score1', 'score2', 'home_opening', 'home_closing',
                              'road_opening', 'road_closing', 'road_win_probability', 'model_prediction']]

In [15]:
#view frame
with_prob2019.head(10)

Unnamed: 0,team1,team2,home_loss,score1,score2,home_opening,home_closing,road_opening,road_closing,road_win_probability,model_prediction
0,CIN,PIT,0,5.0,3.0,-115.0,-117.0,105.0,107.0,0.449083,0
1,MIL,STL,0,5.0,4.0,-114.0,-114.0,104.0,104.0,0.464498,0
2,PHI,ATL,0,10.0,4.0,-190.0,-185.0,170.0,170.0,0.391707,0
3,FLA,COL,1,3.0,6.0,131.0,125.0,-145.0,-135.0,0.548412,1
4,MIN,CLE,0,2.0,0.0,122.0,-117.0,-135.0,107.0,0.497959,0
5,SDP,SFG,0,2.0,0.0,104.0,-127.0,-114.0,117.0,0.480239,0
6,TOR,DET,1,0.0,2.0,-140.0,-142.0,126.0,132.0,0.399729,0
7,NYY,BAL,0,7.0,2.0,-340.0,-380.0,296.0,320.0,0.395938,0
8,LAD,ARI,0,12.0,5.0,-160.0,-165.0,144.0,155.0,0.417594,0
9,TEX,CHC,1,4.0,12.0,122.0,105.0,-135.0,-115.0,0.507615,1


In [50]:
##NOTE: adding only home / only road capability ?
class BankrollCalculator():
    def __init__(self, df, unit = 100, with_juice = False, use_closing = False, play_levels = 'all', tier1_home = 0.2,
                tier2_home = [0.2, 0.3], tier3_home = [0.3, 0.4], tier1_road = 0.8, tier2_road = [0.7,0.8],
                tier3_road = [0.6, 0.7]):
        self.df = df
        self.unit = unit
        self.with_juice = with_juice
        self.use_closing = use_closing
        self.play_levels = play_levels
        self.tier1_home = tier1_home
        self.tier2_home = tier2_home
        self.tier3_home = tier3_home
        self.tier1_road = tier1_road
        self.tier2_road = tier2_road
        self.tier3_road = tier3_road
        self.total_bankroll = 0
        self.tier1_bankroll = 0
        self.tier2_bankroll = 0
        self.tier3_bankroll = 0
        self.n_plays = 0
        self.frames_dict = {}
    
    def calculate_bankroll(self):
        self.get_plays()
        for key in self.frames_dict.keys():
            outcome = 0
            level, logic = key.split('_')[0], key.split('_')[1]
            if logic == 'home':
                for j in range(len(self.frames_dict[key])):
                    outcome += self.place_bet_home(self.frames_dict[key].iloc[j])
                    self.n_plays += 1
            else:
                for j in range(len(self.frames_dict[key])):
                    outcome += self.place_bet_road(self.frames_dict[key].iloc[j])
                    self.n_plays += 1
            if level == 'tier1':
                self.tier1_bankroll += outcome
            elif level == 'tier2':
                self.tier2_bankroll += outcome
            else:
                self.tier3_bankroll += outcome
        self.total_bankroll = self.tier1_bankroll + self.tier2_bankroll + self.tier3_bankroll
            
    def get_plays(self):
        if self.play_levels == 'all':
            self.frames_dict['tier1_home'] = self.df[self.df.road_win_probability < self.tier1_home]
            self.frames_dict['tier1_road'] = self.df[self.df.road_win_probability > self.tier1_road]
            self.frames_dict['tier2_home'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier2_home[0], self.tier2_home[1])]
            self.frames_dict['tier2_road'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier2_road[0], self.tier2_road[1])]
            self.frames_dict['tier3_home'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier3_home[0], self.tier3_home[1])]
            self.frames_dict['tier3_road'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier3_road[0], self.tier3_road[1])]
        elif self.play_levels == 'best':
            self.frames_dict['tier1_home'] = self.df[self.df.road_win_probability < self.tier1_home]
            self.frames_dict['tier1_road'] = self.df[self.df.road_win_probability > self.tier1_road]
        
        elif self.play_levels == 'top2':
            self.frames_dict['tier1_home'] = self.df[self.df.road_win_probability < self.tier1_home]
            self.frames_dict['tier1_road'] = self.df[self.df.road_win_probability > self.tier1_road]
            self.frames_dict['tier2_home'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier2_home[0], self.tier2_home[1])]
            self.frames_dict['tier2_road'] = self.df[self.df.road_win_probability.\
                                                     between(self.tier2_road[0], self.tier2_road[1])]
    
    
    def place_bet_home(self, obs):
        if self.use_closing:
            line = obs['home_closing']
        else:
            line = obs['home_opening']
        if line > 0:
            line_type = 'dog'
        else:
            line_type = 'fav'
        if obs['model_prediction'] == obs['home_loss']:
            if line_type == 'dog':
                result = (line / 100) * self.unit 
            else:
                if self.with_juice:
                    result = self.unit
                else:
                    result = (100 / abs(line)) * self.unit
        else:
            if line_type == 'dog':
                result = -1 * self.unit
            else:
                if self.with_juice:
                    result = (line / 100) * self.unit
                else:
                    result = -1 * self.unit
        return(result)
    
    def place_bet_road(self, obs):
        if self.use_closing:
            line = obs['road_closing']
        else:
            line = obs['road_opening']
        if line > 0:
            line_type = 'dog'
        else:
            line_type = 'fav'
        if obs['model_prediction'] == obs['home_loss']:
            if line_type == 'dog':
                result = (line / 100) * self.unit
            else:
                if self.with_juice:
                    result = self.unit
                else:
                    result = (100 / abs(line)) * self.unit
        else:
            if line_type == 'dog':
                result = -1 * self.unit
            else:
                if self.with_juice:
                    result = (line / 100) * self.unit
                else:
                    result = -1 * self.unit
        return(result)
                


In [58]:
bc = BankrollCalculator(with_prob2019, with_juice = True)

In [59]:
bc.calculate_bankroll()

In [64]:
bc.total_bankroll

-365.0

In [66]:
bc.tier1_bankroll

465.0

In [49]:
place_bet_home(with_prob2019.iloc[3], False, False, 100)

131.0