Analyzing Odds Data

In the last notebook, we obtained historical odds data from OddsShark and augmented our game-level data with the implied win probabilities and the over/under lines.
We saved that data to a file called 'df_bp3.csv'.
In this notebook, we will do some initial exploration of that odds data and compare the quality of our first model's predictions to the implied probabilities given by the oddsmakers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgbm
import structureboost as stb
import ml_insights as mli
from structureboost import log_loss
# Let pandas render up to 1000 columns and 1000 rows before truncating output.
pd.set_option(
    'display.max_columns', 1000,
    'display.max_rows', 1000,
)


In [None]:
# Game-level data with odds columns, as written by the previous notebook.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
df = pd.read_csv("df_bp3.csv", low_memory=False)

In [None]:
# Print pandas' structural summary (DataFrame.info) of the loaded frame.
df.info()

In [None]:
# Sanity check: per season, count the games whose home implied probability is
# positive (True) versus not positive (False), to confirm that '0-probability'
# rows only occur in the seasons where they are expected.
pd.crosstab(df['implied_prob_h'].gt(0), df['season'])

In [None]:
# Restrict the odds exploration to seasons from 2019 onward.
df_odds = df.loc[df['season'].ge(2019)]

In [None]:
# Peek at five random rows. The seed is fixed so the preview is identical on
# every Restart & Run All instead of changing from run to run.
df_odds.sample(5, random_state=0)

In [None]:
# Distribution of the home team's implied win probability (`implied_prob_h_mid`)
# over the games in df_odds.
# NOTE(review): 85 edges over [.15, .85] give 84 bins of width ~0.0083; if
# 0.01-wide bins were intended, 71 edges would be needed -- confirm.
fig, ax = plt.subplots()
ax.hist(df_odds.implied_prob_h_mid, bins=np.linspace(.15,.85,85))
ax.set(
    title='Home implied win probability (implied_prob_h_mid)',
    xlabel='implied_prob_h_mid',
    ylabel='Number of games',
);

In [None]:
# Sum of the home and visitor implied probabilities per game. Anything above 1
# is the bookmaker's margin built into the two prices. Bins are 0.001 wide over
# [1, 1.06].
fig, ax = plt.subplots()
ax.hist(df_odds.implied_prob_h + df_odds.implied_prob_v, bins=np.linspace(1,1.06,61))
ax.set(
    title='Sum of home and visitor implied probabilities',
    xlabel='implied_prob_h + implied_prob_v',
    ylabel='Number of games',
);

# TODO: interesting to see a second peak around 1.04-1.05 -- ideas?
# One hypothesis to check (unverified): a group of games priced with a wider gap
# between the two moneylines, e.g. a -110/-110 pair sums to about 1.048.

In [None]:
# Keep only games with a non-zero run differential (presumably this removes
# ties -- confirm), then split chronologically by season:
#   train: seasons after 1901 through 2000
#   valid: seasons 2001 through 2020
#   test:  seasons 2021 onward
df = df.loc[df['run_diff'].ne(0)]
df_train = df.loc[df['season'].gt(1901) & df['season'].le(2000)]
df_valid = df.loc[df['season'].ge(2001) & df['season'].le(2020)]
df_test = df.loc[df['season'].ge(2021)]

In [None]:
# Model inputs: the 162-game OBP and SLG columns for the home (h) and visiting
# (v) teams, in the order OBP_h, OBP_v, SLG_h, SLG_v.
# Candidates currently left out: 'OBP_30_h', 'OBP_30_v', 'SLG_30_h', 'SLG_30_v', 'game_no_h'.
features = [
    f'{stat}_162_{side}'
    for stat in ('OBP', 'SLG')
    for side in ('h', 'v')
]

# Binary label column to predict.
target = 'home_victory'

In [None]:
# Feature matrices: the selected feature columns of each chronological split.
X_train, X_valid, X_test = (
    split.loc[:, features] for split in (df_train, df_valid, df_test)
)

# Matching label vectors as NumPy arrays.
y_train, y_valid, y_test = (
    split[target].to_numpy() for split in (df_train, df_valid, df_test)
)

# (rows, columns) of each feature matrix as a quick size check.
X_train.shape, X_valid.shape, X_test.shape

In [None]:
# Shallow boosted trees: depth 3, learning rate .02, at most 1000 boosting rounds.
lgbm1 = lgbm.LGBMClassifier(
    max_depth=3,
    learning_rate=.02,
    n_estimators=1000,
)

# Fit on the training seasons; the validation seasons drive early stopping.
lgbm1.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    eval_metric='logloss',
    callbacks=[
        # Stop once the validation metric has not improved for 50 rounds.
        lgbm.early_stopping(stopping_rounds=50),
        # Log the validation metric every 10 rounds.
        lgbm.log_evaluation(10),
    ],
)

In [None]:
# Predicted probability for the second class column on the test games
# (presumably home_victory == 1 -- confirm against lgbm1.classes_).
preds_lgbm = lgbm1.predict_proba(X_test)[:, 1]

In [None]:
# Mean of the training target, used as the constant "naive" prediction.
hv_mean = np.mean(y_train)

In [None]:
# Test-set log loss of the LightGBM predictions ...
lgbm_loss = log_loss(y_test, preds_lgbm)

# ... and of a baseline that predicts the training mean (hv_mean) for every game.
naive_loss = log_loss(y_test, np.full(len(y_test), hv_mean))

lgbm_loss, naive_loss

In [None]:
# Log loss of the odds-implied home win probabilities on the same test games.
lv_loss = log_loss(y_test, df_test['implied_prob_h_mid'])
lv_loss

In [None]:
# Fraction of the implied probabilities' log-loss improvement over the naive
# baseline that the model achieves: 0 means no better than the baseline, 1 means
# as good as the implied probabilities.
(naive_loss - lgbm_loss)/(naive_loss - lv_loss)

In [None]:
# Reliability diagram (with histogram) for the odds-implied home win
# probabilities against the actual test outcomes, on a wide 16x6 figure.
plt.figure(figsize=(16, 6))
mli.plot_reliability_diagram(
    y_test,
    df_test['implied_prob_h_mid'],
    show_histogram=True,
);

In [None]:
# Absolute gap between the odds-implied probability and the model prediction for
# each test game (a Series carrying df_test's index).
test_disc = (df_test['implied_prob_h_mid'] - preds_lgbm).abs()

In [None]:
# Distribution of the absolute model-vs-odds gap on the test games, in 0.01-wide
# bins from 0 to .35.
fig, ax = plt.subplots()
ax.hist(test_disc, bins=np.linspace(0,.35,36))
ax.set(
    title='Model vs. implied probability: absolute discrepancy (test set)',
    xlabel='|preds_lgbm - implied_prob_h_mid|',
    ylabel='Number of games',
);

In [None]:
# Test games where the model and the implied probability differ by more than .25.
df_test.loc[test_disc > 0.25]

In [None]:
# For each test game with a discrepancy above .25, pick the favored side's
# starting pitcher: the home starter when the home implied probability is
# above .5, otherwise the visiting starter.

# These are the pitchers that were favored in these games
[
    home_sp if prob_h > .5 else visitor_sp
    for prob_h, home_sp, visitor_sp in df_test.loc[
        test_disc > .25,
        ['implied_prob_h_mid', 'pitcher_start_name_h', 'pitcher_start_name_v'],
    ].itertuples(index=False, name=None)
]

In [None]:
# These are the underdog pitchers in these games (test games with a discrepancy
# above .25): the home starter when the home side is not favored, otherwise the
# visiting starter.
# A home implied probability of exactly .5 is assigned to the home starter, so
# that this list is complementary to a "favored" list that uses `> .5` and falls
# back to the visiting starter on a tie. With a strict `< .5` both lists would
# name the visiting starter for such a game.
[
    home_sp if prob_h <= .5 else visitor_sp
    for prob_h, home_sp, visitor_sp in df_test.loc[
        test_disc > .25,
        ['implied_prob_h_mid', 'pitcher_start_name_h', 'pitcher_start_name_v'],
    ].itertuples(index=False, name=None)
]

In [None]:
# For test games with a discrepancy above .2: actual home result (rows) against
# the odds-implied home probability rounded to one decimal (columns).
pd.crosstab(
    df_test.loc[test_disc > .2, 'home_victory'],
    df_test.loc[test_disc > .2, 'implied_prob_h_mid'].round(1),
)

In [None]:
# Same games (discrepancy above .2): actual home result (rows) against the
# model's predicted home probability rounded to one decimal (columns).
pd.crosstab(
    df_test.loc[test_disc > .2, 'home_victory'],
    np.round(preds_lgbm[(test_disc > .2).to_numpy()], 1),
)

Analysis 

- The LV (oddsmaker) implied probabilities are "better" than our current model's predictions
- The largest discrepancies appear when a strong starting pitcher faces a weak one
- The LV probabilities seem to be "right" in those cases

CONCLUSION: Need to factor in the starting pitcher to improve our model!