In [2]:
import pandas as pd
import numpy as np
import mysql.connector
from sqlalchemy import create_engine
from sqlalchemy import text
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from datetime import time

def convert_odds(prob):
    if prob <= 0 or prob >= 1:
        return None  
    if prob >= 0.5:
        return int(-100 * prob / (1 - prob))
    else:
        return int(100 * (1 - prob) / prob)

training_date = 20250506
#end_date = 20240930

start_dt = pd.to_datetime(str(training_date), format="%Y%m%d")
end_dt = pd.to_datetime(str(training_date), format="%Y%m%d")

dates = pd.date_range(start=start_dt, end=end_dt)
loop_dates = dates.strftime('%Y%m%d').tolist()

prediction_data = pd.DataFrame(columns=['no05', 'yes05', 'no15', 'yes15','no25', 'yes25',
    'no35', 'yes35', 'no45', 'yes45', 'no55', 'yes55', 'no65', 'yes65', 'no75', 'yes75',
    'no85', 'yes85', 'no95', 'yes95'])

df_today_stg = pd.DataFrame(columns=['player_id', 'pitcher_name', 'team', 'opponent_team', 'game_id',
       'game_date', 'game_time', 'season', 'stadium', 'home_team',
       'game_number', 'ip_per_game', 'bb_per_game', 'so_per_game',
       'so_per_inning', 'batter_so_per_game', 'ab_per_game', 'batter_so_rate',
       'pa_per_game', 'batter_walk_per_game', 'adjusted_time'])

    
#start training
training_query = f'call training_data({training_date})'

engine = create_engine('mysql+pymysql://root:password@localhost/baseball_data')

connection = engine.connect()

with engine.connect() as conn:
    df = pd.read_sql(training_query, connection)

#df.head()
engine.dispose()

df_2 = df.drop(['game_id', 'player_id', 'game_date'], axis = 1)
#df_2.head()

df_2['adjusted_time'] = df_2['game_time'].apply(pd.to_datetime)
df_2['adjusted_time'] = df_2['adjusted_time'].apply(lambda d : d.time())
df_2['adjusted_time'] = df_2['adjusted_time'].apply(lambda d : d.hour)

df_opponents = pd.get_dummies(df_2['opponent_team'], dtype=int)
#df_opponents.head()

df_stadium = pd.get_dummies(df_2['stadium'], dtype=int)
df_stadium.head()

df_targets = pd.DataFrame(columns=['target_05', 'target_15', 'target_25', 'target_35', 'target_45', 'target_55',
                                  'target_65', 'target_75', 'target_85', 'target_95'])

df_targets['target_05'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 0.5, 1, 0)
df_targets['target_15'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 1.5, 1, 0)
df_targets['target_25'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 2.5, 1, 0)
df_targets['target_35'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 3.5, 1, 0)
df_targets['target_45'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 4.5, 1, 0)
df_targets['target_55'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 5.5, 1, 0)
df_targets['target_65'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 6.5, 1, 0)
df_targets['target_75'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 7.5, 1, 0)
df_targets['target_85'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 8.5, 1, 0)
df_targets['target_95'] = np.where(df_2['PitcherStrikeouts'].apply(pd.to_numeric) > 9.5, 1, 0)

#df_targets

df_final = df_2

ytrain05 = np.array(df_targets['target_05'])
ytrain15 = np.array(df_targets['target_15'])
ytrain25 = np.array(df_targets['target_25'])
ytrain35 = np.array(df_targets['target_35'])
ytrain45 = np.array(df_targets['target_45'])
ytrain55 = np.array(df_targets['target_55'])
ytrain65 = np.array(df_targets['target_65'])
ytrain75 = np.array(df_targets['target_75'])
ytrain85 = np.array(df_targets['target_85'])
ytrain95 = np.array(df_targets['target_95'])


Xtrain = np.array(df_final.drop(['pitcher_name', 'team', 'game_time', 'opponent_team',
                                'stadium', 'season', 'PitcherStrikeouts', 'game_number'], axis=1))

xgbModel05 = RandomForestClassifier(n_estimators = 200)
xgbModel05.fit(Xtrain, ytrain05)

xgbModel15 = RandomForestClassifier(n_estimators = 200)
xgbModel15.fit(Xtrain, ytrain15)

xgbModel25 = RandomForestClassifier(n_estimators = 200)
xgbModel25.fit(Xtrain, ytrain25)

xgbModel35 = RandomForestClassifier(n_estimators = 200)
xgbModel35.fit(Xtrain, ytrain35)

xgbModel45 = RandomForestClassifier(n_estimators = 200)
xgbModel45.fit(Xtrain, ytrain45)

xgbModel55 = RandomForestClassifier(n_estimators = 200)
xgbModel55.fit(Xtrain, ytrain55)

xgbModel65 = RandomForestClassifier(n_estimators = 200)
xgbModel65.fit(Xtrain, ytrain65)

xgbModel75 = RandomForestClassifier(n_estimators = 200)
xgbModel75.fit(Xtrain, ytrain75)

xgbModel85 = RandomForestClassifier(n_estimators = 200)
xgbModel85.fit(Xtrain, ytrain85)

xgbModel95 = RandomForestClassifier(n_estimators = 200)
xgbModel95.fit(Xtrain, ytrain95)

#start testing
test_query = f'call test_data({training_date})'

engine = create_engine('mysql+pymysql://root:password@localhost/baseball_data')

connection = engine.connect()

with engine.connect() as conn:
    df_today = pd.read_sql(test_query, connection)

engine.dispose()

df_today_stg = pd.concat([df_today_stg, df_today], axis=0)

df_today['adjusted_time'] = df_today['game_time'].apply(pd.to_datetime)
df_today['adjusted_time'] = df_today['adjusted_time'].apply(lambda d : d.time())
df_today['adjusted_time'] = df_today['adjusted_time'].apply(lambda d : d.hour)

df_today2 = df_today.drop(['game_time', 'team', 'player_id', 'stadium', 'opponent_team', 'game_id',
                          'game_date', 'season', 'pitcher_name', 'game_number'], axis=1)

df_today2 = df_today2.fillna(0)

df_dropped = df_final.drop(['pitcher_name', 'team', 'game_time', 'opponent_team',
                               'stadium', 'season', 'PitcherStrikeouts'], axis=1)

Xtest = np.array(df_today2)

try:
    preds05 = np.round(xgbModel05.predict_proba(Xtest),3)
except:
    pass
try:
    preds15 = np.round(xgbModel15.predict_proba(Xtest),3)
except: 
    pass
try:
    preds25 = np.round(xgbModel25.predict_proba(Xtest),3)
except:
    pass
try:
    preds35 = np.round(xgbModel35.predict_proba(Xtest),3)
except:
    pass
try:
    preds45 = np.round(xgbModel45.predict_proba(Xtest),3)
except:
    pass
try:
    preds55 = np.round(xgbModel55.predict_proba(Xtest),3)
except:
    pass
try:
    preds65 = np.round(xgbModel65.predict_proba(Xtest),3)
except:
    pass
try:
    preds75 = np.round(xgbModel75.predict_proba(Xtest),3)
except:
    pass
try:
    preds85 = np.round(xgbModel85.predict_proba(Xtest),3)
except:
    pass
try:
    preds95 = np.round(xgbModel95.predict_proba(Xtest),3)
except:
    pass

under05 = preds05[:,0].tolist()
under15 = preds15[:,0].tolist() 
under25 = preds25[:,0].tolist()
under35 = preds35[:,0].tolist()
under45 = preds45[:,0].tolist()
under55 = preds55[:,0].tolist()
under65 = preds65[:,0].tolist()
under75 = preds75[:,0].tolist()
under85 = preds85[:,0].tolist()
under95 = preds95[:,0].tolist()

over05 = preds05[:,1].tolist()
over15 = preds15[:,1].tolist() 
over25 = preds25[:,1].tolist()
over35 = preds35[:,1].tolist()
over45 = preds45[:,1].tolist()
over55 = preds55[:,1].tolist()
over65 = preds65[:,1].tolist()
over75 = preds75[:,1].tolist()
over85 = preds85[:,1].tolist()
over95 = preds95[:,1].tolist()

prediction_data_staging = pd.DataFrame({
    'over05': over05, 'under05': under05, 
    'over15': over15, 'under15': under15, 
    'over25': over25, 'under25': under25, 
    'over35': over35, 'under35': under35, 
    'over45': over45, 'under45': under45, 
    'over55': over55, 'under55': under55, 
    'over65': over65, 'under65': under65, 
    'over75': over75, 'under75': under75, 
    'over85': over85, 'under85': under85, 
    'over95': over95, 'under95': under95 
})

american_odds = pd.DataFrame({
    'over05_odds': [convert_odds(p) for p in over05], 'under05_odds': [convert_odds(p) for p in under05], 
    'over15_odds': [convert_odds(p) for p in over15], 'under15_odds': [convert_odds(p) for p in under15], 
    'over25_odds': [convert_odds(p) for p in over25], 'under25_odds': [convert_odds(p) for p in under25], 
    'over35_odds': [convert_odds(p) for p in over35], 'under35_odds': [convert_odds(p) for p in under35], 
    'over45_odds': [convert_odds(p) for p in over45], 'under45_odds': [convert_odds(p) for p in under45], 
    'over55_odds': [convert_odds(p) for p in over55], 'under55_odds': [convert_odds(p) for p in under55], 
    'over65_odds': [convert_odds(p) for p in over65], 'under65_odds': [convert_odds(p) for p in under65], 
    'over75_odds': [convert_odds(p) for p in over75], 'under75_odds': [convert_odds(p) for p in under75], 
    'over85_odds': [convert_odds(p) for p in over85], 'under85_odds': [convert_odds(p) for p in under85], 
    'over95_odds': [convert_odds(p) for p in over95], 'under95_odds': [convert_odds(p) for p in under95] 
})

#prediction_data = pd.concat([prediction_data, prediction_data_staging], axis=0)

df_output = pd.concat([df_today_stg.reset_index(drop=True), 
                       american_odds.reset_index(drop=True),
                       prediction_data_staging.reset_index(drop=True)], axis=1)

df_output

  df_today_stg = pd.concat([df_today_stg, df_today], axis=0)


Unnamed: 0,player_id,pitcher_name,team,opponent_team,game_id,game_date,game_time,season,stadium,home_team,...,over55,under55,over65,under65,over75,under75,over85,under85,over95,under95
0,10185,Patrick Corbin,TEX,BOS,144916,20250506,6:45PM,2025,Fenway Park,0,...,0.290,0.710,0.140,0.860,0.060,0.940,0.035,0.965,0.000,1.000
1,10196,Robbie Ray,SF,CHC,144919,20250506,7:40PM,2025,Wrigley Field,0,...,0.405,0.595,0.300,0.700,0.185,0.815,0.055,0.945,0.065,0.935
2,10228,Julio Teheran,NYM,ARI,144926,20250506,9:40PM,2025,Chase Field,0,...,0.085,0.915,0.020,0.980,0.035,0.965,0.005,0.995,0.005,0.995
3,10252,Kevin Gausman,TOR,LAA,144925,20250506,9:38PM,2025,Angel Stadium,0,...,0.440,0.560,0.045,0.955,0.015,0.985,0.010,0.990,0.005,0.995
4,10257,Wade Miley,MIL,HOU,144922,20250506,7:40PM,2025,American Family Field,1,...,0.005,0.995,0.010,0.990,0.000,1.000,0.000,1.000,0.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,78156,Austin Kitchen,MIA,LAD,144914,20250506,6:40PM,2025,LoanDepot Park,1,...,0.010,0.990,0.000,1.000,0.005,0.995,0.000,1.000,0.000,1.000
438,78236,Spencer Bivens,SF,CHC,144919,20250506,7:40PM,2025,Wrigley Field,0,...,0.025,0.975,0.030,0.970,0.005,0.995,0.000,1.000,0.005,0.995
439,78240,Jack Dreyer,LAD,MIA,144914,20250506,6:40PM,2025,LoanDepot Park,0,...,0.000,1.000,0.005,0.995,0.000,1.000,0.000,1.000,0.000,1.000
440,78290,Spencer Arrighetti,HOU,MIL,144922,20250506,7:40PM,2025,American Family Field,0,...,0.300,0.700,0.135,0.865,0.115,0.885,0.115,0.885,0.025,0.975


In [4]:
df_csv = df_output.drop(columns = ['player_id', 'team', 'opponent_team', 'game_id',
       'game_date', 'game_time', 'season', 'stadium', 'home_team',
       'game_number', 'ip_per_game', 'bb_per_game', 'so_per_game',
       'so_per_inning', 'batter_so_per_game', 'ab_per_game', 'batter_so_rate',
       'pa_per_game', 'batter_walk_per_game', 'adjusted_time'])

df_csv.head()

df_csv.to_csv('pitcher_odds_today.csv', index=False)

In [None]:
df_csv