In [132]:
#Importing Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from pybaseball import team_batting

#Linear regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

#Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#Data
from mega import model_df
from mega import unplayed_games




In [50]:
#Copying and Viewing our data
df=model_df.copy()

display(df.head())
display(df.tail())

Unnamed: 0,Gm#,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
0,1,L,D,A,MIL,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,2,W,N,A,MIL,32,0.28125,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,3,W,N,A,MIL,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,4,W,D,H,SDP,33,0.212121,7,0,8,0.235294,1,0,0.0,0,0,0,0.0,0
4,5,W,N,H,SDP,33,0.242424,8,0,4,0.305556,3,0,0.0,0,0,0,0.0,0


Unnamed: 0,Gm#,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
1808,129,L,N,A,NYY,28,0.178571,5,2,5,0.233333,2,31,0.129032,4,0,9,0.205882,3
1809,130,W,D,A,NYY,29,0.172414,5,1,9,0.294118,4,39,0.333333,13,2,14,0.386364,4
1810,131,L,D,A,NYY,35,0.342857,12,5,6,0.410256,3,29,0.206897,6,0,8,0.323529,5
1811,132,W,N,H,MIA,34,0.264706,9,0,8,0.324324,3,28,0.214286,6,1,11,0.3125,4
1812,133,L,N,H,MIA,35,0.342857,12,2,5,0.428571,6,34,0.382353,13,4,8,0.405405,1


In [51]:
#Removing game number, index works just fine.
df=df.drop("Gm#",axis=1) 
display(df.head())
display(df.tail())

Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
0,L,D,A,MIL,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,W,N,A,MIL,32,0.28125,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,W,N,A,MIL,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,W,D,H,SDP,33,0.212121,7,0,8,0.235294,1,0,0.0,0,0,0,0.0,0
4,W,N,H,SDP,33,0.242424,8,0,4,0.305556,3,0,0.0,0,0,0,0.0,0


Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
1808,L,N,A,NYY,28,0.178571,5,2,5,0.233333,2,31,0.129032,4,0,9,0.205882,3
1809,W,D,A,NYY,29,0.172414,5,1,9,0.294118,4,39,0.333333,13,2,14,0.386364,4
1810,L,D,A,NYY,35,0.342857,12,5,6,0.410256,3,29,0.206897,6,0,8,0.323529,5
1811,W,N,H,MIA,34,0.264706,9,0,8,0.324324,3,28,0.214286,6,1,11,0.3125,4
1812,L,N,H,MIA,35,0.342857,12,2,5,0.428571,6,34,0.382353,13,4,8,0.405405,1


## Going Through Linear and Logistic Regression Models

### Linear

In [52]:
#The first step is getting the already preworked data into a useable format. 
#This includes the binary values and the categorical.

#Converting the binary columns with dummies.
dummies= pd.get_dummies(df[['W/L', 'D/N', 'H/A']], drop_first=True).astype(int)
df[['W/L', 'D/N', 'H/A']]=dummies[["W/L_W","D/N_N","H/A_H"]]

df.head()

Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
0,0,0,0,MIL,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,1,1,0,MIL,32,0.28125,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,1,1,0,MIL,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,1,0,1,SDP,33,0.212121,7,0,8,0.235294,1,0,0.0,0,0,0,0.0,0
4,1,1,1,SDP,33,0.242424,8,0,4,0.305556,3,0,0.0,0,0,0,0.0,0


In [53]:
#Now to convert the opponent column into values.

#Getting the label encoder
encoder=LabelEncoder()
#Fit and transform the categorical column
df['Opp'] = encoder.fit_transform(df['Opp'])
df.head()

Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
0,0,0,0,14,31,0.258065,8,1,8,0.351351,4,38,0.315789,12,3,11,0.333333,1
1,1,1,0,14,32,0.28125,9,1,5,0.342857,3,37,0.378378,14,2,11,0.390244,2
2,1,1,0,14,34,0.264706,9,1,5,0.305556,2,37,0.405405,15,3,8,0.463415,4
3,1,0,1,21,33,0.212121,7,0,8,0.235294,1,0,0.0,0,0,0,0.0,0
4,1,1,1,21,33,0.242424,8,0,4,0.305556,3,0,0.0,0,0,0,0.0,0


In [54]:
#Listing the columns to make sure the continuous columns are numeric in order to scale
column_list = df.columns.tolist()
categorical_cols={"W/L","D/N","H/A","Opp"}
continuous_cols=list(set(column_list) - categorical_cols)
continuous_cols


#Making sure any non numeric data gets converted to nans.
df[continuous_cols] = df[continuous_cols].apply(pd.to_numeric, errors='coerce')
#dropping those as they'd mess up the data
df=df.dropna()

In [56]:
#Now that the data is cleaned up we can train test split.
X=df.drop(columns="W/L")
y=df["W/L"]
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=69)

In [61]:
cat=["D/N","H/A","Opp"]
#Getting the scaler ready
scaler=StandardScaler()

#Fit and transform
x_train_scaled=scaler.fit_transform(X_train[continuous_cols])
x_test_scaled=scaler.transform(X_test[continuous_cols])

#Adding back the categorical data
x_train_scaled_df=pd.DataFrame(x_train_scaled,columns=continuous_cols,index=X_train.index)
x_train_final=pd.concat([x_train_scaled_df,X_train[cat]],axis=1)
x_train_final=x_train_final.dropna()

x_test_scaled_df=pd.DataFrame(x_test_scaled,columns=continuous_cols,index=X_test.index)
x_test_final=pd.concat([x_test_scaled_df,X_test[cat]],axis=1)
x_test_final=x_test_final.dropna()

In [62]:
#Making the model
lr=LinearRegression()
# Fit the model using the training data
lr.fit(x_train_final, y_train)

In [63]:
#Making predictions
prediction=lr.predict(x_test_final)

In [64]:
# Evaluate the model
mse = mean_squared_error(y_test, prediction)
r2 = r2_score(y_test, prediction)

print(f'Mean Squared Error: {mse}')
print(f'R² Score: {r2}')

Mean Squared Error: 0.14725280925342968
R² Score: 0.3955501553745513


In [70]:
lr.score(x_test_final, y_test)

0.3955501553745513

The MSE is a low value which indicates good performance. The R² score is low however.

### Logistic

In [78]:
# Initialize the logistic regression model
lrm = LogisticRegression(random_state=2,max_iter=1000)

# Fit the model using the training data
lrm.fit(x_train_final, y_train)

In [79]:
#Predicting 
y_train_pred = lrm.predict(x_train_final)
y_test_pred = lrm.predict(x_test_final)

In [85]:
#Scoring the model
lrm_train_score=accuracy_score(y_train, y_train_pred)
lrm_test_score=accuracy_score(y_test, y_test_pred)
print(f"lrm train score: {lrm_train_score}")
print(f"lrm test score: {lrm_test_score}")

lrm train score: 0.7917620137299771
lrm test score: 0.8127853881278538


In [89]:
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Confusion Matrix:
[[220  34]
 [ 48 136]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       254
           1       0.80      0.74      0.77       184

    accuracy                           0.81       438
   macro avg       0.81      0.80      0.81       438
weighted avg       0.81      0.81      0.81       438



The logistic regression model achieved 81% accuracy, effectively identifying both classes with good precision and recall, meaning it correctly predicted outcomes with a reasonable balance between false positives and false negatives.

### Comparing the models

In [92]:
print(f"The linear model scored {r2:.4f}")
print(f"The logistic model scored {lrm_test_score:.4f}")

The linear model scored 0.3956
The logistic model scored 0.8128


The logistic regression model outperformed the linear regression model with an accuracy score of 81.28% compared to 39.56%, indicating that the logistic model provided a more reliable classification of outcomes.

### Using the Logistic model on the unplayed games

In [105]:
#Bringing in the unplayed games data.
unplayed_df=unplayed_games.copy()
unplayed_df=unplayed_df.drop(columns="Gm#")
unplayed_df.head(3)

Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
1813,8:40 pm,Game Preview and Matchups,H,MIA,33,0.242424,8,1,10,0.285714,2,30,0.3,9,1,10,0.416667,6
1814,3:10 pm,Game Preview and Matchups,H,MIA,42,0.428571,18,2,7,0.489362,4,36,0.333333,12,3,10,0.390244,4
1815,8:40 pm,Game Preview and Matchups,H,BAL,-1,0.314286,-1,-1,6,-1.0,-1,33,0.30303,10,1,6,0.30303,0


In [137]:
#Converting D/N to D/N based on the time in the W/L Col
unplayed_df["D/N"]=unplayed_df["W/L"]
gametimes=unplayed_df["D/N"].unique().tolist()

In [138]:
# Define cutoff time
cutoff_time = datetime.strptime('6:00 pm', '%I:%M %p')

# Create the dictionary with inline logic
time_labels = {
    time: 'D' if datetime.strptime(time, '%I:%M %p') < cutoff_time else 'N'
    for time in gametimes
}
#Converting the times based on the time to d/n
unplayed_df["D/N"]=unplayed_df["D/N"].map(time_labels)
unplayed_df.head()

Unnamed: 0,W/L,D/N,H/A,Opp,COL_at_bats,COL_ba,COL_hits,COL_hr,COL_kk,COL_obp,COL_walks,Opp_at_bats,Opp_ba,Opp_hits,OPP_HR_Column,OPP_kk,Opp_obp,Opp_walks
1813,8:40 pm,N,H,6,33,0.242424,8,1,10,0.285714,2,30,0.3,9,1,10,0.416667,6
1814,3:10 pm,D,H,6,42,0.428571,18,2,7,0.489362,4,36,0.333333,12,3,10,0.390244,4
1815,8:40 pm,N,H,2,-1,0.314286,-1,-1,6,-1.0,-1,33,0.30303,10,1,6,0.30303,0
1816,8:10 pm,N,H,2,-1,-1.0,-1,-1,-1,-1.0,-1,29,0.310345,9,0,9,0.393939,4
1817,3:10 pm,D,H,2,-1,-1.0,-1,-1,-1,-1.0,-1,30,0.2,6,0,11,0.25,2


In [106]:
opp_list=unplayed_df["Opp"].unique().tolist()
opp_list

['MIA', 'BAL', 'ATL', 'MIL', 'DET', 'CHC', 'ARI', 'LAD', 'STL']

In [145]:
# Get the team batting stats for the current season (you can specify a year if needed)
team_stats = team_batting(start_season=2024)
ts_cl=team_stats.columns.tolist()
team_stats.head()
# ts_cl

Unnamed: 0,teamIDfg,Season,Team,Age,G,AB,PA,H,1B,2B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,9,2024,NYY,29,1994,4725,5405,1191,747,216,...,120.0,1544,0.421,3664,0.177,0.273,,,,30.2
1,15,2024,ARI,28,2091,4770,5418,1253,808,235,...,117.0,1467,0.388,3780,0.172,0.265,,,,27.9
2,22,2024,LAD,30,2055,4770,5395,1211,742,256,...,119.2,1492,0.409,3650,0.159,0.266,,,,27.6
3,2,2024,BAL,28,2081,4844,5380,1221,746,235,...,114.4,1584,0.425,3727,0.159,0.266,,,,26.2
4,26,2024,PHI,29,1977,4764,5320,1229,797,242,...,115.6,1452,0.398,3646,0.156,0.269,,,,22.8


In [152]:
#Setting up a new smaller df for the averages
avg_df=team_stats[["Team","AVG","OBP","HR","H"]]
avg_df=avg_df.rename(columns={"AVG":"ba"})
avg_df.head()

Unnamed: 0,Team,ba,OBP,HR,H
0,NYY,0.252,0.335,213,1191
1,ARI,0.263,0.336,177,1253
2,LAD,0.254,0.33,190,1211
3,BAL,0.252,0.318,209,1221
4,PHI,0.258,0.327,169,1229


In [154]:
#Converting everything to averages so that the log reg model will be more applicable
avg_df['at_bat'] = team_stats['AB'] / team_stats['G']
avg_df['kk']=team_stats['SO']/team_stats['G']
avg_df['walks'] = team_stats['BB'] / team_stats['G']

avg_df.head()

Unnamed: 0,Team,ba,OBP,HR,H,at_bat,kk,walks
0,NYY,0.252,0.335,213,1191,2.369609,0.566199,0.285356
1,ARI,0.263,0.336,177,1253,2.281205,0.517456,0.232903
2,LAD,0.254,0.33,190,1211,2.321168,0.573236,0.249635
3,BAL,0.252,0.318,209,1221,2.327727,0.560308,0.20519
4,PHI,0.258,0.327,169,1229,2.409712,0.589277,0.230147


In [None]:
#Encoding the opp col
unplayed_df['Opp'] = encoder.fit_transform(unplayed_df['Opp'])
unplayed_df.head(3)