In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed
from xgboost import XGBClassifier

### Import and reorganize data

In [2]:
# Location of the pickled training frame, relative to the notebook's working directory.
TRAINING_DATA_PATH = 'training_data/split_stats_training_data.pkl'

# NOTE(security): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
split_stats = pd.read_pickle(TRAINING_DATA_PATH)

In [3]:
# Per-team statistics for which a home-minus-away differential is computed.
stats_to_compare = [
    # batting
    'batting_doubles', 'batting_triples', 'batting_homeRuns', 'batting_strikeOuts',
    'batting_baseOnBalls', 'batting_hits', 'batting_avg', 'batting_atBats',
    'batting_obp', 'batting_slg', 'batting_ops', 'batting_stolenBases',
    'batting_rbi', 'batting_leftOnBase',
    # pitching
    'pitching_runs', 'pitching_doubles', 'pitching_triples', 'pitching_homeRuns',
    'pitching_strikeOuts', 'pitching_baseOnBalls', 'pitching_hits', 'pitching_atBats',
    'pitching_obp', 'pitching_stolenBases', 'pitching_numberOfPitches', 'pitching_era',
    'pitching_inningsPitched', 'pitching_earnedRuns', 'pitching_pitchesThrown',
    'pitching_strikes', 'pitching_rbi',
    # record
    'win_percentage',
]

# Game identifier and outcome columns carried over unchanged from split_stats.
game_columns = [
    'game_id', 'date', 'home_team_name', 'away_team_name', 'home_id', 'away_id',
    'home_score', 'away_score', 'run_differential', 'home_win',
]
df_differential = split_stats[game_columns].copy()

# Each differential (home value minus away value) is stored under the bare
# stat name. A stat is skipped when split_stats lacks either its home_ or
# its away_ column.
available_columns = set(split_stats.columns)
for stat in stats_to_compare:
    home_col, away_col = f'home_{stat}', f'away_{stat}'
    if {home_col, away_col} <= available_columns:
        df_differential[stat] = split_stats[home_col] - split_stats[away_col]

In [4]:
# Raw feature column names: for every stat, its home_ column followed by its
# away_ column, in the order the stats appear in stats_to_compare.
split_stats_to_compare = [
    f'{side}_{stat}'
    for stat in stats_to_compare
    for side in ('home', 'away')
]

In [5]:
# Prefix every stat name with 'diff_', updating stats_to_compare in place
# (slice assignment keeps the same list object).
# The startswith guard makes the cell idempotent: without it, running the
# cell a second time would stack prefixes ('diff_diff_...').
# NOTE(review): the differential columns in df_differential are stored under
# the bare stat names, not these prefixed names — confirm where the prefixed
# names are meant to be consumed.
stats_to_compare[:] = [
    stat if stat.startswith('diff_') else f'diff_{stat}'
    for stat in stats_to_compare
]

In [None]:
# Parse the date column; values that cannot be parsed become NaT.
df_differential['date'] = pd.to_datetime(df_differential['date'], errors='coerce')

# Drop games played in March or April. The explicit .copy() gives an
# independent frame, so the column assignments below (and in later cells) do
# not write into a slice of df_differential and trigger SettingWithCopyWarning.
is_early_season = df_differential['date'].dt.month.isin([3, 4])
early_season_removed_diff = df_differential[~is_early_season].copy()

# Coerce every object-dtype column to numeric; unparseable values become NaN.
# NOTE(review): this also applies to any text columns (presumably the team
# name columns), which would end up entirely NaN — confirm that is acceptable.
for col in early_season_removed_diff.columns:
    if early_season_removed_diff[col].dtype == 'object':
        early_season_removed_diff[col] = pd.to_numeric(early_season_removed_diff[col], errors='coerce')

In [None]:
# Parse the date column in place; values that cannot be parsed become NaT.
split_stats['date'] = pd.to_datetime(split_stats['date'], errors='coerce')

# Drop games played in March or April. The explicit .copy() gives an
# independent frame, so the column assignments below (and in later cells) do
# not write into a slice of split_stats and trigger SettingWithCopyWarning.
is_early_season = split_stats['date'].dt.month.isin([3, 4])
early_season_removed = split_stats[~is_early_season].copy()

# Coerce every object-dtype column to numeric; unparseable values become NaN.
# NOTE(review): this also applies to any text columns (presumably the team
# name columns), which would end up entirely NaN — confirm that is acceptable.
for col in early_season_removed.columns:
    if early_season_removed[col].dtype == 'object':
        early_season_removed[col] = pd.to_numeric(early_season_removed[col], errors='coerce')

In [None]:
# Threshold on run_differential used to build the binary target.
# NOTE(review): presumably the 1.5-run betting line — confirm.
RUN_LINE = 1.5

# line_diff is 1 when run_differential exceeds RUN_LINE, otherwise 0 (missing
# values compare as False and therefore map to 0).
# .assign returns a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids chained-assignment warnings
# and keeps the cell safe to re-run.
early_season_removed = early_season_removed.assign(
    line_diff=(early_season_removed['run_differential'] > RUN_LINE).astype(int)
)
early_season_removed_diff = early_season_removed_diff.assign(
    line_diff=(early_season_removed_diff['run_differential'] > RUN_LINE).astype(int)
)

In [9]:
# Hand-picked subset of stat names.
# NOTE(review): judging by the name these are the differentials considered
# strongly correlated with the outcome; the list is not referenced by any
# other visible cell — confirm it is still needed.
strong_corr_diff_stats = [
    'win_percentage',
    'batting_rbi', 'batting_homeRuns', 'batting_baseOnBalls',
    'pitching_strikeOuts',
    'batting_ops', 'batting_obp', 'batting_slg',
    'pitching_inningsPitched', 'pitching_pitchesThrown', 'pitching_numberOfPitches',
    'pitching_atBats', 'pitching_homeRuns', 'pitching_doubles', 'pitching_era',
    'pitching_baseOnBalls', 'pitching_hits', 'pitching_obp',
    'pitching_earnedRuns', 'pitching_rbi', 'pitching_runs',
]

### Make EDA graphs

In [None]:
# Correlation of every numeric column with run_differential, largest first.
numeric_df = early_season_removed_diff.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()[['run_differential']]
sorted_correlation_matrix = correlation_matrix.sort_values(by='run_differential', ascending=False)

# Keep only the moderate correlations, 0.1 < |r| <= 0.2, on either side of zero.
abs_corr = sorted_correlation_matrix['run_differential'].abs()
filtered_correlation_matrix = sorted_correlation_matrix[(abs_corr > 0.1) & (abs_corr <= 0.2)]

purple_palette = sns.diverging_palette(270, 360, s=80, l=70, as_cmap=True)

# Figure height grows with the number of rows shown, with a floor of 6 inches.
fig, ax = plt.subplots(figsize=(8, max(6, len(filtered_correlation_matrix) // 2)))
sns.heatmap(
    filtered_correlation_matrix, annot=True, cmap=purple_palette, cbar=True, center=0,
    annot_kws={"size": 10, "weight": "bold", "color": "black"}, linewidths=0.3, linecolor="lavender", fmt=".2f",
    ax=ax,
)

# Style the tick labels on the heatmap axes explicitly, rather than on
# whichever axes pyplot currently considers active.
plt.setp(ax.get_xticklabels(), color="black", fontsize=10, rotation=0, weight='bold')
plt.setp(ax.get_yticklabels(), color="black", fontsize=10, rotation=0, weight='bold')

# Reuse the colorbar that sns.heatmap already drew (cbar=True). Calling
# plt.colorbar() with no mappable here either fails with "No mappable was
# found" or adds a second, duplicate colorbar.
cbar = ax.collections[0].colorbar
cbar.set_label('Correlation Coefficient', color="black", fontsize=12, weight='bold')
cbar.ax.tick_params(labelsize=10, color="black")

plt.tight_layout()
plt.show()


In [None]:
def _fit_line(x, y):
    """Return (slope, intercept) of the least-squares line through (x, y).

    Rows where either value is missing are excluded from the fit, because
    np.polyfit cannot handle NaN and the upstream numeric coercion
    (errors='coerce') can introduce them.
    """
    valid = x.notna() & y.notna()
    slope, intercept = np.polyfit(x[valid], y[valid], 1)
    return slope, intercept


x1 = early_season_removed_diff['win_percentage']
x2 = early_season_removed_diff['pitching_obp']
y = early_season_removed_diff['run_differential']

# First-degree fit of run differential against each predictor.
slope1, intercept1 = _fit_line(x1, y)
line1 = slope1 * x1 + intercept1
slope2, intercept2 = _fit_line(x2, y)
line2 = slope2 * x2 + intercept2

scatter_color = '#CBB6E4'
line_color = '#6A0DAD'

# One panel per predictor: raw points plus the fitted line.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6), dpi=100)
panels = (
    (ax1, x1, line1, 'Win Percentage'),
    (ax2, x2, line2, 'Pitching OBP'),
)
for ax, x, line, xlabel in panels:
    ax.scatter(x, y, color=scatter_color, alpha=0.5, s=60)
    ax.plot(x, line, color=line_color, linewidth=2)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.6)
# Only the left panel carries the y-axis label.
ax1.set_ylabel('Run Differential', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Colour scheme for the figure.
background_color = '#f3e8ff'
original_color = '#8e44ad'
removed_color = '#a5d6f9' 
edge_color = '#4b306e'
grid_color = '#d0c2e6'

# Overlay the home win-percentage distribution before and after the
# March/April games were removed.
fig, ax = plt.subplots(figsize=(10, 7), facecolor=background_color)
histogram_layers = (
    (split_stats['home_win_percentage'], original_color, 0.5, 'Original Data'),
    (early_season_removed['home_win_percentage'], removed_color, 0.6, 'Variable Early Season Removed'),
)
for values, fill_color, opacity, legend_label in histogram_layers:
    ax.hist(values, bins=25, color=fill_color, edgecolor=edge_color, alpha=opacity, label=legend_label)

ax.set_xlabel('Home Team Win Percentage', fontsize=14, color=edge_color)
ax.set_ylabel('Frequency', fontsize=14, color=edge_color)
ax.set_xlim(0, 1)
ax.grid(visible=True, which='major', linestyle='--', linewidth=0.7, color=grid_color)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.legend()
plt.show()

### Model Creation

In [None]:
# Features: the raw home_/away_ stat columns. Target: the binary line_diff column.
X = early_season_removed[split_stats_to_compare]
y = early_season_removed['line_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training split only, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a logistic regression with default settings and score it on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

report_lines = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
    ("Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
)
for label, value in report_lines:
    print(label, value)

In [None]:
# Features: the raw home_/away_ stat columns. Target: the binary line_diff column.
X = early_season_removed[split_stats_to_compare]
y = early_season_removed['line_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the unscaled features and score it on the
# held-out split.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

report_lines = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
    ("Feature Importances:", model.feature_importances_),
)
for label, value in report_lines:
    print(label, value)

In [None]:
# Features: the raw home_/away_ stat columns. Target: the binary line_diff column.
X = early_season_removed[split_stats_to_compare]
y = early_season_removed['line_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training split only, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a 100-round gradient-boosted classifier and score it on the held-out split.
model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

report_lines = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
    ("Feature Importances:", model.feature_importances_),
)
for label, value in report_lines:
    print(label, value)

In [None]:
# Features: the raw home_/away_ stat columns. Target: the binary line_diff column.
X = early_season_removed[split_stats_to_compare]
y = early_season_removed['line_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across runs, matching the random_state=42 used by the
# other model cells. Without this the reported accuracy changes on every run.
set_random_seed(42)

# Small fully connected network: 16 -> 8 -> 1 units with a sigmoid output for
# the binary target.
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 20% of the training split is held out by Keras for per-epoch validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Threshold the predicted probabilities at 0.5 and flatten the (n, 1) output
# to 1-D so it has the same shape as y_test.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


In [None]:
# NOTE(review): these accuracies are hard-coded literals rather than values
# taken from the model cells, so they must be updated by hand whenever the
# models are re-run.
models = ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'Neural Network']
accuracies = [64.0, 57.5, 58.9, 64.1]

# Order the models from highest to lowest accuracy (ties keep list order).
sorted_indices = sorted(range(len(accuracies)), key=accuracies.__getitem__, reverse=True)
models_sorted = [models[idx] for idx in sorted_indices]
accuracies_sorted = [accuracies[idx] for idx in sorted_indices]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models_sorted, accuracies_sorted, color='#D7BDE2', edgecolor='black')

# Write each bar's value just above its top edge.
for bar in bars:
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, yval + 0.5, f"{yval:.1f}%", ha='center', va='bottom', fontsize=10)

ax.set_ylabel("Accuracy (%)", fontsize=12)
# The y-axis is zoomed to 55-70 %, so bar heights are not proportional to accuracy.
ax.set_ylim(55, 70)
plt.show()