In [8]:
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, log_loss, r2_score
from sklearn.feature_selection import mutual_info_classif


# load in March Madness data, target is for regression, target2 is for classification
df = pd.read_csv('MarchMadnessData2024.csv')
df['team_1_win'] = (df['margin'] > 0).astype(int)
df.head()

Unnamed: 0,Win-Loss Percentage_team1,SRS_team1,SOS_team1,Team Points_team1,Opponent Ponts_team1,Minutes Played_team1,FG_team1,FGA_team1,FG%_team1,3P_team1,...,AST_team2,STL_team2,BLK_team2,TOV_team2,PF_team2,margin,team1_location_Away,team1_location_Home,team1_location_Neutral,team_1_win
0,0.69,5.94,-2.1,2283.0,2015.0,1170.0,790.0,1854.0,0.426,269.0,...,498.0,207.0,100.0,339.0,495.0,-5.0,True,False,False,0
1,0.643,-2.4,-3.28,2002.0,1858.0,1125.0,737.0,1689.0,0.436,201.0,...,498.0,207.0,100.0,339.0,495.0,7.0,False,True,False,1
2,0.481,-10.76,-7.92,2002.0,2025.0,1085.0,699.0,1724.0,0.405,243.0,...,498.0,207.0,100.0,339.0,495.0,19.0,False,True,False,1
3,0.483,-5.73,-2.62,2042.0,2081.0,1180.0,735.0,1695.0,0.434,221.0,...,498.0,207.0,100.0,339.0,495.0,13.0,False,True,False,1
4,0.778,3.39,-3.81,2256.0,1992.0,1085.0,819.0,1754.0,0.467,295.0,...,418.0,143.0,91.0,253.0,403.0,-16.0,False,False,True,0


In [9]:
features = ['Win-Loss Percentage_team1', 'Win-Loss Percentage_team2', 'SRS_team1', 'SRS_team2', 'SOS_team1', 'SOS_team2',
            'FG%_team1', 'FG%_team2', '3P%_team1', '3P%_team2', 'TOV_team1', 'TOV_team2', 'ORB_team1', 'ORB_team2']

y = df['team_1_win']
X = df[features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras

# Dictionary to store results
results = {}

# Logistic Regression Model
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)
log_probs = log_model.predict_proba(X_test_scaled)[:, 1]
results['Logistic Regression'] = log_loss(y_test, log_probs)

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)  # No scaling needed for trees
rf_probs = rf_model.predict_proba(X_test)[:, 1]
results['Random Forest'] = log_loss(y_test, rf_probs)

# XGBoost Model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
results['XGBoost'] = log_loss(y_test, xgb_probs)

# Neural Network Model (MLP)
nn_model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')  # Output is probability
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy')
nn_model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)
nn_probs = nn_model.predict(X_test_scaled).flatten()
results['Neural Network'] = log_loss(y_test, nn_probs)

# Print results
for model, score in sorted(results.items(), key=lambda x: x[1]):
    print(f"{model}: Log Loss = {score:.4f}")

Parameters: { "use_label_encoder" } are not used.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Neural Network: Log Loss = 0.5845
Logistic Regression: Log Loss = 0.6117
Random Forest: Log Loss = 0.6963
XGBoost: Log Loss = 0.9705
