In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split


# Load the 2024 March Madness games.
# `margin` is the regression target; `team_1_win` is the classification target,
# derived from it (1 when margin is strictly positive, otherwise 0).
df = pd.read_csv('MarchMadnessData2024.csv')
df['team_1_win'] = df['margin'].gt(0).astype(int)
df.head()

In [None]:
def NNModel(X_train, y_train, X_test, y_test):
    """Train a small feed-forward binary classifier and log its test accuracy.

    The network has two hidden ReLU layers (64 and 32 units) followed by a
    single sigmoid output unit. It is trained for 50 epochs with Adam on
    binary cross-entropy.

    Args:
        X_train: 2-D training feature matrix; its second dimension sets the
            width of the input layer.
        y_train: Binary (0/1) labels for X_train.
        X_test: Held-out feature matrix with the same number of columns as
            X_train.
        y_test: Binary (0/1) labels for X_test.

    Returns:
        The accuracy on (X_test, y_test) as reported by ``model.evaluate``.

    Side effects:
        Appends one "NN Test Accuracy" line to results.txt.
    """
    # Build the neural network model
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # Sigmoid for binary classification
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model. The test set is passed as validation data only to
    # monitor progress per epoch; nothing here (e.g. early stopping) uses it
    # to select the model.
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate model performance; with metrics=['accuracy'] evaluate() yields
    # the loss followed by the accuracy.
    loss, accuracy = model.evaluate(X_test, y_test)
    with open("results.txt", "a") as file:
        file.write(f"NN Test Accuracy: {accuracy:.4f} \n")

    # Hand the metric back so callers can compare runs without re-reading
    # results.txt.
    return accuracy

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

def XGBoost(X_train, y_train, X_test, y_test):
    """Fit a gradient-boosted tree classifier and log its test accuracy.

    Uses 100 trees of depth 3 with a learning rate of 0.1.

    Args:
        X_train: Training feature matrix.
        y_train: Labels for X_train.
        X_test: Held-out feature matrix with the same columns as X_train.
        y_test: Labels for X_test.

    Returns:
        The fraction of X_test rows whose predicted label matches y_test.

    Side effects:
        Appends one "XGBoost Test Accuracy" line to results.txt.
    """
    model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    with open("results.txt", "a") as file:
        file.write(f"XGBoost Test Accuracy: {accuracy:.4f} \n")

    # Hand the metric back so callers can compare runs without re-reading
    # results.txt.
    return accuracy

In [None]:
# Features are every column except the margin and the label derived from it;
# keeping either one in X would leak the answer to the models.
X = df.drop(columns=['margin', 'team_1_win'])
Y = df['team_1_win']

# Train-test split. random_state is fixed, so this single split is reused by
# every experiment below instead of being recomputed per method.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

methods = {
    "standard_scaling": StandardScaler(),
    "normalization": MinMaxScaler(),
    "pca_10": PCA(n_components=10),  # Adjust components as needed
    "pca_5": PCA(n_components=5)
}

# Apply each preprocessing technique, then sweep the number of PCA components
for method_name, processor in methods.items():
    with open("results.txt", "a") as file:
        file.write(f"Using {method_name} \n")

    # Fit and transform only on training data, then apply the same transformation to test data
    X_train_transformed = processor.fit_transform(X_train)
    X_test_transformed = processor.transform(X_test)

    # PCA cannot keep more components than min(n_samples, n_features). The
    # "pca_5"/"pca_10" preprocessors leave only 5/10 features, so the sweep
    # is capped to avoid a ValueError part-way through.
    max_components = min(X_train_transformed.shape)

    for i in range(5, min(20, max_components + 1)):
        with open("results.txt", "a") as file:
            file.write(f"Num Components: {i} \n")
        pca = PCA(n_components=i)
        X_train_pca = pca.fit_transform(X_train_transformed)
        # Project the test set with the PCA fitted on the training set.
        # Refitting on the test set would produce a different basis, so the
        # columns would no longer mean what the models were trained on.
        X_test_pca = pca.transform(X_test_transformed)

        NNModel(X_train_pca, y_train, X_test_pca, y_test)
        XGBoost(X_train_pca, y_train, X_test_pca, y_test)

In [None]:
# Best model was XGBoost with 16 PCA components and MinMaxScaler.