In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

# load in March Madness data, target is for regression, target2 is for classification
df = pd.read_csv('MarchMadnessData2024.csv')
target = df['margin']
target2 = (df['margin'] > 0)

# drop target column
X = df.drop(columns = ['margin'])

# shuffle the data set
shuffled_indices = np.random.permutation(X.index)
X = X.iloc[shuffled_indices].reset_index(drop=True)

print(X)

      Win-Loss Percentage_team1  SRS_team1  SOS_team1  Team Points_team1  \
0                         0.370     -10.00      -5.72             1880.0   
1                         0.519      10.08      11.45             2130.0   
2                         0.926      30.23      14.04             2277.0   
3                         0.630      14.21       9.95             2099.0   
4                         0.667      22.46      13.24             2313.0   
...                         ...        ...        ...                ...   
1108                      0.519      14.05      10.31             1982.0   
1109                      0.593      17.25      10.62             2054.0   
1110                      0.481      11.26       9.12             2013.0   
1111                      0.714      19.63       9.49             2093.0   
1112                      0.815      25.48      11.63             2016.0   

      Opponent Ponts_team1  Minutes Played_team1  FG_team1  FGA_team1  \
0             

In [2]:
max_score = 0
 # shuffle the data set
shuffled_indices = np.random.permutation(X.index)
X = X.iloc[shuffled_indices].reset_index(drop=True)

scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

for components in range(14, 47): 
    # create a pca model for each number of components
    pca = PCA(n_components = components)
    X_pca = pca.fit_transform(X_norm)

    # calculate explained variance
    explained_variance = sum(pca.explained_variance_ratio_)
    
    # split data into train and test
    trainX, testX, trainY, testY = train_test_split(X_pca, target2, test_size = 0.2)
    scorerVar = make_scorer(f1_score, pos_label = 1)

    # create and fit svm model, print initial score
    svm_model = SVC()
    svm_model.fit(X_pca, target2)
    svm_model.predict(testX)
    print("Initial score for svm is ", svm_model.score(testX, testY), " for ", components, " components")
    
    # get the cross value score for svm, if it's better than the max score then set it to max
    svm_cv_score = cross_val_score(svm_model, X_pca, target2, cv = 10, scoring = scorerVar)
    print("Cross Validation Score for SVM: ", np.mean(svm_cv_score), " for ", components, " components")
    print()
    if np.mean(svm_cv_score) > max_score:
        max_score = np.mean(svm_cv_score)
        print("New max: ", max_score, " at ", components, "components")
    
    # repeat with logistic regression model
    lr_model = LogisticRegression()
    lr_model.fit(trainX, trainY)
    lr_model.predict(testX)
    print("Initial score for LogRegression is ", lr_model.score(testX, testY), " for ", components, " components")

    lr_cv_score = cross_val_score(lr_model, X_pca, target2, cv = 7, scoring = scorerVar)
    print("Cross Validation Score for LogRegression: ", np.mean(lr_cv_score), " for ", components, "components")
    print()
    if np.mean(lr_cv_score) > max_score:
        max_score = np.mean(lr_cv_score)
        print("New max: ", max_score, " at ", components, "components")
    
# # Best Score: Logistic Regression at 25 components: cv_score = 0.8410353628455827

Initial score for svm is  0.6771300448430493  for  14  components
Cross Validation Score for SVM:  0.7965262853548325  for  14  components

New max:  0.7965262853548325  at  14 components
Initial score for LogRegression is  0.6681614349775785  for  14  components
Cross Validation Score for LogRegression:  0.7960940200483926  for  14 components

Initial score for svm is  0.7040358744394619  for  15  components
Cross Validation Score for SVM:  0.7952148099449963  for  15  components

Initial score for LogRegression is  0.6860986547085202  for  15  components
Cross Validation Score for LogRegression:  0.79782557152017  for  15 components

New max:  0.79782557152017  at  15 components
Initial score for svm is  0.6591928251121076  for  16  components
Cross Validation Score for SVM:  0.7954500848536671  for  16  components

Initial score for LogRegression is  0.6591928251121076  for  16  components
Cross Validation Score for LogRegression:  0.7969581497750307  for  16 components

Initial sco

In [3]:
team1 = [
        'Connecticut', 'Fla Atlantic', 'San Diego St', 'Auburn', 'BYU', 'Illinois', 'Wash State', 'Iowa St', 
        'North Carolina', 'North Carolina', 'Miss State', 'St Marys', 'Alabama', 'Clemson', 'Baylor', 'Dayton', 'Arizona', 
        'Houston', 'Nebraska', 'Wisconsin', 'Duke', 'Texas Tech', 'Kentucky', 'Florida', 'Florida', 'Marquette', 
        'Purdue', 'Purdue', 'Utah St', 'Gonzaga', 'Kansas', 'S Carolina', 'Creighton', 'Texas', 'Texas', 'Tennessee'
        ]

team2 = [
        'Stetson', 'Northwestern', 'UAB', 'Yale', 'Duquesne', 'Morehead St', 'Drake', 'S Dakota St', 
        'Wagner', 'Howard', 'Michigan St', 'Grd Canyon', 'Col Charlestn', 'New Mexico', 'Colgate', 'Nevada', 'Lg Beach St', 
        'Longwood', 'Texas A&M', 'James Mad', 'Vermont', 'NC State', 'Oakland', 'Boise St', 'Colorado', 'W Kentucky', 
        'Montana St', 'Grambling St', 'TX Christian', 'McNeese St', 'Samford', 'Oregon', 'Akron', 'Virginia', 'Colorado St', 'St Peters'
        ]

location = ['neutral','home','away','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral',
            'neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral'
             'neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral','neutral'
        ]
# fill in teams and locations
print(len(team1))
print(len(team2))
print(len(location))

36
36
36


In [4]:
data_dump = pd.read_csv('march_madness_data_dump.csv')

df = pd.DataFrame({'team1':team1, 'team2':team2, 'team1_location':location})

# merge data dump with game data to predict
merge1 = pd.merge(df, data_dump, left_on = 'team1', right_on = 'School', how = 'left')
final = pd.merge(merge1, data_dump, left_on = 'team2', right_on = 'School', how = 'left', suffixes = ('_team1', '_team2'))

# use get_dummies() on team1_location, so that classification can be done 
final = pd.get_dummies(final, columns = ['team1_location'])

# remove all rows containing null or empty values
final = final.drop(columns = ['team1', 'team2', 'Rk_team1', 'Rk_team2', 'School_team1', 'School_team2'])

# MinMaxScalar was found to be the best normalizer option
scaler = MinMaxScaler()
final_norm = scaler.fit_transform(final)
X_norm = scaler.fit_transform(X)

# pca with 25 components was the best option
pca = PCA(n_components = 25)
final_pca = pca.fit_transform(final_norm)
X_pca = pca.fit_transform(X_norm)

X_train, X_test, y_train, y_test = train_test_split(X_pca, target2, test_size=0.25)

X_train = scaler.fit_transform(X_train)

# Create and fit LogRegression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# check the cross value score
scorerVar = make_scorer(f1_score, pos_label = 1)
lr_cv_score = cross_val_score(lr_model, X_pca, target2, cv = 10, scoring = scorerVar)
print("Cross Val Score: ", np.mean(lr_cv_score))

# predictions will output whether or not team1 wins each game
predictions = lr_model.predict(final_pca)

print(len(predictions))
print(pd.DataFrame({'team1':team1, 'team2':team2, 'predictions':predictions}))

Cross Val Score:  0.7914450693190804
36
             team1          team2  predictions
0      Connecticut        Stetson         True
1     Fla Atlantic   Northwestern         True
2     San Diego St            UAB         True
3           Auburn           Yale        False
4              BYU       Duquesne         True
5         Illinois    Morehead St         True
6       Wash State          Drake         True
7          Iowa St    S Dakota St         True
8   North Carolina         Wagner         True
9   North Carolina         Howard         True
10      Miss State    Michigan St         True
11        St Marys     Grd Canyon         True
12         Alabama  Col Charlestn         True
13         Clemson     New Mexico         True
14          Baylor        Colgate         True
15          Dayton         Nevada         True
16         Arizona    Lg Beach St         True
17         Houston       Longwood         True
18        Nebraska      Texas A&M         True
19       Wisconsin  