In [2]:
import os

import pandas as pd
from pymongo import MongoClient
from sklearn.linear_model import LogisticRegression

# Connection string: prefer the MONGO_URI environment variable so that real
# credentials never have to be written into the notebook. The literal below
# (password placeholder included) is only the fallback when it is unset.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://Liz:<password>@cluster65664.az03p.mongodb.net/",
)

client = MongoClient(mongo_uri)

db = client['Project-CSE-482']

# Use season 24
testSeason = ['AFC 24', 'NFC 24'] 

#All teams that made it to the superbowl from 15-24
superBowlTeams = {'AFC 15' : "Denver Broncos", 'NFC 15' : "Carolina Panthers", 'AFC 16' : "New England Patriots", 'NFC 16' : "Los Angeles Rams", 'AFC 17' : "New England Patriots", 'NFC 17' : "Philadelphia Eagles",
                 'AFC 18' : "New England Patriots", 'NFC 18' : "Los Angeles Rams", 'AFC 19' : "Kansas City Chiefs", 'NFC 19' : "San Francisco 49ers", 'AFC 20' : "Kansas City Chiefs", 'NFC 20' : "Tampa Bay Buccaneers", 'AFC 21' : "Cincinnati Bengals", 'NFC 21' : "Los Angeles Rams",
                 'AFC 22' : "Kansas City Chiefs", 'NFC 22' : "Philadelphia Eagles", 'AFC 23' : "Kansas City Chiefs", 'NFC 23' : "San Francisco 49ers", 'AFC 24' : "Kansas City Chiefs", 'NFC 24' : "Philadelphia Eagles"}

# One dict per team-season document; turned into DataFrames in a later cell.
trainData = []
testData = []

collectionNames = db.list_collection_names()
# Separate the training (seasons 15-23) from testing (season 24)
for file in collectionNames:
    if file == 'Final Data':
        continue

    # Collection names are read as "<conference> <yy>": first three characters
    # are the conference, last two the season.
    con = file[:3]
    year = file[-2:]
    seasonKey = con + " " + year

    # Skip anything that does not map to a known season instead of dying with
    # a KeyError halfway through the load.
    if seasonKey not in superBowlTeams:
        print(f"Skipping collection {file!r}: no Super Bowl entry for {seasonKey!r}")
        continue
    superBowlTeam = superBowlTeams[seasonKey]

    collection = db[file]
    for doc in collection.find():
        # Get rid of these columns if present
        doc.pop('3rdDown%', None)
        doc.pop('4thDown%', None)

        # Add year and conference column
        # Year is stored as an int so the feature is numeric rather than a
        # string that the model has to coerce.
        # Conference: AFC = 1, NFC = 0
        doc['Year'] = int(year)
        doc["Conference"] = 1 if con == "AFC" else 0

        # Add finals made column (the training target)
        doc['Finals Made'] = 1 if doc['Tm'] == superBowlTeam else 0

        # Normalise the EXP key. The pop/re-insert is deliberate even when the
        # key is already spelled 'EXP': it moves the key to the end of the
        # dict, so EXP lands in the same (last) position for every document
        # and the resulting DataFrame columns line up.
        for expKey in ('EXP', '\n EXP\n'):
            if expKey in doc:
                doc['EXP'] = doc.pop(expKey)
                break

        # Either add to train or test set
        if file not in testSeason:
            trainData.append(doc)
        else:
            testData.append(doc)

In [3]:
# Create dataframes
dataTrain = pd.DataFrame(trainData)
dataTest = pd.DataFrame(testData)

# Create csvs (written before any NaN filling, so they hold the raw values)
dataTrain.to_csv("Data Train", index=False)
dataTest.to_csv("Data Test", index=False)

# Replace Na values with 0 to avoid error with model 
dataTrain = dataTrain.fillna(0)

# Non numeric values and target column 
dropColumns = ['_id', 'Tm', 'Finals Made']

# Training and testing data
outputs_train = dataTrain['Finals Made']
inputs_train = dataTrain.drop(dropColumns, axis=1)

outputs_test = dataTest['Finals Made']
# The test features get the same NaN -> 0 treatment as the training frame;
# otherwise a single missing value makes predict_proba fail. dataTest itself
# is left untouched so later cells still see the raw values.
inputs_test = dataTest.drop(dropColumns, axis=1).fillna(0)

# Delete columns that are not contained in all data sets.
# This is done by column NAME rather than by position, so it does not depend
# on the order in which the documents happened to arrive.

# 1) Stray spellings of EXP (e.g. with surrounding whitespace) that were not
#    normalised to the canonical 'EXP' column.
strayTrain = [c for c in inputs_train.columns if c != 'EXP' and str(c).strip() == 'EXP']
strayTest = [c for c in inputs_test.columns if c != 'EXP' and str(c).strip() == 'EXP']
inputs_train = inputs_train.drop(columns=strayTrain)
inputs_test = inputs_test.drop(columns=strayTest)

# 2) The 'T' column is dropped from both frames whenever it is present.
inputs_train = inputs_train.drop(columns=['T'], errors='ignore')
inputs_test = inputs_test.drop(columns=['T'], errors='ignore')

# 3) Keep only the features present in BOTH frames, in the same order, so the
#    model is fitted and evaluated on an identical feature layout.
sharedColumns = [c for c in inputs_train.columns if c in inputs_test.columns]
inputs_train = inputs_train[sharedColumns]
inputs_test = inputs_test[sharedColumns]

In [4]:
# Fit a logistic-regression classifier on the training seasons (15-23)
model = LogisticRegression(max_iter=11000)
model.fit(inputs_train, outputs_train)

# predict_proba yields one probability column per class for every test row;
# the second column is taken as the probability of class 1 (finals made).
predicted_probs = model.predict_proba(inputs_test)
probs_of_1 = predicted_probs[:, 1]

# Working copy of the test frame carrying the probability and the original
# row index alongside the team information.
results_df = dataTest.assign(Prob=probs_of_1, Index=dataTest.index)

# Split by conference flag (1 = AFC, 0 = NFC)
is_afc = results_df['Conference'] == 1
is_nfc = results_df['Conference'] == 0
afc_df = results_df.loc[is_afc]
nfc_df = results_df.loc[is_nfc]

# Two most likely teams per conference, highest probability first
afc_ranked = afc_df.sort_values(by='Prob', ascending=False)
nfc_ranked = nfc_df.sort_values(by='Prob', ascending=False)
top_2_afc = afc_ranked.head(2)
top_2_nfc = nfc_ranked.head(2)

# AFC picks first, then NFC picks
top_4_teams = pd.concat([top_2_afc, top_2_nfc])

# Report only team name, conference and probability
print("Top 2 predicted teams from AFC and NFC:")
print(top_4_teams[['Tm', 'Conference', 'Prob']])

Top 2 predicted teams from AFC and NFC:
                      Tm  Conference      Prob
29  Los Angeles Chargers           1  0.595441
16         Buffalo Bills           1  0.002245
0    Philadelphia Eagles           0  0.999064
5      Minnesota Vikings           0  0.005010


In [5]:
# Calculate accuracy 
from sklearn.metrics import roc_auc_score

# Actual Super Bowl teams (true labels)
actual_teams = dataTest.loc[outputs_test == 1, 'Tm'].tolist()

# Predicted top 4 teams from model (top 2 per conference)
predicted_teams = top_4_teams['Tm'].tolist()

# Count how many of the predicted teams are correct
actual_team_set = set(actual_teams)
correct = sum(1 for team in predicted_teams if team in actual_team_set)

print(f"number correct: {correct}")
# The reported "accuracy" is the share of actual finalists that appear among
# the predicted teams. Divide by the real number of finalists instead of a
# hard-coded 2, and avoid a ZeroDivisionError when the test set has none.
hit_rate = correct / len(actual_teams) if actual_teams else 0.0
print(f"accuracy: {hit_rate}")

# ROC AUC is undefined unless both classes occur in the true labels, so that
# case is reported as NaN rather than passed to roc_auc_score.
if outputs_test.nunique() < 2:
    print("ROC AUC undefined: test labels contain a single class")
    roc_auc = float("nan")
else:
    roc_auc = roc_auc_score(outputs_test, probs_of_1)
print(roc_auc)

number correct: 1
accuracy: 0.5
0.8833333333333333
