In [None]:
from pymongo import MongoClient
import pandas as pd
from sklearn.linear_model import LogisticRegression

mongo_uri = "mongodb+srv://liz:<password>@cluster65664.az03p.mongodb.net/"

client = MongoClient(mongo_uri)

db = client['Project-CSE-482']

# Use season 24
testSeason = ['AFC 24', 'NFC 24'] 

#All teams that made it to the superbowl from 15-24
superBowlTeams = {'AFC 15' : "Denver Broncos", 'NFC 15' : "Carolina Panthers", 'AFC 16' : "New England Patriots", 'NFC 16' : "Los Angeles Rams", 'AFC 17' : "New England Patriots", 'NFC 17' : "Philadelphia Eagles",
                 'AFC 18' : "New England Patriots", 'NFC 18' : "Los Angeles Rams", 'AFC 19' : "Kansas City Chiefs", 'NFC 19' : "San Francisco 49ers", 'AFC 20' : "Kansas City Chiefs", 'NFC 20' : "Tampa Bay Buccaneers", 'AFC 21' : "Cincinnati Bengals", 'NFC 21' : "Los Angeles Rams",
                 'AFC 22' : "Kansas City Chiefs", 'NFC 22' : "Philadelphia Eagles", 'AFC 23' : "Kansas City Chiefs", 'NFC 23' : "San Francisco 49ers", 'AFC 24' : "Kansas City Chiefs", 'NFC 24' : "Philadelphia Eagles"}
            
trainData = []
testData = []

collectionNames = db.list_collection_names()
# Seperate the training(seasons 15-23) from testing(season 24)
for file in collectionNames:
    if file != 'Final Data':
        con = file[:3]
        year = file[-2:]

        collection = db[file]
        for doc in collection.find():
            # Get rid of these columns if present
            doc.pop('3rdDown%', None)
            doc.pop('4thDown%', None)

            # Add year and conference column 
            # Conference: AFC = 1, NFC = 0
            doc['Year'] = year
            doc["Conference"] = 1 if con == "AFC" else 0

            # Add finals made column 
            superBowlTeam = superBowlTeams[con + " " + year]
            doc['Finals Made'] = 1 if doc['Tm'] == superBowlTeam else 0 

            # Either add to train or test set
            if file not in testSeason:
                trainData.append(doc)
            else:
                testData.append(doc)

In [93]:
# Create dataframes
dataTrain = pd.DataFrame(trainData)
dataTest = pd.DataFrame(testData)

# Create csvs
dataTrain.to_csv("Data Train", index=False)
dataTest.to_csv("Data Test", index=False)

# Replace Na values with 0 to avoid error with model 
dataTrain = dataTrain.fillna(0)

teamInfoList = ['Tm', 'Year', 'Conference']
teamInfoTrain = dataTrain[teamInfoList]
teamInfoTest = dataTest[teamInfoList]

# Non numeric values and target column 
dropColumns = ['_id', 'Tm', 'Finals Made']

# Training and testing data
outputs_train = dataTrain['Finals Made']
inputs_train = dataTrain.drop(dropColumns, axis=1)

outputs_test = dataTest['Finals Made']
inputs_test = dataTest.drop(dropColumns, axis=1)

# COULD EVENTUALLY BE DELETED 
# Extra columns in training set not needed 
if 'EXP' in inputs_train.columns:
    print(1)
    inputs_train = inputs_train.drop('EXP', axis=1)
    
if 'T' in inputs_train.columns:
    print(2)
    inputs_train = inputs_train.drop('T', axis=1)
    
# Extra columns in training set not needed 
if 'EXP' in inputs_test.columns:
    print(3)
    inputs_test = inputs_test.drop('EXP', axis=1)
    
if 'T' in inputs_test.columns:
    print(4)
    inputs_test = inputs_test.drop('T', axis=1)

1
2
Train:
W                       11
L                        6
W-L%                 0.647
PF                     483
PA                     289
PD                     194
MoV                   11.4
SoS                   -1.6
SRS                    9.8
OSRS                   5.0
DSRS                   4.8
\n EXP\n            216.68
1stD                   398
FL                       6
Passing 1stD           236
Passing Att            655
Passing Cmp            415
Passing Int             16
Passing NY/A           6.3
Passing TD              36
Passing Yds           4284
Penalties 1stPy         28
Penalties Pen          113
Penalties Yds          980
Rushing 1stD           134
Rushing Att            461
Rushing TD              20
Rushing Y/A            4.8
Rushing Yds           2209
Sc%                   45.2
TO%                   11.3
Tot Yds & TO Ply      1143
Tot Yds & TO TO         22
Tot Yds & TO Y/P       5.7
Yds                   6493
Year                    21
Conference       

In [94]:
# Create model and train on seasons 15-23
model = LogisticRegression(max_iter=10000)
model.fit(inputs_train, outputs_train)

# Get the predicted probabilities of model choosing 0 or 1
predicted_probs = model.predict_proba(inputs_test)

# Predicted probabilites of model choosing 1 for every team in test data
probs_of_1 = predicted_probs[:, 1]

# Return list of indices from smallest to highest prob
indices_in_order = probs_of_1.argsort()

# Get the last 4 probs(greatest probs), then reverse the order so the highest prob is first
top_4_indices = indices_in_order[-4:][::-1]

# Get the top 4 teams rows with just the team names
top_4_teams = dataTest.iloc[top_4_indices]['Tm']

print("Top 4 predicted teams:")
print(top_4_teams)

Top 4 predicted teams:
28       Kansas City Chiefs
5         Minnesota Vikings
0       Philadelphia Eagles
1     Washington Commanders
Name: Tm, dtype: object
