# In [None]:
import os

import pandas as pd
from pymongo import MongoClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Connection string for the project's MongoDB cluster. Set the MONGODB_URI
# environment variable to supply real credentials without editing this file;
# the fallback is the original literal, which still contains the
# "<password>" placeholder and must be filled in before it can authenticate.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://julia:<password>@cluster65664.az03p.mongodb.net/",
)
client = MongoClient(MONGODB_URI)
# Database whose AFC*/NFC* collections are read by load_data().
db = client['Project-CSE-482']

# Two-digit season suffix (the last token of a collection name, see
# load_data) mapped to the four-digit year, for seasons 2014 through 2024.
YEAR_MAP = {suffix: 2000 + suffix for suffix in range(14, 25)}

# Season year -> the two teams that played in that season's Super Bowl.
# load_data() only tests membership in each list, so every entry must name
# two distinct teams, spelled exactly as in the documents' 'Tm' field.
SUPER_BOWL_TEAMS = {
    2014: ['New England Patriots', 'Seattle Seahawks'],
    2015: ['Denver Broncos', 'Carolina Panthers'],
    2016: ['New England Patriots', 'Atlanta Falcons'],
    2017: ['Philadelphia Eagles', 'New England Patriots'],
    2018: ['New England Patriots', 'Los Angeles Rams'],
    2019: ['Kansas City Chiefs', 'San Francisco 49ers'],
    2020: ['Tampa Bay Buccaneers', 'Kansas City Chiefs'],
    2021: ['Los Angeles Rams', 'Cincinnati Bengals'],
    2022: ['Kansas City Chiefs', 'Philadelphia Eagles'],
    2023: ['Kansas City Chiefs', 'San Francisco 49ers'],
    # Was ['Kansas City Chiefs', 'Kansas City Chiefs']: the duplicate left
    # the NFC participant of the 2024 season unlabeled.
    2024: ['Philadelphia Eagles', 'Kansas City Chiefs'],
}

# Load per-team season statistics from MongoDB.
def load_data():
    """Load every team-season document from the AFC/NFC collections.

    A collection is read only when its name starts with ``AFC`` or ``NFC``
    and its last whitespace-separated token is an integer key of
    ``YEAR_MAP``. Every other collection is skipped, including ones whose
    last token is not a number (which previously raised ``ValueError`` and
    aborted the whole load).

    Each document becomes one row containing:

    * ``team`` (the document's ``Tm`` field), ``conference`` (first token of
      the collection name) and ``year`` (looked up in ``YEAR_MAP``);
    * every other document field, converted to ``float`` when its string
      form parses as a number once double quotes and commas are removed,
      otherwise stored unchanged;
    * ``made_sb``: 1 if the team is listed in ``SUPER_BOWL_TEAMS`` for that
      year, else 0.

    Returns:
        pandas.DataFrame: one row per document read; empty if no collection
        qualifies.

    Raises:
        KeyError: if a document has no ``Tm`` field.
    """
    # NOTE(review): every field except 'Tm' goes through the numeric
    # coercion below, including Mongo's '_id'. Confirm it never parses as a
    # number, otherwise it would be picked up as a model feature.
    # NOTE(review): the made_sb label relies on 'Tm' matching the names in
    # SUPER_BOWL_TEAMS exactly -- verify the stored names carry no markers.
    rows = []
    for col in db.list_collection_names():
        if not col.startswith(('AFC', 'NFC')):
            continue

        name_parts = col.split()
        try:
            year = YEAR_MAP.get(int(name_parts[-1]))
        except ValueError:
            # No numeric season suffix, so there is no year to map it to.
            continue
        if not year:
            continue

        # Both values are constant for the whole collection.
        conference = name_parts[0]
        sb_teams = SUPER_BOWL_TEAMS.get(year, [])

        for team in db[col].find():
            name = team['Tm']
            row = {'team': name, 'conference': conference, 'year': year}
            for key, value in team.items():
                if key == 'Tm':
                    continue
                try:
                    # Strip quoting and thousands separators before parsing.
                    row[key] = float(str(value).replace('"', '').replace(',', ''))
                except (ValueError, TypeError):
                    # Not numeric: keep the raw value as stored.
                    row[key] = value
            row['made_sb'] = 1 if name in sb_teams else 0
            rows.append(row)

    return pd.DataFrame(rows)

def train_model(df, *, n_estimators=500, random_state=None):
    """Fit a random forest that predicts the ``made_sb`` label.

    Features are all columns of ``df`` with a numeric dtype, except the
    bookkeeping columns ``team``, ``conference``, ``year`` and the label
    ``made_sb`` itself.

    Args:
        df: DataFrame containing a ``made_sb`` column plus feature columns.
        n_estimators: number of trees in the forest (default 500, the value
            that used to be hard-coded).
        random_state: seed forwarded to ``RandomForestClassifier``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to make training reproducible.

    Returns:
        tuple: ``(model, features)`` -- the fitted classifier and the list
        of feature column names, in the order used for fitting.

    Raises:
        ValueError: if ``df`` has no numeric feature column.
    """
    exclude = {'team', 'conference', 'year', 'made_sb'}
    features = [
        col for col in df.columns
        if col not in exclude and pd.api.types.is_numeric_dtype(df[col])
    ]
    if not features:
        # Fail with an actionable message before the estimator is built.
        raise ValueError("train_model needs at least one numeric feature column")

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        # Reweight classes inversely to their frequency in the labels.
        class_weight='balanced',
        random_state=random_state,
    )
    model.fit(df[features], df['made_sb'])
    return model, features

def predict_super_bowl(year_to_predict, data=None):
    """Rank each conference's teams by predicted Super Bowl probability.

    The model is trained on every season except ``year_to_predict`` and then
    scores that season's teams. Training on the target season as well (the
    previous behaviour) let the forest see the very labels it was asked to
    predict.

    Args:
        year_to_predict: four-digit season year to score.
        data: optional DataFrame in the shape returned by ``load_data()``.
            When omitted, the data is loaded from MongoDB.

    Returns:
        dict with keys ``year``, ``AFC_contenders``, ``NFC_contenders``
        (up to two team names per conference, highest probability first) and
        ``AFC_probabilities``, ``NFC_probabilities`` (the matching
        probabilities rounded to 2 decimal places).

    Raises:
        ValueError: if there are no rows for ``year_to_predict``.
    """
    all_data = load_data() if data is None else data

    in_target_year = all_data['year'] == year_to_predict
    prediction_data = all_data[in_target_year].copy()
    if prediction_data.empty:
        # Fail before spending time on training.
        raise ValueError(f"No team data available for year {year_to_predict}")

    # Hold the target season out of the training set.
    training_data = all_data[~in_target_year]
    if training_data.empty:
        # Only one season is available, so nothing can be held out; fall
        # back to training on it rather than failing.
        training_data = all_data
    model, features = train_model(training_data)

    # predict_proba has one column per class in model.classes_; look up the
    # positive class instead of assuming it sits in column 1.
    classes = list(model.classes_)
    if 1 in classes:
        probabilities = model.predict_proba(prediction_data[features])
        prediction_data['sb_prob'] = probabilities[:, classes.index(1)]
    else:
        # The training rows contained no Super Bowl team at all.
        prediction_data['sb_prob'] = 0.0

    # Top two teams from each conference.
    afc = prediction_data[prediction_data['conference'] == 'AFC'].nlargest(2, 'sb_prob')
    nfc = prediction_data[prediction_data['conference'] == 'NFC'].nlargest(2, 'sb_prob')

    return {
        'year': year_to_predict,
        'AFC_contenders': afc['team'].tolist(),
        'NFC_contenders': nfc['team'].tolist(),
        'AFC_probabilities': [round(prob, 2) for prob in afc['sb_prob'].tolist()],
        'NFC_probabilities': [round(prob, 2) for prob in nfc['sb_prob'].tolist()],
    }

def evaluate_model(df, test_size=0.8, random_state=1):
    """Print and return hold-out accuracy for the Super Bowl classifier.

    ``df`` is split at random into train and test rows. The model is fitted
    on the train rows with ``train_model`` (so the evaluation always uses
    the same feature selection and hyper-parameters as prediction) and
    scored on the test rows.

    Args:
        df: DataFrame in the shape returned by ``load_data()``.
        test_size: fraction of rows held out for testing. The default 0.8
            is the original value: 80% of rows are tested, 20% are trained on.
        random_state: seed for the train/test split (default 1, as before).

    Returns:
        float: accuracy on the test rows; also printed as
        ``Accuracy: <value>`` with 4 decimal places.
    """
    # NOTE(review): training on only 20% of the rows is the reverse of the
    # usual 80/20 split -- confirm this is intended, or pass test_size=0.2.
    # NOTE(review): the split is seeded but the forest built by train_model
    # is not, so the printed accuracy can still vary between runs.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    model, features = train_model(train_df)

    # Predict whether each held-out team made the Super Bowl and compare
    # with the true labels.
    y_pred = model.predict(test_df[features])
    accuracy = accuracy_score(test_df['made_sb'], y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy
    

# Load everything once and report the classifier's hold-out accuracy.
all_data = load_data()
evaluate_model(all_data)

# Rank the contenders of a single season and print them per conference.
year = 2021
predictions = predict_super_bowl(year)

print(
    f"\nSuper Bowl {year} Predictions:",
    f"AFC: {predictions['AFC_contenders']} (Probabilities: {predictions['AFC_probabilities']})",
    f"NFC: {predictions['NFC_contenders']} (Probabilities: {predictions['NFC_probabilities']})",
    sep="\n",
)