<a href="https://colab.research.google.com/github/bhaveshasasik/nfl_game_predictor/blob/main/Random_forest_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import os

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

def preprocess_all_files(file_paths):
    all_data = []
    for file_path in file_paths:
        data = pd.read_csv(file_path)
        data = data[data['Unnamed: 0'] != 'Week']
        data.rename(columns={
            'Score': 'Team_Points',
            'Score.1': 'Opponent_Points',
            'Offense': 'Team_TotalYards',
            'Offense.1': 'Team_PassYards',
            'Offense.2': 'Team_RushYards',
            'Offense.3': 'Team_Turnovers',
            'Defense': 'Opponent_TotalYards',
            'Defense.1': 'Opponent_PassYards',
            'Defense.2': 'Opponent_RushYards',
            'Defense.3': 'Opponent_Turnovers'
        }, inplace=True)
        numeric_cols = [
            'Team_Points', 'Opponent_Points', 'Team_TotalYards', 'Team_PassYards',
            'Team_RushYards', 'Team_Turnovers', 'Opponent_TotalYards',
            'Opponent_PassYards', 'Opponent_RushYards', 'Opponent_Turnovers'
        ]
        for col in numeric_cols:
            data[col] = pd.to_numeric(data[col], errors='coerce')
        data = data.dropna(subset=numeric_cols)
        data['Team'] = file_path.split('/')[-1].split('_')[0]
        data['Home_Field'] = (data['Unnamed: 0'] == 'Home').astype(int)  # Add home-field advantage as binary
        all_data.append(data)
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

def train_and_evaluate_model(combined_data):
    features = ['Team_TotalYards', 'Team_PassYards', 'Team_RushYards', 'Team_Turnovers',
                'Opponent_TotalYards', 'Opponent_PassYards', 'Opponent_RushYards', 'Opponent_Turnovers', 'Home_Field']
    combined_data['Outcome'] = (combined_data['Team_Points'] > combined_data['Opponent_Points']).astype(int)
    X = combined_data[features]
    y = combined_data['Outcome']

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Print metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
    #print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return model, combined_data, scaler

def predict_matchup(model, combined_data, scaler, team_a, team_b, home_team):
    team_a_stats = combined_data[combined_data['Team'] == team_a].iloc[0][
        ['Team_TotalYards', 'Team_PassYards', 'Team_RushYards', 'Team_Turnovers']
    ].tolist()
    team_b_stats = combined_data[combined_data['Team'] == team_b].iloc[0][
        ['Team_TotalYards', 'Team_PassYards', 'Team_RushYards', 'Team_Turnovers']
    ].tolist()
    home_field = 1 if home_team == team_a else 0

    matchup_stats = pd.DataFrame([team_a_stats + team_b_stats + [home_field]], columns=[
        'Team_TotalYards', 'Team_PassYards', 'Team_RushYards', 'Team_Turnovers',
        'Opponent_TotalYards', 'Opponent_PassYards', 'Opponent_RushYards', 'Opponent_Turnovers', 'Home_Field'
    ])
    matchup_stats = scaler.transform(matchup_stats)
    prediction = model.predict(matchup_stats)
    probability = model.predict_proba(matchup_stats)[0][1]
    outcome = team_a if prediction[0] == 1 else team_b
    print(f"Predicted Winner: {outcome}")
    print(f"Confidence: {probability:.2f}")
    return outcome, probability

file_paths = [
    'bears_data.csv',
    'bengals_data.csv',
    'bills_data.csv',
    'broncos_data.csv',
    'browns_data.csv',
    'buccaneers_data.csv',
    'cardinals_data.csv',
    'chargers_data.csv',
    'chiefs_data.csv',
    'colts_data.csv',
    'commanders_data.csv',
    'cowboys_data.csv',
    'dolphins_data.csv',
    'eagles_data.csv',
    'falcons_data.csv',
    'giants_data.csv',
    'jaguars_data.csv',
    'jets_data.csv',
    'lions_data.csv',
    'packers_data.csv',
    'panthers_data.csv',
    'patriots_data.csv',
    'raiders_data.csv',
    'rams_data.csv',
    'ravens_data.csv',
    'saints_data.csv',
    'seahawks_data.csv',
    'steelers_data.csv',
    'texans_data.csv',
    'titans_data.csv',
    'vikings_data.csv',
    '49ers_data.csv'
]

# Preprocess all files and train model
combined_data = preprocess_all_files(file_paths)
model, combined_data, scaler = train_and_evaluate_model(combined_data)


# Predict matchup
team_a = "texans"
team_b = "steelers"
home_team = "steelers"  # Specify which team has home-field advantage
predict_matchup(model, combined_data, scaler, team_a, team_b, home_team)


Accuracy: 0.6235294117647059
Precision: 0.6309523809523809
Recall: 0.6162790697674418
F1 Score: 0.6235294117647059
ROC AUC Score: 0.7323504983388704
Predicted Winner: steelers
Confidence: 0.25


('steelers', 0.25)