In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [3]:
file_path = '../data/nfl-games2002-2022.csv'

# Read the game log and discard every column that contains a missing value.
data = pd.read_csv(file_path)
nfl_df = pd.DataFrame(data).dropna(axis=1)

# Map superseded franchise names onto the current ones so that a team is
# spelled the same way in every row, on both sides of the matchup.
franchise_renames = {
    'Washington Redskins': 'Washington Commanders',
    'Washington Football Team': 'Washington Commanders',
    'Oakland Raiders': 'Las Vegas Raiders',
}
for side in ('home', 'away'):
    nfl_df[side] = nfl_df[side].replace(franchise_renames)


def _home_outcome(margin):
    """Encode a home-minus-away score margin: 1 if positive, 0 if negative, else 0.5."""
    if margin > 0:
        return 1
    if margin < 0:
        return 0
    return 0.5


# The outcome is stored as a string so it is treated as a class label.
score_margin = nfl_df['score_home'] - nfl_df['score_away']
nfl_df['result_home'] = score_margin.apply(_home_outcome).astype(str)

def rate(col):
    """Return ``col[0] / col[1]`` after converting both items with ``int``.

    ``col`` is an indexable pair such as ``['3', '10']``. When the second
    item is zero the function returns 0 instead of dividing, and the first
    item is not parsed at all in that case.
    """
    attempts = int(col[1])
    if attempts == 0:
        return 0
    return int(col[0]) / attempts
def to_sec(col):
    """Convert a ``[minutes, seconds]`` pair into a total number of seconds.

    Both items are converted with ``int`` before being combined, so string
    parts such as ``['30', '15']`` are accepted.
    """
    minutes = int(col[0])
    seconds = int(col[1])
    return minutes * 60 + seconds

# Turn the "made-attempted" strings for third and fourth downs into numeric
# rates. The nesting order matches the order in which the new columns are
# appended: third home, third away, fourth home, fourth away.
for down in ('third', 'fourth'):
    for side in ('home', 'away'):
        source_col = f'{down}_downs_{side}'
        nfl_df[f'{source_col}_rate'] = nfl_df[source_col].str.split('-').apply(rate)

# Replace each "MM:SS" possession string with its total in seconds (in place).
for possession_col in ('possession_home', 'possession_away'):
    nfl_df[possession_col] = nfl_df[possession_col].str.split(':').apply(to_sec)

# The raw scores determine the label directly, so they are removed from the frame.
nfl_df = nfl_df.drop(columns=['score_home', 'score_away'])

In [60]:
# Target: the string-encoded outcome stored in `result_home`.
nfl_labels = nfl_df['result_home'].tolist()

# Predictors: a hand-picked subset of the columns in nfl_df.
# Alternative (every numeric column):
# nfl_features = nfl_df._get_numeric_data().columns.values.tolist()
nfl_features = ['rushing_attempts_away', 'rushing_attempts_home', 'turnovers_away', 'turnovers_home']
nfl_feature_data = nfl_df[nfl_features]

# Hold out 30% of the games for evaluation. The seed is pinned so that the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    nfl_feature_data,
    nfl_labels,
    test_size=0.3,
    random_state=42,
)

In [52]:
# List the selected feature columns, one name per line.
for feature_name in nfl_features:
    print(feature_name)

first_downs_away
first_downs_home
passing_yards_away
passing_yards_home
rushing_yards_away
rushing_yards_home
total_yards_away
total_yards_home
rushing_attempts_away
rushing_attempts_home
fumbles_away
fumbles_home
int_away
int_home
turnovers_away
turnovers_home
drives_away
drives_home
def_st_td_away
def_st_td_home
possession_away
possession_home
third_downs_home_rate
third_downs_away_rate
fourth_downs_home_rate
fourth_downs_away_rate


In [61]:
# Fit a 100-tree random forest on the training split. The seed is pinned
# because the forest's bootstrapping and feature sub-sampling are random;
# without it the accuracy and importances differ on every run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the fitted forest on the held-out games.
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f'Random Forest Model Accuracy: {rf_accuracy}')

Random Forest Model Accuracy: 0.780862374483166


In [62]:
# Pair each training column with the importance the fitted random forest
# assigned to it, ordered from most to least important.
feature_importances = rf_model.feature_importances_
importances_df = (
    pd.DataFrame({'Feature': nfl_features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show at most `top_n` rows; fewer are printed when the model has fewer features.
top_n = 10  # Change this value based on your preference
print(f'Top {top_n} most important features:')
print(importances_df.head(top_n))

Top 10 most important features:
                 Feature  Importance
0  rushing_attempts_away    0.394491
1  rushing_attempts_home    0.340937
2         turnovers_away    0.140637
3         turnovers_home    0.123935


In [63]:
# Hyper-parameters unpacked into GradientBoostingClassifier(**params).
params = {
    "n_estimators": 500,
    "learning_rate": 0.01,
    "loss": "log_loss",
    # Pinned seed so repeated fits on the same data give the same model.
    "random_state": 42,
}

In [64]:
# Build the gradient boosting classifier from `params` and fit it on the
# training split (`fit` returns the estimator itself).
gb_model = GradientBoostingClassifier(**params).fit(X_train, y_train)

# Predict the held-out games and report the share predicted correctly.
gb_predictions = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_predictions)
print(f'Gradient Boosting Model Accuracy: {gb_accuracy}')

Gradient Boosting Model Accuracy: 0.8251624335499114


In [65]:
# Importance scores reported by the fitted gradient boosting model,
# in the same order as the columns it was trained on.
feature_importances = gb_model.feature_importances_

# Tabulate name/score pairs and rank them, highest importance first.
importance_table = pd.DataFrame({'Feature': nfl_features, 'Importance': feature_importances})
importances_df = importance_table.sort_values(by='Importance', ascending=False)

# Print the leading rows of the ranking.
top_n = 10  # Change this value based on your preference
print(f'Top {top_n} most important features:')
print(importances_df.head(top_n))

Top 10 most important features:
                 Feature  Importance
0  rushing_attempts_away    0.434752
1  rushing_attempts_home    0.269411
2         turnovers_away    0.162482
3         turnovers_home    0.133356


In [66]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
import numpy as np

# Bootstrap the held-out accuracy of the gradient boosting model.
# Requires gb_model, X_test and y_test from the earlier cells.

# Set the number of bootstrap iterations
n_iterations = 1000

# Initialize an array to store the metric values
bootstrap_metrics = np.empty(n_iterations)

# The fitted model's prediction for a row does not depend on which other rows
# are passed with it, so predict once and resample (label, prediction) pairs.
# resample() draws the same row indices for all arrays it is given, so each
# replicate's accuracy equals what predicting on the resampled rows would give.
test_predictions = gb_model.predict(X_test)

# Perform bootstrapping
for i in range(n_iterations):
    # Seeding with the iteration index makes every replicate reproducible.
    y_bootstrap, bootstrap_predictions = resample(y_test, test_predictions, random_state=i)

    # Store the accuracy of this replicate
    bootstrap_metrics[i] = accuracy_score(y_bootstrap, bootstrap_predictions)

# Calculate the confidence interval (percentile method, central 95%)
confidence_interval = np.percentile(bootstrap_metrics, [2.5, 97.5])

# Display the results
print(f'Bootstrap Accuracy Mean: {np.mean(bootstrap_metrics)}')
print(f'95% Confidence Interval: {confidence_interval}')


Bootstrap Accuracy Mean: 0.824663910218547
95% Confidence Interval: [0.80626108 0.84171589]
