# XGBoost

This notebook implements an XGBoost model and measures its loss and classification metrics on the train, validation, and test sets as part of developing our models.

In [0]:
%run /Workspace/Repos/anthony.m.quagliata@vanderbilt.edu/NFL-Capstone/03-Models/Model_Evaluation_Functions

## Read Data and Import Libraries

In [0]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
from pyspark.sql import SparkSession
# Notebook-wide configuration.
# Suppress every warning for the rest of the session (one call is enough;
# the filter stays installed until it is reset).
warnings.filterwarnings("ignore")
# Display all columns when a DataFrame is rendered instead of truncating.
pd.set_option('display.max_columns', None)

In [0]:
# Mount point of the NFL data. File names are appended directly to this
# string, so the trailing slash is required.
directory = "/dbfs/mnt/nfl/"

# Load every CSV into a DataFrame named after its file, in this order.
games, players, plays, tackles, train, val, test = (
    pd.read_csv(f"{directory}{table}.csv")
    for table in ("games", "players", "plays", "tackles", "train", "val", "test")
)

In [0]:
# Preview the first rows of the training split as a quick sanity check.
train.head()

## XGBoost Model

In [0]:
#import libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from hyperopt import hp, tpe, fmin, Trials
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterSampler




# Identifier columns and both tackle target columns; none of them may be
# used as model features.
cols_to_remove = ["Week", "gameId", "playId", "nflId", "frameId", "tackle_single", "tackle_multiple"]

# Feature matrices: everything except the identifier/target columns and the
# 'gamePlayId' grouping key. The target for this model is 'tackle_multiple'.
x_train = train.drop(columns=cols_to_remove + ["gamePlayId"])
y_train = train["tackle_multiple"]
x_val = val.drop(columns=cols_to_remove + ["gamePlayId"])
y_val = val["tackle_multiple"]
x_test = test.drop(columns=cols_to_remove + ["gamePlayId"])
y_test = test["tackle_multiple"]

# Number of rows per play. DMatrix interprets `group` positionally, as sizes
# of consecutive row blocks, so the sizes must be listed in the order the
# plays appear in each frame. sort=False keeps first-appearance order; the
# default (sort=True) orders by key, which only lines up with the rows when
# the frame happens to be sorted by gamePlayId.
# Assumes all rows of a play are contiguous in each frame - TODO confirm.
# NOTE(review): per the XGBoost docs, group information is consumed by the
# ranking objectives; with 'binary:logistic' it is presumably unused - confirm
# whether it is needed at all.
group_sizes_train = train.groupby('gamePlayId', sort=False).size().tolist()
group_sizes_val = val.groupby('gamePlayId', sort=False).size().tolist()
group_sizes_test = test.groupby('gamePlayId', sort=False).size().tolist()

# Convert data into DMatrix format (XGBoost's internal data structure)
dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True, group=group_sizes_train)
dval = xgb.DMatrix(x_val, label=y_val, enable_categorical=True, group=group_sizes_val)
dtest = xgb.DMatrix(x_test, label=y_test, enable_categorical=True, group=group_sizes_test)

In [0]:
# Booster parameters handed to xgb.train.
# The number of boosting rounds is deliberately not set here: the native
# xgb.train API takes it through its num_boost_round argument, and an
# 'n_estimators' key (a scikit-learn wrapper parameter) is not used by it.
params = {
    'objective': 'binary:logistic',  # for binary classification
    'eval_metric': 'logloss',  # use logloss for binary classification problems todo: test between logloss, error, auc
    'max_depth': 4,  # maximum depth of the tree
    'eta': 0.05,  # learning rate
    'subsample': 0.9,  # fraction of samples used for fitting the trees
    'colsample_bytree': 0.9,  # fraction of features used for fitting the trees
    'gamma': 0.2,  # minimum loss reduction required to make a further split
    'min_child_weight': 5,  # minimum sum of instance weight (hessian) needed in a child
    'scale_pos_weight': 5  # weight of the positive class relative to the negative class
}

In [0]:
# Upper bound on boosting rounds, and how many rounds without improvement
# are tolerated before training stops early.
num_rounds = 1000
early_stopping_rounds = 10

# Datasets whose metric is reported every round. Per the XGBoost docs, the
# last entry (the validation set here) is the one early stopping monitors.
evals = [(dtrain, 'train'), (dval, 'validation')]

# Fit the booster with the native training API.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
)

In [0]:
# xgb.train returns the booster from the last boosting round, which after
# early stopping includes the rounds trained past the best validation score.
# Per the XGBoost prediction docs, the native Booster.predict uses the full
# model unless iteration_range is given, so restrict prediction to the best
# iteration. (0, 0) is the documented "use every tree" default and is the
# fallback when the booster carries no best_iteration.
best_iteration = getattr(model, "best_iteration", None)
iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

# Predicted probability of the positive class for every row of each split.
probabilities_train = model.predict(dtrain, iteration_range=iteration_range)
probabilities_val = model.predict(dval, iteration_range=iteration_range)
probabilities_test = model.predict(dtest, iteration_range=iteration_range)

In [0]:
# Log loss of the predicted probabilities on the training split.
# NOTE(review): log_loss is not imported in this notebook; presumably it is
# provided by the %run'd Model_Evaluation_Functions notebook - verify.
log_loss(y_train, probabilities_train)

In [0]:
# Log loss of the predicted probabilities on the validation split.
# NOTE(review): log_loss is not imported in this notebook; presumably it is
# provided by the %run'd Model_Evaluation_Functions notebook - verify.
log_loss(y_val, probabilities_val)

In [0]:
# Log loss of the predicted probabilities on the held-out test split.
# NOTE(review): log_loss is not imported in this notebook; presumably it is
# provided by the %run'd Model_Evaluation_Functions notebook - verify.
log_loss(y_test, probabilities_test)

In [0]:
# ROC plot for the training split. plotROC is not defined in this notebook;
# presumably it comes from the %run'd Model_Evaluation_Functions notebook.
plotROC(y_train, probabilities_train)

In [0]:
# ROC plot for the validation split. plotROC is not defined in this notebook;
# presumably it comes from the %run'd Model_Evaluation_Functions notebook.
plotROC(y_val, probabilities_val)

In [0]:
# ROC plot for the test split. plotROC is not defined in this notebook;
# presumably it comes from the %run'd Model_Evaluation_Functions notebook.
plotROC(y_test, probabilities_test)

In [0]:
# One Youden's J value per split, computed from that split's labels and
# predicted probabilities. These values are later passed as the `threshold`
# argument of binaryClfMetrics.
youdens_j_value_train, youdens_j_value_val, youdens_j_value_test = (
    youdens_j(y_true, y_prob)
    for y_true, y_prob in (
        (y_train, probabilities_train),
        (y_val, probabilities_val),
        (y_test, probabilities_test),
    )
)

In [0]:
# Binary classification metrics on the training split, using the threshold
# that youdens_j derived from the training split.
binaryClfMetrics(y_train, probabilities_train, threshold=youdens_j_value_train)

In [0]:
# Binary classification metrics on the validation split, using the threshold
# that youdens_j derived from the validation split.
binaryClfMetrics(y_val, probabilities_val, threshold=youdens_j_value_val)

In [0]:
# Binary classification metrics on the held-out test split, evaluated at the
# threshold selected on the validation split. Choosing the threshold from the
# test labels themselves (youdens_j_value_test) would tune the classifier on
# the data it is being scored on and make these metrics optimistic.
binaryClfMetrics(y_test, probabilities_test, threshold=youdens_j_value_val)

In [0]:
# Play-level evaluation on the training split. The helper receives the full
# `train` frame (identifier columns included) alongside the row-aligned
# predicted probabilities. It is defined outside this notebook, presumably
# in the %run'd Model_Evaluation_Functions notebook.
highest_avg_acc_per_play(train, probabilities_train)

In [0]:
# Play-level evaluation on the validation split. The helper receives the full
# `val` frame (identifier columns included) alongside the row-aligned
# predicted probabilities. It is defined outside this notebook, presumably
# in the %run'd Model_Evaluation_Functions notebook.
highest_avg_acc_per_play(val, probabilities_val)

In [0]:
# Play-level evaluation on the test split. The helper receives the full
# `test` frame (identifier columns included) alongside the row-aligned
# predicted probabilities. It is defined outside this notebook, presumably
# in the %run'd Model_Evaluation_Functions notebook.
highest_avg_acc_per_play(test, probabilities_test)

In [0]:
# Frame-level tackle accuracy on the training split, computed from the full
# `train` frame and its predicted probabilities. acc_frame_tackle is defined
# outside this notebook, presumably in Model_Evaluation_Functions.
acc_frame_tackle(train,probabilities_train)

In [0]:
# Frame-level tackle accuracy on the validation split, computed from the full
# `val` frame and its predicted probabilities. acc_frame_tackle is defined
# outside this notebook, presumably in Model_Evaluation_Functions.
acc_frame_tackle(val,probabilities_val)

In [0]:
# Frame-level tackle accuracy on the test split, computed from the full
# `test` frame and its predicted probabilities. acc_frame_tackle is defined
# outside this notebook, presumably in Model_Evaluation_Functions.
acc_frame_tackle(test, probabilities_test)

## Feature Importance

In [0]:
import matplotlib.pyplot as plt

# Optional human-readable names for the chart, keyed by the model's actual
# feature (column) names, e.g. {"<column name>": "BC Distance"}. Looking
# labels up by feature name keeps every bar attached to the right feature;
# a positional list of labels silently mislabels bars whenever the importance
# ranking changes, and breaks when fewer than 15 features are returned.
feature_display_names = {}

# 'weight' importance: how many times each feature is used to split across
# all trees. Per the XGBoost docs, features never used in a split are omitted
# from the result, so fewer than 15 entries may come back.
importance = model.get_score(importance_type='weight')
if not importance:
    raise ValueError("The model reports no feature importances (no splits were made).")

# Normalize the feature importances so they sum to 1
sum_values = sum(importance.values())
importance_normalized = {k: v / sum_values for k, v in importance.items()}

# Rank features from most to least important and keep at most the top 15
sorted_importance = sorted(importance_normalized.items(), key=lambda x: x[1], reverse=True)
top_15_features = sorted_importance[:15]

# Unpacking the lists of features and their corresponding importances
features, importances = zip(*top_15_features)

# Axis labels come from the ranked feature names themselves (with an optional
# display-name override), so label and bar can never drift apart.
labels = [feature_display_names.get(name, name) for name in features]

# Creating the bar chart
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(labels, importances, color='brown')
ax.set_xlabel('Importance')
ax.set_title(f'Top {len(labels)} Feature Importances')
ax.invert_yaxis()  # Invert the y-axis to have the highest value at the top
if any(name in feature_display_names for name in features):
    # The abbreviation legend only applies when display names are in use.
    fig.text(0.8, 0.0, 'NOTE: "BC" stands for Ball Carrier, "D" stands for Defender, and "OOB" stands for Out Of Bounds', ha='right', fontsize=10, color='black')
plt.show()


## Model Visualizations

In [0]:
# Only the week 9 tracking file is loaded; it is the tracking frame passed to
# animate_play_probs below. The other weekly files follow the same
# tracking_week_<n>.csv naming under `directory` and can be read the same way
# and combined with pd.concat(..., axis=0).reset_index(drop=True) if needed.
tracking_9 = pd.read_csv(f"{directory}tracking_week_9.csv")

In [0]:
# List the games in which PHI is the visiting team (presumably to look up a
# gameId for the play animation).
games[games["visitorTeamAbbr"]=="PHI"]

In [0]:
# Plays from game 2022110300 on which A.J. Brown is the ball carrier
# (presumably to pick a playId for the play animation).
plays[(plays["gameId"]==2022110300) & (plays["ballCarrierDisplayName"] == "A.J. Brown")]

In [0]:
# Animate play 968 of game 2022110300 using the week 9 tracking data and the
# training-split predicted probabilities. animate_play_probs is defined
# outside this notebook (presumably in Model_Evaluation_Functions).
# NOTE(review): this passes `train` and probabilities_train, so the play is
# assumed to be in the training split - confirm.
animate_play_probs(games,tracking_9,plays,players,train,probabilities_train,2022110300,968).show() 