# **NFL Databowl 23 Basic Pass Prediction**

Uses boosted trees to predict whether a pass will be complete or incomplete, given an offensive personnel package and a defensive personnel package plus the coverage formation and coverage type.

Results: The feature-importance plot below shows that, for this dataset, the most important features are the dropback type, then the pass coverage, and then play action.

In [None]:
#Imported Stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
from sklearn.model_selection import cross_val_score
import pickle

'''
ML work flow:
1. Collect the data
2. Visualize the data
3. Clean the data
4. Train the model
5. Evaluate
6. Hyperparameter tuning
7. Choose the best model and prediction
see for more info, https://towardsdatascience.com/regression-analysis-for-beginners-using-tree-based-methods-2b65bd193a7#bb44


'''

# Switch matplotlib into interactive mode for the rest of the notebook.
plt.interactive(True) #interactive mode to display plots
#plt.savefig('myplot1.png') plot to a file instead

In [None]:
#Google Drive Access
import os

# The string below is the disabled Google Colab setup (mount Drive, chdir, and
# Drive-based data/save locations). It is a bare string expression, so none of
# it runs.
'''
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("//content/gdrive/MyDrive/nfl-big-data-bowl-2023/")

data_dir = "/content/gdrive/My Drive/nfl-big-data-bowl-2023/"
save_dir = "/content/gdrive/My Drive/nfl-big-data-bowl-2023/models/"
'''
#if on kaggle
# NOTE(review): despite its name, data_dir here is the path of the CSV file
# itself (it is handed directly to pd.read_csv), not a directory.
data_dir = "../input/play-predictor-data/play_predictor_data.csv"
# NOTE(review): OUTPUT_DIR is not referenced by any other visible cell - confirm
# whether the pickled model was meant to be written under it.
OUTPUT_DIR = './'

In [None]:
'''
with c,i,in: 7566
playAction = 1 and passResult = C returns 1116 of 4620 24.1% of C pass plays AVG gain 13.54yds per pass Action play
playAction = 0 and passResult = C returns 3504 of 4620 75.9%  AVG gain 11.08yds per pass play W/O play Action
playAction = 1 and passResult = I returns 621 of 2756 22.5% of I pass plays
playAction = 0 and passResult = I returns 2135 of 2756 77.4%
playAction = 1 and passResult = IN returns 41 of 190 (21.5% of INs) // slightly fewer interceptions on plays with playAction
playAction = 0 and passResult = IN returns 149 of 190 (78.4% of INs)

slightly less interceptions and incompletions with playAction  = 1
#analysis would need to be done on a team by team basis

general observation: OL should study Wing Chun techniques

combine data: https://nflcombineresults.com/nflcombinedata.php?year=2022&pos=OT&college=

understanding passing table in db: https://fantasydata.com/fantasy-football-advanced-metrics-explained

defensive fronts: https://www.playerprofiler.com/article/meet-the-metric-base-stacked-and-light-fronts/

General Football Analytic Coding: https://www.opensourcefootball.com/posts/2020-08-24-getting-into-sports-analytics/


'''

## **Exploratory Data Analysis (EDA)**

In [None]:

# Load the play data from the CSV at `data_dir`, then show a preview of the
# first rows followed by the (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=data_dir)
print(df.head(), df.shape, sep="\n")

In [None]:
# Report how many values are missing in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Summary statistics of the columns, printed before any rows are removed.
summary_statistics = df.describe()
print(summary_statistics)

# Keep only the rows without a single missing value and report how many remain.
df = df.dropna(axis=0, how='any')
print(df.shape[0])

In [None]:
# Convert every categorical (string) column to integer codes.
#
# Each entry maps a column name to (first_code, labels): the labels are numbered
# consecutively from first_code in the order listed, which reproduces the codes
# this notebook has always used. passResult is the only column coded from 0.
#
# Why not `df[col].replace(labels, codes, inplace=True)`: that call operates on
# the temporary Series returned by `df[col]`, so under pandas Copy-on-Write it
# never updates `df`. Assigning the encoded Series back to the column works in
# every pandas version. The earlier duplicated offenseFormation pass is gone too.
CATEGORY_LABELS = {
    'offenseFormation': (1, [
        'SHOTGUN', 'EMPTY', 'SINGLEBACK', 'I_FORM', 'JUMBO', 'PISTOL', 'WILDCAT',
    ]),
    'passResult': (0, ['C', 'I', 'S', 'R', 'IN']),
    'personnelO': (1, [
        '1 RB, 1 TE, 3 WR',
        '1 RB, 2 TE, 2 WR',
        '0 RB, 2 TE, 3 WR',
        '1 RB, 0 TE, 4 WR',
        '2 RB, 1 TE, 2 WR',
        '2 RB, 0 TE, 3 WR',
        '2 RB, 2 TE, 1 WR',
        '1 RB, 3 TE, 1 WR',
        '2 RB, 3 TE, 0 WR',
        '0 RB, 0 TE, 5 WR',
        '0 RB, 1 TE, 4 WR',
        '6 OL, 2 RB, 2 TE, 0 WR',
        '2 QB, 2 RB, 0 TE, 2 WR',
        '2 QB, 1 RB, 1 TE, 2 WR',
        '6 OL, 1 RB, 1 TE, 2 WR',
        '2 QB, 1 RB, 2 TE, 1 WR',
        '6 OL, 1 RB, 2 TE, 1 WR',
        '2 QB, 1 RB, 0 TE, 3 WR',
        '6 OL, 2 RB, 1 TE, 1 WR',
        '3 RB, 0 TE, 2 WR',
        '2 QB, 6 OL, 1 RB, 1 TE, 1 WR',
        '0 RB, 3 TE, 2 WR',
        '6 OL, 1 RB, 3 TE, 0 WR',
        '6 OL, 2 RB, 0 TE, 2 WR',
        '6 OL, 1 RB, 0 TE, 3 WR',
        '1 RB, 1 TE, 2 WR,1 LB',
        '1 RB, 4 TE, 0 WR',
        '2 QB, 2 RB, 1 TE, 1 WR',
        '2 QB, 1 RB, 3 TE, 0 WR',
        '7 OL, 1 RB, 0 TE, 2 WR',
    ]),
    'personnelD': (1, [
        '4 DL, 2 LB, 5 DB',
        '4 DL, 4 LB, 3 DB',
        '3 DL, 3 LB, 5 DB',
        '4 DL, 3 LB, 4 DB',
        '3 DL, 4 LB, 4 DB',
        '2 DL, 4 LB, 5 DB',
        '2 DL, 2 LB, 7 DB',
        '1 DL, 5 LB, 5 DB',
        '2 DL, 3 LB, 6 DB',
        '4 DL, 1 LB, 6 DB',
        '3 DL, 2 LB, 6 DB',
        '5 DL, 2 LB, 4 DB',
        '6 DL, 1 LB, 4 DB',
        '3 DL, 1 LB, 7 DB',
        '1 DL, 4 LB, 6 DB',
        '4 DL, 6 LB, 1 DB',
        '0 DL, 3 LB, 8 DB',
        '1 DL, 3 LB, 7 DB',
        '5 DL, 1 LB, 5 DB',
        '5 DL, 3 LB, 3 DB',
        '0 DL, 5 LB, 6 DB',
        '2 DL, 5 LB, 4 DB',
        '6 DL, 3 LB, 2 DB',
        '3 DL, 5 LB, 3 DB',
        '5 DL, 5 LB, 1 DB',
        '1 DL, 2 LB, 8 DB',
        '6 DL, 4 LB, 1 DB',
        '4 DL, 5 LB, 2 DB',
        '6 DL, 2 LB, 3 DB',
    ]),
    'dropBackType': (1, [
        'TRADITIONAL',
        'SCRAMBLE_ROLLOUT_RIGHT',
        'DESIGNED_ROLLOUT_RIGHT',
        'SCRAMBLE',
        'DESIGNED_ROLLOUT_LEFT',
        'UNKNOWN',
        'DESIGNED_RUN',
        'SCRAMBLE_ROLLOUT_LEFT',
    ]),
    'pff_passCoverage': (1, [
        'Cover-1',
        'Cover-3',
        'Cover-6',
        'Quarters',
        'Cover-2',
        '2-Man',
        'Cover-0',
        'Prevent',
        'Bracket',
        'Red Zone',
        'Miscellaneous',
        'Goal Line',
    ]),
    'pff_passCoverageType': (1, ['Man', 'Zone', 'Other']),
    'playDirection': (1, ['left', 'right']),
}

for column, (first_code, labels) in CATEGORY_LABELS.items():
    codes = {label: code for code, label in enumerate(labels, start=first_code)}
    # Values missing from the table are passed through unchanged, exactly as
    # Series.replace would have left them, so an unexpected label stays visible
    # in the data instead of silently turning into NaN.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))




In [None]:
# Keep every row but discard the first column (selected by position).
# NOTE(review): presumably the first column is a row index that was saved into
# the CSV - confirm against the file before relying on this.
df = df.iloc[: , 1:]

# Last expression of the cell: the notebook renders the first rows of df.
df.head()

In [None]:

# Print the pairwise correlation table of the columns, then draw seaborn's
# pair plot (one panel per pair of columns).
correlation_table = df.corr()
print(correlation_table)
sns.pairplot(data=df)  # plt.show()


In [None]:
# Heatmap of the pairwise correlations between the columns of df.
# Coefficients range from -1 to 1; values near either end mean a strong linear
# relationship, values near 0 mean little or none.
corrmat = df.corr()
top_features = corrmat.index
plt.figure(figsize=(20, 20))

# Plot the matrix computed above directly. df[top_features].corr() selected the
# very columns corrmat was built from and recomputed the identical matrix.
g = sns.heatmap(corrmat, annot=True, cmap="Blues")
plt.show()

In [None]:

# One histogram per column. DataFrame.hist opens its own figure sized by
# `figsize`, so no plt.figure() call precedes it: the extra call only produced a
# second, empty figure.
df.hist(figsize=(20, 20))
plt.show()

# **Train, Test, Split**

In [None]:
# Separate the predictors from the target: X is every column except passResult,
# y is the passResult column itself.
X = df.drop(columns=['passResult'])
y = df['passResult']

# Hold out 15% of the shuffled rows for testing; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

# Preview the first rows of the predictors and of the target.
print("X data head", X.head(), sep="\n")
print("Y data head", y.head(), sep="\n")


In [None]:

# Boosted-tree classifier for the integer-coded passResult target.
#
# The objective is declared as multiclass to agree with the rest of the
# notebook: passResult is encoded with five codes (0-4) and the evaluation
# metrics below ("merror", "mlogloss") are the multiclass error and log loss.
# The previous "binary:logistic" setting contradicted both.
# NOTE(review): recent xgboost releases expect class labels to be consecutive
# integers starting at 0 - confirm every passResult code survives dropna().
xgbc = XGBClassifier(max_depth=3,  # tree depth; larger values raise the risk of overfitting
                     subsample=0.5,  # fraction of observations randomly sampled for each tree
                     n_estimators=200,  # number of boosting rounds
                     objective="multi:softprob",
                     eval_metric=['merror', 'mlogloss'],
                     learning_rate=0.1,  # alias of eta: step-size shrinkage per update, typically .01-.2
                     min_child_weight=1,  # min sum of instance weights required in a child; high values can underfit, tune with CV
                     reg_alpha=0,  # L1 regularization term on weights; higher makes the model more conservative
                     reg_lambda=1  # L2 regularization term on weights; higher makes the model more conservative
                     )
#more info on hyperparams https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
print(xgbc)


# Fit on the training split, then predict both splits so train and test
# accuracy can be compared later.
xgbc.fit(X_train, y_train)
y_predict = xgbc.predict(X_test)
y_train_predict = xgbc.predict(X_train)


In [None]:

################## Evaluation of results ########################
# - cross validation on the training split (5 folds)
scores = cross_val_score(xgbc, X_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())


# K-Fold Cross Validation
#
# There is never enough data to train a model, and holding part of it out for
# validation risks underfitting: with less training data we may lose important
# patterns/trends, which increases the error induced by bias. What we want is a
# method that leaves ample data for training and ample data for validation;
# K-fold cross validation does exactly that.
#
# The data is divided into k subsets and the holdout method is repeated k
# times: each time one subset is the validation set and the other k-1 subsets
# form the training set. The error estimate is averaged over all k trials.
# Every data point is in a validation set exactly once and in a training set
# k-1 times. This reduces bias (most of the data is used for fitting) and
# variance (most of the data is also used for validation). As a general rule
# K = 5 or 10 is preferred, but nothing is fixed and it can take any value.
#
# sklearn provides the functionality for this cross check.
#
# The shuffle is seeded with the same value as train_test_split so the fold
# assignment, and therefore the reported score, is identical on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
kf_cv_scores = cross_val_score(xgbc, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())


In [None]:
# Predict the held-out split and tabulate true labels against predictions.
ypred = xgbc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=ypred)
print(cm)

# Accuracy Score is not the best way to measure the fit of the model, but it is
# reported for both splits so they can be compared.
train_accuracy = accuracy_score(y_train, y_train_predict)
print('train accuracy', train_accuracy)
test_accuracy = accuracy_score(y_test, ypred)
print('test accuracy', test_accuracy)


In [None]:

# Refit the classifier while recording the evaluation metrics on two datasets
# after every boosting round: the training split first, the test split second.
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
xgbc.fit(X_train, y_train, eval_set=evaluation_sets, verbose=False)

# Retrieve the recorded per-round metric history from the fitted model.
evals_result = xgbc.evals_result()


In [None]:
# Dump the raw metric history, then draw one learning-curve figure per metric.
print('Access complete dict:')
print(evals_result)

results = evals_result
# One entry was recorded per boosting round, so the length of any metric list
# gives the extent of the x axis.
epochs = len(results["validation_0"]["merror"])
x_axis = range(0, epochs)

# (metric key, y-axis label, figure title): log loss first, then error rate.
for metric, y_label, title in (
    ("mlogloss", "Log Loss", "XGBoost Log Loss"),
    ("merror", "Classification Error", "XGBoost Classification Error"),
):
    fig, ax = pyplot.subplots(figsize=(12, 12))
    # validation_0 holds the training-split history, validation_1 the test-split one.
    for split_key, legend_label in (("validation_0", "Train"), ("validation_1", "Test")):
        ax.plot(x_axis, results[split_key][metric], label=legend_label)
    ax.legend()
    pyplot.ylabel(y_label)
    pyplot.title(title)
    pyplot.show()



In [None]:

# Raw importance scores, in the same order as the columns of X.
importance_values = xgbc.feature_importances_
print(importance_values)

# Label each score with its column name and chart the five largest as
# horizontal bars.
feat_importances = pd.Series(importance_values, index=X.columns)
top_five = feat_importances.nlargest(5)
top_five.plot.barh()

#wrapping up
plt.show()

In [None]:
# Save the fitted classifier in pickle format. The `with` block closes the file
# (flushing everything to disk) even if pickling fails; the handle was
# previously left open.
# NOTE: only unpickle this file from a trusted source - loading a pickle can
# execute arbitrary code.
with open('xgbcl_model.pkl', 'wb') as model_file:
    pickle.dump(xgbc, model_file)
