In [1]:
# Import packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import glob

In [2]:
# Define the path to your Tracking Week CSV files
csv_files = glob.glob("original_data/tracking_week_*.csv")

# Initialize an empty DF
tracking_df = pd.DataFrame()

# Loop through each CSV file and combine all CSV's into the combined DF
for file in csv_files:
    df = pd.read_csv(file)
    tracking_df = pd.concat([tracking_df, df], ignore_index=True)

# Drop Na Values
tracking_df = tracking_df.dropna()

#Save DF as new CSV
df.to_csv('Resources/combined_tracking.csv', index=False)

# Show DF
tracking_df

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
5,2022100600,90,33084.0,Matt Ryan,6,2022-10-06 20:17:05.299999,2.0,IND,left,90.26,23.69,0.20,0.14,0.04,274.45,250.64,ball_snap
18,2022100600,90,33084.0,Matt Ryan,19,2022-10-06 20:17:06.599999,2.0,IND,left,90.34,24.17,1.95,3.14,0.19,307.77,29.34,handoff
37,2022100600,90,33084.0,Matt Ryan,38,2022-10-06 20:17:08.500000,2.0,IND,left,92.17,31.54,4.25,2.67,0.43,304.71,350.41,first_contact
55,2022100600,90,33084.0,Matt Ryan,56,2022-10-06 20:17:10.299999,2.0,IND,left,88.46,35.73,2.11,2.19,0.22,266.80,274.97,tackle
65,2022100600,90,35459.0,Kareem Jackson,6,2022-10-06 20:17:05.299999,22.0,DEN,left,72.17,16.80,0.48,0.94,0.05,119.60,125.33,ball_snap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12187319,2022103100,3697,54525.0,Cam Taylor-Britt,33,2022-10-31 23:05:05.200000,29.0,CIN,left,18.04,30.27,3.05,1.77,0.31,165.05,177.63,tackle
12187329,2022103100,3697,54560.0,Zachary Carter,6,2022-10-31 23:05:02.500000,95.0,CIN,left,22.02,20.47,0.28,0.34,0.03,155.36,37.09,ball_snap
12187336,2022103100,3697,54560.0,Zachary Carter,13,2022-10-31 23:05:03.200000,95.0,CIN,left,22.12,20.79,0.69,0.67,0.08,118.30,353.39,run
12187345,2022103100,3697,54560.0,Zachary Carter,22,2022-10-31 23:05:04.099999,95.0,CIN,left,21.10,21.64,1.83,0.82,0.18,65.12,297.56,first_contact


In [3]:
# Define the path to your Tackles CSV file
path = ("Resources/tackles.csv")

# Initialize an empty DF
tackle_df = pd.DataFrame()

# Create DF
tackle_df = pd.read_csv(path)


# Drop Na Values
tackle_df = tackle_df.dropna()

# Display DF
tackle_df

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
0,2022090800,101,42816,1,0,0,0
1,2022090800,393,46232,1,0,0,0
2,2022090800,486,40166,1,0,0,0
3,2022090800,646,47939,1,0,0,0
4,2022090800,818,40107,1,0,0,0
...,...,...,...,...,...,...,...
17421,2022091113,2494,43533,0,0,0,1
17422,2022092502,3510,42406,0,0,0,1
17423,2022091113,3642,43478,0,0,0,1
17424,2022091901,3578,42431,0,0,0,1


In [4]:
# Merge DF's based on gameId, playId, and nflId
tackles_df = pd.merge(tracking_df, tackle_df, on=['gameId', 'playId', 'nflId'])

# Show DF
tackles_df

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,...,s,a,dis,o,dir,event,tackle,assist,forcedFumble,pff_missedTackle
0,2022100600,90,35459.0,Kareem Jackson,6,2022-10-06 20:17:05.299999,22.0,DEN,left,72.17,...,0.48,0.94,0.05,119.60,125.33,ball_snap,0,1,0,0
1,2022100600,90,35459.0,Kareem Jackson,19,2022-10-06 20:17:06.599999,22.0,DEN,left,75.12,...,4.62,2.20,0.46,73.08,74.64,handoff,0,1,0,0
2,2022100600,90,35459.0,Kareem Jackson,38,2022-10-06 20:17:08.500000,22.0,DEN,left,83.20,...,1.14,4.79,0.13,34.45,87.72,first_contact,0,1,0,0
3,2022100600,90,35459.0,Kareem Jackson,56,2022-10-06 20:17:10.299999,22.0,DEN,left,83.70,...,2.63,1.67,0.25,302.40,90.46,tackle,0,1,0,0
4,2022100600,90,46074.0,Bradley Chubb,6,2022-10-06 20:17:05.299999,55.0,DEN,left,84.36,...,0.02,0.13,0.00,68.10,181.60,ball_snap,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67974,2022103100,3674,52623.0,Markus Bailey,56,2022-10-31 23:01:53.700000,51.0,CIN,left,23.53,...,2.45,2.11,0.25,324.87,257.73,tackle,1,0,0,0
67975,2022103100,3697,46138.0,B.J. Hill,6,2022-10-31 23:05:02.500000,92.0,CIN,left,22.12,...,0.13,0.51,0.02,86.60,65.26,ball_snap,1,0,0,0
67976,2022103100,3697,46138.0,B.J. Hill,13,2022-10-31 23:05:03.200000,92.0,CIN,left,22.27,...,0.89,0.62,0.09,97.84,173.74,run,1,0,0,0
67977,2022103100,3697,46138.0,B.J. Hill,22,2022-10-31 23:05:04.099999,92.0,CIN,left,22.00,...,1.13,0.80,0.13,144.54,211.78,first_contact,1,0,0,0


In [5]:
# Setting the attribute that will help predict a tackle
# s = Speed in yards/second. a = Speed in yards/second^2.  
# dis = Distance traveled from prior time point, in yards. o = Player orientation (deg), 0 - 360 degrees 
# dir= Angle of player motion (deg), 0 - 360 degrees. 
features = ['s', 'a', 'o', 'dis', 'dir']
X = tackles_df[features]

# Combine 'tackle' and 'pff_missedTackle' columns to create a new target variable
y = tackles_df['tackle'].astype(int)  # Convert 'tackle' to 0 or 1
y += tackles_df['assist'] # Add Assist as a Tackle

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42)

# Use RandomOverSampler to handle class imbalance
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Create and train the Random Forest model
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Predicted probabilities for each class
y_prob = model.predict_proba(X_test)

# Output the results the closer to a 1 the higher the probability
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability_Tackle': y_prob[:, 1] 
})

# Show Results
results.head()


Unnamed: 0,Actual,Predicted,Probability_Tackle
18386,1,1,0.987
37624,0,1,0.899
23305,1,1,0.906
3593,0,1,0.886
19746,1,1,0.938


In [6]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.870432480141218


In [7]:
# Print the balanced_accuracy score of the model
print(f'Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}')


Balanced Accuracy Score: 0.5057474224451807


In [8]:
# Generate a confusion matrix for the model
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')


Confusion Matrix: [[  170  5854]
 [  752 44209]]


In [9]:
# Print the classification report for the model
print(f'Classification Report: {classification_report(y_test, y_pred)}')


Classification Report:               precision    recall  f1-score   support

           0       0.18      0.03      0.05      6024
           1       0.88      0.98      0.93     44961

    accuracy                           0.87     50985
   macro avg       0.53      0.51      0.49     50985
weighted avg       0.80      0.87      0.83     50985

