In [1]:

from InjuryCleaningFunctions import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# from sklearn.metrics import balanced_accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report
# from imblearn.metrics import classification_report_imbalanced
# import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf

from ColumnCapitals import column_capitalizer

pd.set_option('mode.chained_assignment', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
seed = 42

In [2]:
## Connect to the Database
import sqlalchemy as db
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import psycopg2
from config import db_password

# Import the Messy Data

Import the playlist and injuries lists, then clean and merge the data using the functions defined in NFL_Injury_Cleaning_Functions. Because these data will be processed with Random Forests and Neural Network models, we will need to convert all categorical data to numerical data.  

### Make a Connection to the SQL Server

1. Connect to the NFL_Turf Database
2. Retrieve the data from the 'injuries' table
3. Retrieve the data from the 'playlist' table


In [3]:
# Make connection to the database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/NFL_Turf"
engine = db.create_engine(db_string)
conn = engine.connect()
metadata = db.MetaData()


In [4]:
# Read in the injuries table:
injuries_sql = db.Table('injuries', metadata,
                        autoload=True, autoload_with=engine)
query = db.select(injuries_sql)
Results = conn.execute(query).fetchall()

# Create the new dataframe and set the keys
injuries = pd.DataFrame(Results)
injuries.columns = Results[0].keys()
injuries.head()

Unnamed: 0,playerkey,gameid,playkey,bodypart,fieldtype,dm_m1,dm_m7,dm_m28,dm_m42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


In [5]:
# Read in the playlist table:
playlist_sql = db.Table('playlist', metadata,
                        autoload=True, autoload_with=engine)
query = db.select(playlist_sql)
Results = conn.execute(query).fetchall()

# Create the new dataframe and set the keys
playlist = pd.DataFrame(Results)
playlist.columns = Results[0].keys()
playlist.head()


Unnamed: 0,playerkey,gameid,playkey,rosterposition,playerday,playergame,stadiumtype,fieldtype,temperature,weather,playtype,playergameplay,position,postiongroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


When the data from these tables are read in, the column names have been dropped to all lower case, whereas the original column names were not. To fix this, we convert the names back to their original forms using the column_capitalizer function.

In [6]:
injuries = column_capitalizer(injuries, 'injuries')
playlist = column_capitalizer(playlist, 'playlist')

The next step uses the ml_data_cleaner function from InjuryCleaningFunctions.py to merge and clean the data from both playlist and injuries. The processes for the data_cleaner functions are listed at the bottom of this document. 

In [7]:
ml = ml_data_cleaner(playlist, injuries)
ml.head()

Unnamed: 0,PlayKey,RosterPosition,Temperature,PlayerGamePlay,Position,Outdoor,Precipitation,DaysPlayed,PlayCode,InjuryType,InjuryDuration,SevereInjury
0,26624-1-1,0,63,1,0,1,0.0,64,0.0,0.0,0.0,0.0
1,26624-1-2,0,63,2,0,1,0.0,64,0.0,0.0,0.0,0.0
2,26624-1-3,0,63,3,0,1,0.0,64,1.0,0.0,0.0,0.0
3,26624-1-4,0,63,4,0,1,0.0,64,1.0,0.0,0.0,0.0
4,26624-1-5,0,63,5,0,1,0.0,64,0.0,0.0,0.0,0.0


The next line creates a new column called 'IsInjured', that just denotes that there was an injury of any type. 

In [8]:
# The numpy where function reads as follows... set ml.IsInjured equal to 0
# where ml.InjuryType == 0, else set equal to 1. All injuryType 0 values are not injures,
# everything else is an injury

ml['IsInjured'] = np.where(ml['InjuryType'] == 0, 0, 1)


In [9]:
ml.IsInjured.value_counts()

0    260037
1        77
Name: IsInjured, dtype: int64

## Sampling the Non-Injury Data

We are using an undersampling method to perform the neural network analysis, though we will be maintaining an imbalanced dataset. We reduced the number of Non-Injured plays such that the Injured plays make up 1% of all plays in the dataset. To do this, we separated the Injury Plays from the Non-Injury plays so as not to reduce the number of positive results. This is done prior to merging with the tracking data, which will exponentially expand the data. 

In [10]:
ml_y = ml.loc[ml.IsInjured == 1]
ml_X = ml.loc[ml.IsInjured == 0]

In [11]:
samp = ml_X.sample(n=7700, random_state=seed)
samp.head()

Unnamed: 0,PlayKey,RosterPosition,Temperature,PlayerGamePlay,Position,Outdoor,Precipitation,DaysPlayed,PlayCode,InjuryType,InjuryDuration,SevereInjury,IsInjured
99826,41143-14-2,15,33,2,18,0,0.0,148,0.0,0.0,0.0,0.0,0
22713,36554-1-2,15,81,2,17,1,0.0,64,1.0,0.0,0.0,0.0,0
72035,39680-23-57,5,59,57,10,1,0.0,137,0.0,0.0,0.0,0.0,0
58339,39017-3-17,15,83,17,17,1,0.0,78,0.0,0.0,0.0,0.0,0
105405,41558-9-79,5,53,79,7,1,0.0,127,0.0,0.0,0.0,0.0,0


In [12]:
ml_y = ml_y.append(samp, ignore_index=True, sort=False)

## Merge the Tracking Data with the other data

- Load the tracking data
    - The size of the tracking data has proven to take up too much memory on the local server to use SQLAlchemy to retreive
    - Save the file locally and import using Pandas 
- Drop the columns from tracking that are not necessary
- perform an inner merge between the datasets

In [13]:
tracking = pd.read_csv('NFL_Turf/PlayerTrackData.csv')
tracking.drop(columns=['event', 'dis', 'time'], inplace=True)
tracking.head()

Unnamed: 0,PlayKey,x,y,dir,o,s
0,26624-1-1,87.46,28.93,288.24,262.33,0.13
1,26624-1-1,87.45,28.92,283.91,261.69,0.12
2,26624-1-1,87.44,28.92,280.4,261.17,0.12
3,26624-1-1,87.44,28.92,278.79,260.66,0.1
4,26624-1-1,87.44,28.92,275.44,260.27,0.09


### Direction-Orientation

The direction and orientations alone don't do much alone, but the difference between them gives us the angle at which the player's body is twisting. The following merge is an inner merge, so that only the data from tracking associated with the sampled plays is included.

In [14]:
tracking['Twist'] = abs(tracking.dir - tracking.o)
tracking.drop(columns=['dir', 'o'], inplace=True)

In [15]:
ml_merged = pd.merge(tracking, ml_y, on='PlayKey', how='inner')
ml_merged.head()

Unnamed: 0,PlayKey,x,y,s,Twist,RosterPosition,Temperature,PlayerGamePlay,Position,Outdoor,Precipitation,DaysPlayed,PlayCode,InjuryType,InjuryDuration,SevereInjury,IsInjured
0,26624-1-45,21.32,29.14,0.88,23.24,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0
1,26624-1-45,21.31,29.21,0.91,15.59,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0
2,26624-1-45,21.3,29.29,0.93,7.61,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0
3,26624-1-45,21.28,29.38,0.93,0.42,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0
4,26624-1-45,21.26,29.45,0.89,6.2,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0


## Remove the large datasets from memory

ml_merged contains the tracking data from tracking.csv merged with the features extracted from the injuryreports.csv and playlist.csv, and maintains several columns that could be used for predictive analysis. 

In [16]:
del ml, tracking, playlist, injuries, ml_X, samp, ml_y, Results

# Prepare the Training and Testing sets

Columns that need to be removed or separated:
- PlayKey is non-informative and object-type
- SevereInjury and InjuryDuration cannot be part of the same training set, as they are not independent 
- For one of the analyses, InjuryType will be the prediction, so we want to exclude the InjuryDuration and SevereInjury, as we're trying to determine the other factors leading to such injuries
- In predicting the severity and/or duration of injury, a few sub-analyses can be performed
    - Looking only at factors that lead to severe injury without respect to injury type
    - Looking at the factors including injury type that predict the duration of injury
    - Looking at the factors that predict BOTH injury type and Severity (or perhaps duration)

The following tests will be performed: 
1. Can the model predict whether an injury occurred in a 99:1 imbalance of data?
2. Can the model predict whether a severe injury is likely to occur?


---
# Deep Learning
## Injury Prediction

This classifier is only looking to see whether the model can predict if the player has sustained an injury or not. The results from this analysis without the tracking data yielded about 60% accuracy. 

In [17]:
# Create an output table
columns = ['Test', 'Model', 'Nodes', 'Epochs', 'Accuracy', 'Loss', 'Precision', 'Recall']
nn_table = pd.DataFrame(columns=columns)
model = 'Neural Network'

In [18]:
ml_merged.head(2)

Unnamed: 0,PlayKey,x,y,s,Twist,RosterPosition,Temperature,PlayerGamePlay,Position,Outdoor,Precipitation,DaysPlayed,PlayCode,InjuryType,InjuryDuration,SevereInjury,IsInjured
0,26624-1-45,21.32,29.14,0.88,23.24,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0
1,26624-1-45,21.31,29.21,0.91,15.59,0,63,45,0,1,0.0,64,0.0,0.0,0.0,0.0,0


In [19]:
X = ml_merged.copy(deep=True)
X.drop(columns=['PlayKey', 'InjuryType', 'SevereInjury', 'Position', 'InjuryDuration', 'IsInjured'], inplace=True)

In [20]:
X.head()

Unnamed: 0,x,y,s,Twist,RosterPosition,Temperature,PlayerGamePlay,Outdoor,Precipitation,DaysPlayed,PlayCode
0,21.32,29.14,0.88,23.24,0,63,45,1,0.0,64,0.0
1,21.31,29.21,0.91,15.59,0,63,45,1,0.0,64,0.0
2,21.3,29.29,0.93,7.61,0,63,45,1,0.0,64,0.0
3,21.28,29.38,0.93,0.42,0,63,45,1,0.0,64,0.0
4,21.26,29.45,0.89,6.2,0,63,45,1,0.0,64,0.0


Using IsInjured as the label, there are no categorical columns that need to be encoded

In [21]:
y = ml_merged.IsInjured

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

In [22]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)

In [23]:
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
# nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               3072      
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 3,329
Trainable params: 3,329
Non-trainable params: 0
_________________________________________________________________


In [24]:
epochs = 2

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [25]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

17338/17338 - 10s - loss: 0.0170 - accuracy: 0.9944 - precision: 0.8517 - recall: 0.5363 - 10s/epoch - 570us/step


In [26]:
test = "Is Injured"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363


---
## Severe Injury Prediction

The same process is performed as above, though using SevereInjury as the y parameter

In [27]:
y = ml_merged.SevereInjury

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 256)               3072      
                                                                 
 dense_3 (Dense)             (None, 128)               32896     
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 36,097
Trainable params: 36,097
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [29]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

test = "Severe Injury"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table


17338/17338 - 11s - loss: 0.0020 - accuracy: 0.9994 - precision_1: 0.9917 - recall_1: 0.8469 - 11s/epoch - 630us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469


The ability to predict whether a player will be injured remains one of the lowest accuracies and precisions of any of our models; however, the specific types of injuries have much higher accuracies and predictions. This is likely explained by the differences the conditions that lead to the different injuries. If they were all overallping, the IsInjured condition should be easier to predict. But if the conditions leading to the different types of injuries are unique, then there is a loss in predictability on just the yes/no, will these conditions lead to an injury. Thus, the model for IsInjured would be improved my using the specific body-parts and using the union of those datasets as an injury predictor.

---
## Injury Type Prediction - General Model

- There are several Injury Type Models:
    - General Model Classifies into 4 categories
    - Foot Injury
    - Ankle Injury
    - Knee Injury

In [30]:
# Format this to do encoding
X_cat = ml_merged.copy(deep=True)
X_cat.drop(columns=['PlayKey', 'IsInjured', 'SevereInjury',
           'Position', 'InjuryDuration'], inplace=True)

# Change the Injury Types back the Categorical
injury = {48.0: 'Knee', 42.0: 'Ankle', 7.0: 'Foot', 0.0: 'NoInjury'}
X_cat['BodyPart'] = X_cat.InjuryType.map(injury)
X_cat.drop(columns='InjuryType', inplace=True)

X_cat.head()

# Grab all categorical variables and create a list for encoding
cat = X_cat.dtypes[X_cat.dtypes == 'object'].index.tolist()

# Create a OneHotEncoder Instance
# Create the instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHot to the columns necessary
encode_df = pd.DataFrame(enc.fit_transform(X_cat[cat]))

# Add the original variable names to the df
encode_df.columns = enc.get_feature_names_out(cat)

# Merge the OneHot features and drop the variables
X_encoded = X_cat.merge(encode_df, left_index=True, right_index=True)
X_encoded.drop(columns=cat, inplace=True)

X_encoded.head()

Unnamed: 0,x,y,s,Twist,RosterPosition,Temperature,PlayerGamePlay,Outdoor,Precipitation,DaysPlayed,PlayCode,BodyPart_Ankle,BodyPart_Foot,BodyPart_Knee,BodyPart_NoInjury
0,21.32,29.14,0.88,23.24,0,63,45,1,0.0,64,0.0,0.0,0.0,0.0,1.0
1,21.31,29.21,0.91,15.59,0,63,45,1,0.0,64,0.0,0.0,0.0,0.0,1.0
2,21.3,29.29,0.93,7.61,0,63,45,1,0.0,64,0.0,0.0,0.0,0.0,1.0
3,21.28,29.38,0.93,0.42,0,63,45,1,0.0,64,0.0,0.0,0.0,0.0,1.0
4,21.26,29.45,0.89,6.2,0,63,45,1,0.0,64,0.0,0.0,0.0,0.0,1.0


In [31]:
y = X_encoded.loc[:, 'BodyPart_Ankle':]
X_enc = X_encoded.drop(
    columns=['BodyPart_Ankle', 'BodyPart_Foot', 'BodyPart_Knee', 'BodyPart_NoInjury'])

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=4, activation='sigmoid'))

nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 256)               3072      
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 4)                 516       
                                                                 
Total params: 36,484
Trainable params: 36,484
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [33]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Add results to table
test = "Injury Type 4-Classes"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table

17338/17338 - 12s - loss: 0.0016 - accuracy: 0.9991 - precision_2: 0.9991 - recall_2: 0.9986 - 12s/epoch - 712us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469
0,Injury Type 4-Classes,Neural Network,"[256, 128]",2,0.9991,0.0016,0.9991,0.9986


### Breaking this down to the different injury types
We can't get the specific Precision and Recall for the individual injuries like we were able to with the Random Forests algorith, so we broke up this into 3 additional analyses
 
--- 
#### Foot Injury Prediction 

In [34]:
# Foot is encoded by the value 7.0, ankle is 42.0, and knee is 48.0
y = ml_merged.InjuryType.apply(lambda row: 1 if row == 7.0 else 0) # To evaluate Foot Injuries

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 256)               3072      
                                                                 
 dense_9 (Dense)             (None, 128)               32896     
                                                                 
 dense_10 (Dense)            (None, 1)                 129       
                                                                 
Total params: 36,097
Trainable params: 36,097
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [36]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Add results to table
test = "Foot Injury"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table


17338/17338 - 11s - loss: 7.0312e-04 - accuracy: 0.9999 - precision_3: 1.0000 - recall_3: 0.8329 - 11s/epoch - 639us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469
0,Injury Type 4-Classes,Neural Network,"[256, 128]",2,0.9991,0.0016,0.9991,0.9986
0,Foot Injury,Neural Network,"[256, 128]",2,0.9999,0.0007,1.0,0.8329


---
### Ankle Injury

In [37]:
# Foot is encoded by the value 7.0, ankle is 42.0, and knee is 48.0
y = ml_merged.InjuryType.apply(
    lambda row: 1 if row == 42.0 else 0)  # To evaluate Ankle Injuries

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 256)               3072      
                                                                 
 dense_12 (Dense)            (None, 128)               32896     
                                                                 
 dense_13 (Dense)            (None, 1)                 129       
                                                                 
Total params: 36,097
Trainable params: 36,097
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [39]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Add results to table
test = "Ankle Injury"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table

17338/17338 - 11s - loss: 0.0022 - accuracy: 0.9993 - precision_4: 0.9385 - recall_4: 0.9030 - 11s/epoch - 648us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469
0,Injury Type 4-Classes,Neural Network,"[256, 128]",2,0.9991,0.0016,0.9991,0.9986
0,Foot Injury,Neural Network,"[256, 128]",2,0.9999,0.0007,1.0,0.8329
0,Ankle Injury,Neural Network,"[256, 128]",2,0.9993,0.0022,0.9385,0.903


---
### Knee Injury

In [40]:
# Foot is encoded by the value 7.0, ankle is 42.0, and knee is 48.0
y = ml_merged.InjuryType.apply(
    lambda row: 1 if row == 48.0 else 0)  # To evaluate Knee Injuries

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 256)               3072      
                                                                 
 dense_15 (Dense)            (None, 128)               32896     
                                                                 
 dense_16 (Dense)            (None, 1)                 129       
                                                                 
Total params: 36,097
Trainable params: 36,097
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [42]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Add results to table
test = "Knee Injury"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table

17338/17338 - 13s - loss: 0.0019 - accuracy: 0.9992 - precision_5: 0.8878 - recall_5: 0.9617 - 13s/epoch - 768us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469
0,Injury Type 4-Classes,Neural Network,"[256, 128]",2,0.9991,0.0016,0.9991,0.9986
0,Foot Injury,Neural Network,"[256, 128]",2,0.9999,0.0007,1.0,0.8329
0,Ankle Injury,Neural Network,"[256, 128]",2,0.9993,0.0022,0.9385,0.903
0,Knee Injury,Neural Network,"[256, 128]",2,0.9992,0.0019,0.8878,0.9617


---
## Injury Duration Predictor - 5-Way Classifier

In [43]:
# Format this to do encoding
X_cat = ml_merged.copy(deep=True)
X_cat.drop(columns=['PlayKey', 'IsInjured', 'SevereInjury',
           'Position', 'InjuryType'], inplace=True)

# Change the Injury Types back the Categorical
duration = {0.0: 'NoInjury', 1.0: 'Under_1_Week', 7.0: 'Under_4_Weeks', 28.0: 'Under_6_Weeks', 42.0: 'Over_6_Weeks'}
X_cat['Durations'] = X_cat.InjuryDuration.map(duration)
X_cat.drop(columns='InjuryDuration', inplace=True)

# Grab all categorical variables and create a list for encoding
cat = X_cat.dtypes[X_cat.dtypes == 'object'].index.tolist()

# Create a OneHotEncoder Instance
# Create the instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHot to the columns necessary
encode_df = pd.DataFrame(enc.fit_transform(X_cat[cat]))

# Add the original variable names to the df
encode_df.columns = enc.get_feature_names_out(cat)

# Merge the OneHot features and drop the variables
X_encoded = X_cat.merge(encode_df, left_index=True, right_index=True)
X_encoded.drop(columns=cat, inplace=True)

X_encoded.head()


Unnamed: 0,x,y,s,Twist,RosterPosition,Temperature,PlayerGamePlay,Outdoor,Precipitation,DaysPlayed,PlayCode,Durations_NoInjury,Durations_Over_6_Weeks,Durations_Under_1_Week,Durations_Under_4_Weeks,Durations_Under_6_Weeks
0,21.32,29.14,0.88,23.24,0,63,45,1,0.0,64,0.0,1.0,0.0,0.0,0.0,0.0
1,21.31,29.21,0.91,15.59,0,63,45,1,0.0,64,0.0,1.0,0.0,0.0,0.0,0.0
2,21.3,29.29,0.93,7.61,0,63,45,1,0.0,64,0.0,1.0,0.0,0.0,0.0,0.0
3,21.28,29.38,0.93,0.42,0,63,45,1,0.0,64,0.0,1.0,0.0,0.0,0.0,0.0
4,21.26,29.45,0.89,6.2,0,63,45,1,0.0,64,0.0,1.0,0.0,0.0,0.0,0.0


In [44]:
y = X_encoded.loc[:, 'Durations_NoInjury':]
X_enc = X_encoded.drop(
    columns=['Durations_NoInjury', 'Durations_Over_6_Weeks', 'Durations_Under_1_Week', 'Durations_Under_4_Weeks', 'Durations_Under_6_Weeks'])

# Because the True case only represents 1% of the data, the training split is stratifying on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, stratify=y)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)
X_train_scaled = X_scaled.transform(X_train)
X_test_scaled = X_scaled.transform(X_test)


# Establish the NN Model
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 256
hidden_layer2 = 128

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=5, activation='sigmoid'))

nn.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_17 (Dense)            (None, 256)               3072      
                                                                 
 dense_18 (Dense)            (None, 128)               32896     
                                                                 
 dense_19 (Dense)            (None, 5)                 645       
                                                                 
Total params: 36,613
Trainable params: 36,613
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[
           'accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=epochs)

Epoch 1/2
Epoch 2/2


In [46]:
# Evaluate the model using the test data
results = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Add results to table
test = "Injury Duration 5-Classes"
loss = round(results[0], 4)
accuracy = round(results[1], 4)
precision = round(results[2], 4)
recall = round(results[3], 4)
nodes = [hidden_layer1, hidden_layer2]

row = pd.DataFrame(
    [[test, model, nodes, epochs, accuracy, loss, precision, recall]], columns=columns)
nn_table = nn_table.append(row)
nn_table

17338/17338 - 11s - loss: 0.0017 - accuracy: 0.9989 - precision_6: 0.9989 - recall_6: 0.9984 - 11s/epoch - 634us/step


Unnamed: 0,Test,Model,Nodes,Epochs,Accuracy,Loss,Precision,Recall
0,Is Injured,Neural Network,"[256, 128]",2,0.9944,0.017,0.8517,0.5363
0,Severe Injury,Neural Network,"[256, 128]",2,0.9994,0.002,0.9917,0.8469
0,Injury Type 4-Classes,Neural Network,"[256, 128]",2,0.9991,0.0016,0.9991,0.9986
0,Foot Injury,Neural Network,"[256, 128]",2,0.9999,0.0007,1.0,0.8329
0,Ankle Injury,Neural Network,"[256, 128]",2,0.9993,0.0022,0.9385,0.903
0,Knee Injury,Neural Network,"[256, 128]",2,0.9992,0.0019,0.8878,0.9617
0,Injury Duration 5-Classes,Neural Network,"[256, 128]",2,0.9989,0.0017,0.9989,0.9984


In [None]:
# Export the results table to the repo

nn_table.to_csv("NeuralNetwork_Results.csv")

---

# Summary 

For an injury analysis such as this, it is more important that our model achieve a high precision, rather than a high accuracy or recall. The accuracy only tells us how many true positives have been classified, however, the data are extremely imbalanced, which is a known problem with the accuracy measure. Even if all of the injuries were classified as Non-Injuries, the model would be predicting at 99.99%, if the Non-Injury is evaluated as the True Positive Measure, and nearly 0% if we consider the Injuries as the true positive. Meanwhile, the Precision gives us the count of the True Positives with respect to the True Positives and False Negatives. 

In the case that the True Positive is the injury we are evaluating, the False Negative would represent a player who is injured, but was classified as Not Injured. In most of our analyses, the precisiion was extremely high, though the recall tended to lag. An explanation for this is that plays meeting the critera of a high-risk play, potentially prone to injury did not result in an injury at that time, but the activity could not be differentiated from similar circumnstances that did lead to injury. 

From a medical-analytical perspective, this gives us insights as to what parameters can lead to injurious plays based on the locations of the players along with the other features analyzed. 


## Future Analysis

We would like to use the features analyzed with the Random Forests analysis to try to remove some, futher finding the most critical features leading to these lower body injuries.