# Evaluating Injury vs. Non-Injury Plays

The issue we ran into with these data is that there are already 76 million rows in the tracking data, and merging additional columns is problematic in local analysis due to memory constraints. The plan for this analysis is to use undersampling from the outer merge of the Playlist-Injury Datasets, to randomly reduce the non-injury plays. It's important to perform this step at this time, so that we don't have to perform additional aggregation steps to the large table with 76 million rows. When we merge the Playlist-Injury dataset to the Tracking data, only the rows that match a PlayKey number will be merged, significantly cutting down the size of the dataframe by rows, as we increase the number of columns. 

In [1]:
import numpy as np
import pandas as pd
from NFL_Injury_Cleaning_Functions import *
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.linear_model import LogisticRegression


pd.set_option('mode.chained_assignment', None)
seed = 42


## Read in the datasets and Import Functions

In [2]:
playlist = pd.read_csv("NFL_Turf/PlayList.csv")
injuries = pd.read_csv("NFL_Turf/InjuryRecord.csv")

In [3]:
ml = ML_Data_Cleaner(playlist, injuries)
ml.head()

Unnamed: 0,PlayKey,RosterPosition,Temperature,PlayerGamePlay,Position,SyntheticField,Outdoor,Precipitation,DaysPlayed,PlayCode,InjuryType,InjuryDuration,SevereInjury
0,26624-1-1,0,63,1,0,1,1,0,64,0.0,0.0,0.0,0.0
1,26624-1-2,0,63,2,0,1,1,0,64,0.0,0.0,0.0,0.0
2,26624-1-3,0,63,3,0,1,1,0,64,1.0,0.0,0.0,0.0
3,26624-1-4,0,63,4,0,1,1,0,64,1.0,0.0,0.0,0.0
4,26624-1-5,0,63,5,0,1,1,0,64,0.0,0.0,0.0,0.0


We are adding one additional column, 'IsInjured', where it is 1 wherever the injury type is not 0

In [4]:
# The numpy where function reads as follows... set ml.IsInjured equal to 0 
# where ml.InjuryType == 0, else set equal to 1. All injuryType 0 values are not injures,
# everything else is an injury
 
ml['IsInjured'] = np.where(ml['InjuryType'] == 0, 0, 1)
ml.drop(columns=['InjuryType', 'InjuryDuration', 'SevereInjury'], inplace=True)


In [5]:
ml.head()

Unnamed: 0,PlayKey,RosterPosition,Temperature,PlayerGamePlay,Position,SyntheticField,Outdoor,Precipitation,DaysPlayed,PlayCode,IsInjured
0,26624-1-1,0,63,1,0,1,1,0,64,0.0,0
1,26624-1-2,0,63,2,0,1,1,0,64,0.0,0
2,26624-1-3,0,63,3,0,1,1,0,64,1.0,0
3,26624-1-4,0,63,4,0,1,1,0,64,1.0,0
4,26624-1-5,0,63,5,0,1,1,0,64,0.0,0


## Load the Tracking data for Merge

- Load the tracking data
- Drop the columns from tracking that are not necessary
- perform an inner merge between the datasets

In [6]:
tracking = pd.read_csv('NFL_Turf/PlayerTrackData.csv')
tracking.drop(columns=['event', 'dis', 'time'], inplace=True)
tracking.head()

Unnamed: 0,PlayKey,x,y,dir,o,s
0,26624-1-1,87.46,28.93,288.24,262.33,0.13
1,26624-1-1,87.45,28.92,283.91,261.69,0.12
2,26624-1-1,87.44,28.92,280.4,261.17,0.12
3,26624-1-1,87.44,28.92,278.79,260.66,0.1
4,26624-1-1,87.44,28.92,275.44,260.27,0.09


In [7]:
ml_merged = pd.merge(tracking, ml, on='PlayKey', how='inner')

In [8]:
ml_merged.head()

Unnamed: 0,PlayKey,x,y,dir,o,s,RosterPosition,Temperature,PlayerGamePlay,Position,SyntheticField,Outdoor,Precipitation,DaysPlayed,PlayCode,IsInjured
0,26624-1-1,87.46,28.93,288.24,262.33,0.13,0,63,1,0,1,1,0,64,0.0,0
1,26624-1-1,87.45,28.92,283.91,261.69,0.12,0,63,1,0,1,1,0,64,0.0,0
2,26624-1-1,87.44,28.92,280.4,261.17,0.12,0,63,1,0,1,1,0,64,0.0,0
3,26624-1-1,87.44,28.92,278.79,260.66,0.1,0,63,1,0,1,1,0,64,0.0,0
4,26624-1-1,87.44,28.92,275.44,260.27,0.09,0,63,1,0,1,1,0,64,0.0,0


# Undersampling

We will undersample the data using the Cluster Centroids algorithm doing the following: 

1. Remove the nan values from the merged tables
2. Break into training and testing datasets
3. Define a model, fit the model, make predictions

In [9]:
# Drop any dfs taking up memory!
del tracking
# del X_test, X_train, y_test, y_train, X, y
# del y_pred, X_resampled, y_resampled
del playlist, ml, injuries


In [10]:
ml_merged.head()

Unnamed: 0,PlayKey,x,y,dir,o,s,RosterPosition,Temperature,PlayerGamePlay,Position,SyntheticField,Outdoor,Precipitation,DaysPlayed,PlayCode,IsInjured
0,26624-1-1,87.46,28.93,288.24,262.33,0.13,0,63,1,0,1,1,0,64,0.0,0
1,26624-1-1,87.45,28.92,283.91,261.69,0.12,0,63,1,0,1,1,0,64,0.0,0
2,26624-1-1,87.44,28.92,280.4,261.17,0.12,0,63,1,0,1,1,0,64,0.0,0
3,26624-1-1,87.44,28.92,278.79,260.66,0.1,0,63,1,0,1,1,0,64,0.0,0
4,26624-1-1,87.44,28.92,275.44,260.27,0.09,0,63,1,0,1,1,0,64,0.0,0


In [11]:
# Remove the rows with NaN values
ml_merged = ml_merged.loc[ml_merged.dir.isna() == False]

In [12]:
X = ml_merged.drop(columns=['PlayKey', 'IsInjured'])
y = ml_merged.IsInjured

In [13]:
del ml_merged

In [14]:
# Split the Data before fitting the model 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

In [16]:
X_test.to_csv("Shared_Tables/X_test.csv")

y_train.to_csv("Shared_Tables/y_train.csv")
y_test.to_csv("Shared_Tables/y_test.csv")


In [17]:
X_train.to_csv("Shared_Tables/X_train.csv")



In [18]:
# Apply Random Undersampling model
rus = RandomUnderSampler(random_state=seed)

# Fit the resample
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Use the counter to display the classification
Counter(y_resampled)


Counter({0: 16665, 1: 16665})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=seed)
model.fit(X_resampled, y_resampled)

# Calculate the balanced Accuracy Score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

# Display the confusion matrix
confusion_matrix(y_test, y_pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([[12134554,  6484643],
       [    2362,     3168]], dtype=int64)

## Use the SMOTEENN Model with combination oversampling and undersampling

There was somehow 2 values in dir and o with NaN values, so need to remove them

In [25]:
# Define the algorithm
smoteenn = SMOTEENN(random_state=seed)

# Fit the resample
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Use the couter to display the classification
Counter(y_resampled)


MemoryError: Unable to allocate 5.83 GiB for an array with shape (55874181, 14) and data type float64

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=seed)
model.fit(X_resampled, y_resampled)

# Calculate the balanced Accuracy Score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)