In [2]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score 

In [3]:
# Read the raw injury transactions CSV into a DataFrame.
injury_csv_path = "~/Documents/Job Search - 2023/NBA_Injury_Predictor/Injuries.csv"
injuryData = pd.read_csv(injury_csv_path)

In [6]:
# Give the raw columns descriptive names in a single rename pass.
column_renames = {
    'Unnamed: 0': 'ID',
    'Notes': 'Injury Type',
    'Relinquished': 'Relinquished_Player',
    'Acquired': 'Acquired_Player',
}
injuryData.rename(columns=column_renames, inplace=True)

### We want to separate the dataset by year to get a good idea of the total number of injuries that have occurred annually

In [5]:

# Parse the 'Date' strings into pandas datetimes once and reuse the result.
event_dates = pd.to_datetime(injuryData['Date'])
injuryData['Date'] = event_dates

# Store the calendar year of every row in its own 'Year' column.
injuryData['Year'] = event_dates.dt.year

### Let's now remove all rows where the players have been acquired back from the IL 

In [None]:













# Keep only the rows where no player was acquired back from the IL, i.e. rows
# whose acquired column is empty, then drop that now-empty column.
# The rename cell above turns 'Acquired' into 'Acquired_Player'; fall back to
# the raw name in case that cell has not been run.
acquired_col = 'Acquired_Player' if 'Acquired_Player' in injuryData.columns else 'Acquired'
# .copy() detaches the filtered frame from its parent so that later column
# assignments do not raise SettingWithCopyWarning.
injuryData = injuryData[injuryData[acquired_col].isna()].copy()
del injuryData[acquired_col]

### Now let's calculate the total sum of injuries for each Year

# Only data from 1990 onward is kept; rows from earlier years are removed from
# the frame before counting so the counts and the frame always agree.
first_year = 1990
# .copy() detaches the filtered frame so later column assignments (encoding,
# note clean-up) do not raise SettingWithCopyWarning.
injuryData = injuryData[injuryData['Year'] >= first_year].copy()

# Number of injury rows recorded in each remaining year, keyed by year.
yearly_injuries = {year: len(group) for year, group in injuryData.groupby('Year')}

# Grand total of injury rows across all remaining years.
total_injuries = sum(yearly_injuries.values())

## It is important to note that the NBA did not have an IR/IL until the 1989-1990 season. Prior to this, injured players were still members of the active roster. For this reason, it's likely that many injuries do not appear in the dataset.

print(yearly_injuries)

sns.set(style="darkgrid")

# Hand seaborn plain lists. Dict views are not sequences, and the extra
# data= mapping (keyed by year) is unnecessary when x and y are given as vectors.
plot_years = list(yearly_injuries.keys())
plot_counts = list(yearly_injuries.values())
sns.lineplot(x=plot_years, y=plot_counts)

plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')
plt.show()

### We have to encode the data for the "Team" column

# Count the distinct (non-null) team names present in the data and report it.
distinct_teams = injuryData['Team'].dropna().unique()
unique_values = len(distinct_teams)
print(f"Number of unique teams: {unique_values}")

### We now have a more accurate dataset representing the total number of injuries from 1990-Present. 

### Let's get the average number of injuries per season over this time span

# Summarise the per-year counts: mean injuries per year and the overall total.
injury_numbers_list = [count for count in yearly_injuries.values()]
yearly_counts = np.asarray(injury_numbers_list)

avg_inj_per_year = yearly_counts.mean()
print(avg_inj_per_year.dtype)
print(yearly_counts.sum())

## I'd now like to encode each individual player name, as well as remove the date column

# The year has already been extracted into 'Year', so the raw date is dropped.
del injuryData['Date']

# Encode each player name as an integer code stored in a 'Player' column.
# The rename cell names the source column 'Relinquished_Player', so on a fresh
# run there is no 'Player' column yet; an existing 'Player' column is reused
# only if one is present.
# NOTE: label_encoder is re-fit on other columns further down, so its
# classes_ only describe the most recently encoded column.
label_encoder = LabelEncoder()
player_source = 'Player' if 'Player' in injuryData.columns else 'Relinquished_Player'
injuryData['Player'] = label_encoder.fit_transform(injuryData[player_source])

# Preview the first rows rather than rendering the whole frame.
injuryData.head()

+ Because there is no true consensus on the number of players in the NBA from 1990 to the present, I have estimated the total number of players in this time frame based on the number of teams in the league and the number of roster spots each team was allowed

+ This does not account for players on two-way contracts or players signed midseason, so the number is not 100% accurate; it is only an estimate

# Estimated number of player instances across the whole period (the same
# player may be counted more than once) and the average number per year.
total_player_instances = 13446
# presumably the number of seasons from 1990 to the present — confirm
seasons_covered = 33
ppy = total_player_instances / seasons_covered

## Now I'd like to encode my Injury Type section. However, there are some extra words I'd like to remove first

# Boilerplate phrases in the notes that say nothing about the injury itself.
slices_to_remove = ["placed on IL with", "placed on IR with", "placed on disabled list with", "placed on IL for",
                   "placed on IL recovering from", "placed on IL during", "(date approximate)", "(out for season)", "(DTD)", "(out 6-8 weeks)"]

# Missing notes become empty strings so the string operations cannot fail on NaN.
injury_notes = injuryData["Injury Type"].fillna("").astype(str)

# Literal (non-regex) replacement: several phrases contain parentheses.
for slice_to_remove in slices_to_remove:
    injury_notes = injury_notes.str.replace(slice_to_remove, "", regex=False)

# Removing a phrase leaves stray spaces behind (e.g. " sore knee " vs
# " sore knee"); collapse and trim them so identical injuries end up with
# identical text and therefore a single label code.
injury_notes = injury_notes.str.replace(r"\s+", " ", regex=True).str.strip()

# Keep the cleaned notes available as a plain list, then write them back
# positionally.
notes_list = injury_notes.tolist()
injuryData["Injury Type"] = injury_notes.to_numpy()

# Replace the cleaned text with integer codes (one code per distinct note).
injuryData['Injury Type'] = label_encoder.fit_transform(injuryData['Injury Type'])

### Next I decided to create a number of variables to train the model on

# Variables for model training
# Label codes start at 0, so the largest code is one less than the number of
# distinct values; count the distinct values directly instead of taking max().
uniq_inj = injuryData["Injury Type"].nunique()   # number of distinct injury types
uniq_plyr = injuryData["Player"].nunique()       # number of distinct players
injury_instances = len(injuryData)               # one row per recorded injury
inj_per_plyr = injury_instances / uniq_plyr      # average injuries per distinct player

# Injury rows per team, as a two-column frame and as a list of records.
inj_per_team = injuryData.groupby('Team')['ID'].count().reset_index()
inj_per_team.columns = ['Team', 'Injury_Count']
inj_list = inj_per_team.to_dict(orient='records')

# Flatten the records into a {team: injury count} dictionary.
inj_dict = {record['Team']: record['Injury_Count'] for record in inj_list}

# Horizontal bar chart: one bar per team, bar length = that team's injury count.
bar_labels = list(inj_dict.keys())
bar_lengths = list(inj_dict.values())
ax = sns.barplot(x=bar_lengths, y=bar_labels)

# Shrink the team labels, then label the axes and title the figure.
ax.set_yticklabels(ax.get_yticklabels(), fontsize=8)
ax.set_xlabel('Total Injuries')
ax.set_ylabel('Teams')
ax.set_title('Total Injuries in Franchise History')

# Render the figure.
plt.show()

## Using the Random Forest Algorithm

# Rank the encoded injury types by how often they occur and keep the 20 most
# frequent. The result is indexed by injury code, with the counts as values.
injuries = injuryData["Injury Type"]
unique_inj_arr = injuries.value_counts()
sorted_injuries = unique_inj_arr.sort_values(ascending=False)
top_20_injuries = sorted_injuries.iloc[:20]


# Replace the team names with integer codes.
team_codes = label_encoder.fit_transform(injuryData['Team'])
injuryData['Team'] = team_codes

# Current issue: we do not have an X_val / y_val outside of the for loop below

# Feature matrix: encoded team, year and encoded player.
# 'Injury Type' is deliberately NOT a feature: every target below is derived
# from it, so including it would leak the answer into the model.
X = injuryData[['Team', 'Year', 'Player']]

# top_20_injuries comes from value_counts(): its index holds the injury codes
# and its values hold the counts. The targets must be built from the codes,
# so iterate the index (iterating the Series itself yields the counts).
top_injury_codes = list(top_20_injuries.index)

# One binary indicator array per injury code: 1 where the row is that injury.
y = []
for injury_type in top_injury_codes:
    y_train_arr = np.where(injuryData['Injury Type'] == injury_type, 1, 0)
    y.append(y_train_arr)
y = np.array(y)

# y is (n_injuries, n_rows); transpose so each row lines up with a row of X,
# and split on the transposed array so X and the targets have equal length.
reshaped_y = y.T
X_train, X_val, y_train, y_val = train_test_split(X, reshaped_y, test_size=0.2, random_state=42)

# Train one binary random forest per injury code, on the training split only.
models = {}
for column, injury_type in enumerate(top_injury_codes):
    model = RandomForestClassifier(random_state=42)  # fixed seed for reproducibility
    model.fit(X_train, y_train[:, column])
    models[injury_type] = model

len(models)

# Using the trained models to predict injury types on the validation set
y_probs = {}
for injury_type, model in models.items():
    class_probs = model.predict_proba(X_val)
    # predict_proba has one column per class seen during fit; locate the
    # column for class 1 instead of assuming it is always column 1.
    positive_cols = np.flatnonzero(model.classes_ == 1)
    if positive_cols.size:
        y_probs[injury_type] = class_probs[:, positive_cols[0]]
    else:
        # The injury never appeared in the training split.
        y_probs[injury_type] = np.zeros(len(X_val))