In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.ticker as ticker

# Load the raw injury log.
# NOTE(review): this path is specific to the author's machine.
injuryData = pd.read_csv("~/Documents/Job Search - 2023/NBA_Injury_Predictor/Injuries.csv")

# Give the unnamed index column and the two free-text columns descriptive names.
injuryData.rename(
    columns={
        'Unnamed: 0': 'ID',
        'Notes': 'Injury Type',
        'Relinquished': 'Player',
    },
    inplace=True,
)

### We want to separate the dataset by year to get a good idea of the total number of injuries that have occurred annually

# Parse the raw 'Date' values into pandas datetimes so the calendar year can be read off.
parsed_dates = pd.to_datetime(injuryData['Date'])
injuryData['Date'] = parsed_dates

# Store each record's calendar year in its own 'Year' column.
injuryData['Year'] = parsed_dates.dt.year



### Let's now remove all rows where the players have been acquired back from the IL 


# Keep only the rows whose 'Acquired' entry is missing, then drop that column,
# which is entirely empty after the filter.
injuryData = injuryData.loc[injuryData['Acquired'].isna()].drop(columns='Acquired')

### Now let's calculate the total sum of injuries for each Year

# Count injury records per year, keeping only the years after 1989.
recent_rows = injuryData[injuryData['Year'] > 1989]

# Mapping of year -> number of injury records in that year (ascending by year).
yearly_injuries = recent_rows.groupby('Year').size().to_dict()

# Grand total across all retained years.
total_injuries = sum(yearly_injuries.values())

# Two-column frame (Year, Injuries) used as the modelling dataset.
df = pd.DataFrame(list(yearly_injuries.items()), columns=['Year', 'Injuries'])
df

# Fit an ordinary least-squares line of yearly injury counts against the year.

# The first 16 rows of `df` form the training set. Take an explicit copy so the
# columns added below are written to an independent frame instead of a slice of
# `df` (assigning to a slice triggers pandas' SettingWithCopyWarning).
train_data = df[:16].copy()

# scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) shape.
X_train = train_data[['Year']].to_numpy()
y_train = train_data['Injuries']
reg = LinearRegression()

fit_model = reg.fit(X_train, y_train)

# coef_ holds the weight the fitted model assigns to each feature (here just 'Year').
print(f"The Weight of feature(s): {fit_model.coef_}")

# intercept_ is the bias term of the fitted model.
print(f"The Bias of the model: {fit_model.intercept_}")

# Store the in-sample prediction for each training year next to the actual counts.
train_data['prediction_insample'] = fit_model.predict(X_train)
train_data.head()

# Per-year residual (prediction minus actual count), plus its square and its absolute value
train_data['error'] = train_data['prediction_insample'] - train_data['Injuries']
train_data['squared_error'] = train_data['error'].pow(2)
train_data['abs'] = train_data['error'].abs()

train_data

# The mean of 'error' comes out as (numerically) zero by construction: a
# least-squares fit that includes an intercept always has in-sample residuals
# that sum to zero. The mean squared and mean absolute errors below are the
# informative measures of fit on the training set.
train_data[['error', 'squared_error', 'abs']].mean()

# Overlay the actual yearly injury counts and the in-sample fitted line.
sns.set(style="darkgrid")
sns.lineplot(data=train_data, x='Year', y='Injuries')
sns.lineplot(data=train_data, x='Year', y='prediction_insample')
plt.title('Number of Injuries Year by Year')
plt.xlabel('Year')
plt.ylabel('Number of Injuries')


### <u>Time to test on a new set of data:</u>

# Rows from position 16 onward are the years held out from training.
test_data = df[16:]

# Target and (n_samples, 1) feature matrix for the held-out years.
y_test = test_data['Injuries']
X_test = test_data[['Year']].to_numpy()

# Predict the held-out years with the already-fitted model.
y_pred = fit_model.predict(X_test)

## Now let's evaluate the model's performance 
## Using R-Squared score to determine the performance of the model

# Score the held-out predictions: R-squared and mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared Score:", r2)

## As seen above, our model is not performing optimally

# Compare the model's predictions with the observed counts for the held-out years.

# Apply the theme before anything is drawn so the axes are created with it.
sns.set(style="darkgrid")
plt.plot(X_test, y_pred, 'r-', label='Predicted Data')
# Label the observed series as well, so the legend identifies both lines
# rather than only the predicted one.
sns.lineplot(data=test_data, x='Year', y='Injuries', label='Actual Data')
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')
# Years are whole numbers; force integer tick positions on the x-axis.
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.show()

# Discard every record whose year is earlier than 1990.
injuryData = injuryData.loc[injuryData['Year'].ge(1990)]

## It is important to note that the NBA did not have an IR/IL until the 1989-1990 season. Prior to this, injured players were still members of the active roster. For this reason, it's likely that many injuries do not appear in the dataset.

### My Data is overfitting my model. Time to add a regularization term to better fit future data

# Occasionally the DataFrame will not properly remove all irrelevant rows,
# so the average and total are computed from the per-year counts instead.
injury_numbers_list = [*yearly_injuries.values()]
yearly_counts = np.asarray(injury_numbers_list)
avg_inj_per_year = yearly_counts.mean(axis=0)
print(avg_inj_per_year)
print(yearly_counts.sum(axis=0))

## I'd now like to encode each individual player name, as well as remove the date column

# The raw date column is dropped; the 'Year' column derived from it is kept.
del injuryData['Date']

# Encode player names with a dedicated encoder. Reusing one LabelEncoder for
# several columns overwrites its fitted classes on every refit, which would make
# it impossible to map the player codes back to names later
# (player_encoder.inverse_transform).
player_encoder = LabelEncoder()
injuryData['Player'] = player_encoder.fit_transform(injuryData['Player'])

# Fresh encoder kept under the original name, because later cells fit it on
# other columns.
label_encoder = LabelEncoder()


injuryData

+ Due to the lack of a true consensus on the number of players in the NBA from 1990 to the present, I have estimated the total number of players in this time frame based on the number of teams in the league and the number of roster spots each team was allowed 

+ This does not account for players on two way contracts or players signed midseason, so the number is not 100% accurate, simply an estimation

# Estimated total number of player roster spots across all years (duplicates allowed)
total_player_instances = 13446

# NOTE(review): 33 is presumably the number of seasons covered (1990-present) -- confirm.
seasons_covered = 33

# Estimated number of players per year
ppy = total_player_instances / seasons_covered

## Now I'd like to encode my Injury Type section. However, there are some extra words I'd like to remove first

# Boilerplate phrases and parenthetical remarks to strip from the injury notes
# before encoding them.
slices_to_remove = ["placed on IL with", "placed on IR with", "placed on disabled list with", "placed on IL for",
                   "placed on IL recovering from", "placed on IL during", "(date approximate)", "(out for season)", "(DTD)", "(out 6-8 weeks)"]

# Work on the column as strings. A missing note becomes an empty string so the
# text operations below cannot fail on NaN.
cleaned_notes = injuryData["Injury Type"].fillna("").astype(str)

# Remove each phrase literally (regex=False: several phrases contain parentheses).
for slice_to_remove in slices_to_remove:
    cleaned_notes = cleaned_notes.str.replace(slice_to_remove, "", regex=False)

# Removing a prefix/suffix leaves stray spaces behind (e.g. " sore knee " vs
# " sore knee"); strip them so identical injuries share a single encoded label.
cleaned_notes = cleaned_notes.str.strip()

# Plain-list view of the cleaned notes, kept under its original name.
notes_list = cleaned_notes.tolist()

injuryData["Injury Type"] = cleaned_notes

# Replace the cleaned text with integer labels.
injuryData['Injury Type'] = label_encoder.fit_transform(injuryData['Injury Type'])

### Next I decided to create a number of variables to train model on

# Variables for model training

# Count the distinct encoded values directly. The encoded labels start at 0, so
# the maximum label is one less than the number of distinct values.
uniq_inj = injuryData["Injury Type"].nunique()
uniq_plyr = injuryData["Player"].nunique()

# Total number of injury records (rows) and the average per distinct player.
injury_instances = len(injuryData)
inj_per_plyr = injury_instances / uniq_plyr

# Number of injury records per team, as a two-column frame and as a list of
# {'Team': ..., 'Injury_Count': ...} records.
inj_per_team = injuryData.groupby('Team')['ID'].count().reset_index()
inj_per_team.columns = ['Team', 'Injury_Count']
inj_list = inj_per_team.to_dict(orient='records')

print(f"Unique Injuries: {uniq_inj}")
print(f"Unique Player Instances: {uniq_plyr}")
print(f"Average number of injuries per player: {inj_per_plyr}")
# inj_per_team is a table of per-team totals, not an average, so label it as such.
print(f"Number of injuries per team:\n{inj_per_team}")


# Collapse the list of per-team records into a single {team: injury count} mapping
inj_dict = {record['Team']: record['Injury_Count'] for record in inj_list}

# Horizontal bar chart of the total number of injuries for each team
team_names = list(inj_dict.keys())
team_totals = list(inj_dict.values())
ax = sns.barplot(x=team_totals, y=team_names)

# Shrink the team labels so they all fit, then label the axes and title the chart
ax.set_yticklabels(ax.get_yticklabels(), fontsize=8)
ax.set_xlabel('Total Injuries')
ax.set_ylabel('Teams')
ax.set_title('Total Injuries in Franchise History')

# Render the figure
plt.show()

## Create a scatter plot to see the total number on injuries over 

# Narrow the injury types down to the 20 that occur most often.
# NOTE(review): 'Injury Type' holds encoded integer labels at this point; an
# encoder's inverse_transform would be needed to show readable names.
unique_inj_arr = injuryData["Injury Type"].value_counts()
injuries = injuryData["Injury Type"]

# Order the counts from most to least frequent and keep the first 20 entries.
sorted_injuries = unique_inj_arr.sort_values(ascending=False)
top_20_injuries = sorted_injuries.iloc[:20]


injuryData