In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.ticker as ticker

In [2]:
injuryData = pd.read_csv("~/Documents/Job Search - 2023/NBA_Injury_Predictor/Injuries.csv")

In [3]:
injuryData.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
injuryData.rename(columns={'Notes': 'Injury Type'}, inplace=True)
injuryData.rename(columns={'Relinquished': 'Player'}, inplace=True)

### We want to separate the dataset by year to get a good idea of the total number of injuries that have occured annually

In [4]:
# Setting the objects in the column to a pandas datetime object
injuryData['Date'] = pd.to_datetime(injuryData['Date'])

#Grouping rows by 'decade' column
injuryData['Year'] = (injuryData['Date'].dt.year)



### Let's now remove all rows where the players have been acquired back from the IL 


In [5]:
# Removing the Acquired column removes all duplicates 
injuryData = injuryData[injuryData['Acquired'].isna()]
del injuryData['Acquired']

### Now let's calculate the total sum of injuries for each Year

## `It is important to note that the NBA did not have and IR/IL until the 1989-1990 season. Prior to this injured players were still members of the active roster. For this reason, it's likely many injuries do not appear in the dataset.`

In [6]:
# We only want to include data from the years after 1989 
yearly_injuries = {}
total_injuries = 0
for year, group in injuryData.groupby('Year'):
    if year > 1989: 
        injuries = group.shape[0]
        yearly_injuries[year] = injuries
        total_injuries += injuries
#         print(f"Year: {year} Injuries: {injuries}")
#         print(f"Total Injuries: {total_injuries}")
#         print("---------------------------------------------")
df = pd.DataFrame(yearly_injuries.items(), columns=['Year', 'Injuries'])
condition1 = df['Year'].astype(int) <= 2015
df.loc[condition1, 'Players'] = 354

# Set the condition for the second range of years (1996 onwards)
condition2 = df['Year'].astype(int) > 2015
df.loc[condition2, 'Players'] = 450
df      

Unnamed: 0,Year,Injuries,Players
0,1990,64,354.0
1,1991,66,354.0
2,1992,28,354.0
3,1993,33,354.0
4,1994,127,354.0
5,1995,245,354.0
6,1996,277,354.0
7,1997,290,354.0
8,1998,157,354.0
9,1999,396,354.0


In [7]:
# calculating the correlation for the two features I will use to train my model
Players_corr = df['Players'].corr(df['Injuries'])
Year_corr = df['Year'].corr(df['Injuries'])

print("Correlation for Players feature:", Players_corr)
print("Correlation for Year feature:", Year_corr)

Correlation for Players feature: 0.5759226257184823
Correlation for Year feature: 0.8285992890740163


In [20]:
def normalize_players(df):
  # Get the mean and standard deviation of the players feature.
  mean = df['Players'].mean()
  std = df['Players'].std()

  # Normalize the players feature.
  df['Normalized Players'] = (df['Players'] - mean) / std
#   df['Normalized Players'].fillna(mean, inplace=True)

  # Print the data frame.

  return df


In [21]:
#Using Linear Regression function to fit our model
train_data = df[:26]

train_data = normalize_players(train_data)
print(train_data)
# X_train = np.array(train_data[['Year', 'Normalized Players']]).reshape(-1, 2)
# y_train = train_data['Injuries']
# reg = LinearRegression()


# fit_model = reg.fit(X_train, y_train)
# print(df.shape)
# X_train.shape


    Year  Injuries  Players  Normalized Players
0   1990        64    354.0               354.0
1   1991        66    354.0               354.0
2   1992        28    354.0               354.0
3   1993        33    354.0               354.0
4   1994       127    354.0               354.0
5   1995       245    354.0               354.0
6   1996       277    354.0               354.0
7   1997       290    354.0               354.0
8   1998       157    354.0               354.0
9   1999       396    354.0               354.0
10  2000       326    354.0               354.0
11  2001       335    354.0               354.0
12  2002       318    354.0               354.0
13  2003       310    354.0               354.0
14  2004       356    354.0               354.0
15  2005       627    354.0               354.0
16  2006       938    354.0               354.0
17  2007       962    354.0               354.0
18  2008       857    354.0               354.0
19  2009       755    354.0             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Normalized Players'] = (df['Players'] - mean) / std
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Normalized Players'].fillna(mean, inplace=True)


In [None]:
# using a scikit-learn attribute, the coef represents the wieghts assigned to different features 
print(f"The Weight of feature(s): {fit_model.coef_}")

In [None]:
# getting the intercept(bias) for our fitted model
print(f"The Bias of the model: {fit_model.intercept_}")

In [None]:
# Adding our insample prediction for each year to our training data set
train_data.loc[:, 'prediction_insample'] = fit_model.predict(X_train).copy()
train_data.head()

In [None]:
# Calculating the error, MSE, and ABS error for each year
train_data['error'] = train_data.prediction_insample - train_data.Injuries
train_data['squared_error'] = train_data['error']**2
train_data['abs'] = abs(train_data['error'])

train_data

In [None]:
# I'm not sure why my error is zero 

# Returns the mean for error, squared error and absolute error for the training set
train_data[['error', 'squared_error', 'abs']].mean()

In [None]:
sns.set(style="darkgrid")
sns.lineplot(x=train_data['Year'], y=train_data['Injuries'], data=train_data)
sns.lineplot(x=train_data['Year'], y=train_data['prediction_insample'], data=train_data)
plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')


### <u>Time to test on a new set of data:<u>

In [None]:
test_data = df[26:]
 
X_test = np.array(test_data[['Year', 'Players']]).reshape([-1, 2])
y_test = test_data['Injuries']
X_test

In [None]:
y_pred = fit_model.predict(X_test)
y_pred

In [None]:
## Now let's evaluate the models performance 
## Using R-Squared score to determine the performance of the model
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error:", mae)
print("R-squared Score:", r2)

In [None]:
df.describe()["Injuries"]

## As seen above, or model is not performing optimally

In [None]:
plt.plot(X_test, y_pred, 'r-', label='Predicted Data')
sns.set(style="darkgrid")
sns.lineplot(x=test_data['Year'], y=test_data['Injuries'], data=test_data)
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.show()

In [None]:
# Now let's remove all the rows from the data frame that have data before 1990
injuryData = injuryData[injuryData['Year'] >= 1990]