In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.ticker as ticker

In [2]:
injuryData = pd.read_csv("~/Documents/Job Search - 2023/NBA_Injury_Predictor/Injuries.csv")

In [3]:
injuryData.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
injuryData.rename(columns={'Notes': 'Injury Type'}, inplace=True)
injuryData.rename(columns={'Relinquished': 'Player'}, inplace=True)

### We want to separate the dataset by year to get a good idea of the total number of injuries that have occured annually

In [4]:
# Setting the objects in the column to a pandas datetime object
injuryData['Date'] = pd.to_datetime(injuryData['Date'])

#Grouping rows by 'decade' column
injuryData['Year'] = (injuryData['Date'].dt.year)



### Let's now remove all rows where the players have been acquired back from the IL 


In [5]:
# Removing the Acquired column removes all duplicates 
injuryData = injuryData[injuryData['Acquired'].isna()]
del injuryData['Acquired']

### Now let's calculate the total sum of injuries for each Year

## `It is important to note that the NBA did not have and IR/IL until the 1989-1990 season. Prior to this injured players were still members of the active roster. For this reason, it's likely many injuries do not appear in the dataset.`

In [6]:
# We only want to include data from the years after 1989 
yearly_injuries = {}
total_injuries = 0
for year, group in injuryData.groupby('Year'):
    if year > 1989: 
        injuries = group.shape[0]
        yearly_injuries[year] = injuries
        total_injuries += injuries
        
df = pd.DataFrame(yearly_injuries.items(), columns=['Year', 'Injuries'])
condition1 = df['Year'].astype(int) <= 2015
df.loc[condition1, 'Players'] = 354
df['isleagueExpansion'] = np.where(df['Year']<=2015, 0, 1)

# Set the condition for the second range of years (1996 onwards)
condition2 = df['Year'].astype(int) > 2015
df.loc[condition2, 'Players'] = 450
    

In [7]:
# adding the total number of games played in the NBA each year to the Dataframe
def get_games_played(df):

    games_played = {
      year: 6620
      for year in range(1990, 2024)
  }

  # Update the dictionary for years with a different number of games played.
    games_played[1999] = 4100
    games_played[2011] = 4096
    games_played[2020] = 4384
    games_played[2021] = 4384

  # Create a new column in the DataFrame for the number of games played.
    df['Games Played'] = df.Year.map(games_played)
    df['Games Played'] = df['Games Played'].astype(int)

get_games_played(df)

In [8]:
# Adding feature to account for a lockout season
df['Lockout'] = np.where(df['Year'].isin([1999, 2011]), 1, 0)

In [9]:
# Add feature for average number of miles travelled year over year
annual_travel = pd.read_csv("~/Documents/Job Search - 2023/NBA_Injury_Predictor/AverageTravel.csv")
df = pd.concat([df, annual_travel], axis=1)

In [10]:
# why are there so few injuries in the early in the early 90s
# use a proxy to get data
# average NBA salary 
# Add a feature for average miles travelled per year
#

In [11]:
def normalize_players(df):
  # Get the mean and standard deviation of the players feature.
    mu = df['Players'].mean()
    std = df['Players'].std()

  # Normalize the players feature.
    df['Normalized Players'] = (df['Players'] - mu) / std
    
    return df

normalize_players(df)


Unnamed: 0,Year,Injuries,Players,isleagueExpansion,Games Played,Lockout,Year.1,Average Miles Traveled,Normalized Players
0,1990,64,354.0,0,6620,0,1990,10140,-0.546482
1,1991,66,354.0,0,6620,0,1991,10340,-0.546482
2,1992,28,354.0,0,6620,0,1992,10420,-0.546482
3,1993,33,354.0,0,6620,0,1993,10500,-0.546482
4,1994,127,354.0,0,6620,0,1994,10580,-0.546482
5,1995,245,354.0,0,6620,0,1995,10660,-0.546482
6,1996,277,354.0,0,6620,0,1996,10740,-0.546482
7,1997,290,354.0,0,6620,0,1997,10820,-0.546482
8,1998,157,354.0,0,6620,0,1998,10900,-0.546482
9,1999,396,354.0,0,4100,1,1999,10980,-0.546482


In [12]:
# calculating the correlation for the two features I will use to train my model
Players_corr = df['Players'].corr(df['Injuries'])
Normalized_corr = df['Normalized Players'].corr(df['Injuries'])
Year_corr = df['Year'].corr(df['Injuries'])


print("Correlation for Players feature:", Players_corr)
print("Correlation for Year feature:", Year_corr)
print("Correlation for Players feature:", Normalized_corr)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
#Using Linear Regression function to fit our model
train_data = df[:26]

X_train = np.array(train_data[['Year', 'Normalized Players']]).reshape(-1, 2)
y_train = train_data['Injuries']
reg = LinearRegression()


fit_model = reg.fit(X_train, y_train)
print(df.shape)
X_train.shape


In [None]:
# using a scikit-learn attribute, the coef represents the wieghts assigned to different features 
print(f"The Weight of feature(s): {fit_model.coef_}")

In [None]:
# getting the intercept(bias) for our fitted model
print(f"The Bias of the model: {fit_model.intercept_}")

In [None]:
# Adding our insample prediction for each year to our training data set
train_data.loc[:, 'prediction_insample'] = fit_model.predict(X_train).copy()
train_data.head()

In [None]:
# Calculating the error, MSE, and ABS error for each year
train_data['error'] = train_data.prediction_insample - train_data.Injuries
train_data['squared_error'] = train_data['error']**2
train_data['abs'] = abs(train_data['error'])

train_data

In [None]:
# I'm not sure why my error is zero 

# Returns the mean for error, squared error and absolute error for the training set
train_data[['error', 'squared_error', 'abs']].mean()

In [None]:
sns.set(style="darkgrid")
sns.lineplot(x=train_data['Year'], y=train_data['Injuries'], data=train_data)
sns.lineplot(x=train_data['Year'], y=train_data['prediction_insample'], data=train_data)
plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')


### <u>Time to test on a new set of data:<u>

In [None]:
test_data = df[26:]
 
X_test = np.array(test_data[['Year', 'Players']]).reshape([-1, 2])
y_test = test_data['Injuries']
X_test

In [None]:
y_pred = fit_model.predict(X_test)
y_pred
# Adding our insample prediction for each year to our test data set
test_data.loc[:, 'prediction_insample'] = fit_model.predict(X_test).copy()
test_data.head()

In [None]:
test_data['error'] = test_data.prediction_insample - test_data.Injuries
test_data['squared_error'] = test_data['error']**2
test_data['abs'] = abs(test_data['error'])

test_data

In [None]:
df.describe()["Injuries"]

In [None]:

plt.plot(X_test[:, 0], y_pred, 'r-', label='Predicted Data')
sns.set(style="darkgrid")
sns.lineplot(x=test_data['Year'], y=test_data['Injuries'], data=test_data)
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Injuries')
plt.title('Number of Injuries Year by Year')
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.show()

In [None]:
# Now let's remove all the rows from the data frame that have data before 1990
injuryData = injuryData[injuryData['Year'] >= 1990]