In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv("NBA Player Injury Stats(1951 - 2023).csv")
del df['Unnamed: 0']

In [3]:
df.head()

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,1951-12-25,Bullets,,Don Barksdale,placed on IL
1,1952-12-26,Knicks,,Max Zaslofsky,placed on IL with torn side muscle
2,1956-12-29,Knicks,,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,,Elgin Baylor,player reported for military duty


In [4]:
# Fill NaN values in column1 with values from column2
df['Acquired'] = df['Acquired'].fillna(df['Relinquished'])

# Drop column2
df.drop(columns=['Relinquished'], inplace=True)

In [5]:
df.rename(columns = {'Acquired':'Players'}, inplace = True)
df['Date'] = pd.to_datetime(df['Date'])
df.drop(columns = {'Team'}, inplace = True)

In [6]:
df

Unnamed: 0,Date,Players,Notes
0,1951-12-25,Don Barksdale,placed on IL
1,1952-12-26,Max Zaslofsky,placed on IL with torn side muscle
2,1956-12-29,Jim Baechtold,placed on inactive list
3,1959-01-16,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Elgin Baylor,player reported for military duty
...,...,...,...
37662,2023-04-16,Marcus Morris,activated from IL
37663,2023-04-16,Dillon Brooks,activated from IL
37664,2023-04-16,Ja Morant,activated from IL
37665,2023-04-16,Jaren Jackson Jr.,activated from IL


In [8]:
# Tokenization and preprocessing
stop_words = set(stopwords.words('english'))

In [9]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [10]:
# Create vocabulary
vocabulary = set()
for sentence in df['Notes']:
    tokens = preprocess_text(sentence)
    vocabulary.update(tokens)

In [11]:
vocabulary

{'1',
 '11',
 '2',
 '3',
 '6',
 '8',
 'abcess',
 'abdomen',
 'abdominal',
 'abductor',
 'abrasion',
 'abscess',
 'abscessed',
 'absence',
 'abuse',
 'accident',
 'ache',
 'achilled',
 'achilles',
 'acl',
 'acquired',
 'activarted',
 'activated',
 'active',
 'acute',
 'addiction',
 'adductor',
 'adjustment',
 'admitted',
 'adominal',
 'adverse',
 'aggravated',
 'ailment',
 'allergic',
 'already',
 'ankle',
 'aorta',
 'ap',
 'appendectomy',
 'appendix',
 'approximate',
 'approxiomate',
 'arch',
 'area',
 'arm',
 'army',
 'around',
 'arrest',
 'arrhythmia',
 'artery',
 'arthritic',
 'arthritis',
 'arthroscopic',
 'asthma',
 'asthmatic',
 'athletic',
 'auto',
 'avulsion',
 'back',
 'bacterial',
 'base',
 'behavior',
 'benign',
 'bicep',
 'biceps',
 'big',
 'bilateral',
 'birth',
 'bladder',
 'blood',
 'body',
 'boken',
 'bone',
 'broken',
 'bronchitis',
 'brother',
 'bruise',
 'bruised',
 'bruising',
 'bruse',
 'brused',
 'bubble',
 'buck',
 'buised',
 'bulding',
 'bulging',
 'bull',
 'bur

In [12]:
# Create vocabulary
all_words = []
for sentence in df['Notes']:
    tokens = preprocess_text(sentence)
    all_words.extend(tokens)

In [13]:
from collections import Counter
word_freq = Counter(all_words)
top_200_words = [word for word, _ in word_freq.most_common(200)]
top_200_words

['il',
 'placed',
 'activated',
 'left',
 'right',
 'knee',
 'injury',
 'sprained',
 'ankle',
 'sore',
 'strained',
 'back',
 'season',
 'ir',
 'foot',
 'tendinitis',
 'surgery',
 'bruised',
 'lower',
 'protocol',
 'hamstring',
 'illness',
 'health',
 'safety',
 'nba',
 'shoulder',
 'torn',
 'recovering',
 'calf',
 'hip',
 'groin',
 'fractured',
 'spasm',
 'achilles',
 'wrist',
 'rest',
 'p',
 'toe',
 'thumb',
 'repair',
 'tendon',
 'hand',
 'concussion',
 'flu',
 'bone',
 'finger',
 'leg',
 'elbow',
 'broken',
 'patella',
 'fracture',
 'quadriceps',
 'stress',
 'muscle',
 'plantar',
 'thigh',
 'acl',
 'mcl',
 'big',
 'ligament',
 'neck',
 'heel',
 'per',
 'inflammation',
 'date',
 'arthroscopic',
 'abdominal',
 'rib',
 'quadricep',
 'adductor',
 'flexor',
 'stomach',
 'bruise',
 'meniscus',
 'infection',
 'cbc',
 'dislocated',
 'tightness',
 'week',
 'strain',
 'cartilage',
 'eye',
 'patellar',
 'hyperextended',
 'approximate',
 'respiratory',
 'stats',
 'partially',
 'dtd',
 'index',

In [14]:
# dictionary: 'abcess'/'abscess', 'abdom'/'addominal', 'abductor', 'abrasion', 'accident', 'ache', 'achille'/'acl', 'adductor', 'acute', 'ankle','asthma', 'arthriti', 'back', 'bicep', 'bilateral', 'broken', 'bruis'/'bruse', 'bronchitis', 'buttock', 'valf', 'cartilage', 'cervica', 'chest', 'chronic'

In [15]:
# Create new columns
for word in top_200_words:
    df[word] = 0

# Populate columns
for i, sentence in enumerate(df['Notes']):
    tokens = preprocess_text(sentence)
    for token in tokens:
        df.at[i, token] = 1

# Drop the original injury_notes column if desired
# df.drop('injury_notes', axis=1, inplace=True)

print(df)

            Date             Players  \
0     1951-12-25       Don Barksdale   
1     1952-12-26       Max Zaslofsky   
2     1956-12-29       Jim Baechtold   
3     1959-01-16        Elgin Baylor   
4     1961-11-26        Elgin Baylor   
...          ...                 ...   
37662 2023-04-16       Marcus Morris   
37663 2023-04-16       Dillon Brooks   
37664 2023-04-16           Ja Morant   
37665 2023-04-16   Jaren Jackson Jr.   
37666 2023-04-16        Santi Aldama   

                                                   Notes  il  placed  \
0                                           placed on IL   1       1   
1                     placed on IL with torn side muscle   1       1   
2                                placed on inactive list   0       1   
3      player refused to play after being denied a ro...   0       0   
4                      player reported for military duty   0       0   
...                                                  ...  ..     ...   
37662          

In [16]:
df.isnull().sum().head(205)

Date            0
Players         1
Notes           0
il              0
placed          0
            ...  
effusion        0
head            0
clot            0
side        37663
inactive    37666
Length: 205, dtype: int64

In [19]:
df = df.iloc[:, :203]

In [20]:
df

Unnamed: 0,Date,Players,Notes,il,placed,activated,left,right,knee,injury,...,biceps,accident,deep,loose,recovery,chondromalacia,pinched,effusion,head,clot
0,1951-12-25,Don Barksdale,placed on IL,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1952-12-26,Max Zaslofsky,placed on IL with torn side muscle,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1956-12-29,Jim Baechtold,placed on inactive list,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1959-01-16,Elgin Baylor,player refused to play after being denied a ro...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1961-11-26,Elgin Baylor,player reported for military duty,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37662,2023-04-16,Marcus Morris,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37663,2023-04-16,Dillon Brooks,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37664,2023-04-16,Ja Morant,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37665,2023-04-16,Jaren Jackson Jr.,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
del df['Notes']

In [21]:
def get_season_year(date):
    if date.month >= 10:
        return date.year
    else:
        return date.year - 1

In [25]:
df['Season'] = df['Date'].apply(get_season_year)
grouped = df.groupby(['Players', 'Season']).sum()

In [32]:
grouped.reset_index(inplace = True)
grouped.tail(20)

Unnamed: 0,Players,Season,il,placed,activated,left,right,knee,injury,sprained,...,biceps,accident,deep,loose,recovery,chondromalacia,pinched,effusion,head,clot
9277,Zydrunas Ilgauskas,1996,2,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9278,Zydrunas Ilgauskas,1998,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9279,Zydrunas Ilgauskas,1999,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9280,Zydrunas Ilgauskas,2000,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9281,Zydrunas Ilgauskas,2001,2,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9282,Zydrunas Ilgauskas,2005,2,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9283,Zydrunas Ilgauskas,2007,5,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9284,Zydrunas Ilgauskas,2008,5,3,2,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9285,Zydrunas Ilgauskas,2009,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9286,Zydrunas Ilgauskas,2010,5,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
grouped['Players'] = grouped['Players'].str.lstrip()

In [47]:
grouped['Players'].unique()

array(['(James) Mike Scott', '(William) Tony Parker', '11/25/2019', ...,
       'placed on IL with surgery on right knee',
       'placed on IL with torn labrum in right hip',
       'strained left quadriceps (DTD)'], dtype=object)

In [50]:
grouped.to_csv("injury_data.csv")