In [4]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import re
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import process



In [2]:
# Load the raw injury log and discard the 'Unnamed: 0' column that comes with the file.
df = pd.read_csv("NBA Player Injury Stats(1951 - 2023).csv").drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,1951-12-25,Bullets,,Don Barksdale,placed on IL
1,1952-12-26,Knicks,,Max Zaslofsky,placed on IL with torn side muscle
2,1956-12-29,Knicks,,Jim Baechtold,placed on inactive list
3,1959-01-16,Lakers,,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Lakers,,Elgin Baylor,player reported for military duty


In [4]:
# Merge the two player columns: keep 'Acquired', taking the value from
# 'Relinquished' on rows where 'Acquired' is missing, then discard 'Relinquished'.
df = (
    df
    .assign(Acquired=df['Acquired'].fillna(df['Relinquished']))
    .drop(columns=['Relinquished'])
)

In [5]:
# Name the merged player column 'Players', discard 'Team', and parse 'Date' as datetimes.
df = df.rename(columns={'Acquired': 'Players'}).drop(columns=['Team'])
df['Date'] = pd.to_datetime(df['Date'])

In [6]:
# Display the frame after the column clean-up (Date, Players, Notes remain).
df

Unnamed: 0,Date,Players,Notes
0,1951-12-25,Don Barksdale,placed on IL
1,1952-12-26,Max Zaslofsky,placed on IL with torn side muscle
2,1956-12-29,Jim Baechtold,placed on inactive list
3,1959-01-16,Elgin Baylor,player refused to play after being denied a ro...
4,1961-11-26,Elgin Baylor,player reported for military duty
...,...,...,...
37662,2023-04-16,Marcus Morris,activated from IL
37663,2023-04-16,Dillon Brooks,activated from IL
37664,2023-04-16,Ja Morant,activated from IL
37665,2023-04-16,Jaren Jackson Jr.,activated from IL


In [16]:

def preprocess_text(text):
    """Normalise a note for word counting.

    Lower-cases ``text`` and deletes every character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    str
        The lower-cased text with punctuation removed.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

# Function to tokenize text and remove stopwords
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, reading the corpus only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def tokenize_and_remove_stopwords(text):
    """Split ``text`` into tokens and drop English stop words.

    Parameters
    ----------
    text : str
        Text to tokenize (callers in this notebook pass lower-cased,
        punctuation-free notes).

    Returns
    -------
    list of str
        Tokens from ``word_tokenize(text)`` that are not in NLTK's English
        stop-word list, in their original order.

    Notes
    -----
    Relies on NLTK's tokenizer models and 'stopwords' corpus already being
    available locally. The stop-word set is cached because this function is
    called once per row (and once per keyword); reloading it on every call is
    pure overhead.
    """
    stop_words = _get_stop_words('english')
    return [token for token in word_tokenize(text) if token not in stop_words]

In [17]:
# Pre-process keywords for fuzzy matching
def preprocess_keywords(keywords):
    """Map each keyword to its closest fuzzy match among the keywords' own tokens.

    The candidate vocabulary is built by tokenizing every keyword (stop words
    removed). Each keyword is then matched against that vocabulary with
    ``fuzzywuzzy.process.extractOne``.

    Parameters
    ----------
    keywords : iterable of str
        Keywords to canonicalise. It is iterated twice, so pass a list or
        other re-iterable container.

    Returns
    -------
    dict
        ``{keyword: best_matching_token}``. A keyword with no candidate to
        match against maps to itself.

    Notes
    -----
    NOTE(review): because the vocabulary comes from the keywords themselves, a
    single-token keyword presumably matches itself exactly, making the mapping
    an identity for such inputs - confirm whether a separate canonical
    vocabulary was intended.
    """
    vocabulary = Counter()
    for keyword in keywords:
        vocabulary.update(tokenize_and_remove_stopwords(keyword))
    # Materialise once; every extractOne call scans the whole candidate list.
    candidates = list(vocabulary)

    keyword_mapping = {}
    for keyword in keywords:
        best = process.extractOne(keyword, candidates) if candidates else None
        # extractOne yields None when there is nothing to pick from (e.g. every
        # keyword was a stop word); fall back to the keyword itself instead of
        # failing on ``None[0]``.
        keyword_mapping[keyword] = best[0] if best is not None else keyword
    return keyword_mapping

def count_words(text, keyword_mapping):
    """Count keyword occurrences in ``text``.

    The text is tokenized with stop words removed; every token that is a key
    of ``keyword_mapping`` contributes one count to the value it maps to.
    Tokens that are not keys are ignored.

    Parameters
    ----------
    text : str
        Pre-processed note text.
    keyword_mapping : dict
        Token -> canonical keyword, as produced by ``preprocess_keywords``.

    Returns
    -------
    collections.Counter
        Canonical keyword -> number of occurrences in ``text``.
    """
    tokens = tokenize_and_remove_stopwords(text)
    return Counter(
        keyword_mapping[token] for token in tokens if token in keyword_mapping
    )

In [19]:
# Store a normalised copy of every note (lower-cased, punctuation removed)
# in a helper column used by the word-counting steps.
df['Injury_Notes_Preprocessed'] = df['Notes'].map(preprocess_text)

In [23]:
# Select top N most frequent words as keywords
N = 100  # Number of keywords to select
total_word_counts = Counter()
# Iterate over the single column that is needed instead of iterrows(), which
# builds a full Series for every row, and feed tokens straight into the
# running Counter rather than allocating a throwaway Counter per row.
for note in df['Injury_Notes_Preprocessed']:
    total_word_counts.update(tokenize_and_remove_stopwords(note))

In [24]:
# Keep only the words (not their counts) of the N most frequent tokens,
# ordered from most to least common.
keywords = [entry[0] for entry in total_word_counts.most_common(N)]

In [25]:
# Build the keyword -> closest-fuzzy-match lookup that count_words() uses
# to decide which tokens to count and under which name.
keyword_mapping = preprocess_keywords(keywords)

In [26]:
# For every pre-processed note, store a Counter of the mapped keywords it contains.
df['Word_Counts'] = df['Injury_Notes_Preprocessed'].apply(
    count_words, args=(keyword_mapping,)
)

In [27]:
# Create columns for each key word with count.
# Build the whole keyword-count table in one pass and attach it with a single
# concat: inserting the columns one at a time re-scans 'Word_Counts' once per
# keyword and leaves the frame fragmented. A keyword absent from a row's
# counts gets 0.
keyword_counts = pd.DataFrame(
    [[counts.get(word, 0) for word in keywords] for counts in df['Word_Counts']],
    index=df.index,
    columns=keywords,
)
df = pd.concat([df, keyword_counts], axis=1)

In [28]:
# The helper columns have served their purpose once the per-keyword counts exist.
df = df.drop(columns=['Injury_Notes_Preprocessed', 'Word_Counts'])

In [29]:
# Plain-text dump of the frame, now carrying one count column per keyword.
print(df)

            Date             Players  \
0     1951-12-25       Don Barksdale   
1     1952-12-26       Max Zaslofsky   
2     1956-12-29       Jim Baechtold   
3     1959-01-16        Elgin Baylor   
4     1961-11-26        Elgin Baylor   
...          ...                 ...   
37662 2023-04-16       Marcus Morris   
37663 2023-04-16       Dillon Brooks   
37664 2023-04-16           Ja Morant   
37665 2023-04-16   Jaren Jackson Jr.   
37666 2023-04-16        Santi Aldama   

                                                   Notes  il  placed  \
0                                           placed on IL   1       1   
1                     placed on IL with torn side muscle   1       1   
2                                placed on inactive list   0       1   
3      player refused to play after being denied a ro...   0       0   
4                      player reported for military duty   0       0   
...                                                  ...  ..     ...   
37662          

In [30]:
# Count missing values per column. head(205) only caps the listing at 205
# entries; the saved output reports Length: 103, so every column is shown.
df.isnull().sum().head(205)

Date         0
Players      1
Notes        0
il           0
placed       0
            ..
disc         0
nose         0
ring         0
fasciitis    0
reaction     0
Length: 103, dtype: int64

In [31]:
# Keep at most the first 203 columns.
# NOTE(review): the preceding null summary reports only 103 columns, so this
# slice appears to keep everything - confirm what the 203 cut-off was meant to drop.
df = df.iloc[:, :203]

In [32]:
# Display the frame after the column slice.
df

Unnamed: 0,Date,Players,Notes,il,placed,activated,left,right,knee,injury,...,shin,knees,upper,pulled,f,disc,nose,ring,fasciitis,reaction
0,1951-12-25,Don Barksdale,placed on IL,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1952-12-26,Max Zaslofsky,placed on IL with torn side muscle,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1956-12-29,Jim Baechtold,placed on inactive list,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1959-01-16,Elgin Baylor,player refused to play after being denied a ro...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1961-11-26,Elgin Baylor,player reported for military duty,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37662,2023-04-16,Marcus Morris,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37663,2023-04-16,Dillon Brooks,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37664,2023-04-16,Ja Morant,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37665,2023-04-16,Jaren Jackson Jr.,activated from IL,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# The free-text notes are no longer needed once the keyword counts exist.
df = df.drop(columns=['Notes'])

In [34]:
def get_season_year(date):
    """Return the year in which the season containing ``date`` started.

    Dates in October through December belong to the season starting that
    calendar year; dates in January through September belong to the season
    that started the previous calendar year.

    Parameters
    ----------
    date : object with ``year`` and ``month`` attributes
        For example ``datetime.date`` or ``pandas.Timestamp``.

    Returns
    -------
    int
        The season's starting year.
    """
    return date.year if date.month >= 10 else date.year - 1

In [35]:
# Tag every record with the season it belongs to.
df['Season'] = df['Date'].apply(get_season_year)
# Normalise names BEFORE aggregating: names that differ only by leading
# whitespace would otherwise form separate (Players, Season) groups and show
# up as duplicate rows once the whitespace is stripped later on.
df['Players'] = df['Players'].str.lstrip()
# numeric_only=True keeps the datetime 'Date' column out of the sum. Older
# pandas dropped it silently; pandas >= 2.0 raises because datetimes cannot
# be summed.
grouped = df.groupby(['Players', 'Season']).sum(numeric_only=True)

In [36]:
# Turn the (Players, Season) group keys back into ordinary columns, then
# inspect the last 20 aggregated rows.
grouped = grouped.reset_index()
grouped.tail(20)

Unnamed: 0,Players,Season,il,placed,activated,left,right,knee,injury,sprained,...,shin,knees,upper,pulled,f,disc,nose,ring,fasciitis,reaction
9277,Zydrunas Ilgauskas,1996,2,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9278,Zydrunas Ilgauskas,1998,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9279,Zydrunas Ilgauskas,1999,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9280,Zydrunas Ilgauskas,2000,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9281,Zydrunas Ilgauskas,2001,2,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9282,Zydrunas Ilgauskas,2005,2,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9283,Zydrunas Ilgauskas,2007,5,3,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9284,Zydrunas Ilgauskas,2008,5,3,2,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9285,Zydrunas Ilgauskas,2009,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9286,Zydrunas Ilgauskas,2010,5,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Remove leading whitespace from the player names.
# NOTE(review): this runs on the already-aggregated frame. If the names were
# not normalised before the groupby, rows that differ only by leading
# whitespace stay as separate (Players, Season) entries - verify.
grouped['Players'] = grouped['Players'].str.lstrip()

In [38]:
# List the distinct player labels. The saved output contains entries that are
# not names (e.g. '11/25/2019', 'placed on IL with surgery on right knee'),
# so the column likely still needs cleaning - TODO confirm and filter.
grouped['Players'].unique()

array(['(James) Mike Scott', '(William) Tony Parker', '11/25/2019', ...,
       'placed on IL with surgery on right knee',
       'placed on IL with torn labrum in right hip',
       'strained left quadriceps (DTD)'], dtype=object)

In [39]:
# Persist the per-player, per-season keyword counts.
# index=False: after reset_index() the index is only a running row number;
# writing it would add an unnamed leading column that readers have to drop
# again (like the 'Unnamed: 0' column removed from the input file).
grouped.to_csv("injury_data.csv", index=False)