In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
cross_df = pd.read_csv('/content/drive/MyDrive/Practicum/new_training.csv')
cross_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4223 entries, 0 to 4222
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              4223 non-null   int64  
 1   player_name             4223 non-null   object 
 2   team_abbreviation       4223 non-null   object 
 3   age                     4223 non-null   float64
 4   body_fat_perct          4223 non-null   float64
 5   FGA_per_MP              4223 non-null   float64
 6   season                  4223 non-null   object 
 7   Pos                     4223 non-null   object 
 8   MP                      4223 non-null   float64
 9   TRB                     4223 non-null   float64
 10  Date                    4223 non-null   object 
 11  Major anatomical areas  4223 non-null   object 
 12  Anatomical sub-areas    4223 non-null   object 
 13  Notes                   4223 non-null   object 
 14  Upper_ext_count         4223 non-null   

In [4]:
##Trying to use total injuries and check accuracy -> worse
cross_df['total_injury_count'] = cross_df['Upper_ext_count']+cross_df['Lower_ext_count']+cross_df['Head-neck-trunk_count']

In [5]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

## Using Undersampling ###

In [6]:
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

X = cross_df.drop(columns=['Major anatomical areas'])
y = cross_df['Major anatomical areas']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


undersampler = RandomUnderSampler(sampling_strategy= 'auto',
                                  random_state=42)

X_undersampled, y_undersampled = undersampler.fit_resample(X, y_encoded)

print('Class distribution after undersampling:', Counter(y_undersampled))

Class distribution after undersampling: Counter({0: 591, 1: 591, 2: 591})


###Using Decision Tree

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report


# Encode categorical features using OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_undersampled[['Pos']])

# Select training features
other_features = X_undersampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]

# Combine encoded and numerical features
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_undersampled

#Using cross validation of 5 splits

dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.67605634 0.64225352 0.62253521 0.62429379 0.65819209]
Mean cross-validation score: 0.6446661892257499
Accuracy: 0.6446700507614214
Recall: 0.6446700507614214
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.65      0.65       591
           1       0.61      0.66      0.64       591
           2       0.67      0.62      0.65       591

    accuracy                           0.64      1773
   macro avg       0.65      0.64      0.64      1773
weighted avg       0.65      0.64      0.64      1773



### Using Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=60, criterion='entropy', random_state=42)
cv_scores = cross_val_score(rf_clf, X, y, cv=5)

y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.70985915 0.67042254 0.64788732 0.6299435  0.61581921]
Mean cross-validation score: 0.6547863451897828
Accuracy: 0.6548223350253807
Recall: 0.6548223350253807
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.69      0.65       591
           1       0.70      0.61      0.65       591
           2       0.65      0.67      0.66       591

    accuracy                           0.65      1773
   macro avg       0.66      0.65      0.65      1773
weighted avg       0.66      0.65      0.65      1773



###Using XGBoost

In [9]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05,objective='multi:softprob',eval_metric='mlogloss',random_state=42)
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)

y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.73521127 0.70704225 0.67042254 0.65254237 0.67514124]
Mean cross-validation score: 0.6880719344314474
Accuracy: 0.6880992667794699
Recall: 0.6880992667794699
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.75      0.70       591
           1       0.74      0.60      0.66       591
           2       0.68      0.71      0.70       591

    accuracy                           0.69      1773
   macro avg       0.69      0.69      0.69      1773
weighted avg       0.69      0.69      0.69      1773



##Using Oversampling

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

X = cross_df.drop(columns=['Major anatomical areas'])
y = cross_df['Major anatomical areas']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


oversampler = RandomOverSampler(sampling_strategy= 'auto',
                                  random_state=42)

X_oversampled, y_oversampled = oversampler.fit_resample(X, y_encoded)

print('Class distribution after oversampling:', Counter(y_oversampled))

Class distribution after oversampling: Counter({2: 2813, 0: 2813, 1: 2813})


###Using Decision Tree

In [12]:
# Encode categorical features using OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_oversampled[['Pos']])

# Select training features
other_features = X_oversampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]

# Combine encoded and numerical features
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_oversampled

#Using cross validation of 5 splits

dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.87381517 0.87973934 0.89869668 0.88033175 0.89567279]
Mean cross-validation score: 0.8856511460653955
Accuracy: 0.8856499585258917
Recall: 0.8856499585258917
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.98      0.90      2813
           1       0.97      0.68      0.80      2813
           2       0.88      1.00      0.93      2813

    accuracy                           0.89      8439
   macro avg       0.90      0.89      0.88      8439
weighted avg       0.90      0.89      0.88      8439



###Using Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=60, criterion='entropy',random_state=42)
cv_scores = cross_val_score(rf_clf, X, y, cv=5)

y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.84597156 0.84834123 0.87736967 0.85248815 0.86662715]
Mean cross-validation score: 0.8581595529797138
Accuracy: 0.8581585495911838
Recall: 0.8581585495911838
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.98      0.88      2813
           1       0.97      0.60      0.74      2813
           2       0.85      1.00      0.92      2813

    accuracy                           0.86      8439
   macro avg       0.88      0.86      0.85      8439
weighted avg       0.88      0.86      0.85      8439



### Using XGBosst

In [14]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(objective='multi:softprob',eval_metric='mlogloss',random_state=42)
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)

y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.84774882 0.84774882 0.8785545  0.86255924 0.8731476 ]
Mean cross-validation score: 0.8619517947392523
Accuracy: 0.8619504680649366
Recall: 0.8619504680649366
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      2813
           1       0.95      0.63      0.76      2813
           2       0.85      0.99      0.92      2813

    accuracy                           0.86      8439
   macro avg       0.87      0.86      0.85      8439
weighted avg       0.87      0.86      0.85      8439



## Using SMOTE-NC to oversample###

In [22]:
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import LabelEncoder
from collections import Counter
data = cross_df

# # Selecting specified features for X and the target for y
features = ['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Pos','Upper_ext_count',	'Lower_ext_count','Head-neck-trunk_count','total_injury_count']

X = data[features]
y = data['Major anatomical areas']

# 0-> head,neck,trunk ; 1-> lower extremity ; 2-> upper extremity
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Since 'Pos' is the only categorical feature among the ones selected, identify its index for SMOTENC
categorical_feature_indices = [X.columns.get_loc('Pos')]

# Apply SMOTENC, indicating the index of the categorical feature
smotenc = SMOTENC(categorical_features=categorical_feature_indices, random_state=42)
X_res, y_res = smotenc.fit_resample(X, y_encoded)




print('Class distribution after oversampling:', Counter(y_res))

Class distribution after oversampling: Counter({2: 2813, 0: 2813, 1: 2813})


In [23]:
X_res

Unnamed: 0,age,body_fat_perct,FGA_per_MP,MP,TRB,Pos,Upper_ext_count,Lower_ext_count,Head-neck-trunk_count,total_injury_count
0,26.000000,17.835740,0.321429,22.400000,2.000000,PG,1,0,0,1
1,26.000000,17.835740,0.321429,22.400000,2.000000,PG,1,0,1,2
2,26.000000,17.835740,0.321429,22.400000,2.000000,PG,1,1,1,3
3,28.000000,15.491044,0.319149,18.800000,1.500000,PG,0,1,0,1
4,29.000000,15.651044,0.375000,21.600000,1.900000,PG,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...
8434,25.894147,19.774069,0.256824,14.221171,1.252927,PG,1,0,0,1
8435,24.000000,16.678812,0.390000,27.671435,4.500000,SF,1,0,0,2
8436,36.430571,22.895421,0.190403,7.676857,1.161572,C,1,1,0,3
8437,30.000000,18.191720,0.373127,31.259442,3.619504,SG,1,0,0,1


###Using Decision Tree Classifier##

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report


# Encode categorical features using OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_res[['Pos']])

# Select training features
other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]


# Combine encoded and numerical features
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_res

#Using cross validation of 5 splits

dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.6735782  0.74822275 0.79028436 0.80035545 0.80379372]
Mean cross-validation score: 0.7632468949901252
Accuracy: 0.7632420902950586
Recall: 0.7632420902950586
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.81      0.78      2813
           1       0.72      0.62      0.67      2813
           2       0.81      0.85      0.83      2813

    accuracy                           0.76      8439
   macro avg       0.76      0.76      0.76      8439
weighted avg       0.76      0.76      0.76      8439



###Using Random Forest Classifier###

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

rf_clf = RandomForestClassifier(n_estimators=100, criterion='entropy',random_state=42)
cv_scores = cross_val_score(rf_clf, X, y, cv=5)

y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.67594787 0.76421801 0.82464455 0.81101896 0.82572614]
Mean cross-validation score: 0.7803111049930187
Accuracy: 0.7803057234269464
Recall: 0.7803057234269464
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.88      0.81      2813
           1       0.79      0.54      0.64      2813
           2       0.80      0.92      0.85      2813

    accuracy                           0.78      8439
   macro avg       0.78      0.78      0.77      8439
weighted avg       0.78      0.78      0.77      8439



###Using XGBoost ###

In [18]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(objective='multi:softprob',eval_metric='mlogloss',random_state=42)
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)

y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))

Cross-validation scores: [0.68187204 0.7950237  0.84597156 0.83827014 0.85892116]
Mean cross-validation score: 0.8040117205168039
Accuracy: 0.8040052138879014
Recall: 0.8040052138879014
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82      2813
           1       0.79      0.65      0.71      2813
           2       0.83      0.90      0.87      2813

    accuracy                           0.80      8439
   macro avg       0.80      0.80      0.80      8439
weighted avg       0.80      0.80      0.80      8439



###Using Neural Networks

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [25]:
from tensorflow.keras.optimizers import AdamW
# Using different feature sets on the SMOTE-NC sampled data

#Checking if total injury count has better results


other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]
#other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'total_injury_count']]

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_res[['Pos']])


X = np.concatenate([encoded_data, other_features], axis=1)
y = y_res

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the deep learning model
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))  # Additional hidden layer
    model.add(Dense(3, activation='softmax'))  # 3 classes for output
    model.compile(optimizer=AdamW(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create and train the model
#class_weights = {0: 1, 1: 2, 2: 1}
model = create_model(X_train.shape[1])
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y_test, y_pred_classes))




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.4721563981042654
Recall: 0.4721563981042654
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.59      0.50       550
           1       0.50      0.27      0.35       570
           2       0.50      0.55      0.53       568

    accuracy                           0.47      1688
   macro avg       0.48      0.47      0.46      1688
weig

For separate injuries as a feature:

Accuracy: 0.792654028436019

Recall: 0.792654028436019

Classification Report:

                precision    recall  f1-score   support
           0       0.77      0.85      0.81       550
           1       0.91      0.60      0.73       570
           2       0.75      0.93      0.83       568

    accuracy                            0.79      1688
    macro avg       0.81      0.79      0.79      1688
    weighted avg    0.81      0.79      0.79      1688

For total injury count as a feature:

Accuracy: 0.4721563981042654

Recall: 0.4721563981042654

Classification Report:

              precision    recall  f1-score   support
           0       0.44      0.59      0.50       550
           1       0.50      0.27      0.35       570
           2       0.50      0.55      0.53       568

    accuracy                           0.47      1688
    macro avg       0.48      0.47     0.46      1688
    weighted avg    0.48      0.47     0.46      1688


Which signifies it is significantly worse

In [None]:
##Saving the model weights to use in transfer learning
model.save_weights('nba_model_weights.h5')

In [None]:
model.save('nba_injury_model.h5')

  saving_api.save_model(


###Transfer learning to the WNBA

In [None]:
# @title
wnba_df = pd.read_csv('/content/drive/MyDrive/Practicum/wnba_inj_merged.csv')
wnba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0.1    85 non-null     int64  
 1   Unnamed: 0      85 non-null     int64  
 2   Athlete         85 non-null     object 
 3   Team_x          85 non-null     object 
 4   Body_Part       85 non-null     object 
 5   BMI             83 non-null     float64
 6   AGE             85 non-null     float64
 7   body_fat_perct  83 non-null     float64
 8   Pos             85 non-null     object 
 9   G               85 non-null     int64  
 10  MP              85 non-null     float64
 11  FGA             85 non-null     int64  
 12  TRB             85 non-null     int64  
 13  FGA_per_MP      85 non-null     float64
 14  TRB_per_MP      85 non-null     float64
dtypes: float64(6), int64(5), object(4)
memory usage: 10.1+ KB


In [None]:
# @title
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

#Encoding the position of the player as it is categorical variable
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(wnba_df[['Pos']])

other_features = wnba_df[['AGE','body_fat_perct', 'FGA_per_MP', 'MP', 'TRB',]]
X= np.concatenate([encoded_data, other_features], axis=1)

#Encoding the target variable
y = wnba_df['Body_Part']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#Since datset is small it is better to use cross-validation
# X_train_wnba, X_test_wnba, y_train_wnba, y_test_wnba = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
recalls = []
reports = []

for train_index, val_index in kf.split(X, y_encoded):
    X_train_wnba, X_val_wnba = X[train_index], X[val_index]
    y_train_wnba, y_val_wnba = y_encoded[train_index], y_encoded[val_index]




In [None]:
# @title
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import AdamW

In [None]:
# @title
#Modified model as there are less features

def create_modified_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(3, activation='softmax'))  # 3 classes for output
    return model

input_dim_wnba = X_train_wnba.shape[1]
modified_model = create_modified_model(input_dim_wnba)


In [None]:
# @title
modified_model.load_weights('nba_model_weights.h5',by_name=True, skip_mismatch=True)

In [None]:
# @title
#Compiling and fine tuning the model

modified_model.compile(optimizer=AdamW(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
modified_model.fit(X_train_wnba, y_train_wnba, epochs=50, batch_size=32, verbose=1, validation_data=(X_val_wnba, y_val_wnba))

y_pred_val = modified_model.predict(X_val_wnba)
y_pred_classes_val = np.argmax(y_pred_val, axis=1)

accuracy_val = accuracy_score(y_val_wnba, y_pred_classes_val)
recall_val = recall_score(y_val_wnba, y_pred_classes_val, average='weighted')
report_val = classification_report(y_val_wnba, y_pred_classes_val)


accuracies.append(accuracy_val)
recalls.append(recall_val)
reports.append(report_val)

# Print average results

print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
for i, report in enumerate(reports):
    print(f"Classification Report for Fold {i+1}:\n{report}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Average Accuracy: 0.23529411764705882
Average Recall: 0.23529411764705882
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         4

    accuracy                           0.24        17
   macro avg       0.08      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##Trying a diff model by freezing layers

In [None]:
# @title
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

#Encoding the position of the player as it is categorical variable
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = ohe.fit_transform(wnba_df[['Pos']])

other_features = wnba_df[['AGE','body_fat_perct', 'FGA_per_MP', 'MP', 'TRB',]]
X= np.concatenate([encoded_data, other_features], axis=1)

#Encoding the target variable
y = wnba_df['Body_Part']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)



In [None]:
# @title
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, classification_report
from tensorflow.keras.models import load_model



# Initialize StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare to store results
accuracies = []
recalls = []
reports = []

# Assuming X and y are your WNBA features and labels
input_dim_wnba = X.shape[1]

for train_index, val_index in kf.split(X, y):
    X_train_wnba, X_val_wnba = X[train_index], X[val_index]
    y_train_wnba, y_val_wnba = y_encoded[train_index], y_encoded[val_index]

    # Clone the model for each fold to ensure the same starting weights
    wnba_model = load_model('nba_injury_model.h5')

    for layer in wnba_model.layers[:-2]:  # Freeze the initial layers
        layer.trainable = False

    wnba_model.compile(optimizer=AdamW(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model on the current fold
    wnba_model.fit(X_train_wnba, y_train_wnba, epochs=50, batch_size=16, verbose=1, validation_data=(X_val_wnba, y_val_wnba))

    # Predict on the validation set
    y_pred_val = wnba_model.predict(X_val_wnba)
    y_pred_classes_val = np.argmax(y_pred_val, axis=1)

    # Evaluate performance
    accuracy_val = accuracy_score(y_val_wnba, y_pred_classes_val)
    recall_val = recall_score(y_val_wnba, y_pred_classes_val, average='weighted')
    report_val = classification_report(y_val_wnba, y_pred_classes_val)

    # Store results
    accuracies.append(accuracy_val)
    recalls.append(recall_val)
    reports.append(report_val)

# Print average results
print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
for i, report in enumerate(reports):
    print(f"Classification Report for Fold {i+1}:\n{report}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50






  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




Average Accuracy: 0.24705882352941178
Average Recall: 0.24705882352941178
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3

    accuracy                           0.24        17
   macro avg       0.08      0.33      0.13        17
weighted avg       0.06      0.24      0.09        17

Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3

    accuracy                           0.24        17
   macro avg       0.08      0.33      0.13        17
weighted avg       0.06      0.24      0.09        17

Classification Report for Fold 3:
              precision    recall  f1-score   support

       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# data,y_encoded

(      Unnamed: 0      player_name team_abbreviation   age  body_fat_perct  \
 0            783       A.J. Price               WAS  26.0       17.835740   
 1            784       A.J. Price               WAS  26.0       17.835740   
 2            785       A.J. Price               WAS  26.0       17.835740   
 3            713     Aaron Brooks               HOU  28.0       15.491044   
 4           1561     Aaron Brooks               DEN  29.0       15.651044   
 ...          ...              ...               ...   ...             ...   
 4218        3442          Zhou Qi               HOU  22.0       12.584961   
 4219        3444          Zhou Qi               HOU  23.0       12.744961   
 4220        3445          Zhou Qi               HOU  23.0       12.744961   
 4221        4139  Zion Williamson               NOP  19.0       29.318593   
 4222        4140  Zion Williamson               NOP  19.0       29.318593   
 
       FGA_per_MP   season Pos    MP  TRB        Date Major an