In [None]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training table used by the resampling and model cells below, then show its schema.
cross_df = pd.read_csv('/content/drive/MyDrive/Practicum/new_training.csv')
cross_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4223 entries, 0 to 4222
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              4223 non-null   int64  
 1   player_name             4223 non-null   object 
 2   team_abbreviation       4223 non-null   object 
 3   age                     4223 non-null   float64
 4   body_fat_perct          4223 non-null   float64
 5   FGA_per_MP              4223 non-null   float64
 6   season                  4223 non-null   object 
 7   Pos                     4223 non-null   object 
 8   MP                      4223 non-null   float64
 9   TRB                     4223 non-null   float64
 10  Date                    4223 non-null   object 
 11  Major anatomical areas  4223 non-null   object 
 12  Anatomical sub-areas    4223 non-null   object 
 13  Notes                   4223 non-null   object 
 14  Upper_ext_count         4223 non-null   

In [None]:
# Experiment: collapse the three per-region injury counts into one total.
# Outcome recorded by the author: accuracy got worse with this feature.
cross_df['total_injury_count'] = (
    cross_df['Upper_ext_count']
    .add(cross_df['Lower_ext_count'])
    .add(cross_df['Head-neck-trunk_count'])
)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

### Using Undersampling ###

In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Target is the injured body region; every other column goes into X so the
# resampler keeps whole rows together.
y = cross_df['Major anatomical areas']
X = cross_df.drop(columns=['Major anatomical areas'])

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Seeded random undersampling of the full table.
undersampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_undersampled, y_undersampled = undersampler.fit_resample(X, y_encoded)

# Sanity check: per-class row counts after resampling.
class_counts = Counter(y_undersampled)
print('Class distribution after undersampling:', class_counts)

Class distribution after undersampling: Counter({0: 591, 1: 591, 2: 591})


###Using Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report


# One-hot encode the categorical position column.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while its replacement `sparse_output=` does not exist before
# 1.2, so this form runs on both older and newer releases.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_undersampled[['Pos']]).toarray()

# Numeric training features: the base set, without any injury-count columns.
# Alternative that also includes the per-region injury counts:
#other_features = X_undersampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]
other_features = X_undersampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]

# Combine encoded and numerical features (one-hot columns first).
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_undersampled

# Per-fold accuracy from 5-fold cross-validation.
dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

# Out-of-fold predictions, used for the pooled metrics and the report below.
y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.29577465 0.29577465 0.32112676 0.31355932 0.32768362]
Mean cross-validation score: 0.3107837988382271
Accuracy: 0.3107727016356458
Recall: 0.3107727016356458
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.40      0.36       591
           1       0.33      0.34      0.33       591
           2       0.25      0.19      0.22       591

    accuracy                           0.31      1773
   macro avg       0.30      0.31      0.31      1773
weighted avg       0.30      0.31      0.31      1773



### Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 60 trees split on entropy, seeded for reproducibility.
rf_clf = RandomForestClassifier(
    n_estimators=60,
    criterion='entropy',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(rf_clf, X, y, cv=5)
y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.27887324 0.33239437 0.30422535 0.3220339  0.29943503]
Mean cross-validation score: 0.3073923768600303
Accuracy: 0.3073886068809927
Recall: 0.3073886068809927
Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.35      0.33       591
           1       0.34      0.37      0.36       591
           2       0.24      0.20      0.22       591

    accuracy                           0.31      1773
   macro avg       0.30      0.31      0.30      1773
weighted avg       0.30      0.31      0.30      1773



###Using XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted trees: 100 rounds at learning rate 0.05, multi-class
# soft-probability objective, seeded for reproducibility.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)
y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.28450704 0.31549296 0.29295775 0.27966102 0.29378531]
Mean cross-validation score: 0.2932808148324978
Accuracy: 0.29328821206993794
Recall: 0.29328821206993794
Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.31      0.31       591
           1       0.32      0.36      0.34       591
           2       0.24      0.21      0.23       591

    accuracy                           0.29      1773
   macro avg       0.29      0.29      0.29      1773
weighted avg       0.29      0.29      0.29      1773



###Using Oversampling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Target is the injured body region; every other column goes into X so the
# resampler keeps whole rows together.
y = cross_df['Major anatomical areas']
X = cross_df.drop(columns=['Major anatomical areas'])

# Map the string labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Seeded random oversampling of the full table.
# NOTE(review): the resampling runs on the whole table before any
# cross-validation split, so copies of one row can land in both the training
# and validation folds of the model cells below; their scores are likely
# optimistic. Consider resampling inside each fold (e.g. an imblearn Pipeline).
oversampler = RandomOverSampler(random_state=42, sampling_strategy='auto')
X_oversampled, y_oversampled = oversampler.fit_resample(X, y_encoded)

# Sanity check: per-class row counts after resampling.
class_counts = Counter(y_oversampled)
print('Class distribution after oversampling:', class_counts)

Class distribution after oversampling: Counter({2: 2813, 0: 2813, 1: 2813})


###Using Decision Tree

In [None]:
# One-hot encode the categorical position column.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while its replacement `sparse_output=` does not exist before
# 1.2, so this form runs on both older and newer releases.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_oversampled[['Pos']]).toarray()

# Numeric training features: the base set, without any injury-count columns.
# Alternative that also includes the per-region injury counts:
#other_features = X_oversampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]
other_features = X_oversampled[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]

# Combine encoded and numerical features (one-hot columns first).
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_oversampled

# Per-fold accuracy from 5-fold cross-validation.
# NOTE(review): X/y were oversampled before this split, so repeated rows may
# appear on both sides of a fold; treat these scores as an upper bound.
dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

# Out-of-fold predictions, used for the pooled metrics and the report below.
y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.6943128  0.68483412 0.70912322 0.68601896 0.71191464]
Mean cross-validation score: 0.6972407481802578
Accuracy: 0.6972390093612987
Recall: 0.6972390093612987
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.86      0.75      2813
           1       0.80      0.35      0.48      2813
           2       0.71      0.88      0.78      2813

    accuracy                           0.70      8439
   macro avg       0.72      0.70      0.67      8439
weighted avg       0.72      0.70      0.67      8439



###Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 60 trees split on entropy, seeded for reproducibility.
rf_clf = RandomForestClassifier(
    n_estimators=60,
    criterion='entropy',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(rf_clf, X, y, cv=5)
y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.73637441 0.73222749 0.75177725 0.7464455  0.74925904]
Mean cross-validation score: 0.7432167368530468
Accuracy: 0.7432160208555516
Recall: 0.7432160208555516
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.82      0.76      2813
           1       0.82      0.50      0.62      2813
           2       0.75      0.91      0.82      2813

    accuracy                           0.74      8439
   macro avg       0.75      0.74      0.73      8439
weighted avg       0.75      0.74      0.73      8439



### Using XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted trees with library-default size and learning rate,
# multi-class soft-probability objective, seeded for reproducibility.
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)
y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.69372038 0.68957346 0.72452607 0.70853081 0.7202134 ]
Mean cross-validation score: 0.7073128214924835
Accuracy: 0.7073112928072046
Recall: 0.7073112928072046
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.79      0.73      2813
           1       0.73      0.45      0.56      2813
           2       0.73      0.88      0.80      2813

    accuracy                           0.71      8439
   macro avg       0.71      0.71      0.69      8439
weighted avg       0.71      0.71      0.69      8439



### Using SMOTE-NC to oversample###

In [None]:
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import LabelEncoder
from collections import Counter
# Work on the training table under the name `data`.
data = cross_df

# Columns handed to SMOTE-NC: five numeric stats, the categorical position,
# and the three per-region injury counts.
features = [
    'age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Pos',
    'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count',
]

# Variant tried by the author (recorded as worse): one total injury count
# in place of the three per-region counts.
#features = ['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Pos','total_injury_count']

y = data['Major anatomical areas']
X = data[features]

# Author's record of the resulting class ids:
# 0-> head,neck,trunk ; 1-> lower extremity ; 2-> upper extremity
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# SMOTENC needs the positional index of every categorical column;
# 'Pos' is the only categorical feature selected above.
categorical_feature_indices = [features.index('Pos')]

# Seeded SMOTE-NC oversampling of the selected features.
smotenc = SMOTENC(random_state=42, categorical_features=categorical_feature_indices)
X_res, y_res = smotenc.fit_resample(X, y_encoded)

# Sanity check: per-class row counts after resampling.
class_counts = Counter(y_res)
print('Class distribution after oversampling:', class_counts)

Class distribution after oversampling: Counter({2: 2813, 0: 2813, 1: 2813})


##Using Decision Tree Classifier##

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report


# One-hot encode the categorical position column.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while its replacement `sparse_output=` does not exist before
# 1.2, so this form runs on both older and newer releases.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_data = ohe.fit_transform(X_res[['Pos']]).toarray()

# Numeric training features: the base set, without the injury-count columns
# (those were part of the SMOTE-NC input but are not used for modelling here).
# Alternative that also includes the per-region injury counts:
#other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]
other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]

# Combine encoded and numerical features (one-hot columns first).
X = np.concatenate([encoded_data, other_features], axis=1)
y = y_res

# Per-fold accuracy from 5-fold cross-validation.
# NOTE(review): X/y were oversampled before this split, so validation folds
# contain synthetic rows derived from the whole table; treat these scores as
# optimistic.
dt_clf = DecisionTreeClassifier(random_state=42)
cv_scores = cross_val_score(dt_clf, X, y, cv=5)

# Out-of-fold predictions, used for the pooled metrics and the report below.
y_pred = cross_val_predict(dt_clf, X, y, cv=5)

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred, average='weighted')


print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y, y_pred))



Cross-validation scores: [0.40936019 0.507109   0.57464455 0.54739336 0.57557795]
Mean cross-validation score: 0.5228170116053343
Accuracy: 0.5228107595686693
Recall: 0.5228107595686693
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.60      0.55      2813
           1       0.44      0.35      0.39      2813
           2       0.60      0.61      0.61      2813

    accuracy                           0.52      8439
   macro avg       0.52      0.52      0.52      8439
weighted avg       0.52      0.52      0.52      8439



###Using Random Forest Classifier###

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Random forest: 100 trees split on entropy, seeded for reproducibility.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(rf_clf, X, y, cv=5)
y_pred = cross_val_predict(rf_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.42950237 0.57879147 0.66824645 0.63981043 0.65085951]
Mean cross-validation score: 0.5934420449661054
Accuracy: 0.5934352411423155
Recall: 0.5934352411423155
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2813
           1       0.50      0.35      0.42      2813
           2       0.65      0.75      0.70      2813

    accuracy                           0.59      8439
   macro avg       0.58      0.59      0.58      8439
weighted avg       0.58      0.59      0.58      8439



##Testing feature importances##

In [None]:
# Refit the random forest on the whole feature matrix and read out how much
# weight it gives each input column.
rf_clf.fit(X, y)
feature_importances = rf_clf.feature_importances_

# Column labels in the order the matrix was assembled:
# one-hot 'Pos' columns first, then the numeric features.
encoded_feature_names = ohe.get_feature_names_out(['Pos'])
all_feature_names = np.concatenate([encoded_feature_names, other_features.columns])

# One row per feature, most important first.
importance_df = (
    pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

          Feature  Importance
8              MP    0.204654
6  body_fat_perct    0.195692
5             age    0.193846
9             TRB    0.184734
7      FGA_per_MP    0.178330
1          Pos_PF    0.009690
2          Pos_PG    0.008747
0           Pos_C    0.008467
3          Pos_SF    0.008406
4          Pos_SG    0.007432


###Using XGBoost ###

In [None]:
import xgboost as xgb

# Gradient-boosted trees with library-default size and learning rate,
# multi-class soft-probability objective, seeded for reproducibility.
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
)

# Per-fold accuracy, then out-of-fold predictions, both from 5-fold CV.
cv_scores = cross_val_score(xgb_clf, X, y, cv=5)
y_pred = cross_val_predict(xgb_clf, X, y, cv=5)

# Pooled metrics over the out-of-fold predictions.
recall = recall_score(y, y_pred, average='weighted')
accuracy = accuracy_score(y, y_pred)

print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean cross-validation score: {cv_scores.mean()}",
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y, y_pred),
    sep="\n",
)

Cross-validation scores: [0.46090047 0.58412322 0.67120853 0.64336493 0.69828097]
Mean cross-validation score: 0.6115756257075995
Accuracy: 0.611565351344946
Recall: 0.611565351344946
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.63      0.61      2813
           1       0.56      0.51      0.53      2813
           2       0.67      0.70      0.68      2813

    accuracy                           0.61      8439
   macro avg       0.61      0.61      0.61      8439
weighted avg       0.61      0.61      0.61      8439



###Using Neural Networks

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [None]:
from tensorflow.keras.optimizers import AdamW
# Network inputs come from the SMOTE-NC resampled data.
# Alternative numeric feature set that also includes the injury counts:
#other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB', 'Upper_ext_count', 'Lower_ext_count', 'Head-neck-trunk_count']]
other_features = X_res[['age', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]

# NOTE(review): `encoded_data` is not rebuilt in this cell; it is whatever an
# earlier cell left behind, presumably the one-hot encoding of X_res['Pos'].
# Confirm on a fresh Restart & Run All.
y = y_res
X = np.concatenate((encoded_data, other_features), axis=1)

# Hold out 10% of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# Define the deep learning model
def create_model(input_dim):
    """Build and compile the fully connected injury-region classifier.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled Keras ``Sequential`` model with four ReLU hidden layers
        (128, 64, 32 and 16 units) and a 3-unit softmax output. It is
        compiled with AdamW (learning rate 1e-4), sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    layer_stack = [
        Dense(128, activation='relu', input_dim=input_dim),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),  # Additional hidden layer
        Dense(3, activation='softmax'),  # 3 classes for output
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=AdamW(learning_rate=0.0001),
        metrics=['accuracy'],
    )
    return model

# Build the network for the current feature width and train it;
# 10% of the training rows are set aside by Keras for validation.
#class_weights = {0: 1, 1: 2, 2: 1}
n_input_features = X_train.shape[1]
model = create_model(n_input_features)
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    verbose=1,
    validation_split=0.1,
)

# Class probabilities on the held-out rows -> most likely class per row.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

recall = recall_score(y_test, y_pred_classes, average='weighted')
accuracy = accuracy_score(y_test, y_pred_classes)

print(
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    "Classification Report:",
    classification_report(y_test, y_pred_classes),
    sep="\n",
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.4146919431279621
Recall: 0.4146919431279621
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.60      0.48       281
           1       0.42      0.16      0.23       277
           2       0.44      0.47      0.45       286

    accuracy                           0.41       844
   macro avg       0.42      0.41      0.39       844
weig

In [None]:
## Saving the model weights to use in transfer learning.
# Written to the current working directory; read back later via load_weights().
model.save_weights('nba_model_weights.h5')

In [None]:
# Save the whole trained model; the layer-freezing experiment reloads it per fold with load_model().
model.save('nba_injury_model.h5')

  saving_api.save_model(


###Transfer learning to the WNBA

In [None]:
# Load the WNBA injury table used as the transfer-learning target and show its schema.
wnba_df = pd.read_csv('/content/drive/MyDrive/Practicum/wnba_inj_merged.csv')
wnba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0.1    85 non-null     int64  
 1   Unnamed: 0      85 non-null     int64  
 2   Athlete         85 non-null     object 
 3   Team_x          85 non-null     object 
 4   Body_Part       85 non-null     object 
 5   BMI             83 non-null     float64
 6   AGE             85 non-null     float64
 7   body_fat_perct  83 non-null     float64
 8   Pos             85 non-null     object 
 9   G               85 non-null     int64  
 10  MP              85 non-null     float64
 11  FGA             85 non-null     int64  
 12  TRB             85 non-null     int64  
 13  FGA_per_MP      85 non-null     float64
 14  TRB_per_MP      85 non-null     float64
dtypes: float64(6), int64(5), object(4)
memory usage: 10.1+ KB


In [None]:
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

# One-hot encode the player's position (categorical).
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while `sparse_output=` does not exist before 1.2.
# NOTE(review): the encoder is re-fit on the WNBA table; the transferred
# network presumably expects the same position columns in the same order as
# the NBA encoding. Confirm with ohe.categories_.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_data = ohe.fit_transform(wnba_df[['Pos']]).toarray()

# Numeric features, in the same order as the NBA training features.
# wnba_df.info() reports only 83 of 85 non-null values for body_fat_perct;
# a NaN fed to the network makes its loss and predictions NaN, so missing
# values are filled with the column median before the matrix is built.
# (The median is taken over all rows; with 2 missing values the leakage
# across folds is negligible.)
other_features = wnba_df[['AGE', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]
other_features = other_features.fillna(other_features.median())
X = np.concatenate([encoded_data, other_features], axis=1)

# Encode the target variable as integer class ids.
# NOTE(review): presumably 'Body_Part' uses the same label strings as the NBA
# target so the class ids line up. Confirm with label_encoder.classes_.
y = wnba_df['Body_Part']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Since the dataset is small it is better to use cross-validation than a
# single train/test split.
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
recalls = []
reports = []

# This loop only rebinds the fold variables on every iteration; once it
# finishes, X_train_wnba / X_val_wnba / y_train_wnba / y_val_wnba hold the
# LAST fold's split, which is what the following cells read.
for train_index, val_index in kf.split(X, y_encoded):
    X_train_wnba, X_val_wnba = X[train_index], X[val_index]
    y_train_wnba, y_val_wnba = y_encoded[train_index], y_encoded[val_index]




In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import AdamW

In [None]:
# Network used for fine-tuning on the WNBA data.
def create_modified_model(input_dim):
    """Build the (uncompiled) network that receives the transferred weights.

    The layer stack matches ``create_model``: four ReLU hidden layers
    (128, 64, 32 and 16 units) and a 3-unit softmax output. Unlike
    ``create_model`` the result is not compiled; the caller compiles it.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    layer_stack = [
        Dense(128, activation='relu', input_dim=input_dim),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax'),  # 3 classes for output
    ]
    model = Sequential(layer_stack)
    return model

# Input width comes from the training split left behind by the fold loop in the preparation cell.
input_dim_wnba = X_train_wnba.shape[1]
modified_model = create_modified_model(input_dim_wnba)


In [None]:
# Initialise the network with the weights learned on the NBA data.
# Weights are restored in layer order rather than by layer name: neither
# model-building function names its layers, so Keras auto-generates the names,
# and those need not match the names stored in the file. A by-name load with
# skip_mismatch=True would then restore nothing without raising.
# Loading by order raises instead if the layer shapes differ.
modified_model.load_weights('nba_model_weights.h5')

In [None]:
# Compile and fine-tune the NBA-initialised network on every WNBA fold.
# Training and scoring happen inside the split loop so that each of the five
# folds contributes to the averages printed at the end; running them once
# after the loop would score only the last fold.

# Start from empty result lists so re-running this cell does not accumulate
# scores from a previous run.
accuracies = []
recalls = []
reports = []

for train_index, val_index in kf.split(X, y_encoded):
    X_train_wnba, X_val_wnba = X[train_index], X[val_index]
    y_train_wnba, y_val_wnba = y_encoded[train_index], y_encoded[val_index]

    # Fresh network per fold, so no fold starts from weights already
    # fine-tuned on another fold's data. The NBA weights are restored in layer
    # order: every newly built model gets new auto-generated layer names, so
    # a by-name load could not match them.
    modified_model = create_modified_model(X.shape[1])
    modified_model.load_weights('nba_model_weights.h5')

    modified_model.compile(optimizer=AdamW(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # verbose=0: 5 folds x 50 epochs of progress lines would bury the results.
    modified_model.fit(X_train_wnba, y_train_wnba, epochs=50, batch_size=32, verbose=0, validation_data=(X_val_wnba, y_val_wnba))

    # Class probabilities on the validation fold -> most likely class per row.
    y_pred_val = modified_model.predict(X_val_wnba)
    y_pred_classes_val = np.argmax(y_pred_val, axis=1)

    # zero_division=0 keeps the metric values (0 for classes that are never
    # predicted) without emitting one warning per call.
    accuracy_val = accuracy_score(y_val_wnba, y_pred_classes_val)
    recall_val = recall_score(y_val_wnba, y_pred_classes_val, average='weighted', zero_division=0)
    report_val = classification_report(y_val_wnba, y_pred_classes_val, zero_division=0)

    accuracies.append(accuracy_val)
    recalls.append(recall_val)
    reports.append(report_val)

# Print average results

print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
for i, report in enumerate(reports):
    print(f"Classification Report for Fold {i+1}:\n{report}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Average Accuracy: 0.23529411764705882
Average Recall: 0.23529411764705882
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         4

    accuracy                           0.24        17
   macro avg       0.08      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##Trying a different model by freezing layers

In [None]:
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split

# One-hot encode the player's position (categorical).
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, while `sparse_output=` does not exist before 1.2.
# NOTE(review): the encoder is re-fit on the WNBA table; the reloaded NBA
# model presumably expects the same position columns in the same order as
# the NBA encoding. Confirm with ohe.categories_.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_data = ohe.fit_transform(wnba_df[['Pos']]).toarray()

# Numeric features, in the same order as the NBA training features.
# wnba_df.info() reports only 83 of 85 non-null values for body_fat_perct;
# a NaN fed to the network makes its loss and predictions NaN, so missing
# values are filled with the column median before the matrix is built.
other_features = wnba_df[['AGE', 'body_fat_perct', 'FGA_per_MP', 'MP', 'TRB']]
other_features = other_features.fillna(other_features.median())
X = np.concatenate([encoded_data, other_features], axis=1)

# Encode the target variable as integer class ids.
# NOTE(review): presumably 'Body_Part' uses the same label strings as the NBA
# target so the class ids line up. Confirm with label_encoder.classes_.
y = wnba_df['Body_Part']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)



In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, classification_report
from tensorflow.keras.models import load_model



# Stratified, shuffled 5-fold split (seeded) over the WNBA rows.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results.
accuracies = []
recalls = []
reports = []

# X and y_encoded are the WNBA feature matrix and integer labels built in the
# preparation cell.
input_dim_wnba = X.shape[1]

# Stratify on y_encoded, the same array the folds are indexed with below.
for train_index, val_index in kf.split(X, y_encoded):
    X_train_wnba, X_val_wnba = X[train_index], X[val_index]
    y_train_wnba, y_val_wnba = y_encoded[train_index], y_encoded[val_index]

    # Reload the saved NBA model for each fold so every fold starts from the
    # same weights.
    wnba_model = load_model('nba_injury_model.h5')

    # Freeze everything except the last two layers; only those are fine-tuned.
    for layer in wnba_model.layers[:-2]:
        layer.trainable = False

    # Re-compile so the changed `trainable` flags take effect.
    wnba_model.compile(optimizer=AdamW(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train on the current fold.
    # verbose=0: 5 folds x 50 epochs of progress lines would bury the results.
    wnba_model.fit(X_train_wnba, y_train_wnba, epochs=50, batch_size=16, verbose=0, validation_data=(X_val_wnba, y_val_wnba))

    # Class probabilities on the validation fold -> most likely class per row.
    y_pred_val = wnba_model.predict(X_val_wnba)
    y_pred_classes_val = np.argmax(y_pred_val, axis=1)

    # zero_division=0 keeps the metric values (0 for classes that are never
    # predicted) without emitting one warning per call.
    accuracy_val = accuracy_score(y_val_wnba, y_pred_classes_val)
    recall_val = recall_score(y_val_wnba, y_pred_classes_val, average='weighted', zero_division=0)
    report_val = classification_report(y_val_wnba, y_pred_classes_val, zero_division=0)

    # Store results
    accuracies.append(accuracy_val)
    recalls.append(recall_val)
    reports.append(report_val)

# Print average results
print(f"Average Accuracy: {np.mean(accuracies)}")
print(f"Average Recall: {np.mean(recalls)}")
for i, report in enumerate(reports):
    print(f"Classification Report for Fold {i+1}:\n{report}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50






  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




Average Accuracy: 0.24705882352941178
Average Recall: 0.24705882352941178
Classification Report for Fold 1:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3

    accuracy                           0.24        17
   macro avg       0.08      0.33      0.13        17
weighted avg       0.06      0.24      0.09        17

Classification Report for Fold 2:
              precision    recall  f1-score   support

           0       0.24      1.00      0.38         4
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3

    accuracy                           0.24        17
   macro avg       0.08      0.33      0.13        17
weighted avg       0.06      0.24      0.09        17

Classification Report for Fold 3:
              precision    recall  f1-score   support

       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# data,y_encoded

(      Unnamed: 0      player_name team_abbreviation   age  body_fat_perct  \
 0            783       A.J. Price               WAS  26.0       17.835740   
 1            784       A.J. Price               WAS  26.0       17.835740   
 2            785       A.J. Price               WAS  26.0       17.835740   
 3            713     Aaron Brooks               HOU  28.0       15.491044   
 4           1561     Aaron Brooks               DEN  29.0       15.651044   
 ...          ...              ...               ...   ...             ...   
 4218        3442          Zhou Qi               HOU  22.0       12.584961   
 4219        3444          Zhou Qi               HOU  23.0       12.744961   
 4220        3445          Zhou Qi               HOU  23.0       12.744961   
 4221        4139  Zion Williamson               NOP  19.0       29.318593   
 4222        4140  Zion Williamson               NOP  19.0       29.318593   
 
       FGA_per_MP   season Pos    MP  TRB        Date Major an