## Download the Data

### USE KAGGLE API

In [1]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('Kaggle_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('Kaggle_USER')

ModuleNotFoundError: No module named 'google.colab'

- Note: Learn to setup environment variable in colab. https://x.com/GoogleColab/status/1719798406195867814

### Download and Unzip

In [None]:
! kaggle datasets download -d mrsimple07/injury-prediction-dataset
! unzip "injury-prediction-dataset.zip"

Dataset URL: https://www.kaggle.com/datasets/mrsimple07/injury-prediction-dataset
License(s): apache-2.0
injury-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  injury-prediction-dataset.zip
replace injury_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: injury_data.csv         


### Read Data

In [None]:
import pandas as pd

df = pd.read_csv('injury_data.csv')
display(df.head())

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury
0,24,66.251933,175.732429,1,0.457929,5,0
1,37,70.996271,174.58165,0,0.226522,6,1
2,32,80.093781,186.329618,0,0.61397,2,1
3,28,87.473271,175.50424,1,0.252858,4,1
4,25,84.65922,190.175012,0,0.577632,1,1


## Data Exploration

In [None]:
df.describe()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time,Likelihood_of_Injury
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,28.231,74.794351,179.750948,0.515,0.490538,3.466,0.5
std,6.538378,9.892621,9.888921,0.500025,0.286184,1.701099,0.50025
min,18.0,40.191912,145.285701,0.0,3.1e-05,1.0,0.0
25%,22.0,67.944028,173.036976,0.0,0.241042,2.0,0.0
50%,28.0,75.020569,180.034436,1.0,0.483912,4.0,0.5
75%,34.0,81.302956,186.557913,1.0,0.730404,5.0,1.0
max,39.0,104.650104,207.308672,1.0,0.997749,6.0,1.0


- Note: Seems like data is very clean, nothing cleaning need to be done further.
- Note: this is a binary classification problem with depedent varible as `Likelihood_of_Injury`
- Note: there is no class imbalance so no need for over/undersampling

## Data Prep

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target variable (y)
X = df.drop('Likelihood_of_Injury', axis=1)
y = df['Likelihood_of_Injury']

# Normalize
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


- Note: since there are 1000 observation, which is small dataset.
- Note: we choose cross-validation over validation set for robustness and relatively low computational cost for small dataset
- Note: For large datasets, a validation set is prefered for hyperparamter tuning due to its low compuataional cost but less robust.

## Feature Engineering & Feature Selection

- Only 6 features - no need for selection or PCA for dimension reduction
- Think about what features engineered here can help boost the model performance (ex. BMI)

## Training & Hyperparamter Tuning
With Cross-Validation (CV)

Models tested:
- Logistic Regression (LR)
- Support Vector Machine (SVM)
- Generalized Linear Method (GLM)

- Random Forest (RF)
- Gradient Boosting (GB)

- Neural Networks

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### LR

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

lr = LogisticRegression()
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
lr_best = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv=skf, scoring='accuracy')
lr_best.fit(X_train, y_train)

model = lr_best.best_estimator_

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # for AUC

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.58
Precision: 0.6363636363636364
Recall: 0.4666666666666667
F1 Score: 0.5384615384615384
ROC-AUC: 0.5652130325814537

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.71      0.61        95
           1       0.64      0.47      0.54       105

    accuracy                           0.58       200
   macro avg       0.59      0.59      0.58       200
weighted avg       0.59      0.58      0.57       200



- Note: Recall should be priotized assuming the cost of injury is high.
- Note: No PCA seems needed.

## SVM

In [None]:
from sklearn.svm import SVC

# SVM
svm = SVC(probability=True, random_state=42)  # probability=True for ROC-AUC
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf']
}
svm_best = GridSearchCV(estimator=svm, param_grid=svm_param_grid, cv=skf, scoring='accuracy')
svm_best.fit(X_train, y_train)

model = svm_best.best_estimator_

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # for AUC

# Metrics
print("SVM Results:")
print("Best Parameters:", svm_best.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

SVM Results:
Best Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.565
Precision: 0.59375
Recall: 0.5428571428571428
F1 Score: 0.5671641791044776
ROC-AUC: 0.5718796992481203

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.59      0.56        95
           1       0.59      0.54      0.57       105

    accuracy                           0.56       200
   macro avg       0.57      0.57      0.56       200
weighted avg       0.57      0.56      0.57       200



## RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf_best = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=skf, scoring='accuracy')
rf_best.fit(X_train, y_train)

model = rf_best.best_estimator_

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # for AUC

# Metrics
print("Random Forest Results:")
print("Best Parameters:", rf_best.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Random Forest Results:
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.56
Precision: 0.5894736842105263
Recall: 0.5333333333333333
F1 Score: 0.56
ROC-AUC: 0.5631077694235589

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.59      0.56        95
           1       0.59      0.53      0.56       105

    accuracy                           0.56       200
   macro avg       0.56      0.56      0.56       200
weighted avg       0.56      0.56      0.56       200



## Areas to Think About
- More advanced models can achieve high performance (like NuSVM), investigate why. https://www.kaggle.com/code/tkunzler/injury-prediction-eda-eng-pt-br
- What other models can you test?
- Are Neural networks good option, why and why not?