## Download the Data

### USE KAGGLE API

In [None]:
from google.colab import userdata
import os

# Copy the Kaggle credentials stored as Colab secrets into the environment
# variables below (KAGGLE_KEY first, then KAGGLE_USERNAME).
for env_name, secret_name in (
    ("KAGGLE_KEY", 'Kaggle_KEY'),
    ("KAGGLE_USERNAME", 'Kaggle_USER'),
):
    os.environ[env_name] = userdata.get(secret_name)

- Note: Learn how to set up environment variables (secrets) in Colab: https://x.com/GoogleColab/status/1719798406195867814

### Download and Unzip

In [None]:
! kaggle datasets download -d mrsimple07/injury-prediction-dataset
! unzip "injury-prediction-dataset.zip"

### Read Data

In [None]:
import pandas as pd

# Load the extracted CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='injury_data.csv')
display(df.head(n=5))

## Data Exploration

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

- Note: The data appears to be very clean, so no further cleaning is needed.
- Note: This is a binary classification problem with `Likelihood_of_Injury` as the dependent variable.
- Note: There is no class imbalance, so there is no need for over- or undersampling.

## Data Prep

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target variable (y)
X = df.drop('Likelihood_of_Injury', axis=1)
y = df['Likelihood_of_Injury']

# Split data into training and testing sets (80% train, 20% test) BEFORE scaling,
# so that no statistic of the held-out rows can influence the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize: learn the mean/std from the training rows only, then apply that
# same transform to the test rows. Fitting the scaler on the full dataset
# leaks test-set information into training and makes test metrics optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


- Note: With only 1000 observations, this is a small dataset.
- Note: We choose cross-validation over a separate validation set because it is more robust and its computational cost is relatively low for a small dataset.
- Note: For large datasets, a single validation set is preferred for hyperparameter tuning because of its low computational cost, although it is less robust.

## Feature Engineering & Feature Selection

- Only 6 features - no need for feature selection or PCA for dimensionality reduction
- Think about which engineered features could help boost model performance (e.g. BMI)

## Training & Hyperparameter Tuning
With Cross-Validation (CV)

Models tested:
- Logistic Regression (LR)
- Support Vector Machine (SVM)
- Generalized Linear Model (GLM)

- Random Forest (RF)
- Gradient Boosting (GB)

- Neural Networks

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

# Shared cross-validation splitter: 5 shuffled, stratified folds with a fixed
# seed, passed as `cv=` to every grid search below.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

### LR

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Logistic Regression: tune the inverse regularization strength C by grid
# search, cross-validating on the training split only (splitter: `skf`).
lr = LogisticRegression()
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
lr_best = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv=skf, scoring='accuracy')
lr_best.fit(X_train, y_train)

# Estimator with the best cross-validated accuracy
model = lr_best.best_estimator_

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # for AUC (column 1 = second class in model.classes_)

# Metrics (header and best parameters reported like the other model cells)
print("Logistic Regression Results:")
print("Best Parameters:", lr_best.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



- Note: Recall should be prioritized, assuming the cost of an injury is high.
- Note: No PCA seems needed.

### SVM

In [None]:
from sklearn.svm import SVC

# Support Vector Machine with an RBF kernel.
# probability=True enables predict_proba, which the ROC-AUC computation needs.
svm = SVC(probability=True, random_state=42)
svm_param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
    kernel=['rbf'],
)
svm_best = GridSearchCV(svm, svm_param_grid, scoring='accuracy', cv=skf)
svm_best.fit(X_train, y_train)

# Keep the estimator that scored best in cross-validation
model = svm_best.best_estimator_

# Hard labels for the threshold metrics, scores of column 1 for AUC
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Report the selected hyperparameters and the test-set metrics
print("SVM Results:")
print("Best Parameters:", svm_best.best_params_)
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

### RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: grid search over forest size, tree depth and the minimum
# number of samples required to split a node.
rf = RandomForestClassifier(random_state=42)
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
rf_best = GridSearchCV(rf, rf_param_grid, scoring='accuracy', cv=skf)
rf_best.fit(X_train, y_train)

# Keep the estimator that scored best in cross-validation
model = rf_best.best_estimator_

# Hard labels for the threshold metrics, scores of column 1 for AUC
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Report the selected hyperparameters and the test-set metrics
print("Random Forest Results:")
print("Best Parameters:", rf_best.best_params_)
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Areas to Think About
- More advanced models (like NuSVM) can achieve high performance; investigate why. https://www.kaggle.com/code/tkunzler/injury-prediction-eda-eng-pt-br
- What other models can you test?
- Are neural networks a good option? Why or why not?