# Injury Prediction V2
## X-day lookback â†’ Y-day forward injury likelihood

Predicts injury probability for each of the next Y days based on the previous X days of game data.
- **Features**: Rolling aggregates (mean, std, min, max) over lookback window
- **Targets**: Injured within 1 day, 2 days, ..., Y days (cumulative)

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Add src to path for ball package
NOTEBOOK_DIR = Path(".").resolve()
SRC_DIR = NOTEBOOK_DIR.parent.parent.parent.parent  # src/
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from ball.models.injury_prediction.v2.injury_prediction_v2 import (
    load_base_data,
    load_injury_dates,
    build_features_targets,
    get_engine,
)

LOOKBACK_DAYS = 14
FORWARD_DAYS = 14

In [2]:
# Load base data from DB
conn = get_engine()
base_df = load_base_data(conn)
injury_df = load_injury_dates(conn)
conn.close()

print("Base rows:", len(base_df))
print("Injury events:", len(injury_df))
display(base_df.head(3))

Base rows: 17866
Injury events: 20044


Unnamed: 0,player_id,full_name,game_date,game_id,age,speed,distance,height_wo_shoes,weight,wingspan,standing_reach,body_fat_pct,hand_length,hand_width,minutes,usagePercentage,pace,possessions
0,202694,Marcus Morris Sr.,2015-10-27,21500001,26,4.03,2.48,79.0,229.6,82.0,105.5,8.0,8.25,8.5,37:05,0.22,97.08,76.0
1,101141,Ersan Ilyasova,2015-10-27,21500001,28,4.05,2.32,79.75,208.8,85.25,109.5,,,,34:26,0.174,103.16,74.0
2,203083,Andre Drummond,2015-10-27,21500001,22,3.72,2.3,81.75,278.6,90.25,109.5,7.5,9.5,9.5,37:09,0.23,96.26,76.0


In [3]:
# Build features and targets
features_df, targets_df, base_feature_names = build_features_targets(
    base_df, injury_df, LOOKBACK_DAYS, FORWARD_DAYS
)

print("Training rows:", len(features_df))
print("Target columns:", list(targets_df.columns))

# Feature columns (exclude id/date)
exclude = {"player_id", "game_date", "game_id"}
FEATURE_COLS = [c for c in features_df.columns if c not in exclude]
print("Feature count:", len(FEATURE_COLS))

Training rows: 17866
Target columns: ['injured_within_1', 'injured_within_2', 'injured_within_3', 'injured_within_4', 'injured_within_5', 'injured_within_6', 'injured_within_7', 'injured_within_8', 'injured_within_9', 'injured_within_10', 'injured_within_11', 'injured_within_12', 'injured_within_13', 'injured_within_14']
Feature count: 56


In [4]:
# Prepare X, y for each forward day
X = features_df[FEATURE_COLS].copy()
for c in X.columns:
    X[c] = pd.to_numeric(X[c], errors="coerce")
X = X.fillna(X.median())

# Train Y models (one per forward day)
models = {}
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

for d in range(1, FORWARD_DAYS + 1):
    col = f"injured_within_{d}"
    y = targets_df[col].astype(int)
    if y.sum() < 10:
        print(f"Skipping day {d} (only {y.sum()} positives)")
        continue
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.25, random_state=42, stratify=y
    )
    model = LogisticRegression(max_iter=3000, random_state=42, class_weight="balanced")
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    models[d] = model
    print(f"Day {d}: AUC={auc:.3f}")

Day 1: AUC=0.546
Day 2: AUC=0.552
Day 3: AUC=0.596
Day 4: AUC=0.624
Day 5: AUC=0.559
Day 6: AUC=0.613
Day 7: AUC=0.627
Day 8: AUC=0.611
Day 9: AUC=0.606
Day 10: AUC=0.632
Day 11: AUC=0.628
Day 12: AUC=0.648
Day 13: AUC=0.600
Day 14: AUC=0.609


In [5]:
# Try GradientBoosting for better accuracy
models_gb = {}
for d in range(1, FORWARD_DAYS + 1):
    col = f"injured_within_{d}"
    y = targets_df[col].astype(int)
    if y.sum() < 10:
        continue
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.25, random_state=42, stratify=y
    )
    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    models_gb[d] = model
    print(f"Day {d} GB: AUC={auc:.3f}")

Day 1 GB: AUC=0.488
Day 2 GB: AUC=0.598
Day 3 GB: AUC=0.730
Day 4 GB: AUC=0.724
Day 5 GB: AUC=0.774
Day 6 GB: AUC=0.821
Day 7 GB: AUC=0.834
Day 8 GB: AUC=0.833
Day 9 GB: AUC=0.830
Day 10 GB: AUC=0.872
Day 11 GB: AUC=0.857
Day 12 GB: AUC=0.874
Day 13 GB: AUC=0.853
Day 14 GB: AUC=0.874


In [6]:
# Save models and artifacts for Streamlit
ARTIFACTS_DIR = NOTEBOOK_DIR / "artifacts"
ARTIFACTS_DIR.mkdir(exist_ok=True)

joblib.dump(scaler, ARTIFACTS_DIR / "scaler.joblib")
joblib.dump(models_gb if models_gb else models, ARTIFACTS_DIR / "models.joblib")
joblib.dump(FEATURE_COLS, ARTIFACTS_DIR / "feature_cols.joblib")
joblib.dump(FORWARD_DAYS, ARTIFACTS_DIR / "forward_days.joblib")

print("Saved to:", ARTIFACTS_DIR)

Saved to: C:\Users\yusuf\OneDrive\Desktop\CODE\BALL\src\ball\models\injury_prediction\v2\artifacts
