In [1]:
import os
import pandas as pd
from supabase import create_client
from dotenv import load_dotenv

load_dotenv()
supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

# Pull only what we need for baseline
COLS = "injury_id,player_id,team_id,injury_date,acquired,relinquished,body_region,diagnosis,return_status"
CHUNK = 1000

def fetch_injuries():
    rows = []
    start = 0

    while True:
        resp = (
            supabase.table("injuries")
            .select(COLS)
            .order("injury_date", desc=False)   # stable paging
            .range(start, start + CHUNK - 1)
            .execute()
        )

        batch = resp.data or []
        if not batch:
            break

        rows.extend(batch)
        print(f"Fetched {len(rows)} rows so far...")

        if len(batch) < CHUNK:
            break

        start += CHUNK

    df = pd.DataFrame(rows)
    df["injury_date"] = pd.to_datetime(df["injury_date"], errors="coerce")

    df.to_csv("injuries_raw.csv", index=False)
    print("Saved -> injuries_raw.csv")
    print(df.head())
    print("Rows:", len(df), "Players:", df["player_id"].nunique())

if __name__ == "__main__":
    fetch_injuries()


Fetched 1000 rows so far...
Fetched 2000 rows so far...
Fetched 3000 rows so far...
Fetched 4000 rows so far...
Fetched 5000 rows so far...
Fetched 6000 rows so far...
Fetched 7000 rows so far...
Fetched 8000 rows so far...
Fetched 9000 rows so far...
Fetched 10000 rows so far...
Fetched 11000 rows so far...
Fetched 12000 rows so far...
Fetched 13000 rows so far...
Fetched 14000 rows so far...
Fetched 15000 rows so far...
Fetched 16000 rows so far...
Fetched 17000 rows so far...
Fetched 18000 rows so far...
Fetched 19000 rows so far...
Fetched 20000 rows so far...
Fetched 21000 rows so far...
Fetched 22000 rows so far...
Fetched 23000 rows so far...
Fetched 24000 rows so far...
Fetched 25000 rows so far...
Fetched 26000 rows so far...
Fetched 27000 rows so far...
Fetched 28000 rows so far...
Fetched 29000 rows so far...
Fetched 30000 rows so far...
Fetched 31000 rows so far...
Fetched 32000 rows so far...
Fetched 33000 rows so far...
Fetched 34000 rows so far...
Fetched 35000 rows so f

In [2]:
import pandas as pd

df = pd.read_csv("injuries_raw.csv")
df["injury_date"] = pd.to_datetime(df["injury_date"], errors="coerce")

# 1) drop unusable rows
df = df[df["player_id"].notna()]
df = df[df["injury_date"].notna()]

# 2) keep ONLY injury-start rows
# (based on your table meaning: relinquished=True => placed on IL / removed)
inj_start = df[df["relinquished"] == True].copy()

print("inj_start rows:", len(inj_start))
print("inj_start players:", inj_start["player_id"].nunique())
print(inj_start[["player_id","injury_date","body_region","diagnosis","acquired","relinquished"]].head(10))


inj_start rows: 19689
inj_start players: 2617
    player_id injury_date body_region diagnosis  acquired  relinquished
0     76102.0  1951-12-25         NaN       NaN     False          True
1     78638.0  1952-12-26        side      tear     False          True
2     76081.0  1956-12-29         NaN       NaN     False          True
3     76127.0  1959-01-16         NaN       NaN     False          True
4     76127.0  1961-11-26         NaN       NaN     False          True
7     76708.0  1962-10-25         NaN       NaN     False          True
9     76708.0  1962-11-14        knee       NaN     False          True
10    76577.0  1965-09-11         NaN       NaN     False          True
11    76865.0  1969-10-15       ankle    sprain     False          True
13    76065.0  1969-12-03         NaN       NaN     False          True


In [4]:
# Treat missing text as its own category
df["body_region"] = df["body_region"].fillna("unknown")
df["diagnosis"] = df["diagnosis"].fillna("unknown")

# Quick sanity check
print(df["body_region"].value_counts().head())
print(df["diagnosis"].value_counts().head())


body_region
unknown    25427
knee        2383
ankle       1869
back        1132
foot         727
Name: count, dtype: int64
diagnosis
unknown      27608
sprain        2042
soreness      1704
strain        1384
infection      848
Name: count, dtype: int64


In [6]:
print(df.columns.tolist())



['injury_id', 'player_id', 'team_id', 'injury_date', 'acquired', 'relinquished', 'body_region', 'diagnosis', 'return_status']


In [7]:
import pandas as pd

# 1) parse dates
df["injury_date"] = pd.to_datetime(df["injury_date"], errors="coerce")

# 2) keep "injury start" events only (relinquished=True) + valid ids/dates
inj_start = df[
    df["player_id"].notna()
    & df["injury_date"].notna()
    & (df["relinquished"] == True)
].copy()

inj_start = inj_start.sort_values(["player_id", "injury_date"])

# 3) define prediction horizon
Y = 30  # days

# 4) label: does another injury happen within Y days?
inj_start["next_injury_date"] = inj_start.groupby("player_id")["injury_date"].shift(-1)

inj_start["label_next_injury_within_Y"] = (
    (inj_start["next_injury_date"] - inj_start["injury_date"]).dt.days.between(1, Y)
).astype(int)

# 5) features: injury history known at this injury date
inj_start["prior_injuries_count"] = inj_start.groupby("player_id").cumcount()

inj_start["days_since_last_injury"] = (
    inj_start["injury_date"] - inj_start.groupby("player_id")["injury_date"].shift(1)
).dt.days.fillna(-1)

# 6) modeling dataset
dataset = inj_start[[
    "player_id",
    "injury_date",
    "body_region",
    "diagnosis",
    "prior_injuries_count",
    "days_since_last_injury",
    "label_next_injury_within_Y"
]].copy()

dataset["body_region"] = dataset["body_region"].fillna("unknown")
dataset["diagnosis"] = dataset["diagnosis"].fillna("unknown")

print("dataset columns:", dataset.columns.tolist())
print("label rate:", dataset["label_next_injury_within_Y"].mean())
dataset.head()


dataset columns: ['player_id', 'injury_date', 'body_region', 'diagnosis', 'prior_injuries_count', 'days_since_last_injury', 'label_next_injury_within_Y']
label rate: 0.30600843110366194


Unnamed: 0,player_id,injury_date,body_region,diagnosis,prior_injuries_count,days_since_last_injury,label_next_injury_within_Y
444,2.0,1985-11-12,hamstring,strain,0,-1.0,0
2347,3.0,1997-01-22,ankle,sprain,0,-1.0,0
2978,3.0,1998-02-19,foot,fracture,1,393.0,0
3692,3.0,1999-11-01,knee,sprain,2,620.0,0
4123,3.0,2000-03-25,finger,break,3,145.0,0


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Features / label
X = dataset[
    ["prior_injuries_count", "days_since_last_injury", "body_region", "diagnosis"]
]
y = dataset["label_next_injury_within_Y"]

# Time-based split
dataset = dataset.sort_values("injury_date")
split = int(0.8 * len(dataset))

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

num_cols = ["prior_injuries_count", "days_since_last_injury"]
cat_cols = ["body_region", "diagnosis"]

preprocess = ColumnTransformer(
    [
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)

pipe = Pipeline([
    ("prep", preprocess),
    ("clf", model)
])

pipe.fit(X_train, y_train)

# quick metric
proba_test = pipe.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, proba_test))


ROC AUC: 0.6450015289183995


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Add predictions to test set
showcase = X_test.copy()
showcase["injury_date"] = dataset.iloc[split:]["injury_date"].values
showcase["actual"] = y_test.values
showcase["predicted_risk"] = pipe.predict_proba(X_test)[:, 1]

# Sort by highest predicted risk
showcase = showcase.sort_values("predicted_risk", ascending=False)

showcase.head(10)


Unnamed: 0,prior_injuries_count,days_since_last_injury,body_region,diagnosis,injury_date,actual,predicted_risk
36703,30,11.0,unknown,unknown,2022-03-02,1,0.857421
36587,29,216.0,unknown,unknown,2022-03-02,1,0.83923
34250,23,7.0,unknown,unknown,2021-02-05,1,0.812465
34037,21,7.0,unknown,unknown,2021-06-28,0,0.797686
34904,21,33.0,unknown,unknown,2021-04-24,0,0.795746
33941,20,22.0,unknown,unknown,2022-02-03,1,0.788828
33958,20,38.0,unknown,unknown,2021-04-07,0,0.787598
27978,19,93.0,unknown,unknown,2021-03-02,0,0.775228
26910,18,15.0,unknown,unknown,2021-03-02,0,0.773272
31819,17,8.0,unknown,unknown,2021-06-16,0,0.765486


In [12]:
row = showcase.iloc[0]
row


prior_injuries_count                       30
days_since_last_injury                   11.0
body_region                           unknown
diagnosis                             unknown
injury_date               2022-03-02 00:00:00
actual                                      1
predicted_risk                       0.857421
Name: 36703, dtype: object

In [13]:
showcase = dataset.iloc[split:].copy()

showcase["predicted_risk"] = pipe.predict_proba(X_test)[:, 1]

# Optional: binary prediction
showcase["predicted_label"] = (showcase["predicted_risk"] >= 0.5).astype(int)

# Keep / reorder columns for display
showcase = showcase[[
    "player_id",
    "injury_date",
    "prior_injuries_count",
    "days_since_last_injury",
    "body_region",
    "diagnosis",
    "label_next_injury_within_Y",
    "predicted_risk",
    "predicted_label"
]]

showcase = showcase.sort_values("predicted_risk", ascending=False)
showcase.head(10)


Unnamed: 0,player_id,injury_date,prior_injuries_count,days_since_last_injury,body_region,diagnosis,label_next_injury_within_Y,predicted_risk,predicted_label
35225,1630168.0,2022-03-02,5,68.0,head,concussion,0,0.857421,1
35232,1626164.0,2022-03-02,11,1149.0,unknown,unknown,0,0.83923,1
31773,1629690.0,2021-02-05,3,14.0,unknown,unknown,1,0.812465,1
33054,1627826.0,2021-06-28,14,1247.0,mcl,sprain,0,0.797686,1
32664,1629048.0,2021-04-24,3,122.0,ankle,sprain,0,0.795746,1
34971,1630598.0,2022-02-03,0,-1.0,ankle,sprain,0,0.788828,1
32430,203924.0,2021-04-07,7,35.0,knee,soreness,1,0.787598,1
31986,203937.0,2021-03-02,11,457.0,unknown,infection,0,0.775228,1
31984,201188.0,2021-03-02,11,397.0,unknown,unknown,1,0.773272,1
33044,101108.0,2021-06-16,16,307.0,unknown,unknown,0,0.765486,1
