In [None]:
# package imports

from __future__ import annotations

import glob
import os
import re
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd


In [None]:
# Configuration
# Every tunable path and ISI setting lives in this cell; the functions below read
# these module-level names directly.

# Inputs
# CONTRACTS_CSV is read by main() and must contain a term_start_year column.
CONTRACTS_CSV = r"mlb_free_agent_contracts_master_v1_3_with_player_keys.csv"
# INJURIES_CSV is read by main() and passed to compute_isi_features as the injury stint table.
INJURIES_CSV  = r"injuries_cleaned_v1_6_use.csv"
# TERM_MAP_PATH is parsed by load_term_map (tab-separated first, comma-separated fallback).
TERM_MAP_PATH = r"injury_term_map_v1.txt"

# Outputs
# main() derives the sweep file names from OUT_CONTRACTS by replacing ".csv" with
# "_SWEEP_WIDE.csv" / "_SWEEP_SUMMARY.csv".
OUT_CONTRACTS   = r"contracts_with_isi.csv"
OUT_AUDIT_TABLE = r"injury_term_to_bucket_weights.csv"
OUT_COVERAGE    = r"isi_mapping_coverage_report.csv"

# ISI settings
# NOTE: the sweep in main() uses its own PARAM_GRID for lookback/lambda; LOOKBACK_YEARS and
# RECENCY_LAMBDA are only referenced by the final export cell at the bottom of the notebook.
LOOKBACK_YEARS = 3          # seasons in the lookback window (contract year - N .. contract year - 1)
RECENCY_LAMBDA = 0.70       # decay rate: recency_mult = exp(-lambda * t_years)
DENOM_MODE = "calendar"     # "calendar" (365*years) or "games" (162*years)
DAYS_CAP = 800              # days_to_return is clipped to the range [0, DAYS_CAP] per stint
INJURY_TO_CONTRACT_OFFSET = 0.50   # added to (contract year - injury season) before the decay is applied

# Positional Parsing for Pitcher Specific Positions
# is_pitcher() flags a player when any token of the position label is in this set.
PITCHER_CODES = {"P", "SP", "RP", "RHP", "LHP"}

# Per-stint multiplier keyed by severity_tier; severity_multiplier() treats tiers that are
# not listed here as tier 0.
SEVERITY_TIER_MULT = {
    0: 1.00,
    1: 1.00,
    2: 1.15,
    3: 1.30,
}
SURGERY_BONUS_MULT = 1.10       # if surgery_flag = 1
STRUCTURAL_BONUS_MULT = 1.05    # if structural_flag = 1


In [None]:
# ISI Helper Functions

def is_pitcher(position_value: str | float | None) -> bool:
    """
    Report whether a position label contains at least one pitcher code.

    The label is upper-cased and split on "/", ",", ";", "|", "-" and whitespace, so
    combined labels such as "RP/SP", "P/OF" or "RHP-SP" count as pitchers as soon as
    one piece is listed in PITCHER_CODES. None, NaN and blank labels return False.
    """
    value_is_nan = isinstance(position_value, float) and np.isnan(position_value)
    if position_value is None or value_is_nan:
        return False

    label = str(position_value).strip().upper()
    if not label:
        return False

    # Empty pieces come from leading/trailing/adjacent delimiters and are skipped
    for piece in re.split(r"[/,;|\-\s]+", label):
        if piece and piece in PITCHER_CODES:
            return True
    return False

def denom_days(mode: str, lookback_years: int) -> float:
    """
    Return the normalisation denominator (in days) for a lookback window.

    mode is matched case-insensitively after trimming: "games" counts 162 days per
    year and "calendar" counts 365 days per year. Any other value raises ValueError.
    """
    days_per_year = {"games": 162.0, "calendar": 365.0}
    key = mode.lower().strip()
    if key not in days_per_year:
        raise ValueError("Edit Configuration.  DENOM_MODE must be set to either 'calendar' or 'games'")
    return days_per_year[key] * lookback_years

def load_term_map(term_map_path: str | Path) -> pd.DataFrame:
    """
    Load the injury severity term map.

    The file is read as tab-separated text; if that yields a single column the file
    is re-read as comma-separated. Column names are stripped of surrounding whitespace.

    Required cols: inj_norm_v6, anatomical_group, severity_tier, surgery_flag, structural_flag

    Cleaning applied:
      * rows whose inj_norm_v6 is missing, blank or "0" are dropped
      * inj_norm_v6 and anatomical_group are stripped
      * severity_tier / surgery_flag / structural_flag are coerced to int
        (unparseable or missing values become 0)
      * only the first row per inj_norm_v6 is kept

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: one or more required columns are missing.
    """
    p = Path(term_map_path)
    if not p.exists():
        raise FileNotFoundError(f"TERM_MAP_PATH not found: {p}")

    # file should be a txt, but if txt fails, csv can also be used (untested)
    df = pd.read_csv(p, sep="\t", dtype=str, engine="python")
    if df.shape[1] == 1:
        # csv fallback
        df = pd.read_csv(p, dtype=str)

    df.columns = [c.strip() for c in df.columns]

    # data integrity check
    required = {"inj_norm_v6", "anatomical_group", "severity_tier", "surgery_flag", "structural_flag"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f'Required cols missing: {sorted(missing)}')

    # Drop missing terms BEFORE casting to str: astype(str) turns NaN into the literal
    # "nan", which would otherwise survive as a bogus term that unlabelled injuries match.
    df = df[df["inj_norm_v6"].notna()].copy()
    df["inj_norm_v6"] = df["inj_norm_v6"].astype(str).str.strip()
    df = df[(df["inj_norm_v6"] != "") & (df["inj_norm_v6"] != "0")].copy()

    df["anatomical_group"] = df["anatomical_group"].astype(str).str.strip()
    for flag_col in ("severity_tier", "surgery_flag", "structural_flag"):
        df[flag_col] = pd.to_numeric(df[flag_col], errors="coerce").fillna(0).astype(int)

    # One row per term so a later join cannot duplicate injury stints
    df = df.drop_duplicates(subset=["inj_norm_v6"], keep="first")

    return df

In [None]:
# create injury weights

def build_bucket_weights() -> Tuple[Dict[str, float], Dict[str, float]]:
    """
    Return the base anatomical-group risk weights as (hitter_weights, pitcher_weights).

    Severity multipliers are applied separately from these base weights.
    Every listed weight is at least 1.00; buckets that are not listed are expected to
    be looked up with a default of 1.00 by the caller.
    Pitcher weights start from the hitter table and raise the arm-related buckets.
    """
    # Throwing-related injuries (Tommy John is more important for pitchers than position players)
    throwing = {
        "Tommy John": 1.20,
        "Elbow": 1.10,
        "Flexor Tendon": 1.15,
        "Shoulder": 1.15,
    }
    # Soft tissue and spinal injuries
    soft_tissue_and_spine = {
        "Lower back": 1.10,
        "Spinal disc": 1.20,
        "Groin": 1.20,
        "Hip": 1.10,
        "Pelvis": 1.10,
        "Hernia": 1.10,
        "Calf": 1.15,
        "Hamstring": 1.20,
        "Quad": 1.10,
    }
    # Lower body injuries
    lower_body = {
        "Knee": 1.10,
        "Ankle": 1.05,
        "Foot": 1.05,
        "Achilles": 1.15,
        "Shin": 1.00,
        "Leg": 1.05,
    }
    # Upper extremity (non-throwing related injuries)
    upper_extremity = {
        "Wrist": 1.00,
        "Hand": 1.00,
        "Finger": 1.00,
        "Forearm": 1.05,
        "Arm": 1.00,
    }
    # General injuries
    general = {
        "Head": 1.15,
        "Face": 1.00,
        "Neck": 1.05,
        "Chest": 1.00,
        "Abdominal": 1.00,
        "Rib": 1.00,
        "Toe": 1.00,
        "Neurological / Nerve": 1.10,
        "Vascular / Cardiac": 1.20,
        "Vascular cardiac related": 1.20,
    }
    # Non-injury related and miscellaneous
    non_injury = {
        "Illness": 1.00,
        "Medical (Non-injury)": 1.00,
        "Miscellaneous": 1.00,
    }

    hitter_weights: Dict[str, float] = {}
    for group in (throwing, soft_tissue_and_spine, lower_body, upper_extremity, general, non_injury):
        hitter_weights.update(group)

    # Increased weights for pitchers due to the nature of the position and its responsibilities
    pitcher_overrides = {
        "Tommy John": 1.50,
        "Elbow": 1.30,
        "Flexor Tendon": 1.35,
        "Shoulder": 1.30,
        "Forearm": 1.20,
        "Arm": 1.10,
    }
    pitcher_weights = {**hitter_weights, **pitcher_overrides}

    return hitter_weights, pitcher_weights


def standardize_bucket_name(bucket: str | None) -> str | None:
    """
    Normalise an anatomical-group label so it lines up with the weight-dict keys.

    Returns None for None, NaN and blank labels. Otherwise backslashes become forward
    slashes, runs of whitespace collapse to a single space, and a small set of known
    spelling variants (matched case-insensitively) is mapped to its canonical form.
    Labels without a known variant are returned cleaned but otherwise unchanged, so
    this is a no-op for labels that are already clean.
    """
    is_missing = bucket is None or (isinstance(bucket, float) and np.isnan(bucket))
    text = "" if is_missing else str(bucket).strip()
    if not text:
        return None

    # Unify slash direction and collapse internal whitespace
    cleaned = " ".join(text.replace("\\", "/").split())

    aliases = {
        # vascular/cardiac variants
        "vascular / cardiac": "Vascular / Cardiac",
        "vascular/cardiac": "Vascular / Cardiac",
        "vascular cardiac related": "Vascular / Cardiac",
        # remaining casing variants
        "lower back": "Lower back",
        "spinal disc": "Spinal disc",
        "neurological / nerve": "Neurological / Nerve",
        "medical (non-injury)": "Medical (Non-injury)",
    }
    return aliases.get(cleaned.lower(), cleaned)

def get_bucket_weight(bucket: str | None, is_pitcher_flag: bool,
                      hitter_w: Dict[str, float], pitcher_w: Dict[str, float]) -> float:
    """
    Look up the base risk weight for an anatomical bucket.

    The bucket label is standardised first. Pitchers read from pitcher_w, everyone
    else from hitter_w. Missing labels and labels absent from the chosen table get
    1.00, so no stint is ever weighted at 0.00.
    """
    label = standardize_bucket_name(bucket)
    if label is not None:
        table = hitter_w if not is_pitcher_flag else pitcher_w
        return float(table.get(label, 1.00))
    return 1.00

# Applies a severity factor to each INDIVIDUAL injury stint
def severity_multiplier(severity_tier: int | float | None,
                        surgery_flag: int | float | None,
                        structural_flag: int | float | None,
                        tier_mult: Dict[int, float],
                        surgery_mult: float,
                        structural_mult: float) -> float:
    """
    Computes a per-stint severity multiplier.

    Args:
        severity_tier: tier of the stint; missing values and tiers that are not keys
            of tier_mult are treated as tier 0.
        surgery_flag: surgery_mult is applied when this equals 1; missing counts as 0.
        structural_flag: structural_mult is applied when this equals 1; missing counts as 0.
        tier_mult: e.g. {tier 0 : 1.00, tier 1 : 1.00, tier 2 : 1.15, tier 3 : 1.30}
        surgery_mult: multiplier for surgical stints.
        structural_mult: multiplier for structural stints.

    Returns:
        tier multiplier x surgery multiplier (if flagged) x structural multiplier (if flagged).
    """
    def _flag_is_set(flag: int | float | None) -> bool:
        # None / NaN flags are "not set" rather than a crash in int()
        return bool(pd.notna(flag)) and int(flag) == 1

    tier = int(severity_tier) if pd.notna(severity_tier) else 0
    if tier not in tier_mult:
        tier = 0

    mult = float(tier_mult.get(tier, 1.00))

    if _flag_is_set(surgery_flag):
        mult *= float(surgery_mult)
    if _flag_is_set(structural_flag):
        mult *= float(structural_mult)

    return mult


In [None]:
# computes ISI weights
def compute_isi_features(
    contracts: pd.DataFrame,
    injuries: pd.DataFrame,
    term_map: pd.DataFrame,
    hitter_w: Dict[str, float],
    pitcher_w: Dict[str, float],
    lookback_years: int = 3,
    lambda_recency: float = 0.7,
    denom_mode: str = "calendar",
    days_cap: int = 800,
    injury_to_contract_offset: float = 0.5,
    id_col: str = "_cid",
) -> pd.DataFrame:
    """
    Build per-contract injury features (ISI components and injury burden).

    For every contract row, injury stints of the same player (key_mlbam == mlbamid)
    whose season lies in [year - lookback_years, year - 1] are collected, weighted by
    anatomical bucket, severity and recency, and aggregated back onto the contract.

    Args:
        contracts: one row per contract; uses key_mlbam, year and position.
        injuries: one row per stint; uses mlbamid, season, days_to_return, inj_norm_v6.
        term_map: term table with inj_norm_v6, anatomical_group, severity_tier,
            surgery_flag, structural_flag.
        hitter_w / pitcher_w: base bucket weights for non-pitchers / pitchers.
        lookback_years: number of seasons in the lookback window.
        lambda_recency: decay rate in exp(-lambda_recency * t_years).
        denom_mode: "calendar" or "games"; passed to denom_days.
        days_cap: days_to_return is clipped to [0, days_cap] per stint.
        injury_to_contract_offset: added to (year - season) to get t_years.
        id_col: contract row key; created as 0..n-1 when the column is absent.

    Returns:
        A copy of contracts with lb_start, lb_end, the aggregated columns
        (isi_D_days, isi_N_stints, isi_S_raw, isi_C_raw, recency_weighted_days,
        any_*_flag, isi_unique_seasons_with_inj, arm_days), the coverage columns
        (isi_window_seasons_avail, isi_full_window_flag), the normalised components
        (isi_*_star, ISI_v1_1) and durability_days_rate, recency_weighted_days_rate,
        injury_burden, arm_share_days. Contracts with no stint in the window get 0 for
        the aggregates; contracts whose window has no overlap with the seasons present
        in the injury table get NaN for ISI_v1_1, injury_burden and
        recency_weighted_days_rate.

    Severity multipliers are taken from the module-level SEVERITY_TIER_MULT,
    SURGERY_BONUS_MULT and STRUCTURAL_BONUS_MULT.
    """
    # Work on copies so the caller's frames are never modified
    dfc = contracts.copy()
    dfc["key_mlbam"] = pd.to_numeric(dfc.get("key_mlbam"), errors="coerce")
    dfc["year"] = pd.to_numeric(dfc.get("year"), errors="coerce")

    # Row key: reuse id_col when the caller supplies it, otherwise number the rows
    if id_col not in dfc.columns:
        dfc[id_col] = np.arange(len(dfc), dtype=int)
    dfc["_contract_row_id"] = dfc[id_col].astype(int)

    # Lookback window: the lookback_years seasons that end the season before the contract year
    dfc["lb_start"] = dfc["year"] - lookback_years
    dfc["lb_end"] = dfc["year"] - 1

    # Injury table preparation
    dfi = injuries.copy()
    for numeric_col in ("mlbamid", "season", "days_to_return"):
        dfi[numeric_col] = pd.to_numeric(dfi.get(numeric_col), errors="coerce")
    dfi["days_capped"] = dfi["days_to_return"].clip(lower=0, upper=days_cap)
    dfi["inj_norm_v6"] = dfi["inj_norm_v6"].astype(str).str.strip()

    # Term map: one row per term, so the join below cannot duplicate stints
    tm = (
        term_map.rename(columns={"anatomical_group": "bucket"})
        .drop_duplicates(subset=["inj_norm_v6"], keep="first")
        .copy()
    )
    tm["bucket"] = tm["bucket"].map(standardize_bucket_name)

    dfi = dfi.merge(
        tm[["inj_norm_v6", "bucket", "severity_tier", "surgery_flag", "structural_flag"]],
        on="inj_norm_v6",
        how="left",
    )

    # pandas matches null join keys to each other, so injury rows without a player id
    # must be removed or they would attach to every contract without a key_mlbam.
    injury_cols = [
        "mlbamid", "season", "days_capped", "inj_norm_v6",
        "bucket", "severity_tier", "surgery_flag", "structural_flag",
    ]
    dfi_keyed = dfi.loc[dfi["mlbamid"].notna(), injury_cols]

    merged = dfc[["_contract_row_id", "key_mlbam", "year", "position", "lb_start", "lb_end"]].merge(
        dfi_keyed,
        left_on="key_mlbam",
        right_on="mlbamid",
        how="left",
    )

    # Lookback period filter
    in_window = (
        merged["season"].notna()
        & merged["year"].notna()
        & (merged["season"] >= merged["lb_start"])
        & (merged["season"] <= merged["lb_end"])
    )
    merged = merged.loc[in_window].copy()

    merged["is_pitcher"] = merged["position"].map(is_pitcher).astype(bool)

    # Base bucket weight per stint. Built from a list (not DataFrame.apply(axis=1)) so an
    # empty frame still yields an empty float column instead of an empty DataFrame.
    merged["base_risk_w"] = pd.Series(
        [
            get_bucket_weight(
                bucket=bucket,
                is_pitcher_flag=bool(pitcher_flag),
                hitter_w=hitter_w,
                pitcher_w=pitcher_w,
            )
            for bucket, pitcher_flag in zip(merged["bucket"], merged["is_pitcher"])
        ],
        index=merged.index,
        dtype=float,
    )

    # Severity multiplier per stint; unmapped terms carry tier 0 and no flags
    for flag_col in ("severity_tier", "surgery_flag", "structural_flag"):
        merged[flag_col] = pd.to_numeric(merged[flag_col], errors="coerce").fillna(0).astype(int)
    merged["severity_mult"] = pd.Series(
        [
            severity_multiplier(
                tier,
                surgery,
                structural,
                tier_mult=SEVERITY_TIER_MULT,
                surgery_mult=SURGERY_BONUS_MULT,
                structural_mult=STRUCTURAL_BONUS_MULT,
            )
            for tier, surgery, structural in zip(
                merged["severity_tier"], merged["surgery_flag"], merged["structural_flag"]
            )
        ],
        index=merged.index,
        dtype=float,
    )

    merged["risk_w"] = merged["base_risk_w"] * merged["severity_mult"]

    # recency decay (season-based)
    # Explanation: If injury season is Y-1, time since injury is equivalent to 1 + offset
    merged["t_years"] = (merged["year"] - merged["season"]).astype(float) + float(injury_to_contract_offset)
    merged["t_years"] = merged["t_years"].clip(lower=0)

    merged["recency_mult"] = np.exp(-lambda_recency * merged["t_years"])

    ##################################
    ###### Per-stint components ######
    ##################################

    merged["weighted_days"] = merged["risk_w"] * merged["days_capped"]
    merged["decayed_days"] = merged["recency_mult"] * merged["days_capped"]
    merged["recency_weighted_days"] = merged["days_capped"] * merged["risk_w"] * merged["recency_mult"]

    # Convenience flags
    # if greater than 3 appears, an error occurred, range should be 0-3
    merged["tier3plus_flag"] = (merged["severity_tier"] >= 3).astype(int)

    arm_buckets = {"Tommy John", "Elbow", "Shoulder", "Flexor Tendon", "Forearm", "Arm"}
    merged["_is_arm_bucket"] = merged["bucket"].isin(arm_buckets).astype(int)
    merged["_arm_days"] = merged["days_capped"] * merged["_is_arm_bucket"]

    ##################################
    ####### Contract Aggregate #######
    ##################################

    agg = merged.groupby("_contract_row_id", as_index=False).agg(
        isi_D_days=("days_capped", "sum"),
        isi_N_stints=("days_capped", "size"),
        isi_S_raw=("weighted_days", "sum"),
        isi_C_raw=("decayed_days", "sum"),

        recency_weighted_days=("recency_weighted_days", "sum"),

        any_surgery_flag=("surgery_flag", "max"),
        any_structural_flag=("structural_flag", "max"),
        any_tier3plus_flag=("tier3plus_flag", "max"),
        isi_unique_seasons_with_inj=("season", "nunique"),

        arm_days=("_arm_days", "sum"),
    )

    out = dfc.merge(agg, on="_contract_row_id", how="left")

    # Contracts with no stint in the window have no aggregate row: treat as zero
    fill0 = [
        "isi_D_days", "isi_N_stints", "isi_S_raw", "isi_C_raw",
        "recency_weighted_days", "any_surgery_flag", "any_structural_flag",
        "any_tier3plus_flag", "isi_unique_seasons_with_inj", "arm_days"
    ]
    for col in fill0:
        if col in out.columns:
            out[col] = out[col].fillna(0)

    # coverage indicators: how many window seasons fall inside the injury table's season range
    inj_seasons = dfi["season"].dropna()
    if inj_seasons.empty:
        # No usable season in the injury table: an empty range makes every window's
        # overlap clip to 0 instead of crashing on int(NaN).
        inj_season_min, inj_season_max = np.inf, -np.inf
    else:
        inj_season_min = int(inj_seasons.min())
        inj_season_max = int(inj_seasons.max())

    out["isi_window_seasons_avail"] = (
        (np.minimum(out["lb_end"], inj_season_max) - np.maximum(out["lb_start"], inj_season_min) + 1)
        .clip(lower=0)
    ).astype("Int64")
    out["isi_full_window_flag"] = (out["isi_window_seasons_avail"] == lookback_years)

    ##################################
    ####### Norm, Burden Score #######
    ##################################

    denom = denom_days(denom_mode, lookback_years)

    # Largest possible per-day weight, used to keep weighted rates within [0, 1]
    base_max_w = max(max(hitter_w.values()), max(pitcher_w.values())) if hitter_w and pitcher_w else 1.0
    sev_max = max(SEVERITY_TIER_MULT.values()) * SURGERY_BONUS_MULT * STRUCTURAL_BONUS_MULT
    max_w = base_max_w * sev_max

    out["isi_D_star"] = (out["isi_D_days"] / denom).clip(0, 1)
    out["isi_S_star"] = (out["isi_S_raw"] / (denom * max_w)).clip(0, 1)

    # recurrence normalization
    N_MAX = 10.0
    out["isi_R_star"] = (np.log1p(out["isi_N_stints"]) / np.log1p(N_MAX)).clip(0, 1)
    out["isi_C_star"] = (out["isi_C_raw"] / denom).clip(0, 1)

    # IGNORE THIS FOR NOW, THIS WAS IN REFERENCE TO A PREVIOUS ATTEMPT, USED FOR DIAGNOSES
    out["ISI_v1_1"] = 0.25 * (out["isi_D_star"] + out["isi_S_star"] + out["isi_R_star"] + out["isi_C_star"])

    out["durability_days_rate"] = (out["isi_D_days"] / denom).clip(lower=0)
    out["recency_weighted_days_rate"] = (out["recency_weighted_days"] / (denom * max_w)).clip(lower=0)
    out["injury_burden"] = np.log1p(out["recency_weighted_days_rate"])

    out["arm_share_days"] = np.where(out["isi_D_days"] > 0, out["arm_days"] / out["isi_D_days"], 0.0)

    # Data integrity
    # If no window found, treat as missing ISI score.
    # fillna(False): a missing availability count (contract without a year) is not a match.
    miss_mask = (out["isi_window_seasons_avail"] == 0).fillna(False).astype(bool)
    out.loc[miss_mask, ["ISI_v1_1", "injury_burden", "recency_weighted_days_rate"]] = np.nan

    return out.drop(columns=["_contract_row_id"])


In [None]:
# Audit Pipeline

def build_coverage_report(
    contracts: pd.DataFrame,
    injuries: pd.DataFrame,
    term_map: pd.DataFrame,
    *,
    lookback_years: int | None = None,
    contract_year_col: str = "year",
) -> pd.DataFrame:
    """
    Build a one-row diagnostics table describing how well the inputs join.

    Reported: row counts and the share of rows carrying a player id on both tables,
    how many keyed contracts have any injury row for the same player, how many distinct
    injury terms appear in the term map, and (only when lookback_years is given) how
    many contracts have at least one injury season inside
    [year - lookback_years, year - 1]. Ratios are NaN when their denominator is 0.
    The input frames are copied, never modified.
    """
    def _ratio(numerator: int, denominator: int) -> float:
        return (numerator / denominator) if denominator else np.nan

    contracts_df = contracts.copy()
    injuries_df = injuries.copy()
    terms_df = term_map.copy()

    # Coerce the join keys / seasons to numbers and trim the term labels
    contracts_df["key_mlbam"] = pd.to_numeric(contracts_df.get("key_mlbam"), errors="coerce")
    contracts_df[contract_year_col] = pd.to_numeric(contracts_df.get(contract_year_col), errors="coerce")
    for numeric_col in ("mlbamid", "season"):
        injuries_df[numeric_col] = pd.to_numeric(injuries_df.get(numeric_col), errors="coerce")
    injuries_df["inj_norm_v6"] = injuries_df.get("inj_norm_v6").astype(str).str.strip()
    terms_df["inj_norm_v6"] = terms_df.get("inj_norm_v6").astype(str).str.strip()

    contract_has_key = contracts_df["key_mlbam"].notna()
    injury_has_key = injuries_df["mlbamid"].notna()

    total_contracts = int(len(contracts_df))
    keyed_contracts = int(contract_has_key.sum())
    total_injuries = int(len(injuries_df))
    keyed_injuries = int(injury_has_key.sum())

    # Term coverage: distinct non-blank injury terms vs. terms present in the map
    injury_terms = injuries_df["inj_norm_v6"].replace({"nan": np.nan}).dropna()
    injury_terms = injury_terms[injury_terms != ""].unique().tolist()
    known_terms = set(terms_df["inj_norm_v6"].replace({"nan": np.nan}).dropna().tolist())

    term_count = int(len(injury_terms))
    mapped_count = int(sum(1 for term in injury_terms if term in known_terms))

    # How many keyed contracts have at least one injury row for that player
    injured_players = set(injuries_df.loc[injury_has_key, "mlbamid"])
    matched_contracts = int(contracts_df.loc[contract_has_key, "key_mlbam"].isin(injured_players).sum())

    # How many contracts have at least one injury inside the lookback period
    windowed_contracts = np.nan
    if lookback_years is not None:
        eligible = contracts_df.loc[
            contract_has_key & contracts_df[contract_year_col].notna(),
            ["key_mlbam", contract_year_col],
        ].copy()
        eligible["_rid"] = np.arange(len(eligible), dtype=int)
        eligible["lb_start"] = eligible[contract_year_col] - int(lookback_years)
        eligible["lb_end"] = eligible[contract_year_col] - 1

        stints = injuries_df.loc[injury_has_key & injuries_df["season"].notna(), ["mlbamid", "season"]]
        joined = eligible.merge(stints, left_on="key_mlbam", right_on="mlbamid", how="left")

        hit = (
            joined["season"].notna()
            & joined["season"].ge(joined["lb_start"])
            & joined["season"].le(joined["lb_end"])
        )
        windowed_contracts = int(joined.loc[hit, "_rid"].nunique())

    row = {
        "contracts_rows": total_contracts,
        "contracts_with_key_mlbam": keyed_contracts,
        "contracts_with_key_mlbam_pct": _ratio(keyed_contracts, total_contracts),

        "injury_rows": total_injuries,
        "injury_rows_with_mlbamid": keyed_injuries,
        "injury_rows_with_mlbamid_pct": _ratio(keyed_injuries, total_injuries),

        "contracts_with_any_injury_match_on_mlbam": matched_contracts,
        "contracts_with_any_injury_match_on_mlbam_pct": _ratio(matched_contracts, keyed_contracts),

        "unique_injury_terms_in_injuries": term_count,
        "unique_injury_terms_mapped_in_term_map": mapped_count,
        "unique_injury_terms_mapped_pct": _ratio(mapped_count, term_count),

        "lookback_years_for_window_check": lookback_years,
        "contracts_with_injury_in_lookback_window": windowed_contracts,
    }
    return pd.DataFrame([row])


def build_audit_table(
    term_map: pd.DataFrame,
    hitter_w: Dict[str, float],
    pitcher_w: Dict[str, float],
) -> pd.DataFrame:
    """
    Term-level audit table.

    Documents how each injury term contributes to ISI: its anatomical bucket, severity
    tier and flags, the individual multipliers, and the resulting base and effective
    weights for hitters and pitchers.

    The weights are obtained through get_bucket_weight and the tier fallback mirrors
    severity_multiplier (tiers missing from SEVERITY_TIER_MULT are treated as tier 0),
    so the table reports the same numbers the ISI computation applies.

    Returns:
        One row per term-map row, sorted by bucket, severity_tier and inj_norm_v6, with
        columns inj_norm_v6, bucket, severity_tier, surgery_flag, structural_flag,
        tier_mult, surgery_mult, structural_mult, severity_mult, weight_hitter_base,
        weight_pitcher_base, weight_hitter_effective, weight_pitcher_effective.
    """
    tm = term_map.rename(columns={"anatomical_group": "bucket"}).copy()
    tm["bucket"] = tm["bucket"].map(standardize_bucket_name)

    for flag_col in ("severity_tier", "surgery_flag", "structural_flag"):
        tm[flag_col] = pd.to_numeric(tm[flag_col], errors="coerce").fillna(0).astype(int)

    # Severity mult (unknown tiers fall back to tier 0, as in severity_multiplier)
    def _tier_multiplier(tier: int) -> float:
        key = int(tier)
        if key not in SEVERITY_TIER_MULT:
            key = 0
        return float(SEVERITY_TIER_MULT.get(key, 1.00))

    tm["tier_mult"] = tm["severity_tier"].map(_tier_multiplier).astype(float)
    tm["surgery_mult"] = np.where(tm["surgery_flag"] == 1, SURGERY_BONUS_MULT, 1.00)
    tm["structural_mult"] = np.where(tm["structural_flag"] == 1, STRUCTURAL_BONUS_MULT, 1.00)

    tm["severity_mult"] = tm["tier_mult"] * tm["surgery_mult"] * tm["structural_mult"]

    # Base bucket weights, via the same lookup the ISI computation uses
    tm["weight_hitter_base"] = tm["bucket"].map(
        lambda bucket: get_bucket_weight(bucket, False, hitter_w, pitcher_w)
    ).astype(float)
    tm["weight_pitcher_base"] = tm["bucket"].map(
        lambda bucket: get_bucket_weight(bucket, True, hitter_w, pitcher_w)
    ).astype(float)

    tm["weight_hitter_effective"] = tm["weight_hitter_base"] * tm["severity_mult"]
    tm["weight_pitcher_effective"] = tm["weight_pitcher_base"] * tm["severity_mult"]

    # Final column ordering
    cols = [
        "inj_norm_v6",
        "bucket",
        "severity_tier",
        "surgery_flag",
        "structural_flag",
        "tier_mult",
        "surgery_mult",
        "structural_mult",
        "severity_mult",
        "weight_hitter_base",
        "weight_pitcher_base",
        "weight_hitter_effective",
        "weight_pitcher_effective",
    ]

    tm = tm[cols].sort_values(
        ["bucket", "severity_tier", "inj_norm_v6"],
        kind="stable"
    ).reset_index(drop=True)

    return tm


In [None]:
# This block applies the above functions

def main() -> None:
    """
    Run the lookback / recency parameter sweep and write its two CSV outputs.

    Steps:
      1. Read CONTRACTS_CSV, keep contracts whose term_start_year is between 2020 and
         2025 (inclusive) and use term_start_year as the ISI anchor year.
      2. Read INJURIES_CSV and the term map, and build the base bucket weights.
      3. For every (lookback_years, lambda_recency) pair in the grid, compute the ISI
         features, suffix the selected columns with the setting tag and merge them
         into one wide table keyed by _cid; collect a summary row per setting.
      4. Write the wide table and the summary next to OUT_CONTRACTS
         ("_SWEEP_WIDE.csv" / "_SWEEP_SUMMARY.csv") and print the summary.
    """
    # ---- load input ----
    contracts = pd.read_csv(CONTRACTS_CSV)

    # restrict to contracts whose first season is within the injury file
    contracts["term_start_year"] = pd.to_numeric(contracts["term_start_year"], errors="coerce")
    contracts = contracts.dropna(subset=["term_start_year"]).copy()
    start_year = contracts["term_start_year"]
    contracts = contracts[(start_year >= 2020) & (start_year <= 2025)].copy()

    # Data integrity: the ISI anchor year is the contract start year
    contracts["year"] = contracts["term_start_year"]

    injuries = pd.read_csv(INJURIES_CSV)
    term_map = load_term_map(TERM_MAP_PATH)

    # Baseline weights
    hitter_w, pitcher_w = build_bucket_weights()

    def suffix_df(df: pd.DataFrame, suffix: str, keep_cols: list[str]) -> pd.DataFrame:
        """Return a copy of df with "__<suffix>" appended to every column not in keep_cols."""
        renamed = {name: f"{name}__{suffix}" for name in df.columns if name not in keep_cols}
        return df.copy().rename(columns=renamed)

    def summarize_setting(df: pd.DataFrame, suffix: str) -> dict:
        """
        Diagnostics for one sweep setting, computed prior to any training.
        Statistics are only emitted for suffixed columns that exist in df.
        """
        def column(stem: str):
            return df.get(f"{stem}__{suffix}")

        def stat_or_nan(series: pd.Series, reducer) -> float:
            # Reduce the non-null values, or NaN when there are none
            return float(reducer(series.dropna())) if series.notna().any() else np.nan

        summary = {"setting": suffix}

        burden = column("injury_burden")
        if burden is not None:
            summary["injury_burden_nonnull_pct"] = float(burden.notna().mean())
            summary["injury_burden_mean"] = stat_or_nan(burden, lambda v: v.mean())
            summary["injury_burden_p90"] = stat_or_nan(burden, lambda v: v.quantile(0.90))
            summary["injury_burden_p99"] = stat_or_nan(burden, lambda v: v.quantile(0.99))

        rwd_rate = column("recency_weighted_days_rate")
        if rwd_rate is not None:
            summary["rwd_rate_mean"] = stat_or_nan(rwd_rate, lambda v: v.mean())

        durability = column("durability_days_rate")
        if durability is not None:
            summary["durability_rate_mean"] = stat_or_nan(durability, lambda v: v.mean())

        for stem, key in (
            ("any_surgery_flag", "any_surgery_rate"),
            ("any_tier3plus_flag", "any_tier3plus_rate"),
        ):
            flag = column(stem)
            if flag is not None:
                summary[key] = float((flag.fillna(0) > 0).mean())

        # Data integrity: correlation is only reported with at least 30 paired observations
        if "guarantee_total_real_2025" in df.columns and burden is not None:
            guarantee = pd.to_numeric(df["guarantee_total_real_2025"], errors="coerce")
            burden_num = pd.to_numeric(burden, errors="coerce")
            paired = guarantee.notna() & burden_num.notna()
            summary["corr_injury_burden_vs_guarantee"] = (
                float(burden_num[paired].corr(guarantee[paired])) if paired.sum() >= 30 else np.nan
            )

        return summary

    # PK shared by every sweep setting
    contracts = contracts.assign(_cid=np.arange(len(contracts), dtype=int))

    param_grid = [
        {"lookback_years": lookback, "lambda_recency": decay}
        for lookback in (3, 5)
        for decay in (0.35, 0.50, 0.70)
    ]

    # Contract columns carried into the wide table (when present)
    base_keep = [
        name
        for name in (
            "_cid",
            "player_name", "key_mlbam", "year", "position",
            "guarantee_total_real_2025",
            "term_start_year", "term_years",
        )
        if name in contracts.columns
    ]

    # ISI columns exported per setting (when present in the computed features)
    isi_wanted = [
        "_cid",
        "ISI_v1_1",
        "isi_D_days", "isi_N_stints",
        "recency_weighted_days", "recency_weighted_days_rate",
        "durability_days_rate", "injury_burden",
        "any_surgery_flag", "any_structural_flag", "any_tier3plus_flag",
        "arm_days", "arm_share_days",
        "isi_window_seasons_avail", "isi_full_window_flag",
    ]

    wide = contracts[base_keep].copy()
    summary_rows = []

    for setting in param_grid:
        lookback = int(setting["lookback_years"])
        decay = float(setting["lambda_recency"])
        tag = f"lb{lookback}_lam{str(decay).replace('.', 'p')}"

        features = compute_isi_features(
            contracts=contracts,
            injuries=injuries,
            term_map=term_map,
            hitter_w=hitter_w,
            pitcher_w=pitcher_w,
            lookback_years=lookback,
            lambda_recency=decay,
            denom_mode=DENOM_MODE,
            days_cap=DAYS_CAP,
            injury_to_contract_offset=INJURY_TO_CONTRACT_OFFSET,
            id_col="_cid",
        )

        present = [name for name in isi_wanted if name in features.columns]
        tagged = suffix_df(features[present].copy(), suffix=tag, keep_cols=["_cid"])

        wide = wide.merge(tagged, on="_cid", how="left")
        summary_rows.append(summarize_setting(wide, suffix=tag))

    summary_df = pd.DataFrame(summary_rows)

    # sweep outputs
    wide_path = OUT_CONTRACTS.replace(".csv", "_SWEEP_WIDE.csv")
    summary_path = OUT_CONTRACTS.replace(".csv", "_SWEEP_SUMMARY.csv")
    wide.to_csv(wide_path, index=False)
    summary_df.to_csv(summary_path, index=False)

    print("\nWrote sweep-wide:", wide_path)
    print("Wrote sweep-summary:", summary_path)
    print("\nSweep summary:")
    print(summary_df)

# Entry point: run the parameter sweep when this code is executed as the main module
if __name__ == "__main__":
    main()


Wrote sweep-wide: C:\Users\brend\Desktop\SABR_research_proposal\injuries\ISI Computations\ISI_update\contracts_with_isi_v2_SWEEP_WIDE.csv
Wrote sweep-summary: C:\Users\brend\Desktop\SABR_research_proposal\injuries\ISI Computations\ISI_update\contracts_with_isi_v2_SWEEP_SUMMARY.csv

Sweep summary:
       setting  injury_burden_nonnull_pct  injury_burden_mean  \
0  lb3_lam0p35                   0.845622            0.018077   
1   lb3_lam0p5                   0.845622            0.013315   
2   lb3_lam0p7                   0.845622            0.008972   
3  lb5_lam0p35                   0.845622            0.011712   
4   lb5_lam0p5                   0.845622            0.008439   
5   lb5_lam0p7                   0.845622            0.005574   

   injury_burden_p90  injury_burden_p99  rwd_rate_mean  durability_rate_mean  \
0           0.040412           0.229917       0.019190              0.051868   
1           0.029746           0.175212       0.013930              0.051868   
2    

In [None]:
# Coverage report
# NOTE(review): every statement in this cell is indented one level and reads contracts,
# injuries, term_map, hitter_w and pitcher_w, which the visible code only defines as
# locals inside main(). Presumably this is the tail of an earlier main() that was split
# into its own cell; as written it cannot run on its own — confirm, then either re-join
# it with main() or wrap it in a function that receives those frames.
# NOTE(review): build_coverage_report is called without lookback_years here, so the
# "contracts_with_injury_in_lookback_window" column of the report stays NaN.
    coverage = build_coverage_report(contracts, injuries, term_map)
    coverage.to_csv(OUT_COVERAGE, index=False)

    # Compute ISI (severity-enabled, updated)
    # Uses the single configured setting (LOOKBACK_YEARS / RECENCY_LAMBDA), not the sweep grid
    out = compute_isi_features(
        contracts=contracts,
        injuries=injuries,
        term_map=term_map,
        hitter_w=hitter_w,
        pitcher_w=pitcher_w,
        lookback_years=LOOKBACK_YEARS,
        lambda_recency=RECENCY_LAMBDA,
        denom_mode=DENOM_MODE,
        days_cap=DAYS_CAP,
        injury_to_contract_offset=INJURY_TO_CONTRACT_OFFSET,
    )

    # Export main output + audit table
    out.to_csv(OUT_CONTRACTS, index=False)
    audit = build_audit_table(term_map, hitter_w, pitcher_w)
    audit.to_csv(OUT_AUDIT_TABLE, index=False)

    print("\nWrote:", OUT_CONTRACTS)
    print("Wrote:", OUT_AUDIT_TABLE)
    print("Wrote:", OUT_COVERAGE)

    # top 15 by injury burden, allows to quickly find injuries
    # Only the display columns that exist in the output are shown
    cols_show = [
        "player_name", "year", "position",
        "guarantee_total_real_2025",
        "injury_burden",
        "recency_weighted_days", "recency_weighted_days_rate",
        "durability_days_rate", "isi_D_days", "isi_N_stints",
        "any_surgery_flag", "any_tier3plus_flag", "arm_share_days",
        "isi_window_seasons_avail", "isi_full_window_flag",
    ]
    cols_show = [c for c in cols_show if c in out.columns]
    # Rows with a missing injury_burden (no usable lookback window) are excluded from the ranking
    top = out.dropna(subset=["injury_burden"]).sort_values("injury_burden", ascending=False).head(15)
    print("\nTop 15 injury burden contracts:")
    print(top[cols_show].to_string(index=False))


# NOTE(review): this calls the main() defined in the previous cell, i.e. it re-runs the
# parameter sweep; it does not execute the export statements above.
if __name__ == "__main__":
    main()
