"""
Estimate_fans_fast.py

Reconstruct latent fan vote shares F_it for DWTS using:
- softmax utility model
- inequality-based likelihood from elimination events

This is an optimized version that avoids pandas operations
inside the likelihood, so SciPy optimization is much faster.
"""

import re
from dataclasses import dataclass

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import expit  # sigmoid


# ============================================================
# 1. Load DWTS data
# ============================================================

# Path to the raw contestant-level CSV, resolved against the current working
# directory.
DATA_PATH = "2026_MCM_Problem_C_Data.csv"  # change if needed

# The rest of this script reads the 'season', 'celebrity_name' and 'results'
# columns plus every 'weekX_judgeY_score' column from this frame.
df_raw = pd.read_csv(DATA_PATH)
print("Number of rows (contestant-season):", len(df_raw))


# ============================================================
# 2. Helpers: parse result strings, find week columns
# ============================================================

def parse_elim_week(result_str: str):
    """
    Extract the elimination week from a 'results' string.

    Returns the integer following 'Eliminated Week', or None when the value
    is not a string (e.g. NaN) or does not contain that phrase.

    Examples:
        'Eliminated Week 3' -> 3
        '1st Place'         -> None
        '2nd Place'         -> None
    """
    found = (
        re.search(r"Eliminated Week\s+(\d+)", result_str)
        if isinstance(result_str, str)
        else None
    )
    return int(found.group(1)) if found else None


def collect_week_judge_cols(columns):
    """
    From all column names, find 'weekX_judgeY_score' columns.

    Args:
        columns: iterable of column labels. Labels that are not strings are
            ignored instead of raising inside the regex match.

    Returns:
        dict week -> list of matching column names for that week, ordered by
        judge number (numerically, so judge2 comes before judge10). Weeks
        appear in the order their first column is encountered.
    """
    pattern = re.compile(r"week(\d+)_judge(\d+)_score")
    keyed = {}
    for col in columns:
        if not isinstance(col, str):
            continue
        m = pattern.match(col)
        if m:
            week, judge = int(m.group(1)), int(m.group(2))
            keyed.setdefault(week, []).append((judge, col))
    # Sort on (judge number, name): a plain string sort would place
    # 'judge10' ahead of 'judge2'.
    return {week: [col for _, col in sorted(pairs)] for week, pairs in keyed.items()}


# Map each week number to its judge-score columns.
week_to_cols = collect_week_judge_cols(df_raw.columns)
print("Detected weeks:", sorted(week_to_cols.keys()))


# ============================================================
# 3. Build long panel: one row per (season, week, contestant)
# ============================================================

rows = []

for _, contestant in df_raw.iterrows():
    season = int(contestant["season"])
    celeb = contestant["celebrity_name"]
    elim_week = parse_elim_week(contestant["results"])

    for week, cols in week_to_cols.items():
        scores = contestant[cols].values.astype(float)

        # No judge score at all → contestant not active / show not started
        if np.isnan(scores).all():
            continue

        # Weeks after the parsed elimination week are dropped. A results
        # string that does not parse (elim_week is None) never drops a week,
        # so such contestants stay in the panel for every scored week.
        if elim_week is not None and week > elim_week:
            continue

        rows.append(
            {
                "season": season,
                "celebrity_name": celeb,
                "week": week,
                "judge_total": np.nansum(scores),
                "elim_week": elim_week,
                "eliminated": int(elim_week is not None and week == elim_week),
            }
        )

panel = pd.DataFrame(rows)
print("Panel shape:", panel.shape)
print(panel.head())


# ============================================================
# 4. Standardize judge scores within (season, week)
# ============================================================

def _zscore_within_group(totals):
    """Centre a group's judge totals and scale by its population std."""
    centered = totals - totals.mean()
    return centered / totals.std(ddof=0)


# A group whose totals are all equal has zero std, which gives NaN z-scores;
# those are replaced by 0.0.
_judge_by_week = panel.groupby(["season", "week"])["judge_total"]
panel["judge_std"] = _judge_by_week.transform(_zscore_within_group)
panel["judge_std"] = panel["judge_std"].fillna(0.0)


# ============================================================
# 5. Encode contestant IDs and weeks, precompute arrays
# ============================================================

# One integer id per (season, celebrity_name) pair.
panel["contestant_id"] = panel.groupby(["season", "celebrity_name"]).ngroup()
panel["week_float"] = panel["week"].astype(float)

n_contestants = panel["contestant_id"].max() + 1
print("Number of contestant-season units:", n_contestants)

# Plain numpy arrays in panel row order, so the likelihood never touches pandas.
cid_arr = panel["contestant_id"].to_numpy()
week_arr = panel["week_float"].to_numpy()
judge_total_arr = panel["judge_total"].to_numpy()
judge_std_arr = panel["judge_std"].to_numpy()

season_arr = panel["season"].to_numpy()
elim_arr = panel["eliminated"].to_numpy()


# ============================================================
# 6. Build elimination events with row indices (no pandas in likelihood)
# ============================================================

# Each event records the panel rows of one (season, week) and the position,
# within those rows, of the single contestant eliminated that week.
events = []

for _, week_rows in panel.groupby(["season", "week"]):
    is_out = week_rows["eliminated"].to_numpy() == 1
    if is_out.sum() != 1:
        continue  # skip weeks with 0 or >1 eliminations

    events.append(
        {
            "row_idx": week_rows.index.to_numpy(),  # indices into panel
            "elim_pos": int(np.flatnonzero(is_out)[0]),
        }
    )

print("Number of usable elimination events:", len(events))


# ============================================================
# 7. Model config and parameter unpacking
# ============================================================

@dataclass
class ModelConfig:
    """Settings for the soft elimination-constraint likelihood."""

    # Multiplier applied to score differences before the sigmoid in
    # neg_log_likelihood; larger values make the penalty sharper.
    alpha_soft: float = 10.0  # steepness of sigmoid constraint
    # When True, theta carries one extra slot (beta_week) and compute_eta
    # adds beta_week * week to the utility.
    use_week_effect: bool = True


config = ModelConfig()

# θ = [a_0,...,a_{n-1}, β_T, β_week(optional)]
n_params = n_contestants + (2 if config.use_week_effect else 1)


def unpack_theta(theta):
    """
    Split the flat parameter vector into (a, beta_T, beta_week).

    The first n_contestants entries are the per-contestant intercepts, the
    next entry is beta_T, and the one after that is beta_week. beta_week is
    reported as 0.0 when the week effect is disabled in config.
    """
    beta_week = theta[n_contestants + 1] if config.use_week_effect else 0.0
    return theta[:n_contestants], theta[n_contestants], beta_week


def compute_eta(theta):
    """
    Evaluate the linear utility for every panel row:

        η_{it} = a_i + β_T * judge_std + β_week * week

    Returns a numpy array with one entry per panel row.

    NOTE(review): week is constant within a (season, week) group, and this
    file only passes η through softmax within such groups, so the β_week term
    shifts a whole group equally and cancels. Confirm whether β_week is meant
    to be identifiable.
    """
    a, beta_T, beta_week = unpack_theta(theta)
    base = a[cid_arr] + beta_T * judge_std_arr
    if not config.use_week_effect:
        return base
    return base + beta_week * week_arr


def softmax(x):
    """Softmax of x; the maximum is subtracted first so exp cannot overflow."""
    weights = np.exp(x - np.max(x))
    return weights / weights.sum()


# ============================================================
# 8. Negative log-likelihood using inequality constraints
# ============================================================

def neg_log_likelihood(theta):
    """
    Negative soft log-likelihood of the observed eliminations.

    For each elimination event:
        - compute fan shares via softmax(η) among that week's contestants
        - form total score S_i = J_pct_i + F_i (percentage scheme)
        - penalize if eliminated contestant does NOT have the lowest S_i
    The penalty is sum of log(sigmoid(alpha * (S_j - S_elim))) over all
    survivors j, with alpha = config.alpha_soft.

    If a week's judge totals sum to zero, judge percentages fall back to a
    uniform split (as pct_sum_elim does) instead of dividing by zero and
    returning NaN to the optimizer.

    NOTE(review): the percentage scheme is applied to every season here,
    while the evaluation script in this file scores seasons 1, 2 and 28-34
    with the rank-sum rule. Confirm this is intended.

    Args:
        theta: flat parameter vector, laid out as described in unpack_theta.

    Returns:
        Scalar negative log-likelihood (smaller is better).
    """
    eta_all = compute_eta(theta)
    loglike = 0.0
    eps = 1e-12  # keeps log() finite if a sigmoid underflows to 0

    for ev in events:
        idx = ev["row_idx"]          # indices of alive contestants this week
        elim_pos = ev["elim_pos"]    # which position is eliminated

        F_week = softmax(eta_all[idx])   # fan shares among alive contestants

        J_week = judge_total_arr[idx]
        J_sum = J_week.sum()
        if J_sum > 0:
            J_pct = J_week / J_sum
        else:
            # Degenerate week (all judge totals zero): no judge signal.
            J_pct = np.full(len(J_week), 1.0 / len(J_week))

        S = J_pct + F_week

        # Survivors' scores minus the eliminated score; we want these >= 0.
        diff = np.delete(S, elim_pos) - S[elim_pos]
        loglike += np.sum(np.log(expit(config.alpha_soft * diff) + eps))

    return -loglike  # minimizer wants negative log-likelihood


# ============================================================
# 9. Fit the model with SciPy
# ============================================================

# Initial guess: a_i = 0, β_T = 1, β_week = 0
theta0 = np.zeros(n_params)
theta0[n_contestants] = 1.0  # β_T

result = minimize(
    neg_log_likelihood,
    theta0,
    method="L-BFGS-B",
    options={"maxiter": 80},  # you can increase if you want
)

print("Optimization success:", result.success)
print("Message:", result.message)
print("Final negative log-likelihood:", result.fun)

theta_hat = result.x
a_hat, beta_T_hat, beta_week_hat = unpack_theta(theta_hat)
print("beta_T (effect of judge_std):", beta_T_hat)
print("beta_week:", beta_week_hat)


# ============================================================
# 10. Reconstruct fan shares F_hat for each (season, week, contestant)
# ============================================================

eta_hat = compute_eta(theta_hat)

# Fan shares are a softmax of the fitted utilities within each (season, week),
# so they sum to 1 across the contestants present in that week.
fan_rows = []
for _, week_rows in panel.groupby(["season", "week"]):
    positions = week_rows.index.to_numpy()
    shares = softmax(eta_hat[positions])
    fan_rows.extend(
        {"row_index": pos, "fan_share_hat": share}
        for pos, share in zip(positions, shares)
    )

# Index by panel row so the assignment below aligns on the panel index.
F_df = pd.DataFrame(fan_rows).set_index("row_index")
panel["fan_share_hat"] = F_df["fan_share_hat"]

print(panel.head())

# Optionally save the reconstructed panel for later questions
panel.to_csv("fan_shares_estimated.csv", index=False)
print("Saved fan_shares_estimated.csv")




In [None]:
"""
Estimate_fans_fast.py

Reconstruct latent fan vote shares F_it for DWTS using:
- softmax utility model
- inequality-based likelihood from elimination events

This is an optimized version that avoids pandas operations
inside the likelihood, so SciPy optimization is much faster.
"""

import re
from dataclasses import dataclass

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import expit  # sigmoid


# ============================================================
# 1. Load DWTS data
# ============================================================

# Path to the raw contestant-level CSV, resolved against the current working
# directory.
DATA_PATH = "2026_MCM_Problem_C_Data.csv"  # change if needed

# The rest of this script reads the 'season', 'celebrity_name' and 'results'
# columns plus every 'weekX_judgeY_score' column from this frame.
df_raw = pd.read_csv(DATA_PATH)
print("Number of rows (contestant-season):", len(df_raw))


# ============================================================
# 2. Helpers: parse result strings, find week columns
# ============================================================

def parse_elim_week(result_str: str):
    """
    Extract the elimination week from a 'results' string.

    Returns the integer following 'Eliminated Week', or None when the value
    is not a string (e.g. NaN) or does not contain that phrase.

    Examples:
        'Eliminated Week 3' -> 3
        '1st Place'         -> None
        '2nd Place'         -> None
    """
    found = (
        re.search(r"Eliminated Week\s+(\d+)", result_str)
        if isinstance(result_str, str)
        else None
    )
    return int(found.group(1)) if found else None


def collect_week_judge_cols(columns):
    """
    From all column names, find 'weekX_judgeY_score' columns.

    Args:
        columns: iterable of column labels. Labels that are not strings are
            ignored instead of raising inside the regex match.

    Returns:
        dict week -> list of matching column names for that week, ordered by
        judge number (numerically, so judge2 comes before judge10). Weeks
        appear in the order their first column is encountered.
    """
    pattern = re.compile(r"week(\d+)_judge(\d+)_score")
    keyed = {}
    for col in columns:
        if not isinstance(col, str):
            continue
        m = pattern.match(col)
        if m:
            week, judge = int(m.group(1)), int(m.group(2))
            keyed.setdefault(week, []).append((judge, col))
    # Sort on (judge number, name): a plain string sort would place
    # 'judge10' ahead of 'judge2'.
    return {week: [col for _, col in sorted(pairs)] for week, pairs in keyed.items()}


# Map each week number to its judge-score columns.
week_to_cols = collect_week_judge_cols(df_raw.columns)
print("Detected weeks:", sorted(week_to_cols.keys()))


# ============================================================
# 3. Build long panel: one row per (season, week, contestant)
# ============================================================

rows = []

for _, contestant in df_raw.iterrows():
    season = int(contestant["season"])
    celeb = contestant["celebrity_name"]
    elim_week = parse_elim_week(contestant["results"])

    for week, cols in week_to_cols.items():
        scores = contestant[cols].values.astype(float)

        # No judge score at all → contestant not active / show not started
        if np.isnan(scores).all():
            continue

        # Weeks after the parsed elimination week are dropped. A results
        # string that does not parse (elim_week is None) never drops a week,
        # so such contestants stay in the panel for every scored week.
        if elim_week is not None and week > elim_week:
            continue

        rows.append(
            {
                "season": season,
                "celebrity_name": celeb,
                "week": week,
                "judge_total": np.nansum(scores),
                "elim_week": elim_week,
                "eliminated": int(elim_week is not None and week == elim_week),
            }
        )

panel = pd.DataFrame(rows)
print("Panel shape:", panel.shape)
print(panel.head())


# ============================================================
# 4. Standardize judge scores within (season, week)
# ============================================================

def _zscore_within_group(totals):
    """Centre a group's judge totals and scale by its population std."""
    centered = totals - totals.mean()
    return centered / totals.std(ddof=0)


# A group whose totals are all equal has zero std, which gives NaN z-scores;
# those are replaced by 0.0.
_judge_by_week = panel.groupby(["season", "week"])["judge_total"]
panel["judge_std"] = _judge_by_week.transform(_zscore_within_group)
panel["judge_std"] = panel["judge_std"].fillna(0.0)


# ============================================================
# 5. Encode contestant IDs and weeks, precompute arrays
# ============================================================

# One integer id per (season, celebrity_name) pair.
panel["contestant_id"] = panel.groupby(["season", "celebrity_name"]).ngroup()
panel["week_float"] = panel["week"].astype(float)

n_contestants = panel["contestant_id"].max() + 1
print("Number of contestant-season units:", n_contestants)

# Plain numpy arrays in panel row order, so the likelihood never touches pandas.
cid_arr = panel["contestant_id"].to_numpy()
week_arr = panel["week_float"].to_numpy()
judge_total_arr = panel["judge_total"].to_numpy()
judge_std_arr = panel["judge_std"].to_numpy()

season_arr = panel["season"].to_numpy()
elim_arr = panel["eliminated"].to_numpy()


# ============================================================
# 6. Build elimination events with row indices (no pandas in likelihood)
# ============================================================

# Each event records the panel rows of one (season, week) and the position,
# within those rows, of the single contestant eliminated that week.
events = []

for _, week_rows in panel.groupby(["season", "week"]):
    is_out = week_rows["eliminated"].to_numpy() == 1
    if is_out.sum() != 1:
        continue  # skip weeks with 0 or >1 eliminations

    events.append(
        {
            "row_idx": week_rows.index.to_numpy(),  # indices into panel
            "elim_pos": int(np.flatnonzero(is_out)[0]),
        }
    )

print("Number of usable elimination events:", len(events))


# ============================================================
# 7. Model config and parameter unpacking
# ============================================================

@dataclass
class ModelConfig:
    """Settings for the soft elimination-constraint likelihood."""

    # Multiplier applied to score differences before the sigmoid in
    # neg_log_likelihood; larger values make the penalty sharper.
    alpha_soft: float = 10.0  # steepness of sigmoid constraint
    # When True, theta carries one extra slot (beta_week) and compute_eta
    # adds beta_week * week to the utility.
    use_week_effect: bool = True


config = ModelConfig()

# θ = [a_0,...,a_{n-1}, β_T, β_week(optional)]
n_params = n_contestants + (2 if config.use_week_effect else 1)


def unpack_theta(theta):
    """
    Split the flat parameter vector into (a, beta_T, beta_week).

    The first n_contestants entries are the per-contestant intercepts, the
    next entry is beta_T, and the one after that is beta_week. beta_week is
    reported as 0.0 when the week effect is disabled in config.
    """
    beta_week = theta[n_contestants + 1] if config.use_week_effect else 0.0
    return theta[:n_contestants], theta[n_contestants], beta_week


def compute_eta(theta):
    """
    Evaluate the linear utility for every panel row:

        η_{it} = a_i + β_T * judge_std + β_week * week

    Returns a numpy array with one entry per panel row.

    NOTE(review): week is constant within a (season, week) group, and this
    file only passes η through softmax within such groups, so the β_week term
    shifts a whole group equally and cancels. Confirm whether β_week is meant
    to be identifiable.
    """
    a, beta_T, beta_week = unpack_theta(theta)
    base = a[cid_arr] + beta_T * judge_std_arr
    if not config.use_week_effect:
        return base
    return base + beta_week * week_arr


def softmax(x):
    """Softmax of x; the maximum is subtracted first so exp cannot overflow."""
    weights = np.exp(x - np.max(x))
    return weights / weights.sum()


# ============================================================
# 8. Negative log-likelihood using inequality constraints
# ============================================================

def neg_log_likelihood(theta):
    """
    Negative soft log-likelihood of the observed eliminations.

    For each elimination event:
        - compute fan shares via softmax(η) among that week's contestants
        - form total score S_i = J_pct_i + F_i (percentage scheme)
        - penalize if eliminated contestant does NOT have the lowest S_i
    The penalty is sum of log(sigmoid(alpha * (S_j - S_elim))) over all
    survivors j, with alpha = config.alpha_soft.

    If a week's judge totals sum to zero, judge percentages fall back to a
    uniform split (as pct_sum_elim does) instead of dividing by zero and
    returning NaN to the optimizer.

    NOTE(review): the percentage scheme is applied to every season here,
    while the evaluation script in this file scores seasons 1, 2 and 28-34
    with the rank-sum rule. Confirm this is intended.

    Args:
        theta: flat parameter vector, laid out as described in unpack_theta.

    Returns:
        Scalar negative log-likelihood (smaller is better).
    """
    eta_all = compute_eta(theta)
    loglike = 0.0
    eps = 1e-12  # keeps log() finite if a sigmoid underflows to 0

    for ev in events:
        idx = ev["row_idx"]          # indices of alive contestants this week
        elim_pos = ev["elim_pos"]    # which position is eliminated

        F_week = softmax(eta_all[idx])   # fan shares among alive contestants

        J_week = judge_total_arr[idx]
        J_sum = J_week.sum()
        if J_sum > 0:
            J_pct = J_week / J_sum
        else:
            # Degenerate week (all judge totals zero): no judge signal.
            J_pct = np.full(len(J_week), 1.0 / len(J_week))

        S = J_pct + F_week

        # Survivors' scores minus the eliminated score; we want these >= 0.
        diff = np.delete(S, elim_pos) - S[elim_pos]
        loglike += np.sum(np.log(expit(config.alpha_soft * diff) + eps))

    return -loglike  # minimizer wants negative log-likelihood


# ============================================================
# 9. Fit the model with SciPy
# ============================================================

# Initial guess: a_i = 0, β_T = 1, β_week = 0
theta0 = np.zeros(n_params)
theta0[n_contestants] = 1.0  # β_T

# No `jac` is passed, so the optimizer gets no analytic gradient.
# NOTE(review): presumably SciPy then estimates it numerically, costing about
# one objective call per parameter per gradient — confirm against the SciPy docs.
result = minimize(
    neg_log_likelihood,
    theta0,
    method="L-BFGS-B",
    options={"maxiter": 80},  # you can increase if you want
)

# Check these before trusting theta_hat: with the iteration cap above the
# run may stop before converging.
print("Optimization success:", result.success)
print("Message:", result.message)
print("Final negative log-likelihood:", result.fun)

theta_hat = result.x
a_hat, beta_T_hat, beta_week_hat = unpack_theta(theta_hat)
print("beta_T (effect of judge_std):", beta_T_hat)
print("beta_week:", beta_week_hat)


# ============================================================
# 10. Reconstruct fan shares F_hat for each (season, week, contestant)
# ============================================================

eta_hat = compute_eta(theta_hat)

# Fan shares are a softmax of the fitted utilities within each (season, week),
# so they sum to 1 across the contestants present in that week.
fan_rows = []
for _, week_rows in panel.groupby(["season", "week"]):
    positions = week_rows.index.to_numpy()
    shares = softmax(eta_hat[positions])
    fan_rows.extend(
        {"row_index": pos, "fan_share_hat": share}
        for pos, share in zip(positions, shares)
    )

# Index by panel row so the assignment below aligns on the panel index.
F_df = pd.DataFrame(fan_rows).set_index("row_index")
panel["fan_share_hat"] = F_df["fan_share_hat"]

print(panel.head())

# Optionally save the reconstructed panel for later questions
panel.to_csv("fan_shares_estimated.csv", index=False)
print("Saved fan_shares_estimated.csv")




## Consistency of Reconstructed Fan Votes with Historical Eliminations

This script evaluates how well the reconstructed fan vote shares reproduce the actual elimination outcomes across all seasons.

Using the file `fan_shares_estimated.csv` (which contains judges’ totals, reconstructed fan shares, and elimination indicators), we:

- Restrict to weeks with **exactly one eliminated contestant** and at least two active contestants.
- Simulate eliminations under two combination rules:
  - **Rank-Sum** (used in seasons 1–2 and 28–34):  
    Rank contestants separately by judges’ total and fan share (1 = best), sum the ranks, and eliminate the contestant with the **largest** combined rank.
  - **Percent-Sum** (used in seasons 3–27):  
    Convert judges’ totals to weekly percentages, add reconstructed fan share percentages, and eliminate the contestant with the **lowest** combined percentage.
- For each elimination event, compute:
  - **Hit rate**: whether the predicted eliminated contestant matches the actual one.
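  - **Satisfaction rate**: the fraction of the other contestants in that week who are at least as “deserving to stay” as the eliminated contestant under the chosen rule (ties count as satisfied); a value of 1 means the eliminated contestant was the weakest.
  - **Worst rank / percentile**: how poorly the eliminated contestant ranks in that week under the rule. The worst rank counts from the bottom (1 = worst), and the percentile is the worst rank divided by the number of active contestants, so smaller values mean closer to the bottom.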

The script outputs:
- `verification_event_level.csv`: event-level diagnostics per (season, week).
- `verification_season_level.csv`: season-level summaries of accuracy and consistency.


In [None]:
"""
consistency_evaluation.py

Evaluate how consistent the reconstructed fan shares are with the
actual DWTS elimination outcomes under different scoring schemes.

Input:
    EST_PATH (CSV): panel with reconstructed fan shares, expected columns:
        - season
        - week
        - celebrity_name
        - judge_total
        - eliminated  (1 if eliminated in that week, else 0)
        - fan_share_hat

Output:
    verification_event_level.csv   : event-level metrics for each (season, week)
    verification_season_level.csv  : season-level summary metrics

Scoring schemes:
    - Rank-Sum (seasons 1, 2, 28–34)
    - Percent-Sum (seasons 3–27)
"""

import numpy as np
import pandas as pd

# ======================================================================
# Paths (change EST_PATH if needed)
# ======================================================================
# Relative path matching the file name the estimation script writes
# ("fan_shares_estimated.csv" in the working directory), rather than an
# absolute path that only exists on one hosted-notebook layout.
EST_PATH = "fan_shares_estimated.csv"
OUT_EVENT_PATH = "verification_event_level.csv"
OUT_SEASON_PATH = "verification_season_level.csv"

# Seasons using Rank-Sum combination in the actual show (1, 2 and 28–34);
# every other season is evaluated with the Percent-Sum rule.
RANK_SUM_SEASONS = {1, 2, *range(28, 35)}


def load_panel(path: str) -> pd.DataFrame:
    """
    Read the reconstructed panel CSV at `path`.

    Raises:
        ValueError: if any of the columns the evaluation relies on
            (season, week, celebrity_name, judge_total, eliminated,
            fan_share_hat) is absent.
    """
    frame = pd.read_csv(path)

    absent = {
        "season",
        "week",
        "celebrity_name",
        "judge_total",
        "eliminated",
        "fan_share_hat",
    }.difference(frame.columns)
    if not absent:
        return frame

    raise ValueError(f"Missing columns in estimated panel: {absent}")


def get_usable_events(df: pd.DataFrame):
    """
    Select the (season, week) groups that can be evaluated.

    A group is usable when it has at least two rows and its 'eliminated'
    column sums to exactly one. Prints the number of usable groups.

    Returns:
        (grouped, usable_keys): the season/week GroupBy object and the list
        of usable (season, week) keys in sorted group order.
    """
    grouped = df.groupby(["season", "week"], sort=True)
    usable_keys = [
        key
        for key, members in grouped
        if len(members) >= 2 and members["eliminated"].sum() == 1
    ]

    print("Usable elimination events:", len(usable_keys))
    return grouped, usable_keys


def rank_sum_elim(sub: pd.DataFrame):
    """
    Evaluate one elimination event under the Rank-Sum rule.

    Contestants are ranked separately on judge_total and on fan_share_hat
    (rank 1 = highest value, ties share the smallest rank). The two ranks
    are added, and the largest sum is predicted to be eliminated (first
    such contestant on ties).

    Returns a tuple:
        pred_name   : name of the predicted eliminated contestant
        act_name    : name of the first row flagged eliminated == 1
        sat_rate    : share of the other contestants whose combined rank is
                      <= the eliminated contestant's (1.0 when the eliminated
                      contestant is the worst)
        margin      : eliminated combined rank minus the largest combined
                      rank among the others (>= 0 if eliminated is worst)
        worst_rank  : position of the eliminated contestant counting from
                      the worst combined rank (1 = worst)
        n_alive     : number of rows in the event
    """
    week = sub.reset_index(drop=True)
    names = week["celebrity_name"]

    judge = week["judge_total"].to_numpy(dtype=float)
    fan = week["fan_share_hat"].to_numpy(dtype=float)

    # Re-normalise fan shares within the week; fall back to a uniform split
    # when the total is not positive.
    fan_total = fan.sum()
    if fan_total > 0:
        fan = fan / fan_total
    else:
        fan = np.ones_like(fan) / len(fan)

    # Negating the values makes the largest value receive rank 1.
    combined = (
        pd.Series(-judge).rank(method="min").to_numpy()
        + pd.Series(-fan).rank(method="min").to_numpy()
    )

    predicted = int(np.argmax(combined))
    actual = int(np.flatnonzero(week["eliminated"].to_numpy() == 1)[0])

    rivals = np.delete(combined, actual)
    sat_rate = float(np.mean(rivals <= combined[actual]))
    margin = float(combined[actual] - np.max(rivals))

    worst_first = pd.Series(combined).rank(method="min", ascending=False)
    worst_rank = float(worst_first.iloc[actual])

    return names[predicted], names[actual], sat_rate, margin, worst_rank, len(week)


def pct_sum_elim(sub: pd.DataFrame):
    """
    Evaluate one elimination event under the Percent-Sum rule.

    Judge totals and fan shares are each normalised to sum to 1 within the
    week (uniform when a total is not positive) and added. The smallest
    combined score is predicted to be eliminated (first such contestant on
    ties).

    Returns a tuple:
        pred_name   : name of the predicted eliminated contestant
        act_name    : name of the first row flagged eliminated == 1
        sat_rate    : share of the other contestants whose combined score is
                      >= the eliminated contestant's (1.0 when the eliminated
                      contestant is the worst)
        margin      : smallest combined score among the others minus the
                      eliminated score (>= 0 if eliminated is worst)
        worst_rank  : position of the eliminated contestant counting from
                      the lowest combined score (1 = worst)
        n_alive     : number of rows in the event
    """
    week = sub.reset_index(drop=True)
    names = week["celebrity_name"]

    judge = week["judge_total"].to_numpy(dtype=float)
    fan = week["fan_share_hat"].to_numpy(dtype=float)

    fan_total = fan.sum()
    if fan_total > 0:
        fan = fan / fan_total
    else:
        fan = np.ones_like(fan) / len(fan)

    judge_total = judge.sum()
    if judge_total > 0:
        judge_share = judge / judge_total
    else:
        judge_share = np.ones_like(judge) / len(judge)

    score = judge_share + fan  # larger = better

    predicted = int(np.argmin(score))
    actual = int(np.flatnonzero(week["eliminated"].to_numpy() == 1)[0])

    rivals = np.delete(score, actual)
    sat_rate = float(np.mean(rivals >= score[actual]))
    margin = float(np.min(rivals) - score[actual])

    worst_first = pd.Series(score).rank(method="min", ascending=True)
    worst_rank = float(worst_first.iloc[actual])

    return names[predicted], names[actual], sat_rate, margin, worst_rank, len(week)


def main():
    """
    Run the consistency evaluation end to end.

    Loads the panel from EST_PATH, scores every usable elimination event with
    the rule assigned to its season (rank-sum for RANK_SUM_SEASONS,
    percent-sum otherwise), prints overall / per-scheme summaries and writes
    the event-level and season-level tables to OUT_EVENT_PATH and
    OUT_SEASON_PATH.

    'actual_worst_percentile' is worst_rank / n_alive with worst_rank
    counted from the bottom (1 = worst), so smaller values mean the
    eliminated contestant sat closer to the bottom of the field.

    Returns:
        (ev, season_summary): the event-level and season-level DataFrames,
        so callers (e.g. plotting code) can reuse them without re-reading
        the CSVs. Two empty DataFrames are returned, and nothing is written,
        when the panel contains no usable events.
    """
    # Load panel and filter usable events
    df = load_panel(EST_PATH)
    g, usable_keys = get_usable_events(df)

    # Run event-level evaluation
    event_rows = []
    for (s, w) in usable_keys:
        sub = g.get_group((s, w))
        s_int = int(s)

        if s_int in RANK_SUM_SEASONS:
            scheme = "rank_sum"
            pred, act, sat, margin, worst_rank, n_alive = rank_sum_elim(sub)
        else:
            scheme = "pct_sum"
            pred, act, sat, margin, worst_rank, n_alive = pct_sum_elim(sub)

        event_rows.append(
            {
                "season": s_int,
                "week": int(w),
                "scheme": scheme,
                "n_alive": n_alive,
                "pred_eliminated": pred,
                "actual_eliminated": act,
                "hit": int(pred == act),
                "sat_rate": sat,
                "margin": margin,
                "actual_worst_rank(1=worst)": worst_rank,
                "actual_worst_percentile": worst_rank / n_alive,
            }
        )

    # An empty list would yield a column-less frame and a KeyError on the
    # first metric lookup below, so stop here with a clear message instead.
    if not event_rows:
        print("No usable elimination events found; nothing to evaluate.")
        return pd.DataFrame(), pd.DataFrame()

    ev = pd.DataFrame(event_rows)

    # Overall & by-scheme metrics
    print("\n=== Overall consistency ===")
    print("Events:", len(ev))
    print("Accuracy (hit rate):", ev["hit"].mean())
    print("Avg sat_rate:", ev["sat_rate"].mean())
    print("Median worst_rank:", ev["actual_worst_rank(1=worst)"].median())
    print("Avg worst_percentile:", ev["actual_worst_percentile"].mean())

    print("\n=== By scheme ===")
    print(
        ev.groupby("scheme").agg(
            events=("hit", "size"),
            accuracy=("hit", "mean"),
            avg_sat=("sat_rate", "mean"),
            median_worst_rank=("actual_worst_rank(1=worst)", "median"),
            avg_worst_percentile=("actual_worst_percentile", "mean"),
            median_margin=("margin", "median"),
        )
    )

    # Season-level consistency (only over usable elimination weeks),
    # least accurate seasons first.
    season_summary = (
        ev.groupby("season")
        .agg(
            events=("hit", "size"),
            accuracy=("hit", "mean"),
            avg_sat=("sat_rate", "mean"),
            avg_worst_percentile=("actual_worst_percentile", "mean"),
        )
        .reset_index()
        .sort_values(["accuracy", "events"], ascending=[True, False])
    )

    # Save outputs
    ev.to_csv(OUT_EVENT_PATH, index=False)
    season_summary.to_csv(OUT_SEASON_PATH, index=False)

    print("\nSaved:", OUT_EVENT_PATH)
    print("Saved:", OUT_SEASON_PATH)

    return ev, season_summary


if __name__ == "__main__":
    main()


## Visualization of Elimination Consistency Diagnostics

This script creates a 2×2 panel of plots that summarize how well our reconstructed fan votes align with historical eliminations:

1. **Worst percentile distribution (all events)**  
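   Histogram of the eliminated contestant’s “worst percentile” within their week under the chosen scoring rule. The percentile is the worst-first rank (1 = worst) divided by the number of active contestants, so the smallest value, 1/n, means the eliminated contestant was the worst in the field and a value of 1 means they were the best.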

2. **Worst percentile by scoring scheme**  
   Overlaid histograms comparing worst percentiles under the Percent-Sum and Rank-Sum schemes.

3. **Pairwise satisfaction rate**  
   Distribution of the satisfaction rate \( s_t \), defined as the share of contestants who are not more “deserving to leave” than the eliminated contestant in week \( t \).

4. **Season-level accuracy**  
   Bar plot of hit rate (prediction accuracy) by season, with a horizontal line showing the overall mean accuracy.

The figure is saved as `diagnostics.png` and is suitable for inclusion in the report as a compact visual summary of model–history consistency.


In [None]:
"""
consistency_plots.py

Generate a 2×2 grid of diagnostic plots summarizing how well the
reconstructed fan votes reproduce historical eliminations.

Requires:
    - ev : DataFrame with event-level results
        * columns: ["season", "week", "scheme", "hit", "sat_rate",
                    "actual_worst_percentile", ...]
    - season_summary : DataFrame with season-level accuracy
        * columns: ["season", "events", "accuracy", ...]

Outputs:
    - diagnostics.png : 2×2 panel of consistency plots
"""

import matplotlib.pyplot as plt


def plot_diagnostics(ev, season_summary, out_path: str = "diagnostics.png"):
    """
    Draw, save and show the 2×2 consistency-diagnostics figure.

    Panels:
        (1) histogram of `actual_worst_percentile` over all events
        (2) the same histogram split by scoring scheme
        (3) histogram of `sat_rate`
        (4) bar chart of per-season `accuracy`, with the overall mean of
            `hit` drawn as a horizontal reference line

    Args:
        ev: event-level DataFrame with columns "actual_worst_percentile",
            "sat_rate", "hit" and a scheme label column. The label column
            may be called "scheme" or "rule"; the percent-sum scheme may be
            spelled "pct_sum" or "percent_sum".
        season_summary: season-level DataFrame with columns "season" and
            "accuracy".
        out_path: file the figure is written to (saved at 300 dpi).

    Raises:
        KeyError: if `ev` has neither a "scheme" nor a "rule" column, or any
            other required column is missing.

    Side effects:
        Updates the global matplotlib rcParams, writes `out_path`, prints a
        confirmation line and calls plt.show().
    """
    # ------------------------------------------------------------------
    # Global plotting style (clean & paper-friendly)
    # ------------------------------------------------------------------
    plt.rcParams.update(
        {
            "font.size": 11,
            "axes.titlesize": 12,
            "axes.labelsize": 11,
            "xtick.labelsize": 10,
            "ytick.labelsize": 10,
            "figure.dpi": 120,
        }
    )

    # The event table is written with a "rule" column by the consistency
    # check script, while this function historically expected "scheme".
    # Accept either so the plot works on both layouts.
    scheme_col = next((c for c in ("scheme", "rule") if c in ev.columns), None)
    if scheme_col is None:
        raise KeyError("ev must contain a 'scheme' or a 'rule' column")

    # ------------------------------------------------------------------
    # Create 2×2 subplot layout
    # ------------------------------------------------------------------
    fig, axes = plt.subplots(2, 2, figsize=(11, 8))
    (ax1, ax2), (ax3, ax4) = axes

    # ==============================================================
    # (1) Worst percentile distribution (ALL events)
    # ==============================================================
    ax1.hist(
        ev["actual_worst_percentile"],
        bins=20,
        color="#4C72B0",
        edgecolor="white",
    )
    ax1.axvline(
        ev["actual_worst_percentile"].mean(),
        color="black",
        linestyle="--",
        linewidth=1,
        label="Mean",
    )
    ax1.set_title("Worst Percentile of Eliminated Contestant")
    # NOTE(review): the label says "1 = worst"; confirm this matches how the
    # producer of `actual_worst_percentile` orients the percentile.
    ax1.set_xlabel("Worst percentile (1 = worst)")
    ax1.set_ylabel("Number of events")
    ax1.legend(frameon=False)

    # ==============================================================
    # (2) Worst percentile by scheme
    # ==============================================================
    # Each entry: (accepted spellings of the scheme label, bar colour).
    scheme_groups = [
        (("pct_sum", "percent_sum"), "#55A868"),
        (("rank_sum",), "#C44E52"),
    ]
    drew_scheme_hist = False
    for aliases, color in scheme_groups:
        sub = ev[ev[scheme_col].isin(aliases)]
        if len(sub) == 0:
            continue

        ax2.hist(
            sub["actual_worst_percentile"],
            bins=20,
            alpha=0.6,
            # Label with the spelling actually present in the data.
            label=str(sub[scheme_col].iloc[0]).replace("_", " "),
            color=color,
            edgecolor="white",
        )
        drew_scheme_hist = True

    ax2.set_title("Worst Percentile by Scoring Scheme")
    ax2.set_xlabel("Worst percentile")
    ax2.set_ylabel("Number of events")
    # A legend without labelled artists only produces a warning.
    if drew_scheme_hist:
        ax2.legend(frameon=False)

    # ==============================================================
    # (3) Pairwise satisfaction rate
    # ==============================================================
    ax3.hist(
        ev["sat_rate"],
        bins=20,
        color="#8172B2",
        edgecolor="white",
    )
    ax3.axvline(
        ev["sat_rate"].mean(),
        color="black",
        linestyle="--",
        linewidth=1,
        label="Mean",
    )
    ax3.set_title("Pairwise Satisfaction Rate")
    ax3.set_xlabel(r"Satisfaction rate $s_t$")
    ax3.set_ylabel("Number of events")
    ax3.legend(frameon=False)

    # ==============================================================
    # (4) Season-level accuracy
    # ==============================================================
    ax4.bar(
        season_summary["season"].astype(str),
        season_summary["accuracy"],
        color="#CCB974",
    )
    ax4.axhline(
        ev["hit"].mean(),
        color="black",
        linestyle="--",
        linewidth=1,
        label="Overall mean",
    )
    ax4.set_title("Elimination Accuracy by Season")
    ax4.set_xlabel("Season")
    ax4.set_ylabel("Accuracy")
    ax4.set_ylim(0, 1)
    ax4.tick_params(axis="x", rotation=45)
    ax4.legend(frameon=False)

    # ------------------------------------------------------------------
    # Final layout tweaks & save
    # ------------------------------------------------------------------
    fig.suptitle(
        "Consistency Diagnostics for Reconstructed Fan Votes",
        fontsize=14,
    )
    # Leave the top 4% of the canvas free for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])

    fig.savefig(out_path, dpi=300)
    print(f"Saved diagnostics figure to: {out_path}")

    plt.show()


# Script entry point: load the two verification CSVs and draw the figure.
if __name__ == "__main__":
    # Example usage:
    # Import here to avoid circular dependencies if you structure this as a package.
    import pandas as pd

    # Event-level and season-level tables, read from the working directory.
    ev = pd.read_csv("verification_event_level.csv")
    season_summary = pd.read_csv("verification_season_level.csv")

    # Uses the default out_path, so the figure is written to diagnostics.png.
    plot_diagnostics(ev, season_summary)


## Consistency Check for Reconstructed Fan Votes

This script evaluates how well our reconstructed fan vote shares reproduce the **actual weekly eliminations** on *Dancing with the Stars*.

**Input**

- `fan_shares_estimated.csv`, containing for each (season, week, contestant):
  - `season`, `week`
  - `celebrity_name`
  - `judge_total` (total judges’ score that week)
  - `eliminated` (1 if eliminated that week, 0 otherwise)
  - `fan_share_hat` (reconstructed fan share from Problem 1)

We restrict to weeks where:
- exactly **one** contestant is eliminated, and  
- at least **two** contestants are still active.

---

### Elimination Rules Evaluated

For each usable (season, week):

1. **Rank-Sum rule** (assumed in seasons 1–2 and 28–34):
   - Rank contestants by judges’ total and fan share separately (1 = best).
   - Sum the two ranks to get a combined rank (smaller = better).
   - Predict the eliminated contestant as the one with the **largest** combined rank.

2. **Percent-Sum rule** (assumed in seasons 3–27):
   - Convert judges’ totals to weekly percentages.
   - Treat `fan_share_hat` as fan vote percentage (renormalized within the week).
   - Total score = judge percent + fan percent.
   - Predict the eliminated contestant as the one with the **smallest** total score.

---

### Diagnostics and Outputs

For each elimination event, we compute:

- **`hit`** — whether the predicted elimination matches the actual one (hit rate / accuracy).
- **`sat_rate`** — a “pairwise satisfaction” measure:
  - Rank-Sum: fraction of other contestants whose combined rank sum is **no larger** (i.e. no worse) than the eliminated contestant’s.
  - Percent-Sum: fraction of other contestants whose total score is **no lower** than the eliminated’s.
- **`actual_worst_rank(1=worst)`** — how the actual eliminated contestant ranks in a “worst-first” ordering.
- **`actual_worst_percentile`** — that rank converted to a percentile within the week.

The script writes:

- `verification_event_level.csv`  
  Event-level diagnostics for every usable (season, week).

- `verification_season_level.csv`  
  Season-level summaries: number of events, accuracy, mean satisfaction rate, and mean worst percentile.

These diagnostics are later used to compare rules and discuss how well our reconstructed fan votes align with historical eliminations.


In [None]:
import numpy as np
import pandas as pd

"""
consistency_check.py

Evaluate how consistent the reconstructed fan shares are with the
actual elimination outcomes under different combination rules.

Input:
    fan_shares_estimated.csv
        Must contain columns:
        - season
        - week
        - celebrity_name
        - judge_total
        - eliminated (1 if eliminated this week, else 0)
        - fan_share_hat (reconstructed fan share)

Output:
    verification_event_level.csv
        Event-level metrics for each (season, week) elimination event.

    verification_season_level.csv
        Season-level summary metrics (event count, accuracy, mean
        satisfaction rate, mean worst percentile).
"""

# ===== Paths (change if needed) ===========================================
# Input panel with reconstructed fan shares, and the two output tables.
EST_PATH = "fan_shares_estimated.csv"
OUT_EVENT_PATH = "verification_event_level.csv"
OUT_SEASON_PATH = "verification_season_level.csv"

# Seasons assumed to have used the Rank-Sum rule: 1, 2 and 28 through 34.
RANK_SUM_SEASONS = {1, 2, *range(28, 35)}


# ===== Load and basic checks =============================================
df = pd.read_csv(EST_PATH)

# Fail early if the panel lacks any column the checks below rely on.
required_cols = {
    "season",
    "week",
    "celebrity_name",
    "judge_total",
    "eliminated",
    "fan_share_hat",
}
missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing columns in estimated panel: {missing}")

# An event is usable when the (season, week) group has at least two rows
# and exactly one of them is flagged as eliminated.
g = df.groupby(["season", "week"], sort=True)
usable_keys = [
    key
    for key, week_rows in g
    if len(week_rows) >= 2 and week_rows["eliminated"].sum() == 1
]

print("Usable elimination events:", len(usable_keys))


# ===== Helper: Rank-Sum Rule =============================================
def rank_sum_elim(sub: pd.DataFrame):
    """
    Rank-Sum rule (as described in the main report):

    - Compute separate ranks for judge_total and fan_share_hat (1 = best).
    - Sum ranks to get the combined rank sum (smaller is better).
    - The eliminated contestant is the one with the LARGEST rank sum.

    `sub` is one (season, week) group and must contain exactly the rows of
    the contestants alive that week, with at least two rows and at least one
    row where `eliminated == 1` (the first such row is used).

    Returns a 6-tuple:
        pred_name        (str): predicted eliminated contestant name
        act_name         (str): actual eliminated contestant name
        sat_rate         (float): fraction of others whose rank sum is
                         <= the eliminated's (i.e. who are no worse)
        margin           (float): eliminated rank sum - max(other rank sum);
                         > 0 when the eliminated contestant is strictly worst
        worst_rank       (int): rank of the eliminated contestant in
                         worst-first order (1 = worst)
        worst_percentile (float): worst_rank / number of contestants

    Raises:
        IndexError: if no row has eliminated == 1.
        ValueError: if `sub` has a single row (no other contestants).
    """
    sub = sub.reset_index(drop=True)
    j = sub["judge_total"].to_numpy(dtype=float)
    f = sub["fan_share_hat"].to_numpy(dtype=float)

    # Normalize fan shares for safety (does not change their ordering)
    f = f / f.sum() if f.sum() > 0 else np.ones_like(f) / len(f)

    # Ranks: 1 = best (largest score/share), n = worst.
    # Sort descending, then invert the permutation.
    # NOTE(review): ties receive distinct, order-dependent ranks here.
    j_order = np.argsort(-j)
    f_order = np.argsort(-f)

    rank_j = np.empty_like(j_order)
    rank_f = np.empty_like(f_order)
    rank_j[j_order] = np.arange(1, len(j) + 1)
    rank_f[f_order] = np.arange(1, len(f) + 1)

    combined = rank_j + rank_f

    # Predicted eliminated: largest combined rank (worst)
    pred_idx = int(np.argmax(combined))
    act_idx = int(np.where(sub["eliminated"].to_numpy() == 1)[0][0])

    pred_name = sub.loc[pred_idx, "celebrity_name"]
    act_name = sub.loc[act_idx, "celebrity_name"]

    # Diagnostics. Here LARGER combined means WORSE, so every comparison is
    # the mirror image of the percent-sum helper (where smaller is worse).
    elim_combined = combined[act_idx]
    others = np.delete(combined, act_idx)

    # sat_rate: fraction of other contestants who are no worse than the
    # eliminated one, i.e. whose rank sum is <= the eliminated's.
    sat_rate = float(np.mean(others <= elim_combined))

    # margin: how far the eliminated contestant sits below the worst of the
    # others (>= 0 if the eliminated contestant really is worst).
    margin = float(elim_combined - np.max(others))

    # worst_rank: 1 + number of contestants strictly worse than the
    # eliminated one, so 1 means the eliminated contestant was the worst.
    worst_rank = 1 + np.sum(combined > elim_combined)
    n_alive = len(sub)
    worst_percentile = worst_rank / n_alive

    return pred_name, act_name, sat_rate, margin, worst_rank, worst_percentile


# ===== Helper: Percent-Sum Rule ==========================================
def percent_sum_elim(sub: pd.DataFrame):
    """
    Percent-Sum rule for one (season, week) group.

    Each contestant's judge total is turned into a share of the week's judge
    points, the fan shares are renormalized to sum to one, and the two shares
    are added. A higher total is better; the predicted elimination is the
    contestant with the smallest total. If either set of values sums to zero
    or less, a uniform share is used for it instead.

    Returns:
        (pred_name, act_name, sat_rate, margin, worst_rank, worst_percentile)
        where sat_rate is the fraction of other contestants whose total is
        >= the eliminated's, margin is min(other totals) - eliminated total,
        worst_rank counts from the bottom (1 = worst) and worst_percentile is
        worst_rank divided by the number of contestants.
    """
    week = sub.reset_index(drop=True).copy()
    field_size = len(week)

    judge_pts = week["judge_total"].to_numpy(dtype=float)
    fan_share = week["fan_share_hat"].to_numpy(dtype=float)

    # Judge share of the week's points (uniform if there are no points).
    judge_sum = judge_pts.sum()
    if judge_sum > 0:
        judge_share = judge_pts / judge_sum
    else:
        judge_share = np.ones_like(judge_pts) / len(judge_pts)

    # Fan share, renormalized within the week (uniform if degenerate).
    if fan_share.sum() > 0:
        fan_share = fan_share / fan_share.sum()
    else:
        fan_share = np.ones_like(fan_share) / len(fan_share)

    totals = judge_share + fan_share

    # Lowest total goes home under this rule.
    predicted = int(np.argmin(totals))
    actual = int(np.where(week["eliminated"].to_numpy() == 1)[0][0])

    elim_total = totals[actual]
    rivals = np.delete(totals, actual)

    # Higher total = better, so a rival "satisfies" the elimination when
    # its total is at least the eliminated contestant's.
    sat_rate = float(np.mean(rivals >= elim_total))
    margin = float(np.min(rivals) - elim_total)

    # Position of the eliminated contestant counting up from the bottom.
    worst_rank = 1 + np.sum(totals < elim_total)
    worst_percentile = worst_rank / field_size

    return (
        week.loc[predicted, "celebrity_name"],
        week.loc[actual, "celebrity_name"],
        sat_rate,
        margin,
        worst_rank,
        worst_percentile,
    )


# ===== Main loop over events =============================================
event_rows = []

for season_id, week_id in usable_keys:
    week_panel = g.get_group((season_id, week_id)).copy()

    # Pick the combination rule assumed for this season and simulate it.
    uses_rank_sum = season_id in RANK_SUM_SEASONS
    rule = "rank_sum" if uses_rank_sum else "percent_sum"
    simulate = rank_sum_elim if uses_rank_sum else percent_sum_elim
    pred_name, act_name, sat, margin, worst_rank, worst_pct = simulate(week_panel)

    row = {
        "season": season_id,
        "week": week_id,
        "rule": rule,
        "pred_elim": pred_name,
        "actual_elim": act_name,
        "hit": int(pred_name == act_name),
        "sat_rate": sat,
        "margin": margin,
        "actual_worst_rank(1=worst)": worst_rank,
        "actual_worst_percentile": worst_pct,
    }
    event_rows.append(row)

ev = pd.DataFrame(event_rows)


# ===== Overall & per-season metrics ======================================
print("\n=== Overall consistency ===")
print("Events:", len(ev))
print("Accuracy (hit rate):", ev["hit"].mean())
print("Avg sat_rate:", ev["sat_rate"].mean())
print("Median worst percentile:", ev["actual_worst_percentile"].median())

print("\n=== By rule (rank_sum vs percent_sum) ===")
print(ev.groupby("rule")["hit"].mean())

# One row per season, computed over that season's usable events only.
season_summary = ev.groupby("season").agg(
    events=("hit", "size"),
    accuracy=("hit", "mean"),
    avg_sat=("sat_rate", "mean"),
    avg_worst_percentile=("actual_worst_percentile", "mean"),
)
season_summary = season_summary.reset_index()
# Least accurate seasons first; among equals, the ones with more events.
season_summary = season_summary.sort_values(
    by=["accuracy", "events"],
    ascending=[True, False],
)

# Write both tables to disk.
ev.to_csv(OUT_EVENT_PATH, index=False)
season_summary.to_csv(OUT_SEASON_PATH, index=False)

print("\nSaved:", OUT_EVENT_PATH)
print("Saved:", OUT_SEASON_PATH)


## Certainty of Elimination Predictions (Fan + Judge Model)

This script takes the fitted output from `fan_shares_estimated.csv` and quantifies **how certain** our fan+judge model is about each weekly elimination.

### Input

`fan_shares_estimated.csv`, containing for each (season, week, contestant):

- `season`, `week`
- `celebrity_name`
- `judge_total`
- `elim_week`, `eliminated`
- `fan_share_hat` (estimated fan share \(F_{it}\))
- plus helper columns (`judge_std`, `contestant_id`, `week_float`)

We assume this file already reflects the fitted fan model from Problem 1.

---

### Model for Elimination Certainty

For each week \( t \) with active contestant set \( C_t \), the script:

1. Computes **judge percentage**:
   \[
   J^{\%}_{it} = \frac{\text{judge\_total}_{it}}{\sum_{j \in C_t} \text{judge\_total}_{jt}}.
   \]

2. Defines a **total score** combining judges and fans:
   \[
   S_{it} = J^{\%}_{it} + F_{it},
   \]
   where \( F_{it} = \text{fan\_share\_hat}_{it} \).

3. For each contestant \( i \in C_t \), constructs a **pairwise logistic weight**:
   \[
   w_{it} = \prod_{j \in C_t, j \neq i} \sigma\big(\alpha (S_{jt} - S_{it})\big),
   \]
   where \( \sigma(\cdot) \) is the logistic function and `ALPHA_SOFT` controls how “hard” the comparison is (higher \(\alpha\) → closer to a sharp indicator of \(S_j > S_i\)).

4. Normalizes these weights within the week to obtain a **probability that contestant \( i \) is eliminated**:
   \[
   p_{it} = \frac{w_{it}}{\sum_{k \in C_t} w_{kt}}.
   \]

For each actual elimination, \( p_{it} \) evaluated at the eliminated contestant is interpreted as the **certainty** of that elimination under our model.

---

### Certainty Metrics Produced

From the week-by-week probabilities, the script reports:

- **Event-level accuracy**:  
  Fraction of weeks where the contestant with the **highest elimination probability** \( p_{it} \) is indeed the one eliminated.

- **Certainty of observed eliminations**:
  - Mean and median \( p_{i_t^* t} \), where \( i_t^* \) is the actual eliminated contestant.
  - 25th/75th percentiles, min and max of these probabilities.

- **Log-likelihood of the elimination path**:
  \[
  \sum_t \log p_{i_t^* t},
  \]
  summarizing how plausible the entire observed sequence of eliminations is under the model.

- **Uniform baseline certainty**:
  Mean of \( 1 / |C_t| \) across weeks, representing the certainty level if eliminations were random among active contestants.

Additionally, the script prints:

- A small **LaTeX table snippet** with key summary statistics (ready to paste into the paper).
- The **top 10 most “certain” eliminations** (largest \( p_{i_t^* t} \)).
- The **bottom 10 most “surprising” eliminations** (smallest \( p_{i_t^* t} \)), which are natural candidates for case studies and discussion.


In [None]:
"""
evaluate_fan_certainty.py

Use the already-estimated fan model output in fan_shares_estimated.csv
to compute *certainty* of eliminations.

INPUT: fan_shares_estimated.csv with columns like:
    - season
    - celebrity_name
    - week
    - judge_total
    - elim_week        (NaN for finalists / winners)
    - eliminated       (1 if eliminated in that week, 0 otherwise)
    - judge_std        (not used here, but present)
    - contestant_id
    - week_float
    - fan_share_hat    (our estimated F_it from the fan model)

STEPS:
    For each (season, week) alive set C_t:

    1) Judge score percentage:
           J_pct_it = judge_total_it / sum_j judge_total_jt

    2) Total score:
           S_it = J_pct_it + fan_share_hat_it

    3) Raw elimination weight:
           w_it = ∏_{j∈C_t, j≠i} σ( α * (S_jt - S_it) )

    4) Proper probability:
           p_it = w_it / sum_{k∈C_t} w_kt

    5) Certainty metrics:
        - log-likelihood over observed eliminations
        - mean / median p_it for actually eliminated contestants
        - event-level accuracy (how often argmax p_it is eliminated?)

Tune ALPHA_SOFT to control how "hard" the pairwise comparison is.
"""

import numpy as np
import pandas as pd
from scipy.special import expit  # logistic σ(x) = 1 / (1 + exp(-x))


# ============================================================
# 0. Configuration
# ============================================================

# CSV with the fitted fan-model panel; main() loads it with pd.read_csv.
FAN_CSV_PATH = "fan_shares_estimated.csv"

# Softness parameter in pairwise logistic comparison.
# Larger -> closer to hard "S_j > S_i" indicator.
# main() passes this value to compute_all_probs as `alpha_soft`.
ALPHA_SOFT = 20.0


# ============================================================
# 1. Core computations
# ============================================================

def add_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Given the fan_shares_estimated dataframe, add:

        - J_pct_it   = judge percentage in each (season, week)
        - S_it       = J_pct_it + fan_share_hat

    We do NOT refit the fan model; we just use fan_share_hat as F_it.

    If the judge totals of a (season, week) group do not sum to a positive
    number, every contestant in that group gets the uniform share
    1 / (group size) instead of a NaN/inf from dividing by zero. This is the
    same fallback percent_sum_elim applies.

    Args:
        df: panel with columns "season", "week", "judge_total" and
            "fan_share_hat".

    Returns:
        A copy of `df` with the two extra columns; the input is not modified.
    """
    df = df.copy()

    # Per-row week total and week size, aligned to df's index.
    by_week = df.groupby(["season", "week"])["judge_total"]
    total_J = by_week.transform("sum")
    n_in_week = by_week.transform("size")

    # Judge percentage within each (season, week); uniform share when the
    # week has no judge points to divide by.
    df["J_pct_it"] = (df["judge_total"] / total_J).where(
        total_J > 0, 1.0 / n_in_week
    )

    # Total score S_it = J_pct + F_it
    df["S_it"] = df["J_pct_it"] + df["fan_share_hat"]

    return df


def compute_week_probs(sub: pd.DataFrame, alpha_soft: float) -> pd.DataFrame:
    """
    Attach elimination weights and probabilities to one (season, week) group.

    `sub` must already carry the column S_it. For contestant i the raw
    weight is the product over all j != i of sigma(alpha_soft * (S_j - S_i));
    the weights are then rescaled to sum to one. If the weight total is not
    positive, a uniform distribution is used instead.

    Returns a copy of `sub` with the added columns w_it (raw weight) and
    p_elim (normalized probability); the input is left untouched.
    """
    week = sub.copy()
    scores = week["S_it"].to_numpy()
    count = len(scores)

    # gaps[i, j] = S_j - S_i, built by broadcasting a row against a column.
    gaps = scores.reshape(1, -1) - scores.reshape(-1, 1)
    pair_sig = expit(alpha_soft * gaps)

    # A contestant is never compared with itself: neutral factor 1.
    pair_sig[np.diag_indices(count)] = 1.0

    # Multiply across each row to get the unnormalized weight.
    raw = pair_sig.prod(axis=1)

    total = raw.sum()
    probs = np.ones_like(raw) / count if total <= 0 else raw / total

    week["w_it"] = raw
    week["p_elim"] = probs
    return week


def compute_all_probs(df: pd.DataFrame, alpha_soft: float) -> pd.DataFrame:
    """
    Compute p_elim for every row of the panel, one (season, week) at a time.

    Scores are added with add_scores, each week is passed through
    compute_week_probs, and the weekly pieces are stacked back together with
    a fresh index. Every row of the input is treated as a contestant alive
    in that week.

    The result also carries the boolean column is_eliminated_this_week,
    which is True exactly where `eliminated` equals 1.
    """
    scored = add_scores(df)

    weekly = [
        compute_week_probs(week_rows, alpha_soft)
        for _, week_rows in scored.groupby(["season", "week"], sort=True)
    ]
    out = pd.concat(weekly, ignore_index=True)

    # Mark the rows that correspond to actual eliminations.
    out["is_eliminated_this_week"] = out["eliminated"].eq(1)
    return out


# ============================================================
# 2. Certainty metrics
# ============================================================
def summarize_certainty(df_probs: pd.DataFrame):
    """
    Summarize how much probability the model put on the real eliminations.

    `df_probs` needs the columns season, week, p_elim and
    is_eliminated_this_week. The summary contains:

        - the log-likelihood of the observed eliminations
        - mean, median, quartiles, min and max of p_elim over eliminated rows
        - the event-level hit rate (a week counts as a hit when a row that
          attains the week's maximum p_elim was eliminated)
        - the mean of the uniform baseline 1 / (rows in the week), with one
          entry per eliminated row

    Returns:
        (stats, elim_rows): the dict of summary values and a copy of the
        rows flagged as eliminated.
    """
    # One row per actual elimination.
    elim_rows = df_probs.loc[df_probs["is_eliminated_this_week"]].copy()

    # -------- 1. Certainty of each observed elimination --------
    # Floor the probabilities so the logarithm stays finite.
    floor = 1e-12
    certainty = np.clip(elim_rows["p_elim"].to_numpy(), floor, 1.0)

    loglik = np.log(certainty).sum()
    avg_p = certainty.mean()
    med_p = np.median(certainty)
    q25, q75 = np.quantile(certainty, [0.25, 0.75])
    p_min = certainty.min()
    p_max = certainty.max()

    # -------- 2 & 3. Hit rate and uniform baseline, in one pass --------
    weeks_with_elim = 0
    hits = 0
    uniform_certs = []

    for _, week_rows in df_probs.groupby(["season", "week"]):
        flagged = week_rows["is_eliminated_this_week"]
        if not flagged.any():
            continue  # nobody went home in this (season, week)

        weeks_with_elim += 1

        # Rows tied for the highest elimination probability this week.
        leaders = week_rows[week_rows["p_elim"] == week_rows["p_elim"].max()]
        if leaders["is_eliminated_this_week"].any():
            hits += 1

        # Random-guess certainty, repeated once per eliminated row.
        uniform_certs.extend([1.0 / len(week_rows)] * flagged.sum())

    if weeks_with_elim > 0:
        event_accuracy = hits / weeks_with_elim
    else:
        event_accuracy = np.nan

    uniform_certs = np.array(uniform_certs, dtype=float)
    baseline_mean = uniform_certs.mean() if uniform_certs.size > 0 else np.nan

    stats = {
        "log_likelihood": loglik,
        "avg_p_elim_eliminated": avg_p,
        "median_p_elim_eliminated": med_p,
        "q25_p_elim_eliminated": q25,
        "q75_p_elim_eliminated": q75,
        "min_p_elim_eliminated": p_min,
        "max_p_elim_eliminated": p_max,
        "event_accuracy": event_accuracy,
        "num_elimination_events": len(elim_rows),
        "num_elimination_weeks": weeks_with_elim,
        "baseline_mean_certainty": baseline_mean,
    }

    return stats, elim_rows

# ============================================================
# 3. Main
# ============================================================

def main():
    """
    Load the fan-model panel, compute elimination probabilities and print
    the certainty report.

    Reads FAN_CSV_PATH, runs compute_all_probs with ALPHA_SOFT and
    summarize_certainty, then prints (in order): the full summary, a short
    block of values for the paper, a LaTeX `tabular` snippet, and the ten
    highest- and ten lowest-probability observed eliminations.

    Returns nothing; all output goes to stdout.
    """
    # Load the fan model output
    df = pd.read_csv(FAN_CSV_PATH)

    # Compute week-by-week elimination probabilities implied by fan model
    df_probs = compute_all_probs(df, ALPHA_SOFT)

    # Summarize certainty
    stats, elim_rows = summarize_certainty(df_probs)

    print("=== Certainty of the fan+judge model (based on fan_shares_estimated.csv) ===")
    print(f"Alpha_soft (pairwise softness):              {ALPHA_SOFT}")
    print(f"Number of elimination events (rows):         {stats['num_elimination_events']}")
    print(f"Number of elimination weeks (unique weeks):  {stats['num_elimination_weeks']}")
    print(f"Event-level accuracy (hit rate):             {stats['event_accuracy']:.3f}")
    print(f"Avg p_elim for eliminated rows:              {stats['avg_p_elim_eliminated']:.3f}")
    print(f"Median p_elim for eliminated rows:           {stats['median_p_elim_eliminated']:.3f}")
    print(f"25/75% quantiles of p_elim (elim rows):      {stats['q25_p_elim_eliminated']:.3f}, {stats['q75_p_elim_eliminated']:.3f}")
    print(f"Min / Max p_elim for eliminated rows:        {stats['min_p_elim_eliminated']:.3e}, {stats['max_p_elim_eliminated']:.3f}")
    print(f"Log-likelihood of elimination path:          {stats['log_likelihood']:.3f}")
    print(f"Uniform baseline mean certainty:             {stats['baseline_mean_certainty']:.3f}")
    print()

    # ---------- Paper-ready "TBD" block ----------
    print("=== Values to fill in TBDs in the paper ===")
    print(f"Mean Certainty_t (avg p_elim of eliminated): {stats['avg_p_elim_eliminated']:.3f}")
    print(f"Median Certainty_t:                          {stats['median_p_elim_eliminated']:.3f}")
    print(f"Event-level accuracy (hit rate):             {stats['event_accuracy']:.3f}")
    print(f"Log-likelihood (sum log Certainty_t):        {stats['log_likelihood']:.3f}")
    print(f"Mean uniform-baseline certainty:             {stats['baseline_mean_certainty']:.3f}")
    print()

    # ---------- Optional LaTeX table snippet ----------
    # These are raw strings, so every backslash is emitted literally: a LaTeX
    # command needs ONE backslash and a row terminator needs TWO. Doubling
    # them (as one would in a normal string) prints invalid LaTeX.
    print("=== LaTeX snippet for Table~\\ref{tab:certainty} (edit caption/label as needed) ===")
    print(r"\begin{tabular}{l c}")
    print(r"\hline")
    print(r"Number of elimination events & %d \\" % stats["num_elimination_events"])
    print(r"Event-level accuracy & %.3f \\" % stats["event_accuracy"])
    print(r"Mean certainty $\mathbb{E}[p_{i_t^* t}]$ & %.3f \\" % stats["avg_p_elim_eliminated"])
    print(r"Median certainty & %.3f \\" % stats["median_p_elim_eliminated"])
    print(r"Uniform baseline (mean $1/|C_t|$) & %.3f \\" % stats["baseline_mean_certainty"])
    print(r"Log-likelihood $\sum_t \log p_{i_t^* t}$ & %.3f \\" % stats["log_likelihood"])
    print(r"\hline")
    print(r"\end{tabular}")
    print()

    # ---------- Examples of individual events ----------
    # Columns shown for both example tables.
    example_cols = [
        "season", "week", "celebrity_name", "p_elim", "judge_total",
        "fan_share_hat", "J_pct_it", "S_it",
    ]

    print("Top 10 most 'certain' eliminations (highest p_elim):")
    top10 = elim_rows.sort_values("p_elim", ascending=False).head(10)
    print(top10[example_cols].to_string(index=False))

    print("\nBottom 10 most 'surprising' eliminations (lowest p_elim):")
    bottom10 = elim_rows.sort_values("p_elim", ascending=True).head(10)
    print(bottom10[example_cols].to_string(index=False))



if __name__ == "__main__":
    main()
