## Helper: Building a Weekly Panel with True Eliminations and Fan Shares

This helper module takes the original wide-format DWTS data and the estimated fan shares, and converts them into a **clean weekly panel** that can be used for downstream analyses (consistency checks, certainty, voting-rule simulations, etc.).

### 1. Parsing Elimination Week

We first parse the `results` field from the COMAP CSV to extract each contestant’s *elimination week*:

- Strings like `"Eliminated Week 3"` are mapped to `elim_week = 3`.
- Winners, finalists, or runners-up (e.g., `"1st Place"`, `"2nd Place"`) are treated as **never eliminated**, and assigned `elim_week = ∞`.

This gives a numeric indicator we can compare to week numbers.

---

### 2. Constructing the Weekly Panel

The core function:

```python
build_weekly_panel(df_raw, fan_df) -> df_panel
```


In [None]:
import numpy as np
import pandas as pd

# ============================================================
# 1. Parse elimination week from "results" field
#    e.g. "Eliminated Week 3" -> 3
#         "1st Place"          -> np.inf (never eliminated)
# ============================================================

def parse_elim_week(result_str: str):
    """
    Turn a `results` cell into a numeric elimination week.

    Returns the trailing integer of strings containing "Eliminated Week"
    (e.g. "Eliminated Week 3" -> 3). Everything else -- missing values,
    placements such as "1st Place", or an "Eliminated Week ..." string whose
    last token is not an integer -- yields np.inf, i.e. "never eliminated".
    """
    never_eliminated = np.inf

    if pd.isna(result_str):
        return never_eliminated

    text = str(result_str)
    if "Eliminated Week" not in text:
        # Winners / runners-up etc.
        return never_eliminated

    # The week number is expected to be the final whitespace-separated token.
    *_, tail = text.split()
    try:
        week = int(tail)
    except ValueError:
        return never_eliminated
    return week


# ============================================================
# 2. Build a "weekly panel" dataframe:
#    one row per (season, week, celebrity_name)
#    with judge totals, fan shares, and real elimination flag
# ============================================================

def build_weekly_panel(df_raw: pd.DataFrame,
                       fan_df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape the wide DWTS table into one row per (season, week, celebrity_name).

    Inputs
    ------
    df_raw : wide-format DWTS dataset from COMAP CSV.
             Must have columns:
               - 'season'
               - 'celebrity_name'
               - 'results'
               - either weekX_total_score columns, or weekX_judgeY_score
                 columns (summed per week when no weekX_total_score exists).

    fan_df : long-format table with columns:
               - 'season'
               - 'week'
               - 'celebrity_name'
               - 'F_hat'  (estimated fan vote share; a column named
                           'fan_share' is renamed to 'F_hat')
             Should be unique per (season, week, celebrity_name): duplicate
             keys duplicate panel rows in the left merge.

    Returns
    -------
    df_panel : long-format dataframe with columns
        ['season', 'celebrity_name', 'elim_week', 'results',
         'judge_total', 'week', 'is_active', 'eliminated_real']
        followed by the non-key columns of fan_df (e.g. 'F_hat').

    Raises
    ------
    ValueError
        If df_raw contains no usable weekly score columns.
    """
    total_suffix = "_total_score"

    def _week_of(col):
        """Return N for a column named 'week<N>_total_score', else None."""
        if not (col.startswith("week") and col.endswith(total_suffix)):
            return None
        digits = col[len("week"):-len(total_suffix)]
        return int(digits) if digits.isdigit() else None

    def _total_columns(frame):
        """Map week number -> name of that week's total-score column."""
        found = {}
        for col in frame.columns:
            week = _week_of(col)
            if week is not None:
                found[week] = col
        return found

    df = df_raw.copy()

    # ---------- 2.1 compute each contestant's elimination week ----------
    df["elim_week"] = df["results"].apply(parse_elim_week)

    # ---------- 2.2 identify (or build) weekly judge totals ----------
    week_cols = _total_columns(df)
    if not week_cols:
        # Only individual judge scores are available: sum them per week.
        week_tags = sorted({
            col.split("_judge")[0] for col in df.columns
            if col.startswith("week") and "_judge" in col and col.endswith("_score")
        })
        for wtag in week_tags:
            # Match on "<wtag>_judge" so that "week1" does not also pick up
            # the week10 / week11 judge columns.
            prefix = f"{wtag}_judge"
            parts = [c for c in df.columns
                     if c.startswith(prefix) and c.endswith("_score")]
            # min_count=1 keeps the total NaN when every judge score is
            # missing, so the notna() activity filter below can work.
            df[f"{wtag}{total_suffix}"] = df[parts].sum(axis=1, min_count=1)
        week_cols = _total_columns(df)

    if not week_cols:
        raise ValueError(
            "df_raw has no weekX_total_score or weekX_judgeY_score columns"
        )

    # ---------- 2.3 wide -> long for judge totals ----------
    id_cols = ["season", "celebrity_name", "elim_week", "results"]
    judge_long = [
        df[id_cols + [week_cols[week]]]
        .rename(columns={week_cols[week]: "judge_total"})
        .assign(week=week)
        for week in sorted(week_cols)
    ]
    df_j = pd.concat(judge_long, ignore_index=True)

    # ---------- 2.4 active flag ----------
    # Active while week <= elim_week (np.inf for never-eliminated contestants)
    # and only in weeks that actually have a judge total.
    df_j["is_active"] = (
        (df_j["week"] <= df_j["elim_week"]) & df_j["judge_total"].notna()
    )

    # ---------- 2.5 real elimination ----------
    # elim_week = inf never equals a week number, so winners are never flagged.
    df_j["eliminated_real"] = (df_j["week"] == df_j["elim_week"])

    # ---------- 2.6 merge fan estimates ----------
    df_fan = fan_df.copy()
    if "fan_share" in df_fan.columns:
        df_fan = df_fan.rename(columns={"fan_share": "F_hat"})

    df_panel = df_j.merge(df_fan,
                          on=["season", "week", "celebrity_name"],
                          how="left")

    return df_panel


## Comparing Rank-Sum and Percent-Sum Elimination Rules

This function directly compares the two elimination rules used on *Dancing with the Stars*—**Rank-Sum** and **Percent-Sum**—using the same reconstructed fan vote shares.

### Input

`df_panel` is a long-format weekly panel with one row per `(season, week, contestant)`, containing:

- `season`, `week`
- `celebrity_name`
- `judge_total` — total judges’ score that week
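- `fan_share_hat` — estimated fan vote share from the fan model (note: `build_weekly_panel` names this column `F_hat`, so rename it before passing that panel in)
- `eliminated` — indicator for the actual elimination (1 if eliminated that week); `build_weekly_panel` emits the equivalent boolean column as `eliminated_real`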

Only contestants with valid judge scores and fan shares are used in each week.

---

### Elimination Rules Implemented

For each week’s active contestant set \( C_t \):

1. **Rank-Sum rule**
   - Rank contestants by judges’ score and by fan share separately (1 = best).
   - Compute the combined rank
     \[
     S^{\text{rank}}_{it} = \text{rank}_J(i,t) + \text{rank}_F(i,t).
     \]
   - Predict the eliminated contestant as the one with the **largest** combined rank.

2. **Percent-Sum rule**
   - Convert judges’ totals to weekly percentages.
   - Normalize fan shares within the same week.
   - Compute the combined percentage
     \[
     S^{\text{pct}}_{it} = J^{\%}_{it} + F^{\%}_{it}.
     \]
   - Predict the eliminated contestant as the one with the **smallest** combined percentage.

---

### Outputs and Diagnostics

For each `(season, week)`, the function returns:

- `eliminated_rank` — contestant eliminated under the Rank-Sum rule
- `eliminated_pct` — contestant eliminated under the Percent-Sum rule
- `eliminated_real` — actual eliminated contestant (only defined when exactly one elimination occurred)
- `disagree` — whether the two rules eliminate different contestants
- `delta_F_rank_minus_pct` — difference in fan share between the Rank-Sum loser and Percent-Sum loser (only when they disagree)
- `delta_J_rank_minus_pct` — difference in judges’ score between the two losers (only when they disagree)
- `n_contestants` — number of active contestants that week
- `n_real_elims` — number of actual eliminations that week

This comparison isolates **when and why** the two rules diverge, and quantifies whether disagreements are primarily driven by differences in **fan support** or **judges’ evaluations**.


In [None]:
import numpy as np
import pandas as pd

def simulate_rank_vs_pct(df_panel: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the Rank-Sum and Percent-Sum elimination rules to every season-week.

    df_panel: long-format dataframe with at least:
        - 'season'
        - 'week'
        - 'celebrity_name'
        - 'judge_total'
        - 'fan_share_hat'   (estimated fan share from your model)
        - 'eliminated'      (1 if actually eliminated this week, else 0);
                            optional -- when the column is absent every week
                            is reported with zero real eliminations.

    Rows missing 'judge_total' or 'fan_share_hat' are dropped. A week is
    skipped entirely when nothing remains, or when the fan shares or the
    judge totals do not sum to a positive number.

    Returns:
        week_results: one row per (season, week) with
          - eliminated_rank: elimination under rank rule
          - eliminated_pct:  elimination under percent rule
          - eliminated_real: actual elimination (if exactly one), else None
          - disagree:        True if rank vs pct give different loser
          - delta_F_rank_minus_pct: fan_share_rank - fan_share_pct (when disagree)
          - delta_J_rank_minus_pct: judge_rank - judge_pct (when disagree)
          - n_contestants, n_real_elims
        The columns are present even when no week could be simulated.
    """
    out_cols = [
        "season", "week",
        "eliminated_rank", "eliminated_pct", "eliminated_real",
        "disagree",
        "delta_F_rank_minus_pct", "delta_J_rank_minus_pct",
        "n_contestants", "n_real_elims",
    ]
    has_elim_flag = "eliminated" in df_panel.columns

    rows = []

    # group by season & week (the set of contestants compared that week)
    for (season, week), sub in df_panel.groupby(["season", "week"]):
        # Drop rows lacking key info; a fresh 0..n-1 index keeps the
        # idxmax/idxmin lookups below unambiguous even if the caller's
        # index contains duplicate labels.
        sub = (
            sub.dropna(subset=["judge_total", "fan_share_hat"])
               .reset_index(drop=True)
        )
        if sub.empty:
            continue

        total_fan = sub["fan_share_hat"].sum()
        judge_sum = sub["judge_total"].sum()
        # `not (x > 0)` is also True for NaN, so degenerate weeks are skipped
        # instead of dividing by zero below.
        if not (total_fan > 0) or not (judge_sum > 0):
            continue

        names = sub["celebrity_name"]
        judge = sub["judge_total"]
        fan_norm = sub["fan_share_hat"] / total_fan

        # ======================= RANK METHOD =======================
        # Rank 1 = best; ties share the smallest rank (method="min").
        rank_sum = (
            judge.rank(ascending=False, method="min")
            + fan_norm.rank(ascending=False, method="min")
        )
        # Worst = largest sum of ranks (idxmax returns the first on ties).
        worst_rank = rank_sum.idxmax()
        elim_rank = names.loc[worst_rank]

        # ===================== PERCENT METHOD ======================
        pct_sum = judge / judge_sum + fan_norm
        # Worst = smallest combined percent (first on ties).
        worst_pct = pct_sum.idxmin()
        elim_pct = names.loc[worst_pct]

        # ====================== REAL ELIMINATION ==================
        # Some weeks have 0 or >1 eliminations; "match" statistics only use
        # weeks with exactly one.
        if has_elim_flag:
            real_elims = names[sub["eliminated"] == 1].tolist()
        else:
            real_elims = []
        real_elim = real_elims[0] if len(real_elims) == 1 else None

        # ====================== DISAGREEMENT =======================
        disagree = bool(elim_rank != elim_pct)

        delta_F = np.nan
        delta_J = np.nan
        if disagree:
            delta_F = float(fan_norm.loc[worst_rank] - fan_norm.loc[worst_pct])
            delta_J = float(judge.loc[worst_rank] - judge.loc[worst_pct])

        rows.append({
            "season": season,
            "week": week,
            "eliminated_rank": elim_rank,
            "eliminated_pct": elim_pct,
            "eliminated_real": real_elim,
            "disagree": disagree,
            "delta_F_rank_minus_pct": delta_F,
            "delta_J_rank_minus_pct": delta_J,
            "n_contestants": len(sub),
            "n_real_elims": len(real_elims),
        })

    week_results = pd.DataFrame(rows, columns=out_cols)
    return week_results


## Summary of Rank vs. Percent Comparison Results

This helper function takes the week-level comparison between the **Rank-Sum** and **Percent-Sum** rules and computes a set of global and per-season statistics for the writeup.

### Input

`week_results`: a DataFrame produced by `simulate_rank_vs_pct`, with one row per `(season, week)` containing at least:

- `season`, `week`
- `disagree` — whether Rank-Sum and Percent-Sum eliminate different contestants
- `delta_F_rank_minus_pct` — fan-share difference between the Rank loser and Percent loser
- `delta_J_rank_minus_pct` — judge-score difference between the Rank loser and Percent loser
- `eliminated_rank`, `eliminated_pct` — losers under each rule
- `eliminated_real` — actual eliminated contestant (only defined when there is exactly one elimination)

---

### Global Summary Metrics

The function first computes **aggregate statistics** across all weeks:

- `total_weeks`  
  Total number of (season, week) instances considered.

- `disagree_weeks`  
  Number of weeks where the Rank-Sum and Percent-Sum rules eliminate **different** contestants.

- `disagreement_rate`  
  Fraction of weeks with disagreement:
  \[
  \text{disagreement\_rate} = \frac{\text{disagree\_weeks}}{\text{total\_weeks}}.
  \]

- `avg_delta_F_rank_minus_pct`  
  Among weeks where the rules disagree, the average difference in fan share between the Rank-Sum loser and the Percent-Sum loser.  
  Positive values suggest the Rank-Sum rule tends to eliminate **more popular** contestants (higher fan share) than the Percent-Sum rule.

- `avg_delta_J_rank_minus_pct`  
  Among disagreement weeks, the average difference in judge scores between the Rank-Sum loser and Percent-Sum loser.  
  Positive values suggest Rank-Sum tends to eliminate contestants with **higher judge scores** than the Percent-Sum rule.

- `rank_match_real_rate`  
  Among weeks with exactly one actual elimination, the fraction of weeks in which the Rank-Sum loser matches the **actual** eliminated contestant.

- `pct_match_real_rate`  
  Similarly, the fraction of weeks where the Percent-Sum loser matches the actual eliminated contestant.

These global metrics are returned in a dictionary `summary_global`.

---

### Per-Season Breakdown

The function also produces a **per-season** summary table `per_season` with columns:

- `season`
- `weeks` — number of weeks observed for that season
- `disagree_weeks` — how many weeks the two rules disagreed
- `disagreement_rate` — fraction of weeks with disagreement in that season
- `avg_delta_F` — within-season average of `delta_F_rank_minus_pct`
- `avg_delta_J` — within-season average of `delta_J_rank_minus_pct`

This table is useful for an appendix or figure, showing in which seasons the choice of rule mattered most, and whether those disagreements systematically favor judges or fans.

---

### Return Values

The function returns:

```python
summary_global, per_season
```
- `summary_global`: a dictionary of headline numbers for the main text.
- `per_season`: a DataFrame suitable for tables/plots in the appendix or supporting analysis.

In [None]:
def summarize_week_results(week_results: pd.DataFrame):
    """
    Compute the key numbers you need for the writeup:
      - overall disagreement rate
      - average fan-share / judge-score difference when methods disagree
      - which method matches actual show eliminations more often
      - per-season breakdown

    Parameters
    ----------
    week_results : DataFrame
        One row per (season, week) with columns 'season', 'week', 'disagree',
        'delta_F_rank_minus_pct', 'delta_J_rank_minus_pct',
        'eliminated_rank', 'eliminated_pct' and 'eliminated_real'.
        An empty frame (with or without columns) is accepted.

    Returns
    -------
    summary_global : dict
        Keys: total_weeks, disagree_weeks, disagreement_rate,
        avg_delta_F_rank_minus_pct, avg_delta_J_rank_minus_pct,
        rank_match_real_rate, pct_match_real_rate. Rates and averages are
        NaN when there is nothing to average over.
    per_season : DataFrame
        Columns: season, weeks, disagree_weeks, disagreement_rate,
        avg_delta_F, avg_delta_J.
    """
    per_season_cols = [
        "season", "weeks", "disagree_weeks",
        "disagreement_rate", "avg_delta_F", "avg_delta_J",
    ]

    total_weeks = len(week_results)
    if total_weeks == 0:
        # Nothing was simulated: avoid 0/0 and lookups on missing columns.
        summary_global = {
            "total_weeks": 0,
            "disagree_weeks": 0,
            "disagreement_rate": np.nan,
            "avg_delta_F_rank_minus_pct": np.nan,
            "avg_delta_J_rank_minus_pct": np.nan,
            "rank_match_real_rate": np.nan,
            "pct_match_real_rate": np.nan,
        }
        return summary_global, pd.DataFrame(columns=per_season_cols)

    disagree_weeks = week_results["disagree"].sum()
    disagreement_rate = disagree_weeks / total_weeks

    # Only weeks where methods disagree
    mask_dis = week_results["disagree"]
    avg_delta_F = week_results.loc[mask_dis, "delta_F_rank_minus_pct"].mean()
    avg_delta_J = week_results.loc[mask_dis, "delta_J_rank_minus_pct"].mean()

    # Only weeks with exactly ONE real elimination (eliminated_real is set)
    real = week_results.loc[week_results["eliminated_real"].notna()]
    if len(real) > 0:
        rank_match = (real["eliminated_rank"] == real["eliminated_real"]).mean()
        pct_match = (real["eliminated_pct"] == real["eliminated_real"]).mean()
    else:
        rank_match = np.nan
        pct_match = np.nan

    summary_global = {
        "total_weeks": int(total_weeks),
        "disagree_weeks": int(disagree_weeks),
        "disagreement_rate": disagreement_rate,
        "avg_delta_F_rank_minus_pct": avg_delta_F,
        "avg_delta_J_rank_minus_pct": avg_delta_J,
        "rank_match_real_rate": rank_match,
        "pct_match_real_rate": pct_match,
    }

    # Per-season breakdown (nice table for appendix). The delta means skip
    # NaN, so they average over that season's disagreement weeks only.
    per_season = (
        week_results
        .groupby("season")
        .agg(
            weeks=("week", "count"),
            disagree_weeks=("disagree", "sum"),
            disagreement_rate=("disagree", "mean"),
            avg_delta_F=("delta_F_rank_minus_pct", "mean"),
            avg_delta_J=("delta_J_rank_minus_pct", "mean"),
        )
        .reset_index()
    )

    return summary_global, per_season


In [None]:
# ============================================================
# MAIN PIPELINE (adjust paths / fan_df construction as needed)
# ============================================================


# Load the fan-share model output; per the module notes this file holds one
# row per contestant-season-week.
df_fan = pd.read_csv("fan_shares_estimated.csv")

# Week-level rule comparison, then headline and per-season aggregates.
week_results = simulate_rank_vs_pct(df_fan)
summary_global, per_season = summarize_week_results(week_results)

# Headline numbers, one "name: value" line per metric.
print("Global summary:")
for k, v in summary_global.items():
    print(f"{k}: {v}")

# First rows of the per-season table.
print("\nPer-season disagreement summary (head):")
print(per_season.head())



## Question 2 Analysis: Rank vs. Percent Voting Systems

This module implements the full analysis pipeline for **Question 2 (Part 1)** of the MCM problem, focusing on how different methods of combining judges’ scores and fan votes affect weekly elimination outcomes.

It assumes that fan vote shares have already been reconstructed in **Question 1** and saved as `fan_shares_estimated.csv`.

---

### 1. Weekly Simulation of Voting Rules

For each season and week, the function `simulate_rank_vs_pct` applies **both** voting rules to the same set of active contestants:

- **Rank-Sum rule**  
  Contestants are ranked separately by judges’ total score and by fan vote share (1 = best).  
  The ranks are summed, and the contestant with the **largest** combined rank is eliminated.

- **Percent-Sum rule**  
  Judges’ scores are converted into weekly percentages and added to normalized fan vote shares.  
  The contestant with the **smallest** combined percentage is eliminated.

For every `(season, week)`, the function records:
- the predicted elimination under each rule,
- the actual elimination (if exactly one occurred),
- whether the two rules disagree,
- and the fan-share and judge-score differences between the two predicted losers (recorded only when the rules disagree).

---

### 2. Global and Per-Season Summaries

The function `summarize_week_results` aggregates the week-level results into interpretable statistics:

- **Disagreement rate**: how often Rank-Sum and Percent-Sum eliminate different contestants.
- **Average fan-share difference** and **judge-score difference** when the rules disagree.
- **Match rates**: how often each rule agrees with the actual show elimination (when uniquely defined).
- **Per-season breakdowns** showing which seasons exhibit the strongest rule conflicts.

These summaries provide quantitative evidence for comparing the two voting systems across the full history of the show.

---

### 3. Per-Season Accuracy Analysis

`compute_per_season_accuracy` measures, for each season:
- the accuracy of Rank-Sum,
- the accuracy of Percent-Sum,
- and the number of weeks with a well-defined real elimination.

This allows direct season-by-season comparison of how closely each method aligns with historical outcomes.

---

### 4. “Zombie” Contestant Identification

The function `compute_zombies` identifies **“zombie” contestants**:
> contestants who would be eliminated under the Percent-Sum rule (or tied for worst),  
> but who **survive** in the actual show.

These cases highlight systematic tension between fan-driven outcomes and judge-driven outcomes, and are especially relevant for controversial seasons.

---

### 5. Visualization Utilities

The plotting functions generate publication-ready figures:

- **Per-season accuracy curves** for Rank-Sum and Percent-Sum (with optional smoothing).
- **Accuracy gap plots** showing Percent − Rank performance by season.
- **Combined UMN-style figure**:
  - Top: Rank vs. Percent accuracy over seasons.
  - Bottom: Accuracy difference with highlighted zombie seasons and notable cases (e.g., Season 27).

These visualizations support clear interpretation and comparison in the final report.

---

### 6. Script Entry Point

Running the script end-to-end:

1. Loads `fan_shares_estimated.csv`.
2. Simulates Rank-Sum vs. Percent-Sum eliminations.
3. Prints global and per-season summaries.
4. Identifies zombie contestants (including known controversial figures).
5. Produces a combined comparison plot saved to disk.

Together, this module provides a complete, data-driven comparison of the two voting systems and directly addresses the core questions posed in **MCM Problem C – Question 2**.


In [None]:
"""
Analysis utilities for 2026 MCM Problem C Question 2 (Part 1).

This module assumes you have already run your Question 1 fan model and
saved its output as `fan_shares_estimated.csv`, with at least:

Columns (per row = contestant-season-week):
    - season           (int)
    - week             (int)
    - celebrity_name   (str)
    - judge_total      (float)  # total judge score for that week
    - fan_share_hat    (float)  # estimated fan share for that week
    - eliminated       (0/1)    # 1 if actually eliminated in this week, else 0

Core features:
    - Simulate rank-based vs percent-based elimination rules.
    - Compute disagreement rate and fan/judge differences.
    - Compute how often each method matches real eliminations.
    - Identify “zombie” contestants (should die under Percent, but survive).
    - Plot per-season accuracies and Percent–Rank accuracy gap.

Usage (as a script):
    python analysis_q2.py
"""

from __future__ import annotations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    # Optional smoothing if SciPy is available
    from scipy.interpolate import make_interp_spline
    HAVE_SCIPY = True
except ImportError:
    HAVE_SCIPY = False


# ============================================================
# 1. Core simulation: Rank vs Percent methods
# ============================================================

def simulate_rank_vs_pct(df: pd.DataFrame) -> pd.DataFrame:
    """
    Simulate rank-based and percent-based elimination rules week by week.

    Parameters
    ----------
    df : DataFrame
        Long-format table with columns:
          - 'season'
          - 'week'
          - 'celebrity_name'
          - 'judge_total'
          - 'fan_share_hat'
          - 'eliminated' (0/1); optional -- if the column is absent, every
            week is reported with no real elimination.

        Assumed: each row is a (season, week, contestant) where the contestant
        is still in the competition. (No post-elimination zero rows.)

    Returns
    -------
    week_results : DataFrame
        One row per (season, week) with:
          - 'season', 'week'
          - 'eliminated_rank'  : predicted loser under rank rule
          - 'eliminated_pct'   : predicted loser under percent rule
          - 'eliminated_real'  : actual loser (if exactly 1), else None
          - 'disagree'         : True if rank vs percent differ
          - 'delta_F_rank_minus_pct'
          - 'delta_J_rank_minus_pct'
          - 'n_contestants'
          - 'n_real_elims'
        Weeks with no usable rows, or whose fan shares / judge totals do not
        sum to a positive value, are omitted. The columns above are present
        even when the result has no rows.
    """
    result_columns = [
        "season", "week",
        "eliminated_rank", "eliminated_pct", "eliminated_real",
        "disagree",
        "delta_F_rank_minus_pct", "delta_J_rank_minus_pct",
        "n_contestants", "n_real_elims",
    ]
    has_elim_flag = "eliminated" in df.columns
    rows = []

    # group by season-week = set of contestants compared that week
    for (season, week), sub in df.groupby(["season", "week"]):
        # basic sanity: need both judge and fan info. Re-index 0..n-1 so the
        # idxmax/idxmin labels below identify exactly one row even when the
        # caller's index has repeated labels.
        sub = (
            sub.dropna(subset=["judge_total", "fan_share_hat"])
               .reset_index(drop=True)
        )
        if sub.empty:
            continue

        # normalize fan shares within week (just in case)
        total_fan = sub["fan_share_hat"].sum()
        if total_fan <= 0 or np.isnan(total_fan):
            continue

        judge_sum = sub["judge_total"].sum()
        if judge_sum <= 0:
            # If something degenerate happens, skip this week
            continue

        sub["F_norm"] = sub["fan_share_hat"] / total_fan

        # -------- Rank method --------
        # higher value => better (rank 1); ties share the smallest rank
        sub["r_judge"] = sub["judge_total"].rank(ascending=False, method="min")
        sub["r_fan"] = sub["F_norm"].rank(ascending=False, method="min")
        sub["S_rank"] = sub["r_judge"] + sub["r_fan"]

        # largest rank sum loses; idxmax picks the first row on ties
        worst_rank_idx = sub["S_rank"].idxmax()
        elim_rank = sub.at[worst_rank_idx, "celebrity_name"]

        # -------- Percent method --------
        sub["P_judge"] = sub["judge_total"] / judge_sum
        sub["P_fan"] = sub["F_norm"]
        sub["S_pct"] = sub["P_judge"] + sub["P_fan"]

        # smallest combined share loses; idxmin picks the first row on ties
        worst_pct_idx = sub["S_pct"].idxmin()
        elim_pct = sub.at[worst_pct_idx, "celebrity_name"]

        # -------- Real elimination --------
        if has_elim_flag:
            eliminated_rows = sub[sub["eliminated"] == 1]
            real_elims = eliminated_rows["celebrity_name"].tolist()
        else:
            real_elims = []
        eliminated_real = real_elims[0] if len(real_elims) == 1 else None

        # -------- Disagreement analysis --------
        disagree = bool(elim_rank != elim_pct)

        delta_F = np.nan
        delta_J = np.nan
        if disagree:
            delta_F = float(
                sub.at[worst_rank_idx, "F_norm"] - sub.at[worst_pct_idx, "F_norm"]
            )
            delta_J = float(
                sub.at[worst_rank_idx, "judge_total"]
                - sub.at[worst_pct_idx, "judge_total"]
            )

        rows.append({
            "season": season,
            "week": week,
            "eliminated_rank": elim_rank,
            "eliminated_pct": elim_pct,
            "eliminated_real": eliminated_real,
            "disagree": disagree,
            "delta_F_rank_minus_pct": delta_F,
            "delta_J_rank_minus_pct": delta_J,
            "n_contestants": len(sub),
            "n_real_elims": len(real_elims),
        })

    week_results = pd.DataFrame(rows, columns=result_columns)
    return week_results


# ============================================================
# 2. Global & per-season summaries (disagreement + match rates)
# ============================================================

def summarize_week_results(week_results: pd.DataFrame):
    """
    Compute global and per-season summary statistics from week_results.

    Parameters
    ----------
    week_results : DataFrame
        Output of simulate_rank_vs_pct(). May be empty, including an empty
        frame that carries no columns at all.

    Returns
    -------
    summary_global : dict
        {
            "total_weeks",
            "disagree_weeks",
            "disagreement_rate",
            "avg_delta_F_rank_minus_pct",
            "avg_delta_J_rank_minus_pct",
            "rank_match_real_rate",
            "pct_match_real_rate",
        }
        Rates and averages are NaN when there are no weeks to base them on.

    per_season_disagree : DataFrame
        Per-season disagreement statistics with columns
        season, weeks, disagree_weeks, disagreement_rate,
        avg_delta_F, avg_delta_J.
    """
    total_weeks = len(week_results)

    if total_weeks == 0:
        # An empty frame may lack the expected columns, so return the
        # "no data" summary before touching any of them.
        summary_global = {
            "total_weeks": 0,
            "disagree_weeks": 0,
            "disagreement_rate": np.nan,
            "avg_delta_F_rank_minus_pct": np.nan,
            "avg_delta_J_rank_minus_pct": np.nan,
            "rank_match_real_rate": np.nan,
            "pct_match_real_rate": np.nan,
        }
        per_season_disagree = pd.DataFrame(columns=[
            "season", "weeks", "disagree_weeks",
            "disagreement_rate", "avg_delta_F", "avg_delta_J",
        ])
        return summary_global, per_season_disagree

    disagree_weeks = week_results["disagree"].sum()
    disagreement_rate = disagree_weeks / total_weeks

    # Only disagreement weeks
    mask_dis = week_results["disagree"]
    avg_delta_F = week_results.loc[mask_dis, "delta_F_rank_minus_pct"].mean()
    avg_delta_J = week_results.loc[mask_dis, "delta_J_rank_minus_pct"].mean()

    # Only weeks with exactly 1 real elimination
    mask_real = week_results["eliminated_real"].notna()
    if mask_real.sum() > 0:
        actual = week_results.loc[mask_real, "eliminated_real"]
        rank_match = (
            week_results.loc[mask_real, "eliminated_rank"] == actual
        ).mean()
        pct_match = (
            week_results.loc[mask_real, "eliminated_pct"] == actual
        ).mean()
    else:
        rank_match = np.nan
        pct_match = np.nan

    summary_global = {
        "total_weeks": int(total_weeks),
        "disagree_weeks": int(disagree_weeks),
        "disagreement_rate": disagreement_rate,
        "avg_delta_F_rank_minus_pct": avg_delta_F,
        "avg_delta_J_rank_minus_pct": avg_delta_J,
        "rank_match_real_rate": rank_match,
        "pct_match_real_rate": pct_match,
    }

    # Per-season disagreement summary. The delta means skip NaN, i.e. they
    # average over that season's disagreement weeks only.
    per_season_disagree = (
        week_results
        .groupby("season")
        .agg(
            weeks=("week", "count"),
            disagree_weeks=("disagree", "sum"),
            disagreement_rate=("disagree", "mean"),
            avg_delta_F=("delta_F_rank_minus_pct", "mean"),
            avg_delta_J=("delta_J_rank_minus_pct", "mean"),
        )
        .reset_index()
    )

    return summary_global, per_season_disagree


def compute_per_season_accuracy(week_results: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-season accuracy of Rank vs Percent methods
    relative to actual eliminations.

    Only weeks whose 'eliminated_real' is set (exactly one real elimination)
    are scored; seasons without any such week do not appear in the output.

    Parameters
    ----------
    week_results : DataFrame
        Output of simulate_rank_vs_pct(). May be empty.

    Returns
    -------
    per_season_acc : DataFrame
        Sorted by season, with columns:
          - 'season'
          - 'rank_accuracy'  : share of scored weeks where the rank loser
                               equals the real loser
          - 'pct_accuracy'   : same for the percent loser
          - 'n_real_weeks'   : number of scored weeks
        The columns are present even when there is nothing to score.
    """
    columns = ["season", "rank_accuracy", "pct_accuracy", "n_real_weeks"]

    if week_results.empty:
        return pd.DataFrame(columns=columns)

    # Restrict to weeks with exactly 1 real elimination
    wr = week_results.loc[week_results["eliminated_real"].notna()]
    if wr.empty:
        return pd.DataFrame(columns=columns)

    # Per-week hit flags; aggregating these avoids groupby.apply, whose
    # handling of the grouping column is deprecated in recent pandas.
    hits = pd.DataFrame({
        "season": wr["season"],
        "rank_hit": wr["eliminated_rank"] == wr["eliminated_real"],
        "pct_hit": wr["eliminated_pct"] == wr["eliminated_real"],
    })

    per_season_acc = (
        hits.groupby("season")
        .agg(
            rank_accuracy=("rank_hit", "mean"),
            pct_accuracy=("pct_hit", "mean"),
            n_real_weeks=("rank_hit", "count"),
        )
        .reset_index()
        .sort_values("season")
    )
    return per_season_acc


# ============================================================
# 3. Zombie analysis (Percent loser who survives)
# ============================================================

def compute_zombies(df: pd.DataFrame, tol: float = 1e-3) -> pd.DataFrame:
    """
    Identify "zombie" contestants:
      - Under Percent rule, they are bottom (or effectively tied for bottom),
        but in reality they are NOT eliminated in that week.

    Within each (season, week) the combined score is
    judge_total / sum(judge_total) + fan_share_hat / sum(fan_share_hat),
    i.e. fan shares are re-normalised over the rows actually used, as in
    simulate_rank_vs_pct().

    Parameters
    ----------
    df : DataFrame
        Same input format as simulate_rank_vs_pct(); the 'eliminated'
        column is required here.
        Rows are used only if:
          - judge_total > 0 (treated as "active contestant"), and
          - fan_share_hat is not missing.
        Weeks with fewer than two usable rows, or whose fan shares do not
        sum to a positive value, are skipped.
    tol : float
        Numerical tolerance for considering someone "no better than" the
        theoretical percent loser.

    Returns
    -------
    zombies_df : DataFrame
        Columns (present even when no zombie is found):
          - 'season', 'week', 'celebrity_name'
          - 'judge_total', 'fan_share_hat' (as given, not re-normalised)
          - 'reason'
    """
    columns = ["season", "week", "celebrity_name",
               "judge_total", "fan_share_hat", "reason"]
    zombies = []

    for (season, week), sub in df.groupby(["season", "week"]):
        # A missing fan share would turn the week's minimum into NaN and hide
        # every zombie, so such rows are dropped first.
        sub = sub.dropna(subset=["judge_total", "fan_share_hat"])
        # Only active contestants (drop judge_total <= 0)
        sub = sub[sub["judge_total"] > 0]
        if len(sub) < 2:
            continue

        total_fan = sub["fan_share_hat"].sum()
        if total_fan <= 0 or np.isnan(total_fan):
            continue

        # Percent rule score
        s_pct = (
            sub["judge_total"] / sub["judge_total"].sum()
            + sub["fan_share_hat"] / total_fan
        )

        # Bottom = within tol of the lowest combined percent
        is_bottom = s_pct <= s_pct.min() + tol
        survived = sub["eliminated"] == 0

        flagged = sub.loc[
            is_bottom & survived,
            ["celebrity_name", "judge_total", "fan_share_hat"],
        ]
        for name, judge_total, fan_share in flagged.itertuples(index=False, name=None):
            zombies.append({
                "season": season,
                "week": week,
                "celebrity_name": name,
                "judge_total": judge_total,
                "fan_share_hat": fan_share,
                "reason": "Bottom by Percent but survived"
            })

    zombies_df = pd.DataFrame(zombies, columns=columns)
    return zombies_df


# ============================================================
# 4. Plotting utilities (per-season accuracy)
# ============================================================

def _smooth_xy(x: np.ndarray, y: np.ndarray,
               n_points: int = 300,
               clip_min: float | None = None,
               clip_max: float | None = None):
    """
    Helper: smooth (x, y) using cubic B-spline if SciPy available.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    if len(x) < 3 or not HAVE_SCIPY:
        return x, y

    x_new = np.linspace(x.min(), x.max(), n_points)
    try:
        spl = make_interp_spline(x, y, k=3)
        y_new = spl(x_new)
        if clip_min is not None or clip_max is not None:
            y_new = np.clip(
                y_new,
                clip_min if clip_min is not None else y_new.min(),
                clip_max if clip_max is not None else y_new.max()
            )
        return x_new, y_new
    except Exception:
        return x, y


def plot_accuracy_by_season(per_season_acc: pd.DataFrame,
                            filename: str = "method_accuracy_comparison_fancy.png"):
    """
    Plot per-season Rank vs Percent accuracies and save the figure.

    Each method is drawn as scatter points (the actual per-season values)
    plus a smoothed trend line, both in the method's own color.

    Parameters
    ----------
    per_season_acc : DataFrame
        Must contain the columns 'season', 'rank_accuracy', 'pct_accuracy'.
        When empty, a notice is printed and nothing is plotted.
    filename : str
        Path of the image file to write.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    seasons = per_season_acc["season"].values
    y_rank = per_season_acc["rank_accuracy"].values
    y_pct = per_season_acc["pct_accuracy"].values

    # --- Base figure style ---
    plt.figure(figsize=(14, 6))
    ax = plt.gca()
    ax.set_facecolor("#f8f9fa")  # light gray background

    # thinner, subtle grid
    ax.grid(True, which="both", axis="both", alpha=0.2, linestyle="--", linewidth=0.7)

    # small helper: smoothed line + points, both drawn in `base_color`
    def _plot_series(x, y, label, marker, base_color):
        # Scatter (actual points)
        plt.scatter(x, y,
                    s=60,
                    marker=marker,
                    color=base_color,
                    edgecolor="white",
                    linewidth=1.0,
                    alpha=0.9,
                    label=f"{label} (data)")

        # Smoothed trend line, clipped to the valid accuracy range [0, 1]
        x_s, y_s = _smooth_xy(x, y, n_points=300, clip_min=0.0, clip_max=1.0)
        plt.plot(x_s, y_s,
                 color=base_color,
                 linewidth=2.5,
                 alpha=0.9,
                 label=f"{label} (trend)")

    # --- Plot both methods ---
    _plot_series(seasons, y_rank, "Rank Sum Accuracy", "o", "#e74c3c")
    _plot_series(seasons, y_pct, "Percent Sum Accuracy", "s", "#3498db")

    # Horizontal reference lines at 0.5, 0.75, 1.0
    for y_ref in [0.5, 0.75, 1.0]:
        plt.axhline(y_ref, color="gray", linestyle=":", linewidth=0.6, alpha=0.5)

    # Titles & labels
    plt.title("Model Agreement with Reality by Season\nRank Sum vs Percent Sum",
              fontsize=16, fontweight="bold", pad=10)
    plt.xlabel("Season", fontsize=12)
    plt.ylabel("Accuracy (Match Rate)", fontsize=12)

    # X ticks: every second season
    plt.xticks(np.arange(seasons.min(), seasons.max() + 1, 2))

    # Legend styling
    leg = plt.legend(frameon=True, fontsize=10)
    leg.get_frame().set_facecolor("white")
    leg.get_frame().set_edgecolor("#cccccc")
    leg.get_frame().set_alpha(0.9)

    # Small annotation to explain
    plt.text(seasons.min() + 0.2, 0.95,
             "Higher = method matches actual eliminations more often",
             fontsize=9, color="#444444")

    plt.tight_layout()
    plt.savefig(filename, dpi=250)
    plt.close()
    print(f"Saved fancy plot: {filename}")



def plot_accuracy_diff_smooth(per_season_acc: pd.DataFrame,
                              filename: str = "method_comparison_diff_fancy.png"):
    """
    Draw and save the smoothed per-season accuracy gap:
        gap(season) = Percent accuracy - Rank accuracy.

    Area above zero (blue) -> Percent agrees with reality more often.
    Area below zero (red)  -> Rank agrees with reality more often.

    Reads the 'season', 'pct_accuracy' and 'rank_accuracy' columns of
    per_season_acc. With an empty frame it prints a notice and returns
    without plotting; otherwise the figure is written to `filename`.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    season_ids = per_season_acc["season"].values
    gap = per_season_acc["pct_accuracy"].values - per_season_acc["rank_accuracy"].values
    curve_x, curve_y = _smooth_xy(season_ids, gap,
                                  n_points=300, clip_min=-0.5, clip_max=0.5)
    first_season = season_ids.min()

    fig = plt.figure(figsize=(14, 6))
    ax = plt.gca()
    ax.set_facecolor("#f8f9fa")
    ax.grid(True, which="both", axis="both", alpha=0.2, linestyle="--", linewidth=0.7)

    # Zero baseline, then the smoothed gap in a neutral color
    ax.axhline(0, color="#333333", linewidth=1.0, linestyle="--", alpha=0.7)
    ax.plot(curve_x, curve_y, color="#555555", linewidth=1.8, alpha=0.8,
            label="Smoothed gap")

    # Shade the two regions: Percent ahead (>= 0) and Rank ahead (<= 0)
    shaded_regions = (
        (curve_y >= 0, "#5dade2", 0.55, "Percent Sum better"),
        (curve_y <= 0, "#e74c3c", 0.45, "Rank Sum better"),
    )
    for region, shade, opacity, caption in shaded_regions:
        ax.fill_between(curve_x, 0, curve_y, where=region,
                        interpolate=True, color=shade, alpha=opacity,
                        label=caption)

    # Actual per-season gaps, colored by sign
    point_colors = np.where(gap >= 0, "#5dade2", "#e74c3c").tolist()
    ax.scatter(season_ids, gap,
               c=point_colors,
               s=60,
               edgecolor="white",
               linewidth=1.0,
               zorder=5)

    # Titles & labels
    ax.set_title("Performance Gap by Season\nPercent Sum vs Rank Sum",
                 fontsize=16, fontweight="bold", pad=10)
    ax.set_xlabel("Season", fontsize=12)
    ax.set_ylabel("Accuracy Difference\n(Percent Accuracy − Rank Accuracy)", fontsize=11)

    # Tick every second season
    ax.set_xticks(np.arange(first_season, season_ids.max() + 1, 2))

    # Fixed y-range; the hint texts are anchored just inside its edges
    y_low, y_high = -0.5, 0.5
    ax.set_ylim(y_low, y_high)

    ax.text(first_season + 0.2, y_high - 0.05,
            "↑ Percent Sum matches reality more often",
            fontsize=9, color="#1f618d", va="top")
    ax.text(first_season + 0.2, y_low + 0.05,
            "↓ Rank Sum matches reality more often",
            fontsize=9, color="#922b21", va="bottom")

    # Highlight season 27 when it is part of the data
    hits_27 = np.flatnonzero(season_ids == 27)
    if hits_27.size:
        s27 = 27
        d27 = gap[hits_27[0]]
        ax.scatter([s27], [d27],
                   s=90, edgecolor="black", facecolor="yellow", zorder=10)
        ax.annotate(f"S27: {d27:+.2f}",
                    xy=(s27, d27),
                    xytext=(s27 + 0.5, d27 + 0.1),
                    fontsize=9,
                    arrowprops=dict(arrowstyle="->", color="black"),
                    bbox=dict(boxstyle="round,pad=0.2",
                              fc="white", ec="#555555", alpha=0.9))

    # Legend
    legend = ax.legend(frameon=True, fontsize=10, loc="upper right")
    legend_box = legend.get_frame()
    legend_box.set_facecolor("white")
    legend_box.set_edgecolor("#cccccc")
    legend_box.set_alpha(0.9)

    fig.tight_layout()
    fig.savefig(filename, dpi=250)
    plt.close(fig)
    print(f"Saved fancy plot: {filename}")


import matplotlib.pyplot as plt
import numpy as np

# UMN maroon & gold theme colors (hex strings). plot_combined_umn() draws
# the Rank series in maroon and the Percent series in gold.
UMN_MAROON = "#7A0019"
UMN_GOLD   = "#FFCC33"

def plot_combined_umn(per_season_acc: pd.DataFrame,
                      zombies_df: pd.DataFrame,
                      filename: str = "q2_combined_umn.png"):
    """
    Create a combined 2x1 subplot figure with UMN maroon & gold theme:

      Top  subplot: per-season accuracy of Rank vs Percent (lines + points).
      Bottom subplot: accuracy gap (Percent - Rank) with shaded regions,
                      plus markers for seasons with zombies.

    When per_season_acc is empty a notice is printed and nothing is
    plotted; otherwise the figure is saved to `filename` and closed.

    Parameters
    ----------
    per_season_acc : DataFrame
        Output of compute_per_season_accuracy(), with columns:
          - 'season'
          - 'rank_accuracy'
          - 'pct_accuracy'
        Its index may be arbitrary; rows are addressed by position.
    zombies_df : DataFrame
        Output of compute_zombies(), with column:
          - 'season'
        An empty frame (or one without a 'season' column) means no
        zombie markers are drawn.
    filename : str
        Output PNG name.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    seasons = per_season_acc["season"].values
    y_rank = per_season_acc["rank_accuracy"].values
    y_pct = per_season_acc["pct_accuracy"].values

    # Accuracy difference: Percent - Rank
    diff = y_pct - y_rank
    x_s, diff_s = _smooth_xy(seasons, diff,
                             n_points=300, clip_min=-0.5, clip_max=0.5)

    # Season -> position in `seasons` / `diff` (first occurrence wins).
    # `diff` is a plain NumPy array, so it must be indexed by position and
    # never by a DataFrame index label.
    season_pos: dict = {}
    for pos, season in enumerate(seasons.tolist()):
        season_pos.setdefault(season, pos)

    # Seasons that have at least one zombie
    if zombies_df.empty or "season" not in zombies_df.columns:
        zombie_seasons = []
    else:
        zombie_seasons = sorted(zombies_df["season"].unique())

    # ----------------- Create figure & axes -----------------
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(14, 10),
        sharex=True,
        gridspec_kw={"height_ratios": [3, 2]}
    )

    # light gray background
    for ax in (ax1, ax2):
        ax.set_facecolor("#f8f9fa")
        ax.grid(True, which="both", axis="both",
                alpha=0.2, linestyle="--", linewidth=0.7)

    # ===================== TOP: ACCURACIES =====================
    # Smooth lines, clipped to the valid accuracy range [0, 1]
    x_rank_s, y_rank_s = _smooth_xy(seasons, y_rank,
                                    n_points=300, clip_min=0.0, clip_max=1.0)
    x_pct_s, y_pct_s = _smooth_xy(seasons, y_pct,
                                  n_points=300, clip_min=0.0, clip_max=1.0)

    ax1.plot(x_rank_s, y_rank_s,
             color=UMN_MAROON, linewidth=2.5,
             label="Rank Sum Accuracy (trend)")
    ax1.plot(x_pct_s, y_pct_s,
             color=UMN_GOLD, linewidth=2.5,
             label="Percent Sum Accuracy (trend)")

    # Scatter actual points
    ax1.scatter(seasons, y_rank,
                s=55, marker="o", color=UMN_MAROON,
                edgecolor="white", linewidth=1.0,
                label="Rank (data)")
    ax1.scatter(seasons, y_pct,
                s=55, marker="s", color=UMN_GOLD,
                edgecolor="white", linewidth=1.0,
                label="Percent (data)")

    # Reference lines at 0.5, 0.75, 1.0
    for y_ref in [0.5, 0.75, 1.0]:
        ax1.axhline(y_ref, color="gray", linestyle=":", linewidth=0.6, alpha=0.5)

    ax1.set_title(
        "Model Agreement with Reality by Season\n"
        "Rank (Maroon) vs Percent (Gold)",
        fontsize=16, fontweight="bold", pad=10
    )
    ax1.set_ylabel("Accuracy (Match Rate)", fontsize=12)

    leg1 = ax1.legend(frameon=True, fontsize=10, loc="lower left")
    leg1.get_frame().set_facecolor("white")
    leg1.get_frame().set_edgecolor("#cccccc")
    leg1.get_frame().set_alpha(0.9)

    ax1.text(seasons.min() + 0.3, 0.96,
             "Higher = method matches actual eliminations more often",
             fontsize=9, color="#444444")

    # ================== BOTTOM: DIFF + ZOMBIES ==================
    ax2.axhline(0, color="#555555", linewidth=1.0,
                linestyle="--", alpha=0.7)

    # Smooth central curve
    ax2.plot(x_s, diff_s, color="#555555",
             linewidth=1.8, alpha=0.9, label="Smoothed gap")

    # Fill regions: Percent better vs Rank better
    ax2.fill_between(x_s, 0, diff_s, where=(diff_s >= 0),
                     interpolate=True, color=UMN_GOLD, alpha=0.55,
                     label="Percent Sum better")
    ax2.fill_between(x_s, 0, diff_s, where=(diff_s <= 0),
                     interpolate=True, color=UMN_MAROON, alpha=0.45,
                     label="Rank Sum better")

    # Scatter actual season diffs
    colors = [UMN_GOLD if d >= 0 else UMN_MAROON for d in diff]
    ax2.scatter(seasons, diff,
                c=colors, s=55, edgecolor="white", linewidth=1.0, zorder=5)

    # Mark zombie seasons with stars on the diff value (or near zero if diff is tiny)
    for s in zombie_seasons:
        pos = season_pos.get(s)
        if pos is None:
            # Zombie season without accuracy data: nothing to anchor the star to
            continue
        d_val = diff[pos]
        # If diff is very small, lift the star slightly so it's visible
        y_star = d_val + (0.03 if abs(d_val) < 0.03 else 0.0)
        ax2.scatter(s, y_star,
                    marker="*", s=150,
                    color="#000000", edgecolor="white",
                    linewidth=1.0, zorder=10)
    if zombie_seasons:
        ax2.text(seasons.min() + 0.3, 0.46,
                 "★ Seasons with 'zombie' contestants\n"
                 "(bottom by Percent but not eliminated)",
                 fontsize=9, color="#000000")

    # Annotate Percent better / Rank better regions
    ymin, ymax = -0.5, 0.5
    ax2.set_ylim(ymin, ymax)

    ax2.text(seasons.min() + 0.3, ymax - 0.05,
             "↑ Percent Sum matches reality more often",
             fontsize=9, color="#555555", va="top")
    ax2.text(seasons.min() + 0.3, ymin + 0.05,
             "↓ Rank Sum matches reality more often",
             fontsize=9, color="#555555", va="bottom")

    # Optional: highlight Season 27 if present
    pos27 = season_pos.get(27)
    if pos27 is not None:
        s27 = 27
        d27 = diff[pos27]
        ax2.scatter([s27], [d27],
                    s=140, edgecolor="black",
                    facecolor="#ffff88", zorder=11)
        ax2.annotate(f"S27: {d27:+.2f}",
                     xy=(s27, d27),
                     xytext=(s27 + 0.6, d27 + 0.12),
                     fontsize=9,
                     arrowprops=dict(arrowstyle="->", color="black"),
                     bbox=dict(boxstyle="round,pad=0.2",
                               fc="white", ec="#555555", alpha=0.9))

    ax2.set_xlabel("Season", fontsize=12)
    ax2.set_ylabel("Accuracy Difference\n(Percent − Rank)", fontsize=11)
    ax2.set_xticks(np.arange(seasons.min(), seasons.max() + 1, 2))

    leg2 = ax2.legend(frameon=True, fontsize=10, loc="lower right")
    leg2.get_frame().set_facecolor("white")
    leg2.get_frame().set_edgecolor("#cccccc")
    leg2.get_frame().set_alpha(0.9)

    fig.tight_layout()
    fig.savefig(filename, dpi=260)
    plt.close(fig)
    print(f"Saved combined UMN-style plot: {filename}")

# ============================================================
# 5. Script entry point: tie everything together
# ============================================================

def main():
    """
    Run the Question 2 pipeline end to end.

    Reads fan_shares_estimated.csv from the working directory, simulates
    the Rank and Percent rules, prints the global and per-season
    disagreement summaries and the zombie list, and writes the combined
    accuracy figure to q2_combined_umn.png. If the CSV is missing, a hint
    is printed and the function returns without doing anything else.
    """
    try:
        df = pd.read_csv("fan_shares_estimated.csv")
    except FileNotFoundError:
        print("Could not find fan_shares_estimated.csv. "
              "Please run your Q1 model first.")
        return

    # 1) Simulate rank vs percent
    week_results = simulate_rank_vs_pct(df)

    # 2) Global + per-season disagreement summary
    summary_global, per_season_disagree = summarize_week_results(week_results)
    print("=== Global summary (Rank vs Percent) ===")
    for k, v in summary_global.items():
        print(f"{k}: {v}")

    print("\n=== Per-season disagreement summary (head) ===")
    print(per_season_disagree.head().to_string(index=False))

    # 3) Zombie analysis (Percent losers who survived)
    zombies_df = compute_zombies(df, tol=1e-3)
    print("\n=== Zombies under Percent rule (head) ===")
    print(zombies_df.head(10).to_string(index=False))

    # Optionally highlight specific controversial contestants.
    # When no zombie was found the frame may have no columns at all, so
    # only filter by name if the column actually exists.
    targets = ["Jerry Rice", "Billy Ray Cyrus", "Bristol Palin", "Bobby Bones"]
    if "celebrity_name" in zombies_df.columns:
        z_targets = zombies_df[zombies_df["celebrity_name"].isin(targets)]
        if not z_targets.empty:
            print("\n=== Zombies among controversial contestants ===")
            print(z_targets.to_string(index=False))

    # 4) Per-season accuracy and plots
    per_season_acc = compute_per_season_accuracy(week_results)
    plot_combined_umn(per_season_acc, zombies_df,
                      filename="q2_combined_umn.png")


# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


## Question 2 Analysis: Comparing Rank, Percent, and BT+Judge Rules

This module implements the full analysis pipeline for **MCM 2026 Problem C – Question 2 (Part 1)**, assuming that fan vote shares from Question 1 have already been estimated and saved in `fan_shares_estimated.csv`.

Each row of `fan_shares_estimated.csv` is a `(season, week, contestant)` record with:
- `season`, `week`, `celebrity_name`
- `judge_total` — total judges’ score for that week
- `fan_share_hat` — reconstructed fan vote share
- `eliminated` — 1 if the contestant was eliminated that week, 0 otherwise

The module provides:

- **Simulation of three elimination rules** for every `(season, week)`:
  - **Rank-Sum**: rank by judges and fans, eliminate the worst combined rank.
  - **Percent-Sum**: add judge percentage + fan share, eliminate the lowest total.
  - **Bottom-Two + Judge (BT+J)**: take bottom two by Percent-Sum, then eliminate the one with the lower judge score.

- **Global and per-season summaries**:
  - Disagreement rate between Rank-Sum and Percent-Sum.
  - Average fan-share and judge-score gaps when the rules disagree.
  - Match rates of Rank, Percent, and BT+J against the actual historical eliminations.

- **Zombie analysis**:
  - Identification of “zombie” contestants who would be eliminated by Percent-Sum but survive in the real show.

- **Controversy scores**:
  - Per-contestant judge–fan rank disagreement across weeks.

- **Placement comparisons**:
  - For each contestant in each season, compute final placement under:
    - observed show,
    - Rank-Sum,
    - Percent-Sum,
    - BT+Judge.

- **Publication-ready plots**:
  - Season-by-season accuracy gaps relative to Rank-Sum.
  - Overall mean accuracy (with error bars) for all three rules.
  - Placement comparison for selected “extraordinary” contestants (e.g., Jerry Rice, Billy Ray Cyrus, Bristol Palin, Bobby Bones).

To run the full pipeline as a script:

```bash
python analysis_q2.py
```


In [None]:

---

### Polished `analysis_q2.py`

```python
"""
analysis_q2.py

Analysis utilities for 2026 MCM Problem C – Question 2 (Part 1).

This module assumes you have already run your Question 1 fan model and
saved its output as `fan_shares_estimated.csv`, with at least:

Columns (per row = contestant-season-week):
    - season           (int)
    - week             (int)
    - celebrity_name   (str)
    - judge_total      (float)  # total judge score for that week
    - fan_share_hat    (float)  # estimated fan share for that week
    - eliminated       (0/1)    # 1 if actually eliminated in this week, else 0

Core features:
    - Simulate rank-based, percent-based, and Bottom-Two+Judge rules.
    - Compute disagreement rates and fan/judge differences.
    - Compute how often each method matches real eliminations.
    - Identify “zombie” contestants (bottom under Percent, but not eliminated).
    - Compute placements under each rule and compare controversial cases.
    - Plot per-season accuracies and accuracy gaps across rules.

Usage (as a script):
    python analysis_q2.py
"""

from __future__ import annotations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    # Optional smoothing if SciPy is available
    from scipy.interpolate import make_interp_spline

    HAVE_SCIPY = True
except ImportError:  # pragma: no cover - optional dependency
    HAVE_SCIPY = False


# ======================================================================
# 1. Core simulation: Rank vs Percent vs Bottom-two + Judge (BT+J)
# ======================================================================


def simulate_all_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Simulate three elimination mechanisms week by week:

      - Rank rule    -> eliminated_rank
      - Percent rule -> eliminated_pct
      - BT+J rule    -> eliminated_btj
        (Bottom-two by Percent, then judges eliminate the lower judge_total)

    Also records the observed elimination (if exactly one).

    Weeks are skipped when no row has both scores, when the fan shares do
    not sum to a positive number, or when the judge totals do not sum to
    a positive number. Ties are broken in favour of the row that appears
    first in the input.

    Parameters
    ----------
    df : DataFrame
        Must contain:
          - 'season', 'week', 'celebrity_name'
          - 'judge_total', 'fan_share_hat'
        May contain:
          - 'eliminated' (1 = eliminated that week). When the column is
            absent, every week is treated as having no observed
            elimination.

        Each row should be a (season, week, contestant) where the contestant
        is still active in that week. The index may be arbitrary.

    Returns
    -------
    week_results : DataFrame
        One row per (season, week) with:
          - 'season', 'week'
          - 'eliminated_rank'
          - 'eliminated_pct'
          - 'eliminated_btj'
          - 'eliminated_real'     (None unless exactly one real elimination)
          - 'disagree_rank_pct'   (Rank vs Percent)
          - 'disagree_pct_btj'    (Percent vs BT+J)
          - 'delta_F_rank_minus_pct'
          - 'delta_J_rank_minus_pct'
          - 'n_contestants'
          - 'n_real_elims'
        The frame has no columns when no week could be simulated.
    """
    rows: list[dict] = []
    has_elim_flag = "eliminated" in df.columns

    for (season, week), sub in df.groupby(["season", "week"]):
        # Drop rows missing key info. The fresh 0..n-1 index guarantees
        # that the idxmax/idxmin labels below identify exactly one row,
        # even if the caller's frame has duplicate index labels.
        sub = sub.dropna(subset=["judge_total", "fan_share_hat"]).reset_index(
            drop=True
        )
        if sub.empty:
            continue

        # Normalize fan shares within this week (safety)
        total_fan = sub["fan_share_hat"].sum()
        if total_fan <= 0 or np.isnan(total_fan):
            continue

        judge_sum = sub["judge_total"].sum()
        if judge_sum <= 0:
            # Degenerate week, skip
            continue

        sub["F_norm"] = sub["fan_share_hat"] / total_fan

        # ---------------- Rank rule ----------------
        # Rank 1 = highest judge_total / highest fan share; the contestant
        # with the largest rank sum is eliminated.
        sub["r_judge"] = sub["judge_total"].rank(ascending=False, method="min")
        sub["r_fan"] = sub["F_norm"].rank(ascending=False, method="min")
        sub["S_rank"] = sub["r_judge"] + sub["r_fan"]

        idx_rank = sub["S_rank"].idxmax()
        elim_rank = sub.loc[idx_rank, "celebrity_name"]

        # ---------------- Percent rule ----------------
        sub["P_judge"] = sub["judge_total"] / judge_sum
        sub["S_pct"] = sub["P_judge"] + sub["F_norm"]

        idx_pct = sub["S_pct"].idxmin()
        elim_pct = sub.loc[idx_pct, "celebrity_name"]

        # ---------------- Bottom-two + Judge (BT+J) ----------------
        # Bottom two by Percent score. A stable sort keeps input order on
        # ties, so the Percent loser is always one of the two.
        bottom2 = sub.sort_values(
            "S_pct", ascending=True, kind="mergesort"
        ).head(2)

        # Judges eliminate the one with LOWER judge_total
        idx_btj = bottom2["judge_total"].idxmin()
        elim_btj = sub.loc[idx_btj, "celebrity_name"]

        # ---------------- Real elimination ----------------
        if has_elim_flag:
            real_elims = sub.loc[
                sub["eliminated"] == 1, "celebrity_name"
            ].tolist()
        else:
            real_elims = []
        eliminated_real = real_elims[0] if len(real_elims) == 1 else None

        # ---------------- Disagreement stats ----------------
        disagree_rank_pct = bool(elim_rank != elim_pct)
        disagree_pct_btj = bool(elim_pct != elim_btj)

        # Gaps (Rank victim minus Percent victim); only defined when the
        # two rules pick different contestants.
        delta_F = np.nan
        delta_J = np.nan
        if disagree_rank_pct:
            delta_F = sub.loc[idx_rank, "F_norm"] - sub.loc[idx_pct, "F_norm"]
            delta_J = (
                sub.loc[idx_rank, "judge_total"]
                - sub.loc[idx_pct, "judge_total"]
            )

        rows.append(
            {
                "season": season,
                "week": week,
                "eliminated_rank": elim_rank,
                "eliminated_pct": elim_pct,
                "eliminated_btj": elim_btj,
                "eliminated_real": eliminated_real,
                "disagree_rank_pct": disagree_rank_pct,
                "disagree_pct_btj": disagree_pct_btj,
                "delta_F_rank_minus_pct": delta_F,
                "delta_J_rank_minus_pct": delta_J,
                "n_contestants": len(sub),
                "n_real_elims": len(real_elims),
            }
        )

    return pd.DataFrame(rows)


# ======================================================================
# 2. Global & per-season summaries (disagreement + match rates)
# ======================================================================


def summarize_week_results(
    week_results: pd.DataFrame,
) -> tuple[dict, pd.DataFrame]:
    """
    Compute global and per-season summary statistics from week_results.

    Parameters
    ----------
    week_results : DataFrame
        Output of simulate_all_methods(). May be empty (including the
        column-less frame produced when no week could be simulated).

    Returns
    -------
    summary_global : dict
        {
            "total_weeks",
            "disagree_weeks",
            "disagreement_rate",
            "avg_delta_F_rank_minus_pct",
            "avg_delta_J_rank_minus_pct",
            "rank_match_real_rate",
            "pct_match_real_rate",
        }
        For empty input the two counts are 0 and every rate/average is NaN.

    per_season_disagree : DataFrame
        Per-season disagreement statistics (Rank vs Percent) with columns
        season, weeks, disagree_weeks, disagreement_rate, avg_delta_F,
        avg_delta_J. Empty (but with these columns) for empty input.
    """
    # Empty input may carry no columns at all, so bail out before any
    # column access.
    if week_results.empty:
        empty_summary = {
            "total_weeks": 0,
            "disagree_weeks": 0,
            "disagreement_rate": np.nan,
            "avg_delta_F_rank_minus_pct": np.nan,
            "avg_delta_J_rank_minus_pct": np.nan,
            "rank_match_real_rate": np.nan,
            "pct_match_real_rate": np.nan,
        }
        empty_per_season = pd.DataFrame(
            columns=[
                "season",
                "weeks",
                "disagree_weeks",
                "disagreement_rate",
                "avg_delta_F",
                "avg_delta_J",
            ]
        )
        return empty_summary, empty_per_season

    total_weeks = len(week_results)

    # Use the Rank vs Percent disagreement flag
    disagree_col = "disagree_rank_pct"

    disagree_weeks = week_results[disagree_col].sum()
    disagreement_rate = disagree_weeks / total_weeks

    # Fan / judge gaps are averaged over disagreement weeks only
    mask_dis = week_results[disagree_col]
    avg_delta_F = week_results.loc[mask_dis, "delta_F_rank_minus_pct"].mean()
    avg_delta_J = week_results.loc[mask_dis, "delta_J_rank_minus_pct"].mean()

    # Match rates use only weeks with exactly one real elimination
    mask_real = week_results["eliminated_real"].notna()
    if mask_real.sum() > 0:
        real = week_results.loc[mask_real, "eliminated_real"]
        rank_match = (
            week_results.loc[mask_real, "eliminated_rank"] == real
        ).mean()
        pct_match = (
            week_results.loc[mask_real, "eliminated_pct"] == real
        ).mean()
    else:
        rank_match = np.nan
        pct_match = np.nan

    summary_global = {
        "total_weeks": int(total_weeks),
        "disagree_weeks": int(disagree_weeks),
        "disagreement_rate": disagreement_rate,
        "avg_delta_F_rank_minus_pct": avg_delta_F,
        "avg_delta_J_rank_minus_pct": avg_delta_J,
        "rank_match_real_rate": rank_match,
        "pct_match_real_rate": pct_match,
    }

    # Per-season disagreement summary (Rank vs Percent)
    per_season_disagree = (
        week_results.groupby("season")
        .agg(
            weeks=("week", "count"),
            disagree_weeks=(disagree_col, "sum"),
            disagreement_rate=(disagree_col, "mean"),
            avg_delta_F=("delta_F_rank_minus_pct", "mean"),
            avg_delta_J=("delta_J_rank_minus_pct", "mean"),
        )
        .reset_index()
    )

    return summary_global, per_season_disagree


def compute_per_season_accuracy(week_results: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-season accuracy of Rank vs Percent vs BT+J methods
    relative to actual eliminations.

    Only weeks with a non-null 'eliminated_real' (i.e. exactly one observed
    elimination) are scored.

    Parameters
    ----------
    week_results : DataFrame
        Output of simulate_all_methods(). May be empty (including the
        column-less frame produced when no week could be simulated).

    Returns
    -------
    per_season_acc : DataFrame
        Columns:
          - 'season'
          - 'rank_accuracy'
          - 'pct_accuracy'
          - 'btj_accuracy'
          - 'n_real_weeks'
        Sorted by season; empty (but with these columns) when there is no
        scorable week.
    """
    columns = [
        "season",
        "rank_accuracy",
        "pct_accuracy",
        "btj_accuracy",
        "n_real_weeks",
    ]

    # Empty input may carry no columns at all, so check before any
    # column access.
    if week_results.empty:
        return pd.DataFrame(columns=columns)

    # Use only weeks with exactly one observed elimination
    wr = week_results[week_results["eliminated_real"].notna()].copy()
    if wr.empty:
        return pd.DataFrame(columns=columns)

    wr["rank_correct"] = wr["eliminated_rank"] == wr["eliminated_real"]
    wr["pct_correct"] = wr["eliminated_pct"] == wr["eliminated_real"]
    wr["btj_correct"] = wr["eliminated_btj"] == wr["eliminated_real"]

    per_season_acc = (
        wr.groupby("season")
        .agg(
            rank_accuracy=("rank_correct", "mean"),
            pct_accuracy=("pct_correct", "mean"),
            btj_accuracy=("btj_correct", "mean"),
            n_real_weeks=("week", "size"),
        )
        .reset_index()
        .sort_values("season")
    )

    return per_season_acc


# ======================================================================
# 3. Zombie analysis (Percent loser who survives)
# ======================================================================


def compute_zombies(df: pd.DataFrame, tol: float = 1e-3) -> pd.DataFrame:
    """
    Identify 'zombie' contestants:
      - Under Percent rule (S_pct) they are bottom (or tied at bottom),
      - But in reality they are NOT eliminated that week.

    Per (season, week), only rows with judge_total > 0 and a non-missing
    fan_share_hat are considered, and weeks with fewer than two such rows
    are skipped. The Percent score is
    judge_total / sum(judge_total) + fan_share_hat, using the fan share
    as given (it is not re-normalized here).

    Parameters
    ----------
    df : DataFrame
        Must have: 'season', 'week', 'celebrity_name',
                   'judge_total', 'fan_share_hat', 'eliminated'
    tol : float
        Numerical tolerance when comparing scores: anyone within `tol` of
        the lowest Percent score counts as "bottom".

    Returns
    -------
    zombies_df : DataFrame
        Columns: season, week, celebrity_name, judge_total, fan_share_hat, reason
        The columns are present even when no zombie is found.
    """
    columns = [
        "season",
        "week",
        "celebrity_name",
        "judge_total",
        "fan_share_hat",
        "reason",
    ]
    zombies: list[dict] = []

    for (season, week), sub in df.groupby(["season", "week"]):
        # Only active contestants (judge_total > 0). Rows without a fan
        # share are dropped as well: a single NaN would otherwise make the
        # week's minimum NaN and silently disable detection for everyone.
        active = sub[sub["judge_total"] > 0].dropna(subset=["fan_share_hat"])
        if len(active) < 2:
            continue

        judge = active["judge_total"].to_numpy()
        fan = active["fan_share_hat"].to_numpy()

        # Percent score and the set of contestants at (or tied with) the bottom
        s_pct = judge / judge.sum() + fan
        at_bottom = s_pct <= s_pct.min() + tol

        # Real elimination(s) that week
        real_elims = active.loc[active["eliminated"] == 1, "celebrity_name"]
        survived = ~active["celebrity_name"].isin(real_elims).to_numpy()

        flagged = active[at_bottom & survived]
        for name, judge_total, fan_share in zip(
            flagged["celebrity_name"],
            flagged["judge_total"],
            flagged["fan_share_hat"],
        ):
            zombies.append(
                {
                    "season": season,
                    "week": week,
                    "celebrity_name": name,
                    "judge_total": judge_total,
                    "fan_share_hat": fan_share,
                    "reason": "Bottom by Percent but survived",
                }
            )

    # Explicit columns so callers can index by name even with zero rows
    return pd.DataFrame(zombies, columns=columns)


# ======================================================================
# 4. Judge–fan controversy measure
# ======================================================================


def compute_controversy(df: pd.DataFrame) -> pd.DataFrame:
    """
    Score how strongly judges and fans disagree about each contestant.

    Within every (season, week), rows that have both scores are ranked:
        R_J = rank of judge_total   (1 = highest, ties share the best rank)
        R_F = rank of fan_share_hat (1 = highest, ties share the best rank)
        D   = R_J - R_F

    The weekly values are then aggregated per (season, celebrity_name):
        mean_abs_D   = mean of |D|
        max_abs_D    = max of |D|
        weeks_active = number of ranked weeks for that contestant

    Returns a DataFrame with columns season, celebrity_name, mean_abs_D,
    max_abs_D, weeks_active; it is empty (with those columns) when no row
    can be ranked.
    """
    week_keys = ["season", "week"]

    # Keep only rows that belong to a week and carry both scores.
    ranked = df.dropna(
        subset=week_keys + ["judge_total", "fan_share_hat"]
    ).reset_index(drop=True)

    if ranked.empty:
        return pd.DataFrame(
            columns=[
                "season",
                "celebrity_name",
                "mean_abs_D",
                "max_abs_D",
                "weeks_active",
            ]
        )

    # Rank inside each week in one vectorized pass per score.
    per_week = ranked.groupby(week_keys)
    judge_rank = per_week["judge_total"].rank(ascending=False, method="min")
    fan_rank = per_week["fan_share_hat"].rank(ascending=False, method="min")

    ranked["D"] = judge_rank - fan_rank
    ranked["absD"] = ranked["D"].abs()

    per_contestant = ranked.groupby(["season", "celebrity_name"]).agg(
        mean_abs_D=("absD", "mean"),
        max_abs_D=("absD", "max"),
        weeks_active=("D", "size"),
    )
    return per_contestant.reset_index()


# ======================================================================
# 5. Compute placements per season and method
# ======================================================================


def compute_placements_per_season(
    week_results: pd.DataFrame, df: pd.DataFrame
) -> pd.DataFrame:
    """
    For each season and contestant, compute elimination week and placement
    under each method: observed, Rank, Percent, BT+J.

    A contestant's elimination week under a method is the first week in
    which that method names them as eliminated; contestants never named
    are assigned the season's last simulated week.

    Placement is competition-style with 1 = winner: a contestant's place
    is 1 plus the number of contestants in the same season whose
    elimination week is strictly later.  Contestants sharing an
    elimination week therefore share a placement.

    Parameters
    ----------
    week_results : DataFrame
        Output of simulate_all_methods().
        Must have: 'season','week','eliminated_rank','eliminated_pct',
                   'eliminated_btj','eliminated_real'
    df : DataFrame
        Original panel to know which contestants participated in each season.
        Must have: 'season','celebrity_name'.

    Returns
    -------
    placement_df : DataFrame
        Columns:
          - season, celebrity_name
          - elim_week_obs, elim_week_rank, elim_week_pct, elim_week_btj
          - place_obs, place_rank, place_pct, place_btj
        Seasons absent from ``week_results`` are skipped; if no season
        matches, an empty frame with the columns above is returned.
    """
    # Method key -> column of week_results holding the eliminated name.
    method_cols = {
        "obs": "eliminated_real",
        "rank": "eliminated_rank",
        "pct": "eliminated_pct",
        "btj": "eliminated_btj",
    }
    out_columns = (
        ["season", "celebrity_name"]
        + [f"elim_week_{m}" for m in method_cols]
        + [f"place_{m}" for m in method_cols]
    )

    # All contestants per season
    contestants = df.groupby("season")["celebrity_name"].unique()

    records: list[dict] = []

    for season, names in contestants.items():
        wr_s = week_results[week_results["season"] == season]
        if wr_s.empty:
            continue

        weeks_sorted = sorted(wr_s["week"].unique())
        last_week = weeks_sorted[-1]
        by_week = wr_s.set_index("week")

        # elim_week[method][name] -> first week the method eliminates name
        elim_week: dict[str, dict] = {}
        for m, col in method_cols.items():
            week_to_name = by_week[col].to_dict()
            elim_week[m] = {
                name: next(
                    (w for w in weeks_sorted if week_to_name.get(w) == name),
                    last_week,  # never eliminated -> survived to the end
                )
                for name in names
            }

        # Convert elimination weeks to placements (1 = winner)
        for name in names:
            rec = {"season": season, "celebrity_name": name}
            for m in method_cols:
                rec[f"elim_week_{m}"] = elim_week[m][name]
            for m in method_cols:
                wk_i = elim_week[m][name]
                # Count contestants who lasted strictly longer; counting
                # those eliminated earlier would give the first person
                # out place 1, contradicting "1 = winner".
                num_outlasting = sum(
                    1 for other in names if elim_week[m][other] > wk_i
                )
                rec[f"place_{m}"] = 1 + num_outlasting

            records.append(rec)

    return pd.DataFrame(records, columns=out_columns)


# ======================================================================
# 6. Plotting utilities
# ======================================================================


def _smooth_xy(
    x: np.ndarray,
    y: np.ndarray,
    n_points: int = 300,
    clip_min: float | None = None,
    clip_max: float | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Helper: resample (x, y) onto a dense grid through a cubic spline.

    The inputs are returned unchanged (as arrays) when there are fewer
    than three points, when SciPy is unavailable, or when building or
    evaluating the spline raises.

    Parameters
    ----------
    x, y : array-like
        Sample positions and values.
    n_points : int
        Number of evenly spaced grid points between min(x) and max(x).
    clip_min, clip_max : float or None
        Optional bounds applied to the smoothed values; a bound left as
        None does not constrain that side.

    Returns
    -------
    (x_out, y_out) : tuple of ndarray
        The dense grid and smoothed values, or the original arrays on
        fallback.
    """
    xs = np.asarray(x)
    ys = np.asarray(y)

    too_few_points = len(xs) < 3
    if too_few_points or not HAVE_SCIPY:
        return xs, ys

    grid = np.linspace(xs.min(), xs.max(), n_points)
    try:
        smoothed = make_interp_spline(xs, ys, k=3)(grid)
        if not (clip_min is None and clip_max is None):
            # A missing bound falls back to the data's own extreme, so
            # clipping on that side is a no-op.
            lower = smoothed.min() if clip_min is None else clip_min
            upper = smoothed.max() if clip_max is None else clip_max
            smoothed = np.clip(smoothed, lower, upper)
    except Exception:  # pragma: no cover - robust to spline exceptions
        return xs, ys
    return grid, smoothed


def plot_accuracy_by_season(
    per_season_acc: pd.DataFrame,
    filename: str = "method_accuracy_comparison_fancy.png",
) -> None:
    """
    Draw per-season accuracy of the Rank and Percent rules and save it.

    Each method appears as a scatter of its per-season values plus a
    trend line from ``_smooth_xy`` (clipped to [0, 1]).  If
    ``per_season_acc`` is empty, a message is printed and nothing is saved.

    Only Rank vs Percent are shown; the function is kept for backward
    compatibility, newer plots use all three methods.

    Parameters
    ----------
    per_season_acc : DataFrame
        Must have: 'season', 'rank_accuracy', 'pct_accuracy'.
    filename : str
        Path the figure is saved to.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    season_ids = per_season_acc["season"].values
    # (values, legend label, scatter marker) for each plotted method
    series_specs = [
        (per_season_acc["rank_accuracy"].values, "Rank Sum Accuracy", "o"),
        (per_season_acc["pct_accuracy"].values, "Percent Sum Accuracy", "s"),
    ]

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.set_facecolor("#f8f9fa")
    ax.grid(
        True, which="both", axis="both", alpha=0.2, linestyle="--", linewidth=0.7
    )

    for accuracy, label, marker in series_specs:
        # Actual per-season points
        ax.scatter(
            season_ids,
            accuracy,
            s=60,
            marker=marker,
            edgecolor="white",
            linewidth=1.0,
            alpha=0.9,
            label=f"{label} (data)",
        )
        # Smoothed trend line
        trend_x, trend_y = _smooth_xy(
            season_ids, accuracy, n_points=300, clip_min=0.0, clip_max=1.0
        )
        ax.plot(
            trend_x,
            trend_y,
            linewidth=2.5,
            alpha=0.9,
            label=f"{label} (trend)",
        )

    # Faint horizontal guides at round accuracy levels
    for reference_level in (0.5, 0.75, 1.0):
        ax.axhline(
            reference_level, color="gray", linestyle=":", linewidth=0.6, alpha=0.5
        )

    ax.set_title(
        "Model Agreement with Reality by Season\nRank Sum vs Percent Sum",
        fontsize=16,
        fontweight="bold",
        pad=10,
    )
    ax.set_xlabel("Season", fontsize=12)
    ax.set_ylabel("Accuracy (Match Rate)", fontsize=12)

    first_season = season_ids.min()
    ax.set_xticks(np.arange(first_season, season_ids.max() + 1, 2))

    legend_frame = ax.legend(frameon=True, fontsize=10).get_frame()
    legend_frame.set_facecolor("white")
    legend_frame.set_edgecolor("#cccccc")
    legend_frame.set_alpha(0.9)

    ax.text(
        first_season + 0.2,
        0.95,
        "Higher = method matches actual eliminations more often",
        fontsize=9,
        color="#444444",
    )

    fig.tight_layout()
    fig.savefig(filename, dpi=250)
    plt.close(fig)
    print(f"Saved fancy plot: {filename}")


def plot_accuracy_diff_smooth(
    per_season_acc: pd.DataFrame,
    filename: str = "method_comparison_diff_fancy.png",
) -> None:
    """
    Plot the per-season accuracy gap between the Percent and Rank rules.

        diff(season) = Percent accuracy - Rank accuracy

    The gap is smoothed with ``_smooth_xy`` (clipped to [-0.5, 0.5]) and
    the area between the curve and zero is shaded:
        above zero -> Percent behaves closer to reality,
        below zero -> Rank behaves closer to reality.
    The raw per-season gaps are overlaid as points, and season 27 is
    highlighted and annotated when it is present.  If ``per_season_acc``
    is empty, a message is printed and nothing is saved.

    Parameters
    ----------
    per_season_acc : DataFrame
        Must have: 'season', 'rank_accuracy', 'pct_accuracy'.
    filename : str
        Path the figure is saved to.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    season_ids = per_season_acc["season"].values
    pct_acc = per_season_acc["pct_accuracy"].values
    rank_acc = per_season_acc["rank_accuracy"].values
    gap = pct_acc - rank_acc

    curve_x, curve_y = _smooth_xy(
        season_ids, gap, n_points=300, clip_min=-0.5, clip_max=0.5
    )

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.set_facecolor("#f8f9fa")
    ax.grid(
        True, which="both", axis="both", alpha=0.2, linestyle="--", linewidth=0.7
    )

    ax.axhline(0, color="#333333", linewidth=1.0, linestyle="--", alpha=0.7)

    ax.plot(
        curve_x,
        curve_y,
        color="#555555",
        linewidth=1.8,
        alpha=0.8,
        label="Smoothed gap",
    )

    # (region mask, fill colour, opacity, legend label)
    shaded_regions = (
        (curve_y >= 0, "#5dade2", 0.55, "Percent Sum better"),
        (curve_y <= 0, "#e74c3c", 0.45, "Rank Sum better"),
    )
    for mask, fill_color, opacity, label in shaded_regions:
        ax.fill_between(
            curve_x,
            0,
            curve_y,
            where=mask,
            interpolate=True,
            color=fill_color,
            alpha=opacity,
            label=label,
        )

    # Raw per-season gaps, coloured by which rule did better
    point_colors = ["#5dade2" if value >= 0 else "#e74c3c" for value in gap]
    ax.scatter(
        season_ids,
        gap,
        c=point_colors,
        s=60,
        edgecolor="white",
        linewidth=1.0,
        zorder=5,
    )

    ax.set_title(
        "Performance Gap by Season\nPercent Sum vs Rank Sum",
        fontsize=16,
        fontweight="bold",
        pad=10,
    )
    ax.set_xlabel("Season", fontsize=12)
    ax.set_ylabel(
        "Accuracy Difference\n(Percent Accuracy − Rank Accuracy)",
        fontsize=11,
    )

    first_season = season_ids.min()
    ax.set_xticks(np.arange(first_season, season_ids.max() + 1, 2))

    y_low, y_high = -0.5, 0.5
    ax.set_ylim(y_low, y_high)

    # (y position, caption, colour, vertical alignment) for corner captions
    corner_notes = (
        (y_high - 0.05, "↑ Percent Sum matches reality more often", "#1f618d", "top"),
        (y_low + 0.05, "↓ Rank Sum matches reality more often", "#922b21", "bottom"),
    )
    for y_text, caption, text_color, v_align in corner_notes:
        ax.text(
            first_season + 0.2,
            y_text,
            caption,
            fontsize=9,
            color=text_color,
            va=v_align,
        )

    # Call out season 27 when the data contain it
    highlight_season = 27
    if highlight_season in season_ids:
        highlight_rows = per_season_acc[
            per_season_acc["season"] == highlight_season
        ]
        if not highlight_rows.empty:
            highlight_gap = (
                highlight_rows["pct_accuracy"].values[0]
                - highlight_rows["rank_accuracy"].values[0]
            )
            ax.scatter(
                [highlight_season],
                [highlight_gap],
                s=90,
                edgecolor="black",
                facecolor="yellow",
                zorder=10,
            )
            ax.annotate(
                f"S27: {highlight_gap:+.2f}",
                xy=(highlight_season, highlight_gap),
                xytext=(highlight_season + 0.5, highlight_gap + 0.1),
                fontsize=9,
                arrowprops=dict(arrowstyle="->", color="black"),
                bbox=dict(
                    boxstyle="round,pad=0.2",
                    fc="white",
                    ec="#555555",
                    alpha=0.9,
                ),
            )

    legend_frame = ax.legend(
        frameon=True, fontsize=10, loc="upper right"
    ).get_frame()
    legend_frame.set_facecolor("white")
    legend_frame.set_edgecolor("#cccccc")
    legend_frame.set_alpha(0.9)

    fig.tight_layout()
    fig.savefig(filename, dpi=250)
    plt.close(fig)
    print(f"Saved fancy plot: {filename}")


# UMN-style colors for combined plots.
# Each constant is the hex color that the combined and extraordinary-placement
# plots in this module assign to one voting method.
UMN_MAROON = "#7A0019"  # Rank Sum
UMN_GOLD = "#FFCC33"  # Percent Sum
BTJ_GRAY = "#555555"  # BT+Judge (Bottom-Two + Judge)


def plot_gap_and_overall_combined(
    per_season_acc: pd.DataFrame,
    filename: str = "q2_gap_overall_combined.png",
) -> None:
    """
    Save a two-panel figure comparing the three voting rules.

      Left:  per-season accuracy gaps relative to Rank
             (Percent − Rank and BT+Judge − Rank), drawn as smoothed
             curves with the non-negative part shaded, plus the raw
             per-season points.

      Right: horizontal bars of each method's mean accuracy over seasons
             (Rank, Percent, BT+Judge) with standard-deviation error bars;
             NaN values are ignored in both statistics.

    Colors follow the UMN maroon/gold/gray scheme.  If ``per_season_acc``
    is empty, a message is printed and nothing is saved.

    Parameters
    ----------
    per_season_acc : DataFrame
        Must have: 'season', 'rank_accuracy', 'pct_accuracy',
        'btj_accuracy'.
    filename : str
        Path the figure is saved to.
    """
    if per_season_acc.empty:
        print("No per-season accuracy data to plot.")
        return

    season_ids = per_season_acc["season"].values
    accuracy = {
        "Rank Sum": per_season_acc["rank_accuracy"].values,
        "Percent Sum": per_season_acc["pct_accuracy"].values,
        "BT+Judge": per_season_acc["btj_accuracy"].values,
    }
    method_order = list(accuracy)
    method_color = {
        "Rank Sum": UMN_MAROON,
        "Percent Sum": UMN_GOLD,
        "BT+Judge": BTJ_GRAY,
    }

    # ---- gaps vs Rank (left panel) ----
    # Each entry: raw gap, smoothed curve, colour, fill opacity, line width,
    # legend label.
    gap_series = []
    for method, fill_alpha, line_width, label in (
        ("Percent Sum", 0.35, 2.2, "Percent − Rank"),
        ("BT+Judge", 0.25, 2.0, "BT+Judge − Rank"),
    ):
        raw_gap = accuracy[method] - accuracy["Rank Sum"]
        smooth_x, smooth_gap = _smooth_xy(
            season_ids, raw_gap, n_points=300, clip_min=-0.5, clip_max=0.5
        )
        gap_series.append(
            (
                raw_gap,
                smooth_x,
                smooth_gap,
                method_color[method],
                fill_alpha,
                line_width,
                label,
            )
        )

    # ---- overall stats (right panel) ----
    mean_vals = [float(np.nanmean(accuracy[m])) for m in method_order]
    err_vals = [float(np.nanstd(accuracy[m])) for m in method_order]
    bar_cols = [method_color[m] for m in method_order]

    fig, (gap_ax, bar_ax) = plt.subplots(
        1, 2, figsize=(13, 5), gridspec_kw={"width_ratios": [3, 1]}
    )

    # ---------------- LEFT: smooth gaps ----------------
    gap_ax.set_facecolor("#f8f9fa")
    gap_ax.grid(
        True, which="both", axis="both", alpha=0.2, linestyle="--", linewidth=0.7
    )
    gap_ax.axhline(0, color="#555555", linestyle="--", linewidth=1.0, alpha=0.7)

    # Shaded area first, then its curve, one method at a time
    for _, smooth_x, smooth_gap, color, fill_alpha, line_width, label in gap_series:
        gap_ax.fill_between(
            smooth_x,
            0,
            smooth_gap,
            where=(smooth_gap >= 0),
            color=color,
            alpha=fill_alpha,
        )
        gap_ax.plot(
            smooth_x,
            smooth_gap,
            color=color,
            linewidth=line_width,
            label=label,
        )

    # Actual season points
    for raw_gap, _, _, color, _, _, _ in gap_series:
        gap_ax.scatter(
            season_ids,
            raw_gap,
            s=35,
            color=color,
            edgecolor="white",
            linewidth=0.9,
        )

    first_season = season_ids.min()
    gap_ax.set_xlabel("Season", fontsize=11)
    gap_ax.set_ylabel("Accuracy Difference\n(Method − Rank)", fontsize=11)
    gap_ax.set_xticks(np.arange(first_season, season_ids.max() + 1, 2))
    gap_ax.set_ylim(-0.5, 0.45)

    for y_text, caption, v_align in (
        (0.42, "↑ Above 0: method outperforms Rank", "top"),
        (-0.47, "↓ Below 0: Rank outperforms method", "bottom"),
    ):
        gap_ax.text(
            first_season + 0.3,
            y_text,
            caption,
            fontsize=9,
            color="#444444",
            va=v_align,
        )

    legend_frame = gap_ax.legend(
        frameon=True, fontsize=9, loc="upper right"
    ).get_frame()
    legend_frame.set_facecolor("white")
    legend_frame.set_edgecolor("#cccccc")
    legend_frame.set_alpha(0.9)

    gap_ax.set_title("Gap vs Rank by season", fontsize=12, fontweight="bold")

    # ---------------- RIGHT: overall bar chart ----------------
    bar_ax.set_facecolor("#f8f9fa")
    bar_ax.grid(True, axis="x", alpha=0.2, linestyle="--", linewidth=0.7)

    bar_positions = np.arange(len(method_order))
    bar_ax.barh(
        bar_positions,
        mean_vals,
        xerr=err_vals,
        color=bar_cols,
        edgecolor="white",
        linewidth=1.0,
        alpha=0.9,
    )

    bar_ax.set_yticks(bar_positions)
    bar_ax.set_yticklabels(method_order, fontsize=10)
    bar_ax.invert_yaxis()  # Rank on top

    bar_ax.set_xlim(0.4, 1.0)
    bar_ax.set_xlabel("Mean accuracy over seasons", fontsize=11)
    bar_ax.set_title(
        "Overall performance by method",
        fontsize=12,
        fontweight="bold",
    )

    # Numeric label just past the end of each bar
    for position, mean_value in enumerate(mean_vals):
        bar_ax.text(
            mean_value + 0.01,
            position,
            f"{mean_value:.2f}",
            va="center",
            ha="left",
            fontsize=9,
            color="#333333",
        )

    fig.suptitle(
        "Comparison of Voting Rules vs Observed Eliminations",
        fontsize=14,
        fontweight="bold",
        y=1.03,
    )

    fig.tight_layout()
    fig.savefig(filename, dpi=260, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved combined gap+overall plot: {filename}")


def make_extraordinary_table(placement_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build and print a compact placement table for a fixed list of
    extraordinary / controversial contestants.

    For every (name, season) case that has a row in ``placement_df``, the
    first matching row supplies the observed, Rank, Percent and BT+Judge
    placements.  Cases without a match are skipped.

    Parameters
    ----------
    placement_df : DataFrame
        Must have: 'season', 'celebrity_name', 'place_obs', 'place_rank',
        'place_pct', 'place_btj'.

    Returns
    -------
    table : DataFrame
        Columns: Season, Contestant, Observed Place, Rank Rule Place,
        Percent Rule Place, BT+Judge Place.  Empty when no case matched.
    """
    cases = [
        ("Jerry Rice", 2),
        ("Billy Ray Cyrus", 4),
        ("Bristol Palin", 11),
        ("Bristol Palin", 15),
        ("Bobby Bones", 27),
    ]
    # Output column heading -> source column in placement_df
    place_columns = {
        "Observed Place": "place_obs",
        "Rank Rule Place": "place_rank",
        "Percent Rule Place": "place_pct",
        "BT+Judge Place": "place_btj",
    }

    rows: list[dict] = []
    for name, season in cases:
        is_case = (placement_df["celebrity_name"] == name) & (
            placement_df["season"] == season
        )
        matches = placement_df[is_case]
        if matches.empty:
            continue

        first_match = matches.iloc[0]
        entry = {"Season": season, "Contestant": name}
        for heading, source in place_columns.items():
            entry[heading] = first_match[source]
        rows.append(entry)

    table = pd.DataFrame(rows)

    print("\n=== Extraordinary Contestants Analysis ===")
    if table.empty:
        print("No extraordinary contestants found in placement_df.")
    else:
        print(table.to_string(index=False))

    return table


def plot_extraordinary_placements(
    placement_df: pd.DataFrame,
    filename: str = "q2_extraordinary_placements.png",
) -> None:
    """
    Save a dot plot comparing placements of a fixed list of extraordinary
    contestants under the observed, Rank, Percent, and BT+J rules.

    x = contestant (one tick per "S<season>: <name>" label, with the four
    methods nudged sideways so their markers do not overlap),
    y = placement, with the axis inverted so 1 sits at the top.

    Cases missing from ``placement_df`` are skipped; if none is found, a
    message is printed and nothing is saved.

    Parameters
    ----------
    placement_df : DataFrame
        Must have: 'season', 'celebrity_name', 'place_obs', 'place_rank',
        'place_pct', 'place_btj'.
    filename : str
        Path the figure is saved to.
    """
    cases = [
        ("Jerry Rice", 2),
        ("Billy Ray Cyrus", 4),
        ("Bristol Palin", 11),
        ("Bristol Palin", 15),
        ("Bobby Bones", 27),
    ]
    place_cols = ["place_obs", "place_rank", "place_pct", "place_btj"]

    records: list[dict] = []
    for name, season in cases:
        is_case = (placement_df["celebrity_name"] == name) & (
            placement_df["season"] == season
        )
        matches = placement_df[is_case]
        if matches.empty:
            continue
        first_match = matches.iloc[0]
        record = {"label": f"S{season}: {name}"}
        for col in place_cols:
            record[col] = first_match[col]
        records.append(record)

    if not records:
        print("No extraordinary contestants found in placement_df.")
        return

    tbl = pd.DataFrame(records)
    tick_labels = tbl["label"].tolist()
    tick_pos = np.arange(len(tick_labels))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set_facecolor("#f8f9fa")
    ax.grid(True, axis="y", alpha=0.2, linestyle="--", linewidth=0.7)

    width = 0.15
    # (offset in units of `width`, column, marker size, marker, colour, label)
    marker_specs = (
        (-1.5, "place_obs", 70, "o", "black", "Observed"),
        (-0.5, "place_rank", 70, "o", UMN_MAROON, "Rank Sum"),
        (0.5, "place_pct", 70, "s", UMN_GOLD, "Percent Sum"),
        (1.5, "place_btj", 80, "D", BTJ_GRAY, "Bottom-Two + Judge"),
    )
    for shift, col, size, marker, color, label in marker_specs:
        ax.scatter(
            tick_pos + shift * width,
            tbl[col],
            s=size,
            marker=marker,
            color=color,
            edgecolor="white",
            linewidth=1.0,
            label=label,
        )

    ax.set_xticks(tick_pos)
    ax.set_xticklabels(tick_labels, rotation=20, ha="right")

    ax.set_ylabel("Placement (1 = winner, larger = earlier elimination)")
    ax.set_title(
        "Placement of Extraordinary Contestants\n"
        "Observed vs Rank, Percent, and Bottom-Two + Judge",
        fontsize=14,
        fontweight="bold",
        pad=10,
    )

    # Leave half a unit of padding around the worst placement shown
    worst_place = max(tbl[col].max() for col in place_cols)
    ax.set_ylim(0.5, worst_place + 0.5)
    ax.invert_yaxis()

    legend_frame = ax.legend(
        frameon=True, fontsize=10, loc="upper right"
    ).get_frame()
    legend_frame.set_facecolor("white")
    legend_frame.set_edgecolor("#cccccc")
    legend_frame.set_alpha(0.9)

    fig.tight_layout()
    fig.savefig(filename, dpi=260)
    plt.close(fig)
    print(f"Saved extraordinary placement plot: {filename}")


# ======================================================================
# 7. Script entry point: tie everything together
# ======================================================================


def main() -> None:
    """
    Run the full analysis pipeline on "fan_shares_estimated.csv".

    Steps, in order:
      1. simulate the voting methods on the loaded panel,
      2. print the global summary and the head of the per-season
         disagreement summary,
      3. compute per-season accuracy and save the combined gap/overall plot,
      4. compute placements and save the extraordinary-placements plot,
      5. print the extraordinary-contestants table.
    """
    # Load fan model output
    panel = pd.read_csv("fan_shares_estimated.csv")

    # 1) Simulate methods
    weekly_outcomes = simulate_all_methods(panel)

    # 2) Global and per-season summaries
    global_stats, season_disagreement = summarize_week_results(weekly_outcomes)
    print("=== Global summary (Rank vs Percent) ===")
    for stat_name, stat_value in global_stats.items():
        print(f"{stat_name}: {stat_value}")
    print("\n=== Per-season disagreement summary (head) ===")
    print(season_disagreement.head().to_string(index=False))

    # 3) Per-season accuracy for all 3 methods + summary plot
    season_accuracy = compute_per_season_accuracy(weekly_outcomes)
    plot_gap_and_overall_combined(
        season_accuracy,
        filename="q2_gap_overall_combined.png",
    )

    # 4) Placements for every contestant & placement plots
    placements = compute_placements_per_season(weekly_outcomes, panel)
    plot_extraordinary_placements(
        placements,
        filename="q2_extraordinary_placements.png",
    )

    # 5) Extraordinary contestants table (for writeup / appendix)
    make_extraordinary_table(placements)


if __name__ == "__main__":
    main()
