# In [None]:
"""
distinctive_years_from_clusters_final_wide.py
---------------------------------------------------
Final version — visually balanced
✅ Ratio-based distinctiveness (A−B)/(A+B)
✅ Normalize per cluster to [0,1]
✅ Top 6 peaks per cluster labeled
✅ Wider figure and compact legend
---------------------------------------------------
Author: Samuel Jiang (2025)
"""

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
import re

# Inclusive range of calendar years kept by the analysis.
START_YEAR = 1700
END_YEAR = 2025
# Width of the centered rolling mean applied to each distinctiveness curve.
ROLL_WINDOW = 5
# Maximum number of peak years reported per cluster.
TOP_N = 6
# Percentile of the normalized curve used as the minimum peak height.
PEAK_PERCENTILE = 90
# Directory that receives the generated plots and CSV summaries.
OUT_DIR = Path("/home/njian29/Desktop/distinctive_years_final_wide")
# parents=True: a missing parent directory must not abort the run with
# FileNotFoundError before any analysis has started.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input files for each topic group: the CSV holding the parsed years
# ("parsed") and the CSV holding the cluster assignment ("clusters").
GROUPS = {}
GROUPS["objects"] = {
    "parsed": Path("/home/njian29/Desktop/parsed_years_historical_objects_tagged.csv"),
    "clusters": Path("/home/njian29/Desktop/spectral_clusters_cleaned/clusters_objects_cleaned.csv"),
}
GROUPS["ideologies"] = {
    "parsed": Path("/home/njian29/Desktop/parsed_years_history_of_ideologies_tagged.csv"),
    "clusters": Path("/home/njian29/Desktop/spectral_clusters_cleaned/clusters_ideologies_cleaned.csv"),
}
GROUPS["sports"] = {
    "parsed": Path("/home/njian29/Desktop/parsed_years_history_of_sports_tagged.csv"),
    "clusters": Path("/home/njian29/Desktop/spectral_clusters_cleaned/clusters_sports_cleaned.csv"),
}


def filename_to_language(fn: str) -> str:
    """Turn a file name into a language key.

    The value is converted to ``str`` and trimmed; a trailing ``.txt``
    (matched case-insensitively) is cut off, and the remainder is trimmed
    once more.
    """
    name = str(fn).strip()
    stem = name[:-4] if name.lower().endswith(".txt") else name
    return stem.strip()


# A year token is a run of 1-4 digits that is not part of a longer digit run.
# A leading "-" counts as a sign only when it does not directly follow a
# digit, so "1914-1918" is read as two years rather than 1914 and -1918.
_YEAR_TOKEN_RE = re.compile(r"(?<!\d)-?\d{1,4}(?!\d)")


def parse_years(text: str, start_year=None, end_year=None):
    """Extract the years mentioned in *text* that fall inside a range.

    Args:
        text: Free-form string containing year numbers. Any non-string
            value (e.g. a NaN cell from pandas) yields an empty list.
        start_year: Inclusive lower bound; defaults to ``START_YEAR``.
        end_year: Inclusive upper bound; defaults to ``END_YEAR``.

    Returns:
        The in-range years as ints, in order of appearance (duplicates kept).

    Digit runs longer than four characters are ignored as a whole instead
    of being cut into spurious four-digit "years" (e.g. "18000" no longer
    produces 1800).
    """
    if not isinstance(text, str):
        return []
    lower = START_YEAR if start_year is None else start_year
    upper = END_YEAR if end_year is None else end_year
    candidates = (int(token) for token in _YEAR_TOKEN_RE.findall(text))
    return [year for year in candidates if lower <= year <= upper]

def _normalized_distinctiveness(inside, outside):
    """Return one cluster's smoothed (A-B)/(A+B) curve scaled to [0, 1].

    ``inside`` and ``outside`` are 2-D count arrays (one row per language,
    one column per year) for the cluster and for all remaining languages.
    """
    mean_in = inside.mean(axis=0)
    mean_out = outside.mean(axis=0)
    # The epsilon avoids a 0/0 division for years nobody mentions.
    ratio = (mean_in - mean_out) / (mean_in + mean_out + 1e-6)
    smooth = (
        pd.Series(ratio)
        .rolling(window=ROLL_WINDOW, center=True, min_periods=1)
        .mean()
        .to_numpy()
    )
    lowest, highest = np.min(smooth), np.max(smooth)
    if highest - lowest > 0:
        return (smooth - lowest) / (highest - lowest)
    # A flat (or undefined) curve carries no signal; report it as all zeros.
    return np.zeros_like(smooth)


def _top_peak_indices(curve):
    """Return indices of at most TOP_N strongest peaks, in ascending order."""
    threshold = np.percentile(curve, PEAK_PERCENTILE)
    peaks, _ = find_peaks(curve, height=threshold, distance=5)
    if len(peaks) > TOP_N:
        strongest = np.argsort(curve[peaks])[-TOP_N:]
        # Re-sort so the reported peaks are always chronological, whether or
        # not the TOP_N cut was applied.
        peaks = np.sort(peaks[strongest])
    return peaks


def compute_distinctive_years(parsed_path: Path, cluster_path: Path, group_name: str):
    """Plot and export the most distinctive years of every language cluster.

    For each cluster the mean per-year mention count of its languages (A) is
    compared with the mean of all other languages (B) via (A-B)/(A+B); the
    curve is smoothed, min-max normalized, and its strongest peaks are
    labeled. A PNG with all curves and a CSV listing the peaks are written
    to ``OUT_DIR``.

    Args:
        parsed_path: CSV with a ``filename`` column and a ``parsed_years``
            column holding the year mentions of each file.
        cluster_path: CSV with ``Language`` and ``Cluster`` columns.
        group_name: Label used in log messages, the plot title and the
            output file names.

    Returns:
        None. The group is skipped (with a warning, no files written) when
        no years were parsed, when no language appears in both inputs, or
        when fewer than two clusters remain, since a cluster needs a
        non-empty "rest" to be compared against.
    """
    print(f"[INFO] Processing {group_name}...")

    df = pd.read_csv(parsed_path, dtype=str, encoding="utf-8")
    df["language"] = df["filename"].apply(filename_to_language)

    clusters = pd.read_csv(cluster_path)
    # Normalize with the same helper as the parsed file so merge keys agree.
    clusters["Language"] = clusters["Language"].map(filename_to_language, na_action="ignore")
    clusters["Cluster"] = pd.to_numeric(clusters["Cluster"], errors="coerce")
    unlabeled = clusters["Cluster"].isna()
    if unlabeled.any():
        # NaN labels would otherwise show up as empty "Cluster nan" curves.
        print(f"[WARN] Dropping {int(unlabeled.sum())} row(s) without a numeric cluster in {group_name}.")
        clusters = clusters.loc[~unlabeled]

    if "parsed_years" in df.columns:
        parsed_texts = df["parsed_years"]
    else:
        parsed_texts = [""] * len(df)
    records = []
    for lang, text in zip(df["language"], parsed_texts):
        records.extend((lang, y) for y in parse_years(text))
    years_df = pd.DataFrame(records, columns=["language", "year"])
    if years_df.empty:
        print(f"[WARN] No data for {group_name}. Skipping.")
        return

    counts = years_df.groupby(["language", "year"]).size().reset_index(name="count")
    all_years = np.arange(START_YEAR, END_YEAR + 1)
    pivot = (
        counts.pivot(index="language", columns="year", values="count")
        .reindex(columns=all_years)
        .fillna(0)
    )

    # Keep only the two needed cluster columns so that extra columns in the
    # cluster CSV can never leak into the year matrix.
    merged = (
        clusters[["Language", "Cluster"]]
        .merge(pivot, left_on="Language", right_index=True, how="inner")
        .sort_values("Cluster")
    )
    if merged.empty:
        print(f"[WARN] No language of {group_name} appears in both input files. Skipping.")
        return

    X = merged[pivot.columns].to_numpy(dtype=float)
    clusters_arr = merged["Cluster"].to_numpy()
    unique_clusters = np.unique(clusters_arr)
    if len(unique_clusters) < 2:
        print(f"[WARN] {group_name} has fewer than two clusters; nothing to compare. Skipping.")
        return
    years = pivot.columns.to_numpy()

    summary_records = []
    fig = plt.figure(figsize=(16, 7))
    cmap = plt.get_cmap("tab10")

    try:
        for idx, c in enumerate(unique_clusters):
            # Cycle through the palette instead of clipping to its last color.
            color = cmap(idx % cmap.N)
            in_cluster = clusters_arr == c
            C_smooth = _normalized_distinctiveness(X[in_cluster], X[~in_cluster])

            peaks = _top_peak_indices(C_smooth)
            peak_years = years[peaks]
            peak_vals = C_smooth[peaks]
            for y, val in zip(peak_years, peak_vals):
                summary_records.append((group_name, c, y, val))

            plt.plot(years, C_smooth, label=f"Cluster {c}", color=color, linewidth=2)
            plt.scatter(peak_years, peak_vals, s=55, color=color, edgecolor="k", zorder=5)
            for y, val in zip(peak_years, peak_vals):
                plt.text(y, val + 0.03, str(int(y)), ha="center", fontsize=7, color=color)

        plt.title(f"Distinctiveness Curves for {group_name.capitalize()} (Top {TOP_N} Years per Cluster)",
                  fontsize=15, pad=14)
        plt.xlabel("Year", fontsize=12)
        plt.ylabel("Distinctiveness (A−B Ratio, Normalized)", fontsize=12)
        plt.ylim(-0.05, 1.15)
        plt.xlim(START_YEAR, END_YEAR)
        plt.grid(alpha=0.3)
        plt.legend(title="Clusters", fontsize=9, loc="upper left", bbox_to_anchor=(1.02, 1))
        # Leave room on the right for the legend placed outside the axes.
        plt.tight_layout(rect=[0, 0, 0.88, 1])

        out_curve = OUT_DIR / f"{group_name}_distinctive_curves_top{TOP_N}_wide.png"
        plt.savefig(out_curve, dpi=200)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"[OK] Saved curve plot: {out_curve}")

    timeline_df = pd.DataFrame(summary_records, columns=["Group", "Cluster", "Year", "Distinctiveness"])
    out_csv = OUT_DIR / f"{group_name}_distinctive_years_top{TOP_N}_wide.csv"
    timeline_df.to_csv(out_csv, index=False)
    print(f"[OK] Saved results: {out_csv}\n")

def main():
    """Run the distinctive-year analysis once for every entry of GROUPS."""
    for group_name in GROUPS:
        inputs = GROUPS[group_name]
        compute_distinctive_years(inputs["parsed"], inputs["clusters"], group_name)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()