# In [None]:
"""
wasserstein_matrix_all_languages.py
---------------------------------------------------
Compute pairwise 1D Wasserstein (Earth Mover’s) distances
between *all* languages in a parsed_years_*.csv file.

Each cell D[i,j] = Wasserstein distance between language_i and language_j.

Output:
    wasserstein_matrix_<group>.csv
---------------------------------------------------
Author: Samuel Jiang (2025)
"""

from pathlib import Path
import pandas as pd
import numpy as np
import re
from itertools import combinations
from tqdm import tqdm 

# Inclusive range of years kept by parse_years_field(); also the span covered
# by the histogram bins built in main().
YEAR_MIN = 1700
YEAR_MAX = 2025
# Histogram bin width in years; main() also passes it as the scale factor that
# turns the summed CDF differences into a distance.
BIN_WIDTH = 5

# Input CSV read by main(); accessed through its "filename" and "parsed_years" columns.
CSV_PATH = Path("/home/njian29/Desktop/parsed_years_history_of_ideologies_tagged.csv")
# Destination CSV for the square distance matrix written by main().
OUT_PATH = Path("/home/njian29/Desktop/wasserstein_matrix_ideologies.csv")

# Integer tokens of at most four digits with an optional leading minus sign.
# The digit lookarounds reject slices of longer digit runs, so "18501" is not
# misread as the year 1850 followed by a stray "1".
_YEAR_TOKEN_RE = re.compile(r"-?(?<!\d)\d{1,4}(?!\d)")


def parse_years_field(text: str, year_min=None, year_max=None):
    """Extract the valid years from one ``parsed_years`` cell.

    Every standalone integer of one to four digits (optionally preceded by
    ``-``) is read from *text*; those inside the inclusive range
    ``[year_min, year_max]`` are returned in order of appearance, duplicates
    included.

    Args:
        text: Raw cell content. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty cell) yields an empty list.
        year_min: Inclusive lower bound; defaults to the module-level
            ``YEAR_MIN`` when ``None``.
        year_max: Inclusive upper bound; defaults to the module-level
            ``YEAR_MAX`` when ``None``.

    Returns:
        A list of ``int`` years, possibly empty.
    """
    if not isinstance(text, str):
        return []

    lo = YEAR_MIN if year_min is None else year_min
    hi = YEAR_MAX if year_max is None else year_max

    # NOTE(review): a "-" directly before digits is read as a sign, so the
    # second year of a range written "1990-2005" parses as -2005 and is
    # filtered out. Confirm the column never stores hyphenated ranges.
    candidates = (int(token) for token in _YEAR_TOKEN_RE.findall(text))
    return [year for year in candidates if lo <= year <= hi]


def build_hist(years, bins):
    """Bin *years* over the edges in *bins* and normalise to unit mass.

    Args:
        years: Sequence of numeric year values.
        bins: Bin edges, forwarded unchanged to ``np.histogram``.

    Returns:
        A float array with one entry per bin. The entries sum to 1 when at
        least one year falls inside the bins; otherwise the all-zero counts
        are returned as they are.
    """
    counts = np.histogram(years, bins=bins)[0].astype(float)
    total = counts.sum()
    # Leave an empty histogram as zeros instead of dividing by zero.
    return counts / total if total > 0 else counts


def wasserstein_1d_from_hist(p, q, bin_width=1.0):
    """Return the summed absolute CDF difference of two histograms, scaled by *bin_width*.

    The cumulative sums of *p* and *q* are compared bin by bin; the absolute
    differences are added up and multiplied by *bin_width*.

    Args:
        p: First histogram (bin masses).
        q: Second histogram (bin masses).
        bin_width: Width of one bin, used as the scale factor.

    Returns:
        The distance as a plain Python ``float``.
    """
    # NOTE(review): presumably p and q are normalised and share the same
    # equal-width bin grid - that is what makes this a 1D Wasserstein distance.
    cdf_gap = np.cumsum(p) - np.cumsum(q)
    return float(np.abs(cdf_gap).sum() * bin_width)

def main(csv_path=None, out_path=None):
    """Build the pairwise Wasserstein distance matrix and save it as CSV.

    Steps:
      1. Read the input CSV with every column as ``str``.
      2. Parse the ``parsed_years`` cell of each row; rows that yield no valid
         year are dropped. If a ``filename`` occurs on several rows, the last
         row with valid years wins.
      3. Build one normalised histogram per remaining ``filename``.
      4. Fill a symmetric matrix with the distance of every pair and write
         it out with the sorted filenames as both index and columns.

    Args:
        csv_path: Input CSV; defaults to the module-level ``CSV_PATH``.
        out_path: Output CSV; defaults to the module-level ``OUT_PATH``.

    Raises:
        KeyError: If the input lacks a ``filename`` or ``parsed_years`` column.
    """
    csv_path = CSV_PATH if csv_path is None else csv_path
    out_path = OUT_PATH if out_path is None else out_path

    print(f"[INFO] 读取数据: {csv_path}")
    df = pd.read_csv(csv_path, dtype=str)

    data = {}
    skipped = 0
    for lang, raw_years in zip(df["filename"], df["parsed_years"]):
        # An empty filename cell is read as a float NaN; using it as a key
        # would make sorting the keys fail on mixed str/float comparison.
        if not isinstance(lang, str):
            skipped += 1
            continue
        years = parse_years_field(raw_years)
        if years:
            data[lang] = years
    if skipped:
        print(f"[WARN] skipped {skipped} row(s) with a missing filename")

    langs = sorted(data)
    print(f"[INFO] 共有 {len(langs)} 种语言.")

    # Adding BIN_WIDTH to the stop value makes the last edge reach YEAR_MAX.
    bins = np.arange(YEAR_MIN, YEAR_MAX + BIN_WIDTH, BIN_WIDTH)
    hists = [build_hist(data[lang], bins) for lang in langs]

    n = len(langs)
    # Zero-initialised, so the diagonal (self-distance) is already 0.
    D = np.zeros((n, n))
    print("[INFO] 计算 pairwise Wasserstein 距离矩阵...")

    # Give tqdm the pair count so the pairs can be streamed, not materialised.
    n_pairs = n * (n - 1) // 2
    for i, j in tqdm(combinations(range(n), 2), total=n_pairs, ncols=80):
        # Only the upper triangle is computed; mirror it for symmetry.
        D[i, j] = D[j, i] = wasserstein_1d_from_hist(hists[i], hists[j], BIN_WIDTH)

    D_df = pd.DataFrame(D, index=langs, columns=langs)
    D_df.to_csv(out_path)
    print(f"[OK] 保存矩阵到: {out_path}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()