In [None]:
# wasserstein distance matrix

from pathlib import Path
import pandas as pd
import numpy as np
import re
from scipy.stats import wasserstein_distance
import logging
from tqdm import tqdm

# Send INFO-and-above log records to a stream handler, prefixed with time and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# Directory holding every input CSV; main() also writes the distance matrices here.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
BASE = Path("/home/njian29/Desktop")
# Topic group -> CSV read by build_matrix (needs a "filename" column plus
# one or more columns whose name contains "date").
INPUTS = {
    "sports": BASE/"filename_dates_packed_history_of_sports_tagged.csv",
    "ideologies": BASE/"filename_dates_packed_history_of_ideologies_tagged.csv",
    "objects": BASE/"filename_dates_packed_historical_objects_tagged.csv",
}
# Topic group -> CSV with "filename" and "english_word_count" columns,
# used by main() to rank the languages.
WORDCOUNTS = {
    "sports": BASE/"filename_wordcounts_sports.csv",
    "ideologies": BASE/"filename_wordcounts_ideologies.csv",
    "objects": BASE/"filename_wordcounts_objects.csv",
}
# Scope name -> fraction of the ranked languages kept for that run.
SCOPES = {"top10":0.10, "top25":0.25, "top50":0.50}

# Inclusive bounds a number must satisfy to be accepted as a year.
MAX_YEAR = 2025
MIN_NUM_AS_YEAR = 32

def extract_all_years(text: str):
    """Collect every year-like number in ``text``, in order of appearance.

    A candidate is a standalone run of three or four digits whose value lies
    in ``[MIN_NUM_AS_YEAR, MAX_YEAR]``. Because the pattern needs at least
    three digits, the effective lower bound is 100 even though
    ``MIN_NUM_AS_YEAR`` is smaller.

    Args:
        text: Free text to scan. Non-strings and blank strings yield ``[]``.

    Returns:
        A list of ints; duplicates are kept.
    """
    if not isinstance(text, str):
        return []
    if not text.strip():
        return []
    candidates = (int(token) for token in re.findall(r"\b(\d{3,4})\b", text))
    return [num for num in candidates if MIN_NUM_AS_YEAR <= num <= MAX_YEAR]

def build_matrix(in_path: Path, langs_filter=None, pbar=None):
    """Build a language x year table of how often each year is mentioned.

    Args:
        in_path: CSV with a "filename" column (used as the language key) and
            any number of columns whose name contains "date"
            (case-insensitive); all columns are read as strings.
        langs_filter: Optional container of language keys to keep. ``None``
            keeps every row.
        pbar: Optional progress bar; ``update(1)`` is called once per CSV row,
            whether or not the row passes the filter.

    Returns:
        DataFrame indexed by language with one column per year and the
        mention count as value (0 where a year never occurs). Languages with
        no extracted year do not appear at all.
    """
    table = pd.read_csv(in_path, dtype=str, encoding="utf-8")
    date_cols = [col for col in table.columns if "date" in col.lower()]

    pairs = []
    for _, entry in table.iterrows():
        lang = entry["filename"]
        wanted = langs_filter is None or lang in langs_filter
        if wanted:
            # One (language, year) pair per year found in any date column.
            pairs.extend(
                (lang, year)
                for col in date_cols
                for year in extract_all_years(entry.get(col, ""))
            )
        if pbar:
            pbar.update(1)

    years_df = pd.DataFrame(pairs, columns=["language", "year"])
    counts = years_df.groupby(["language", "year"]).size().reset_index(name="count")
    wide = counts.pivot(index="language", columns="year", values="count")
    return wide.fillna(0)

def compute_wasserstein_matrix(pivot: pd.DataFrame):
    """Pairwise 1-D Wasserstein distance between the rows of a year histogram.

    Each row of ``pivot`` is treated as a distribution over the column
    values (years), weighted by the row's counts. The counts are passed to
    ``wasserstein_distance`` as weights instead of being expanded into one
    sample per count, so memory does not grow with the total number of
    mentions and the per-row work is done once rather than once per pair.

    Args:
        pivot: Rows are languages, columns are numeric years, values are
            non-negative counts (truncated to integers).

    Returns:
        Symmetric DataFrame indexed and labelled by ``pivot.index`` with a
        zero diagonal. A pair where exactly one row has no mass gets
        ``np.inf``; a pair where both rows are empty gets ``0.0``.
    """
    langs = pivot.index.tolist()
    years = pivot.columns.values
    # Truncate to whole counts, matching the integer sample expansion this replaces.
    weights = pivot.values.astype(int)
    totals = weights.sum(axis=1)

    n = len(langs)
    dist_mat = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            empty_i = totals[i] == 0
            empty_j = totals[j] == 0
            if empty_i and empty_j:
                d = 0.0
            elif empty_i or empty_j:
                # No transport plan exists between an empty and a non-empty row.
                d = np.inf
            else:
                d = wasserstein_distance(
                    years, years, u_weights=weights[i], v_weights=weights[j]
                )
            dist_mat[i, j] = dist_mat[j, i] = d
    return pd.DataFrame(dist_mat, index=langs, columns=langs)

def main():
    """Write one Wasserstein distance matrix per (group, scope) combination.

    For each topic group the languages are ranked by ``english_word_count``
    (descending); for each scope the leading fraction of that ranking (at
    least one language) is kept, its year histogram is built, and the
    resulting distance matrix is saved as
    ``BASE / "wasserstein_{group}_{scope}.csv"``.
    """
    for group in ("objects", "sports", "ideologies"):
        path = INPUTS[group]
        wc_df = pd.read_csv(
            WORDCOUNTS[group],
            dtype={"filename": str, "english_word_count": int},
        )
        ranked = wc_df.sort_values("english_word_count", ascending=False).reset_index(drop=True)
        langs_sorted = ranked["filename"].tolist()
        total_langs = len(langs_sorted)
        logging.info(f"{group}: 总共有 {total_langs} 种语言")

        for scope in SCOPES:
            frac = SCOPES[scope]
            # Always keep at least one language, even for tiny rankings.
            k = max(1, int(total_langs * frac))
            top_langs = set(langs_sorted[:k])
            logging.info(f"{group}-{scope}: 取前 {k} 种语言")

            dist_mat = compute_wasserstein_matrix(
                build_matrix(path, langs_filter=top_langs)
            )

            out_file = BASE / f"wasserstein_{group}_{scope}.csv"
            dist_mat.to_csv(out_file)
            logging.info(f"[{group}-{scope}] 已保存 Wasserstein 距离矩阵: {out_file}")

if __name__ == "__main__":
    main()

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import logging
from adjustText import adjust_text

# INFO-level logging with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Directory the distance matrices are read from and the figures are written to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
BASE = Path("/home/njian29/Desktop")

# Scope name -> distance-matrix CSV for the sports group
# (file names follow the wasserstein_{group}_{scope}.csv pattern).
INPUT_FILES = {
    "top10": BASE / "wasserstein_sports_top10.csv",
    "top25": BASE / "wasserstein_sports_top25.csv",
    "top50": BASE / "wasserstein_sports_top50.csv",
}

# Scope name -> inclusive range of cluster counts auto_dbscan may accept.
CLUSTER_CONFIG = {
    "top10":  {"min_clusters": 3, "max_clusters": 4}, 
    "top25":  {"min_clusters": 4, "max_clusters": 7}, 
    "top50":  {"min_clusters": 6, "max_clusters": 10}, 
}

def cluster_dbscan(dist_mat: pd.DataFrame, eps=10, min_samples=2):
    """Cluster the rows of a precomputed distance matrix with DBSCAN.

    Args:
        dist_mat: Pairwise distance matrix; its index names the items.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        DataFrame with columns "language" (the matrix index) and
        "cluster_id" (the label DBSCAN assigned to that row).
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    assignments = clusterer.fit_predict(dist_mat.values)
    result = pd.DataFrame({"language": dist_mat.index, "cluster_id": assignments})
    return result

def auto_dbscan(dist_mat: pd.DataFrame, group="sports", scope="top10",
                min_clusters=3, max_clusters=12):
    """Grid-search DBSCAN settings and keep the best-scoring clustering.

    Every (eps, min_samples) pair from a fixed grid is tried. Results whose
    cluster count (label -1 excluded) falls outside
    ``[min_clusters, max_clusters]`` are discarded; the rest are scored as
    ``n_clusters - 2 * noise_ratio`` and the first highest score wins.

    Args:
        dist_mat: Precomputed distance matrix.
        group, scope: Accepted for signature parity; not used in this body.
        min_clusters, max_clusters: Inclusive bounds on the cluster count.

    Returns:
        ``(clusters, (eps, min_samples))`` for the winner, or
        ``(None, None)`` when no candidate qualifies.
    """
    eps_candidates = [3, 5, 7, 10, 12, 15, 20, 25, 30]
    min_samples_candidates = [2, 3, 4, 5]
    grid = [(eps, ms) for eps in eps_candidates for ms in min_samples_candidates]

    best_score = -1
    best_clusters = None
    best_params = None

    for eps, ms in grid:
        candidate = cluster_dbscan(dist_mat, eps=eps, min_samples=ms)
        labels = candidate["cluster_id"].values
        # Label -1 marks noise and is not counted as a cluster.
        n_clusters = len(set(labels)) - int(-1 in labels)
        n_noise = sum(labels == -1)
        noise_ratio = n_noise / len(labels)

        if not (min_clusters <= n_clusters <= max_clusters):
            continue

        score = n_clusters - 2 * noise_ratio
        if score > best_score:
            best_score, best_clusters, best_params = score, candidate, (eps, ms)

    return best_clusters, best_params

def plot_clusters(dist_mat: pd.DataFrame, clusters: pd.DataFrame,
                  group="sports", scope="top10", params=None, tag="repel"):
    """Draw a labelled 2-D scatter of a clustering and save it as a PNG.

    The distance matrix is embedded in two dimensions with MDS, each cluster
    is drawn in its own colour (label -1 is shown as "Noise"), every point is
    annotated with its language name, and ``adjust_text`` is asked to move
    the annotations apart.

    Args:
        dist_mat: Precomputed distance matrix; its index supplies the point
            labels.
        clusters: Frame with a "cluster_id" column. It is matched to
            ``dist_mat`` by position, not by label, so its rows must be in
            the same order as ``dist_mat.index``.
        group, scope, tag: Used in the title, the output file name and the
            log message.
        params: Shown verbatim in the title.

    Returns:
        None. Writes ``BASE / "dbscan_{group}_{scope}_{tag}.png"``.
    """
    # Fixed random_state so the 2-D layout is seeded the same way on every run.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
    coords = mds.fit_transform(dist_mat.values)
    df_plot = pd.DataFrame({
        "x": coords[:, 0], "y": coords[:, 1],
        "language": dist_mat.index,
        # Positional alignment: .values drops the clusters frame's own index.
        "cluster_id": clusters["cluster_id"].values
    })

    plt.figure(figsize=(18, 14))
    unique_clusters = sorted(df_plot["cluster_id"].unique())
    # One tab20 colour per cluster, evenly spaced over the colormap.
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))

    # Text artists are collected so adjust_text can reposition them afterwards.
    texts = []
    for cid, color in zip(unique_clusters, colors):
        subset = df_plot[df_plot["cluster_id"] == cid]
        label = f"Cluster {cid}" if cid != -1 else "Noise"
        plt.scatter(subset["x"], subset["y"], c=[color], s=40, alpha=0.8,
                    label=label, edgecolors="k")
        for _, row in subset.iterrows():
            texts.append(
                plt.text(row["x"], row["y"], row["language"], fontsize=8)
            )
    adjust_text(
        texts,
        arrowprops=dict(arrowstyle="->", color="gray", lw=0.5),
        force_points=1.0,
        force_text=1.5,
        expand_points=(1.4, 1.8),
        expand_text=(1.4, 1.8),
        only_move={'points': 'y', 'text': 'xy'}
    )

    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.title(f"DBSCAN Clusters ({group}-{scope}, {tag})\nparams={params}", fontsize=14)

    out_fig = BASE / f"dbscan_{group}_{scope}_{tag}.png"
    plt.savefig(out_fig, dpi=200, bbox_inches="tight")
    # Close the current figure once it has been written to disk.
    plt.close()
    logging.info(f"[{group}-{scope}] 聚类图已保存: {out_fig}")

def main():
    """Cluster and plot every sports distance matrix listed in INPUT_FILES.

    For each scope the matrix is loaded, DBSCAN settings are searched within
    that scope's CLUSTER_CONFIG bounds, and the winning clustering is
    plotted. A scope with no acceptable clustering is logged as an error and
    skipped.
    """
    group = "sports"
    for scope in INPUT_FILES:
        file_path = INPUT_FILES[scope]
        logging.info(f"=== 处理 {scope} ({file_path.name}) ===")
        dist_mat = pd.read_csv(file_path, index_col=0)

        limits = CLUSTER_CONFIG[scope]
        clusters, params = auto_dbscan(
            dist_mat,
            group=group,
            scope=scope,
            min_clusters=limits["min_clusters"],
            max_clusters=limits["max_clusters"],
        )

        if clusters is not None:
            plot_clusters(dist_mat, clusters,
                          group=group, scope=scope, params=params, tag="repel")
        else:
            logging.error(f"[{group}-{scope}] 未能找到合适的聚类参数。")

if __name__ == "__main__":
    main()

In [None]:
# CSV files and DBSCAN filtered chart for objects

import logging
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as p
# The code in this cell refers to pandas as `pd`; without this alias the cell
# raises NameError on a fresh kernel. The `p` alias above is kept as-is.
import pandas as pd
from adjustText import adjust_text
from sklearn.cluster import DBSCAN
from sklearn.manifold import MDS

# INFO-level logging with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Directory the distance matrices are read from and all outputs are written to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
BASE = Path("/home/njian29/Desktop")

# Scope name -> distance-matrix CSV for the objects group
# (file names follow the wasserstein_{group}_{scope}.csv pattern).
INPUT_FILES = {
    "top10": BASE / "wasserstein_objects_top10.csv",
    "top25": BASE / "wasserstein_objects_top25.csv",
    "top50": BASE / "wasserstein_objects_top50.csv",
}

def cluster_dbscan(dist_mat: pd.DataFrame, eps=10, min_samples=2):
    """Cluster the rows of a precomputed distance matrix with DBSCAN.

    Args:
        dist_mat: Pairwise distance matrix; its index names the items.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        DataFrame with columns "language" (the matrix index) and
        "cluster_id" (the label DBSCAN assigned to that row).
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    assignments = clusterer.fit_predict(dist_mat.values)
    result = pd.DataFrame({"language": dist_mat.index, "cluster_id": assignments})
    return result

def auto_dbscan(dist_mat: pd.DataFrame, group="objects", scope="top10",
                min_clusters=3, max_clusters=12):
    """Grid-search DBSCAN settings and keep the best-scoring clustering.

    Every (eps, min_samples) pair from a fixed grid is tried. Results whose
    cluster count (label -1 excluded) falls outside
    ``[min_clusters, max_clusters]`` are discarded; the rest are logged,
    scored as ``n_clusters - 2 * noise_ratio``, and the first highest score
    wins.

    Args:
        dist_mat: Precomputed distance matrix.
        group, scope: Used only to prefix the log messages.
        min_clusters, max_clusters: Inclusive bounds on the cluster count.

    Returns:
        ``(clusters, (eps, min_samples))`` for the winner, or
        ``(None, None)`` when no candidate qualifies.
    """
    eps_candidates = [5, 7, 10, 12, 15, 20, 25, 30]
    min_samples_candidates = [2, 3, 4, 5]
    grid = [(eps, ms) for eps in eps_candidates for ms in min_samples_candidates]

    best_score = -1
    best_clusters = None
    best_params = None

    for eps, ms in grid:
        candidate = cluster_dbscan(dist_mat, eps=eps, min_samples=ms)
        labels = candidate["cluster_id"].values
        # Label -1 marks noise and is not counted as a cluster.
        n_clusters = len(set(labels)) - int(-1 in labels)
        n_noise = sum(labels == -1)
        noise_ratio = n_noise / len(labels)

        if not (min_clusters <= n_clusters <= max_clusters):
            continue

        score = n_clusters - 2 * noise_ratio
        logging.info(f"[{group}-{scope}] eps={eps}, min_samples={ms} → "
                     f"簇数={n_clusters}, 噪声={n_noise}, score={score:.2f}")

        if score > best_score:
            best_score, best_clusters, best_params = score, candidate, (eps, ms)

    if best_clusters is not None:
        logging.info(f"[{group}-{scope}] 最佳参数: eps={best_params[0]}, min_samples={best_params[1]} (score={best_score:.2f})")
    else:
        logging.warning(f"[{group}-{scope}] 没有找到满足条件的参数。")

    return best_clusters, best_params

def assign_noise_to_nearest(dist_mat: pd.DataFrame, clusters: pd.DataFrame):
    """Reassign DBSCAN noise points (label -1) to their closest cluster.

    "Closest" means the cluster whose members have the smallest mean
    distance to the noise point. Membership is taken from the labels as
    passed in, so reassigned noise points do not influence each other. Ties
    go to the lowest cluster id.

    Args:
        dist_mat: Precomputed distance matrix whose rows and columns are in
            the same order as the rows of ``clusters``.
        clusters: Frame with a "cluster_id" column; -1 marks noise.

    Returns:
        ``clusters`` itself, with "cluster_id" overwritten in place. It is
        returned unchanged when there is no noise, or when every point is
        noise and there is therefore no cluster to assign to.
    """
    # Work on a private, writable copy: the array behind a column can be a
    # read-only view (pandas copy-on-write), and writing through a view
    # would modify the frame behind pandas' back.
    labels = clusters["cluster_id"].to_numpy().copy()
    noise_idxs = np.flatnonzero(labels == -1)
    cluster_ids = [cid for cid in np.unique(labels) if cid != -1]

    if noise_idxs.size == 0 or not cluster_ids:
        return clusters

    dist = dist_mat.to_numpy()
    # Row k: mean distance from the members of cluster_ids[k] to every point.
    mean_dist = np.vstack([dist[labels == cid].mean(axis=0) for cid in cluster_ids])
    nearest = mean_dist[:, noise_idxs].argmin(axis=0)
    labels[noise_idxs] = np.asarray(cluster_ids)[nearest]

    clusters["cluster_id"] = labels
    return clusters

def plot_clusters(dist_mat: pd.DataFrame, clusters: pd.DataFrame,
                  group="objects", scope="top10", params=None, tag="filtered"):
    """Draw a labelled 2-D scatter of a clustering and save it as a PNG.

    The distance matrix is embedded in two dimensions with MDS, the points
    farthest from the embedding origin are dropped, each remaining cluster is
    drawn in its own colour (label -1 is shown as "Noise"), every point is
    annotated with its language name, and ``adjust_text`` is asked to move
    the annotations apart.

    Args:
        dist_mat: Precomputed distance matrix; its index supplies the point
            labels.
        clusters: Frame with a "cluster_id" column. It is matched to
            ``dist_mat`` by position, not by label, so its rows must be in
            the same order as ``dist_mat.index``.
        group, scope, tag: Used in the title, the output file name and the
            log message.
        params: Shown verbatim in the title.

    Returns:
        None. Writes ``BASE / "dbscan_{group}_{scope}_{tag}.png"``.
    """
    # Fixed random_state so the 2-D layout is seeded the same way on every run.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
    coords = mds.fit_transform(dist_mat.values)
    df_plot = pd.DataFrame({
        "x": coords[:, 0], "y": coords[:, 1],
        "language": dist_mat.index,
        # Positional alignment: .values drops the clusters frame's own index.
        "cluster_id": clusters["cluster_id"].values
    })

    # Keep only points within the 90th percentile of distance from the origin
    # of the embedding; the remaining points are not drawn at all.
    # NOTE(review): presumably meant to stop outliers stretching the axes — confirm.
    df_plot["dist"] = np.sqrt(df_plot["x"]**2 + df_plot["y"]**2)
    cutoff = df_plot["dist"].quantile(0.90)
    df_plot = df_plot[df_plot["dist"] <= cutoff]

    plt.figure(figsize=(18, 14))
    unique_clusters = sorted(df_plot["cluster_id"].unique())
    # One tab20 colour per cluster, evenly spaced over the colormap.
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))

    # Text artists are collected so adjust_text can reposition them afterwards.
    texts = []
    for cid, color in zip(unique_clusters, colors):
        subset = df_plot[df_plot["cluster_id"] == cid]
        label = f"Cluster {cid}" if cid != -1 else "Noise"
        plt.scatter(subset["x"], subset["y"], c=[color], s=40, alpha=0.8,
                    label=label, edgecolors="k")
        for _, row in subset.iterrows():
            texts.append(
                plt.text(row["x"], row["y"], row["language"], fontsize=8)
            )

    adjust_text(
        texts,
        arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
        force_points=0.5,
        force_text=0.8,
        expand_points=(1.2, 1.6),
        expand_text=(1.2, 1.6),
        only_move={'points':'y', 'text':'xy'}
    )

    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.title(f"DBSCAN Clusters ({group}-{scope}, {tag})\nparams={params}", fontsize=14)

    out_fig = BASE / f"dbscan_{group}_{scope}_{tag}.png"
    plt.savefig(out_fig, dpi=200, bbox_inches="tight")
    # Close the current figure once it has been written to disk.
    plt.close()
    logging.info(f"[{group}-{scope}] 聚类图已保存: {out_fig}")

def main():
    """Cluster, export and plot every objects distance matrix in INPUT_FILES.

    For each scope: load the matrix, search DBSCAN settings for 3 to 12
    clusters, save the raw labels (``..._original.csv``), reassign noise
    points to their nearest cluster and save that (``..._noisefree.csv``),
    then plot the noise-free clustering. A scope with no acceptable
    clustering is logged as an error and skipped.
    """
    group = "objects"
    for scope in INPUT_FILES:
        file_path = INPUT_FILES[scope]
        logging.info(f"=== 处理 {scope} ({file_path.name}) ===")
        dist_mat = pd.read_csv(file_path, index_col=0)

        clusters, params = auto_dbscan(
            dist_mat, group=group, scope=scope, min_clusters=3, max_clusters=12
        )
        if clusters is None:
            logging.error(f"[{group}-{scope}] 未能找到合适的聚类参数。")
            continue

        out_csv = BASE / f"dbscan_{group}_{scope}_original.csv"
        clusters.to_csv(out_csv, index=False)
        logging.info(f"[{group}-{scope}] 原始聚类结果已保存: {out_csv}")

        # Pass a copy so the raw labels saved above stay untouched.
        clusters_noisefree = assign_noise_to_nearest(dist_mat, clusters.copy())
        out_csv2 = BASE / f"dbscan_{group}_{scope}_noisefree.csv"
        clusters_noisefree.to_csv(out_csv2, index=False)
        logging.info(f"[{group}-{scope}] 去噪聚类结果已保存: {out_csv2}")

        plot_clusters(
            dist_mat, clusters_noisefree,
            group=group, scope=scope, params=params, tag="filtered",
        )

if __name__ == "__main__":
    main()

In [None]:
# CSV files and DBSCAN filtered chart for ideologies

from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import logging
from adjustText import adjust_text

# INFO-level logging with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Directory the distance matrices are read from and all outputs are written to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
BASE = Path("/home/njian29/Desktop")

# Scope name -> distance-matrix CSV for the ideologies group
# (file names follow the wasserstein_{group}_{scope}.csv pattern).
INPUT_FILES = {
    "top10": BASE / "wasserstein_ideologies_top10.csv",
    "top25": BASE / "wasserstein_ideologies_top25.csv",
    "top50": BASE / "wasserstein_ideologies_top50.csv",
}

def cluster_dbscan(dist_mat: pd.DataFrame, eps=10, min_samples=2):
    """Cluster the rows of a precomputed distance matrix with DBSCAN.

    Args:
        dist_mat: Pairwise distance matrix; its index names the items.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        DataFrame with columns "language" (the matrix index) and
        "cluster_id" (the label DBSCAN assigned to that row).
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    assignments = clusterer.fit_predict(dist_mat.values)
    result = pd.DataFrame({"language": dist_mat.index, "cluster_id": assignments})
    return result

def auto_dbscan(dist_mat: pd.DataFrame, group="ideologies", scope="top10",
                min_clusters=3, max_clusters=12):
    """Grid-search DBSCAN settings and keep the best-scoring clustering.

    Every (eps, min_samples) pair from a fixed grid is tried. Results whose
    cluster count (label -1 excluded) falls outside
    ``[min_clusters, max_clusters]`` are discarded; the rest are logged,
    scored as ``n_clusters - 2 * noise_ratio``, and the first highest score
    wins.

    Args:
        dist_mat: Precomputed distance matrix.
        group, scope: Used only to prefix the log messages.
        min_clusters, max_clusters: Inclusive bounds on the cluster count.

    Returns:
        ``(clusters, (eps, min_samples))`` for the winner, or
        ``(None, None)`` when no candidate qualifies.
    """
    eps_candidates = [5, 7, 10, 12, 15, 20, 25, 30]
    min_samples_candidates = [2, 3, 4, 5]
    grid = [(eps, ms) for eps in eps_candidates for ms in min_samples_candidates]

    best_score = -1
    best_clusters = None
    best_params = None

    for eps, ms in grid:
        candidate = cluster_dbscan(dist_mat, eps=eps, min_samples=ms)
        labels = candidate["cluster_id"].values
        # Label -1 marks noise and is not counted as a cluster.
        n_clusters = len(set(labels)) - int(-1 in labels)
        n_noise = sum(labels == -1)
        noise_ratio = n_noise / len(labels)

        if not (min_clusters <= n_clusters <= max_clusters):
            continue

        score = n_clusters - 2 * noise_ratio
        logging.info(f"[{group}-{scope}] eps={eps}, min_samples={ms} → "
                     f"簇数={n_clusters}, 噪声={n_noise}, score={score:.2f}")

        if score > best_score:
            best_score, best_clusters, best_params = score, candidate, (eps, ms)

    if best_clusters is not None:
        logging.info(f"[{group}-{scope}] 最佳参数: eps={best_params[0]}, min_samples={best_params[1]} (score={best_score:.2f})")
    else:
        logging.warning(f"[{group}-{scope}] 没有找到满足条件的参数。")

    return best_clusters, best_params

def assign_noise_to_nearest(dist_mat: pd.DataFrame, clusters: pd.DataFrame):
    """Reassign DBSCAN noise points (label -1) to their closest cluster.

    "Closest" means the cluster whose members have the smallest mean
    distance to the noise point. Membership is taken from the labels as
    passed in, so reassigned noise points do not influence each other. Ties
    go to the lowest cluster id.

    Args:
        dist_mat: Precomputed distance matrix whose rows and columns are in
            the same order as the rows of ``clusters``.
        clusters: Frame with a "cluster_id" column; -1 marks noise.

    Returns:
        ``clusters`` itself, with "cluster_id" overwritten in place. It is
        returned unchanged when there is no noise, or when every point is
        noise and there is therefore no cluster to assign to.
    """
    # Work on a private, writable copy: the array behind a column can be a
    # read-only view (pandas copy-on-write), and writing through a view
    # would modify the frame behind pandas' back.
    labels = clusters["cluster_id"].to_numpy().copy()
    noise_idxs = np.flatnonzero(labels == -1)
    cluster_ids = [cid for cid in np.unique(labels) if cid != -1]

    if noise_idxs.size == 0 or not cluster_ids:
        return clusters

    dist = dist_mat.to_numpy()
    # Row k: mean distance from the members of cluster_ids[k] to every point.
    mean_dist = np.vstack([dist[labels == cid].mean(axis=0) for cid in cluster_ids])
    nearest = mean_dist[:, noise_idxs].argmin(axis=0)
    labels[noise_idxs] = np.asarray(cluster_ids)[nearest]

    clusters["cluster_id"] = labels
    return clusters

# ========= Visualization =========
def plot_clusters(dist_mat: pd.DataFrame, clusters: pd.DataFrame,
                  group="ideologies", scope="top10", params=None, tag="filtered"):
    """Draw a labelled 2-D scatter of a clustering and save it as a PNG.

    The distance matrix is embedded in two dimensions with MDS, the points
    farthest from the embedding origin are dropped, each remaining cluster is
    drawn in its own colour (label -1 is shown as "Noise"), every point is
    annotated with its language name, and ``adjust_text`` is asked to move
    the annotations apart.

    Args:
        dist_mat: Precomputed distance matrix; its index supplies the point
            labels.
        clusters: Frame with a "cluster_id" column. It is matched to
            ``dist_mat`` by position, not by label, so its rows must be in
            the same order as ``dist_mat.index``.
        group, scope, tag: Used in the title, the output file name and the
            log message.
        params: Shown verbatim in the title.

    Returns:
        None. Writes ``BASE / "dbscan_{group}_{scope}_{tag}.png"``.
    """
    # Fixed random_state so the 2-D layout is seeded the same way on every run.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
    coords = mds.fit_transform(dist_mat.values)
    df_plot = pd.DataFrame({
        "x": coords[:, 0], "y": coords[:, 1],
        "language": dist_mat.index,
        # Positional alignment: .values drops the clusters frame's own index.
        "cluster_id": clusters["cluster_id"].values
    })

    # Keep only points within the 90th percentile of distance from the origin
    # of the embedding; the remaining points are not drawn at all.
    # NOTE(review): presumably meant to stop outliers stretching the axes — confirm.
    df_plot["dist"] = np.sqrt(df_plot["x"]**2 + df_plot["y"]**2)
    cutoff = df_plot["dist"].quantile(0.90)
    df_plot = df_plot[df_plot["dist"] <= cutoff]

    plt.figure(figsize=(18, 14))
    unique_clusters = sorted(df_plot["cluster_id"].unique())
    # One tab20 colour per cluster, evenly spaced over the colormap.
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))

    # Text artists are collected so adjust_text can reposition them afterwards.
    texts = []
    for cid, color in zip(unique_clusters, colors):
        subset = df_plot[df_plot["cluster_id"] == cid]
        label = f"Cluster {cid}" if cid != -1 else "Noise"
        plt.scatter(subset["x"], subset["y"], c=[color], s=40, alpha=0.8,
                    label=label, edgecolors="k")
        for _, row in subset.iterrows():
            texts.append(
                plt.text(row["x"], row["y"], row["language"], fontsize=8)
            )

    adjust_text(
        texts,
        arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
        force_points=0.5,
        force_text=0.8,
        expand_points=(1.2, 1.6),
        expand_text=(1.2, 1.6),
        only_move={'points':'y', 'text':'xy'}
    )

    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.title(f"DBSCAN Clusters ({group}-{scope}, {tag})\nparams={params}", fontsize=14)

    out_fig = BASE / f"dbscan_{group}_{scope}_{tag}.png"
    plt.savefig(out_fig, dpi=200, bbox_inches="tight")
    # Close the current figure once it has been written to disk.
    plt.close()
    logging.info(f"[{group}-{scope}] 聚类图已保存: {out_fig}")

def main():
    """Cluster, export and plot every ideologies distance matrix in INPUT_FILES.

    For each scope: load the matrix, search DBSCAN settings for 3 to 12
    clusters, save the raw labels (``..._original.csv``), reassign noise
    points to their nearest cluster and save that (``..._noisefree.csv``),
    then plot the noise-free clustering. A scope with no acceptable
    clustering is logged as an error and skipped.
    """
    group = "ideologies"
    for scope in INPUT_FILES:
        file_path = INPUT_FILES[scope]
        logging.info(f"=== 处理 {scope} ({file_path.name}) ===")
        dist_mat = pd.read_csv(file_path, index_col=0)

        clusters, params = auto_dbscan(
            dist_mat, group=group, scope=scope, min_clusters=3, max_clusters=12
        )
        if clusters is None:
            logging.error(f"[{group}-{scope}] 未能找到合适的聚类参数。")
            continue

        out_csv = BASE / f"dbscan_{group}_{scope}_original.csv"
        clusters.to_csv(out_csv, index=False)
        logging.info(f"[{group}-{scope}] 原始聚类结果已保存: {out_csv}")

        # Pass a copy so the raw labels saved above stay untouched.
        clusters_noisefree = assign_noise_to_nearest(dist_mat, clusters.copy())
        out_csv2 = BASE / f"dbscan_{group}_{scope}_noisefree.csv"
        clusters_noisefree.to_csv(out_csv2, index=False)
        logging.info(f"[{group}-{scope}] 去噪聚类结果已保存: {out_csv2}")

        plot_clusters(
            dist_mat, clusters_noisefree,
            group=group, scope=scope, params=params, tag="filtered",
        )

if __name__ == "__main__":
    main()

In [None]:
# CSV files and DBSCAN filtered chart for sports
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import logging
from adjustText import adjust_text

# INFO-level logging with a timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Directory the distance matrices are read from and all outputs are written to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
BASE = Path("/home/njian29/Desktop")

# Scope name -> distance-matrix CSV for the sports group
# (file names follow the wasserstein_{group}_{scope}.csv pattern).
INPUT_FILES = {
    "top10": BASE / "wasserstein_sports_top10.csv",
    "top25": BASE / "wasserstein_sports_top25.csv",
    "top50": BASE / "wasserstein_sports_top50.csv",
}

# Scope name -> inclusive range of cluster counts auto_dbscan may accept.
CLUSTER_CONFIG = {
    "top10":  {"min_clusters": 3, "max_clusters": 4},
    "top25":  {"min_clusters": 4, "max_clusters": 7},  
    "top50":  {"min_clusters": 6, "max_clusters": 10}, 
}

def cluster_dbscan(dist_mat: pd.DataFrame, eps=10, min_samples=2):
    """Cluster the rows of a precomputed distance matrix with DBSCAN.

    Args:
        dist_mat: Pairwise distance matrix; its index names the items.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        DataFrame with columns "language" (the matrix index) and
        "cluster_id" (the label DBSCAN assigned to that row).
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
    assignments = clusterer.fit_predict(dist_mat.values)
    result = pd.DataFrame({"language": dist_mat.index, "cluster_id": assignments})
    return result

def auto_dbscan(dist_mat: pd.DataFrame, group="sports", scope="top10",
                min_clusters=3, max_clusters=12):
    """Grid-search DBSCAN settings and keep the best-scoring clustering.

    Every (eps, min_samples) pair from a fixed grid is tried. Results whose
    cluster count (label -1 excluded) falls outside
    ``[min_clusters, max_clusters]`` are discarded; the rest are logged,
    scored as ``n_clusters - 2 * noise_ratio``, and the first highest score
    wins.

    Args:
        dist_mat: Precomputed distance matrix.
        group, scope: Used only to prefix the log messages.
        min_clusters, max_clusters: Inclusive bounds on the cluster count.

    Returns:
        ``(clusters, (eps, min_samples))`` for the winner, or
        ``(None, None)`` when no candidate qualifies.
    """
    eps_candidates = [3, 5, 7, 10, 12, 15, 20, 25, 30]
    min_samples_candidates = [2, 3, 4, 5]
    grid = [(eps, ms) for eps in eps_candidates for ms in min_samples_candidates]

    best_score = -1
    best_clusters = None
    best_params = None

    for eps, ms in grid:
        candidate = cluster_dbscan(dist_mat, eps=eps, min_samples=ms)
        labels = candidate["cluster_id"].values
        # Label -1 marks noise and is not counted as a cluster.
        n_clusters = len(set(labels)) - int(-1 in labels)
        n_noise = sum(labels == -1)
        noise_ratio = n_noise / len(labels)

        if not (min_clusters <= n_clusters <= max_clusters):
            continue

        score = n_clusters - 2 * noise_ratio
        logging.info(f"[{group}-{scope}] eps={eps}, min_samples={ms} → "
                     f"簇数={n_clusters}, 噪声={n_noise}, score={score:.2f}")

        if score > best_score:
            best_score, best_clusters, best_params = score, candidate, (eps, ms)

    if best_clusters is not None:
        logging.info(f"[{group}-{scope}] 最佳参数: eps={best_params[0]}, min_samples={best_params[1]} (score={best_score:.2f})")
    else:
        logging.warning(f"[{group}-{scope}] 没有找到满足条件的参数。")

    return best_clusters, best_params

def assign_noise_to_nearest(dist_mat: pd.DataFrame, clusters: pd.DataFrame):
    """Reassign DBSCAN noise points (label -1) to their closest cluster.

    "Closest" means the cluster whose members have the smallest mean
    distance to the noise point. Membership is taken from the labels as
    passed in, so reassigned noise points do not influence each other. Ties
    go to the lowest cluster id.

    Args:
        dist_mat: Precomputed distance matrix whose rows and columns are in
            the same order as the rows of ``clusters``.
        clusters: Frame with a "cluster_id" column; -1 marks noise.

    Returns:
        ``clusters`` itself, with "cluster_id" overwritten in place. It is
        returned unchanged when there is no noise, or when every point is
        noise and there is therefore no cluster to assign to.
    """
    # Work on a private, writable copy: the array behind a column can be a
    # read-only view (pandas copy-on-write), and writing through a view
    # would modify the frame behind pandas' back.
    labels = clusters["cluster_id"].to_numpy().copy()
    noise_idxs = np.flatnonzero(labels == -1)
    cluster_ids = [cid for cid in np.unique(labels) if cid != -1]

    if noise_idxs.size == 0 or not cluster_ids:
        return clusters

    dist = dist_mat.to_numpy()
    # Row k: mean distance from the members of cluster_ids[k] to every point.
    mean_dist = np.vstack([dist[labels == cid].mean(axis=0) for cid in cluster_ids])
    nearest = mean_dist[:, noise_idxs].argmin(axis=0)
    labels[noise_idxs] = np.asarray(cluster_ids)[nearest]

    clusters["cluster_id"] = labels
    return clusters

def plot_clusters(dist_mat: pd.DataFrame, clusters: pd.DataFrame,
                  group="sports", scope="top10", params=None, tag="filtered"):
    """Draw a labelled 2-D scatter of a clustering and save it as a PNG.

    The distance matrix is embedded in two dimensions with MDS, the points
    farthest from the embedding origin are dropped, each remaining cluster is
    drawn in its own colour (label -1 is shown as "Noise"), every point is
    annotated with its language name, and ``adjust_text`` is asked to move
    the annotations apart.

    Args:
        dist_mat: Precomputed distance matrix; its index supplies the point
            labels.
        clusters: Frame with a "cluster_id" column. It is matched to
            ``dist_mat`` by position, not by label, so its rows must be in
            the same order as ``dist_mat.index``.
        group, scope, tag: Used in the title, the output file name and the
            log message.
        params: Shown verbatim in the title.

    Returns:
        None. Writes ``BASE / "dbscan_{group}_{scope}_{tag}.png"``.
    """
    # Fixed random_state so the 2-D layout is seeded the same way on every run.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
    coords = mds.fit_transform(dist_mat.values)
    df_plot = pd.DataFrame({
        "x": coords[:, 0], "y": coords[:, 1],
        "language": dist_mat.index,
        # Positional alignment: .values drops the clusters frame's own index.
        "cluster_id": clusters["cluster_id"].values
    })

    # Keep only points within the 90th percentile of distance from the origin
    # of the embedding; the remaining points are not drawn at all.
    # NOTE(review): presumably meant to stop outliers stretching the axes — confirm.
    df_plot["dist"] = np.sqrt(df_plot["x"]**2 + df_plot["y"]**2)
    cutoff = df_plot["dist"].quantile(0.90)
    df_plot = df_plot[df_plot["dist"] <= cutoff]

    plt.figure(figsize=(18, 14))
    unique_clusters = sorted(df_plot["cluster_id"].unique())
    # One tab20 colour per cluster, evenly spaced over the colormap.
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))

    # Text artists are collected so adjust_text can reposition them afterwards.
    texts = []
    for cid, color in zip(unique_clusters, colors):
        subset = df_plot[df_plot["cluster_id"] == cid]
        label = f"Cluster {cid}" if cid != -1 else "Noise"
        plt.scatter(subset["x"], subset["y"], c=[color], s=40, alpha=0.8,
                    label=label, edgecolors="k")
        for _, row in subset.iterrows():
            texts.append(
                plt.text(row["x"], row["y"], row["language"], fontsize=8)
            )

    adjust_text(
        texts,
        arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
        force_points=0.5,
        force_text=0.8,
        expand_points=(1.2, 1.6),
        expand_text=(1.2, 1.6),
        only_move={'points':'y', 'text':'xy'}
    )

    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(title="Clusters", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.title(f"DBSCAN Clusters ({group}-{scope}, {tag})\nparams={params}", fontsize=14)

    out_fig = BASE / f"dbscan_{group}_{scope}_{tag}.png"
    plt.savefig(out_fig, dpi=200, bbox_inches="tight")
    # Close the current figure once it has been written to disk.
    plt.close()
    logging.info(f"[{group}-{scope}] 聚类图已保存: {out_fig}")

def main():
    """Cluster, export and plot every sports distance matrix in INPUT_FILES.

    For each scope: load the matrix, search DBSCAN settings within that
    scope's CLUSTER_CONFIG bounds, save the raw labels
    (``..._original.csv``), reassign noise points to their nearest cluster
    and save that (``..._noisefree.csv``), then plot the noise-free
    clustering. A scope with no acceptable clustering is logged as an error
    and skipped.
    """
    group = "sports"
    for scope in INPUT_FILES:
        file_path = INPUT_FILES[scope]
        logging.info(f"=== 处理 {scope} ({file_path.name}) ===")
        dist_mat = pd.read_csv(file_path, index_col=0)

        limits = CLUSTER_CONFIG[scope]
        clusters, params = auto_dbscan(
            dist_mat,
            group=group,
            scope=scope,
            min_clusters=limits["min_clusters"],
            max_clusters=limits["max_clusters"],
        )
        if clusters is None:
            logging.error(f"[{group}-{scope}] 未能找到合适的聚类参数。")
            continue

        out_csv = BASE / f"dbscan_{group}_{scope}_original.csv"
        clusters.to_csv(out_csv, index=False)

        # Pass a copy so the raw labels saved above stay untouched.
        clusters_noisefree = assign_noise_to_nearest(dist_mat, clusters.copy())
        out_csv2 = BASE / f"dbscan_{group}_{scope}_noisefree.csv"
        clusters_noisefree.to_csv(out_csv2, index=False)

        plot_clusters(
            dist_mat, clusters_noisefree,
            group=group, scope=scope, params=params, tag="filtered",
        )

if __name__ == "__main__":
    main()