In [1]:
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Iterable, Optional, Union


def load_jsonl_as_dict_of_dict(path, key=None):
    """
    Read a JSON-Lines file and index its records by one of their fields.

    Each non-blank line of `path` (read as UTF-8) is parsed with `json.loads`;
    the parsed object is stored under `obj[key]`. Lines that are empty or
    whitespace-only are skipped. When several records share the same key
    value, the last one in the file wins.

    Args:
      path: location of the .jsonl file.
      key:  field of each record whose value becomes the dictionary key.

    Returns:
      dict mapping `record[key]` -> the full parsed record.
    """
    with open(path, "r", encoding="utf-8") as handle:
        # Parse lazily while the file is still open; blank lines are ignored.
        records = (json.loads(raw) for raw in handle if raw.strip())
        return {record[key]: record for record in records}



import pandas as pd
import numpy as np

def neighbor_match_rates(
    df_survey: pd.DataFrame,
    neighbors_info: dict,
    *,
    id_col: str = "caseid",
    question_cols=None,
    top_k: int | None = None,
):
    if id_col not in df_survey.columns:
        raise ValueError(f"`{id_col}` not found in survey data.")
    if question_cols is None:
        question_cols = [c for c in df_survey.columns if c != id_col]

    # --- clean survey data ---
    df = df_survey.copy()
    df = df.replace(-1, np.nan)
    df[id_col] = df[id_col].astype(str)

    # map caseid -> row index
    id_to_idx = pd.Series(df.index.values, index=df[id_col]).to_dict()

    results = []
    for _, row in df.iterrows():
        cid = row[id_col]
        neighbors = neighbors_info.get(cid, [])
        if isinstance(neighbors, dict) and "neighbors" in neighbors:
            neighbors = neighbors["neighbors"]

        neighbors = [str(n) for n in neighbors if str(n) in id_to_idx and str(n) != cid]
        if top_k:
            neighbors = neighbors[:top_k]

        d = {id_col: cid}
        rates = []

        if not neighbors:
            for q in question_cols:
                d[f"{q}__match_rate"] = np.nan
            d["overall_match_rate_mean"] = np.nan
            results.append(d)
            continue

        nb_df = df.loc[[id_to_idx[n] for n in neighbors], question_cols]
        for q in question_cols:
            my_ans = row[q]
            if pd.isna(my_ans):
                d[f"{q}__match_rate"] = np.nan
            else:
                valid_nb = nb_df[q].dropna()
                d[f"{q}__match_rate"] = (valid_nb == my_ans).mean() if not valid_nb.empty else np.nan
                if not pd.isna(d[f"{q}__match_rate"]):
                    rates.append(d[f"{q}__match_rate"])

        d["overall_match_rate_mean"] = np.nanmean(rates) if rates else np.nan
        results.append(d)

    return pd.DataFrame(results)



import os

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists before the first write (no-op when it already does).
os.makedirs('./analysis', exist_ok=True)

for year in ['20', '22', '24']:
    # Neighbor lists for this survey year, indexed by respondent id.
    jsonl_file = f"/home/ruomeng/gae/dataset/ces/raw/{year}/neighbors_top30_semantic_{year}.jsonl"
    neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')
    caseid_lst = list(neighbors_info.keys())
    print('len(caseid_lst)', len(caseid_lst))
    # print(neighbors_info['1249937987'])
    df_survey = pd.read_csv(f"/home/ruomeng/gae/dataset/ces/raw/{year}/question_{year}.csv")

    # Sweep the neighborhood size: report the average agreement rate and save
    # the per-respondent table for each top_k.
    for top_k in [1, 3, 5, 10, 15, 20, 25, 30]:
        sim_df = neighbor_match_rates(df_survey, neighbors_info, id_col="caseid", top_k=top_k)
        avg_rate = sim_df["overall_match_rate_mean"].mean(skipna=True)
        print('Year=%s - TopK=%s '%(year, top_k), "Average overall_match_rate_mean:", avg_rate)
        sim_df.to_csv('./analysis/analysis_neighbor_top%s_%s.csv'%(top_k, year))


len(caseid_lst) 6175
Year=20 - TopK=1  Average overall_match_rate_mean: 0.6462068548903752
Year=20 - TopK=3  Average overall_match_rate_mean: 0.6427965020088824
Year=20 - TopK=5  Average overall_match_rate_mean: 0.6430480309857971
Year=20 - TopK=10  Average overall_match_rate_mean: 0.6396983832200004
Year=20 - TopK=15  Average overall_match_rate_mean: 0.6391148873997874
Year=20 - TopK=20  Average overall_match_rate_mean: 0.6375436392419819
Year=20 - TopK=25  Average overall_match_rate_mean: 0.6365780819859783
Year=20 - TopK=30  Average overall_match_rate_mean: 0.636317245484725
len(caseid_lst) 6175
Year=22 - TopK=1  Average overall_match_rate_mean: 0.6334547078716286
Year=22 - TopK=3  Average overall_match_rate_mean: 0.6345493848018263
Year=22 - TopK=5  Average overall_match_rate_mean: 0.6370145489469016
Year=22 - TopK=10  Average overall_match_rate_mean: 0.6365390754861467
Year=22 - TopK=15  Average overall_match_rate_mean: 0.6359297905504931
Year=22 - TopK=20  Average overall_match_r

In [3]:
import numpy as np
import pandas as pd
from typing import Dict, List, Optional

def neighbor_mode_match(
    df_survey: pd.DataFrame,
    neighbors_info: Dict[str, List[str] | Dict],
    *,
    id_col: str = "caseid",
    question_cols: Optional[List[str]] = None,
    top_k: Optional[int] = None,
    tie_policy: str = "strict",  # "strict" | "lenient"
) -> pd.DataFrame:
    """
    For each respondent and question, compare the respondent's answer to the mode
    of their neighbors' answers.

    Returns a DataFrame with:
      - <q>__mode_match: True/False/NaN  (NaN when no neighbors or no unique mode under 'strict')
      - overall_mode_match_mean: mean of <q>__mode_match over questions (ignoring NaN)

    Args:
      df_survey:       rows = respondents, columns include id_col and question columns
      neighbors_info:  mapping id -> list of neighbor ids, or {"neighbors": [...]}
      id_col:          id column in df_survey
      question_cols:   which columns to evaluate; default = all non-id columns
      top_k:           if set, only use the first top_k neighbors
      tie_policy:      "strict": require unique mode; ties -> NaN
                       "lenient": ties -> match if own answer is among tied top answers
    """
    if id_col not in df_survey.columns:
        raise ValueError(f"`{id_col}` not found in survey data.")
    if question_cols is None:
        question_cols = [c for c in df_survey.columns if c != id_col]
    if tie_policy not in ("strict", "lenient"):
        raise ValueError("tie_policy must be 'strict' or 'lenient'")

    # --- clean survey data ---
    df = df_survey.copy()
    df = df.replace(-1, np.nan)   # treat -1 as missing if that's your convention
    df[id_col] = df[id_col].astype(str)

    # map caseid -> row index
    id_to_idx = pd.Series(df.index.values, index=df[id_col]).to_dict()

    out_rows = []
    for _, row in df.iterrows():
        cid = row[id_col]
        neighbors = neighbors_info.get(cid, [])
        if isinstance(neighbors, dict) and "neighbors" in neighbors:
            neighbors = neighbors["neighbors"]

        # normalize neighbor ids
        neighbors = [str(n) for n in neighbors if str(n) in id_to_idx and str(n) != cid]
        if top_k:
            neighbors = neighbors[:top_k]

        d = {id_col: cid}
        per_q_matches = []

        if not neighbors:
            # no neighbors → NaN for all questions
            for q in question_cols:
                d[f"{q}__mode_match"] = np.nan
            d["overall_mode_match_mean"] = np.nan
            out_rows.append(d)
            continue

        nb_df = df.loc[[id_to_idx[n] for n in neighbors], question_cols]

        for q in question_cols:
            my_ans = row[q]
            if pd.isna(my_ans):
                d[f"{q}__mode_match"] = np.nan
                continue

            # neighbor answers for this question
            valid_nb = nb_df[q].dropna()
            if valid_nb.empty:
                d[f"{q}__mode_match"] = np.nan
                continue

            vc = valid_nb.value_counts(dropna=False)
            top_count = vc.iloc[0]
            top_vals = vc[vc == top_count].index

            if tie_policy == "strict":
                # require a unique mode
                if len(top_vals) == 1:
                    mode_val = top_vals[0]
                    match = bool(my_ans == mode_val)
                else:
                    match = np.nan  # tie → unknown under strict
            else:  # lenient
                match = bool(my_ans in set(top_vals))

            d[f"{q}__mode_match"] = match
            if not pd.isna(match):
                per_q_matches.append(bool(match))

        d["overall_mode_match_mean"] = np.mean(per_q_matches) if per_q_matches else np.nan
        out_rows.append(d)

    return pd.DataFrame(out_rows)


import os

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists before the first write (no-op when it already does).
os.makedirs('./analysis', exist_ok=True)

for year in ['20', '22', '24']:
    # Neighbor lists for this survey year, indexed by respondent id.
    jsonl_file = f"/home/ruomeng/gae/dataset/ces/raw/{year}/neighbors_top30_semantic_{year}.jsonl"
    neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')
    caseid_lst = list(neighbors_info.keys())
    print('len(caseid_lst)', len(caseid_lst))
    # print(neighbors_info['1249937987'])
    df_survey = pd.read_csv(f"/home/ruomeng/gae/dataset/ces/raw/{year}/question_{year}.csv")

    # Report the average mode-agreement and save the per-respondent table.
    for top_k in [30]:
        sim_df = neighbor_mode_match(df_survey, neighbors_info, id_col="caseid", top_k=top_k)
        avg_rate = sim_df["overall_mode_match_mean"].mean(skipna=True)
        print('Year=%s - TopK=%s '%(year, top_k), "Average overall_mode_match_mean:", avg_rate)
        sim_df.to_csv('./analysis/analysis_neighbor_top%s_%s_mode.csv'%(top_k, year))


len(caseid_lst) 6175
Year=20 - TopK=30  Average overall_mode_match_mean: 0.7118619265065904
len(caseid_lst) 6175
Year=22 - TopK=30  Average overall_mode_match_mean: 0.7112445333675073
len(caseid_lst) 6175
Year=24 - TopK=30  Average overall_mode_match_mean: 0.7149492131970796


In [4]:

import numpy as np
import pandas as pd
from collections import defaultdict
from typing import Dict, List, Optional, Union

def neighbor_weighted_mode_match(
    df_survey: pd.DataFrame,
    neighbors_info: Dict[str, Union[List[str], Dict]],
    *,
    id_col: str = "caseid",
    question_cols: Optional[List[str]] = None,
    top_k: Optional[int] = None,
    tie_policy: str = "strict",       # "strict" | "lenient"
    # ---- weighting options ----
    weight_scheme: str = "exp",       # "exp" | "linear" | "harmonic"
    gamma: float = 0.85,              # for exp: w_i ∝ gamma^(i)
    alpha: float = 1.0,               # for harmonic: w_i ∝ 1/(i+1)^alpha
    normalize_weights: bool = True,
    col_suffix: str = "wmode",        # output col suffix: <q>__wmode_match
) -> pd.DataFrame:
    """
    For each respondent and question, check whether the respondent's own answer
    equals the rank-weighted mode of their neighbors' answers.

    The neighbor list is treated as ordered: earlier neighbors get larger
    weights (for gamma < 1 / alpha > 0). The weight of the neighbor at
    position i (0-based, counted among the neighbors that survive filtering) is
      - "exp":      gamma ** i
      - "linear":   n - i            (n = number of neighbors used)
      - "harmonic": 1 / (i + 1) ** alpha
    optionally normalised to sum to 1. Per answer, the weights of the
    neighbors giving that answer are summed; the answer(s) with the largest
    total form the weighted mode.

    Answers equal to -1 are treated as missing. All ids (survey ids, the keys
    of `neighbors_info`, and the neighbor ids themselves) are compared as
    strings, so int and str ids can be mixed freely.

    Output columns:
      - <id_col>: respondent id as str
      - <q>__{col_suffix}_match: True/False/NaN per question (NaN when the
        respondent's answer is missing, no neighbor answered, or the mode is
        tied under "strict")
      - overall_{col_suffix}_match_mean: mean match over questions, ignoring NaN

    Args:
      df_survey:         rows = respondents; must contain `id_col`. Not
                         modified; its index is ignored (rows are addressed by
                         position, so a non-unique index is fine).
      neighbors_info:    mapping id -> ordered list of neighbor ids, or
                         id -> {"neighbors": [...]}.
      id_col:            id column in `df_survey`.
      question_cols:     columns to evaluate; default = all non-id columns.
      top_k:             keep only the first top_k neighbors (after unknown ids
                         and the respondent itself are dropped); None or 0
                         means no limit.
      tie_policy:        "strict": a unique weighted mode is required, ties -> NaN
                         "lenient": match if own answer is among the tied modes
      weight_scheme:     "exp" | "linear" | "harmonic" (see above).
      gamma, alpha:      parameters of the "exp" / "harmonic" schemes.
      normalize_weights: divide the weights by their sum when it is positive.
      col_suffix:        infix used in the output column names.

    Raises:
      ValueError: if `id_col` is missing, or `tie_policy` / `weight_scheme` is
                  not one of the accepted values.
    """
    if id_col not in df_survey.columns:
        raise ValueError(f"`{id_col}` not found in survey data.")
    if question_cols is None:
        question_cols = [c for c in df_survey.columns if c != id_col]
    if tie_policy not in ("strict", "lenient"):
        raise ValueError("tie_policy must be 'strict' or 'lenient'")
    if weight_scheme not in ("exp", "linear", "harmonic"):
        raise ValueError("weight_scheme must be 'exp' | 'linear' | 'harmonic'")

    # --- clean survey data ---
    # `replace` returns a new frame, so the caller's data is left untouched.
    # Resetting the index makes row labels equal to row positions; otherwise a
    # non-unique index would make `.loc` below return extra rows and break the
    # one-to-one alignment between neighbor rows and weights.
    df = df_survey.replace(-1, np.nan).reset_index(drop=True)
    # Take the ids from the untouched input so the -1 -> NaN replacement above
    # can never alter an id.
    df[id_col] = df_survey[id_col].astype(str).to_numpy()

    # id -> row position (for duplicated ids the last row wins)
    id_to_idx = {cid: pos for pos, cid in enumerate(df[id_col])}

    # Normalise the mapping keys to str, like every other id. Keys that are
    # already strings take precedence over stringified non-string keys.
    info_by_id = {str(k): v for k, v in neighbors_info.items() if not isinstance(k, str)}
    info_by_id.update((k, v) for k, v in neighbors_info.items() if isinstance(k, str))

    def _rank_weights(n: int) -> np.ndarray:
        """Return the weights for neighbor ranks 0..n-1 under `weight_scheme`."""
        if n <= 0:
            return np.array([], dtype=float)
        if weight_scheme == "exp":
            w = np.array([gamma**i for i in range(n)], dtype=float)   # i: 0..n-1
        elif weight_scheme == "linear":
            w = np.arange(n, 0, -1, dtype=float)                      # n, n-1, ..., 1
        else:  # harmonic
            w = 1.0 / (np.arange(n, dtype=float) + 1.0)**alpha
        if normalize_weights and w.sum() > 0:
            w = w / w.sum()
        return w

    out_rows = []
    for _, row in df.iterrows():
        cid = row[id_col]
        neighbors = info_by_id.get(cid, [])
        if isinstance(neighbors, dict) and "neighbors" in neighbors:
            neighbors = neighbors["neighbors"]

        # Normalise ids while keeping the original order (assumed to be sorted
        # from most to least similar — TODO confirm against the neighbor file).
        neighbors = [str(n) for n in neighbors if str(n) in id_to_idx and str(n) != cid]
        if top_k:
            neighbors = neighbors[:top_k]

        d = {id_col: cid}
        per_q_matches = []

        if not neighbors:
            # no usable neighbors -> NaN for all questions
            for q in question_cols:
                d[f"{q}__{col_suffix}_match"] = np.nan
            d[f"overall_{col_suffix}_match_mean"] = np.nan
            out_rows.append(d)
            continue

        nb_idx = [id_to_idx[n] for n in neighbors]
        nb_df = df.loc[nb_idx, question_cols]         # row order follows `neighbors`
        weights = _rank_weights(len(nb_idx))          # aligned with that row order

        for q in question_cols:
            my_ans = row[q]
            if pd.isna(my_ans):
                d[f"{q}__{col_suffix}_match"] = np.nan
                continue

            # Accumulate the weighted vote per answer, neighbor by neighbor;
            # missing answers contribute nothing.
            counts = defaultdict(float)
            col_vals = nb_df[q].values  # same length and order as `weights`
            for i, a in enumerate(col_vals):
                if pd.notna(a):
                    counts[a] += weights[i]

            if not counts:
                d[f"{q}__{col_suffix}_match"] = np.nan
                continue

            # Pick the weighted mode; np.isclose keeps float-rounding ties tied.
            max_w = max(counts.values())
            top_vals = [ans for ans, w in counts.items() if np.isclose(w, max_w)]

            if tie_policy == "strict":
                if len(top_vals) == 1:
                    match = bool(my_ans == top_vals[0])
                else:
                    match = np.nan
            else:  # lenient
                match = bool(my_ans in set(top_vals))

            d[f"{q}__{col_suffix}_match"] = match
            if not pd.isna(match):
                per_q_matches.append(bool(match))

        d[f"overall_{col_suffix}_match_mean"] = np.mean(per_q_matches) if per_q_matches else np.nan
        out_rows.append(d)

    return pd.DataFrame(out_rows)



import os

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output folder exists before the first write (no-op when it already does).
os.makedirs('./analysis', exist_ok=True)

for year in ['20', '22', '24']:
    # Neighbor lists for this survey year, indexed by respondent id.
    jsonl_file = f"/home/ruomeng/gae/dataset/ces/raw/{year}/neighbors_top30_semantic_{year}.jsonl"
    neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')
    caseid_lst = list(neighbors_info.keys())
    print('len(caseid_lst)', len(caseid_lst))
    # print(neighbors_info['1249937987'])
    df_survey = pd.read_csv(f"/home/ruomeng/gae/dataset/ces/raw/{year}/question_{year}.csv")

    # Report the average weighted-mode agreement and save the per-respondent table.
    for top_k in [30]:
        sim_df = neighbor_weighted_mode_match(df_survey, neighbors_info, id_col="caseid", top_k=top_k)
        avg_rate = sim_df["overall_wmode_match_mean"].mean(skipna=True)
        print('Year=%s - TopK=%s '%(year, top_k), "Average overall_wmode_match_mean:", avg_rate)
        sim_df.to_csv('./analysis/analysis_neighbor_top%s_%s_wmode.csv'%(top_k, year))

len(caseid_lst) 6175
Year=20 - TopK=30  Average overall_wmode_match_mean: 0.6938491772532912
len(caseid_lst) 6175
Year=22 - TopK=30  Average overall_wmode_match_mean: 0.6903654228410767
len(caseid_lst) 6175
Year=24 - TopK=30  Average overall_wmode_match_mean: 0.6959471042340476
