In [1]:
import json
import io
import numpy as np
import pandas as pd
from collections import Counter
import random

def load_jsonl_as_dict_of_dict(path, key=None):
    """
    Read a JSON-Lines file and index its records by one of their fields.

    Args:
      path: location of the .jsonl file (read as UTF-8).
      key:  name of the field whose value becomes the dictionary key. It is
            required in practice; the `None` default is kept only so the
            signature stays unchanged.

    Returns:
      dict mapping `record[key]` -> full record. Blank lines are skipped. When
      two records share the same key value, the later one wins.

    Raises:
      ValueError: if `key` is None.
      KeyError:   if a record has no `key` field (message includes file and line).
    """
    # Fail fast: with key=None every record lookup would raise an opaque `KeyError: None`.
    if key is None:
        raise ValueError("load_jsonl_as_dict_of_dict requires `key`: the field to index records by")

    data = {}
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            obj = json.loads(line)
            try:
                record_id = obj[key]
            except KeyError:
                # Same exception type as before, but pointing at the offending line.
                raise KeyError(f"{path}:{lineno}: record has no {key!r} field") from None
            data[record_id] = obj
    return data


def extract_answers_and_majority(df: pd.DataFrame, id_list, col='ddd', seed: int | None = None):
    """
    Returns:
      answers_list: [[ans_q1, ans_q2, ...] per row] for rows with caseid in id_list (same order)
      majority_col: majority value of `col` among those rows (random tie-break; ignores NaN)
    """
    rng = random.Random(seed)

    # keep only answer columns (everything except 'caseid')
    q_cols = [c for c in df.columns if c != 'caseid']
    
    print(df)
    # subset rows by id_list, preserving order; may introduce NaN for missing ids
    sub = (df.set_index('caseid')
             .reindex(id_list))
    print('sub', sub)

    # 1) answers list for all questions
    answers_list = sub[q_cols].to_numpy().tolist()

    # 2) majority of the specified column among these rows (ignore NaN)
    s = sub[col].dropna()
    if s.empty:
        majority_col = None
    else:
        vc = s.value_counts()
        maxc = vc.max()
        winners = vc[vc == maxc].index.tolist()
        majority_col = rng.choice(winners)  # random tie-break (set seed for determinism)

    return answers_list, majority_col


def _majority_with_random_tie(values, rng: random.Random):
    if len(values) == 0:
        return None
    c = Counter(values)
    m = max(c.values())
    winners = [k for k, v in c.items() if v == m]
    return rng.choice(winners)

def _impute_with_neighbors(df: pd.DataFrame,
                           observed_ids: set,
                           community_map: dict[int, list[int]],
                           rng: random.Random) -> pd.DataFrame:
    """
    Impute every unobserved respondent's answers from their observed neighbors.

    For each unobserved caseid `v` and each answer column, the imputed value is
    the majority (random tie-break via `rng`) among the non-NaN answers of
    `community_map[v]` restricted to ids that are both observed and present in
    `df`. If that pool is empty, the majority among all observed respondents is
    used instead; if that is empty too, the original cell is left untouched.

    Args:
      df:            frame with a 'caseid' column; all other columns are answers.
      observed_ids:  caseids whose answers are treated as known.
      community_map: caseid -> list of neighbor caseids. Keys and neighbor ids must
                     be of the same type as the values in df['caseid'] to match.
      rng:           RNG used for every tie-break.

    Returns:
      A new frame with 'caseid' as the first column and rows in the same order as
      `df`. Rows of observed respondents are copied unchanged.
    """
    assert "caseid" in df.columns
    q_cols = [c for c in df.columns if c != "caseid"]
    df_idx = df.set_index("caseid")
    all_ids = df_idx.index.tolist()

    observed_ids = set(observed_ids)
    unobs_ids = [cid for cid in all_ids if cid not in observed_ids]
    obs_ids_list = [cid for cid in all_ids if cid in observed_ids]

    imputed = df_idx.copy()

    # Precompute global observed values per question (fallback pool).
    global_obs = {q: df_idx.loc[obs_ids_list, q].dropna().tolist() for q in q_cols}

    # The usable neighbor pool depends only on the respondent, not on the
    # question, so build it once per respondent instead of once per (q, v) pair.
    neigh_obs_by_id = {
        v: [u for u in community_map.get(v, [])
            if (u in observed_ids) and (u in df_idx.index)]
        for v in unobs_ids
    }

    # Iteration order (question outer, respondent inner) is kept so the sequence
    # of draws from `rng` is unchanged.
    for q in q_cols:
        for v in unobs_ids:
            neigh_obs = neigh_obs_by_id[v]
            vals = df_idx.loc[neigh_obs, q].dropna().tolist() if neigh_obs else []
            if not vals:
                vals = global_obs[q]
            if not vals:
                # No information anywhere for this question: leave the cell as is.
                continue
            imputed.at[v, q] = _majority_with_random_tie(vals, rng)

    return imputed.reset_index()

def _evaluate_accuracy(df_true: pd.DataFrame, df_pred: pd.DataFrame):
    assert list(df_true.columns) == list(df_pred.columns)
    q_cols = [c for c in df_true.columns if c != "caseid"]
    t = df_true.set_index("caseid").sort_index()
    p = df_pred.set_index("caseid").sort_index()
    eq = (t[q_cols] == p[q_cols])
    return float(eq.values.mean()), eq.mean(axis=0).to_dict()

def _run_experiment(df: pd.DataFrame,
                    community_map: dict[int, list[int]],
                    sample_frac: float = 0.2,
                    n_trials: int = 100,
                    seed: int | None = None) -> pd.DataFrame:
    """
    Repeatedly hide most respondents, impute them from neighbors, and score the result.

    Each trial samples `k = max(1, round(sample_frac * n))` caseids as observed,
    imputes everyone else with `_impute_with_neighbors`, and evaluates with
    `_evaluate_accuracy`.

    Args:
      df:            frame with a 'caseid' column plus answer columns.
      community_map: caseid -> list of neighbor caseids.
      sample_frac:   fraction of respondents treated as observed.
      n_trials:      number of independent trials.
      seed:          base seed; trial `t` uses `(seed or 0) + t * 100003`, so
                     seed=None behaves like seed=0 and runs are reproducible.

    Returns:
      One row per trial with columns:
        trial, observed_count, sample_frac,
        overall_acc  - accuracy over ALL respondents. Observed rows are copied
                       from the truth, so this rises with `sample_frac` even if
                       imputation quality does not.
        heldout_acc  - accuracy over unobserved respondents only (NaN when every
                       respondent was observed).
        acc_<q>      - per-question accuracy over all respondents.
    """
    caseids = df["caseid"].tolist()
    n = len(caseids)
    k = max(1, int(round(sample_frac * n)))
    rows = []

    for trial in range(n_trials):
        # Independent, reproducible stream per trial; also drives imputation tie-breaks.
        rng_trial = random.Random((seed or 0) + trial * 100003)
        observed_ids = set(rng_trial.sample(caseids, k))
        imputed = _impute_with_neighbors(df, observed_ids, community_map, rng_trial)
        overall_acc, per_q = _evaluate_accuracy(df, imputed)

        # Score the imputed rows alone, so the number reflects imputation
        # quality rather than the share of rows that were simply copied.
        observed_list = list(observed_ids)
        true_hidden = df[~df["caseid"].isin(observed_list)]
        if len(true_hidden) > 0:
            pred_hidden = imputed[~imputed["caseid"].isin(observed_list)]
            heldout_acc, _ = _evaluate_accuracy(true_hidden, pred_hidden)
        else:
            heldout_acc = float("nan")

        rows.append({
            "trial": trial,
            "observed_count": k,
            "sample_frac": sample_frac,
            "overall_acc": overall_acc,
            "heldout_acc": heldout_acc,
            **{f"acc_{q}": v for q, v in per_q.items()}
        })

    return pd.DataFrame(rows)

def sweep_sample_fracs(df: pd.DataFrame,
                       community_map: dict[int, list[int]],
                       fracs=(0.1, 0.5, 0.75),
                       n_trials: int = 3,
                       seed: int | None = 42) -> pd.DataFrame:
    """
    Run the imputation experiment at several observed fractions.

    For each value in `fracs`, `_run_experiment` is executed with the given
    `n_trials` and `seed`, and its `overall_acc` column is condensed into one
    summary row.

    Returns:
      Frame with one row per fraction and columns sample_frac,
      mean_overall_acc, std_overall_acc (sample std, 0.0 when there is a single
      trial) and observed_count.
    """
    def _summarise(frac):
        trials = _run_experiment(df, community_map, sample_frac=frac,
                                 n_trials=n_trials, seed=seed)
        acc = trials["overall_acc"]
        # A sample standard deviation needs at least two trials.
        spread = float(acc.std(ddof=1)) if len(trials) > 1 else 0.0
        return {
            "sample_frac": frac,
            "mean_overall_acc": float(acc.mean()),
            "std_overall_acc": spread,
            "observed_count": int(max(1, round(frac * len(df)))),
        }

    return pd.DataFrame([_summarise(frac) for frac in fracs])



In [2]:

# Baseline: impute from each respondent's full neighbor list.
# Every input path is derived from `year`, so changing it switches all three files together.
year = '24'
raw_dir = f"/home/ruomeng/gae/dataset/ces_pro/raw/{year}"
jsonl_file = f"{raw_dir}/neighbors_{year}.jsonl"
neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')

# caseid -> complete neighbor list (no truncation).
community_map = {caseid: info['neighbors'] for caseid, info in neighbors_info.items()}

df = pd.read_csv(f"{raw_dir}/questions_test_{year}.csv")
# presumably cast to str so ids match the keys/neighbor ids in the JSONL — verify against the file
df["caseid"] = df["caseid"].astype(str)
qs = pd.read_csv(f"{raw_dir}/train_qs.csv")['train_qs'].tolist()
# NOTE(review): df_cand is built but the sweep below runs on the full `df` — confirm which is intended.
df_cand = df[['caseid'] + qs]

summary = sweep_sample_fracs(df, community_map, fracs=(0.1, 0.5, 0.75), n_trials=3, seed=42)
summary


Unnamed: 0,sample_frac,mean_overall_acc,std_overall_acc,observed_count
0,0.1,0.848753,0.011403,96
1,0.5,0.932744,0.002337,481
2,0.75,0.966031,0.00063,722


In [3]:
# Variant: impute from only the first 20 neighbors of each respondent.
# Every input path is derived from `year`, so changing it switches all three files together.
year = '24'
raw_dir = f"/home/ruomeng/gae/dataset/ces_pro/raw/{year}"
jsonl_file = f"{raw_dir}/neighbors_{year}.jsonl"
neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')

# caseid -> first 20 entries of the neighbor list.
community_map_top20 = {caseid: info['neighbors'][:20] for caseid, info in neighbors_info.items()}
# Backward-compatible alias: this map was previously named `community_map_top10`
# although it has always kept 20 neighbors.
community_map_top10 = community_map_top20

df = pd.read_csv(f"{raw_dir}/questions_test_{year}.csv")
# presumably cast to str so ids match the keys/neighbor ids in the JSONL — verify against the file
df["caseid"] = df["caseid"].astype(str)
qs = pd.read_csv(f"{raw_dir}/train_qs.csv")['train_qs'].tolist()
# NOTE(review): df_cand is built but the sweep below runs on the full `df` — confirm which is intended.
df_cand = df[['caseid'] + qs]

summary = sweep_sample_fracs(df, community_map_top20, fracs=(0.1, 0.5, 0.75), n_trials=3, seed=42)
summary

Unnamed: 0,sample_frac,mean_overall_acc,std_overall_acc,observed_count
0,0.1,0.822569,0.015376,96
1,0.5,0.931324,0.001908,481
2,0.75,0.966413,0.000447,722


In [4]:

# Variant: impute from only the first 5 neighbors of each respondent.
# Every input path is derived from `year`, so changing it switches all three files together.
year = '24'
raw_dir = f"/home/ruomeng/gae/dataset/ces_pro/raw/{year}"
jsonl_file = f"{raw_dir}/neighbors_{year}.jsonl"
neighbors_info = load_jsonl_as_dict_of_dict(jsonl_file, key='caseid')

# caseid -> first 5 entries of the neighbor list.
community_map_top5 = {caseid: info['neighbors'][:5] for caseid, info in neighbors_info.items()}

df = pd.read_csv(f"{raw_dir}/questions_test_{year}.csv")
# presumably cast to str so ids match the keys/neighbor ids in the JSONL — verify against the file
df["caseid"] = df["caseid"].astype(str)
qs = pd.read_csv(f"{raw_dir}/train_qs.csv")['train_qs'].tolist()
# NOTE(review): df_cand is built but the sweep below runs on the full `df` — confirm which is intended.
df_cand = df[['caseid'] + qs]


summary = sweep_sample_fracs(df, community_map_top5, fracs=(0.1, 0.5, 0.75), n_trials=3, seed=42)
summary

Unnamed: 0,sample_frac,mean_overall_acc,std_overall_acc,observed_count
0,0.1,0.759656,0.014638,96
1,0.5,0.915639,0.007888,481
2,0.75,0.96207,0.00116,722
