In [7]:
import pandas as pd

# load data
df20 = pd.read_csv("/home/ruomeng/gae/dataset/ces/raw/20/question_20.csv")
df22 = pd.read_csv("/home/ruomeng/gae/dataset/ces/raw/22/question_22.csv")
df24 = pd.read_csv("/home/ruomeng/gae/dataset/ces/raw/24/question_24.csv")

# ensure consistent id type
for df in (df20, df22, df24):
    df["caseid"] = df["caseid"].astype(str)

# find common respondents & common questions
common_ids = set(df20["caseid"]) & set(df22["caseid"]) & set(df24["caseid"])
question_cols = [c for c in df20.columns if c not in ["caseid", "year"]]


# align
df20 = df20[df20["caseid"].isin(common_ids)].set_index("caseid")[question_cols]
df22 = df22[df22["caseid"].isin(common_ids)].set_index("caseid")[question_cols]
df24 = df24[df24["caseid"].isin(common_ids)].set_index("caseid")[question_cols]


import pandas as pd

# --- assume df20, df22, df24 are already aligned and -1 replaced with NaN ---

# binary change indicators (NaN-safe)
chg_20_22 = (df20 != df22).astype(int)
chg_22_24 = (df22 != df24).astype(int)
chg_20_24 = (df20 != df24).astype(int)

# -------------------------------
# 1. Per-person: % of answers changed
# -------------------------------
per_person_pct = pd.DataFrame({
    "pct_change_20_22": chg_20_22.mean(axis=1) * 100,
    "pct_change_22_24": chg_22_24.mean(axis=1) * 100,
    "pct_change_20_24": chg_20_24.mean(axis=1) * 100,
})

print("=== Per-Person Average % of Changed Answers ===")
print(per_person_pct.mean().round(2).to_string(), "\n")

# -------------------------------
# 2. Per-question: % of people who changed
# -------------------------------
per_question_pct = pd.DataFrame({
    "pct_change_20_22": chg_20_22.mean(axis=0) * 100,
    "pct_change_22_24": chg_22_24.mean(axis=0) * 100,
    "pct_change_20_24": chg_20_24.mean(axis=0) * 100,
})

print("=== Per-Question % of People Who Changed ===")
print(per_question_pct.round(2).head(20))  # show first 10 questions as example

# -------------------------------
# 3. Summary stats
# -------------------------------
summary = {
    "Avg % change 20→22": per_person_pct["pct_change_20_22"].mean().round(2),
    "Avg % change 22→24": per_person_pct["pct_change_22_24"].mean().round(2),
    "Avg % change 20→24": per_person_pct["pct_change_20_24"].mean().round(2),
    "People with ≥1 change 20→22 (%)": (chg_20_22.sum(axis=1) > 0).mean() * 100,
    "People with ≥1 change 22→24 (%)": (chg_22_24.sum(axis=1) > 0).mean() * 100,
    "People with ≥1 change 20→24 (%)": (chg_20_24.sum(axis=1) > 0).mean() * 100,
}

print("\n=== Summary ===")
for k, v in summary.items():
    print(f"{k:<35} {v:.2f}%")


=== Per-Person Average % of Changed Answers ===
pct_change_20_22    23.25
pct_change_22_24    17.51
pct_change_20_24    25.37 

=== Per-Question % of People Who Changed ===
        pct_change_20_22  pct_change_22_24  pct_change_20_24
pid3               11.01              8.65             13.28
300_1              17.41             16.92             20.11
300_2              19.71             18.74             21.77
300_3              21.34             19.98             22.96
300_4              21.18             18.95             21.28
302                77.88             56.53             84.66
303                54.20             44.65             56.81
305_1               3.40              3.24              3.08
305_2               7.77              4.97              8.53
305_3               1.07              1.04              1.13
305_4               6.28              5.52              6.56
305_5               1.43              1.21              1.28
305_6               1.73          