In [2]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
SIMILARITY_THRESHOLD = 0.85
MODEL_NAME = "all-MiniLM-L6-v2"

FILES = {
    "02": "../data_dump/variables/2002/ZA3880_variables_short.json",
    "12": "../data_dump/variables/2012/ZA5900_variables_short.json",
    "22": "../data_dump/variables/2022/ZA10000_variables_short.json",
}

def load_unique_questions(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    q_to_var = {}
    for var, q in data.items():
        q = q.strip()
        if q not in q_to_var:
            q_to_var[q] = var
    return q_to_var

records = []
for year, path in FILES.items():
    qmap = load_unique_questions(path)
    for q, var in qmap.items():
        records.append({"year": year, "question": q, "var": var})

meta_df = pd.DataFrame(records)
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(meta_df["question"].tolist(), normalize_embeddings=True)

cos_sim = cosine_similarity(embeddings)
distance = 1 - cos_sim

years = meta_df["year"].values

for i in range(len(distance)):
    for j in range(len(distance)):
        if i != j and years[i] == years[j]:
            if meta_df.iloc[i]["question"] != meta_df.iloc[j]["question"]:
                distance[i, j] = 1.0  

clustering = AgglomerativeClustering(
    metric="precomputed",
    linkage="average",
    distance_threshold=1 - SIMILARITY_THRESHOLD,
    n_clusters=None,
)

labels = clustering.fit_predict(distance)
meta_df["group"] = labels

rows = []

for gid, group in meta_df.groupby("group"):
    row = {
        "question": group.iloc[0]["question"],
        "question_02": "",
        "question_12": "",
        "question_22": "",
        "var_02": "",
        "var_12": "",
        "var_22": "",
        "similarity_score": cosine_similarity(model.encode(group["question"].tolist(), normalize_embeddings=True)).mean()}

    for _, r in group.iterrows():
        row[f"question_{r['year']}"] = r["question"]
        row[f"var_{r['year']}"] = r["var"]

    rows.append(row)

final_df = pd.DataFrame(rows)
final_df.to_csv("common_question_mapping.csv", index=False)


In [None]:
mapping_var = pd.read_csv("../data_dump/common_question_mapping.csv")
mapping_var

Unnamed: 0,hh,question_02,question_12,question_22,var_02,var_12,var_22,score,COMMON_VAR
0,Place of living: urban - rural,Type of community: urban-rural self-ass (URBRU...,Place of living: urban - rural,Place of living: urban - rural,V358,URBRURAL,URBRURAL,1.0,urban_rural
1,Top-Bottom self-placement,R:Top Bottom self-placement 10 pt scale (TOPBOT),Top-Bottom self-placement,Top-Bottom self-placement,V291,TOPBOT,TOPBOT,1.0,TOPBOT
2,"Spouse, partner: main status",S-P: Current employment status (SPWRKST),"Spouse, partner: main status","Spouse, partner: main status",V246,SPMAINST,SPMAINST,1.0,spouse_work_status
3,"Spouse, partner: hours worked weekly",Q27 Spouse: hours worked weekly,"Spouse, partner: hours worked weekly","Spouse, partner: hours worked weekly",V71,SPWRKHRS,SPWRKHRS,1.0,SPWRKHRS
4,SEX,R: Sex (SEX),Sex of Respondent,Sex of Respondent,V200,SEX,SEX,1.0,sex
5,Sample Prefix ISO 3166 Code - alphanumeric,Sample Prefix ISO 3166 Code - alphanumeric,Country Prefix ISO 3166 Code - alphanumeric,Country/ Sample Prefix ISO 3166 code - alphanu...,C_ALPHAN,C_ALPHAN,c_alphan,0.938,C_ALPHAN
6,Q4e Couple livg together without marriage,Q4e Couple livg together without marriage,Q4c Marriage: Couple living together without m...,Q4d Couple living together without marriage ok,V22,V16,v14,0.944,LIVWOMAR
7,Q3c Shld women work:youngest kid at school,Q3c Shld women work:youngest kid at school,Q3b Should women work: Youngest kid at school,Q3b Should women work: youngest kid at school,V16,V13,v10,0.944,WWYKS
8,Q3b Shld women work:child under school age,Q3b Shld women work:child under school age,Q3a Should women work: Child under school age,Q3a Should women work: child under school age,V15,V12,v9,0.948,WWYKUS
9,"Q32 Spouse, partner degree: Highest completed ...",Q26 Spouse degree: highest qualification,"Q32 Spouse, partner degree: Highest completed ...",ISCED 2011 simplified: highest completed degre...,V70,V65,SPEDULEV,1.0,SP_DEGREE


In [None]:
var_02_map = {}
var_12_map = {}
var_22_map = {}

for row in mapping_var.itertuples():
    common_var = row.COMMON_VAR
    var_02_map[row.var_02] = common_var
    var_12_map[row.var_12] = common_var
    var_22_map[row.var_22] = common_var

In [None]:
with open("../data_dump/var_02_map.json", "w", encoding="utf-8") as f:
    json.dump(var_02_map, f, indent=4)

with open("../data_dump/var_12_map.json", "w", encoding="utf-8") as f:
    json.dump(var_12_map, f, indent=4)  

with open("../data_dump/var_22_map.json", "w", encoding="utf-8") as f:
    json.dump(var_22_map, f, indent=4)