In [None]:
import pandas as pd

In [None]:
# ---------- 1. Load all country files and build master file ----------

# Country label -> path of that country's processed CSV.
country_files = {
    "Pakistan": "sample_data/RAI_processed_output_pakistan.csv",
    "Bangladesh": "sample_data/RAI_processed_output_bangladesh.csv",
    "Sri Lanka": "sample_data/RAI_processed_output_srilanka.csv",
    "Maldives": "sample_data/RAI_processed_output_maldives.csv",
    "Bhutan": "sample_data/RAI_processed_output_bhutan.csv",
}

# Read every file and tag each row with the country label it was loaded under.
dfs = [
    pd.read_csv(csv_path).assign(country=label)
    for label, csv_path in country_files.items()
]

# Stack the per-country frames into one table with a fresh 0..n-1 index.
master = pd.concat(dfs, ignore_index=True)

# Columns we want in the master file, grouped by research question
# (adjust these groups if more columns are needed).
text_and_meta = ["Title", "Abstract", "title_clean", "text", "country"]
rq1_wanted = [
    "rq1_responsible_ai",
    "rq1_ethical_ai",
    "rq1_trustworthy_ai",
    "rq1_xai",
    "rq1_governance",
    "rq1_policy",
    "rq1_transparency",
    "rq1_accountability",
    "rq1_fairness",
    "rq1_privacy",
    "rq1_safety",
    "rq1_oversight",
    "rq1_bias",
]
rq2_wanted = ["rq2_public", "rq2_institutional", "rq2_policy_level"]
rq3_wanted = [
    "rq3_capacity",
    "rq3_regulation",
    "rq3_education",
    "rq3_awareness_gap",
    "rq3_data_quality",
]
rq4_wanted = ["rq4_framework_support_score"]

# Wanted columns that are missing from the combined data are skipped
# rather than raising; the order of the wanted list is preserved.
available = set(master.columns)
cols_keep = [
    name
    for name in text_and_meta + rq1_wanted + rq2_wanted + rq3_wanted + rq4_wanted
    if name in available
]

master = master[cols_keep]

# Persist the combined dataset (without the row index).
master.to_csv("RAI_master_all_countries.csv", index=False)

# ---------- 2. Per-country RQ1 / RQ2 / RQ3 keyword counts ----------

# For each research question, collect the master columns carrying its prefix
# (kept as plain lists, in master's column order).
rq1_cols, rq2_cols, rq3_cols = (
    [name for name in master.columns if name.startswith(prefix)]
    for prefix in ("rq1_", "rq2_", "rq3_")
)

# One grouping by country, reused for all three summaries: each summary is the
# per-country column sum, with "country" turned back into a regular column.
per_country = master.groupby("country")
rq1_summary, rq2_summary, rq3_summary = (
    per_country[cols].sum().reset_index()
    for cols in (rq1_cols, rq2_cols, rq3_cols)
)

# Write each summary to its own CSV (without the row index).
for summary, out_path in (
    (rq1_summary, "RAI_RQ1_summary_by_country.csv"),
    (rq2_summary, "RAI_RQ2_summary_by_country.csv"),
    (rq3_summary, "RAI_RQ3_summary_by_country.csv"),
):
    summary.to_csv(out_path, index=False)

# ---------- 3. RQ4: average framework support score per country ----------

score_col = "rq4_framework_support_score"

# The score column is optional in the master table; skip this summary
# entirely when it is absent.
if score_col in master.columns:
    # Mean score per country; the resulting Series is renamed before
    # reset_index so the value column comes out under the summary name.
    rq4_summary = (
        master.groupby("country")[score_col]
        .mean()
        .rename("rq4_avg_framework_support")
        .reset_index()
    )
    # NOTE(review): this file is written into sample_data/, whereas the master
    # file and the RQ1-RQ3 summaries are written to the working directory —
    # confirm this location is intended.
    rq4_summary.to_csv("sample_data/RAI_RQ4_summary_by_country.csv", index=False)