In [None]:
import pathlib
import sys
import pandas as pd

from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
from scipy.stats import ttest_rel
import numpy as np
import os

In [53]:
# --- Configuration ---
# Input/output locations. T1 is the aggregated reference file; each entry in
# MODEL_FILES is the per-model file that main() passes as the T2 input.
DATA_DIR = pathlib.Path("./data")
OUTPUT_ROOT = pathlib.Path("./output/efa_comparison_all_models")
T1_AGGREGATED_PATH = DATA_DIR / "new_median_wvs_wave7_aggregated_by_demographics.csv"
VARIABLE_INFO_PATH = DATA_DIR / "variable_info.csv"
MODEL_FILES = {
    model_id: DATA_DIR / f"{model_id}_output_inferred.csv"
    for model_id in ("gemma3-12b", "llama2-13b", "phi4-14b")
}

# Settings forwarded to FactorAnalyzer by main() / run_factor_analysis().
N_FACTORS = 5
ROTATION = 'promax'
FACTOR_METHOD = 'principal'
USE_SMC = True

# Name of the country column kept alongside the survey variables in T1.
COUNTRY_COLUMN = 'B_COUNTRY_ALPHA'
# NOTE(review): not referenced by any code visible in this notebook section.
RANDOM_STATE = 42

In [54]:
def ensure_output_dir(root: pathlib.Path, *subdirs: str) -> pathlib.Path:
    """Create ``root/<subdirs...>`` if it does not exist and return it.

    Args:
        root: Base directory.
        *subdirs: Path components appended to ``root`` in the given order.

    Returns:
        The resulting directory path. Missing parents are created and an
        already existing directory is not treated as an error.
    """
    target = root
    for component in subdirs:
        target = target / component
    target.mkdir(parents=True, exist_ok=True)
    return target

def log_message(msg: str, level: str = "INFO") -> None:
    """Print ``msg`` prefixed with ``[level]`` and flush stdout immediately."""
    line = "[{}] {}".format(level, msg)
    print(line, flush=True)

def load_and_prepare_data(t1_agg_path, t2_agg_path, var_info_path):
    """Load the T1 and T2 CSV files and restrict them to shared survey variables.

    The variable-info CSV must provide ``Type`` and ``Variable_Code`` columns.
    Variables typed ``binary`` or ``ordinal`` (case-insensitive) that appear as
    columns in both data files are kept: binary codes first, then ordinal
    codes, each group sorted alphabetically.

    Args:
        t1_agg_path: Path of the T1 CSV file.
        t2_agg_path: Path of the T2 CSV file.
        var_info_path: Path of the variable-info CSV file.

    Returns:
        Tuple ``(data_T1, data_T2, common_survey_vars)``. ``data_T1`` also
        carries ``COUNTRY_COLUMN`` when the T1 file has it; ``data_T2`` holds
        only the survey variables.
    """
    t1_raw = pd.read_csv(t1_agg_path)
    t2_raw = pd.read_csv(t2_agg_path)
    var_info = pd.read_csv(var_info_path)

    shared_columns = set(t1_raw.columns) & set(t2_raw.columns)
    declared_type = var_info['Type'].str.lower()

    def _shared_codes(kind):
        # Codes of the requested type that exist in both data files.
        codes = var_info.loc[declared_type == kind, 'Variable_Code'].tolist()
        return sorted(set(codes) & shared_columns)

    common_survey_vars = _shared_codes('binary') + _shared_codes('ordinal')

    t1_columns = list(common_survey_vars)
    if COUNTRY_COLUMN in t1_raw.columns:
        t1_columns.append(COUNTRY_COLUMN)
    data_T1 = t1_raw[t1_columns].copy()
    data_T2 = t2_raw[common_survey_vars].copy()

    # NOTE(review): unparseable entries are coerced to NaN and the integer cast
    # below is then expected to fail; the cast also drops any fractional part.
    # Confirm the inputs are clean whole numbers.
    for frame in (data_T1, data_T2):
        for column in common_survey_vars:
            frame[column] = pd.to_numeric(frame[column], errors='coerce')
        frame[common_survey_vars] = frame[common_survey_vars].astype(int)
    return data_T1, data_T2, common_survey_vars

def run_factor_analysis(data, n_factors, rotation, method, use_smc, out_dir, label):
    """Fit a FactorAnalyzer on ``data`` and save its loadings and scores.

    Args:
        data: DataFrame whose columns are the variables to factor.
        n_factors: Number of factors to extract.
        rotation: Rotation name forwarded to FactorAnalyzer.
        method: Fitting method forwarded to FactorAnalyzer.
        use_smc: Forwarded to FactorAnalyzer's ``use_smc`` argument.
        out_dir: Directory path (supports ``/``) receiving the CSV files.
        label: Suffix used in ``loadings_<label>.csv`` / ``scores_<label>.csv``.

    Returns:
        Tuple ``(fitted analyzer, loadings DataFrame, scores DataFrame)``.
    """
    analyzer = FactorAnalyzer(
        n_factors=n_factors,
        rotation=rotation,
        method=method,
        use_smc=use_smc,
        rotation_kwargs={'max_iter': 1000},
    )
    analyzer.fit(data)

    factor_columns = [f'Factor_{number}' for number in range(1, n_factors + 1)]

    # Loadings: one row per input variable, one column per factor.
    loadings = pd.DataFrame(analyzer.loadings_, index=data.columns, columns=factor_columns)
    loadings.to_csv(out_dir / f'loadings_{label}.csv')

    # Scores: one row per input row, one column per factor.
    scores = pd.DataFrame(analyzer.transform(data), index=data.index, columns=factor_columns)
    scores.to_csv(out_dir / f'scores_{label}.csv')

    return analyzer, loadings, scores

def project_and_compare_scores(fa_T1, data_T1_scores, data_T2, out_dir, model_name):
    """Project T2 data through the T1 factor model and compare the scores.

    Three CSV files are written to ``out_dir``: the projected T2 scores, a
    per-factor paired t-test (T1 vs. projected T2), and per-factor Cohen's d
    computed on the paired differences (projected T2 minus T1).

    Args:
        fa_T1: Fitted object exposing ``transform``.
        data_T1_scores: DataFrame of T1 factor scores (columns = factors).
        data_T2: DataFrame handed to ``fa_T1.transform``.
        out_dir: Directory path (supports ``/``) receiving the CSV files.
        model_name: Identifier embedded in the output file names.

    Returns:
        DataFrame of projected T2 minus T1 scores for rows whose index labels
        appear in both inputs.
    """
    projected = pd.DataFrame(
        fa_T1.transform(data_T2),
        index=data_T2.index,
        columns=data_T1_scores.columns,
    )
    projected.to_csv(out_dir / f'scores_{model_name}_projected_on_T1.csv')

    # Only rows present in both score tables can be paired.
    shared_rows = data_T1_scores.index.intersection(projected.index)
    t1_aligned = data_T1_scores.loc[shared_rows]
    t2_aligned = projected.loc[shared_rows]

    # Paired t-tests, one per factor.
    ttest_rows = []
    for factor in t1_aligned.columns:
        statistic, pvalue = ttest_rel(t1_aligned[factor], t2_aligned[factor])
        ttest_rows.append({'Factor': factor, 'T_Statistic': statistic, 'P_Value': pvalue})
    pd.DataFrame(ttest_rows).set_index('Factor').to_csv(
        out_dir / f'ttest_T1_vs_{model_name}_projected.csv'
    )

    # Cohen's d on the paired differences; a zero spread yields d = 0.
    diff_scores = t2_aligned - t1_aligned
    effect_rows = []
    for factor in diff_scores.columns:
        deltas = diff_scores[factor]
        mean_diff = deltas.mean()
        std_diff = deltas.std(ddof=1)
        if std_diff != 0:
            cohen_d = mean_diff / std_diff
        else:
            cohen_d = 0
        effect_rows.append({
            'Factor': factor,
            "Mean_Difference": mean_diff,
            "Std_Dev_Difference": std_diff,
            "Cohen_d": cohen_d,
        })
    pd.DataFrame(effect_rows).set_index('Factor').to_csv(
        out_dir / f'cohens_d_T1_vs_{model_name}_projected.csv'
    )
    return diff_scores

def main():
    """Fit the T1 factor model once, then project every model's data onto it.

    Steps:
      1. Create the output directories under ``OUTPUT_ROOT``.
      2. Load T1 and fit the factor analysis on its survey variables.
      3. For each entry in ``MODEL_FILES``: load the model's data, project it
         through the fitted T1 model, and save the comparison statistics and
         the per-row score differences.

    Raises:
        ValueError: If ``MODEL_FILES`` is empty, or if a model file lacks one
            or more survey variables the T1 factor model was fitted on.
    """
    if not MODEL_FILES:
        raise ValueError("MODEL_FILES is empty; at least one model file is required.")

    # Prepare output directories
    comp_out_dir = ensure_output_dir(OUTPUT_ROOT)
    t1_out_dir = ensure_output_dir(OUTPUT_ROOT, "T1_EFA_results")

    # load_and_prepare_data needs a T2 file to compute the shared variable
    # list, so the first configured model file defines the fitted variables.
    first_model_path = next(iter(MODEL_FILES.values()))
    data_T1, _, common_survey_vars = load_and_prepare_data(
        T1_AGGREGATED_PATH, first_model_path, VARIABLE_INFO_PATH
    )
    df_T1_survey = data_T1[common_survey_vars]

    # Run EFA on T1
    fa_T1, _, scores_T1 = run_factor_analysis(
        data=df_T1_survey,
        n_factors=N_FACTORS,
        rotation=ROTATION,
        method=FACTOR_METHOD,
        use_smc=USE_SMC,
        out_dir=t1_out_dir,
        label="T1"
    )

    # For each model, run the projection and save results
    for model_name, t2_path in MODEL_FILES.items():
        log_message(f"Processing model: {model_name}")
        _, data_T2, model_vars = load_and_prepare_data(
            T1_AGGREGATED_PATH, t2_path, VARIABLE_INFO_PATH
        )

        # The fitted model expects exactly the variables (and column order) it
        # was trained on; each model file yields its own shared-variable list,
        # so validate and realign before projecting.
        available = set(model_vars)
        missing = [var for var in common_survey_vars if var not in available]
        if missing:
            raise ValueError(
                f"Model '{model_name}' is missing {len(missing)} survey variable(s) "
                f"used to fit the T1 factor model: {missing}"
            )
        data_T2 = data_T2[common_survey_vars]

        diff_scores = project_and_compare_scores(
            fa_T1=fa_T1,
            data_T1_scores=scores_T1,
            data_T2=data_T2,
            out_dir=comp_out_dir,
            model_name=model_name
        )
        diff_scores.to_csv(comp_out_dir / f'diff_scores_T1_vs_{model_name}_projected.csv')


In [55]:
# Entry point: run the whole pipeline when this cell is executed
# (the guard also holds when the code is run as a script).
if __name__ == "__main__":
    main()

[INFO] Processing model: gemma3-12b
[INFO] Processing model: llama2-13b
[INFO] Processing model: phi4-14b


In [56]:
import pandas as pd
import os

# --- Configuration ---
output_dir = "output/efa_comparison_all_models"
models = ["gemma3-12b", "llama2-13b", "phi4-14b"]
country_col = "B_COUNTRY_ALPHA"
# Country codes labelled "Western"; every other country found in T1 is
# labelled "Non-Western" by region_map below.
western_countries = [
    "AND",  # Andorra
    "AUS",  # Australia
    "CAN",  # Canada
    "CZE",  # Czech Republic
    "DEU",  # Germany
    "GBR",  # United Kingdom
    "GRC",  # Greece
    "NIR",  # Northern Ireland
    "NLD",  # Netherlands
    "NZL",  # New Zealand
    "SVK",  # Slovakia
    "USA"   # United States
]
# Second name for the same list, kept so code using either name still works.
western_core_countries = western_countries


# Load T1 data for country info
t1_path = "data/new_median_wvs_wave7_aggregated_by_demographics.csv"
t1_df = pd.read_csv(t1_path)

# Build region map
region_map = {
    country: 'Western' if country in western_countries else 'Non-Western'
    for country in t1_df[country_col].unique()
}

for model in models:
    diff_path = f"{output_dir}/diff_scores_T1_vs_{model}_projected.csv"
    diff_df = pd.read_csv(diff_path, index_col=0)
    # The diff file's index holds the T1 row labels it was computed from, so
    # look countries up by label instead of assuming the first len(diff_df)
    # T1 rows are the ones present. Unknown labels raise a KeyError rather
    # than silently mislabelling rows.
    diff_df[country_col] = t1_df.loc[diff_df.index, country_col].to_numpy()
    # Assign region
    diff_df['region'] = diff_df[country_col].map(region_map)
    # Compute mean by region
    numeric_factors = diff_df.select_dtypes(include='number').columns.drop([country_col], errors='ignore')
    region_means = diff_df.groupby('region')[numeric_factors].mean()
    region_path = f"{output_dir}/region_mean_factor_change_{model}.csv"
    region_means.to_csv(region_path)
    print(f"Saved region mean factor change for {model} to {region_path}")

Saved region mean factor change for gemma3-12b to output/efa_comparison_all_models/region_mean_factor_change_gemma3-12b.csv
Saved region mean factor change for llama2-13b to output/efa_comparison_all_models/region_mean_factor_change_llama2-13b.csv
Saved region mean factor change for phi4-14b to output/efa_comparison_all_models/region_mean_factor_change_phi4-14b.csv


## Visualization Decisions: Mean Difference for Each Factor Across All Models
- Bar Chart: Shows the mean difference for each factor across all models, allowing us to see which factors have the largest differences and how models compare on specific factors.

- Heatmap: Provides a color-coded view of all differences, making it easy to spot patterns across models and factors.

- Radar/Spider Chart: Displays the absolute differences in a radial format, which can help visualize which model has the smallest overall deviation across all factors.

- Average Absolute Difference Bar Chart: Shows which model has the lowest average absolute difference across all factors, providing a simple metric for overall model performance.

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Configuration ---
output_dir = "output/efa_comparison_all_models"
viz_dir = f"{output_dir}/visualizations/mean_diff_factors_all_models"
os.makedirs(viz_dir, exist_ok=True)
models = ["gemma3-12b", "llama2-13b", "phi4-14b"]

# --- Load mean difference data for each model ---
# One Series per model, indexed by factor, taken from the Cohen's d CSV.
mean_diffs = {}
for model in models:
    diff_path = f"{output_dir}/cohens_d_T1_vs_{model}_projected.csv"
    df = pd.read_csv(diff_path, index_col="Factor")
    mean_diffs[model] = df["Mean_Difference"]

# Combine into a DataFrame (rows = factors, columns = models)
mean_diff_df = pd.DataFrame(mean_diffs)

# Optional: Rename index to more descriptive factor names
factor_names = {
    "Factor_1": "F1 - Religious-Traditional",
    "Factor_2": "F2 - Institutional Trust",
    "Factor_3": "F3 - Democratic Values",
    "Factor_4": "F4 - Social Conservatism",
    "Factor_5": "F5 - Openness to Diversity"
}
mean_diff_df = mean_diff_df.rename(index=factor_names)

# Add a row with the average absolute difference across factors
mean_diff_df.loc['Average_Abs'] = mean_diff_df.abs().mean()

# --- 1. Bar Chart: Mean difference for each factor across all models ---
# DataFrame.plot opens its own figure unless an Axes is supplied; passing
# ``ax`` keeps the requested size and avoids leaving an empty figure open.
factor_data = mean_diff_df.drop('Average_Abs')
fig, ax = plt.subplots(figsize=(12, 8))
factor_data.T.plot(kind='bar', width=0.8, ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Mean Factor Score Differences by Model (T2 - T1)', fontsize=14)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Mean Difference (T2 - T1)', fontsize=12)
ax.legend(title='Factor', fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/factor_differences_by_model_bar.png", dpi=300)
plt.close(fig)

# --- 2. Heatmap: Color-coded view of all differences ---
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(mean_diff_df.drop('Average_Abs'), annot=True, cmap='RdBu_r', center=0,
            fmt='.2f', linewidths=.5, ax=ax)
ax.set_title('Heatmap of Mean Factor Score Differences (T2 - T1)', fontsize=14)
fig.tight_layout()
fig.savefig(f"{viz_dir}/factor_differences_heatmap.png", dpi=300)
plt.close(fig)

# --- 3. Radar/Spider Chart: Absolute differences in a radial format ---
def radar_chart(df, title):
    """Draw a radar chart with one closed polygon per column of ``df``.

    The index of ``df`` supplies the spokes and each column becomes one
    labelled, lightly filled outline. The radial axis starts at 0 and extends
    to at least 2.5 (further when the data exceed it).

    Args:
        df: DataFrame of values to plot (rows = spokes, columns = series).
        title: Title placed above the chart.

    Returns:
        The matplotlib Figure that was created.
    """
    categories = list(df.index)
    spoke_count = len(categories)
    angles = [k / float(spoke_count) * 2 * np.pi for k in range(spoke_count)]
    # Repeat the first angle so every outline closes on itself.
    closed_angles = angles + angles[:1]

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, polar=True)
    ax.set_xticks(angles)
    ax.set_xticklabels(categories, size=12)
    ax.set_rlabel_position(0)

    max_val = df.values.max()
    radial_ticks = [0.5, 1.0, 1.5, 2.0, 2.5]
    ax.set_yticks(radial_ticks)
    ax.set_yticklabels([str(tick) for tick in radial_ticks], color="grey", size=10)
    ax.set_ylim(0, max(2.5, max_val * 1.1))

    for model in df.columns:
        ring = df[model].values.tolist()
        ring = ring + ring[:1]
        ax.plot(closed_angles, ring, linewidth=2, linestyle='solid', label=model)
        ax.fill(closed_angles, ring, alpha=0.1)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title(title, size=15, y=1.1)
    return fig

# Radar chart of the absolute per-factor differences (summary row excluded).
abs_diff = mean_diff_df.abs().drop('Average_Abs')
radar_fig = radar_chart(abs_diff, 'Absolute Factor Score Differences by Model')
radar_fig.savefig(f"{viz_dir}/factor_differences_radar.png", dpi=300, bbox_inches='tight')
plt.close()

# --- 4. Average Absolute Difference Bar Chart ---
# Models ordered from smallest to largest average absolute difference.
avg_abs_diff = mean_diff_df.loc['Average_Abs'].sort_values()
fig, ax = plt.subplots(figsize=(10, 6))
avg_abs_diff.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Average Absolute Factor Score Difference by Model', fontsize=14)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Average Absolute Difference', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/average_abs_difference_by_model.png", dpi=300)
plt.close(fig)

print(f"Visualizations saved to {viz_dir}")

Visualizations saved to output/efa_comparison_all_models/visualizations/mean_diff_factors_all_models


<Figure size 1200x800 with 0 Axes>

## Visualization Decisions: Mean Difference for Each Factor Across All Models, by Region

Factor-specific bar charts: For each factor, shows how the Western and Non-Western regions differ across models.

Model-specific heatmaps: For each model, displays a heatmap of factor changes by region.

Western vs Non-Western difference chart: Shows the gap between Western and Non-Western regions for each factor and model.

Radar charts: For each model, displays the pattern of factor changes across regions in a radial format.

Average absolute difference chart: Compares models based on their average absolute difference in Western and Non-Western regions.

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Configuration ---
output_dir = "output/efa_comparison_all_models"
viz_dir = f"{output_dir}/visualizations/region_diff_factors_all_models"
os.makedirs(viz_dir, exist_ok=True)
models = ["gemma3-12b", "llama2-13b", "phi4-14b"]

# Descriptive labels substituted for the generic factor column names.
factor_names = {
    "Factor_1": "F1 - Religious-Traditional",
    "Factor_2": "F2 - Institutional Trust",
    "Factor_3": "F3 - Democratic Values",
    "Factor_4": "F4 - Social Conservatism",
    "Factor_5": "F5 - Openness to Diversity"
}

# --- Load region mean difference data for each model ---
# Each CSV is read with its first column (the region label) as index and its
# factor columns renamed to the descriptive labels above.
region_data = {}
for model in models:
    region_path = f"{output_dir}/region_mean_factor_change_{model}.csv"
    region_data[model] = pd.read_csv(region_path, index_col=0).rename(columns=factor_names)

# --- 1. Factor-specific bar charts: Western vs Non-Western by model for each factor ---
for factor in factor_names.values():
    fig, ax = plt.subplots(figsize=(10, 6))

    # Long-format table: one row per (model, region) that is present.
    data = {"Model": [], "Region": [], "Mean Difference": []}
    for model in models:
        frame = region_data[model]
        for region in ["Western", "Non-Western"]:
            if region not in frame.index:
                continue
            data["Model"].append(model)
            data["Region"].append(region)
            data["Mean Difference"].append(frame.loc[region, factor])
    plot_df = pd.DataFrame(data)

    sns.barplot(x="Model", y="Mean Difference", hue="Region", data=plot_df, palette="Set2", ax=ax)
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title(f'Mean {factor} Score Change by Region and Model (T2 - T1)', fontsize=14)
    ax.set_xlabel('Model', fontsize=12)
    ax.set_ylabel('Mean Difference (T2 - T1)', fontsize=12)
    ax.legend(title='Region')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig(f"{viz_dir}/{factor.replace(' ', '_')}_by_region_model.png", dpi=300)
    plt.close(fig)

# --- 2. Model-specific heatmaps: factor changes by region ---
for model in models:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(region_data[model], annot=True, cmap='RdBu_r', center=0, fmt='.2f',
                linewidths=.5, ax=ax)
    ax.set_title(f'Heatmap of Mean Factor Score Changes by Region for {model} (T2 - T1)', fontsize=14)
    fig.tight_layout()
    fig.savefig(f"{viz_dir}/{model}_region_heatmap.png", dpi=300)
    plt.close(fig)

# --- 3. Western vs Non-Western difference chart for each model and factor ---
west_nonwest_diff = {}
for model in models:
    df = region_data[model]
    # Western - Non-Western for each factor
    if "Western" in df.index and "Non-Western" in df.index:
        west_nonwest_diff[model] = df.loc["Western"] - df.loc["Non-Western"]
    else:
        # If region missing, fill with NaN
        west_nonwest_diff[model] = pd.Series(np.nan, index=df.columns)

west_nonwest_diff_df = pd.DataFrame(west_nonwest_diff)

# DataFrame.plot opens its own figure unless an Axes is supplied; passing
# ``ax`` keeps the requested size and avoids leaving an empty figure open.
fig, ax = plt.subplots(figsize=(12, 8))
west_nonwest_diff_df.plot(kind='bar', width=0.8, ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Difference Between Western and Non-Western Regions by Model (T2 - T1)', fontsize=14)
ax.set_xlabel('Factor', fontsize=12)
ax.set_ylabel('Western - Non-Western Difference', fontsize=12)
ax.legend(title='Model', fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/west_nonwest_diff_by_model.png", dpi=300)
plt.close(fig)

# --- 4. Radar charts: pattern of factor changes across regions for each model ---
def radar_chart(df, title):
    """Draw a radar chart with one closed polygon per row of ``df``.

    The columns of ``df`` supply the spokes and each index entry (a region in
    this cell) becomes one labelled, lightly filled outline. The radial axis
    is symmetric around zero and spans at least -2..2.

    NOTE(review): an earlier cell defines a function of the same name that
    uses the index as spokes and the columns as series; this definition
    replaces it once the cell has run.

    Args:
        df: DataFrame of values to plot (rows = series, columns = spokes).
        title: Title placed above the chart.

    Returns:
        The matplotlib Figure that was created.
    """
    categories = list(df.columns)
    spoke_count = len(categories)
    angles = [k / float(spoke_count) * 2 * np.pi for k in range(spoke_count)]
    # Repeat the first angle so every outline closes on itself.
    closed_angles = angles + angles[:1]

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, polar=True)
    ax.set_xticks(angles)
    ax.set_xticklabels(categories, size=12)
    ax.set_rlabel_position(0)

    max_val = max(abs(df.values.min()), abs(df.values.max()))
    radial_ticks = [-2, -1, 0, 1, 2]
    ax.set_yticks(radial_ticks)
    ax.set_yticklabels([str(tick) for tick in radial_ticks], color="grey", size=10)
    radial_limit = max(2, max_val * 1.1)
    ax.set_ylim(-radial_limit, radial_limit)

    for region in df.index:
        ring = df.loc[region].values.tolist()
        ring = ring + ring[:1]
        ax.plot(closed_angles, ring, linewidth=2, linestyle='solid', label=region)
        ax.fill(closed_angles, ring, alpha=0.1)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    ax.set_title(title, size=15, y=1.1)
    return fig

# One radar chart per model, showing every region in that model's table.
for model in models:
    df = region_data[model]
    radar_fig = radar_chart(df, f'Mean Factor Score Changes by Region for {model}')
    radar_fig.savefig(f"{viz_dir}/{model}_region_radar.png", dpi=300, bbox_inches='tight')
    plt.close(radar_fig)

# --- 5. Average absolute difference chart: by region and model ---
# Mean of |change| over factors for each region; NaN when a region is absent.
avg_abs_diff = {}
for model in models:
    df = region_data[model]
    avg_abs_diff[model] = {
        'Western': df.loc['Western'].abs().mean() if 'Western' in df.index else np.nan,
        'Non-Western': df.loc['Non-Western'].abs().mean() if 'Non-Western' in df.index else np.nan
    }

# Rows = models; 'Overall' averages the two regional values and sets the order.
avg_abs_diff_df = pd.DataFrame(avg_abs_diff).T
avg_abs_diff_df['Overall'] = avg_abs_diff_df.mean(axis=1)
avg_abs_diff_df = avg_abs_diff_df.sort_values('Overall')

# DataFrame.plot opens its own figure unless an Axes is supplied; passing
# ``ax`` keeps the requested size and avoids leaving an empty figure open.
fig, ax = plt.subplots(figsize=(12, 6))
avg_abs_diff_df[['Western', 'Non-Western', 'Overall']].plot(kind='bar', width=0.8, ax=ax)
ax.set_title('Average Absolute Factor Score Difference by Region and Model', fontsize=14)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Average Absolute Difference', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/avg_abs_diff_by_region_model.png", dpi=300)
plt.close(fig)

print(f"Region visualizations saved to {viz_dir}")

Region visualizations saved to output/efa_comparison_all_models/visualizations/region_diff_factors_all_models


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# --- Configuration ---
output_dir = "output/efa_comparison_all_models"
viz_dir = f"{output_dir}/visualizations/country_diff_factors_all_models"
os.makedirs(viz_dir, exist_ok=True)
models = ["gemma3-12b", "llama2-13b", "phi4-14b"]
country_col = "B_COUNTRY_ALPHA"

# Load T1 data for country info
t1_path = "data/new_median_wvs_wave7_aggregated_by_demographics.csv"
t1_df = pd.read_csv(t1_path)

for model in models:
    diff_path = f"{output_dir}/diff_scores_T1_vs_{model}_projected.csv"
    diff_df = pd.read_csv(diff_path, index_col=0)
    # The diff file's index holds the T1 row labels it was computed from, so
    # look countries up by label instead of assuming the first len(diff_df)
    # T1 rows are the ones present. Unknown labels raise a KeyError rather
    # than silently mislabelling rows.
    diff_df[country_col] = t1_df.loc[diff_df.index, country_col].to_numpy()
    # Compute mean by country
    numeric_factors = diff_df.select_dtypes(include='number').columns
    country_means = diff_df.groupby(country_col)[numeric_factors].mean()
    country_path = f"{output_dir}/country_mean_factor_change_{model}.csv"
    country_means.to_csv(country_path)
    print(f"Saved country mean factor change for {model} to {country_path}")

    # --- Visualization: Barplot for each factor, top 10 countries by absolute mean change ---
    for factor in country_means.columns:
        # Pick the ten largest |mean change| values, then order the bars by
        # signed value so negative and positive changes are grouped.
        top10 = country_means[factor].abs().sort_values(ascending=False).head(10).index
        plot_df = country_means.loc[top10].sort_values(factor)
        fig, ax = plt.subplots(figsize=(10, 6))
        plot_df[factor].plot(kind='barh', color='skyblue', edgecolor='black', ax=ax)
        ax.set_title(f"Top 10 Countries by Absolute Mean {factor} Change ({model})")
        ax.set_xlabel("Mean Factor Score Change (T2 - T1)")
        ax.set_ylabel("Country")
        fig.tight_layout()
        fig.savefig(f"{viz_dir}/top10_{model}_{factor}_by_country.png", dpi=300)
        plt.close(fig)

Saved country mean factor change for gemma3-12b to output/efa_comparison_all_models/country_mean_factor_change_gemma3-12b.csv
Saved country mean factor change for llama2-13b to output/efa_comparison_all_models/country_mean_factor_change_llama2-13b.csv
Saved country mean factor change for phi4-14b to output/efa_comparison_all_models/country_mean_factor_change_phi4-14b.csv


## Aesthetic Visuals

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Configuration ---
output_dir = "output/efa_comparison_all_models"
viz_dir = f"{output_dir}/visualizations"
os.makedirs(viz_dir, exist_ok=True)
for viz_subdir in (
    "mean_diff_factors_all_models",
    "region_diff_factors_all_models",
    "country_diff_factors_all_models",
):
    os.makedirs(f"{viz_dir}/{viz_subdir}", exist_ok=True)
models = ["gemma3-12b", "llama2-13b", "phi4-14b"]

# ------------- PART 1: FACTOR SCORE DIFFERENCES ACROSS MODELS -------------

# --- Load mean difference data for each model ---
# One table per model, indexed by factor, read from the Cohen's d CSV.
cohens_d_data = {
    model: pd.read_csv(f"{output_dir}/cohens_d_T1_vs_{model}_projected.csv").set_index("Factor")
    for model in models
}

# Descriptive labels substituted for the generic factor names.
factor_names = {
    "Factor_1": "F1 - Religious-Traditional",
    "Factor_2": "F2 - Institutional Trust",
    "Factor_3": "F3 - Democratic Values",
    "Factor_4": "F4 - Social Conservatism",
    "Factor_5": "F5 - Openness to Diversity"
}


def _collect_metric(column):
    """Return a factors-by-models table of ``column`` plus an 'Average_Abs' row."""
    table = pd.DataFrame({
        model: data[column] for model, data in cohens_d_data.items()
    })
    table = table.rename(index=factor_names)
    # Summary row: mean of absolute values across factors, per model.
    table.loc['Average_Abs'] = table.abs().mean()
    return table


cohens_d_df = _collect_metric("Cohen_d")
mean_diff_df = _collect_metric("Mean_Difference")

# --- 1. Bar Chart: Cohen's d effect sizes across models ---
factor_data = cohens_d_df.drop('Average_Abs')

# Create the figure once and hand its Axes to pandas; calling plt.figure()
# before DataFrame.plot() would leave an extra, empty figure open.
fig, ax = plt.subplots(figsize=(12, 8))
factor_data.plot(kind='bar', width=0.8, ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Cohen\'s d Effect Sizes by Model and Cultural Dimension', fontsize=16)
ax.set_xlabel('Cultural Dimension', fontsize=14)
ax.set_ylabel('Cohen\'s d Effect Size', fontsize=14)
ax.legend(title='Model', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Reference lines and labels at +/-0.2, +/-0.5 and +/-0.8, anchored at the
# last bar group.
label_x = len(factor_data) - 1
for threshold, size_label in ((0.2, 'Small'), (0.5, 'Medium'), (0.8, 'Large')):
    for sign, suffix, valign in ((1, '+', 'bottom'), (-1, '-', 'top')):
        ax.axhline(y=sign * threshold, color='gray', linestyle='--', alpha=0.5)
        ax.text(label_x, sign * threshold, f'{size_label} {suffix}',
                va=valign, ha='right', color='gray')

fig.tight_layout()
fig.savefig(f"{viz_dir}/cohens_d_by_model_and_dimension.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# --- 2. Heatmap: Color-coded view of Cohen's d values ---
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cohens_d_df.drop('Average_Abs'),
    annot=True,
    cmap='RdBu_r',
    center=0,
    fmt='.2f',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Heatmap of Cohen\'s d Effect Sizes Across Models and Dimensions', fontsize=16)
fig.tight_layout()
fig.savefig(f"{viz_dir}/cohens_d_heatmap.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# --- 3. Overall Cultural Bias Index ---
# The index is the per-model average of |Cohen's d| over all factors, i.e.
# the summary row already stored in the table.
ocbi = cohens_d_df.loc['Average_Abs']

# Bar chart of the index, models ordered from lowest to highest.
fig, ax = plt.subplots(figsize=(10, 6))
ocbi.sort_values().plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Overall Cultural Bias Index by Model (Mean |Cohen\'s d|)', fontsize=16)
ax.set_xlabel('Model', fontsize=14)
ax.set_ylabel('Average Absolute Effect Size', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/overall_cultural_bias_index.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# ------------- PART 2: REGIONAL ANALYSIS -------------

# --- Load region mean difference data for each model ---
# Tables are indexed by the "region" column; factor columns get the
# descriptive labels from factor_names.
region_data = {}
for model in models:
    region_path = f"{output_dir}/region_mean_factor_change_{model}.csv"
    region_data[model] = (
        pd.read_csv(region_path)
        .set_index("region")
        .rename(columns=factor_names)
    )

# --- 1. Create Western Bias Index ---
# Average of the (Western - Non-Western) gap over two factors:
# Religious-Traditional and Social Conservatism. Models lacking either region
# are left out of the index.
western_bias_indices = {}
for model in models:
    frame = region_data[model]
    if "Western" not in frame.index or "Non-Western" not in frame.index:
        continue
    western_row = frame.loc["Western"]
    other_row = frame.loc["Non-Western"]
    f1_diff = western_row["F1 - Religious-Traditional"] - other_row["F1 - Religious-Traditional"]
    f4_diff = western_row["F4 - Social Conservatism"] - other_row["F4 - Social Conservatism"]
    # Higher values indicate stronger Western bias
    western_bias_indices[model] = (f1_diff + f4_diff) / 2

# Western Bias Index bar chart, highest index first.
wbi_series = pd.Series(western_bias_indices).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(10, 6))
wbi_series.plot(kind='bar', color='coral', edgecolor='black', ax=ax)
ax.set_title('Western Bias Index by Model', fontsize=16)
ax.set_xlabel('Model', fontsize=14)
ax.set_ylabel('Western Bias Index', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Explanatory caption centred below the axes.
fig.text(
    0.5, 0.01,
    "Higher values indicate stronger Western bias pattern\n(overestimation for Western and underestimation for Non-Western profiles)",
    ha="center", fontsize=12, bbox={"facecolor":"lightgray", "alpha":0.5, "pad":5},
)

fig.tight_layout(rect=[0, 0.07, 1, 1])  # Leave space at the bottom for the caption
fig.savefig(f"{viz_dir}/western_bias_index.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# --- 2. Western vs. Non-Western comparison for Religious-Traditional Values (Factor 1) ---
fig, ax = plt.subplots(figsize=(12, 7))

# Long-format table: one row per (model, region) present in the data.
data = {"Model": [], "Region": [], "Mean Difference": []}
factor = "F1 - Religious-Traditional"
for model in models:
    frame = region_data[model]
    for region in ["Western", "Non-Western"]:
        if region not in frame.index:
            continue
        data["Model"].append(model)
        data["Region"].append(region)
        data["Mean Difference"].append(frame.loc[region, factor])
plot_df = pd.DataFrame(data)

ax = sns.barplot(x="Model", y="Mean Difference", hue="Region", data=plot_df,
                 palette=["#1f77b4", "#ff7f0e"], ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title(f'Western vs. Non-Western Comparison: {factor} (T2 - T1)', fontsize=16)
ax.set_xlabel('Model', fontsize=14)
ax.set_ylabel('Mean Difference (T2 - T1)', fontsize=14)
ax.legend(title='Region')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Label only bars whose magnitude exceeds 0.5; the label sits just beyond the
# bar end, above positive bars and below the others.
for bar in ax.patches:
    height = bar.get_height()
    if not abs(height) > 0.5:
        continue
    direction = "Underestimation" if height < 0 else "Overestimation"
    label_offset = 10 if height > 0 else -10
    ax.annotate(
        f"{direction}",
        (bar.get_x() + bar.get_width() / 2., height),
        ha='center', va='center',
        xytext=(0, label_offset),
        textcoords='offset points',
        fontsize=10,
    )

fig.tight_layout()
fig.savefig(f"{viz_dir}/religious_traditional_regional_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# --- 3. Regional bias heatmap across all models ---
# One heatmap panel per model, side by side, sharing the y axis.

# Number of models and factors
n_models = len(models)
n_factors = len(factor_names)

# plt.subplots creates the figure itself, so no separate plt.figure() call is
# made (it would only leave an empty figure open).
fig, axes = plt.subplots(1, n_models, figsize=(16, 5), sharey=True)
# With a single model plt.subplots returns a bare Axes; normalise to an array
# so the indexing below works for any number of models.
axes = np.atleast_1d(axes)
fig.suptitle('Regional Bias Patterns Across Models (T2 - T1)', fontsize=18)

# Loop through models
for i, model in enumerate(models):
    df = region_data[model]
    # Only the right-most panel carries the colour bar.
    sns.heatmap(df, annot=True, cmap='RdBu_r', center=0, fmt='.2f',
                linewidths=.5, ax=axes[i], cbar=(i == n_models-1))
    axes[i].set_title(f'{model}')
    if i > 0:  # Remove y-labels for all but the first plot
        axes[i].set_ylabel('')

fig.tight_layout(rect=[0, 0, 1, 0.95])  # Make room for the title
fig.savefig(f"{viz_dir}/regional_bias_heatmap_all_models.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# --- 4. Cultural flattening visualization ---
# Shows, for one factor, a baseline value per region and where each model's
# mean change moves that value.
fig, ax = plt.subplots(figsize=(12, 8))

factor = "F4 - Social Conservatism"
model_colors = {"gemma3-12b": "#1f77b4", "llama2-13b": "#ff7f0e", "phi4-14b": "#2ca02c"}

# NOTE(review): these baseline values are hard-coded estimates, not computed
# from the T1 factor scores, yet the line is labelled 'Ground Truth (T1)'.
# Consider deriving them from the saved T1 scores grouped by region.
base_western = -0.5
base_nonwestern = 1.0
x_positions = [0, 1]
x_labels = ["Western", "Non-Western"]

# Plot the baseline line
ax.plot(x_positions, [base_western, base_nonwestern], 'k-', linewidth=3, label='Ground Truth (T1)')

# Plot each model's representation
for model in models:
    if "Western" in region_data[model].index and "Non-Western" in region_data[model].index:
        # Get the change for each region
        west_change = region_data[model].loc["Western", factor]
        nonwest_change = region_data[model].loc["Non-Western", factor]

        # Plot the model line (baseline + change = T2)
        ax.plot(x_positions, [base_western + west_change, base_nonwestern + nonwest_change],
                '-o', color=model_colors[model], linewidth=2, label=f'{model} (T2)')

        # Arrows from the baseline to the model value. length_includes_head
        # makes the arrow tip end exactly at the T2 value instead of
        # overshooting it by head_length.
        for x_pos, base_value, change in (
            (x_positions[0], base_western, west_change),
            (x_positions[1], base_nonwestern, nonwest_change),
        ):
            ax.arrow(x_pos, base_value, 0, change,
                     head_width=0.05, head_length=0.1,
                     fc=model_colors[model], ec=model_colors[model], alpha=0.6,
                     length_includes_head=True)

ax.set_xticks(x_positions)
ax.set_xticklabels(x_labels, fontsize=14)
ax.set_ylabel(f'{factor} Factor Score', fontsize=14)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(fontsize=12)
ax.set_title('Cultural Flattening Effect: Social Conservatism', fontsize=16)

# Add explanatory text
fig.text(0.5, 0.01,
         "This visualization shows how LLMs flatten cultural differences by reducing Non-Western distinctiveness.",
         ha="center", fontsize=12, bbox={"facecolor":"lightgray", "alpha":0.5, "pad":5})

fig.tight_layout(rect=[0, 0.07, 1, 1])  # Make room for the text
fig.savefig(f"{viz_dir}/cultural_flattening_effect.png", dpi=300, bbox_inches='tight')
plt.close(fig)

# ------------- PART 3: COUNTRY-LEVEL ANALYSIS -------------

# --- Load country-level data for visualization ---
# Build one frame per model from its country_mean_factor_change CSV, indexed by
# the B_COUNTRY_ALPHA column, with columns relabelled through `factor_names`.
country_data = {}
for model in models:
    country_path = f"{output_dir}/country_mean_factor_change_{model}.csv"
    country_data[model] = (
        pd.read_csv(country_path)
        .set_index("B_COUNTRY_ALPHA")
        .rename(columns=factor_names)
    )

# Create a world map visualization if possible (this is a simplified example)
# In a real implementation, we would use geopandas to create a proper choropleth map

# --- Country top 10 visualization for Religious-Traditional Values ---
# For each model, draw a horizontal bar chart of the five countries with the
# largest and the five with the smallest mean change (T2 - T1) on the
# Religious-Traditional factor, and save one PNG per model.

# savefig does not create missing directories, so make sure the target exists.
pathlib.Path(f"{viz_dir}/country_diff_factors_all_models").mkdir(parents=True, exist_ok=True)

for model in models:
    df = country_data[model]
    factor = "F1 - Religious-Traditional"

    # Ignore countries without a value so NaNs cannot be picked as "top" entries.
    changes = df[factor].dropna()

    # Get top 5 overestimation and top 5 underestimation countries
    top5_over = changes.sort_values(ascending=False).head(5)
    top5_under = changes.sort_values().head(5)

    # Combine them. With fewer than 10 countries the two selections overlap, so
    # drop repeated index entries. Sort ONCE here so that colours, bars and text
    # labels all share the same order; previously the colours were computed in
    # concat order but the bars were drawn in sorted order, mis-colouring them.
    top10 = pd.concat([top5_over, top5_under])
    top10 = top10[~top10.index.duplicated()].sort_values()

    if top10.empty:
        print(f"[WARN] {model}: no non-missing values for '{factor}'; country plot skipped", flush=True)
        continue

    # Plot
    fig, ax = plt.subplots(figsize=(12, 8))
    colors = ['lightcoral' if x < 0 else 'lightgreen' for x in top10.values]
    top10.plot(kind='barh', color=colors, ax=ax)

    ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title(f'Top Countries by Religious-Traditional Values Bias ({model})', fontsize=16)
    ax.set_xlabel('Religious-Traditional Values Change (T2 - T1)', fontsize=14)
    ax.set_ylabel('Country Code', fontsize=14)
    ax.grid(axis='x', linestyle='--', alpha=0.7)

    # Add labels for over/underestimation, placed just beyond the end of each bar
    for i, v in enumerate(top10):
        direction = "Underestimation" if v < 0 else "Overestimation"
        ax.text(v - 0.1 if v < 0 else v + 0.1,
                i,
                direction,
                color='black',
                va='center',
                ha='right' if v < 0 else 'left',
                fontsize=10)

    fig.tight_layout()
    fig.savefig(f"{viz_dir}/country_diff_factors_all_models/{model}_religious_traditional_top10_countries.png", dpi=300)
    plt.close(fig)

print("All visualizations have been generated successfully.")

All visualizations have been generated successfully.


<Figure size 1200x800 with 0 Axes>

<Figure size 1600x800 with 0 Axes>

## Final Report:
Based on my analysis of the visualizations and data, I have identified eight key findings regarding cultural bias in the LLMs:

1. Systematic underrepresentation of Religious-Traditional Values: All models significantly underestimate these values for Non-Western profiles (Cohen's d from -0.89 to -1.17) while slightly overestimating them for Western profiles.

2. Model-specific bias patterns: Each model shows a distinct bias pattern. Llama2-13b has the strongest Western bias; Gemma3-12b strongly overestimates Openness to Diversity; Phi4-14b shows the highest overall bias magnitude but the lowest Western bias.

3. Cultural flattening effect: All models compress differences between Western and Non-Western regions, reducing cultural distinctiveness, especially for Non-Western values.

4. Inconsistent representation of Democratic Values: Models dramatically disagree on this dimension, with Phi4-14b overestimating (d = +0.95), Llama2-13b severely underestimating (d = -1.15), and Gemma3-12b slightly underestimating (d = -0.12).

5. Western bias across all models: The Western Bias Index reveals varying degrees of Western orientation (Llama2-13b: 1.37, Gemma3-12b: 1.02, Phi4-14b: 0.90).

6. Country-specific patterns: Certain countries show extreme values; for example, Ethiopia, Colombia, and Nigeria show strong underestimation of Religious-Traditional Values.

7. Bias magnitude vs. pattern: The overall bias magnitude does not track Western orientation: Phi4-14b has the highest bias magnitude but the lowest Western bias.

8. Openness to Diversity outlier: Gemma3-12b shows extreme overestimation of Openness to Diversity (d = +1.02) compared to the other models.