# WVS Wave 7 - EFA Comparison: Ground Truth vs. LLM Simulation

This script compares a ground truth dataset (T1) with an LLM-simulated dataset (T2) using Exploratory Factor Analysis (EFA).

### Workflow:
1. Load and prepare both datasets, ensuring common variables and data types.
2. Perform EFA on the ground truth data (T1) to establish the reference factor structure.
3. (Optional) Perform EFA on the simulated data (T2) for congruence analysis.
4. (Optional) Calculate Tucker's Congruence Coefficient between T1 and T2 factor loadings.
5. Project the simulated data (T2) onto the T1 factor structure using the T1 FA model.
6. Compare the original T1 factor scores with the projected T2 scores using paired t-tests and Cohen's d.
7. Analyze differences in factor scores across demographic groups (country, region).
8. (Optional) Analyze raw score changes for specific variables by group.


In [None]:
# =============================================================================
# Imports
# =============================================================================
from __future__ import annotations
import argparse
import datetime as dt
import os
import sys
import warnings
import pathlib
import math

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
from scipy.stats import ttest_rel # For paired t-tests

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='factor_analyzer')
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')


In [None]:
# =============================================================================
# Configuration / Constants
# =============================================================================
# Root directory for the input CSVs and root directory under which all
# comparison outputs are written.
DATA_DIR = pathlib.Path("./data")
OUTPUT_DIR_ROOT = pathlib.Path("./output/efa_comparison_T1_vs_T2")

# --- Input Data ---
# Ground truth (T1) data aggregated by demographics (result of the aggregation script).
T1_AGGREGATED_PATH = DATA_DIR / "new_median_wvs_wave7_aggregated_by_demographics.csv"
# LLM-simulated (T2) aggregated data; it needs to be generated beforehand and
# must have the same structure as the T1 file.
T2_AGGREGATED_PATH = DATA_DIR / "phi4-14b_output_inferred.csv" # <-- UPDATE THIS PATH
# Variable information; the loader requires 'Variable_Code' and 'Type' columns
# and uses the 'Type' values "binary" / "ordinal" (case-insensitive).
VARIABLE_INFO_PATH = DATA_DIR / "variable_info.csv"

In [105]:
# --- EFA Parameters ---
# Determined from previous EFA steps (e.g., comparison script). These are the
# defaults for the factor-analysis settings used for both the T1 reference
# model and the optional T2 model.
N_FACTORS = 5 # Number of factors identified for T1
ROTATION = 'promax' # Rotation used for the reference T1 model
FACTOR_METHOD = 'principal' # Extraction method used for T1
USE_SMC = True # Use Squared Multiple Correlation for communalities

In [106]:
# --- Comparison Parameters ---
# For Tucker's Congruence analysis (optional). The thresholds below are only
# used to word the log verdict in analyze_congruence; they do not alter any
# computed coefficient.
RUN_CONGRUENCE_ANALYSIS = True # Set to True to also run FA on T2 and compare
CONGRUENCE_THRESHOLD_HIGH = 0.90 # Threshold for strong factor correspondence
CONGRUENCE_THRESHOLD_LOW = 0.85 # Threshold for acceptable correspondence (use with caution)
CONGRUENCE_OFF_DIAGONAL_MAX = 0.3 # Max acceptable absolute congruence between non-corresponding factors


In [None]:
# For Group Change Analysis
COUNTRY_COLUMN = 'B_COUNTRY_ALPHA' # Column containing country identifiers
# Country codes treated as 'Western' when the region map is built; every other
# country found in the T1 data is labelled 'Non-Western'.
WESTERN_COUNTRIES = ['USA','CAN','GBR','FRA','DEU','NLD','AUS','NZL'] # Western List

# For Raw Variable Change Analysis
ANALYZE_RAW_VARIABLE = True # Set to True to analyze specific variable
RAW_VARIABLE_TO_ANALYZE = 'Q165P' # Homosexuality Acceptance (Q165P)

# NOTE(review): not referenced anywhere in the visible part of this file —
# presumably a seed reserved for stochastic steps; confirm before relying on it.
RANDOM_STATE = 42

In [None]:
# =============================================================================
# Helper Functions
# =============================================================================

def ensure_output_dir(root: pathlib.Path, *subdirs: str) -> pathlib.Path:
    """Build ``root/<subdirs...>``, create it on disk and return the path.

    Missing parent directories are created as well and an already existing
    directory is not an error. The resulting path is echoed to stdout.
    """
    target = root
    for part in subdirs:
        target = target / part
    target.mkdir(parents=True, exist_ok=True)
    print(f"Output directory ensured: {target}")
    return target

def log_message(msg: str, level: str = "INFO") -> None:
    """Write ``[<level>] <msg>`` to stdout and flush immediately."""
    line = "[{}] {}".format(level, msg)
    print(line, flush=True)

def load_and_prepare_data(t1_agg_path: pathlib.Path, t2_agg_path: pathlib.Path,
                          var_info_path: pathlib.Path) -> tuple[pd.DataFrame, pd.DataFrame, list, list, list]:
    """Load T1, T2 and the variable info, and return analysis-ready frames.

    Only survey variables that are typed 'binary' or 'ordinal' in the variable
    info file AND present in both datasets are kept. They are coerced to
    numeric, checked for NaNs, cast to ``int`` and checked for zero variance.

    Args:
        t1_agg_path: CSV with the ground truth (T1) aggregated data.
        t2_agg_path: CSV with the simulated (T2) aggregated data.
        var_info_path: CSV with at least 'Variable_Code' and 'Type' columns.

    Returns:
        ``(data_T1, data_T2, common_survey_vars, common_binary, common_ordinal)``
        where ``data_T1`` holds the common survey variables plus the country
        column (when present in T1), ``data_T2`` holds the common survey
        variables only, and the three lists are variable codes
        (``common_survey_vars`` is binary codes followed by ordinal codes).

    Raises:
        FileNotFoundError: an input path does not exist.
        IOError: a file exists but could not be read.
        ValueError: required columns are missing, no common variables exist,
            NaNs remain after numeric coercion, or a column has zero variance.
        TypeError: a survey column is non-numeric after preparation.
    """
    log_message("--- Loading and Preparing Data (T1 & T2) ---")

    # --- Load Data ---
    for description, path in (("T1 file", t1_agg_path),
                              ("T2 file", t2_agg_path),
                              ("Variable info file", var_info_path)):
        if not path.exists():
            raise FileNotFoundError(f"{description} not found: {path}")

    try:
        data_T1_raw = pd.read_csv(t1_agg_path)
        data_T2_raw = pd.read_csv(t2_agg_path)
        var_info = pd.read_csv(var_info_path)
        log_message(f"Loaded T1: {data_T1_raw.shape}, T2: {data_T2_raw.shape}, VarInfo: {var_info.shape}")
    except Exception as e:
        # Keep the original exception attached so the root cause stays visible.
        raise IOError(f"Error reading input files: {e}") from e

    # --- Identify Variable Types and Common Columns ---
    if not all(c in var_info.columns for c in ['Variable_Code', 'Type']):
        raise ValueError("Variable info file must contain 'Variable_Code' and 'Type' columns.")
    var_type = var_info['Type'].str.lower()
    binary_vars = var_info.loc[var_type == 'binary', 'Variable_Code'].tolist()
    ordinal_vars = var_info.loc[var_type == 'ordinal', 'Variable_Code'].tolist()

    shared_columns = set(data_T1_raw.columns) & set(data_T2_raw.columns)
    common_binary = sorted(set(binary_vars) & shared_columns)
    common_ordinal = sorted(set(ordinal_vars) & shared_columns)
    common_survey_vars = common_binary + common_ordinal

    if not common_survey_vars:
        raise ValueError("No common Binary or Ordinal survey variables found between T1, T2, and variable info.")
    log_message(f"Found {len(common_binary)} common Binary and {len(common_ordinal)} common Ordinal variables.")

    # --- Subset and Prepare Data ---
    # Keep only common survey vars + necessary demographic/ID columns for T1 (used later)
    t1_cols_to_keep = common_survey_vars + ([COUNTRY_COLUMN] if COUNTRY_COLUMN in data_T1_raw.columns else [])
    data_T1 = data_T1_raw[t1_cols_to_keep].copy()
    # T2 only needs the common survey variables for projection
    data_T2 = data_T2_raw[common_survey_vars].copy()

    # Rows are assumed to describe the same demographic profile in both files.
    # Both frames carry the default positional index from read_csv, so a
    # mismatch here means the row counts differ; reindexing then truncates T2
    # or pads it with NaN rows, which the NaN validation below reports.
    if data_T1.index.equals(data_T2.index):
       log_message("Indices match between T1 and T2.")
    else:
       log_message("Warning: Indices do not match. Assuming row order corresponds for comparison.")
       data_T2 = data_T2.reindex(data_T1.index) # HARD LESSON: Requires meaningful, aligned index in CSVs

    # Coerce to numeric (required for FA); unparseable cells become NaN.
    for df, name in [(data_T1, "T1"), (data_T2, "T2")]:
        for col in common_survey_vars:
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except Exception as e:
                log_message(f"Warning: Could not convert column '{col}' in {name} to numeric: {e}", "WARN")

    # --- Validation ---
    # NaNs must be rejected BEFORE the integer cast: casting NaN to int raises
    # an opaque pandas error that does not name the offending columns.
    for df, name in [(data_T1, "T1_survey"), (data_T2, "T2_survey")]:
        survey = df[common_survey_vars]
        nan_cols = survey.columns[survey.isnull().any()].tolist()
        if nan_cols:
            raise ValueError(f"NaNs present in {name} after preparation (Columns: {nan_cols}). Impute or check data source.")
        # NOTE(review): the int cast truncates fractional values (e.g. a median
        # of 2.5 becomes 2) — confirm that this is intended for aggregated data.
        df[common_survey_vars] = survey.astype(int)

    for df, name in [(data_T1[common_survey_vars], "T1_survey"), (data_T2, "T2_survey")]:
        non_num_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
        if non_num_cols:
            raise TypeError(f"Non-numeric data detected in {name} columns: {non_num_cols}")
        zero_var_cols = df.columns[df.var() == 0].tolist()
        if zero_var_cols:
            raise ValueError(f"Zero variance columns found in {name}: {zero_var_cols}")

    log_message("Data loading and preparation complete.")

    # Return T1 (with demographics), T2 (survey only), common survey vars, common binary, common ordinal
    return data_T1, data_T2, common_survey_vars, common_binary, common_ordinal


def run_factor_analysis(data: pd.DataFrame, n_factors: int, rotation: str | None, method: str,
                        use_smc: bool, out_dir: pathlib.Path, label: str) -> tuple[FactorAnalyzer, dict]:
    """Fit an EFA model on ``data`` and write its artefacts to ``out_dir`` as CSV.

    Args:
        data: Numeric survey variables (one column per variable).
        n_factors: Number of factors to extract.
        rotation: Rotation name forwarded to ``FactorAnalyzer`` (or ``None``).
        method: Extraction method forwarded to ``FactorAnalyzer``.
        use_smc: Forwarded to ``FactorAnalyzer`` as ``use_smc``.
        out_dir: Existing directory that receives the CSV files.
        label: Dataset tag used in log messages and file names (e.g. "T1").

    Returns:
        ``(fa, results)``: the fitted model and a dict with 'factor_names'
        plus, as far as extraction succeeded, 'loadings', 'communalities',
        'variance', 'factor_correlations', 'structure' and 'scores'.

    Raises:
        Exception: whatever the model fit raises is logged and re-raised.
        Failures in the suitability tests or while extracting/saving results
        are only logged, so ``results`` may be partial.
    """
    log_message(f"--- Running Factor Analysis on {label} ---")

    # Every model-specific artefact shares the same file-name suffix.
    model_tag = f'{rotation}_{n_factors}factors_{label}.csv'

    def csv_path(kind: str) -> pathlib.Path:
        return out_dir / f'{kind}_{model_tag}'

    # --- Suitability Tests (best effort: a failure is logged, not fatal) ---
    try:
        bartlett_chi2, bartlett_p = calculate_bartlett_sphericity(data)
        log_message(f" Bartlett's Test: Chi2={bartlett_chi2:.2f}, p={bartlett_p:.4g}")
        if bartlett_p >= 0.05:
            log_message("Bartlett p-value >= 0.05, suitability questionable.", "WARN")

        kmo_by_variable, kmo_total = calculate_kmo(data)
        log_message(f" KMO Test: Overall={kmo_total:.3f}")
        if kmo_total < 0.6:
            log_message("Overall KMO < 0.6, suitability questionable.", "WARN")

        kmo_table = pd.DataFrame({'KMO': kmo_by_variable}, index=data.columns)
        kmo_table.sort_values('KMO').to_csv(out_dir / f'kmo_per_variable_{label}.csv')
    except Exception as e:
        log_message(f"Could not perform suitability tests for {label}: {e}", "ERROR")

    # --- Fit FA Model (a failure here aborts the run) ---
    try:
        fa = FactorAnalyzer(
            n_factors=n_factors,
            rotation=rotation,
            method=method,
            use_smc=use_smc,
            rotation_kwargs={'max_iter': 1000},
        )
        fa.fit(data)
        log_message(" EFA model fitted successfully.")
    except Exception as e:
        log_message(f"EFA model fitting failed for {label}: {e}", "ERROR")
        raise e # Stop execution if fitting fails

    # --- Extract and Save Results ---
    factor_names = [f'Factor_{number}' for number in range(1, n_factors + 1)]
    results = {'factor_names': factor_names}
    variables = data.columns

    try:
        # Loadings (pattern matrix for oblique, factor matrix for orthogonal rotations)
        loadings = pd.DataFrame(fa.loadings_, index=variables, columns=factor_names)
        results['loadings'] = loadings
        loadings.to_csv(csv_path('loadings'))

        # Communalities
        communalities = pd.DataFrame({'Communality': fa.get_communalities()}, index=variables)
        results['communalities'] = communalities
        communalities.to_csv(csv_path('communalities'))

        # Variance explained per factor
        ss_loadings, proportion, cumulative = fa.get_factor_variance()
        variance = pd.DataFrame({
            'Factor': factor_names,
            'SSL': ss_loadings,
            'Prop_Var': proportion,
            'Cum_Var': cumulative,
        })
        results['variance'] = variance
        variance.to_csv(csv_path('variance'), index=False)

        # Factor correlations (phi) - only available for oblique rotations
        phi = fa.phi_
        if phi is not None:
            factor_corr = pd.DataFrame(phi, index=factor_names, columns=factor_names)
            results['factor_correlations'] = factor_corr
            factor_corr.to_csv(csv_path('factor_corr'))

        # Structure matrix - only available for oblique rotations
        structure = getattr(fa, 'structure_', None)
        if structure is not None:
            structure_df = pd.DataFrame(structure, index=variables, columns=factor_names)
            results['structure'] = structure_df
            structure_df.to_csv(csv_path('structure'))

        # Factor scores for the rows the model was fitted on
        scores = pd.DataFrame(fa.transform(data), index=data.index, columns=factor_names)
        results['scores'] = scores
        scores.to_csv(csv_path('scores'))

        log_message(f" EFA results saved to: {out_dir}")

    except Exception as e:
        # Partial results are returned; callers must check for the keys they need.
        log_message(f"Error extracting or saving results for {label}: {e}", "ERROR")

    return fa, results


def calculate_congruence(loadings1: pd.DataFrame, loadings2: pd.DataFrame) -> pd.DataFrame:
    """Calculate Tucker's congruence coefficient for every pair of factors.

    For factor ``i`` of ``loadings1`` and factor ``j`` of ``loadings2`` the
    coefficient is ``sum(x*y) / sqrt(sum(x**2) * sum(y**2))`` over the
    variables (rows). A pair whose denominator is zero yields NaN.

    Rows are aligned by variable label whenever the two row indices are not
    identical (different length, content OR order); the coefficient is only
    meaningful when row k refers to the same variable in both matrices.

    Args:
        loadings1: Loading matrix (variables x factors) of the reference model.
        loadings2: Loading matrix (variables x factors) of the compared model.

    Returns:
        DataFrame indexed by ``loadings1`` factor names with ``loadings2``
        factor names as columns.

    Raises:
        ValueError: fewer than 80% of the ``loadings1`` variables are shared.
    """
    log_message("--- Calculating Tucker's Congruence Coefficients ---")

    if not loadings1.index.equals(loadings2.index):
        log_message("Row mismatch between loading matrices. Attempting to reindex T2 based on T1.", "WARN")
        common_index = loadings1.index.intersection(loadings2.index)
        if len(common_index) < loadings1.shape[0] * 0.8: # Heuristic check
             raise ValueError("Significant row mismatch, cannot reliably calculate congruence.")
        loadings1 = loadings1.loc[common_index]
        loadings2 = loadings2.loc[common_index]
        log_message(f" Congruence calculated on {len(common_index)} common variables.")

    F1 = loadings1.to_numpy(dtype=float)
    F2 = loadings2.to_numpy(dtype=float)

    # All pairwise cross-products at once: entry (i, j) = sum_k F1[k, i] * F2[k, j].
    cross_products = F1.T @ F2
    norms = np.sqrt(np.outer((F1 ** 2).sum(axis=0), (F2 ** 2).sum(axis=0)))

    # Pre-fill with NaN so pairs with a zero denominator stay undefined.
    congruence_matrix = np.full(cross_products.shape, np.nan)
    np.divide(cross_products, norms, out=congruence_matrix, where=norms != 0)

    congruence_df = pd.DataFrame(congruence_matrix, index=loadings1.columns, columns=loadings2.columns)
    log_message("Congruence calculation complete.")
    return congruence_df


def analyze_congruence(congruence_df: pd.DataFrame, high_threshold: float,
                       low_threshold: float, off_diag_max: float, out_dir: pathlib.Path) -> None:
    """Print, save and verbally rate a T1-by-T2 congruence matrix.

    The best matching T2 factor per T1 factor (largest signed coefficient in
    each row) is printed and written to ``congruence_best_matches.csv``. For a
    square matrix the diagonal is then compared against ``high_threshold`` /
    ``low_threshold`` and the largest absolute off-diagonal value against
    ``off_diag_max``; the verdict is logged only, nothing is returned.
    """
    log_message("--- Analyzing Congruence ---")
    print("Congruence Matrix (T1 vs T2):")
    print(congruence_df.round(3))

    # Best match for each T1 factor (row-wise maximum)
    winners = congruence_df.idxmax(axis=1)
    peaks = congruence_df.max(axis=1)
    best_match_df = pd.DataFrame({
        'Best_Matching_T2_Factor': winners,
        'Congruence_Coefficient': peaks,
    })
    print("\nBest T2 Match for each T1 Factor:")
    print(best_match_df.round(3))
    best_match_df.to_csv(out_dir / "congruence_best_matches.csv")

    # The diagonal checks compare factor i with factor i, so they need a square matrix.
    n_t1, n_t2 = congruence_df.shape
    if n_t1 != n_t2:
        log_message("Congruence matrix is not square, skipping diagonal/off-diagonal checks.")
        return

    matrix = congruence_df.values
    diagonal_values = np.diag(matrix)
    off_diagonal_values = np.abs(matrix[~np.eye(n_t1, dtype=bool)])

    num_high_congruence = np.sum(diagonal_values >= high_threshold)
    num_low_congruence = np.sum(diagonal_values < low_threshold)
    if off_diagonal_values.size > 0:
        max_off_diagonal = off_diagonal_values.max()
    else:
        max_off_diagonal = 0

    log_message(f"Diagonal congruence check: {num_high_congruence} factors >= {high_threshold:.2f}, {num_low_congruence} factors < {low_threshold:.2f}")
    log_message(f"Off-diagonal check: Max absolute value = {max_off_diagonal:.3f} (Threshold < {off_diag_max:.2f})")

    all_high = num_high_congruence == n_t1
    if all_high and max_off_diagonal < off_diag_max:
        log_message("Factor structure shows high congruence and good distinction.")
        return
    if num_low_congruence > 0 or max_off_diagonal >= off_diag_max:
        log_message("Factor structure congruence is questionable. Check coefficients.", "WARN")
        return
    log_message("Factor structure shows moderate congruence.")


def project_and_compare_scores(fa_T1: FactorAnalyzer, data_T1_scores: pd.DataFrame,
                               data_T2: pd.DataFrame, out_dir: pathlib.Path) -> pd.DataFrame:
    """Score T2 with the T1 model and compare against the T1 factor scores.

    Writes three CSV files to ``out_dir``: the projected T2 scores, the paired
    t-test per factor and Cohen's d per factor.

    Sign convention: the t-statistic comes from
    ``ttest_rel(T1, projected T2)`` and therefore reflects T1 - T2, whereas
    the mean difference and Cohen's d are computed on projected T2 - T1. The
    two therefore carry opposite signs for the same shift.

    Args:
        fa_T1: Factor model fitted on the T1 survey variables.
        data_T1_scores: T1 factor scores (rows x factors).
        data_T2: T2 survey variables, same columns the model was fitted on.
        out_dir: Existing directory that receives the CSV files.

    Returns:
        Per-row score differences (projected T2 - T1) on the shared index.

    Raises:
        Exception: whatever ``fa_T1.transform`` raises is logged and re-raised.
    """
    log_message("--- Projecting T2 onto T1 Factors and Comparing Scores ---")

    # Project T2 data using the T1 factor analysis object
    try:
        scores_proj_T2 = fa_T1.transform(data_T2) # data_T2 should only contain common survey vars
        scores_proj_T2_df = pd.DataFrame(scores_proj_T2, index=data_T2.index, columns=data_T1_scores.columns)
        scores_proj_T2_df.to_csv(out_dir / 'scores_T2_projected_on_T1.csv')
        log_message(" T2 data projected onto T1 factors.")
    except Exception as e:
        log_message(f"Error projecting T2 data: {e}", "ERROR")
        raise

    # --- Paired t-tests: T1 scores vs Projected T2 scores ---
    log_message(" Running paired t-tests (T1 vs Projected T2)...")
    common_index = data_T1_scores.index.intersection(scores_proj_T2_df.index)
    if len(common_index) < len(data_T1_scores) or len(common_index) < len(scores_proj_T2_df):
         log_message(f"Warning: Comparing scores on {len(common_index)} common indices only.", "WARN")

    t1_scores_common = data_T1_scores.loc[common_index]
    t2_proj_scores_common = scores_proj_T2_df.loc[common_index]

    ttest_results = []
    for factor in t1_scores_common.columns:
        try:
            t_stat, p_value = ttest_rel(t1_scores_common[factor], t2_proj_scores_common[factor])
        except Exception as e:
            log_message(f"Could not perform t-test for {factor}: {e}", "WARN")
            t_stat, p_value = np.nan, np.nan
        ttest_results.append({'Factor': factor, 'T_Statistic': t_stat, 'P_Value': p_value})

    ttest_df = pd.DataFrame(ttest_results).set_index('Factor')
    ttest_df.to_csv(out_dir / 'ttest_T1_vs_T2projected.csv')
    print(" Paired t-test results (T1 vs Projected T2):")
    print(ttest_df.round(4))

    # --- Cohen's d for effect size (mean of paired differences / their SD) ---
    log_message(" Calculating Cohen's d for score differences...")
    cohen_d_results = []
    # Calculate difference based on common indices
    diff_scores = t2_proj_scores_common.subtract(t1_scores_common)
    for factor in diff_scores.columns:
        mean_diff = diff_scores[factor].mean()
        std_diff = diff_scores[factor].std(ddof=1) # Use sample standard deviation
        if std_diff != 0:
            cohen_d = mean_diff / std_diff
        elif mean_diff == 0:
            # No difference at all: zero effect.
            cohen_d = 0
        else:
            # A constant non-zero shift has zero spread; reporting 0 would read
            # as "no effect", so report the signed limit instead.
            cohen_d = float(np.copysign(np.inf, mean_diff))
        cohen_d_results.append({'Factor': factor, "Mean_Difference": mean_diff, "Std_Dev_Difference": std_diff, "Cohen_d": cohen_d})

    cohen_d_df = pd.DataFrame(cohen_d_results).set_index('Factor')
    cohen_d_df.to_csv(out_dir / 'cohens_d_T1_vs_T2projected.csv')
    print(" Cohen's d results:")
    print(cohen_d_df.round(4))

    log_message(" Score comparison complete.")
    return diff_scores # Return differences on common index


def analyze_group_changes(diff_scores: pd.DataFrame, t1_aggregated_data: pd.DataFrame,
                          country_col: str, region_map: dict, out_dir: pathlib.Path) -> None:
    """Average the factor-score differences per country and per region.

    The country of each row is taken from ``t1_aggregated_data`` via the row
    index. Countries missing from ``region_map`` fall into the region 'Other'.
    Results go to ``country_mean_factor_change.csv`` and
    ``region_mean_factor_change.csv`` in ``out_dir``; each of the two
    aggregations is best effort (errors are logged, not raised).
    """
    log_message("--- Analyzing Group Changes in Factor Scores ---")

    if country_col not in t1_aggregated_data.columns:
        log_message(f"Country column '{country_col}' not found in T1 aggregated data. Skipping group analysis.", "WARN")
        return

    # Restrict both sides to the rows they share before attaching the country.
    countries = t1_aggregated_data[[country_col]].copy()
    shared_rows = diff_scores.index.intersection(countries.index)
    if len(shared_rows) < len(diff_scores):
         log_message(f"Group analysis performed on {len(shared_rows)} participants with matching demographic info.", "WARN")

    grouped_scores = diff_scores.loc[shared_rows].copy()
    grouped_scores[country_col] = countries.loc[shared_rows, country_col]

    def numeric_columns() -> list:
        # Only numeric columns are averaged.
        return grouped_scores.select_dtypes(include=np.number).columns.tolist()

    # --- Mean Change by Country ---
    try:
        country_mean_change = grouped_scores.groupby(country_col)[numeric_columns()].mean()
        country_mean_change.to_csv(out_dir / 'country_mean_factor_change.csv')
        log_message(" Calculated mean factor change by country.")
        print("\n Mean Factor Score Change by Country (Projected T2 - T1):")
        print(country_mean_change.round(4).head()) # Print head
    except Exception as e:
        log_message(f"Error calculating country mean change: {e}", "ERROR")

    # --- Mean Change by Region ---
    try:
        def to_region(country):
            # Countries without a mapping are grouped as 'Other'.
            return region_map.get(country, 'Other')

        grouped_scores['region'] = grouped_scores[country_col].map(to_region)
        region_mean_change = grouped_scores.groupby('region')[numeric_columns()].mean()
        region_mean_change.to_csv(out_dir / 'region_mean_factor_change.csv')
        log_message(" Calculated mean factor change by region.")
        print("\n Mean Factor Score Change by Region (Projected T2 - T1):")
        print(region_mean_change.round(4))
    except Exception as e:
        log_message(f"Error calculating region mean change: {e}", "ERROR")

    log_message("Group change analysis complete.")


def analyze_raw_variable_change(t1_agg_path: pathlib.Path, t2_agg_path: pathlib.Path,
                                variable: str, country_col: str, out_dir: pathlib.Path) -> None:
    """Report the mean raw change (T2 - T1) of one variable per country.

    Only the needed columns are re-read from the two CSV files. Rows are
    matched on 'participant_id' when BOTH files have that column; otherwise
    they are matched by row position. The per-country means are written to
    ``country_<variable>_raw_change.csv`` in ``out_dir`` and printed, ordered
    by absolute change (largest first).

    The analysis is best effort: missing columns lead to a logged warning and
    an early return, and any error is logged rather than raised.

    Args:
        t1_agg_path: CSV with the ground truth (T1) aggregated data.
        t2_agg_path: CSV with the simulated (T2) aggregated data.
        variable: Column name of the variable to analyse.
        country_col: Column name holding the country (read from T1).
        out_dir: Existing directory that receives the CSV file.
    """
    log_message(f"--- Analyzing Raw Change for Variable: {variable} ---")

    try:
        # Read each header once; it decides which columns can be requested.
        t1_header = pd.read_csv(t1_agg_path, nrows=0).columns
        t2_header = pd.read_csv(t2_agg_path, nrows=0).columns

        # Reload necessary columns only to minimize memory usage
        cols_to_load = [country_col, variable]
        use_cols_t1 = [col for col in cols_to_load if col in t1_header]
        use_cols_t2 = [col for col in cols_to_load if col in t2_header]

        # 'participant_id' can only serve as the join key when both files
        # carry it; otherwise fall back to the default positional index.
        index_col_name = 'participant_id'
        in_t1 = index_col_name in t1_header
        in_t2 = index_col_name in t2_header
        if not (in_t1 and in_t2):
            if in_t1 != in_t2:
                log_message(f"Column '{index_col_name}' found in only one input file. Aligning rows by position instead.", "WARN")
            index_col_name = None

        index_cols = [index_col_name] if index_col_name else []
        t1_raw = pd.read_csv(t1_agg_path, usecols=use_cols_t1 + index_cols, index_col=index_col_name)
        t2_raw = pd.read_csv(t2_agg_path, usecols=use_cols_t2 + index_cols, index_col=index_col_name)

        if variable not in t1_raw.columns or variable not in t2_raw.columns:
            log_message(f"Variable '{variable}' not found in both T1 and T2 raw files. Skipping.", "WARN")
            return
        if country_col not in t1_raw.columns:
            log_message(f"Country column '{country_col}' not found in T1 raw file. Skipping.", "WARN")
            return

        # Align data on common index
        common_index = t1_raw.index.intersection(t2_raw.index)
        t1_aligned = t1_raw.loc[common_index]
        t2_aligned = t2_raw.loc[common_index]

        # Calculate difference
        diff_col = f'{variable}_diff'
        var_diff = (t2_aligned[variable] - t1_aligned[variable]).to_frame(diff_col)
        var_diff[country_col] = t1_aligned[country_col] # Add country from aligned T1

        # Group by country
        country_var_change = var_diff.groupby(country_col)[diff_col].mean()
        country_var_change.to_csv(out_dir / f'country_{variable}_raw_change.csv')
        log_message(f" Calculated mean raw change for {variable} by country.")
        print(f"\n Mean Raw Score Change for {variable} by Country (T2 - T1):")
        # Show the countries with the largest absolute change first.
        by_magnitude = country_var_change.abs().sort_values(ascending=False).index
        print(country_var_change.reindex(by_magnitude).round(4))

    except FileNotFoundError:
        log_message(f"Could not load raw data files for variable '{variable}' change analysis.", "ERROR")
    except KeyError as e:
        log_message(f"Column missing for raw variable change analysis: {e}", "ERROR")
    except Exception as e:
        log_message(f"Error during raw variable change analysis for '{variable}': {e}", "ERROR")


In [None]:
# # =============================================================================
# # Main Execution Workflow
# # =============================================================================

# Ignoring for now
# def main(args):
#     """Main function to orchestrate the EFA comparison pipeline."""
#     log_message("--- Starting EFA Comparison Pipeline (T1 vs T2) ---")

#     # --- Setup Output Directories ---
#     comp_out_dir = ensure_output_dir(pathlib.Path(args.output_root)) # Main comparison output
#     t1_out_dir = ensure_output_dir(pathlib.Path(args.output_root), "T1_EFA_results")
#     t2_out_dir = ensure_output_dir(pathlib.Path(args.output_root), "T2_EFA_results") if args.run_congruence else None

#     # --- Load and Prepare Data ---
#     # Pass paths directly from args
#     t1_path = pathlib.Path(args.t1_file)
#     t2_path = pathlib.Path(args.t2_file)
#     var_info_path = pathlib.Path(args.var_file)
#     df_T1_full, df_T2_survey, common_vars, _, _ = load_and_prepare_data(
#         t1_path, t2_path, var_info_path
#     )
#     df_T1_survey = df_T1_full[common_vars] # Extract survey vars for T1 EFA

#     # --- Run EFA on T1 (Ground Truth) ---
#     fa_T1, results_T1 = run_factor_analysis(
#         data=df_T1_survey,
#         n_factors=args.n_factors,
#         rotation=args.rotation,
#         method=args.method,
#         use_smc=args.use_smc,
#         out_dir=t1_out_dir,
#         label="T1"
#     )
#     # Extract T1 scores for later comparison
#     data_T1_scores = results_T1.get('scores')
#     if data_T1_scores is None:
#          log_message("Could not retrieve T1 factor scores. Aborting comparison.", "ERROR")
#          sys.exit(1)


#     # --- (Optional) Run EFA on T2 and Calculate Congruence ---
#     if args.run_congruence:
#         if t2_out_dir:
#              fa_T2, results_T2 = run_factor_analysis(
#                  data=df_T2_survey, # Use only common survey vars
#                  n_factors=args.n_factors,
#                  rotation=args.rotation,
#                  method=args.method,
#                  use_smc=args.use_smc,
#                  out_dir=t2_out_dir,
#                  label="T2"
#              )
#              # Calculate and analyze congruence
#              if 'loadings' in results_T1 and 'loadings' in results_T2:
#                   congruence_df = calculate_congruence(results_T1['loadings'], results_T2['loadings'])
#                   analyze_congruence(congruence_df, args.cong_high, args.cong_low, args.cong_off_diag, comp_out_dir)
#              else:
#                   log_message("Could not calculate congruence, missing loading matrices.", "WARN")
#         else:
#              log_message("T2 output directory not created, skipping T2 EFA run.", "WARN")


#     # --- Project T2 onto T1 and Compare Scores ---
#     diff_scores = project_and_compare_scores(
#         fa_T1=fa_T1,
#         data_T1_scores=data_T1_scores,
#         data_T2=df_T2_survey, # Use only common survey vars
#         out_dir=comp_out_dir
#     )

#     # --- Analyze Group Changes ---
#     # Create region map based on config
#     region_map = {country: 'Western' if country in WESTERN_COUNTRIES else 'Non-Western'
#                   for country in df_T1_full[COUNTRY_COLUMN].unique()}
#     analyze_group_changes(
#         diff_scores=diff_scores,
#         t1_aggregated_data=df_T1_full, # Pass full T1 with demographics
#         country_col=args.country_col,
#         region_map=region_map,
#         out_dir=comp_out_dir
#     )

#     # --- (Optional) Analyze Raw Variable Change ---
#     if args.analyze_raw:
#         analyze_raw_variable_change(
#             t1_agg_path=t1_path,
#             t2_agg_path=t2_path,
#             variable=args.raw_var,
#             country_col=args.country_col,
#             out_dir=comp_out_dir
#         )

#     log_message("--- EFA Comparison Pipeline Finished ---")

In [110]:
# # =============================================================================
# # Entry Point & Argument Parsing
# # =============================================================================

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Compare Ground Truth (T1) vs LLM Simulated (T2) data using EFA.")

#     # Input Files
#     parser.add_argument("--t1_file", default=str(T1_AGGREGATED_PATH), help="Path to T1 (ground truth) aggregated CSV.")
#     parser.add_argument("--t2_file", default=str(T2_AGGREGATED_PATH), help="Path to T2 (LLM simulated) aggregated CSV.")
#     parser.add_argument("--var_file", default=str(VARIABLE_INFO_PATH), help="Path to variable info CSV.")

#     # EFA Parameters (should match the final chosen model for T1)
#     parser.add_argument("--n_factors", type=int, default=N_FACTORS, help="Number of factors.")
#     parser.add_argument("--rotation", default=ROTATION, help="Rotation method (e.g., 'promax', 'varimax', None).")
#     parser.add_argument("--method", default=FACTOR_METHOD, help="Factor extraction method (e.g., 'principal', 'ml', 'minres').")
#     parser.add_argument("--use_smc", type=bool, default=USE_SMC, help="Use Squared Multiple Correlation for communalities.")

#     # Comparison Options
#     parser.add_argument("--run_congruence", action='store_true', default=RUN_CONGRUENCE_ANALYSIS, help="Run FA on T2 and calculate Tucker's Congruence.")
#     parser.add_argument("--cong_high", type=float, default=CONGRUENCE_THRESHOLD_HIGH, help="High threshold for congruence.")
#     parser.add_argument("--cong_low", type=float, default=CONGRUENCE_THRESHOLD_LOW, help="Low threshold for congruence.")
#     parser.add_argument("--cong_off_diag", type=float, default=CONGRUENCE_OFF_DIAGONAL_MAX, help="Max acceptable off-diagonal congruence.")

#     # Group Analysis Options
#     parser.add_argument("--country_col", default=COUNTRY_COLUMN, help="Column name for country identifier.")

#     # Raw Variable Analysis Options
#     parser.add_argument("--analyze_raw", action='store_true', default=ANALYZE_RAW_VARIABLE, help="Analyze raw change for a specific variable.")
#     parser.add_argument("--raw_var", default=RAW_VARIABLE_TO_ANALYZE, help="Variable code for raw change analysis.")

#     # Output Options
#     parser.add_argument("--output_root", default=str(OUTPUT_DIR_ROOT), help="Root directory for all output.")

#     cli_args = parser.parse_args()

#     # --- Execute Main Function ---
#     try:
#         main(cli_args)
#     except FileNotFoundError as e:
#         log_message(f"Error: Input file not found. {e}", "CRITICAL")
#         sys.exit(1)
#     except (ValueError, TypeError, KeyError) as e:
#         log_message(f"Error: Data validation or configuration issue. {e}", "CRITICAL")
#         sys.exit(1)
#     except Exception as e:
#         log_message(f"An unexpected error occurred: {e}", "CRITICAL")
#         # import traceback # Uncomment for detailed traceback
#         # traceback.print_exc()
#         sys.exit(1)

In [None]:
# =============================================================================
# Main Execution Workflow
# =============================================================================

def main(args):
    """Orchestrate the EFA comparison pipeline (T1 ground truth vs. T2 simulation).

    Steps, in order: create the output directories, load both datasets, fit
    the EFA on T1, optionally fit the EFA on T2 and report Tucker's
    congruence, project T2 onto the T1 factors and compare scores, summarise
    the score differences by country/region, and optionally analyse the raw
    change of a single variable.

    Args:
        args: Namespace-like object exposing the attributes read here:
            ``output_root``, ``t1_file``, ``t2_file``, ``var_file``,
            ``n_factors``, ``rotation``, ``method``, ``use_smc``,
            ``run_congruence``, ``cong_high``, ``cong_low``,
            ``cong_off_diag``, ``country_col``, ``analyze_raw`` and
            ``raw_var``.

    Note:
        Calls ``sys.exit(1)`` when the T1 EFA results contain no factor
        scores, because every later comparison step needs them.
    """
    log_message("--- Starting EFA Comparison Pipeline (T1 vs T2) ---")

    # --- Setup Output Directories ---
    output_root = pathlib.Path(args.output_root)
    comp_out_dir = ensure_output_dir(output_root)  # Main comparison output
    t1_out_dir = ensure_output_dir(output_root, "T1_EFA_results")
    # The T2 directory is only needed when T2 gets its own EFA fit.
    t2_out_dir = (
        ensure_output_dir(output_root, "T2_EFA_results")
        if args.run_congruence
        else None
    )

    # --- Load and Prepare Data ---
    t1_path = pathlib.Path(args.t1_file)
    t2_path = pathlib.Path(args.t2_file)
    var_info_path = pathlib.Path(args.var_file)
    df_T1_full, df_T2_survey, common_vars, _, _ = load_and_prepare_data(
        t1_path, t2_path, var_info_path
    )
    df_T1_survey = df_T1_full[common_vars]  # Extract survey vars for T1 EFA

    # Both fits must use identical settings, so the options are defined once.
    efa_options = {
        "n_factors": args.n_factors,
        "rotation": args.rotation,
        "method": args.method,
        "use_smc": args.use_smc,
    }

    # --- Run EFA on T1 (Ground Truth) ---
    fa_T1, results_T1 = run_factor_analysis(
        data=df_T1_survey,
        out_dir=t1_out_dir,
        label="T1",
        **efa_options,
    )
    # Extract T1 scores for later comparison
    data_T1_scores = results_T1.get('scores')
    if data_T1_scores is None:
        log_message("Could not retrieve T1 factor scores. Aborting comparison.", "ERROR")
        sys.exit(1)

    # --- Run EFA on T2 and Calculate Congruence ---
    if args.run_congruence:
        if t2_out_dir:
            # Only the T2 result dict is needed; the fitted T2 model is unused.
            _, results_T2 = run_factor_analysis(
                data=df_T2_survey,  # Use only common survey vars
                out_dir=t2_out_dir,
                label="T2",
                **efa_options,
            )
            # Calculate and analyze congruence
            if 'loadings' in results_T1 and 'loadings' in results_T2:
                congruence_df = calculate_congruence(
                    results_T1['loadings'], results_T2['loadings']
                )
                analyze_congruence(
                    congruence_df,
                    args.cong_high,
                    args.cong_low,
                    args.cong_off_diag,
                    comp_out_dir,
                )
            else:
                log_message("Could not calculate congruence, missing loading matrices.", "WARN")
        else:
            log_message("T2 output directory not created, skipping T2 EFA run.", "WARN")

    # --- Project T2 onto T1 and Compare Scores ---
    diff_scores = project_and_compare_scores(
        fa_T1=fa_T1,
        data_T1_scores=data_T1_scores,
        data_T2=df_T2_survey,  # Use only common survey vars
        out_dir=comp_out_dir
    )

    # --- Analyze Group Changes ---
    # Build the region map from the same country column that is handed to
    # analyze_group_changes, so the two cannot drift apart when a caller
    # supplies a non-default ``country_col``.
    region_map = {
        country: 'Western' if country in WESTERN_COUNTRIES else 'Non-Western'
        for country in df_T1_full[args.country_col].unique()
    }
    analyze_group_changes(
        diff_scores=diff_scores,
        t1_aggregated_data=df_T1_full,  # Pass full T1 with demographics
        country_col=args.country_col,
        region_map=region_map,
        out_dir=comp_out_dir
    )

    # --- Analyze Raw Variable Change ---
    if args.analyze_raw:
        analyze_raw_variable_change(
            t1_agg_path=t1_path,
            t2_agg_path=t2_path,
            variable=args.raw_var,
            country_col=args.country_col,
            out_dir=comp_out_dir
        )

    log_message("--- EFA Comparison Pipeline Finished ---")


In [None]:
# =============================================================================
# Entry Point with Default Values (No Parser)
# =============================================================================

# Plain attribute container used in place of a parsed command line
class Args:
    """Default run configuration, exposing the attributes that ``main`` reads.

    Every attribute is copied from a module-level constant; the path
    constants are wrapped in ``str()``.
    """

    def __init__(self):
        # Input files
        self.t1_file, self.t2_file, self.var_file = (
            str(T1_AGGREGATED_PATH),
            str(T2_AGGREGATED_PATH),
            str(VARIABLE_INFO_PATH),
        )

        # EFA parameters
        self.n_factors, self.rotation = N_FACTORS, ROTATION
        self.method, self.use_smc = FACTOR_METHOD, USE_SMC

        # Congruence comparison options
        self.run_congruence = RUN_CONGRUENCE_ANALYSIS
        self.cong_high, self.cong_low = (
            CONGRUENCE_THRESHOLD_HIGH,
            CONGRUENCE_THRESHOLD_LOW,
        )
        self.cong_off_diag = CONGRUENCE_OFF_DIAGONAL_MAX

        # Group analysis options
        self.country_col = COUNTRY_COLUMN

        # Raw variable analysis options
        self.analyze_raw, self.raw_var = ANALYZE_RAW_VARIABLE, RAW_VARIABLE_TO_ANALYZE

        # Output options
        self.output_root = str(OUTPUT_DIR_ROOT)

# Run the pipeline with the default configuration. This is the top-level
# boundary: any failure is logged as CRITICAL, classified by exception type,
# and the process then exits with status 1.
try:
    args = Args()
    main(args)
except Exception as e:
    if isinstance(e, FileNotFoundError):
        log_message(f"Error: Input file not found. {e}", "CRITICAL")
    elif isinstance(e, (ValueError, TypeError, KeyError)):
        log_message(f"Error: Data validation or configuration issue. {e}", "CRITICAL")
    else:
        log_message(f"An unexpected error occurred: {e}", "CRITICAL")
        # import traceback # Uncomment for detailed traceback
        # traceback.print_exc()
    sys.exit(1)

[INFO] --- Starting EFA Comparison Pipeline (T1 vs T2) ---
Output directory ensured: output\efa_comparison_T1_vs_T2
Output directory ensured: output\efa_comparison_T1_vs_T2\T1_EFA_results
Output directory ensured: output\efa_comparison_T1_vs_T2\T2_EFA_results
[INFO] --- Loading and Preparing Data (T1 & T2) ---
[INFO] Loaded T1: (2250, 102), T2: (2250, 104), VarInfo: (227, 4)
[INFO] Found 20 common Binary and 77 common Ordinal variables.
[INFO] Indices match between T1 and T2.
[INFO] Data loading and preparation complete.
[INFO] --- Running Factor Analysis on T1 ---
[INFO]  Bartlett's Test: Chi2=115635.21, p=0
[INFO]  KMO Test: Overall=0.943
[INFO]  EFA model fitted successfully.
[INFO]  EFA results saved to: output\efa_comparison_T1_vs_T2\T1_EFA_results
[INFO] --- Running Factor Analysis on T2 ---
[INFO]  Bartlett's Test: Chi2=inf, p=0
[INFO]  KMO Test: Overall=0.848
[INFO]  EFA model fitted successfully.


  statistic = -np.log(corr_det) * (n - 1 - (2 * p + 5) / 6)


[INFO]  EFA results saved to: output\efa_comparison_T1_vs_T2\T2_EFA_results
[INFO] --- Calculating Tucker's Congruence Coefficients ---
[INFO] Congruence calculation complete.
[INFO] --- Analyzing Congruence ---
Congruence Matrix (T1 vs T2):
          Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
Factor_1    -0.175     0.005     0.729     0.090     0.103
Factor_2     0.570     0.110    -0.167     0.040     0.120
Factor_3     0.224    -0.070     0.121    -0.273    -0.140
Factor_4    -0.245     0.130     0.401     0.598     0.258
Factor_5     0.011     0.030    -0.059     0.427    -0.112

Best T2 Match for each T1 Factor:
         Best_Matching_T2_Factor  Congruence_Coefficient
Factor_1                Factor_3                   0.729
Factor_2                Factor_1                   0.570
Factor_3                Factor_1                   0.224
Factor_4                Factor_4                   0.598
Factor_5                Factor_4                   0.427
[INFO] Diagonal congruence 

## Output with llama2:13b-chat-fp16
```python
Paired t-test results (T1 vs Projected T2):
          T_Statistic  P_Value
Factor                        
Factor_1      42.2702      0.0
Factor_2     -15.5940      0.0
Factor_3      54.5968      0.0
Factor_4      20.8409      0.0
Factor_5      19.0582      0.0
[INFO]  Calculating Cohen's d for score differences...
 Cohen's d results:
          Mean_Difference  Std_Dev_Difference  Cohen_d
Factor                                                
Factor_1          -0.9535              1.0700  -0.8911
Factor_2           0.4395              1.3368   0.3288
Factor_3          -1.1983              1.0411  -1.1510
Factor_4          -0.4261              0.9698  -0.4394
Factor_5          -0.4183              1.0411  -0.4018
[INFO]  Score comparison complete.
[INFO] --- Analyzing Group Changes in Factor Scores ---
[INFO]  Calculated mean factor change by country.

 Mean Factor Score Change by Country (Projected T2 - T1):
                 Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
B_COUNTRY_ALPHA                                                  
AND               -0.0062    0.5755   -2.5958    1.1834   -0.2685
ARG               -1.1583    0.8554   -1.5766    0.7305   -0.3605
ARM               -1.6693    0.7156   -2.1888   -0.6251   -0.2799
AUS                0.7707    1.1320   -1.5405    0.4829   -0.3449
BGD               -1.7075   -0.6219   -2.0154   -1.7092   -0.7582
[INFO]  Calculated mean factor change by region.

 Mean Factor Score Change by Region (Projected T2 - T1):
             Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
region                                                       
Non-Western   -1.1312    0.4091   -1.1499   -0.5645   -0.4398
Western        0.5208    0.6917   -1.5996    0.7222   -0.2398
[INFO] Group change analysis complete.
[INFO] --- Analyzing Raw Change for Variable: Q165P ---
[INFO]  Calculated mean raw change for Q165P by country.

 Mean Raw Score Change for Q165P by Country (T2 - T1):
B_COUNTRY_ALPHA
CHN    0.6667
HKG   -0.6111
NLD    0.5556
SGP   -0.5278
CZE    0.5000
        ...  
IRQ   -0.0556
AND   -0.0441
NZL    0.0312
MNG    0.0139
PAK    0.0000
Name: Q165P_diff, Length: 66, dtype: float64
[INFO] --- EFA Comparison Pipeline Finished ---
```

## Output with gemma3:12b-it-fp16
```python
[INFO] --- Starting EFA Comparison Pipeline (T1 vs T2) ---
Output directory ensured: output\efa_comparison_T1_vs_T2
Output directory ensured: output\efa_comparison_T1_vs_T2\T1_EFA_results
Output directory ensured: output\efa_comparison_T1_vs_T2\T2_EFA_results
[INFO] --- Loading and Preparing Data (T1 & T2) ---
[INFO] Loaded T1: (2250, 102), T2: (2250, 104), VarInfo: (227, 4)
[INFO] Found 20 common Binary and 77 common Ordinal variables.
[INFO] Indices match between T1 and T2.
[INFO] Data loading and preparation complete.
[INFO] --- Running Factor Analysis on T1 ---
[INFO]  Bartlett's Test: Chi2=115635.21, p=0
[INFO]  KMO Test: Overall=0.943
[INFO]  EFA model fitted successfully.
[INFO]  EFA results saved to: output\efa_comparison_T1_vs_T2\T1_EFA_results
[INFO] --- Running Factor Analysis on T2 ---
[INFO]  Bartlett's Test: Chi2=688519.57, p=0
[INFO]  KMO Test: Overall=0.978
[INFO]  EFA model fitted successfully.
[INFO]  EFA results saved to: output\efa_comparison_T1_vs_T2\T2_EFA_results
[INFO] --- Calculating Tucker's Congruence Coefficients ---
[INFO] Congruence calculation complete.
[INFO] --- Analyzing Congruence ---
Congruence Matrix (T1 vs T2):
          Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
Factor_1    -0.109     0.694     0.276    -0.027    -0.097
Factor_2     0.249    -0.046     0.076     0.112     0.515
Factor_3     0.403     0.105    -0.403    -0.049     0.077
Factor_4    -0.470     0.449     0.356     0.186     0.140
Factor_5    -0.295    -0.031    -0.041     0.039     0.113

Best T2 Match for each T1 Factor:
         Best_Matching_T2_Factor  Congruence_Coefficient
Factor_1                Factor_2                   0.694
Factor_2                Factor_5                   0.515
Factor_3                Factor_1                   0.403
Factor_4                Factor_2                   0.449
Factor_5                Factor_5                   0.113
[INFO] Diagonal congruence check: 0 factors >= 0.90, 5 factors < 0.85
[INFO] Off-diagonal check: Max absolute value = 0.694 (Threshold < 0.30)
[WARN] Factor structure congruence is questionable. Check coefficients.
[INFO] --- Projecting T2 onto T1 Factors and Comparing Scores ---
[INFO]  T2 data projected onto T1 factors.
[INFO]  Running paired t-tests (T1 vs Projected T2)...
 Paired t-test results (T1 vs Projected T2):
          T_Statistic  P_Value
Factor                        
Factor_1      55.4551      0.0
Factor_2      30.2494      0.0
Factor_3       5.7951      0.0
Factor_4      16.7415      0.0
Factor_5     -48.2455      0.0
[INFO]  Calculating Cohen's d for score differences...
 Cohen's d results:
          Mean_Difference  Std_Dev_Difference  Cohen_d
Factor                                                
Factor_1          -1.0370              0.8870  -1.1691
Factor_2          -0.8555              1.3415  -0.6377
Factor_3          -0.1557              1.2747  -0.1222
Factor_4          -0.4220              1.1957  -0.3529
Factor_5           2.3773              2.3373   1.0171
[INFO]  Score comparison complete.
[INFO] --- Analyzing Group Changes in Factor Scores ---
[INFO]  Calculated mean factor change by country.

 Mean Factor Score Change by Country (Projected T2 - T1):
                 Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
B_COUNTRY_ALPHA                                                  
AND               -0.0684   -0.7371   -1.5215    1.1659    2.3771
ARG               -1.1405   -0.5816   -0.9056    0.9384    2.7462
ARM               -1.6753   -0.5820   -1.2433   -0.6108    2.3303
AUS                0.1844   -0.0354   -0.2090   -0.0501    1.4290
BGD               -1.4730   -1.9649   -1.0228   -1.3853    2.0961
[INFO]  Calculated mean factor change by region.

 Mean Factor Score Change by Region (Projected T2 - T1):
             Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
region                                                       
Non-Western   -1.1677   -0.9052   -0.1346   -0.5100    2.4269
Western        0.0474   -0.4428   -0.3315    0.3077    1.9659
[INFO] Group change analysis complete.
[INFO] --- Analyzing Raw Change for Variable: Q165P ---
[INFO]  Calculated mean raw change for Q165P by country.

 Mean Raw Score Change for Q165P by Country (T2 - T1):
B_COUNTRY_ALPHA
CZE    0.6667
MAC    0.6111
NLD    0.6111
VNM    0.4571
HKG   -0.4444
        ...  
NGA    0.0000
PHL    0.0000
PAK    0.0000
TUN    0.0000
TJK    0.0000
Name: Q165P_diff, Length: 66, dtype: float64
[INFO] --- EFA Comparison Pipeline Finished ---
```

## Output with phi4:14b-fp16
```python
[INFO]  EFA results saved to: output\efa_comparison_T1_vs_T2\T2_EFA_results
[INFO] --- Calculating Tucker's Congruence Coefficients ---
[INFO] Congruence calculation complete.
[INFO] --- Analyzing Congruence ---
Congruence Matrix (T1 vs T2):
          Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
Factor_1    -0.175     0.005     0.729     0.090     0.103
Factor_2     0.570     0.110    -0.167     0.040     0.120
Factor_3     0.224    -0.070     0.121    -0.273    -0.140
Factor_4    -0.245     0.130     0.401     0.598     0.258
Factor_5     0.011     0.030    -0.059     0.427    -0.112

Best T2 Match for each T1 Factor:
         Best_Matching_T2_Factor  Congruence_Coefficient
Factor_1                Factor_3                   0.729
Factor_2                Factor_1                   0.570
Factor_3                Factor_1                   0.224
Factor_4                Factor_4                   0.598
Factor_5                Factor_4                   0.427
[INFO] Diagonal congruence check: 0 factors >= 0.90, 5 factors < 0.85
[INFO] Off-diagonal check: Max absolute value = 0.729 (Threshold < 0.30)
[WARN] Factor structure congruence is questionable. Check coefficients.
[INFO] --- Projecting T2 onto T1 Factors and Comparing Scores ---
[INFO]  T2 data projected onto T1 factors.
[INFO]  Running paired t-tests (T1 vs Projected T2)...
 Paired t-test results (T1 vs Projected T2):
          T_Statistic  P_Value
Factor                        
Factor_1      48.7597      0.0
Factor_2      12.1358      0.0
Factor_3     -45.0082      0.0
Factor_4      62.5731      0.0
Factor_5     -17.5349      0.0
[INFO]  Calculating Cohen's d for score differences...
 Cohen's d results:
          Mean_Difference  Std_Dev_Difference  Cohen_d
Factor                                                
Factor_1          -0.7620              0.7412  -1.0279
Factor_2          -0.3699              1.4458  -0.2558
Factor_3           1.0205              1.0755   0.9489
Factor_4          -1.1827              0.8966  -1.3192
Factor_5           0.6909              1.8689   0.3697
[INFO]  Score comparison complete.
[INFO] --- Analyzing Group Changes in Factor Scores ---
[INFO]  Calculated mean factor change by country.

 Mean Factor Score Change by Country (Projected T2 - T1):
                 Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
B_COUNTRY_ALPHA                                                  
AND                0.0369    0.1814    0.0049    0.0621    0.4506
ARG               -1.0293    0.1997    0.8819   -0.2453    0.6180
ARM               -1.2058   -0.0893    0.1890   -1.4426    0.2852
AUS                0.3299    0.6868    0.9165   -0.7338    0.1932
BGD               -1.1469   -1.4163    0.3345   -2.2322    0.1736
[INFO]  Calculated mean factor change by region.

 Mean Factor Score Change by Region (Projected T2 - T1):
             Factor_1  Factor_2  Factor_3  Factor_4  Factor_5
region                                                       
Non-Western   -0.8706   -0.4471    1.0531   -1.2668    0.7098
Western        0.1391    0.2706    0.7503   -0.4850    0.5336
[INFO] Group change analysis complete.
[INFO] --- Analyzing Raw Change for Variable: Q165P ---
[INFO]  Calculated mean raw change for Q165P by country.

 Mean Raw Score Change for Q165P by Country (T2 - T1):
B_COUNTRY_ALPHA
TWN   -0.9429
CHL   -0.7941
HKG   -0.6667
DEU   -0.6571
SGP   -0.6389
        ...  
TJK    0.0000
TUR    0.0000
TUN    0.0000
UZB    0.0000
ZWE    0.0000
Name: Q165P_diff, Length: 66, dtype: float64
[INFO] --- EFA Comparison Pipeline Finished ---
```

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Overall mean factor-score differences per model, entered by hand from the
# "Mean_Difference" column of the Cohen's d tables printed by the runs above.
named_factors = [
    'F1 - Religious-Traditional',
    'F2 - Institutional Trust',
    'F3 - Democratic Values',
    'F4 - Social Conservatism',
    'F5 - Openness to Diversity',
]
mean_diff_comparison = pd.DataFrame(
    {
        'llama2-13b-chat': [-0.9535, 0.4395, -1.1983, -0.4261, -0.4183],
        'gemma3-12b-it': [-1.0370, -0.8555, -0.1557, -0.4220, 2.3773],
        'phi4-14b': [-0.7620, -0.3699, 1.0205, -1.1827, 0.6909],
    },
    index=named_factors,
)

# Summary row: per-model mean of the absolute differences over the five factors
mean_diff_comparison.loc['Average_Abs'] = mean_diff_comparison.abs().mean()

# Show the table
print("Mean Factor Score Differences by Model (T2 - T1):")
print(mean_diff_comparison.round(4))

# Output locations; creating the nested visualizations folder also creates
# its parent.
output_dir = "output"
viz_dir = f"{output_dir}/visualizations"
os.makedirs(viz_dir, exist_ok=True)

# Persist the table next to the visualizations folder
mean_diff_comparison.to_csv(f"{output_dir}/mean_factor_score_diff_by_model.csv")

# 1. Bar chart for each factor across models
# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# previously left the figure created here empty and ignored its size. The
# Axes is therefore created up front and passed in explicitly.
fig, ax = plt.subplots(figsize=(12, 8))

# Drop the Average_Abs row for the factor comparison
factor_data = mean_diff_comparison.drop('Average_Abs')
# Transposed so that models sit on the x-axis and factors form the legend
factor_data.T.plot(kind='bar', width=0.8, ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Mean Factor Score Differences by Model (T2 - T1)', fontsize=14)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Mean Difference (T2 - T1)', fontsize=12)
ax.legend(title='Factor', fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/factor_differences_by_model_bar.png", dpi=300)
plt.close(fig)

# 2. Heatmap of differences (again without the Average_Abs row)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(factor_data, annot=True, cmap='RdBu_r',
            center=0, fmt='.2f', linewidths=.5, ax=ax)
ax.set_title('Heatmap of Mean Factor Score Differences (T2 - T1)', fontsize=14)
fig.tight_layout()
fig.savefig(f"{viz_dir}/factor_differences_heatmap.png", dpi=300)
plt.close(fig)

# 3. Radar/Spider chart for absolute differences
def radar_chart(df, title):
    """Draw one closed outline per column of ``df`` on a polar axis.

    Each index label of ``df`` becomes a spoke and each column is plotted as
    a labelled, lightly filled polygon. Returns the newly created figure;
    saving and closing it is left to the caller.
    """
    spoke_labels = list(df.index)
    n_spokes = len(spoke_labels)

    # Every cell of the frame, used further down to size the radial axis
    all_cells = df.values.flatten().tolist()

    # Evenly spaced spoke angles; the first one is appended again so that
    # each outline ends where it started.
    spoke_angles = [k / float(n_spokes) * 2 * np.pi for k in range(n_spokes)]
    closed_angles = spoke_angles + spoke_angles[:1]

    # Set up the polar axes
    fig = plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    # One labelled tick per spoke
    plt.xticks(spoke_angles, spoke_labels, size=12)

    # Radial tick labels along the 0-degree direction
    ax.set_rlabel_position(0)
    largest = max(all_cells)
    radial_ticks = [0.5, 1.0, 1.5, 2.0, 2.5]
    plt.yticks(radial_ticks, [str(tick) for tick in radial_ticks],
               color="grey", size=10)
    # At least 2.5, or 10% above the largest value when that is bigger
    plt.ylim(0, max(2.5, largest * 1.1))

    # One outline plus translucent fill per column
    for column in df.columns:
        radii = df[column].values.tolist()
        radii = radii + radii[:1]
        ax.plot(closed_angles, radii, linewidth=2, linestyle='solid', label=column)
        ax.fill(closed_angles, radii, alpha=0.1)

    # Legend and title
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.title(title, size=15, y=1.1)
    return fig

# Radar chart of the per-factor absolute differences (summary row excluded)
abs_diff = mean_diff_comparison.drop('Average_Abs').abs()
radar_fig = radar_chart(abs_diff, 'Absolute Factor Score Differences by Model')
radar_fig.savefig(f"{viz_dir}/factor_differences_radar.png", dpi=300, bbox_inches='tight')
plt.close()

# 4. Average absolute difference comparison (bar chart), smallest value first
avg_abs_diff = mean_diff_comparison.loc['Average_Abs'].sort_values()
plt.figure(figsize=(10, 6))
avg_abs_diff.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Average Absolute Factor Score Difference by Model', fontsize=14)
plt.xlabel('Model', fontsize=12)
plt.ylabel('Average Absolute Difference', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(f"{viz_dir}/average_abs_difference_by_model.png", dpi=300)
plt.close()

print(f"Visualizations saved to {viz_dir}")


Mean Factor Score Differences by Model (T2 - T1):
                            llama2-13b-chat  gemma3-12b-it  phi4-14b
F1 - Religious-Traditional          -0.9535        -1.0370   -0.7620
F2 - Institutional Trust             0.4395        -0.8555   -0.3699
F3 - Democratic Values              -1.1983        -0.1557    1.0205
F4 - Social Conservatism            -0.4261        -0.4220   -1.1827
F5 - Openness to Diversity          -0.4183         2.3773    0.6909
Average_Abs                          0.6871         0.9695    0.8052
Visualizations saved to output/visualizations


<Figure size 1200x800 with 0 Axes>

This code creates four different visualizations:

Bar Chart: Shows the mean difference for each factor across all models, allowing you to see which factors have the largest differences and how models compare on specific factors.

Heatmap: Provides a color-coded view of all differences, making it easy to spot patterns across models and factors.

Radar/Spider Chart: Displays the absolute differences in a radial format, which can help visualize which model has the smallest overall deviation across all factors.

Average Absolute Difference Bar Chart: Shows which model has the lowest average absolute difference across all factors, providing a simple metric for overall model performance.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Make sure the visualization folder exists
output_dir = "output"
viz_dir = f"{output_dir}/visualizations"
os.makedirs(viz_dir, exist_ok=True)

# Mean factor-score change by region for each model, entered by hand from the
# "Mean Factor Score Change by Region" tables printed by the runs above.
# Rows are factors; columns are the two regions.
factor_ids = ['Factor_1', 'Factor_2', 'Factor_3', 'Factor_4', 'Factor_5']

# Model: llama2-13b-chat
llama2_regions = pd.DataFrame(
    {
        'Non-Western': [-1.1312, 0.4091, -1.1499, -0.5645, -0.4398],
        'Western': [0.5208, 0.6917, -1.5996, 0.7222, -0.2398],
    },
    index=factor_ids,
)

# Model: gemma3-12b-it
gemma3_regions = pd.DataFrame(
    {
        'Non-Western': [-1.1677, -0.9052, -0.1346, -0.5100, 2.4269],
        'Western': [0.0474, -0.4428, -0.3315, 0.3077, 1.9659],
    },
    index=factor_ids,
)

# Model: phi4-14b
phi4_regions = pd.DataFrame(
    {
        'Non-Western': [-0.8706, -0.4471, 1.0531, -1.2668, 0.7098],
        'Western': [0.1391, 0.2706, 0.7503, -0.4850, 0.5336],
    },
    index=factor_ids,
)

# Lookup from model name to its region table
region_data = dict(
    zip(
        ('llama2-13b-chat', 'gemma3-12b-it', 'phi4-14b'),
        (llama2_regions, gemma3_regions, phi4_regions),
    )
)

# 1. Grouped bar chart for each factor by region and model
for factor in ('Factor_1', 'Factor_2', 'Factor_3', 'Factor_4', 'Factor_5'):
    plt.figure(figsize=(12, 6))

    # Long-format table for this factor: one row per (model, region) pair,
    # with Western listed before Non-Western for every model.
    plot_df = pd.DataFrame(
        [
            {
                'Model': model_name,
                'Region': region_name,
                'Mean Difference': frame[region_name][factor],
            }
            for model_name, frame in region_data.items()
            for region_name in ('Western', 'Non-Western')
        ],
        columns=['Model', 'Region', 'Mean Difference'],
    )

    # Bars grouped by model, coloured by region
    sns.barplot(x='Model', y='Mean Difference', hue='Region', data=plot_df, palette='Set2')

    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.title(f'Mean {factor} Score Change by Region and Model (T2 - T1)', fontsize=14)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Mean Difference (T2 - T1)', fontsize=12)
    plt.legend(title='Region')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f"{viz_dir}/{factor}_by_region_model.png", dpi=300)
    plt.close()

# 2. Heatmap for each model showing region differences
for model, df in region_data.items():
    heatmap_path = f"{viz_dir}/{model}_region_heatmap.png"
    plt.figure(figsize=(10, 6))
    sns.heatmap(df, annot=True, cmap='RdBu_r', center=0, fmt='.2f', linewidths=.5)
    plt.title(f'Heatmap of Mean Factor Score Changes by Region for {model} (T2 - T1)', fontsize=14)
    plt.tight_layout()
    plt.savefig(heatmap_path, dpi=300)
    plt.close()

# 3. Calculate Western vs Non-Western difference for each model and factor
# (positive values mean the Western change is larger than the Non-Western one)
west_nonwest_diff = {
    model_name: frame['Western'] - frame['Non-Western']
    for model_name, frame in region_data.items()
}

west_nonwest_diff_df = pd.DataFrame(west_nonwest_diff)

# Plot the Western vs Non-Western difference
# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# previously left the figure created here empty and ignored its size. The
# Axes is therefore created up front and passed in explicitly.
fig, ax = plt.subplots(figsize=(12, 8))
west_nonwest_diff_df.plot(kind='bar', width=0.8, ax=ax)
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Difference Between Western and Non-Western Regions by Model (T2 - T1)', fontsize=14)
ax.set_xlabel('Factor', fontsize=12)
ax.set_ylabel('Western - Non-Western Difference', fontsize=12)
ax.legend(title='Model', fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/west_nonwest_diff_by_model.png", dpi=300)
plt.close(fig)

# 4. Radar chart comparing Western and Non-Western patterns for each model
def radar_chart(df, title):
    """Draw one closed outline per column of ``df`` on a signed polar axis.

    Each index label of ``df`` becomes a spoke and each column is plotted as
    a labelled, lightly filled polygon. The radial range is symmetric around
    zero so negative values can be shown. Returns the newly created figure;
    saving and closing it is left to the caller.
    """
    spoke_labels = list(df.index)
    n_spokes = len(spoke_labels)

    # Evenly spaced spoke angles; the first one is appended again so that
    # each outline ends where it started.
    spoke_angles = [k / float(n_spokes) * 2 * np.pi for k in range(n_spokes)]
    closed_angles = spoke_angles + spoke_angles[:1]

    # Set up the polar axes
    fig = plt.figure(figsize=(10, 10))
    ax = plt.subplot(111, polar=True)

    # One labelled tick per spoke
    plt.xticks(spoke_angles, spoke_labels, size=12)

    # Radial tick labels along the 0-degree direction
    ax.set_rlabel_position(0)
    lowest, highest = df.values.min(), df.values.max()
    max_val = max(abs(lowest), abs(highest))
    radial_ticks = [-2, -1, 0, 1, 2]
    plt.yticks(radial_ticks, [str(tick) for tick in radial_ticks], color="grey", size=10)
    # Symmetric limits: at least +/-2, or 10% beyond the largest magnitude
    radial_limit = max(2, max_val * 1.1)
    plt.ylim(-radial_limit, radial_limit)

    # One outline plus translucent fill per column
    for column in df.columns:
        radii = df[column].values.tolist()
        radii = radii + radii[:1]
        ax.plot(closed_angles, radii, linewidth=2, linestyle='solid', label=column)
        ax.fill(closed_angles, radii, alpha=0.1)

    # Legend and title
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.title(title, size=15, y=1.1)
    return fig

# One radar chart per model, comparing the two regions
for model, df in region_data.items():
    radar_fig = radar_chart(df, f'Mean Factor Score Changes by Region for {model}')
    radar_fig.savefig(f"{viz_dir}/{model}_region_radar.png", dpi=300, bbox_inches='tight')
    plt.close(radar_fig)

# 5. Combined visualization showing average absolute difference by region and model
avg_abs_diff = {
    model_name: {
        'Western': frame['Western'].abs().mean(),
        'Non-Western': frame['Non-Western'].abs().mean(),
    }
    for model_name, frame in region_data.items()
}

# Rows are models after the transpose; 'Overall' is the unweighted mean of
# the two regional averages, and models are ordered by it (smallest first).
avg_abs_diff_df = pd.DataFrame(avg_abs_diff).T
avg_abs_diff_df['Overall'] = (avg_abs_diff_df['Western'] + avg_abs_diff_df['Non-Western']) / 2
avg_abs_diff_df = avg_abs_diff_df.sort_values('Overall')

# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# previously left the figure created here empty and ignored its size. The
# Axes is therefore created up front and passed in explicitly.
fig, ax = plt.subplots(figsize=(12, 6))
avg_abs_diff_df[['Western', 'Non-Western', 'Overall']].plot(kind='bar', width=0.8, ax=ax)
ax.set_title('Average Absolute Factor Score Difference by Region and Model', fontsize=14)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Average Absolute Difference', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(f"{viz_dir}/avg_abs_diff_by_region_model.png", dpi=300)
plt.close(fig)

print(f"Region visualizations saved to {viz_dir}")


Region visualizations saved to output/visualizations


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

The code above creates several visualizations to help understand the regional differences:

1. Factor-specific bar charts: For each factor, shows how the Western and Non-Western regions differ across models.

2. Model-specific heatmaps: For each model, displays a heatmap of factor changes by region.

3. Western vs Non-Western difference chart: Shows the gap between Western and Non-Western regions for each factor and model.

4. Radar charts: For each model, displays the pattern of factor changes across regions in a radial format.

5. Average absolute difference chart: Compares models based on their average absolute difference in Western and Non-Western regions.

These visualizations help us answer three important questions:

1. Which models show the largest regional differences?
2. Which factors show consistent regional patterns across models?
3. Do certain models capture regional variations in the ground truth data better than others?

Note: Details presented in final report.