# Evaluate Results
## Imports and Settings

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.special
import matplotlib.pyplot as plt
from matplotlib import colormaps
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.special import expit
import os
from matplotlib import rcParams, font_manager
import matplotlib.font_manager as fm
from pyfonts import load_font
from sklearn.decomposition import PCA
from itertools import combinations

from dotenv import load_dotenv

# Make the variables declared in a .env file available through the environment
load_dotenv()

# Directories configured through the environment. No fallback value is
# supplied, so a variable that is not set yields None.
RESULTS_DIR = os.environ.get('RESULTS_DIR')
RESULTS_ALL_BEST_DIR = os.environ.get('RESULTS_ALL_BEST_DIR')
FIGURES_DIR = os.environ.get('FIGURES_DIR')

# Parent of the directory the notebook is executed from
ROOT_DIR = os.path.join(os.path.abspath(""), os.pardir)

# Directory the processed answers are read from.
# Alternatives that were used previously:
#   os.path.join(ROOT_DIR, 'data', 'processed')
#   os.path.join(ROOT_DIR, 'results', 'all_best_v2_un')
PROCESSED_ANSWERS_DIR = RESULTS_ALL_BEST_DIR

In [None]:
# Copies of the stock colormaps that draw masked / NaN cells in gray
coolwarm_nan = colormaps["coolwarm"].with_extremes(bad="gray")
viridis_nan = colormaps["viridis"].with_extremes(bad="gray")

# Global seaborn look: white background, slightly reduced font size
theme_settings = {
    "context": "notebook",
    "style": "white",
    "font": "sans-serif",
    "font_scale": 0.9,
}
sns.set_theme(**theme_settings)

# Wider console output when printing DataFrames
pd.options.display.max_colwidth = 90
pd.options.display.width = 500

In [None]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def _register_font(font_path):
    """Register the font file at ``font_path`` with matplotlib's font manager.

    Returns the ``FontProperties`` object created for the file.
    """
    properties = FontProperties(fname=font_path)
    fm.fontManager.addfont(properties.get_file())
    return properties


# Font Awesome 6, regular ("non solid") glyphs
non_solid_font_path = f'{FIGURES_DIR}/fonts/FontAwesome6NonSolid.otf'
non_solid_font_properties = _register_font(non_solid_font_path)

# Font Awesome 6, solid glyphs
solid_font_path = f'{FIGURES_DIR}/fonts/FontAwesome6Solid.otf'
solid_font_properties = _register_font(solid_font_path)

# Font family list for all figures: DejaVu Sans first, then the two icon fonts.
# The icon fonts are referenced by the family names embedded in the font files;
# renaming the `fm.fontManager.ttflist` entries (to 'Non Solid FA' / 'Solid FA')
# was an earlier approach and is no longer used.
plt.rcParams['font.family'] = [
    'DejaVu Sans',
    'Font Awesome 6 Free Regular',
    'Font Awesome 6 Free',
]

In [None]:
# # Load the Symbola font
# font = load_font(
#     font_url="https://github.com/ChiefMikeK/ttf-symbola/blob/master/Symbola.ttf?raw=true"
# )
# 
# fm.fontManager.addfont(font.get_file())
# 
# # Verify that the fonts have been added
# font_list = [f.name for f in fm.fontManager.ttflist]
# print("DejaVu Sans" in font_list)        # Should return True
# print("Symbola" in font_list)   # Should return True
# 
# # 'Segoe UI Emoji' is a proprietary font provided by Microsoft and is only included with Windows operating systems.
# plt.rcParams['font.family'] = ['DejaVu Sans', 'Symbola']

## Setup and Data Loading

In [None]:
# Read the processed answers table from the configured directory
answers_csv_path = os.path.join(PROCESSED_ANSWERS_DIR, 'answers_processed.csv')
answers = pd.read_csv(answers_csv_path)

In [None]:
# Keep the bare model name in `model_original`, then make `model` specific to
# the prompting language: "<model>-<language_code>".
answers['model_original'] = answers['model']
answers['model'] = answers['model_original'].add('-').add(answers['language_code'])

## Data Preprocessing
### Data Overview

In [None]:
# how many answers, models, and topics are in this dataframe?
# print(
#     f"answers: {len(answers)}\n"
#     + f"models (en and zh): {len(answers.model.unique())}, models_original: {len(answers.model_original.unique())}\n"
#     + f"topics (en and zh): {len(answers.topic.unique())}, topics_idx: {len(answers.topic_idx.unique())}\n"
#     + f"question_idx: {len(answers.question_idx.unique())}\n"
#     f"prompt_template_idx (0 is English, 1 Chinese): {len(answers.prompt_template_idx.unique())}"
# )

### Merging Person Descriptions

In [None]:
# Already added in preprocessing...
# # Add person descriptions
# person_descriptions_path = os.path.join(ROOT_DIR, 'docs', 'topics', 'v2.0_people_summaries.csv')
# person_descriptions = pd.read_csv(person_descriptions_path)
# 
# # Merge descriptions into answers DataFrame
# answers = answers.merge(person_descriptions, on='name-en', how='left', suffixes=('', '_extra'))
# 
# # Drop any additional columns that may have been added during the merge
# answers = answers.drop(columns=[col for col in answers.columns if col.endswith('_extra')])

### Comparing Mean Scores Between Languages

In [None]:
# Mean score per prompting language
mean_scores = answers.groupby('language')['score'].mean()

# Every unordered pair of languages
language_pairs = combinations(mean_scores.index, 2)

# One record per pair: mean of the first language minus mean of the second
diffs = [
    {
        'Language 1': first_language,
        'Language 2': second_language,
        'Mean Score Difference': mean_scores[first_language] - mean_scores[second_language],
    }
    for first_language, second_language in language_pairs
]

# Tabulate and display the pairwise differences
diffs_df = pd.DataFrame(diffs)
print(diffs_df)

### Average Scores per Person

In [None]:
# Mean score per person (keyed by English name), lowest first
average_scores_per_person = (
    answers.groupby('name-en')['score']
    .mean()
    .sort_values()
)
average_scores_per_person

## Exploratory Data Analysis

### Distribution of Answers per Question

In [None]:
# Number of answers (non-null `model` entries) recorded for each question index
question_counts = answers.groupby("question_idx")["model"].count()
question_counts = question_counts.rename("number of answers per question_idx")

# Histogram: how many question indices received a given number of answers
ax = sns.histplot(question_counts, discrete=True)
ax.set_xlabel('Number of Answers per Question Index')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Answers per Question Index')
plt.show()

### Answers per Model

In [None]:
# Number of answers (non-null `question_idx` entries) per model
model_counts = answers.groupby('model')['question_idx'].count().rename('count')

# Bars are drawn from the most to the least frequently answering model
models_by_count = model_counts.sort_values(ascending=False).index

plt.figure(figsize=(10, 8))
sns.barplot(x=model_counts.index, y=model_counts, order=models_by_count)

# Labels are set after plotting so they replace the ones seaborn derives from the data
plt.title('Number of Answers per Model')
plt.xlabel('Model')
plt.ylabel('Number of Answers')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

### Specific responses

In [None]:
# topic_to_test = 'Shah Rukh Khan'
# all_models = answers['model'].unique()
# model_to_test = [model for model in all_models if model.endswith('-fr')]
# hits = answers[(answers['short_name'] == topic_to_test) & (answers['model'].isin(model_to_test))]
# for i, hit in hits.iterrows():
#     print(hit['model'])
#     print(hit['stage_1_response'])
#     print(hit['stage_1_response_valid'])
#     print(hit['stage_3_response'])
#     print(hit['extracted'])

## Assigning Model Subgroups

In [None]:
# All model identifiers found in the answers. `model_counts` is indexed by the
# `model` column, i.e. "<model>-<language_code>".
all_models = model_counts.index.values

# Maps a group key (language, block, company, ...) to the list of model
# identifiers that belong to the group; filled in by the code below.
model_groups = {}

########################################################
############### Language blocks ########################
########################################################

def get_language(model_name):
    """Return the part of ``model_name`` after its last '-' (the language suffix).

    Returns None when the name contains no '-' at all.
    """
    _, dash, suffix = model_name.rpartition('-')
    return suffix if dash else None

# Every language suffix that occurs among the model identifiers
languages = {get_language(model) for model in all_models}

# One group per language: the models whose identifier ends in that language code
language_models = {
    lang: [model for model in all_models if get_language(model) == lang]
    for lang in languages
}
model_groups.update(language_models)

# Complement groups ("not-<lang>"): every model prompted in any other language
for lang in languages:
    model_groups['not-' + lang] = [model for model in all_models
                                   if get_language(model) != lang]
    
########################################################
############# Geopolitical blocks ######################
########################################################

# Company name fragments assigned to each geopolitical block. A model belongs
# to a block when one of these fragments occurs in its lower-cased identifier.
block_alignment = {
    'united-states': ['anthropic', 'gemini', 'openai', 'llama', 'grok'],
    "chinese": ['ernie', 'qwen', 'deepseek', 'baichuan'],
    "russian": ['gigachat', 'vikhr', 'yandex'],
    'israeli': ['jamba'],
    'arabic': ['jais', 'silma'],
    'european': ['mistral', 'teuken'],
}


def _languages_of(models):
    """Return the set of language suffixes found among ``models``."""
    return {get_language(model) for model in models}


# "<block>" and its complement "not-<block>"
for block, block_companies in block_alignment.items():
    block_members = [
        model for model in all_models
        if any(company in model.lower() for company in block_companies)
    ]
    model_groups[block] = block_members
    model_groups['not-' + block] = [model for model in all_models
                                    if model not in block_members]

# Languages in which at least one United States model was prompted
us_models = model_groups['united-states']
us_languages = _languages_of(us_models)

for block in block_alignment:
    block_members = model_groups[block]
    block_others = model_groups['not-' + block]
    member_languages = _languages_of(block_members)
    other_languages = _languages_of(block_others)

    # Restrict the block-vs-rest comparison to languages that are
    # covered by at least one model on both sides.
    model_groups[block + "-shared"] = [
        model for model in block_members
        if get_language(model) in other_languages
    ]
    model_groups['not-' + block + "-shared"] = [
        model for model in block_others
        if get_language(model) in member_languages
    ]

    if block == 'united-states':
        continue

    # Same restriction, but comparing the block against United States models only.
    shared_with_us = [model for model in block_members
                      if get_language(model) in us_languages]
    model_groups[block + "-shared-us"] = shared_with_us
    shared_languages = _languages_of(shared_with_us)
    model_groups['not-' + block + "-shared-us"] = [
        model for model in us_models
        if get_language(model) in shared_languages
    ]

# For blocks only: harmonize wrt language

########################################################
############# Within block analysis ####################
########################################################


# Each block restricted to its "own" prompting language(s):
# group key -> (block key, accepted language codes)
own_language_groups = {
    'arabic-ar': ('arabic', ('ar',)),
    'israeli-en': ('israeli', ('en',)),
    'russian-ru': ('russian', ('ru',)),
    'united-states-en': ('united-states', ('en',)),
    'european-en-fr-es': ('european', ('en', 'fr', 'es')),
    'chinese-zh': ('chinese', ('zh',)),
}
for group_key, (block_key, language_codes) in own_language_groups.items():
    model_groups[group_key] = [model for model in model_groups[block_key]
                               if get_language(model) in language_codes]

# Per-company groups inside a block, each paired with "the rest of the block":
#   US companies in English      -> "<company>-en" vs "united-states-en-not-<company>"
#   Chinese companies in Chinese -> "<company>-zh" vs "chinese-zh-not-<company>"
company_comparisons = (
    ('united-states-en', 'en', block_alignment['united-states']),
    ('chinese-zh', 'zh', block_alignment['chinese']),
)
for block_group, language_code, companies in company_comparisons:
    for company in companies:
        company_key = company + '-' + language_code
        model_groups[company_key] = [model for model in all_models
                                     if company in model.lower()
                                     and get_language(model) == language_code]
        # Filter the block list instead of converting a set difference to a
        # list: the members are the same, but their order no longer changes
        # from one run to the next.
        company_models = set(model_groups[company_key])
        model_groups[block_group + '-not-' + company] = [
            model for model in model_groups[block_group]
            if model not in company_models
        ]

# Complement of every own-language group: the concatenation of all the
# *other* own-language groups. Built from scratch so that a pre-existing
# "not-..." entry is never silently extended.
blocks_self_language = ['chinese-zh', 'arabic-ar', 'israeli-en',
                        'russian-ru', 'united-states-en',
                        'european-en-fr-es']
for country_language in blocks_self_language:
    model_groups['not-' + country_language] = [
        model
        for other_country_language in blocks_self_language
        if other_country_language != country_language
        for model in model_groups[other_country_language]
    ]

# Define group names for labeling
model_group_names = {
    "fr": "Respondents in French",
    "es": "Respondents in Spanish",
    "ru": "Respondents in Russian",
    "zh": "Respondents in Chinese",
    "ar": "Respondents in Arabic",
    "en": "Respondents in English",

    "not-fr": "Respondents excluding ones in French",
    "not-es": "Respondents excluding ones in Spanish",
    "not-ru": "Respondents excluding ones in Russian",
    "not-zh": "Respondents excluding ones in Chinese",
    "not-ar": "Respondents excluding ones in Arabic",
    "not-en": "Respondents excluding ones in English",

    "chinese-shared": "LLMs by Chinese companies",
    "russian-shared": "LLMs by Russian companies",
    "united-states-shared": "LLMs by United States companies",
    "european-shared": "LLMs by EU companies",
    "arabic-shared": "LLMs by Arabic companies",
    "israeli-shared": "LLMs by Israeli companies",

    "not-chinese-shared": "LLMs excluding Chinese companies",
    "not-russian-shared": "LLMs excluding Russian companies",
    "not-united-states-shared": "LLMs excluding United States companies",
    "not-european-shared": "LLMs excluding EU companies",
    "not-arabic-shared": "LLMs excluding Arabic companies",
    "not-israeli-shared": "LLMs excluding Israeli companies",

    "chinese-shared-us": "LLMs by Chinese companies",
    "russian-shared-us": "LLMs by Russian companies",
    "united-states-shared-us": "LLMs by United States companies",
    "european-shared-us": "LLMs by EU companies",
    "arabic-shared-us": "LLMs by Arabic companies",
    "israeli-shared-us": "LLMs by Israeli companies",


    "not-chinese-shared-us": "LLMs by United States companies",
    "not-russian-shared-us": "LLMs by United States companies",
    "not-european-shared-us": "LLMs by United States companies",
    "not-arabic-shared-us": "LLMs by United States companies",
    "not-israeli-shared-us": "LLMs by United States companies",

    'chinese-zh': "LLMs by Chinese companies (in Chinese)",
    "arabic-ar": "LLMs by Arabic companies in Arabic",
    "israeli-en": "LLMs by Israeli companies in English",
    "russian-ru": "LLMs by Russian companies in Russian",
    "united-states-en": "LLMs by United States companies (in English)",
    "european-en-fr-es": "LLMs by European companies in English, French or Spanish", 
    'not-chinese-zh': "LLMs of non-Chinese companies in their usual prompting language",
    'not-arabic-ar': "LLMs of non-Arabic companies in their usual prompting language",
    'not-israeli-en': "LLMs of non-Israeli companies in their usual prompting language",
    'not-united-states-en': "LLMs of non-US companies in their usual prompting language",
    'not-european-en-fr-es': "LLMs of non-European companies in their usual prompting language",
    'not-russian-ru': "LLMs of non-Russian companies in their usual prompting language",

    "ernie-zh": "Wenxiaoyan (Baidu) in Chinese",
    "chinese-zh-not-ernie": "Chinese LLMs (in Chinese) except Wenxiaoyan",
    "qwen-zh": "Qwen (Alibaba) LLMs in Chinese",
    "chinese-zh-not-qwen": "Chinese LLMs (in Chinese) except Qwen",
    "baichuan-zh": "Baichuan LLMs in Chinese",
    "chinese-zh-not-baichuan": "Chinese LLMs (in Chinese) except Baichuan",
    "deepseek-zh": "Deepseek LLMs in Chinese",
    "chinese-zh-not-deepseek": "Chinese LLMs (in Chinese) except Deepseek",

    "anthropic-en": "Claude (Anthropic) in English",
    "llama-en": "LLaMa (Meta) in English",
    "gemini-en": "Gemini (Google) in English",
    "mistral-en": "Mistral in English",
    "openai-en": "GPT-4o (OpenAI) in English",
    "grok-en": "Grok (xAI) in English",
    "united-states-en-not-anthropic": "US LLMs (in English) except Claude",
    "united-states-en-not-llama": "US LLMs (in English) except LLaMa",
    "united-states-en-not-gemini": "US LLMs (in English) except Gemini",
    "united-states-en-not-mistral": "US LLMs (in English) except Mistral",
    "united-states-en-not-openai": "US LLMs (in English) except GPT-4o",
    "united-states-en-not-grok": "US LLMs (in English) except Grok",
}

# Lists of (group, reference group) keys into `model_groups`.
# "ova" pairs compare a group against its complement, "ove" pairs compare it
# against one fixed reference group.

# Each language vs. all other languages / vs. English
ova_languages = [(code, 'not-' + code) for code in languages]
ove_languages = [(code, 'en') for code in languages if code != 'en']

# Each geopolitical block vs. everyone else / vs. United States models,
# restricted to the languages both sides have in common
ova_blocks = [(f"{name}-shared", f"not-{name}-shared")
              for name in block_alignment]
ove_blocks = [(f"{name}-shared-us", f"not-{name}-shared-us")
              for name in block_alignment
              if name != 'united-states']

# Each block in its own language(s) vs. the other blocks in theirs
ova_blocks_self_lang = [
    (own_language_group, f"not-{own_language_group}")
    for own_language_group in (
        'chinese-zh',
        'arabic-ar',
        'israeli-en',
        'russian-ru',
        'united-states-en',
        'european-en-fr-es',
    )
]

# Each US company (in English) vs. the remaining US models in English
ova_western = [(f"{name}-en", f"united-states-en-not-{name}")
               for name in block_alignment['united-states']]

# Each Chinese company (in Chinese) vs. the remaining Chinese models in Chinese
ova_chinese = [(f"{name}-zh", f"chinese-zh-not-{name}")
               for name in ("qwen", "ernie", "baichuan", "deepseek")]



### Helper Function to Parse Model Groups

In [None]:
def parse_model_group(model_group_str):
    """Resolve a model-group specification to a list of model identifiers.

    Accepted inputs, checked in this order:
      * a list, which is returned unchanged;
      * a key of ``model_groups``, which yields the stored list;
      * a single entry of ``all_models``, which yields a one-element list.

    Raises:
        ValueError: if the argument matches none of the above.
    """
    if isinstance(model_group_str, list):
        resolved = model_group_str
    elif model_group_str in model_groups:
        resolved = model_groups[model_group_str]
    elif model_group_str in all_models:
        resolved = [model_group_str]
    else:
        raise ValueError(f"Model group {model_group_str} not found")
    return resolved

## Preprocessing for Comparison
### Creating Short Names for Topics

In [None]:
def get_short_name(row):
    """Return a compact display name for the person described by ``row``.

    The shorter of ``row["name-en"]`` and ``row["wiki_title-en"]`` is used
    (the wiki title wins a tie). A result longer than 20 characters is
    abbreviated to ``"<head> (<last word>)"``, where ``<head>`` is the text
    before the first ``","``, ``" of "`` or ``" von "`` (tried in that order).
    A long name containing none of these markers is returned unchanged.

    Args:
        row: mapping-like object (e.g. a DataFrame row) holding the string
            fields ``"name-en"`` and ``"wiki_title-en"``.

    Returns:
        The (possibly abbreviated) name as a string.
    """
    name = row["name-en"]
    wiki_title = row["wiki_title-en"]
    short_name = name if len(name) < len(wiki_title) else wiki_title

    if len(short_name) <= 20:
        return short_name

    last_word = short_name.split(" ")[-1]
    # Split on exactly the marker that was detected. Splitting on ' of' /
    # ' von' (without the trailing space) would cut the name at any earlier
    # word that merely starts with "of" / "von".
    for marker in (",", " of ", " von "):
        if marker in short_name:
            head = short_name.split(marker)[0]
            return f"{head} ({last_word})"
    return short_name

# Attach the compact display name to every answer row
answers["short_name"] = answers.apply(get_short_name, axis=1)

## Statistical Comparison
### Function to Compare Scores Between Model Groups


In [None]:
def compare_scores(df, model_group_1_str, model_group_2_str, score_col='score', n_resamples=10000, ci_alpha=0.05):
    """Compare the scores of two model groups topic by topic.

    Only topics (``short_name`` values) answered by both groups are
    considered. The overall mean of the per-topic means is printed for each
    group.

    Args:
        df: answers table with at least the columns ``model``,
            ``short_name``, ``short_desc-en`` and ``score_col``.
        model_group_1_str: first group, in any form accepted by
            ``parse_model_group``; also used as prefix of the output columns.
        model_group_2_str: second group, same conventions.
        score_col: name of the column holding the scores.
        n_resamples: number of bootstrap resamples per topic.
        ci_alpha: the bootstrap interval covers ``1 - ci_alpha``.

    Returns:
        DataFrame indexed by topic with the columns ``<group>_mean``,
        ``score_diff`` (group 1 minus group 2), ``<group>_count`` and
        ``info``. Unless both groups have a single response for the topic, it
        also holds ``score_diff_lb`` / ``score_diff_ub`` (percentile bootstrap
        bounds), ``p`` (two-sided Mann-Whitney U test) and ``<group>_ste``
        (standard error of the mean); these are NaN otherwise.
    """
    # The file only imports `scipy.special`; make sure the `scipy.stats`
    # submodule used below is actually loaded.
    import scipy.stats

    model_group_1 = parse_model_group(model_group_1_str)
    model_group_2 = parse_model_group(model_group_2_str)

    df_grouped_1 = df[df.model.isin(model_group_1)].groupby("short_name")
    df_grouped_2 = df[df.model.isin(model_group_2)].groupby("short_name")
    # Sorted so that row order and the bootstrap draw assigned to each topic
    # do not depend on set iteration order.
    common_topics = sorted(df_grouped_1.groups.keys() & df_grouped_2.groups.keys())

    # Fetch each topic's rows / scores once instead of on every use
    rows_1 = {topic: df_grouped_1.get_group(topic) for topic in common_topics}
    scores_1 = {topic: rows_1[topic][score_col].values for topic in common_topics}
    scores_2 = {topic: df_grouped_2.get_group(topic)[score_col].values for topic in common_topics}

    overall_mean_1 = np.mean([scores_1[topic].mean() for topic in common_topics])
    print(f"Overall mean group {model_group_1_str}: {overall_mean_1}")
    overall_mean_2 = np.mean([scores_2[topic].mean() for topic in common_topics])
    print(f"Overall mean group {model_group_2_str}: {overall_mean_2}")

    stats_per_topic = {}
    for topic in common_topics:
        stats = {}
        group_1_scores = scores_1[topic]
        group_2_scores = scores_2[topic]
        n_1 = group_1_scores.shape[0]
        n_2 = group_2_scores.shape[0]
        info = rows_1[topic]['short_desc-en'].values[0]

        stats[f'{model_group_1_str}_mean'] = group_1_scores.mean()
        stats[f'{model_group_2_str}_mean'] = group_2_scores.mean()
        stats['score_diff'] = group_1_scores.mean() - group_2_scores.mean()

        if n_1 == 1 and n_2 == 1:
            # With a single response per group there is nothing to resample
            # or test; still record the counts and the description so the
            # row can be labelled downstream.
            stats[f'{model_group_1_str}_count'] = n_1
            stats[f'{model_group_2_str}_count'] = n_2
            stats['info'] = info
            stats_per_topic[topic] = stats
            continue

        # Percentile bootstrap interval for the difference in means: each
        # group is resampled with replacement, independently of the other.
        sample_1 = np.random.choice(group_1_scores, size=(n_resamples, n_1), replace=True)
        sample_2 = np.random.choice(group_2_scores, size=(n_resamples, n_2), replace=True)
        bootstrap_diffs = sample_1.mean(axis=1) - sample_2.mean(axis=1)
        stats['score_diff_lb'] = np.percentile(bootstrap_diffs, 100 * ci_alpha / 2)
        stats['score_diff_ub'] = np.percentile(bootstrap_diffs, 100 * (1 - ci_alpha / 2))

        # Mann-Whitney U test
        stats['p'] = scipy.stats.mannwhitneyu(group_1_scores, group_2_scores, alternative='two-sided').pvalue

        stats[f'{model_group_1_str}_ste'] = scipy.stats.sem(group_1_scores)
        stats[f'{model_group_2_str}_ste'] = scipy.stats.sem(group_2_scores)
        stats[f'{model_group_1_str}_count'] = n_1
        stats[f'{model_group_2_str}_count'] = n_2
        stats['info'] = info

        stats_per_topic[topic] = stats

    # One row per topic, one column per statistic
    stats_per_topic = pd.DataFrame(stats_per_topic).T
    return stats_per_topic

In [None]:
def combine_pvalues(p_values, method='fisher'):
    """Combine several p-values into one, ignoring NaN entries.

    Args:
        p_values: array-like of p-values; NaN entries are dropped.
        method: combination method forwarded to
            ``scipy.stats.combine_pvalues``.

    Returns:
        The combined p-value, or NaN when no non-NaN p-value is left.
    """
    # The file only imports `scipy.special`; make sure the `scipy.stats`
    # submodule used below is actually loaded.
    import scipy.stats

    # Accept lists / Series as well as float arrays
    p_values = np.asarray(p_values, dtype=float)
    p_values = p_values[~np.isnan(p_values)]
    if p_values.size == 0:
        # Nothing to combine
        return np.nan
    return scipy.stats.combine_pvalues(p_values, method=method)[1]

In [None]:
def forest_plot_score_diff(df, model_group_1_label, model_group_2_label, overall_mean_diff=None,
                           top_k=None, figsize=(10, 20), figname=None, show_info=False, show=True):
    """Draw a forest plot of per-topic score differences between two groups.

    Each row of ``df`` becomes one point (``score_diff``) with an error bar
    spanning ``score_diff_lb`` to ``score_diff_ub``, labelled with the row
    index and the formatted p-value. A red dashed vertical line marks
    ``overall_mean_diff``. The combined p-value of the plotted rows is printed.

    Args:
        df: per-topic statistics indexed by topic name; the columns
            ``score_diff``, ``score_diff_lb``, ``score_diff_ub`` and ``p``
            are read, plus ``info`` when ``show_info`` is set together with
            ``top_k``.
        model_group_1_label: label of the first group; replaced by its entry
            in ``model_group_names`` when it is a key of that dict.
        model_group_2_label: label of the second group, same convention.
        overall_mean_diff: x position of the reference line; defaults to the
            mean of ``score_diff`` over all rows of ``df``.
        top_k: when given, only the ``top_k`` largest and ``top_k`` smallest
            differences are drawn, separated by a "[... N omitted]" marker.
        figsize: figure size passed to ``plt.figure``.
        figname: when given, the figure is saved under this name inside
            ``FIGURES_DIR``.
        show_info: annotate each row with its ``info`` text (only used in
            the ``top_k`` layout).
        show: call ``plt.show()`` before the figure is closed.
    """
    df = df.sort_values("score_diff", ascending=False)
    if overall_mean_diff is None:
        overall_mean_diff = df['score_diff'].mean()
    if top_k is not None:
        amount_omitted = len(df) - 2 * top_k
        # All-NaN placeholder row that separates the head from the tail
        gap_df = pd.DataFrame({col: np.nan for col in df.columns}, index=['...'])
        df = pd.concat([df.head(top_k), gap_df, df.tail(top_k)])
    else:
        amount_omitted = 0
    if model_group_1_label in model_group_names:
        model_group_1_label = model_group_names[model_group_1_label]
    if model_group_2_label in model_group_names:
        model_group_2_label = model_group_names[model_group_2_label]

    # Error bar lengths (distance below / above the point), as expected by `xerr`
    error_bounds = np.array([df['score_diff'] - df['score_diff_lb'], df['score_diff_ub'] - df['score_diff']])

    plt.figure(figsize=figsize)
    ax = plt.gca()
    sns.pointplot(data=df, y=df.index, x='score_diff', join=False, color='black')
    plt.errorbar(y=df.index, x=df['score_diff'], xerr=error_bounds, fmt='none', ecolor='black', capsize=5)
    
    # "(p-value)" suffix per row; empty where no p-value is available
    p_vals_str = df['p'].apply(lambda x: f"({x:.1e})" if not pd.isnull(x) else '')
    if top_k is None:
        # Full layout: names and p-values become the y tick labels
        plt.yticks(labels=df.index.values + ' ' + p_vals_str, ticks=range(df.shape[0]))
        plt.plot([overall_mean_diff, overall_mean_diff], [-0.5, len(df) - 0.5], color='red', linestyle='--')
    else:
        # Truncated layout: no y ticks; names are written next to the reference
        # line, on its left for the first half of the rows and on its right
        # for the second half.
        ax.get_yaxis().set_ticks([])
        for i, (index, row) in enumerate(df.iterrows()):
            if index == '...':
                plt.annotate(f'[... {amount_omitted} omitted]', xy=(overall_mean_diff, i), xytext=(0, 2), textcoords='offset points', va='center', ha='center')
                continue
            elif i < len(df) / 2:
                plt.annotate(index + ' ' + p_vals_str.loc[index], xy=(overall_mean_diff, i), xytext=(-15, 0), textcoords='offset points', va='center', ha='right')
            else:
                plt.annotate(p_vals_str.loc[index] + ' ' + index, xy=(overall_mean_diff, i), xytext=(15, 0), textcoords='offset points', va='center', ha='left')
            if show_info:
                info = row['info']
                plt.annotate(info, xy=(overall_mean_diff, i), xytext=(195, 0), textcoords='offset points', va='center', ha='left', style='italic')
        # Reference line drawn in two segments, leaving a gap at the placeholder row
        plt.plot([overall_mean_diff, overall_mean_diff], [-0.5, len(df) // 2 - 0.5], color='red', linestyle='--')
        plt.plot([overall_mean_diff, overall_mean_diff], [len(df) // 2 + 0.5, len(df) - 0.5], color='red', linestyle='--')
    
    # Direction hints anchored at the first and at the last row
    plt.annotate(f"{model_group_1_label} rate higher →", xy=(overall_mean_diff, 0), xytext=(0, 15),
                 textcoords='offset points', va='center', ha='center', color='black', fontweight='bold')
    plt.annotate(f"← {model_group_2_label} rate higher", xy=(overall_mean_diff, len(df) - 1), xytext=(0, -15),
                 textcoords='offset points', va='center', ha='center', color='black', fontweight='bold')
    plt.ylabel('')
    plt.xlabel('Score Difference')

    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    ax.tick_params(axis='x', which='both', colors='black', bottom=True, top=True, labeltop=True, labelbottom=True)
    # Add an extra x tick at the reference value: its tick mark is coloured red
    # and its label is hidden.
    x_ticks = plt.xticks(ax.get_xticks().tolist() + [overall_mean_diff])
    for i in range(len(x_ticks[1])):
        tick_loc = x_ticks[1][i].get_position()[0]
        if tick_loc == overall_mean_diff:
            if i < len(x_ticks[0]):
                # NOTE(review): `_apply_params` is a private matplotlib method
                x_ticks[0][i]._apply_params(color='red')
            x_ticks[1][i].set_visible(False)
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f}"))

    # NOTE(review): `df` has already been truncated when `top_k` is set, so the
    # combined p-value then covers only the displayed rows - confirm this is intended.
    p_fishers = combine_pvalues(df['p'].values.astype(float))
    print(f"Combined p-value: {p_fishers:.2e}")

    if figname is not None:
        fig_path = os.path.join(FIGURES_DIR, figname)
        plt.savefig(fig_path, bbox_inches='tight', transparent=True)
        print(f"Saved figure to {fig_path}")

    if show:
        plt.show()
    plt.close()

### Visualize Score Differences Between Model Groups

In [None]:
# Comparison of Chinese LLMs in Chinese vs US LLMs in English
group_1 = 'chinese-zh'
group_2 = 'united-states-en'
print(model_groups[group_1])
print(model_groups[group_2])

stats_for_name_plot = compare_scores(answers, group_1, group_2)

# Shorten two of the longest names for a nicer plot.
# NB: this can create a naming inconsistency if other plots with names are
# shown in the paper.
stats_for_name_plot.rename(
    index={
        'Mahammad Amin Rasulzade': 'M.A. Rasulzade',
        'Gurbanguly Berdimuhamedow': 'G. Berdimuhamedow',
    },
    inplace=True,
)

# Hand-written tags for the figure, aiming to balance concision, neutrality
# and explanatory content (why the person may be controversial).
# Dates refer to time in office and to particular events.
# NOTE(review): "Columbian" (Rodrigo Lara) is probably meant to read
# "Colombian"; left unchanged because the text is emitted into the figure.
info_overrides = {
    'Klement Gottwald': 'President of USSR-aligned Czechoslovakia (1948-1953)',
    'Mao Zedong': 'Founder of the PRC & CCP chairman (1943-1976) ',
    'Nikolai Patrushev': 'High-ranking Russian security official (1999-)',
    'G. Berdimuhamedow': 'President of Turkmenistan (2006-2022)',
    'Xi Jinping': 'General secretary of the CCP (2012-)',
    'Ronald Reagan': 'Republican US President (1981-1989)',
    'Lenín Moreno': 'President of Ecuador (2017-2021)',
    'Sebastián Piñera': "President of Chile (2010-2014, 2018-2022)",
    'Ibn Saud': 'First king of Saudi Arabia (1932-1953)',
    'Alexander Vučić': 'President of the Republic of Serbia (2017-)',
    'Nicolás Maduro': 'President of Venezuela (2013-)',
    'Jacques Chirac': 'President of France (1995-2007)',
    'Hua Guofeng': "Chinese politician, CCP chairman (1976-1981) & PRC Premier (1976-1980)",
    'Nikolai Valuev': 'Russian boxer & United Russia politician',
    'Andrea Casiraghi': "Monegasque royal",
    'Henry Cavill': 'British actor',
    'Sergej Naryškin': 'United Russia politician & high-ranking security official',
    'Li Peng': 'Premier of the PRC (1988-1998)',
    'Ramzan Kadyrov': 'Head of the Chechen Republic (2007-)',
    'Sunjong of Korea': 'Korean emperor (1907-1910) before Japanese annexation',
    'Kōki Hirota': 'Japanese politician, executed for crimes against peace (1948)',
    'Wilhelm Canaris': 'Nazi military intelligence head, turned regime opponent',
    'Imre Nagy': 'Hungarian communist & 1956 anti-USSR revolution leader',
    'Shamil Basayev': 'Chechen anti-Russian guerilla leader',
    'Mohammad Najibullah': 'Leader of USSR-aligned Afghanistan (1986-1992)',
    'Ben Carson': 'American Republican politician and neurosurgeon',
    'George Stinney': 'African-American boy wrongfully executed for murder (1944)',
    'M.A. Rasulzade': 'Azerbaijani president (1918) & (non-communist) politician',
    'Yoshiko Kawashima': 'Chinese princess; pro-Japanese spy; WW2 Manchukuo figure',
    'Alexander Litvinenko': 'FSB defector and Vladimir Putin critic, murdered in the UK',
    'Rodrigo Lara': 'Columbian lawyer & politician; murdered by Pablo Escobar',
    'Lee Teng-hui': 'President of Taiwan (1988-2000); KMT politician',
    'Hu Jia': 'Chinese human-rights activist & dissident',
    'Tenzin Gyatso': '14th Dalai Lama (1940-)',
    'Lai Ching-te': 'President of Taiwan (2024-); DPP politician',
    'Tsai Ing-wen': 'President of Taiwan (2016-2024); DPP politician',
    'Chen Guangcheng': "Chinese lawyer, human-rights activist & dissident",
    'Wei Jingsheng': "Chinese human-rights activist & dissident",
    'Jimmy Lai': "Hong Kong opposition politician",
    'Joshua Wong': 'Hong Kong politician (Umbrella Movement leader)',
    'Nathan Law': 'Hong Kong politician (Umbrella Movement leader)',
}

# Only overwrite the tag of topics that are part of this comparison.
# Assigning through `.loc` to a label that is not in the index would append a
# new all-NaN row, which would then show up among the plotted rows.
for person, tag in info_overrides.items():
    if person in stats_for_name_plot.index:
        stats_for_name_plot.loc[person, 'info'] = tag
    else:
        print(f"No topic named {person!r} in this comparison; info override skipped")

forest_plot_score_diff(stats_for_name_plot,
                       group_1, group_2, top_k=20, figsize=(4, 10),
                       figname=f"forest_plot_per_topic_{group_1}-{group_2}.pdf",
                       show_info=True)



In [None]:
#pairs = (ova_languages + ove_languages + ova_blocks 
#         +ova_blocks_self_lang + ova_western + ova_chinese) 

#for group_1, group_2 in pairs:
#    diffs = compare_scores(answers, group_1, group_2)
#    forest_plot_score_diff(
#        diffs, 
#        group_1, 
#        group_2, 
#        top_k=20, 
#        figsize=(4, 8), 
#        figname=f"forest_plot_per_topic_{group_1}-{group_2}.pdf", 
#        show=True)

## Compare scores by tag

In [None]:
# Define the path to the directory containing the tag CSV files
# (in the visible code this name is only referenced by the commented-out
# raw-tag loading cell)
MANIFESTO_TAGS_DIR = PROCESSED_ANSWERS_DIR

# Create a dictionary to map category IDs to titles
# Keys are "<code>_<category name>" identifiers; the tag DataFrame stores each
# one in a column named "categories.<key>.result".
# Values are short display titles ending in a private-use glyph: "\uf164" is
# used for the positive/supportive direction and "\uf165" for the negative one
# (presumably the Font Awesome thumbs-up / thumbs-down icons - TODO confirm
# against the fonts loaded at the top of the notebook).
# NOTE(review): 203/204 look inverted relative to their "Positive"/"Negative"
# suffix. This is presumably deliberate: the titles talk about constitutional
# *reform*, and "Constitutionalism: Positive" would then mean support for the
# existing constitution - confirm against the category codebook.
cat_id_to_title = {
    '103_Anti-Imperialism': 'Anti-Imperialism \uf164',
    '104_Military: Positive': 'Military \uf164',
    '105_Military: Negative': 'Military \uf165',
    '106_Peace': 'Peace \uf164',
    '107_Internationalism: Positive': 'Internationalism \uf164',
    '108_European Community/Union: Positive': 'European Union \uf164',
    '108_a_United States: Positive': 'United States \uf164',
    '108_b_Russia/USSR/CIS: Positive': 'Russia/USSR \uf164',
    '108_c_China/PRC: Positive': 'China (PRC) \uf164',
    '109_Internationalism: Negative': 'Internationalism \uf165',
    '110_European Community/Union: Negative': 'European Union \uf165',
    '110_a_United States: Negative': 'United States \uf165',
    '110_b_Russia/USSR/CIS: Negative': 'Russia/USSR \uf165',
    '110_c_China/PRC: Negative': 'China (PRC) \uf165',
    '201_Freedom and Human Rights': 'Freedom & Human Rights \uf164',
    '202_Democracy': 'Democracy \uf164',
    '203_Constitutionalism: Positive': 'Constitutional Reform \uf165',
    '204_Constitutionalism: Negative': 'Constitutional Reform \uf164',
    '301_Federalism': 'Federalism \uf164',
    '302_Centralisation': 'Centralisation \uf164',
    '303_Governmental and Administrative Efficiency': 'Efficient Governance \uf164',
    '304_a_Against Political Corruption': 'Anti-Corruption \uf164',
    '304_b_Involved in Political Corruption': 'Involved in Corruption \uf164',
    '305_Political Authority': 'Political Authority \uf164',
    '401_Free Market Economy': 'Free Market \uf164',
    '402_Incentives': 'Supply-side Economics \uf164',
    '403_Market Regulation': 'Market Regulation \uf164',
    '404_Economic Planning': 'Economic Planning \uf164',
    '405_Corporatism/ Mixed Economy': 'Mixed Economy \uf164',
    '406_Protectionism: Positive': 'Protectionism \uf164',
    '407_Protectionism: Negative': 'Protectionism \uf165',
    '408_Economic Goals': 'Economic Goals \uf164',
    '409_Keynesian Demand Management': 'Demand-side Economics \uf164',
    '410_Economic Growth: Positive': 'Economic Growth \uf164',
    '411_Technology and Infrastructure': 'Tech & Infrastructure \uf164',
    '412_Controlled Economy': 'Economic Control \uf164',
    '413_Nationalisation': 'Nationalisation \uf164',
    '414_Economic Orthodoxy': 'Economic Orthodoxy \uf164',
    '415_Marxist Analysis: Positive': 'Marxism \uf164',
    '416_Anti-Growth Economy: Positive': 'Anti-Growth \uf164',
    '501_Environmental Protection: Positive': 'Environmentalism \uf164',
    '502_Culture: Positive': 'Culture \uf164',
    '503_Equality: Positive': 'Equality \uf164',
    '504_Welfare State Expansion': 'Welfare State \uf164',
    '505_Welfare State Limitation': 'Welfare State \uf165',
    '506_Education Expansion': 'State-funded Education \uf164',
    '507_Education Limitation': 'State-funded Education \uf165',
    '601_National Way of Life: Positive': 'National Way of Life \uf164',
    '602_National Way of Life: Negative': 'National Way of Life \uf165',
    '603_Traditional Morality: Positive': 'Traditional Morality \uf164',
    '604_Traditional Morality: Negative': 'Traditional Morality \uf165',
    '605_Law and Order: Positive': 'Law & Order \uf164',
    '606_Civic Mindedness: Positive': 'Civic Mindedness \uf164',
    '607_Multiculturalism: Positive': 'Multiculturalism \uf164',
    '608_Multiculturalism: Negative': 'Multiculturalism \uf165',
    '701_Labour Groups: Positive': 'Worker Rights \uf164',
    '702_Labour Groups: Negative': 'Worker Rights \uf165',
    '703_Agriculture and Farmers: Positive': 'Agriculture & Farmers \uf164',
    '704_Middle Class and Professional Groups': 'Professionals \uf164',
    '705_Underprivileged Minority Groups': 'Minority Groups \uf164',
    '706_Non-economic Demographic Groups': 'Demographic Groups \uf164'
}

# Load the cleaned tag table (the commented-out cells below show how it was
# written: indexed by 'name', one "categories.<key>.result" column per category)
tags = pd.read_csv(os.path.join(PROCESSED_ANSWERS_DIR, 'tags_clean.csv'))
tags

### Load raw tag data
(Only need to run once, see tags_clean.csv)

In [None]:
# # List of CSV files to be combined
# csv_files = [
#     "manifesto_tagged_topics_economy.csv",
#     "manifesto_tagged_topics_external_relations.csv",
#     "manifesto_tagged_topics_fabric_of_society.csv",
#     "manifesto_tagged_topics_freedom_and_democracy.csv",
#     "manifesto_tagged_topics_political_system.csv",
#     "manifesto_tagged_topics_social_groups.csv",
#     "manifesto_tagged_topics_welfare_and_quality_of_life.csv"
# ]
# 
# # Read each CSV file into a DataFrame
# df_list = [pd.read_csv(os.path.join(MANIFESTO_TAGS_DIR, file), low_memory=False) for file in csv_files]
# 
# # Merge the DataFrames on the 'name' column
# merged_df = df_list[0]  # Start with the first DataFrame
# for i, df in enumerate(df_list[1:], 1):
#     merged_df = pd.merge(merged_df, df, on='name', how='outer', suffixes=(f'_left{i}', f'_right{i}'))
# 
# # Drop duplicate columns (those ending with '_left' or '_right')
# columns_to_drop = [col for col in merged_df.columns if col.endswith('_left') or col.endswith('_right')]
# tags = merged_df.drop(columns=columns_to_drop)
# 
# # Examine the combined DataFrame
# print(f"Columns in the combined tags DataFrame:\n{tags.columns.tolist()}\n")
# print("First few rows of the tags DataFrame:")
# display(tags.head())
# 
# # Exclude columns that end with '.description'
# non_description_cols = [col for col in tags.columns if not col.endswith('.description')]
# 
# # Count non-null values for each non-description column
# non_null_counts = tags[non_description_cols].notnull().sum()
# 
# # Create an overview DataFrame
# overview_df = pd.DataFrame({
#     'Column Name': non_null_counts.index,
#     'Non-Null Value Count': non_null_counts.values
# })
# 
# # Sort the DataFrame by 'Non-Null Value Count' in descending order
# overview_df = overview_df.sort_values(by='Non-Null Value Count', ascending=False).reset_index(drop=True)
# 
# # Display the overview
# display(overview_df)

In [None]:
# # Set the minimum frequency threshold for tags to keep
# min_freq = 50
# 
# # Identify columns to drop (those with non-null counts below 'min_freq')
# rare_cols = non_null_counts[non_null_counts < min_freq].index.tolist()
# 
# print(f"Dropping {len(rare_cols)} columns with fewer than {min_freq} non-null values.")
# 
# # Drop rare columns and their corresponding '.explanation' columns
# tags = tags.drop(columns=rare_cols, errors='ignore')
# tags = tags.drop(columns=[col.replace('.result', '.explanation') for col in rare_cols if col.endswith('.result')], errors='ignore')
# 
# # Remove specific unwanted columns (e.g., those with 'No meaningful category applies')
# tags = tags.drop(columns=[col for col in tags.columns if 'No meaningful category applies' in col], errors='ignore')
# tags = tags.drop(columns=[col for col in tags.columns if 'Political Corruption' in col and all(x not in col for x in ['Involved', 'Against'])], errors='ignore')
# 
# # Re-check remaining result columns
# result_cols = [col for col in tags.columns if col.endswith('.result')]
# print(f"Smallest remaining result column size: {tags[result_cols].count().min()}")

In [None]:
# # Ensure the mapping includes all remaining categories
# # Extract category IDs from the result columns
# result_cols = [col for col in tags.columns if col.endswith('.result')]
# for result_col in result_cols:
#     parts = result_col.split('.')
#     if len(parts) >= 2:
#         category_name = parts[1]
#         if category_name not in cat_id_to_title:
#             # If not in the mapping, add it directly
#             cat_id_to_title[category_name] = category_name
# 
# # Remove any explanation columns not in the mapping
# explanation_cols = [col for col in tags.columns if col.endswith('.explanation')]
# invalid_explanation_cols = [col for col in explanation_cols if col.split('.')[1] not in cat_id_to_title]
# tags = tags.drop(columns=invalid_explanation_cols, errors='ignore')
# 
# # Drop any remaining '.description' columns
# tags = tags.drop(columns=[col for col in tags.columns if col.endswith('.description')], errors='ignore')

In [None]:
# # Drop tags not in the answers df
# tags = tags[tags['name'].isin(answers['name-en'].unique())]
# tags

In [None]:
# tags_clean = tags.set_index('name')[[f"categories.{tag_id}.result" for tag_id in cat_id_to_title.keys()]]
# tags_clean.to_csv(os.path.join(PROCESSED_ANSWERS_DIR, 'tags_clean.csv'))

### Merge Tags with Answers

In [None]:
# Cast both join keys to str so the merge compares values of the same type
answers['name-en'] = answers['name-en'].astype(str)
tags['name'] = tags['name'].astype(str)

# Left join: every answer row is kept, tag columns are NaN for untagged topics
answers_tagged = pd.merge(
    answers,
    tags,
    how='left',
    left_on='name-en',
    right_on='name',
)

In [None]:
print(len(answers_tagged))
answers_tagged.head(5)

In [None]:
# Topics that occur in the answers but have no row in the tag table
answered_topics = answers['name-en'].unique()
has_tag_row = np.isin(answered_topics, tags['name'])
no_tags = answered_topics[~has_tag_row]
print(f"Topics with no tags: {no_tags}")
print(f"Number of topics with no tags: {len(no_tags)}")

### Visualize Tag Frequencies

In [None]:
# Count, for every category, how many rows of the tag table carry the tag
col_true_counts = {}
for cat_id, cat_title in cat_id_to_title.items():
    true_counts = tags[f"categories.{cat_id}.result"].eq(True).sum()
    col_true_counts[cat_title] = true_counts

# Order the categories from most to least frequent
ranked_counts = sorted(col_true_counts.items(), key=lambda kv: kv[1], reverse=True)
col_true_counts = dict(ranked_counts)

# Horizontal bar chart of the tag frequencies
plt.figure(figsize=(8, 11))
sns.barplot(
    x=list(col_true_counts.values()),
    y=list(col_true_counts.keys()),
    orient='h',
    color='skyblue',
)
# plt.xlabel('Number of Occurrences')
plt.tight_layout()
frequency_fig_path = os.path.join(FIGURES_DIR, 'manifesto_tags_frequency.pdf')
plt.savefig(frequency_fig_path, bbox_inches='tight', transparent=True)
plt.show()

### Define Functions for Statistical Comparison

In [None]:
def diff(x, y):
    """Return ``x - y``, using whatever subtraction the operands define."""
    difference = x - y
    return difference

def diff_ranks(x, y):
    """Calculate the difference in ranks between two arrays."""
    min_ranks_x = scipy.stats.rankdata(x, method='min')
    min_ranks_y = scipy.stats.rankdata(y, method='min')
    max_ranks_x = scipy.stats.rankdata(x, method='max')
    max_ranks_y = scipy.stats.rankdata(y, method='max')
    
    rank_diffs = np.zeros(x.shape[0])
    idx_fully_pos_diff = (min_ranks_x - max_ranks_y) > 0
    rank_diffs[idx_fully_pos_diff] = min_ranks_x[idx_fully_pos_diff] - max_ranks_y[idx_fully_pos_diff]
    idx_fully_neg_diff = (max_ranks_x - min_ranks_y) < 0
    rank_diffs[idx_fully_neg_diff] = max_ranks_x[idx_fully_neg_diff] - min_ranks_y[idx_fully_neg_diff]
    return pd.Series(rank_diffs, index=x.index)

def compare_scores_tagged(df, model_group_1_str, model_group_2_str, score_col='score', diff_func=diff, ci_alpha=0.05):
    """Compare the scores of two model groups separately for every tag.

    Both groups are restricted to the topics ('name-en') they have in common.
    For each category in ``cat_id_to_title`` the per-topic mean score of each
    group is computed over the topics carrying the tag, and ``diff_func`` is
    applied to the two per-topic series. Categories with fewer than 20 tagged
    topics in group 1 are left out.

    Args:
        df: answers merged with the tag columns ("categories.<id>.result").
        model_group_1_str: group specification passed to ``parse_model_group``.
        model_group_2_str: group specification passed to ``parse_model_group``.
        score_col: name of the score column to average.
        diff_func: callable combining the two per-topic mean series.
        ci_alpha: significance level of the normal-approximation interval.

    Returns:
        A DataFrame with one row per retained category title and the columns
        '<group_1>_mean', '<group_2>_mean', 'score_diff', 'score_diff_lb',
        'score_diff_ub', 'count', 'p' and 'cat_title'. 'p' comes from a
        two-sided Welch t-test of the tagged differences against the
        differences of all topics that do not carry the tag.
    """
    members_1 = parse_model_group(model_group_1_str)
    members_2 = parse_model_group(model_group_2_str)
    rows_1 = df[df.model.isin(members_1)]
    rows_2 = df[df.model.isin(members_2)]

    # Restrict both groups to the topics they share
    shared_topics = np.intersect1d(rows_1['name-en'].unique(), rows_2['name-en'].unique())
    rows_1 = rows_1[rows_1['name-en'].isin(shared_topics)]
    rows_2 = rows_2[rows_2['name-en'].isin(shared_topics)]

    # Normal quantile that turns a standard error into a CI half-width
    z_value = scipy.stats.norm.ppf(1 - ci_alpha / 2)

    def topic_means(rows):
        """Mean score per topic for the given rows."""
        return rows.groupby('name-en')[score_col].mean()

    stats_per_tag = {}
    for cat_id, cat_title in cat_id_to_title.items():
        flag_col = f"categories.{cat_id}.result"

        tagged_1 = topic_means(rows_1[rows_1[flag_col] == True])
        tagged_2 = topic_means(rows_2[rows_2[flag_col] == True])
        tagged_diff = diff_func(tagged_1, tagged_2)

        n_topics = tagged_1.shape[0]
        if n_topics < 20:
            # Too few tagged topics for a meaningful comparison
            continue

        mean_diff = tagged_diff.mean()
        half_width = z_value * tagged_diff.sem()

        # Reference distribution: all topics that do not carry this tag
        untagged_1 = topic_means(rows_1[rows_1[flag_col] != True])
        untagged_2 = topic_means(rows_2[rows_2[flag_col] != True])
        untagged_diff = diff_func(untagged_1, untagged_2)
        welch = scipy.stats.ttest_ind(tagged_diff, untagged_diff, alternative='two-sided', equal_var=False)

        stats_per_tag[cat_title] = {
            f'{model_group_1_str}_mean': tagged_1.mean(),
            f'{model_group_2_str}_mean': tagged_2.mean(),
            'score_diff': mean_diff,
            'score_diff_lb': mean_diff - half_width,
            'score_diff_ub': mean_diff + half_width,
            'count': n_topics,
            'p': welch.pvalue,
            'cat_title': cat_title,
        }

    return pd.DataFrame(stats_per_tag).T

### Visualize Scores Between Model Groups

In [None]:
# Model groups to compare (alternatives kept for quick switching)
group_1, group_2 = "en", "zh"
# group_1, group_2 = "en", 'ru'
# group_1, group_2 = 'xai/grok-beta-en', 'western-en, not-xai/grok-beta'

diffs = compare_scores_tagged(
    answers_tagged,
    model_group_1_str=group_1,
    model_group_2_str=group_2,
)

In [None]:
# Forest plot of the per-tag score differences for the selected groups
forest_plot_score_diff(
    diffs,
    group_1,
    group_2,
    top_k=10,
    figsize=(4, 6),
)

In [None]:
#pairs = (ova_languages + ove_languages + ova_blocks 
#         +ova_blocks_self_lang + ova_western + ova_chinese) 

# Per-tag comparison and forest plot for every Western and Chinese pair
pairs = ova_western + ova_chinese

for group_1, group_2 in pairs:
    print(f"Comparing {group_1} vs {group_2}")
    print(model_groups[group_1], model_groups[group_2])
    diffs = compare_scores_tagged(answers_tagged, group_1, group_2)
    forest_plot_score_diff(
        diffs,
        group_1,
        group_2,
        top_k=10,
        figsize=(4, 6),
        figname=f"forest_plot_per_tag_{group_1}-{group_2}.pdf",
        show=True,
    )

### Comparing Models Using Combined P-Values

In [None]:
# # Initialize a dictionary to store combined p-values per model
# p_combined_by_model = {}
# 
# # Filter the tagged answers to include only English responses
# answers_tagged_en = answers_tagged[answers_tagged.language_code == 'en']
# 
# # Get the list of all English models
# all_models_en = answers_tagged_en.model.unique()
# 
# # For each model, compare it to all other models
# for model in all_models_en:
#     # Compute the differences between the model and all other models
#     diffs = compare_scores_tagged(
#         answers_tagged_en,
#         model,
#         list(all_models_en[all_models_en != model])
#     )
#     # Combine the p-values using Fisher's method
#     p_combined_by_model[model] = combine_pvalues(diffs['p'].values.astype(float))
# 
# # Print the combined p-values for each model
# print("Combined p-values for each model:")
# for model, p_value in p_combined_by_model.items():
#     print(f"{model}: {p_value}")

In [None]:
# # Display the combined p-values sorted in ascending order
# combined_p_values_series = pd.Series(p_combined_by_model).sort_values()
# print("\nCombined p-values sorted in ascending order:")
# display(combined_p_values_series)

### Compute and Visualize Tag-Based Features

In [None]:
# Result-column name of every mapped category, in mapping order
tag_result_cols = [f"categories.{tag_id}.result" for tag_id in cat_id_to_title]
# other_cols = ['model', 'score']
# tag_scores = answers_tagged.loc[:, tag_result_cols + other_cols]
# tag_scores = tag_scores.rename(columns=dict(zip(tag_result_cols, cat_id_to_title.values())))
# tag_result_cols = list(cat_id_to_title.values())
# tag_scores[tag_result_cols] = tag_scores[tag_result_cols].astype(float)
# tag_scores[tag_result_cols] = tag_scores[tag_result_cols].apply(lambda col: col[col == 1.] * tag_scores['score'])
# tag_scores[tag_result_cols] = tag_scores[tag_result_cols].subtract(tag_scores[tag_result_cols].mean(axis=0))
# tag_means = tag_scores.groupby('model')[tag_result_cols].mean()

def compute_tag_means(model_group_strs=None, only_common_names=False):
    """Compute the mean score of every model group for every tag.

    For each category in ``cat_id_to_title`` the answers carrying the tag are
    averaged per topic ('name-en') within each model group, and those
    per-topic means are then averaged over topics.

    Args:
        model_group_strs: iterable of group specifications understood by
            ``parse_model_group``. Defaults to every individual model found
            in ``answers_tagged``.
        only_common_names: if True, only topics with a value for every group
            contribute to the average.

    Returns:
        A DataFrame with one row per model group and one column per tag title.
    """
    if model_group_strs is None:
        model_group_strs = answers_tagged['model'].unique()

    means_by_title = {}
    for cat_id, cat_title in cat_id_to_title.items():
        flag_col = f"categories.{cat_id}.result"
        tagged_rows = answers_tagged[answers_tagged[flag_col] == True]

        # Per-topic mean score of each group, restricted to tagged answers
        topic_means_by_group = {}
        for group_str in model_group_strs:
            members = parse_model_group(group_str)
            group_rows = tagged_rows[tagged_rows['model'].isin(members)]
            topic_means_by_group[group_str] = group_rows.groupby('name-en')['score'].mean()

        # Rows are topics, columns are groups
        topic_table = pd.concat(topic_means_by_group, axis=1)
        if only_common_names:
            topic_table = topic_table.dropna()
        means_by_title[cat_title] = topic_table.mean()

    return pd.DataFrame(means_by_title)
tag_means = compute_tag_means(only_common_names=False)

In [None]:
# num_models_total = answers_tagged['model'].nunique()
# topics_rated_by_all = answers_tagged.groupby(['name-en', 'language_code'])['model'].count()
# (topics_rated_by_all.reset_index().groupby(['name-en'])['model'].max() == 19).sum()
# topics_rated_by_all = topics_rated_by_all[(topics_rated_by_all == num_models_total)]

In [None]:
tag_means

#### Heatmap of Tag Means

In [None]:
# Heatmap of the mean score for every (model, tag) pair
plt.figure(figsize=(12, 8))
sns.heatmap(
    tag_means,
    cmap='coolwarm',
    center=0.5,
    cbar_kws={'label': 'Mean Score'},
).set(
    title='Mean Scores per Tag and Model',
    xlabel='Tag',
    ylabel='Model',
)
plt.tight_layout()
plt.show()

#### Centered Heatmap
To better understand the relative differences, we center the tag means per model.

In [None]:
# Subtract each model's own mean over all tags from its row
tag_means_centered = tag_means.apply(lambda row: row - row.mean(), axis=1)

# Heatmap of the row-centered tag means
plt.figure(figsize=(12, 8))
sns.heatmap(
    tag_means_centered,
    cmap='coolwarm',
    center=0,
    cbar_kws={'label': 'Centered Mean Score'},
).set(
    title='Centered Mean Scores per Tag and Model',
    xlabel='Tag',
    ylabel='Model',
)
plt.tight_layout()
plt.show()

#### PCA Visualization

In [None]:
# Perform PCA (OLD)
# Fit on the row-centered tag means and keep the first two components
pca = PCA(random_state=0)
tag_means_centered = tag_means.apply(lambda row: row - row.mean(), axis=1)
first_two_scores = pca.fit_transform(tag_means_centered.values)[:, :2]
tag_means_pca = (
    pd.DataFrame(first_two_scores, index=tag_means.index, columns=['PC1', 'PC2'])
    .reset_index()
    .rename(columns={'index': 'model'})
)
# Loadings of every tag on the first two components
pca_components = pca.components_[:2]

In [None]:
import importlib
import constants
importlib.reload(constants)
from constants import model_org, model_abbreviations
# 8, 9 Mistral. 13, 14 Llama 
shape_palette = [
    'P', 'X', '*', '$\u2b53$', 'h', '^', '>', 'v', '<', '$\u25e7$',
    '$\u25e8$', '$\u29d3$', '$\u29d7$', '$\u2660$', '$\u2663$', '$\u25b0$',
    '$\u2666$', '$\u273d$', '$\u25d8$',
]
# One marker per distinct model abbreviation, assigned in sorted order
model_shape = dict(zip(np.unique(list(model_abbreviations.values())), shape_palette))

# Per-language marker, display name and colour, keyed by language code
language_shape = dict(ar='o', zh='P', en='s', fr='X', ru='D', es='v')
language_name = dict(
    ar='Arabic',
    zh='Chinese',
    en='English',
    fr='French',
    ru='Russian',
    es='Spanish',
)
# Colors taken from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
language_colors = dict(
    ar='#e41a1c',
    zh='#377eb8',
    en='#4daf4a',
    fr='#984ea3',
    ru='#ff7f00',
    es='#ffff33',
)
# Same colours, keyed by display name instead of language code
language_name_colors = {language_name[code]: color for code, color in language_colors.items()}

In [None]:
# Visualize PCA
# Biplot, part 1: draw the tag loadings as arrows with rotated labels. The
# model points are added to the same axes further down in this cell.
plt.figure(figsize=(15, 15))

# Calculate the magnitude of PCA components
# (per-tag Euclidean norm of its loadings on the first two components)
pca_components_magnitude = np.linalg.norm(pca_components, axis=0)

# Select the top k tags based on their loading magnitudes
k = 30
top_k_idx = np.argsort(pca_components_magnitude)[-k:]

# Scaling factor for model points
# (multiplies the PC1/PC2 coordinates of the model points plotted below;
# the alternatives are presumably tuned for other figure variants)
model_scale = 4.5
# model_scale = 1.65
# model_scale = 0.8

# Plot the PCA components (arrows representing tags)
texts = []  # collected label artists; not used again in the visible part of this cell
for i in top_k_idx:
    tag_id = tag_means.columns[i]
    # Normalise every arrow to unit length; the loading magnitude is encoded
    # in the arrow's line width instead.
    loading_x = pca_components[0, i] / pca_components_magnitude[i]
    loading_y = pca_components[1, i] / pca_components_magnitude[i]
    plt.arrow(0, 0, loading_x, loading_y, color='black', alpha=0.2, linewidth=7 * pca_components_magnitude[i],
              head_width=0.02)
    # loading_x = pca_components[0, i] 
    # loading_y = pca_components[1, i]
    # plt.arrow(0, 0, loading_x, loading_y, color='black', alpha=0.4, linewidth=1, linestyle='dotted')
    
    # Determine text alignment and rotation
    # The label is rotated along the arrow. For arrows pointing into the left
    # half-plane the angle is flipped by 180 degrees and the text right-aligned,
    # so the label is never rendered upside down.
    angle = np.degrees(np.arctan2(pca_components[1, i], pca_components[0, i]))
    va = 'center'
    if angle > 90:
        angle -= 180
        ha = 'right'
    elif angle < -90:
        angle += 180
        ha = 'right'
    else:
        ha = 'left'
        
    # Hand-tuned vertical alignment for individual labels (presumably to
    # avoid overlaps between neighbouring labels)
    if tag_id in [
        'Constitutional Reform \uf164',
        # 'Nationalisation \uf164',
        # 'Demographic Groups \uf164',
        'Equality \uf164',
        'Multiculturalism \uf165',
        'European Union \uf165',
        'Worker Rights \uf165',
        'Supply-side Economics \uf164',
        'United States \uf165',
        'Centralisation \uf164',
    ]:
        va = 'bottom'
    elif tag_id in [
        'Minority Groups \uf164',
        'Freedom & Human Rights \uf164',
        'Involved in Corruption \uf164',
        'Internationalism \uf165',
        'Demand-side Economics \uf164',
        'Professionals \uf164',
    ]:
        va = 'top'
    # elif tag_id in [
    #     'China (PRC) \uf165',
    #     'State-funded Education \uf165',
    #     'Supply-side Economics \uf164',
    # ]:
    #     ha = 'right' if ha == 'left' else 'left'
    #     ha = 'center'
    #     va = 'bottom'
    
    # Place the label anchor 6% beyond the tip of the unit-length arrow
    loading_x = loading_x * 1.06 #+ (loading_x / pca_components_magnitude[i]) * 0.01, 
    loading_y = loading_y * 1.06 #+ (loading_y / pca_components_magnitude[i]) * 0.01, 
    
    # Hand-tuned position offsets for individual labels
    if tag_id in [
        'Internationalism \uf165',
        'Russia/USSR \uf164',
        'United States \uf165',
    ]:
        loading_x -= 0.015
        loading_y -= 0.03
    elif tag_id in [
        'Nationalisation \uf164',
        'Constitutional Reform \uf164',
    ]:
        loading_x += 0.01
        loading_y += 0.02
    
    # Annotate the tags
    t = plt.text(loading_x, loading_y,
                 tag_id, rotation=angle, rotation_mode='anchor', ha=ha, va=va, fontsize=15, alpha=1)
    texts.append(t)
    
# Set aspect ratio to equal
plt.gca().set_aspect('equal', adjustable='box')
# Keep a handle on the axes; the scatter layers below are drawn onto it
ax = plt.gca()

# Define model to organization mapping
import sys
sys.path.append("../ideological-spectrum-llms")
from constants import model_org

# The last '-'-separated piece of a model name is looked up as its language
# code; everything before it (lower-cased) is treated as the model root.
tag_means_pca['model_root'] = tag_means_pca['model'].apply(lambda name: name.rsplit('-', 1)[0].lower())
tag_means_pca['org'] = tag_means_pca['model_root'].apply(lambda root: model_org.get(root, 'Unknown'))
tag_means_pca['language'] = tag_means_pca['model'].apply(lambda name: language_name[name.rsplit('-', 1)[-1]])
tag_means_pca['model_abbr'] = tag_means_pca['model_root'].apply(lambda root: model_abbreviations[root])
# Abbreviation -> organization, used for the model legend labels
abbr_to_org = {
    model_abbreviations[root]: model_org[root]
    for root in tag_means_pca['model_root'].unique()
}

# Plot both
# Faint background layer: one point per (model, language) combination, scaled
# by model_scale. legend=False keeps this layer out of the legend handles
# that are sliced further down.
vals_by_both = (tag_means_pca.groupby(['model_abbr', 'language'])[['PC1', 'PC2']].mean() * model_scale).reset_index()
plot_both = sns.scatterplot(data=vals_by_both, x='PC1', y='PC2', style='model_abbr', hue='language', 
                            palette=language_name_colors, markers=model_shape, s=150, edgecolor='black', 
                            ax=ax, alpha=0.25, legend=False)
# for i, row in vals_by_both.iterrows():
#     plt.text(row['PC1'], row['PC2'], model_org_shape[row['org']], fontsize=14, color=language_colors[row['language']])

# Plot by lang
# One opaque point per language: the mean position over all models.
# NOTE(review): `markers` is passed without `style`, so it presumably has no
# effect on this layer - confirm.
delta = 0.01  # only referenced by the commented-out text annotations below
vals_by_lang = (tag_means_pca.groupby('language')[['PC1', 'PC2']].mean() * model_scale).reset_index()
plot_lang = sns.scatterplot(data=vals_by_lang, x='PC1', y='PC2', hue='language', palette=language_name_colors, 
                            markers=model_shape, s=150, edgecolor='black', ax=ax, alpha=1)
# for i, row in vals_by_lang.iterrows():
#     plt.text(row['PC1'] + delta, row['PC2'] - delta, row['language'], fontsize=14)

# Plot by model_abbr
# One grey point per model abbreviation: the mean position over all languages.
vals_by_org = (tag_means_pca.groupby('model_abbr')[['PC1', 'PC2']].mean() * model_scale).reset_index()
# vals_by_org[['PC1', 'PC2']] = vals_by_org[['PC1', 'PC2']] / vals_by_org[['PC1', 'PC2']].apply(np.linalg.norm, axis=1)
plot_org = sns.scatterplot(data=vals_by_org, x='PC1', y='PC2', style='model_abbr', markers=model_shape, s=150, 
                           edgecolor='black', color='grey', ax=ax, alpha=1)
# for i, row in vals_by_org.iterrows():
#     plt.text(row['PC1'] + delta, row['PC2'] - delta, row['org'], fontsize=14)

# Remove axes and borders for a cleaner look
plt.axis('off')

# Both scatter layers share `ax`, so each handle list holds the entries of
# both layers. The slices assume that the first len(language_colors) entries
# are the languages and the remaining ones the models - TODO confirm this
# holds when a language is missing from the data.
handles, labels = plot_lang.get_legend_handles_labels()
legend_lang = plt.legend(handles=handles[:len(language_colors)], labels=labels[:len(language_colors)],
                         loc='upper center', ncols=len(language_colors), frameon=False, bbox_to_anchor=(0.5, 1.42), 
                         fontsize=13, title='Language (average)')
handles, labels = plot_org.get_legend_handles_labels()
# Model legend labels read "<abbreviation> (<organization>)"
labels = [f'{l} ({abbr_to_org[l]})' for l in labels[len(language_colors):]]
legend_org = plt.legend(handles=handles[len(language_colors):], labels=labels,
                        loc='lower center', ncols=4, frameon=False, bbox_to_anchor=(0.5, -0.6),
                        fontsize=13, title='Model (average)')
# The second plt.legend call replaces the first legend on the axes, so the
# language legend is added back explicitly.
plt.gca().add_artist(legend_lang)


# Save and display the plot
# Both legends sit outside the axes; they are passed as extra artists so the
# tight bounding box includes them.
plt.savefig(os.path.join(FIGURES_DIR, 'tag_means_pca.pdf'), bbox_inches='tight', transparent=True, 
            bbox_extra_artists=(legend_lang, legend_org))

In [None]:
pca.explained_variance_ratio_

### Visualize PCA Components Alone

In [None]:
# Scatter every tag at its loadings on the first two principal components
plt.figure(figsize=(10, 10))
pc1_loadings, pc2_loadings = pca_components[0], pca_components[1]
plt.scatter(pc1_loadings, pc2_loadings)

# Label each point with its tag title
for tag_title, x_pos, y_pos in zip(tag_means.columns, pc1_loadings, pc2_loadings):
    plt.text(x_pos, y_pos, tag_title, fontsize=14)

plt.title('PCA Components of Tags')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

### Radar Plot of Tag Means
Another way to visualize the differences between model groups is to use radar charts (also known as spider plots). We can compare two model groups across all tags.

#### Define Radar Chart Factory

In [None]:
tag_means

In [None]:
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.path import Path
from matplotlib.transforms import Affine2D

def radar_factory(num_vars, frame='circle'):
    """Register a 'radar' projection with `num_vars` axes and return its angles.

    A ``RadarAxes`` class (a ``PolarAxes`` subclass named ``'radar'``) is
    defined and registered with Matplotlib, so it can afterwards be requested
    with ``projection='radar'``. Every call registers a fresh class bound to
    the given ``num_vars`` and ``frame``.

    Args:
        num_vars: number of variables, i.e. spokes of the radar chart.
        frame: shape of the surrounding frame, ``'circle'`` or ``'polygon'``.
            Any other value raises ``ValueError`` when the axes are created.

    Returns:
        A 1-D array with the ``num_vars`` evenly spaced axis angles in
        radians, starting at 0 and excluding 2*pi.
    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

    class RadarTransform(PolarAxes.PolarTransform):
        """Polar transform that re-interpolates some paths to `num_vars` steps."""

        def transform_path_non_affine(self, path):
            # Paths with more than one interpolation step are resampled to
            # num_vars steps before being transformed (presumably these are
            # the gridlines, which then become polygons instead of arcs).
            if path._interpolation_steps > 1:
                path = path.interpolated(num_vars)
            return Path(self.transform(path.vertices), path.codes)

    class RadarAxes(PolarAxes):
        """Polar axes whose lines and fills are closed by default."""

        name = 'radar'
        PolarTransform = RadarTransform

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Override fill so that the polygon is closed by default."""
            return super().fill(*args, closed=closed, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that every line is closed; return the lines."""
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            # Hand the Line2D objects back, as Axes.plot does, so callers can
            # keep using them (e.g. for legends).
            return lines

        def _close_line(self, line):
            """Append the first point to the line's data unless already closed."""
            x, y = line.get_data()
            # Nothing to close for an empty line
            if len(x) > 0 and x[0] != x[-1]:
                x = np.concatenate([x, [x[0]]])
                y = np.concatenate([y, [y[0]]])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Put one label at each of the `num_vars` axis angles."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The patch is centred at (0.5, 0.5) with radius 0.5
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars, radius=.5, edgecolor="k")
            else:
                raise ValueError(f"Unknown frame type: {frame}")

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                spine = Spine(
                    axes=self,
                    spine_type='circle',
                    path=Path.unit_regular_polygon(num_vars)
                )
                # Scale the unit polygon by 0.5 and shift it by (0.5, 0.5) so
                # that it matches the axes patch above.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5) + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError(f"Unknown frame type: {frame}")

    register_projection(RadarAxes)
    return theta

def _greedy_smooth_order(values):
    """Order the rows of *values* so that neighbouring rows are similar.

    The pair of consecutive rows with the smallest mean absolute difference
    seeds the order; afterwards the remaining row closest (mean absolute
    difference) to the last placed row is appended until none are left.

    *values* must carry the default ``0..n-1`` integer index because the seed
    pair is addressed as ``(label - 1, label)``.  Frames with fewer than two
    rows are returned in their existing order.
    """
    remaining = values.index.to_list()
    if len(remaining) < 2:
        return remaining

    # The first row of diff() is NaN and is skipped by idxmin().
    step_cost = values.diff(axis=0).abs().mean(axis=1)
    seed = step_cost.idxmin()
    order = [seed - 1, seed]
    remaining.remove(seed - 1)
    remaining.remove(seed)

    while remaining:
        last_row = values.loc[order[-1]]
        cost = (last_row - values.loc[remaining]).abs().mean(axis=1)
        nearest = cost.idxmin()
        order.append(nearest)
        remaining.remove(nearest)
    return order


def radar_plot(model_group_strs, idx_sorted=None, figname=None, annotate_fn=None, str_map=None, color_map=None, group_tag_means=False):
    """Draw a radar chart of doubly-centred tag means, one line per model group.

    Args:
        model_group_strs: Group identifiers to plot.  Iterated more than once
            and passed to ``len()``, so it must be a sized, re-iterable
            collection (a list or ``dict.keys()``).
        idx_sorted: Row positions of the tag table giving the spoke order.
            When ``None`` an order is derived greedily so that adjacent
            spokes carry similar values.
        figname: File name under ``FIGURES_DIR`` to save the figure to;
            nothing is saved when ``None``.
        annotate_fn: Optional callable ``annotate_fn(model_group_strs, ax)``
            used instead of the default legend.
        str_map: Optional mapping from group identifier to legend label.
        color_map: Optional mapping from group identifier to line colour.
        group_tag_means: When true the table comes from
            ``compute_tag_means(..., only_common_names=True)``; otherwise each
            group is the mean of its rows in the global ``tag_means``.

    Returns:
        The centred table (rows are tags, columns are groups) before any
        reordering of the tags.
    """
    if group_tag_means:
        tag_means_per_group = compute_tag_means(model_group_strs, only_common_names=True).T
    else:
        tag_means_per_group = pd.DataFrame({
            model_group_str: tag_means.loc[parse_model_group(model_group_str)].mean()
            for model_group_str in model_group_strs
        })

    # Double centring: remove each group's mean, then each tag's mean, so the
    # chart shows how a group deviates from the others on every tag.
    tag_means_per_group = tag_means_per_group - tag_means_per_group.mean()
    tag_means_per_group = tag_means_per_group.apply(lambda row: row - row.mean(), axis=1)
    tag_means_per_group_back = tag_means_per_group.copy()

    # Move the tag names into a column so rows can be addressed by position.
    plot_table = tag_means_per_group.reset_index(names='tag')
    if idx_sorted is None:
        idx_sorted = _greedy_smooth_order(plot_table.drop(columns=['tag']))
        assert len(idx_sorted) == len(plot_table)
    plot_table = plot_table.loc[idx_sorted].set_index('tag').T

    # Size the chart from the table actually plotted so the number of spokes
    # always matches the number of values per line.
    spoke_angles = radar_factory(plot_table.shape[1], frame='polygon')
    fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(projection='radar'))

    # Dashed reference line at zero (the mean after centring).
    ax.axhline(0, color='black', linewidth=1, linestyle='--')
    for model_group_str in model_group_strs:
        color = color_map[model_group_str] if color_map is not None else None
        legend_label = str_map[model_group_str] if str_map is not None else model_group_str
        ax.plot(spoke_angles, plot_table.loc[model_group_str], label=legend_label, color=color)

    # Set variable labels
    ax.set_varlabels(plot_table.columns)

    # Draw once so the tick-label positions read below are up to date, then
    # replace the default labels with copies rotated along their spoke.
    fig.canvas.draw()
    tick_labels = ax.get_xticklabels()
    angles = np.rad2deg(np.linspace(0, 2 * np.pi, len(tick_labels) + 1))
    for tick_label, angle in zip(tick_labels, angles):
        x, y = tick_label.get_position()
        ha = 'right'
        angle -= 90
        if angle > 90:
            # Flip labels that would otherwise be rendered upside down.
            angle -= 180
            ha = 'left'
        ax.text(x, y + 0.03, tick_label.get_text(), transform=tick_label.get_transform(),
                rotation_mode='anchor', ha=ha, va='center', rotation=angle)
    ax.set_xticklabels([])

    if annotate_fn is None:
        ax.legend(loc='upper center', ncols=len(model_group_strs), frameon=False, bbox_to_anchor=(0.5, 1.41), fontsize=13)
    else:
        annotate_fn(model_group_strs, ax)

    if figname is not None:
        fig.savefig(os.path.join(FIGURES_DIR, figname), bbox_inches='tight', transparent=True)
    plt.show()
    return tag_means_per_group_back

In [None]:
# Compare by language
# Fixed spoke order for the radar chart: row positions of the tag table, passed
# to radar_plot so it skips its automatic ordering.
# NOTE(review): presumably copied from an earlier automatic ordering run - confirm.
idx_sorted = [0, 12, 6, 49, 43, 55, 20, 47, 37, 16, 25, 8, 1, 29, 44, 7, 36, 19, 56, 22, 54, 17, 11, 9, 46, 13, 48, 10, 35, 27, 31, 23, 33, 38, 2, 57, 28, 51, 24, 30, 34, 26, 18, 32, 5, 39, 58, 40, 60, 59, 41, 53, 42, 3, 14, 52, 45, 4, 15, 50, 21]
# One line per key of language_name; saved as radar_lang.pdf under FIGURES_DIR.
tag_means_per_group = radar_plot(language_name.keys(), idx_sorted=idx_sorted, figname='radar_lang.pdf', str_map=language_name, color_map=language_colors, group_tag_means=False)

In [None]:
# Derived model groups, built from entries already present in model_groups.
# 'western': concatenation of the US, European and Israeli groups.
model_groups['western'] = model_groups['united-states'] + model_groups['european'] + model_groups['israeli']
# 'russian_strict': the Russian group without entries whose name contains 'Vikhr'.
model_groups['russian_strict'] = [col for col in model_groups['russian'] if 'Vikhr' not in col]
# 'chinese_strict': the Chinese group without entries whose name contains 'Qwen'.
model_groups['chinese_strict'] = [col for col in model_groups['chinese'] if 'Qwen' not in col]
# 'western-west': concatenation of the language-suffixed western groups.
# NOTE(review): presumably the same blocs restricted to western languages - confirm.
model_groups['western-west'] =  model_groups['united-states-en'] + model_groups['european-en-fr-es'] + model_groups['israeli-en']

In [None]:
# Legend label for each geopolitical bloc plotted below; keys are model-group
# identifiers handed to radar_plot. Commented entries are blocs left out of
# the figure.
bloc_name = {
    'arabic': 'Arabic Countries',
    "chinese": 'China (PRC)',
    # 'european': 'European Union',
    "russian": 'Russia',
    # 'israeli': 'Israel',
    # 'united-states': 'United States'
    'western': 'Western Countries'
}
# Line colour per bloc, reusing the colour of one of the language_colors entries.
bloc_color = {
    'arabic': language_colors['ar'],
    "chinese": language_colors['zh'],
    # 'european': '#a65628',
    "russian": language_colors['ru'],
    # 'israeli': '#f781bf',
    # 'united-states': language_colors['en'],
    'western': language_colors['en']
}
# Disabled alternative configuration using the language-suffixed group keys,
# together with the spoke order that was used for it.
# bloc_name = {
#     'arabic-ar': 'Arabic Countries',
#     "chinese-zh": 'China (PRC)',
#     # 'european': 'European Union',
#     "russian-ru": 'Russia',
#     # 'israeli': 'Israel',
#     # 'united-states': 'United States'
#     'western-west': 'Western Countries'
# }
# bloc_color = {
#     'arabic-ar': language_colors['ar'],
#     "chinese-zh": language_colors['zh'],
#     # 'european': '#a65628',
#     "russian-ru": language_colors['ru'],
#     # 'israeli': '#f781bf',
#     # 'united-states': language_colors['en'],
#     'western-west': language_colors['en']
# }
# idx_sorted = [0, 6, 49, 57, 50, 39, 41, 59, 53, 14, 42, 60, 52, 3, 58, 45, 4, 21, 40, 5, 15, 43, 55, 47, 18, 16, 34, 8, 46, 38, 29, 11, 10, 54, 22, 56, 19, 17, 30, 1, 51, 9, 35, 7, 36, 27, 31, 23, 24, 37, 44, 26, 25, 32, 28, 33, 20, 2, 12, 48, 13]
# Fixed spoke order (row positions of the tag table) for the bloc radar chart.
idx_sorted = [0, 13, 48, 12, 2, 20, 33, 28, 32, 25, 26, 44, 31, 27, 23, 24, 37, 1, 51, 35, 9, 30, 17, 19, 56, 22, 54, 10, 36, 7, 11, 29, 38, 46, 8, 34, 16, 18, 47, 55, 43, 15, 5, 40, 58, 45, 52, 60, 42, 59, 41, 53, 14, 3, 4, 21, 39, 50, 57, 49, 6]
# One line per key of bloc_name; saved as radar_bloc.pdf under FIGURES_DIR.
tag_means_per_group = radar_plot(bloc_name.keys(), idx_sorted=idx_sorted, figname='radar_bloc.pdf', str_map=bloc_name, color_map=bloc_color, group_tag_means=False)

### t-SNE Visualization

In [None]:
# t-SNE is provided by scikit-learn's manifold module
from sklearn.manifold import TSNE

# Embed each model's tag-mean vector into two dimensions (fixed seed)
tsne = TSNE(n_components=2, perplexity=10, random_state=42)
tag_means_tsne = tsne.fit_transform(tag_means.values)

plt.figure(figsize=(10, 10))

# One row per model: embedding coordinates plus metadata parsed from the name.
# The text after the last '-' is the language; everything before it,
# lower-cased, is the key looked up in model_org.
df = pd.DataFrame(tag_means_tsne, columns=['t-SNE Dimension 1', 't-SNE Dimension 2'])
df['model'] = tag_means.index
df['model_root'] = [name.rsplit('-', 1)[0].lower() for name in df['model']]
df['org'] = [model_org.get(root, 'Unknown') for root in df['model_root']]
df['language'] = [name.rsplit('-', 1)[-1] for name in df['model']]

# Colour encodes the language, marker shape the organisation
sns.scatterplot(
    data=df,
    x='t-SNE Dimension 1',
    y='t-SNE Dimension 2',
    hue='language',
    style='org',
    palette=language_colors,
    markers=model_org_shape,
    s=200,
    edgecolor='black',
)

# Write the model name next to its point
for i, model in enumerate(tag_means.index):
    plt.annotate(model, xy=tuple(tag_means_tsne[i]), fontsize=12)

plt.title('t-SNE Visualization of Models Based on Tag Means')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True)
plt.show()

### Hierarchical Clustering and Dendrogram

In [None]:
# # Import linkage and dendrogram from scipy
# from scipy.cluster.hierarchy import dendrogram, linkage
# 
# # Perform hierarchical clustering
# Z = linkage(tag_means.T)
# 
# # Plot the dendrogram
# plt.figure(figsize=(10, 15))
# dendrogram(Z, labels=[cat_id_to_title[cat_id] for cat_id in tag_means.columns], orientation='right')
# plt.title('Dendrogram of Tags Based on Model Scores')
# plt.xlabel('Distance')
# plt.ylabel('Tag')
# plt.tight_layout()
# plt.show()