# Answer preprocessing

## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.special
import matplotlib.pyplot as plt
from matplotlib import colormaps
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.special import expit
from matplotlib import rcParams, font_manager
import matplotlib.font_manager as fm
from pyfonts import load_font
from sklearn.decomposition import PCA
from itertools import combinations
from dotenv import load_dotenv

# Populate the process environment via python-dotenv
load_dotenv()

# Directory locations read from the environment. No fallback value is given,
# so each of these is None when the corresponding variable is not set.
RESULTS_DIR = os.environ.get('RESULTS_DIR')
RESULTS_ALL_BEST_DIR = os.environ.get('RESULTS_ALL_BEST_DIR')
FIGURES_DIR = os.environ.get('FIGURES_DIR')

# Font Awesome font file locations (likewise None when unset)
FONT_AWESOME_SOLID = os.environ.get('FONT_AWESOME_SOLID')
FONT_AWESOME_REGULAR = os.environ.get('FONT_AWESOME_REGULAR')

# Clean combined data
ONLY RUN ONCE!

In [None]:
# # cleaning
# answers = []
# for file in os.listdir(RESULTS_DIR):
#     if file.startswith('answers_') and file != 'answers_extracted.csv':
#         df = pd.read_csv(os.path.join(RESULTS_DIR, file))
#         df['file'] = file
#         answers.append(df)
# answers = pd.concat(answers, ignore_index=True)
# len(answers)

In [None]:
# models_to_drop = ['gemini/gemini-exp-1114']
# answers = answers[~answers['model'].isin(models_to_drop)]

In [None]:
# answers[answers['file'] == 'answers_alex.csv'].reset_index()['model'].value_counts()

In [None]:
# models_to_drop = ['together_ai/Qwen/Qwen2.5-72B-Instruct-Turbo', 'mistral/mistral-medium-latest']
# file = 'answers_extracted_alex.csv'
# answers = answers[~(answers['model'].isin(models_to_drop) & (answers['file'] == file))]

In [None]:
# models_to_drop = ['together_ai/Qwen/Qwen2.5-72B-Instruct-Turbo', 'mistral/mistral-medium-latest', 'jais', 'Vikhr-Nemo', 'together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'deepseek/deepseek-chat', '']
# file = 'answers_alex.csv'
# answers = answers[~(answers['model'].isin(models_to_drop) & (answers['file'] == file))]

In [None]:
# answers[answers.duplicated(['question_idx', 'model'], keep=False)]['file'].unique()

In [None]:
# answers = answers[~((answers['file'] == 'answers_extracted_alex.csv') & answers['stage_1_response'].isna())]

In [None]:
# Keep answers for which an answer was already extracted
# answers = answers.sort_values('file')
# answers = answers[~answers.duplicated(['question_idx', 'model'], keep='last')]

In [None]:
# answers.groupby('file')['model'].value_counts()

In [None]:
# models_to_drop = ['together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo', 'together_ai/meta-llama/Llama-3.2-3B-Instruct-Turbo', 'mistral/mistral-small-latest', 'together_ai/Qwen/Qwen2.5-7B-Instruct-Turbo']
# answers = answers[~answers['model'].isin(models_to_drop)]

In [None]:
# answers.duplicated(['question_idx', 'model']).sum()

In [None]:
# answers = answers.drop(columns=['file'])

In [None]:
# len(answers)

In [None]:
# answers.to_csv(os.path.join(RESULTS_DIR, 'answers.csv'), index=False)

# Merge metadata

In [None]:
# answers = pd.read_csv(os.path.join(RESULTS_DIR, 'answers_extracted.csv'))
# questions = pd.read_csv(os.path.join(RESULTS_DIR, 'questions.csv'))

In [None]:
# answers = pd.merge(answers, questions, on='question_idx', suffixes=('', '_dupe'))

In [None]:
# answers.model.value_counts()

In [None]:
# Adjust the 'model' column to reflect different models per language
# answers['model_original'] = answers['model']
# answers['model'] = answers['model'] + '-' + answers['language_code']

In [None]:
# Load metadata about topics (this step is optional)
# topic_metadata = pd.read_csv('../docs/topics/v2.0_people_summaries_un.csv')
# answers = pd.merge(answers, topic_metadata, left_on='topic_idx', right_on='wikidata_id')

In [None]:
# answers = answers.drop(columns=[col for col in answers.columns if col.endswith('_dupe')] + ['Unnamed: 0'], errors='ignore')

In [None]:
# answers.head()

## Data Cleaning and Preprocessing

In [None]:
# Load the answers table from 'answers_extracted_checked.csv' inside RESULTS_DIR;
# every later cell works on (and repeatedly reassigns) this `answers` DataFrame
answers = pd.read_csv(os.path.join(RESULTS_DIR, 'answers_extracted_checked.csv'))

In [None]:
# Stage one response validity.
# The cells below compare this column against the string labels 'yes', 'no'
# and 'refusal'. When the column is missing, derive it from the presence of a
# stage 1 response using the same string labels; boolean flags would never
# equal 'yes', so every row would later be treated as invalid.
if 'stage_1_response_valid' not in answers.columns:
    answers['stage_1_response_valid'] = np.where(
        answers['stage_1_response'].notna(), 'yes', 'no'
    )
answers['stage_1_response_valid'].value_counts()

In [None]:
# For every person ('name-en'), count how many rows carry each
# 'stage_1_response_valid' value
validity_by_person = answers.groupby('name-en')['stage_1_response_valid']
stage1_response_validity = validity_by_person.value_counts()

print("Unique 'stage_1_response_valid' values per person")
print(stage1_response_validity)

In [None]:
# Flatten the grouped counts into a regular table (index levels become columns)
# so the cells below can filter on 'stage_1_response_valid' and sort by 'count'
stage1_response_validity = stage1_response_validity.reset_index()

In [None]:
# People ranked by how often their stage 1 response was labelled 'no'
invalid_rows = stage1_response_validity['stage_1_response_valid'].eq('no')
stage1_response_validity.loc[invalid_rows].sort_values('count', ascending=False)

In [None]:
# People ranked by how often their stage 1 response was labelled 'refusal'
refusal_rows = stage1_response_validity['stage_1_response_valid'].eq('refusal')
stage1_response_validity.loc[refusal_rows].sort_values('count', ascending=False)

In [None]:
# People ranked by the combined number of 'no' and 'refusal' stage 1 responses
problem_rows = stage1_response_validity['stage_1_response_valid'].isin(['no', 'refusal'])
(
    stage1_response_validity.loc[problem_rows]
    .groupby('name-en')['count']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Examine specific cases where 'stage_1_response_valid' is 'unknown' for 'Alexei Navalny' in English
# unknown_stage1_responses = answers[
#     (answers['language'] == 'English') &
#     (answers['name-en'] == '50 Cent') &
#     (answers['stage_1_response_valid'] == False)
# ]['stage_1_response']
# print("Stage 1 responses for Alexei Navalny where response is 'unknown':")
# print(unknown_stage1_responses)

## Post-processing
### Label Translation

In [None]:
# Show how often each raw 'extracted' label occurs, before any normalisation
# or translation is applied in the cells below
print("Value counts of 'extracted' before cleaning:")
answers['extracted'].value_counts()

In [None]:
import ast

# Normalise 'extracted' language by language: lower-case, strip whitespace and
# remove '.', '!', quotes and newlines, then write the cleaned text back only
# where it matches one of that language's answer-scale labels.
for lc in answers['language_code'].unique():
    in_language = answers['language_code'] == lc

    # 'answer_scale' holds a Python-literal string; it must be the same for
    # every row of a language
    scales = answers.loc[in_language, 'answer_scale'].unique()
    assert len(scales) == 1, f"Multiple answer scales for {lc}: {scales}"
    allowed_labels = ast.literal_eval(scales[0])

    normalized = (
        answers.loc[in_language, 'extracted']
        .dropna()
        .str.lower()
        .str.strip()
        .str.replace(r'[.!\'"\n]+', '', regex=True)
    )
    recognized = normalized[normalized.isin(allowed_labels)]

    # Rows whose cleaned text is not on the scale keep their original value
    answers.loc[recognized.index, 'extracted'] = recognized

In [None]:
# Display value counts of 'extracted' after the per-language normalisation above
print("Value counts of 'extracted' deep cleaning:")
answers['extracted'].value_counts()

In [None]:
# Translate labels that were extracted in the prompt language (or that still
# contain a whole sentence) into the five English scale labels. Values that
# have no entry in the table are left untouched.

# Chinese
chinese_labels = {
    '非常负面': 'very negative',
    '负面': 'negative',
    '中性': 'neutral',
    '正面': 'positive',
    '非常正面': 'very positive',
}

# Russian
russian_labels = {
    'положительно': 'positive',
    'нейтрально': 'neutral',
    'отрицательно': 'negative',
    'крайне положительно': 'very positive',
    'крайне отрицательно': 'very negative',
}

# Arabic
arabic_labels = {
    'إيجابي': 'positive',
    'إيجابي جداً': 'very positive',
    'إيجابي للغاية': 'very positive',  # Means 'extremely positive'
    'محايد': 'neutral',
    'سلبي': 'negative',
    'سلبي جداً': 'very negative',
    'سلبي جدًا': 'very negative',  # Different Arabic script for 'very'
    'سلبيًا': 'negative',  # Means 'negatively'
}

# Spanish (including capitalised variants)
spanish_labels = {
    'positiva': 'positive',
    'negativa': 'negative',
    'muy positiva': 'very positive',
    'muy negativa': 'very negative',
    'Positiva': 'positive',
    'Muy positiva': 'very positive',
}

# French
french_labels = {
    'très positive': 'very positive',
    'très négative': 'very negative',
    'neutre': 'neutral',
    'négative': 'negative',
}

label_translations = {
    **chinese_labels,
    **russian_labels,
    **arabic_labels,
    **spanish_labels,
    **french_labels,
    'unknown': 'unknown',  # Keep 'unknown' as is
}

# Full model outputs that ended up in 'extracted' verbatim
additional_translations = {
    "Отношение автора текста к Отто-Эрнсту Ремеру можно оценить как 'отрицательно'.": "negative",
    "Ответ будет 'отрицательно'.": "negative",
    "Ответ может быть не однозначным, так как в тексте упоминаются как положительные, так и отрицательные стороны деятельности Джона Эдгара Гувера. Однако, учитывая, что большая часть текста посвящена критике его авторитарного стиля управления и использования экстра-законных методов, а также упоминается множество критики и споров вокруг его личности, можно предположить, что автор относится к Джону Эдгару Гуверу скорее отрицательно. \n\n### Response: отрицательно": "negative",
}
label_translations.update(additional_translations)

# Replace every known label; anything else passes through unchanged
answers['extracted'] = answers['extracted'].map(
    lambda label: label_translations.get(label, label)
)

In [None]:
# Numeric score of each scale label, evenly spaced between 0 and 1
score_translations = {
    'very negative': 0.0,
    'negative': 0.25,
    'neutral': 0.5,
    'positive': 0.75,
    'very positive': 1.0,
}


def _label_to_score(label):
    # Anything that is not one of the five labels gets the sentinel 'unknown'
    return score_translations.get(label, 'unknown')


# The 'score' column therefore mixes floats with the string 'unknown' until
# the unknown rows are filtered out later on
answers['score'] = answers['extracted'].map(_label_to_score)

In [None]:
# Display value counts of 'extracted' after cleaning and translation.
# The message names the column that is actually printed ('extracted'); no
# 'extracted_clean' column is created anywhere in this notebook section.
print("Value counts of 'extracted' after cleaning:")
print(answers['extracted'].value_counts())

### Handling Duplicates

In [None]:
# Flag every row that is followed by a later row with the same
# (model, question_idx) pair, i.e. everything except the last occurrence
dedup_keys = ['model', 'question_idx']
duplicates = answers.duplicated(subset=dedup_keys, keep='last')
duplicate_topics = answers.loc[duplicates, 'name-en'].unique()
num_duplicates = duplicates.sum()

print(f"Filtered out {num_duplicates} duplicates (keeping the last), for the topics {duplicate_topics}")

# NOTE: the removal itself is disabled below, so `answers` still contains the
# flagged rows after this cell
# answers[~duplicates].reset_index(drop=True)

# Examples

In [None]:
# List the distinct values of the 'model' column
answers['model'].unique()

In [None]:
# Print selected columns of the first answer about one person whose extracted
# label is 'unknown' (YandexGPT excluded), for the chosen prompt language.
topic_to_test = 'Muhammad Zia-ul-Haq'
# model_to_test = 'anthropic/claude-3-5-sonnet-20241022-en'
# model_to_test = 'gemini/gemini-pro-en'
# model_to_test = 'mistral/mistral-large-latest'
language_code = 'en'
# cols_to_viz = ['model', 'stage_1', 'stage_1_response', 'stage_1_response_valid', 'stage_3', 'stage_3_response', 'extracted']
cols_to_viz = ['model', 'stage_3', 'stage_3_response', 'extracted']

# The row filter does not depend on the column being printed, so build it once
example_mask = (
    (answers['name-en'] == topic_to_test)
    & ~(answers['model'].isin(['YandexGPT']))
    & (answers['language_code'] == language_code)
    # & (answers['stage_1_response_valid'] == 'refusal')
    # & ~(answers['stage_3_response'].isin(['very negative', 'negative', 'neutral', 'positive', 'very positive']))
    & (answers['extracted'] == 'unknown')
)
examples = answers.loc[example_mask, cols_to_viz]

if examples.empty:
    # Previously an empty selection crashed with IndexError on `.values[0]`
    print(f"No matching answers for {topic_to_test!r} ({language_code}).")
else:
    for col in cols_to_viz:
        print(f"{col}: {examples[col].values[0]}")

In [None]:
# Per person: number of English answers whose extracted label is 'unknown',
# leaving out the models 'YandexGPT' and 'teuken'
english_unknown = (
    (answers['language_code'] == 'en')
    & (answers['extracted'] == 'unknown')
    & ~(answers['model'].isin(['YandexGPT', 'teuken']))
)
answers.loc[english_unknown, 'name-en'].value_counts()

## Refusal analysis

In [None]:
# People for whom a model whose name starts with 'gemini' has no stage 1
# response at all (NaN in 'stage_1_response')
answers[(answers['stage_1_response'].isna()) & (answers['model'].str.startswith('gemini'))]['name-en'].unique()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.font_manager as fm

# Register the two Font Awesome font files stored under FIGURES_DIR/fonts with
# matplotlib's font manager so their glyphs can be used in plot labels.

# Load non solid fonts
non_solid_font_path = f'{FIGURES_DIR}/fonts/FontAwesome6NonSolid.otf'
non_solid_font_properties = FontProperties(fname=non_solid_font_path)
fm.fontManager.addfont(non_solid_font_properties.get_file())

# Load solid fonts
solid_font_path = f'{FIGURES_DIR}/fonts/FontAwesome6Solid.otf'
solid_font_properties = FontProperties(fname=solid_font_path)
fm.fontManager.addfont(solid_font_properties.get_file())

# Rename font properties
# fm.fontManager.ttflist[-2].name = 'Non Solid FA'
# fm.fontManager.ttflist[-1].name = 'Solid FA'

# Font family list used for all text: DejaVu Sans first, then the two Font
# Awesome families (presumably the family names embedded in the .otf files
# registered above - confirm)
# plt.rcParams['font.family'] = ['DejaVu Sans', 'Non Solid FA','Solid FA']
plt.rcParams['font.family'] = ['DejaVu Sans', 'Font Awesome 6 Free Regular', 'Font Awesome 6 Free']

In [None]:
# Map category IDs to short display titles used as heatmap row labels.
# Each key is later embedded in a column name of the form
# "categories.<key>.result". Every title ends in the private-use code point
# \uf164 or \uf165 (presumably the Font Awesome thumbs-up / thumbs-down
# glyphs rendered via the fonts registered earlier - confirm).
# NOTE(review): 203 "Constitutionalism: Positive" is titled with \uf165 and
# 204 "Constitutionalism: Negative" with \uf164 because the title is phrased
# as "Constitutional Reform"; verify this inversion is intended.
cat_id_to_title = {
    '103_Anti-Imperialism': 'Anti-Imperialism \uf164',
    '104_Military: Positive': 'Military \uf164',
    '105_Military: Negative': 'Military \uf165',
    '106_Peace': 'Peace \uf164',
    '107_Internationalism: Positive': 'Internationalism \uf164',
    '108_European Community/Union: Positive': 'European Union \uf164',
    '108_a_United States: Positive': 'United States \uf164',
    '108_b_Russia/USSR/CIS: Positive': 'Russia/USSR \uf164',
    '108_c_China/PRC: Positive': 'China (PRC) \uf164',
    '109_Internationalism: Negative': 'Internationalism \uf165',
    '110_European Community/Union: Negative': 'European Union \uf165',
    '110_a_United States: Negative': 'United States \uf165',
    '110_b_Russia/USSR/CIS: Negative': 'Russia/USSR \uf165',
    '110_c_China/PRC: Negative': 'China (PRC) \uf165',
    '201_Freedom and Human Rights': 'Freedom & Human Rights \uf164',
    '202_Democracy': 'Democracy \uf164',
    '203_Constitutionalism: Positive': 'Constitutional Reform \uf165',
    '204_Constitutionalism: Negative': 'Constitutional Reform \uf164',
    '301_Federalism': 'Federalism \uf164',
    '302_Centralisation': 'Centralisation \uf164',
    '303_Governmental and Administrative Efficiency': 'Efficient Governance \uf164',
    '304_a_Against Political Corruption': 'Anti-Corruption \uf164',
    '304_b_Involved in Political Corruption': 'Involved in Corruption \uf164',
    '305_Political Authority': 'Political Authority \uf164',
    '401_Free Market Economy': 'Free Market \uf164',
    '402_Incentives': 'Supply-side Economics \uf164',
    '403_Market Regulation': 'Market Regulation \uf164',
    '404_Economic Planning': 'Economic Planning \uf164',
    '405_Corporatism/ Mixed Economy': 'Mixed Economy \uf164',
    '406_Protectionism: Positive': 'Protectionism \uf164',
    '407_Protectionism: Negative': 'Protectionism \uf165',
    '408_Economic Goals': 'Economic Goals \uf164',
    '409_Keynesian Demand Management': 'Demand-side Economics \uf164',
    '410_Economic Growth: Positive': 'Economic Growth \uf164',
    '411_Technology and Infrastructure': 'Tech & Infrastructure \uf164',
    '412_Controlled Economy': 'Economic Control \uf164',
    '413_Nationalisation': 'Nationalisation \uf164',
    '414_Economic Orthodoxy': 'Economic Orthodoxy \uf164',
    '415_Marxist Analysis: Positive': 'Marxism \uf164',
    '416_Anti-Growth Economy: Positive': 'Anti-Growth \uf164',
    '501_Environmental Protection: Positive': 'Environmentalism \uf164',
    '502_Culture: Positive': 'Culture \uf164',
    '503_Equality: Positive': 'Equality \uf164',
    '504_Welfare State Expansion': 'Welfare State \uf164',
    '505_Welfare State Limitation': 'Welfare State \uf165',
    '506_Education Expansion': 'State-funded Education \uf164',
    '507_Education Limitation': 'State-funded Education \uf165',
    '601_National Way of Life: Positive': 'National Way of Life \uf164',
    '602_National Way of Life: Negative': 'National Way of Life \uf165',
    '603_Traditional Morality: Positive': 'Traditional Morality \uf164',
    '604_Traditional Morality: Negative': 'Traditional Morality \uf165',
    '605_Law and Order: Positive': 'Law & Order \uf164',
    '606_Civic Mindedness: Positive': 'Civic Mindedness \uf164',
    '607_Multiculturalism: Positive': 'Multiculturalism \uf164',
    '608_Multiculturalism: Negative': 'Multiculturalism \uf165',
    '701_Labour Groups: Positive': 'Worker Rights \uf164',
    '702_Labour Groups: Negative': 'Worker Rights \uf165',
    '703_Agriculture and Farmers: Positive': 'Agriculture & Farmers \uf164',
    '704_Middle Class and Professional Groups': 'Professionals \uf164',
    '705_Underprivileged Minority Groups': 'Minority Groups \uf164',
    '706_Non-economic Demographic Groups': 'Demographic Groups \uf164'
}

# Load the per-person tag table
tags = pd.read_csv(os.path.join(RESULTS_DIR, 'tags_clean.csv'))

# Cast both join keys to str so the merge compares like with like
answers['name-en'] = answers['name-en'].astype(str)
tags['name'] = tags['name'].astype(str)

# Left join: every answer is kept, tag columns are NaN where no tag row matches
answers_tagged = pd.merge(
    answers,
    tags,
    how='left',
    left_on='name-en',
    right_on='name',
)

In [None]:
# Series of ones indexed by every model name; multiplying a per-model Series
# by it aligns that Series onto the complete list of models
model_names = answers_tagged['model'].unique()
all_models = pd.Series(np.ones(len(model_names)), index=model_names)
all_models

In [None]:
from constants import model_abbreviations

# This row flag does not depend on the tag, so compute it once
is_refusal = answers_tagged['stage_1_response_valid'] == 'refusal'

# Build one small table per tag: model, share of that tag's refusals, tag title
refusal_freq_dfs = []
for tag_id, tag_name in cat_id_to_title.items():
    tag_column = f"categories.{tag_id}.result"
    # Elementwise comparison keeps NaN (unmatched merge rows) out of the selection
    has_tag = answers_tagged[tag_column] == True  # noqa: E712
    refusing_models = answers_tagged.loc[has_tag & is_refusal, 'model']

    # NOTE(review): normalize=True yields each model's share of the refusals
    # for this tag, not the fraction of a model's answers that were refusals.
    # Multiplying by all_models leaves NaN for models without any refusal.
    refusal_share = refusing_models.value_counts(normalize=True) * all_models

    tag_table = refusal_share.reset_index(name='count').rename(columns={'index': 'model'})
    tag_table['tag'] = tag_name
    refusal_freq_dfs.append(tag_table)

In [None]:
# Preview the stacked per-tag tables (columns: model, count, tag)
pd.concat(refusal_freq_dfs, ignore_index=True)

In [None]:
%matplotlib inline
# Stack the per-tag tables; missing shares (NaN) become 0
refusals_df = pd.concat(refusal_freq_dfs, ignore_index=True).fillna(0.0)

# Replace model identifiers by their display abbreviations
refusals_df['model'] = refusals_df['model'].str.lower().map(model_abbreviations)
# refusals_df['count'] = 1 - refusals_df['count']

# Wide layout: one row per model, one column per tag, cell = 'count'
refusals_df = refusals_df.pivot(index='model', columns='tag', values='count')

# Heatmap with tags on the Y-axis and models along the top
plt.figure(figsize=(5, 13))
sns.heatmap(
    refusals_df.T,
    cmap='inferno_r',
    cbar_kws={"shrink": 0.5, 'label': 'Refusal Frequency'},
)
heatmap_ax = plt.gca()
heatmap_ax.tick_params(top=False, bottom=False, labeltop=True, labelbottom=False)
heatmap_ax.tick_params(axis='x', rotation=90)
plt.xlabel(None)
plt.ylabel(None)

heatmap_path = os.path.join(FIGURES_DIR, 'refusal_freq_heatmap.pdf')
plt.savefig(heatmap_path, bbox_inches='tight', transparent=True)

In [None]:
# Each model's share of all answers labelled 'refusal' (shares sum to 1)
answers[answers['stage_1_response_valid'] == 'refusal']['model'].value_counts(normalize=True)

# Stats

In [None]:
# Number of distinct values in the 'model' column
answers['model'].nunique()

In [None]:
# Number of distinct topics
answers['topic_idx'].nunique()

In [None]:
# Number of distinct (model, language) combinations
(answers['model'] + '-' + answers['language_code']).nunique()

In [None]:
# Total number of answer rows
len(answers)

## Plot figures

### Data Preparation for Visualization

In [None]:
# An answer counts as 'invalid' when its stage 1 response was not judged
# valid ('yes') or when no label could be extracted ('unknown')
stage_1_not_valid = answers['stage_1_response_valid'] != 'yes'
label_unknown = answers['extracted'] == 'unknown'
answers.loc[stage_1_not_valid | label_unknown, 'extracted'] = 'invalid'

In [None]:
# Response categories in plotting order (scale order, then 'invalid')
valid_responses = [
    "very negative",
    "negative",
    "neutral",
    "positive",
    "very positive",
    "invalid",
]

# Colour used for each response category
response_colors = {
    "very negative": "chocolate",
    "negative": "darkorange",
    "neutral": "gray",
    "positive": "limegreen",
    "very positive": "darkgreen",
    "invalid": "black",
}

# One-hot encode 'extracted' into 'response_<label>' indicator columns
df_viz = pd.get_dummies(
    answers[
        [
            "question_idx",
            "model",
            "extracted",
            "stage_1_response_valid",
            "language_code",
        ]
    ],
    columns=["extracted"],
    prefix="response",
    prefix_sep="_",
)

# get_dummies only creates columns for labels that occur in the data. Add an
# all-False indicator for any category that never occurs, so the aggregation
# over every 'response_<label>' column in a later cell cannot hit a KeyError.
for label in valid_responses:
    indicator = f"response_{label}"
    if indicator not in df_viz.columns:
        df_viz[indicator] = False

# Display the prepared DataFrame
df_viz.head()

In [None]:
# Number of distinct (model, language) combinations in the plotting table
(df_viz['model'] + '-' + df_viz['language_code']).nunique()

In [None]:
from constants import model_abbreviations

# One font size for every text element in the figures
FONTSIZE = 12
plt.rcParams.update({"font.size": FONTSIZE})
plt.rc("font", size=FONTSIZE)

# Plot settings
show_percentages = True

# Group by display abbreviation and prompt language; models without an entry
# in model_abbreviations map to NaN
df_viz['model_abbr'] = df_viz['model'].str.lower().map(model_abbreviations)
groupby_colname = ["model_abbr", "language_code"]

# Mean of each indicator column = share of answers with that label
mean_per_response = {
    label: (f"response_{label}", "mean") for label in valid_responses
}
df_viz_agg = (
    df_viz.groupby(groupby_colname)
    .agg(**mean_per_response)
    .reset_index()
    .sort_values(['language_code', 'model_abbr'], ascending=True)
)

In [None]:
# Inspect the aggregated label shares per (model, language)
df_viz_agg

In [None]:
# Use matplotlib's default style for the figures below
plt.style.use('default')

In [None]:
# One figure per prompt language. Left panel: stacked label shares among valid
# responses; right panel: percentage of invalid responses; one row per model.
for language, df_language in df_viz_agg.groupby('language_code'):
    # The language column is constant within the group; index rows by model
    df_language = df_language.drop(columns=['language_code']).set_index('model_abbr')

    # Share of invalid responses, in percent
    fraction_invalid = 100 * df_language['invalid']

    # # Limit the number of models displayed if necessary
    # if len(df_language) > 100:
    #     show_every = len(df_language) // 60
    #     df_language = df_language.iloc[::show_every]
    #     fraction_invalid = fraction_invalid.iloc[::show_every]

    # Two panels sharing the Y-axis; the figure height grows with the model count
    v_size = 1 + len(df_language) * 0.7
    fig, (ax, ax2) = plt.subplots(
        nrows=1,
        ncols=2,
        sharey=True,
        width_ratios=[4, 1],
        figsize=(15, v_size),
        gridspec_kw={'wspace': 0.2},
    )

    # Label shares without 'invalid', renormalised so every row sums to 1
    df_responses = df_language.drop(columns='invalid')
    df_responses = df_responses.div(df_responses.sum(axis=1), axis=0)

    # Left panel: stacked horizontal bars
    bar_colors = [response_colors[k] for k in valid_responses if k != 'invalid']
    df_responses.plot(
        kind='barh',
        stacked=True,
        color=bar_colors,
        alpha=0.7,
        ax=ax,
        legend=False,
        fontsize=FONTSIZE,
    )

    # Row labels: look up the index label minus its last '-' segment in
    # model_abbreviations, falling back to the index label itself
    row_positions = range(len(df_responses))
    yticklabels = [
        model_abbreviations.get('-'.join(label.split('-')[:-1]), label)
        for label in df_responses.index
    ]
    ax.set_yticks(row_positions)
    ax.set_yticklabels(yticklabels, fontsize=FONTSIZE)

    # Right panel: invalid-response percentage
    fraction_invalid.plot(
        kind='barh',
        color=response_colors['invalid'],
        alpha=0.7,
        ax=ax2,
        legend=False,
        fontsize=FONTSIZE,
        edgecolor='none'
    )
    ax2.set_yticks(row_positions)
    ax2.set_yticklabels(yticklabels, fontsize=FONTSIZE)

    # X-axis of the right panel spans 0-100 %
    ax2.set_xlim(0, 100)
    ax2.set_xticks([0, 100])
    ax2.set_xticklabels(['0', '100%'], fontsize=FONTSIZE)

    # X-axis of the left panel: ticks at 0 %, 50 %, 100 %
    xticks = np.arange(0, 1.1, 0.5)
    ax.set_xticks(xticks)
    ax.set_xticklabels([f"{int(x * 100)}%" for x in xticks], fontsize=FONTSIZE)

    # Optional in-bar annotations
    if show_percentages:
        for i, (_, row) in enumerate(df_responses.iterrows()):
            # Running left edge of the current segment within the stacked bar
            sum_so_far = 0
            for value in row:
                if value > 0:
                    percentage = int(value * 100)
                    # Segments below 1 % stay unlabelled but still advance the offset
                    if percentage > 0:
                        ax.text(
                            sum_so_far + value / 2,
                            i,
                            f"{percentage}%",
                            ha='center',
                            va='center',
                            color='white',
                            fontsize=FONTSIZE - 2,
                        )
                    sum_so_far += value

            invalid_percentage = fraction_invalid.iloc[i]
            if invalid_percentage > 0:
                # One decimal above 1 %, one significant digit otherwise
                ax2.text(
                    invalid_percentage,
                    i,
                    f"{invalid_percentage:.1f}%" if invalid_percentage > 1 else f"{invalid_percentage:.1g}%",
                    ha='left',
                    va='center',
                    color='gray',
                    fontsize=FONTSIZE - 2,
                )

    # Legend above the left panel, one entry per label column
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        [label.capitalize() for label in df_responses.columns],
        loc='upper center',
        bbox_to_anchor=(0.5, 1.1),
        ncol=5,
        fontsize=FONTSIZE,
        facecolor='white',
    )

    # Axis titles; the Y-axis stays untitled
    ax.set_xlabel('Label distribution among valid responses')
    ax2.set_xlabel('% of responses that were invalid')
    ax.set_ylabel('')

    # Strip spines, tighten the gap between the panels, no grid
    sns.despine(fig=fig, ax=[ax, ax2], top=True, right=True, left=True, bottom=False)
    plt.subplots_adjust(wspace=0.05)
    ax.grid(False)
    ax2.grid(False)

    # Write the figure as PDF into FIGURES_DIR
    fname = f"response_distribution_{language}.pdf"
    plt.savefig(os.path.join(FIGURES_DIR, fname), bbox_inches='tight', transparent=True)
    print(f"Saved figure to {os.path.join(FIGURES_DIR, fname)}")

In [None]:
# Keep only answers whose stage 1 response was judged valid ('yes');
# set the flag to False to skip this filter
stage_1_validation = True
if stage_1_validation:
    nb_answers = len(answers)
    passed_stage_1 = answers['stage_1_response_valid'] == 'yes'
    answers = answers[passed_stage_1]
    dropped_pct = (nb_answers - len(answers)) * 100 / nb_answers
    print(f"Filtered out {dropped_pct:.2f}% out of {nb_answers} answers due to poor stage 1 responses.")

In [None]:
# Filter out answers with unknown scores due to poor Stage 3 responses
nb_answers = len(answers)

# .copy() gives an independent frame, so the column assignments below and in
# later cells do not write into a slice of the previous DataFrame
answers = answers[answers['score'] != 'unknown'].copy()

# With the 'unknown' sentinel gone only numeric scores remain; store them as
# floats so later means are computed on a numeric column
answers['score'] = answers['score'].astype(float)

# Guard against an empty input frame
filtered_percentage = (nb_answers - len(answers)) * 100 / nb_answers if nb_answers else 0.0
print(f"Filtered out {filtered_percentage:.6f}% of {nb_answers} answers due to poor Stage 3 responses.")

In [None]:
# Combined "<model>-<language_code>" key: each model is treated separately per
# prompt language in the frequency filters below
answers['model_lang'] = answers['model'] + '-' + answers['language_code']

In [None]:
# Inspect the filtered table
answers

In [None]:
# Number of answers available for every (model, language) combination
freq_per_model = answers.groupby('model_lang')['question_idx'].count().rename('freq')

# Threshold: one sixth of the number of distinct topics (integer division)
min_model_freq = answers['topic_idx'].nunique() // 6

# Bar chart in ascending order with the threshold drawn as a dashed red line
freq_sorted = freq_per_model.sort_values(ascending=True)
plt.figure(figsize=(12, 13))
sns.barplot(x=freq_sorted.index, y=freq_sorted.values, order=freq_sorted.index)
plt.axhline(y=min_model_freq, color='red', linestyle='--', label='Minimum Required Frequency')
plt.xticks(rotation=90)
plt.xlabel('Model')
plt.ylabel('Number of Answers')
plt.title('Number of Answers per Model')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Drop every (model, language) combination with fewer answers than the threshold
nb_models = answers['model_lang'].nunique()
has_enough_answers = freq_per_model >= min_model_freq
models_to_keep = freq_per_model.loc[has_enough_answers].index
answers = answers[answers['model_lang'].isin(models_to_keep)]

filtered_models = nb_models - answers['model_lang'].nunique()
print(f"Filtered out {filtered_models} out of {nb_models} models with less than {min_model_freq} answers.")

In [None]:
# Per prompt language: derive the minimum number of models that must have
# answered a question (half of the models, rounded up) and plot how many
# models answered each question.
min_question_freqs = {}
for lang, lang_df in answers.groupby('language_code'):
    num_models = lang_df['model'].nunique()

    # Ceiling of num_models / 2 using integer arithmetic
    min_question_freq = (num_models + 1) // 2
    min_question_freqs[lang] = min_question_freq

    # Number of answers (one per model) for every question
    freq_per_question = lang_df.groupby('question_idx')['model'].count().rename('freq')

    # Histogram with one bin per possible model count and the threshold marked
    plt.figure(figsize=(10, 8))
    sns.histplot(
        freq_per_question,
        bins=range(0, num_models + 2),
        discrete=True,
        binrange=(0, num_models)
    )
    plt.axvline(x=min_question_freq, color='red', linestyle='--', label='Minimum Required Frequency')
    plt.xticks(range(0, num_models + 1))
    plt.xlabel('Number of Models that Answered the Question')
    plt.ylabel('Number of Questions')
    plt.title(f'Distribution of Model Frequencies per Question ({lang})')
    plt.legend()
    plt.tight_layout()
    plt.show()

# # Calculate the number of unique original models
# num_model_original = answers['model'].nunique()
# # Set the minimum number of models required per question
# # min_question_freq = 8  # Alternatively, use int(np.ceil(num_model_original / 2))
# min_question_freq = int(np.ceil(num_model_original / 2))
# # Calculate the frequency of models per question
# freq_per_question = answers.groupby('topic_idx')['model'].count().rename('freq')
# 
# # Plot a histogram of the frequency of models per question
# plt.figure(figsize=(10, 8))
# sns.histplot(
#     freq_per_question,
#     bins=range(0, num_model_original + 2),
#     discrete=True,
#     binrange=(0, num_model_original)
# )
# plt.xticks(range(0, num_model_original + 1))
# plt.axvline(x=min_question_freq, color='red', linestyle='--', label='Minimum Required Frequency')
# plt.xlabel('Number of Models that Answered the Question')
# plt.ylabel('Number of Questions')
# plt.title('Distribution of Model Frequencies per Question')
# plt.legend()
# plt.tight_layout()
# plt.show()

In [None]:
# Per prompt language, drop every question answered by fewer models than that
# language's threshold, and report how many questions were removed.
num_filtered_total = 0
nb_questions_total = answers['question_idx'].nunique()
for lang, min_freq in min_question_freqs.items():
    # Compute the language subset once per iteration
    lang_answers = answers[answers['language_code'] == lang]
    nb_questions = lang_answers['question_idx'].nunique()
    freq_per_question = lang_answers.groupby('question_idx')['model'].count().rename('freq')
    questions_to_drop = freq_per_question[freq_per_question < min_freq].index

    # NOTE(review): this removes the question_idx from every language, not only
    # `lang`; that is equivalent only if question_idx values are language-specific
    # - confirm.
    answers = answers[~answers['question_idx'].isin(questions_to_drop)]

    remaining = answers.loc[answers['language_code'] == lang, 'question_idx'].nunique()
    filtered_questions = nb_questions - remaining
    filtered_percentage = filtered_questions * 100 / nb_questions if nb_questions else 0.0
    print(f"Filtered out {filtered_percentage:.2f}% of {nb_questions} questions with less than {min_freq} models ({lang}).")
    num_filtered_total += filtered_questions

# Overall share of removed questions; the message now states both the count
# and the formatted percentage instead of printing a raw percentage labelled
# as a number of questions
filtered_percentage = num_filtered_total * 100 / nb_questions_total if nb_questions_total else 0.0
print(f"Filtered out {num_filtered_total} questions ({filtered_percentage:.2f}%) in total.")
    
# Filter out questions answered by fewer than the minimum required number of models
# nb_questions = answers['question_idx'].nunique()
# questions_to_keep = freq_per_question[freq_per_question >= min_question_freq].index
# answers = answers[answers['question_idx'].isin(questions_to_keep)]
# filtered_questions = nb_questions - answers['question_idx'].nunique()
# filtered_percentage = (filtered_questions) * 100 / nb_questions
# print(f"Filtered out {filtered_percentage:.2f}% of {nb_questions} questions with less than {min_question_freq} models.")

In [None]:
# Overall percentage of questions removed by the per-language filter above
filtered_percentage

In [None]:
# Number of answer rows left after all filters
print(f"At the end, {len(answers)} answers remain")

In [None]:
# Summary of what is left: answers, models, topics, questions, prompt templates
summary_parts = [
    f"answers: {len(answers)}\n",
    f"models (en and zh): {len(answers['model'].unique())}, \n",
    f"topics (en and zh): {len(answers['topic'].unique())}, topics_idx: {len(answers['topic_idx'].unique())}\n",
    f"question_idx: {len(answers['question_idx'].unique())}\n",
    f"prompt_template_idx (0 is English, 1 Chinese): {len(answers['prompt_template_idx'].unique())}",
]
print("".join(summary_parts))

In [None]:
# Share of each extracted label per prompt language (rows: language, columns: label)
lang_pct = answers.groupby('language')['extracted'].value_counts(normalize=True)
lang_pct = lang_pct.unstack()

# Shorten the language names. regex=False makes this a literal replacement on
# every pandas version; interpreted as a regular expression the parentheses
# would form a group and ' (Simplified)' would not be matched.
lang_pct.index = lang_pct.index.str.replace(' (Simplified)', '', regex=False)
lang_pct

In [None]:
# Stacked label distribution per prompt language, with the mean score of each
# language drawn as a dashed red line across its bar.
fig, ax = plt.subplots(figsize=(10, 6))

# unstack() leaves the label columns in alphabetical order, while the colours
# follow the scale order of valid_responses. Put the columns into scale order
# so each label is drawn in its own colour and matches its legend entry;
# labels that never occur are added as all-zero columns.
scale_labels = [k for k in valid_responses if k != 'invalid']
lang_pct_ordered = lang_pct.reindex(columns=scale_labels, fill_value=0.0)

lang_pct_ordered.plot(
    kind='barh',
    stacked=True,
    color=[response_colors[k] for k in scale_labels],
    alpha=0.7,
    legend=False,
    fontsize=FONTSIZE,
    ax=ax
)

# Mean score per language; bar i is centred on y = i and spans i +/- 0.5
lang_score_mean = answers.groupby('language')['score'].mean()
bar_positions = np.arange(len(lang_score_mean))
ax.vlines(lang_score_mean, bar_positions - 0.5, bar_positions + 0.5, color='red', linestyle='--', label=None)

# Legend labels come from the columns actually plotted here rather than from
# a variable left over from an earlier plotting loop
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    [label.capitalize() for label in lang_pct_ordered.columns],
    loc='upper center',  # Position the legend at the top center
    bbox_to_anchor=(0.5, 1.1),  # Move the legend above the plot
    ncol=5,  # Number of columns for the legend
    fontsize=FONTSIZE,
    facecolor='white',
)
ax.set_xlabel('Label distribution among valid responses')
ax.set_ylabel(None)
plt.savefig(os.path.join(FIGURES_DIR, 'response_distribution_lang.pdf'), bbox_inches='tight', transparent=True)

In [None]:
# Highest mean score reached by any single person
answers.groupby('name-en')['score'].mean().max()

### Save the data

In [None]:
# Preview the table that is about to be saved
answers.head()

In [None]:
# Remove all 'summary-*' columns plus the helper columns 'Unnamed: 0' and
# 'model_lang' before saving; columns that do not exist are ignored
summary_cols = [col for col in answers.columns if col.startswith('summary-')]
cols_to_remove = summary_cols + ['Unnamed: 0', 'model_lang']
answers = answers.drop(columns=cols_to_remove, errors='ignore')

In [None]:
# Make sure the output directory exists.
# NOTE(review): the processed table is written into FIGURES_DIR; the
# commented-out line below suggests a separate 'data/processed' directory was
# once intended - confirm the target location.
# FIGURES_DIR = os.path.join(os.path.abspath(''), '..', 'data', 'processed')
os.makedirs(FIGURES_DIR, exist_ok=True)

# Construct the full file path for the output CSV file
output_file = os.path.join(FIGURES_DIR, 'answers_processed.csv')

# Save the processed 'answers' DataFrame to a CSV file without the index
answers.to_csv(output_file, index=False)

print(f"Processed 'answers' DataFrame saved to {output_file}")