In [1]:
import os
import pandas as pd
import re

def extract_llm_answer(value):
    """Extract the answer letter from the string form of a TextBlock reply.

    The expected input looks like ``[TextBlock(text='B)About $3k', type='text')]``:
    the reply text must begin with a single letter A-D (either case) that is
    immediately followed by ``)``.

    Args:
        value: The stringified model reply. Non-string values (for example
            ``None`` or a float NaN) are tolerated.

    Returns:
        The answer letter normalised to upper case ("A".."D"), or "" when
        ``value`` is not a string or does not match the expected pattern.
    """
    # re.search would raise TypeError on non-string input; treat it as "no answer".
    if not isinstance(value, str):
        return ""

    match = re.search(r"TextBlock\(text=['\"]([A-Da-d])\)", value)
    if match is None:
        return ""

    # The pattern deliberately accepts lower-case letters; normalise them so an
    # exact comparison against an upper-case reference answer can succeed.
    return match.group(1).upper()

input_path = "sonnet35/"
output_path = "sonnet35/accuracy/"

os.makedirs(output_path, exist_ok=True)

# Score every result file found in input_path and write a copy with two extra
# columns (llm_answer, accuracy) into output_path.
# os.listdir returns entries in arbitrary order, so sort for a reproducible log.
for file_name in sorted(os.listdir(input_path)):
    if not file_name.endswith(".csv"):
        continue
    try:
        file_path = os.path.join(input_path, file_name)

        # The inputs carry a .csv extension but are read as tab-delimited.
        # NOTE(review): on_bad_lines="skip" drops malformed rows silently, so
        # those rows are excluded from the accuracy computed for this file.
        df = pd.read_csv(file_path, sep='\t', on_bad_lines="skip")

        # Normalize column names (strip spaces and lowercase)
        df.columns = df.columns.str.strip().str.lower()

        # Both the reference answer and the raw model reply are required.
        if 'answer' not in df.columns or 'llm' not in df.columns:
            print(f"Skipping {file_name}: Missing required columns.")
            continue

        # Answer letter parsed out of the raw model reply.
        df['llm_answer'] = df['llm'].apply(extract_llm_answer)

        # 1 when the parsed letter equals the reference answer, otherwise 0.
        df['accuracy'] = (df['llm_answer'] == df['answer']).astype(int)

        # splitext removes only the final extension, so a name such as
        # "a.b.csv" keeps its full stem instead of being cut at the first dot.
        new_file_name = os.path.splitext(file_name)[0] + "_accuracy.csv"
        new_file_path = os.path.join(output_path, new_file_name)

        # Written with to_csv defaults (comma-separated), without the index.
        df.to_csv(new_file_path, index=False)

        print(f"Processed: {file_name}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {file_name}: {e}")

Processed: orm.csv
Processed: xho.csv
Processed: sot.csv
Processed: swa.csv
Processed: sna.csv
Processed: fra.csv
Processed: zul.csv
Processed: lin.csv
Processed: ibo.csv
Processed: twi.csv
Processed: amh.csv
Processed: eng.csv
Processed: ewe.csv
Processed: wol.csv
Processed: hau.csv
Processed: yor.csv
Processed: lug.csv
Processed: kin.csv


In [3]:
# Build one combined table from all per-language accuracy files.

# Only files carrying the '_accuracy.csv' suffix are considered.
accuracy_files = list(
    filter(lambda name: name.endswith('_accuracy.csv'), os.listdir(output_path))
)

# One DataFrame per file that loaded successfully.
dfs = []

for accuracy_file in accuracy_files:
    file_path = os.path.join(output_path, accuracy_file)
    try:
        # Tag every row with the language name, i.e. the file name with the
        # '_accuracy.csv' suffix removed.
        df = pd.read_csv(file_path).assign(
            language=accuracy_file.replace('_accuracy.csv', '')
        )
    except Exception as e:
        # A file that cannot be loaded is reported and left out of the result.
        print(f"Error loading {accuracy_file}: {e}")
    else:
        dfs.append(df)

# Stack the per-file frames and renumber the rows from zero.
combined_df = pd.concat(dfs, ignore_index=True)

# Show the combined table as the cell output.
combined_df

Unnamed: 0,question,choices,answer,subject,llm,prompt,llm_answer,accuracy,language
0,በ 24 = 2p ውስጥ የ p ዋጋ ስንት ነው,"['p = 5', 'p = 9', 'p = 13', 'p = 25']",C,elementary_mathematics,"[TextBlock(text='C)p = 13', type='text')]","[{'role': 'user', 'content': [{'type': 'text',...",C,1,amh
1,ወይዘሮ ፔሬዝ በ5 ቀናት ውስጥ በአጠቃላይ 40 ማይል ነድታለች። በየቀኑ ...,"['5', '7', '8', '9']",C,elementary_mathematics,[TextBlock(text='C) 8\n\nTo find the daily mil...,"[{'role': 'user', 'content': [{'type': 'text',...",C,1,amh
2,የ −40 ÷ (−8) ድርሻ ፈልጉ,"['1 ሲካፈል 5', '-5', '−1 ሲካፈል 5', '5']",D,elementary_mathematics,[TextBlock(text='B) -5\n\nThe correct answer i...,"[{'role': 'user', 'content': [{'type': 'text',...",B,0,amh
3,የእግር ኳስ ቡድን የእግር ኳስ ኳሶችን ለመግዛት $90.00 አለው። አንድ...,"['4', '5', '6', '7']",B,elementary_mathematics,"[TextBlock(text=""B) 5\n\nTo find the maximum n...","[{'role': 'user', 'content': [{'type': 'text',...",B,1,amh
4,አንተ እና ሶስት ጓደኞች ወደ ኮንሰርት ሄዳችሁ። የአራት ትኬቶች አጠቃላይ...,"['4t = 112; $449', '4t = 112; $29', 't over 4 ...",B,elementary_mathematics,[TextBlock(text='B) 4t = 112; $28\n\nThe equat...,"[{'role': 'user', 'content': [{'type': 'text',...",B,1,amh
...,...,...,...,...,...,...,...,...,...
8995,Ho fokotswa ha meputso e fokolang tlhahisong y...,['sehlahiswa se tlwaelehileng sa mosebetsi se ...,D,high_school_microeconomics,[TextBlock(text='B) Kakaretso ya ho etsa sehla...,"[{'role': 'user', 'content': [{'type': 'text',...",B,0,sot
8996,Seo o se telang ho tswellisa tsela e nngwe e b...,"['tjhelete.', 'lefatshe', 'ditjeho tsa tjhelet...",D,high_school_microeconomics,"[TextBlock(text='B) lefatshe', type='text')]","[{'role': 'user', 'content': [{'type': 'text',...",B,0,sot
8997,Ke efe ho tse latelang e senang tshobotsi ya i...,"['Ho kena mahala indastering.', 'Product diffe...",B,high_school_microeconomics,[TextBlock(text='B) Product differentiation\n\...,"[{'role': 'user', 'content': [{'type': 'text',...",B,1,sot
8998,E amana le mmaraka wa basebetsi o nang le more...,"['e lefa hanyane mme e hira haholo', 'e lefa h...",D,high_school_microeconomics,[TextBlock(text='B) e lefa hanyane ebe e hira ...,"[{'role': 'user', 'content': [{'type': 'text',...",B,0,sot


In [4]:
# Average the per-row 'accuracy' values within each language and keep the
# result as a one-column table indexed by language.
grouped_mean_accuracy = (
    combined_df
    .groupby('language')['accuracy']
    .mean()
    .to_frame('mean_accuracy')
)
grouped_mean_accuracy

Unnamed: 0_level_0,mean_accuracy
language,Unnamed: 1_level_1
amh,0.628
eng,0.748
ewe,0.37
fra,0.712
hau,0.48
ibo,0.498
kin,0.496
lin,0.48
lug,0.456
orm,0.518


![image](../paper_results.png)

In [5]:
# Expected accuracy, in percent, keyed by language code.
# NOTE(review): presumably transcribed from the paper results image embedded
# in this notebook — confirm against the source.
anthropic_results = {
    'amh': 62.8,   # Amharic
    'eng': 74.8,   # English
    'ewe': 37.0,   # Ewe
    'fra': 71.2,   # French
    'hau': 48.0,   # Hausa
    'ibo': 49.8,   # Igbo
    'kin': 49.6,   # Kinyarwanda
    'lin': 48.0,   # Lingala
    'lug': 45.6,   # Luganda
    'orm': 51.8,   # Oromo
    'sna': 51.6,   # Shona
    'sot': 52.0,   # Southern Sotho
    'swa': 63.8,   # Swahili
    'twi': 39.4,   # Twi
    'wol': 35.6,   # Wolof
    'xho': 53.8,   # Xhosa
    'yor': 50.6,   # Yoruba
    'zul': 53.6    # Zulu
}

# Compare each expected value with the mean accuracy computed in this
# notebook, converted from a fraction to percent.
for language, expected_accuracy in anthropic_results.items():
    try:
        actual_accuracy = 100 * grouped_mean_accuracy.loc[language, 'mean_accuracy']
    except KeyError:
        # No computed row for this language: report it and move on.
        print(f"KeyError: '{language}' not found in the DataFrame.")
        continue

    difference = abs(actual_accuracy - expected_accuracy)

    # More than one percentage point apart is reported as a mismatch.
    if difference > 1:
        print(f"Significant difference for {language}: Expected {expected_accuracy}%, but got {actual_accuracy:.1f}%. Difference: {difference:.1f}")
    else:
        print(f"Match for {language}: {actual_accuracy:.1f}% (Difference: {difference:.1f}%)")

Match for amh: 62.8% (Difference: 0.0%)
Match for eng: 74.8% (Difference: 0.0%)
Match for ewe: 37.0% (Difference: 0.0%)
Match for fra: 71.2% (Difference: 0.0%)
Match for hau: 48.0% (Difference: 0.0%)
Match for ibo: 49.8% (Difference: 0.0%)
Match for kin: 49.6% (Difference: 0.0%)
Match for lin: 48.0% (Difference: 0.0%)
Match for lug: 45.6% (Difference: 0.0%)
Match for orm: 51.8% (Difference: 0.0%)
Match for sna: 51.6% (Difference: 0.0%)
Match for sot: 52.0% (Difference: 0.0%)
Match for swa: 63.8% (Difference: 0.0%)
Match for twi: 39.4% (Difference: 0.0%)
Match for wol: 35.6% (Difference: 0.0%)
Match for xho: 53.8% (Difference: 0.0%)
Match for yor: 50.6% (Difference: 0.0%)
Match for zul: 53.6% (Difference: 0.0%)
