# Parallel analysis for all the models on the Benchmark dataset
Test to see their different preformance under different language environment

In [1]:
# import statements
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from glob import glob

## Dataset Preparation

In [2]:
file_path = r'C:\Users\zihengfeng\CrossLingual-Benchmark-Eval\Dataset\result\Global_MMLU_Lite.xlsx'
MMLU_df = pd.read_excel(file_path)

file_path = r'C:\Users\zihengfeng\CrossLingual-Benchmark-Eval\Dataset\result\mlogiqa.xlsx'
Mlogiqa_df = pd.read_excel(file_path)

file_path = r'C:\Users\zihengfeng\CrossLingual-Benchmark-Eval\Dataset\result\PolyMath.xlsx'
PolyMath_df = pd.read_excel(file_path)

In [3]:
def process_data(df):
    columns_to_keep = ['样本ID', '一级分类', '二级分类', '三级分类', 'tag'] + \
                       [col for col in df.columns if col.startswith('标注结果-多模型打分')]
    df_filtered = df[columns_to_keep].copy() 
    
    score_mapping = {'不切题': 0, '结果不满意': 1, '基本满意': 2, '满意': 3, '超出预期': 3}
    result_columns = [col for col in df_filtered.columns if col.startswith('标注结果-多模型打分')]
    for col in result_columns:
        df_filtered.loc[:, col] = df_filtered[col].map(score_mapping)  
    
    df_filtered.rename(columns={
        '样本ID': 'SampleID',
        '一级分类': 'class1',
        '二级分类': 'class2',
        '三级分类': 'class3',
        'tag': 'tag',
        '标注结果-多模型打分（5档）-B端现网-混元-T1-latest-流式-文生文-API': 'T1_latest',
        '标注结果-多模型打分（5档）-Doubao-Seed-1.6-thinking-0715-文生文-API': 'Doubao_1.6_Thinking',
        '标注结果-多模型打分（5档）-Gemini-2.5-Pro-最长思考-文生文-API': 'Gemini_2.5_Pro',
        '标注结果-多模型打分（5档）-安平_OpenAI-o3_azure-文生文-API': 'OpenAI_o3_azure',
        '标注结果-多模型打分（5档）-通义千问-qwen3-235b-a22b慢思考-文生文-API': 'Qwen3_235b_a22b',
        '标注结果-多模型打分（5档）-安平_deepseek-R1-文生文-API': 'Deepseek_R1'
    }, inplace=True)
    
    return df_filtered

In [4]:
MMLU_df_cleaned = process_data(MMLU_df)
Mlogiqa_df_cleaned = process_data(Mlogiqa_df)

In [5]:
PolyMath_df_cleaned = process_data(PolyMath_df)

In [None]:
Mlogiqa_df_cleaned.head()

In [6]:
def process_tag_column(df):
    def extract_category_and_number(tag):
        special_char_pos = max(tag.rfind('/'), tag.rfind('_'))
        
        if special_char_pos == -1:
            category = tag
            number = None
        else:
            category = tag[:special_char_pos]  
            number = tag[special_char_pos + 1:] 
        
        return category, number

    df[['category', 'number']] = df['tag'].apply(lambda x: pd.Series(extract_category_and_number(x)))
    
    return df

In [7]:
MMLU_df_final = process_tag_column(MMLU_df_cleaned)
MMLU_df_final.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29113178,中英文评测,Global_MMLU,en,sociology/test/183,1,1,1,3,1,1,sociology/test,183
1,29113179,中英文评测,Global_MMLU,en,nutrition/test/14,3,3,3,3,3,3,nutrition/test,14
2,29113180,中英文评测,Global_MMLU,en,high_school_geography/test/61,3,3,3,3,3,3,high_school_geography/test,61
3,29113181,中英文评测,Global_MMLU,en,philosophy/test/45,3,3,3,3,3,3,philosophy/test,45
4,29113182,中英文评测,Global_MMLU,zh_cn,security_studies/test/151,3,3,3,3,3,3,security_studies/test,151


In [8]:
PolyMath_df_final = process_tag_column(PolyMath_df_cleaned)
PolyMath_df_final.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29172079,中英文评测,PolyMath,zh_cn,top_109,1,0,1,1,1,1,top,109
1,29172080,中英文评测,PolyMath,zh_cn,medium_97,1,3,3,3,1,3,medium,97
2,29172081,中英文评测,PolyMath,zh_cn,high_111,1,1,1,1,1,1,high,111
3,29172082,中英文评测,PolyMath,zh_cn,top_44,1,3,1,1,1,1,top,44
4,29172083,中英文评测,PolyMath,zh_cn,top_120,1,1,1,1,3,1,top,120


In [9]:
Mlogiqa_df_final = process_tag_column(Mlogiqa_df_cleaned)
Mlogiqa_df_final.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29139154,中英文评测,mlogiqa,zh_cn,mlogiqa_71,3,3,1,1,1,3,mlogiqa,71
1,29139155,中英文评测,mlogiqa,zh_cn,mlogiqa_2,1,1,1,1,1,1,mlogiqa,2
2,29139156,中英文评测,mlogiqa,zh_cn,mlogiqa_34,3,3,1,1,3,1,mlogiqa,34
3,29139157,中英文评测,mlogiqa,zh_cn,mlogiqa_29,3,3,3,3,3,3,mlogiqa,29
4,29139158,中英文评测,mlogiqa,zh_cn,mlogiqa_41,1,1,1,1,1,1,mlogiqa,41


## Data Analysis

In [None]:
def average_score_group(df, group = 'class3'):
    models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 
              'OpenAI_o3_azure', 'Qwen3_235b_a22b', 'Deepseek_R1']

    result = df.groupby(group)[models].mean()
    
    return result

In [None]:
def pretty_print_pandas(df):
    print(df.to_string(index=True, float_format='{:,.4f}'.format))

In [None]:
avg_MMLU = average_score_group(MMLU_df_final, group='class3')
avg_MMLU_by_tag = average_score_group(MMLU_df_final, group= ['class3', 'category'])
pretty_print_pandas(avg_MMLU)
pretty_print_pandas(avg_MMLU_by_tag)

In [None]:
avg_Mlogiqa = average_score_group(Mlogiqa_df_final, group='class3')
abg_Mlogiqa_by_tag = average_score_group(Mlogiqa_df_final, group= ['class3', 'category'])
pretty_print_pandas(avg_Mlogiqa)
pretty_print_pandas(abg_Mlogiqa_by_tag)

In [None]:
def model_with_most_points(df):
    models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 
              'OpenAI_o3_azure', 'Qwen3_235b_a22b', 'Deepseek_R1']
    
    result = {}
    
    for grade in [0, 1, 2, 3]:
        grade_results = {}
        for model in models:
            count = (df[model] == grade).sum()
            grade_results[model] = count
        most_model = max(grade_results, key=grade_results.get)
        result[grade] = most_model
    
    return result


def most_frequent_model_per_grade(df):
    models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 
              'OpenAI_o3_azure', 'Qwen3_235b_a22b', 'Deepseek_R1']
    
    result = {}
    
    grouped = df.groupby(['class3', 'category'])
    
    for (class3, category), group in grouped:
        grade_counts = {0: {}, 1: {}, 2: {}, 3: {}}
        
        for grade in [0, 1, 2, 3]:
            for model in models:
                count = (group[model] == grade).sum()
                grade_counts[grade][model] = count
        result[(class3, category)] = {
            grade: max(grade_counts[grade], key=grade_counts[grade].get) for grade in grade_counts
        }
    
    return result

def most_frequent_model_per_class(df):
    models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 
              'OpenAI_o3_azure', 'Qwen3_235b_a22b', 'Deepseek_R1']
    
    result = {}
    
    grouped = df.groupby('class3')
    
    for (class3), group in grouped:
        grade_counts = {0: {}, 1: {}, 2: {}, 3: {}}
        
        for grade in [0, 1, 2, 3]:
            for model in models:
                count = (group[model] == grade).sum()
                grade_counts[grade][model] = count
        result[(class3)] = {
            grade: max(grade_counts[grade], key=grade_counts[grade].get) for grade in grade_counts
        }
    
    return result

In [10]:
def convert_to_percentag(df):
    models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 'OpenAI_o3_azure', 
              'Qwen3_235b_a22b', 'Deepseek_R1']

    df[models] = df[models] / 3 * 100
    return df

convert_to_percentag(PolyMath_df_final)

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29172079,中英文评测,PolyMath,zh_cn,top_109,33.333333,0.0,33.333333,33.333333,33.333333,33.333333,top,109
1,29172080,中英文评测,PolyMath,zh_cn,medium_97,33.333333,100.0,100.0,100.0,33.333333,100.0,medium,97
2,29172081,中英文评测,PolyMath,zh_cn,high_111,33.333333,33.333333,33.333333,33.333333,33.333333,33.333333,high,111
3,29172082,中英文评测,PolyMath,zh_cn,top_44,33.333333,100.0,33.333333,33.333333,33.333333,33.333333,top,44
4,29172083,中英文评测,PolyMath,zh_cn,top_120,33.333333,33.333333,33.333333,33.333333,100.0,33.333333,top,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...
971,29173050,中英文评测,PolyMath,en,top_50,100.0,100.0,100.0,100.0,33.333333,100.0,top,50
972,29173051,中英文评测,PolyMath,zh_cn,low_95,100.0,100.0,100.0,100.0,100.0,100.0,low,95
973,29173052,中英文评测,PolyMath,en,top_85,33.333333,0.0,33.333333,33.333333,100.0,100.0,top,85
974,29173053,中英文评测,PolyMath,en,low_41,100.0,100.0,100.0,100.0,100.0,100.0,low,41


In [32]:
convert_to_percentag(Mlogiqa_df_final)
convert_to_percentag(MMLU_df_final)

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29113178,中英文评测,Global_MMLU,en,sociology/test/183,1111.111111,1111.111111,1111.111111,3333.333333,1111.111111,1111.111111,sociology/test,183
1,29113179,中英文评测,Global_MMLU,en,nutrition/test/14,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,nutrition/test,14
2,29113180,中英文评测,Global_MMLU,en,high_school_geography/test/61,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,high_school_geography/test,61
3,29113181,中英文评测,Global_MMLU,en,philosophy/test/45,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,philosophy/test,45
4,29113182,中英文评测,Global_MMLU,zh_cn,security_studies/test/151,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,security_studies/test,151
...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,29113970,中英文评测,Global_MMLU,zh_cn,human_aging/test/196,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,human_aging/test,196
793,29113971,中英文评测,Global_MMLU,en,human_sexuality/test/13,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,human_sexuality/test,13
794,29113972,中英文评测,Global_MMLU,zh_cn,high_school_world_history/test/87,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,high_school_world_history/test,87
795,29113973,中英文评测,Global_MMLU,en,virology/test/142,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,3333.333333,virology/test,142


## File save

In [None]:
def save_model_with_most_points(df, file_path='model_with_most_points.txt'):
    model_most_points_class3 = model_with_most_points(df)
    most_frequent_model_class3_category = most_frequent_model_per_grade(df)
    most_frequent_model_per_class3 = most_frequent_model_per_class(df)
    
    with open(file_path, 'w') as file:
        file.write("Model with Most Points by class3:\n")
        for grade, model in model_most_points_class3.items():
            file.write(f"Grade {grade}: {model}\n")
        
        file.write("\n") 
        file.write("Most Frequent Model Per Grade by class3:\n")
        for class3, model in most_frequent_model_per_class3.items():
            file.write(f"Class3: {class3}, Most Frequent Model: {model}\n")

        file.write("\n") 
        file.write("Most Frequent Model Per Grade by class3 and category:\n")
        for (class3, category), model in most_frequent_model_class3_category.items():
            file.write(f"Class3: {class3}, Category: {category}, Most Frequent Model: {model}\n")
    
    print(f"Model with Most Points has been saved to {file_path}")

In [None]:
def save_avg_MMLU_by_tag(file_path='avg_grade.txt'):
    avg_Mlogiqa_class3 = average_score_group(MMLU_df_final, group='class3')
    avg_Mlogiqa_by_tag = average_score_group(MMLU_df_final, group=['class3', 'category'])
    avg_Mlogiqa_class3 = average_score_group(Mlogiqa_df_final, group='class3')

    with open(file_path, 'w') as file:
        file.write("Average grade for MMLU by class3:\n")
        file.write(avg_Mlogiqa_class3.to_string())
        
        file.write("\n")  
        file.write("Average grade for MMLU by class3 and category:\n")
        file.write(avg_Mlogiqa_by_tag.to_string())

        file.write("\n")  
        file.write("Average grade for Mlogiqa by class3:\n")
        file.write(avg_Mlogiqa_class3.to_string())
    
    print(f"Average grade by tag has been saved to {file_path}")

save_avg_MMLU_by_tag(file_path='avg_grade.txt')

In [None]:
def save_average_score_group(df, file_path='average_score_group.csv', file_format='csv'):
    if file_format == 'csv':
        df.to_csv(file_path, index=True) 
        print(f"Average scores saved to {file_path}")
    elif file_format == 'xlsx':
        df.to_excel(file_path, index=True) 
        print(f"Average scores saved to {file_path}")

def save_class(df, file_path='most_frequent_model_per_grade.csv', file_format='csv'):
    flattened_data = []
    for key, value in df.items():
        class3 = key
        for grade, model in value.items():
            flattened_data.append([class3, grade, model])
    
    result = pd.DataFrame(flattened_data, columns=['class3', 'grade', 'Most Frequent Model'])
    
    if file_format == 'csv':
        result.to_csv(file_path, index=False)
        print(f"Most frequent model per grade saved to {file_path}")
    elif file_format == 'xlsx':
        result.to_excel(file_path, index=False)
        print(f"Most frequent model per grade saved to {file_path}")

def save_new(df, file_path='most_frequent_model_per_grade.csv', file_format='csv'):
    flattened_data = []
    for key, value in df.items():
        class3, category = key
        for grade, model in value.items():
            flattened_data.append([class3, category, grade, model])
    
    # Create a DataFrame from the flattened data
    result = pd.DataFrame(flattened_data, columns=['class3', 'category', 'grade', 'Most Frequent Model'])
    
    # Save the DataFrame to CSV or Excel
    if file_format == 'csv':
        result.to_csv(file_path, index=False)
        print(f"Most frequent model per grade saved to {file_path}")
    elif file_format == 'xlsx':
        result.to_excel(file_path, index=False)
        print(f"Most frequent model per grade saved to {file_path}")

In [None]:
save_average_score_group(average_score_group(PolyMath_df_final, group='class3'), 'Analysis_report/average_score_by_class3(PolyMath).xlsx', 'xlsx')
save_average_score_group(average_score_group(PolyMath_df_final, group=['class3', 'category']), 'Analysis_report/average_score_by_tag(PolyMath).xlsx', 'xlsx')

In [None]:
# save_new(most_frequent_model_per_grade(Mlogiqa_df_final), 'Analysis_report/most_frequent_model_by_class3(Mlogiqa).xlsx', 'xlsx')
# save_most_frequent_model_per_grade(most_frequent_model_per_grade(MMLU_df_final), 'Analysis_report/most_frequent_model_by_class3(MMLU).xlsx', 'xlsx')
save_class(most_frequent_model_per_class(PolyMath_df_final), 'Analysis_report/most_frequent_model_by_class(PolyMath).xlsx', 'xlsx')
save_new(most_frequent_model_per_grade(PolyMath_df_final), 'Analysis_report/most_frequent_model_by_tag(PolyMath).xlsx', 'xlsx')

In [None]:
most_frequent_model_per_grade(Mlogiqa_df_final)

## A/B Test

In [18]:
# import statements
from scipy.stats import ttest_rel

### Which Models Have Significant Differences Across Languages?

Null Hypothesis (H0): The performance of each model is the same in both en and zh_cn.

Alternative Hypothesis (H1): The performance of each model is significantly different in en and zh_cn.

idea: perform paired t-tests for each model individually. Then rank the model by their p-values or the magnitude of their performance difference. Calculate the **effect size (Cohen's d)** for each model to quantify the size of the difference. The larger the effect size, the stronger the difference.

If p < 0.05, we reject the null hypothesis and conclude that there is a significant difference in how the model performs in en vs. zh_cn.


In [36]:
def compare_models_across_languages(df, models):
    p_values = {}
    t_statistics = {}
    effect_sizes = {}

    for model in models:
        grouped = df.groupby('class3')[model].mean()

        if 'en' in grouped.index and 'zh_cn' in grouped.index:
            en_scores = df[df['class3'] == 'en'][[model, 'tag']].dropna()
            zh_scores = df[df['class3'] == 'zh_cn'][[model, 'tag']].dropna()
            merged_scores = pd.merge(en_scores, zh_scores, on='tag', suffixes=('_en', '_zh'))

            merged_scores[f'{model}_en'] = pd.to_numeric(merged_scores[f'{model}_en'], errors='coerce')
            merged_scores[f'{model}_zh'] = pd.to_numeric(merged_scores[f'{model}_zh'], errors='coerce')
            merged_scores.dropna(subset=[f'{model}_en', f'{model}_zh'], inplace=True)

            if len(merged_scores) > 0:
                t_stat, p_val = ttest_rel(merged_scores[f'{model}_en'], merged_scores[f'{model}_zh'])

                mean_diff = np.mean(merged_scores[f'{model}_en']) - np.mean(merged_scores[f'{model}_zh'])
                pooled_std = np.sqrt(((np.std(merged_scores[f'{model}_en']) ** 2) + (np.std(merged_scores[f'{model}_zh']) ** 2)) / 2)
                cohens_d = mean_diff / pooled_std

                p_values[model] = p_val
                t_statistics[model] = t_stat
                effect_sizes[model] = cohens_d
            else:
                print(f"Skipping model {model}: No common questions between en and zh_cn")

    results_df = pd.DataFrame({
        'Model': models,
        'p-value': [p_values.get(model, None) for model in models],
        't-statistic': [t_statistics.get(model, None) for model in models],
        'Effect Size (Cohen’s d)': [effect_sizes.get(model, None) for model in models]
    })

    print("Model Performance Comparison Across Languages (en vs zh_cn):")
    print(results_df)

    return results_df

In [37]:
# MMLU LITE
models = ['T1_latest', 'Doubao_1.6_Thinking', 'Gemini_2.5_Pro', 'OpenAI_o3_azure', 
          'Qwen3_235b_a22b', 'Deepseek_R1']
result_df = compare_models_across_languages(MMLU_df_final, models)

Model Performance Comparison Across Languages (en vs zh_cn):
                 Model   p-value  t-statistic  Effect Size (Cohen’s d)
0            T1_latest  0.000012     4.442099                 0.228374
1  Doubao_1.6_Thinking  0.005925     2.766883                 0.129717
2       Gemini_2.5_Pro  0.000362     3.597669                 0.146808
3      OpenAI_o3_azure  0.004547     2.853762                 0.146377
4      Qwen3_235b_a22b  0.085821     1.722159                 0.082534
5          Deepseek_R1  0.009384     2.610516                 0.125336


In [38]:
# Mlogiqa              
result_df = compare_models_across_languages(Mlogiqa_df_final, models)

Model Performance Comparison Across Languages (en vs zh_cn):
                 Model   p-value  t-statistic  Effect Size (Cohen’s d)
0            T1_latest  0.007115    -2.764300                -0.288301
1  Doubao_1.6_Thinking  0.356447    -0.927657                -0.088932
2       Gemini_2.5_Pro  0.181376    -1.348574                -0.107833
3      OpenAI_o3_azure  0.483002    -0.704851                -0.070578
4      Qwen3_235b_a22b  0.057267    -1.929803                -0.199889
5          Deepseek_R1  0.132552    -1.520009                -0.168742


In [39]:
# PolyMath
result_df = compare_models_across_languages(PolyMath_df_final, models)

Model Performance Comparison Across Languages (en vs zh_cn):
                 Model   p-value  t-statistic  Effect Size (Cohen’s d)
0            T1_latest  0.641291     0.466190                 0.018363
1  Doubao_1.6_Thinking  0.440414    -0.772136                -0.030966
2       Gemini_2.5_Pro  0.452055     0.752608                 0.030791
3      OpenAI_o3_azure  0.687611     0.402343                 0.014587
4      Qwen3_235b_a22b  0.000298     3.643367                 0.112302
5          Deepseek_R1  0.005500     2.788841                 0.103109


### How Does T1_latest Perform on the Same Question Across Languages?
Null Hypothesis (H0): T1_latest performs the same when answering the same question in both en and zh_cn.

Alternative Hypothesis (H1): T1_latest performs differently when answering the same question in en vs. zh_cn.

idea: Use **Wilcoxon signed-rank test** for each question pair. This test will evaluate whether T1_latest’s performance on the same question in en differs from its performance in zh_cn. Then calculate the magnitude of the difference (e.g., by calculating the difference in mean score between en and zh_cn for each question pair) to find **top 10**

Better to find **top 10** questions' number to see which pair show the most significant difference in language

###  Evaluating T1_latest's Performance Across Categories (in Different Languages)
Null Hypothesis (H0): T1_latest performs equally across languages for each category.

Alternative Hypothesis (H1): T1_latest performs differently across languages for each category.

Rank the categories by the magnitude of the difference in performance between en and zh_cn

### Comparing T1_latest’s Performance in Each Category Across Languages

Null Hypothesis (H0): There is no difference in how T1_latest handles the same categories in en vs. zh_cn.

Alternative Hypothesis (H1): T1_latest performs differently on the same categories in en vs. zh_cn.

### if the performance of models differs for each category (e.g., science, history, etc.) across en and zh_cn.

Hypothesis: Models perform better in certain categories in one language (e.g., en for science) and worse in others (e.g., zh_cn for literature).