# Parallel analysis for all the models on the Benchmark dataset
Test to see their different preformance under different language environment

In [1]:
# import statements
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from glob import glob

## Dataset Preparation

In [2]:
file_path = r'C:\Users\zihengfeng\CrossLingual-Benchmark-Eval\Dataset\result\Global_MMLU_Lite.xlsx'
MMLU_df = pd.read_excel(file_path)

file_path = r'C:\Users\zihengfeng\CrossLingual-Benchmark-Eval\Dataset\result\mlogiqa.xlsx'
Mlogiqa_df = pd.read_excel(file_path)

In [3]:
def process_data(df):
    columns_to_keep = ['样本ID', '一级分类', '二级分类', '三级分类', 'tag'] + \
                       [col for col in df.columns if col.startswith('标注结果-多模型打分')]
    df_filtered = df[columns_to_keep].copy() 
    
    score_mapping = {'不切题': 0, '结果不满意': 1, '基本满意': 2, '满意': 3, '超出预期': 3}
    result_columns = [col for col in df_filtered.columns if col.startswith('标注结果-多模型打分')]
    for col in result_columns:
        df_filtered.loc[:, col] = df_filtered[col].map(score_mapping)  
    
    df_filtered.rename(columns={
        '样本ID': 'SampleID',
        '一级分类': 'class1',
        '二级分类': 'class2',
        '三级分类': 'class3',
        'tag': 'tag',
        '标注结果-多模型打分（5档）-B端现网-混元-T1-latest-流式-文生文-API': 'T1_latest',
        '标注结果-多模型打分（5档）-Doubao-Seed-1.6-thinking-0715-文生文-API': 'Doubao_1.6_Thinking',
        '标注结果-多模型打分（5档）-Gemini-2.5-Pro-最长思考-文生文-API': 'Gemini_2.5_Pro',
        '标注结果-多模型打分（5档）-安平_OpenAI-o3_azure-文生文-API': 'OpenAI_o3_azure',
        '标注结果-多模型打分（5档）-通义千问-qwen3-235b-a22b慢思考-文生文-API': 'Qwen3_235b_a22b',
        '标注结果-多模型打分（5档）-安平_deepseek-R1-文生文-API': 'Deepseek_R1'
    }, inplace=True)
    
    return df_filtered

In [4]:
MMLU_df_cleaned = process_data(MMLU_df)
Mlogiqa_df_cleaned = process_data(Mlogiqa_df)

In [5]:
MMLU_df_cleaned.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1
0,29113178,中英文评测,Global_MMLU,en,sociology/test/183,1,1,1,3,1,1
1,29113179,中英文评测,Global_MMLU,en,nutrition/test/14,3,3,3,3,3,3
2,29113180,中英文评测,Global_MMLU,en,high_school_geography/test/61,3,3,3,3,3,3
3,29113181,中英文评测,Global_MMLU,en,philosophy/test/45,3,3,3,3,3,3
4,29113182,中英文评测,Global_MMLU,zh_cn,security_studies/test/151,3,3,3,3,3,3


In [6]:
Mlogiqa_df_cleaned.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1
0,29139154,中英文评测,mlogiqa,zh_cn,mlogiqa_71,3,3,1,1,1,3
1,29139155,中英文评测,mlogiqa,zh_cn,mlogiqa_2,1,1,1,1,1,1
2,29139156,中英文评测,mlogiqa,zh_cn,mlogiqa_34,3,3,1,1,3,1
3,29139157,中英文评测,mlogiqa,zh_cn,mlogiqa_29,3,3,3,3,3,3
4,29139158,中英文评测,mlogiqa,zh_cn,mlogiqa_41,1,1,1,1,1,1


In [None]:
def process_tag_column(df):
    special_char_pos = df['tag'].apply(lambda x: max(x.rfind('/'), x.rfind('_')))
    
    df['category'] = df['tag'].str.slice(0, special_char_pos)
    df['number'] = df['tag'].str.slice(special_char_pos + 1)
    
    # df.drop(columns=['tag'], inplace=True)
    return df

In [9]:
MMLU_df_final = process_tag_column(MMLU_df_cleaned)
MMLU_df_final.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29113178,中英文评测,Global_MMLU,en,sociology/test/183,1,1,1,3,1,1,,
1,29113179,中英文评测,Global_MMLU,en,nutrition/test/14,3,3,3,3,3,3,,
2,29113180,中英文评测,Global_MMLU,en,high_school_geography/test/61,3,3,3,3,3,3,,
3,29113181,中英文评测,Global_MMLU,en,philosophy/test/45,3,3,3,3,3,3,,
4,29113182,中英文评测,Global_MMLU,zh_cn,security_studies/test/151,3,3,3,3,3,3,,


In [10]:
Mlogiqa_df_final = process_tag_column(Mlogiqa_df_cleaned)
Mlogiqa_df_final.head()

Unnamed: 0,SampleID,class1,class2,class3,tag,T1_latest,Doubao_1.6_Thinking,Gemini_2.5_Pro,OpenAI_o3_azure,Qwen3_235b_a22b,Deepseek_R1,category,number
0,29139154,中英文评测,mlogiqa,zh_cn,mlogiqa_71,3,3,1,1,1,3,,
1,29139155,中英文评测,mlogiqa,zh_cn,mlogiqa_2,1,1,1,1,1,1,,
2,29139156,中英文评测,mlogiqa,zh_cn,mlogiqa_34,3,3,1,1,3,1,,
3,29139157,中英文评测,mlogiqa,zh_cn,mlogiqa_29,3,3,3,3,3,3,,
4,29139158,中英文评测,mlogiqa,zh_cn,mlogiqa_41,1,1,1,1,1,1,,
