In [48]:
# import statement
import pandas as pd
import json
import re
from datasets import load_dataset
import requests

In [2]:
# helper function to save
def write_jsonl(df, output_filename):
    """Write ``df`` to ``output_filename`` as JSON Lines, one object per row.

    Every object starts with an empty ``prompt_sn`` field, followed by the
    columns named in ``column_order`` (in that order). Each of those columns
    must be present in ``df``; a missing one raises ``KeyError``. Text is
    written as UTF-8 with non-ASCII characters kept verbatim.
    """
    column_order = (
        'class1', 'class2', 'class3', 'questions', 'ref_answers',
        'multi_media', 'history_answers', 'is_markdown', 'tags',
        'auto_eval_type', 'param', 'assign_tag', 'prompt_elements',
        'question_setter', 'auto_eval_config',
    )

    with open(output_filename, 'w', encoding='utf-8') as sink:
        for _, record in df.iterrows():
            # The serial number is always emitted first and left blank.
            payload = {"prompt_sn": ""}
            payload.update((name, record[name]) for name in column_order)
            json.dump(payload, sink, ensure_ascii=False)
            sink.write("\n")

    print(f"Data has been successfully saved to {output_filename}")

## Benchmark for MLogiQA
Resources: https://huggingface.co/datasets/Qwen/P-MMEval/viewer/mlogiqa/test?p=7

In [32]:
# Load the "mlogiqa" config of Qwen/P-MMEval through the `datasets` library.
# The trailing bare name makes the cell display the resulting object.
dataset = load_dataset("Qwen/P-MMEval", "mlogiqa")
dataset

README.md: 0.00B [00:00, ?B/s]

mlogiqa/test/ar.jsonl:   0%|          | 0.00/110k [00:00<?, ?B/s]

mlogiqa/test/en.jsonl:   0%|          | 0.00/76.8k [00:00<?, ?B/s]

mlogiqa/test/es.jsonl:   0%|          | 0.00/87.6k [00:00<?, ?B/s]

mlogiqa/test/fr.jsonl:   0%|          | 0.00/91.7k [00:00<?, ?B/s]

mlogiqa/test/ja.jsonl:   0%|          | 0.00/83.3k [00:00<?, ?B/s]

mlogiqa/test/ko.jsonl:   0%|          | 0.00/81.0k [00:00<?, ?B/s]

mlogiqa/test/pt.jsonl:   0%|          | 0.00/84.1k [00:00<?, ?B/s]

mlogiqa/test/th.jsonl:   0%|          | 0.00/172k [00:00<?, ?B/s]

mlogiqa/test/vi.jsonl:   0%|          | 0.00/95.8k [00:00<?, ?B/s]

mlogiqa/test/zh.jsonl:   0%|          | 0.00/59.7k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['options', 'id', 'answer', 'question', 'context'],
        num_rows: 800
    })
})

In [None]:
# Download the English MLogiQA test file from the Hugging Face Hub and store
# the raw bytes next to the notebook.
url = 'https://huggingface.co/datasets/Qwen/P-MMEval/resolve/main/mlogiqa/test/en.jsonl'
local_file_path = 'mlogiqa_test_en.jsonl'

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()  # raise on an HTTP error status instead of saving the error body

with open(local_file_path, 'wb') as f:
    f.write(response.content)


File saved successfully to mlogiqa_test_en.jsonl


In [49]:
# Download the Chinese MLogiQA test file from the Hugging Face Hub and store
# the raw bytes next to the notebook.
url = 'https://huggingface.co/datasets/Qwen/P-MMEval/resolve/main/mlogiqa/test/zh.jsonl'
local_file_path = 'mlogiqa_test_zh.jsonl'

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()  # raise on an HTTP error status instead of saving the error body

with open(local_file_path, 'wb') as f:
    f.write(response.content)

In [51]:
# Parse the two downloaded JSON Lines files (lines=True: one JSON object per
# line) into DataFrames. `file_path` is reused for both files.
file_path = 'mlogiqa_test_zh.jsonl'
mlogiqa_zh = pd.read_json(file_path, lines=True)
file_path = 'mlogiqa_test_en.jsonl'
mlogiqa_en = pd.read_json(file_path, lines=True)

In [72]:
def transform_mlogiqa_dataset(df, language):
    """Map a raw MLogiQA frame onto the evaluation schema without touching the input.

    Args:
        df: Frame with ``context``, ``question``, ``options`` (indexable; only
            the first four entries are used), ``answer`` and ``id`` columns.
            The caller's frame is left unmodified.
        language: Stored as-is in ``class3`` for every row.

    Returns:
        A new DataFrame holding only the schema columns ``class1``, ``class2``,
        ``class3``, ``questions``, ``ref_answers``, ``tags``, ``multi_media``,
        ``history_answers``, ``is_markdown``, ``auto_eval_type``, ``param``,
        ``assign_tag``, ``prompt_elements``, ``question_setter`` and
        ``auto_eval_config``. ``ref_answers`` is ``[letter]`` for answers
        0-3 and ``[None]`` for any other value.

    Raises:
        KeyError: if a required input column is missing.
        IndexError: if a row's options list has fewer than four entries.
    """
    def format_options(options):
        # JSON-like rendering: {"A": ["..."], "B": ["..."], ...}. Option text is
        # inserted verbatim, so embedded double quotes are not escaped.
        first_four = (options[0], options[1], options[2], options[3])
        return '{' + ', '.join(
            f'"{letter}": ["{text}"]' for letter, text in zip("ABCD", first_four)
        ) + '}'

    answer_map = {0: "A", 1: "B", 2: "C", 3: "D"}

    # Copy first so the schema columns are added to a private frame rather
    # than to the caller's DataFrame.
    out = df.copy()
    n_rows = len(out)

    out['class1'] = '中英文评测'
    out['class2'] = 'mlogiqa'
    out['class3'] = language

    # Plain list comprehensions (rather than a row-wise ``apply``) also behave
    # predictably when the frame has no rows.
    out['questions'] = [
        [f"{context} {question} \n{format_options(options)}"]
        for context, question, options in zip(df['context'], df['question'], df['options'])
    ]
    out['ref_answers'] = [[answer_map.get(answer, None)] for answer in df['answer']]
    out['tags'] = [[f"mlogiqa_{str(item_id)}"] for item_id in df['id']]

    # Constant / placeholder fields. Mutable values get a fresh object per row.
    out['multi_media'] = [[] for _ in range(n_rows)]
    out['history_answers'] = [None for _ in range(n_rows)]
    out['is_markdown'] = 1
    out['auto_eval_type'] = 0
    out['param'] = None
    out['assign_tag'] = ""
    out['prompt_elements'] = [[] for _ in range(n_rows)]
    out['question_setter'] = ""
    out['auto_eval_config'] = [{"evaluator": {"name": "", "extra": None}} for _ in range(n_rows)]

    return out[['class1', 'class2', 'class3', 'questions', 'ref_answers', 'tags', 'multi_media',
                'history_answers', 'is_markdown', 'auto_eval_type', 'param', 'assign_tag', 'prompt_elements',
                'question_setter', 'auto_eval_config']]

In [73]:
# Convert each language's frame to the shared schema; `language` becomes the
# class3 value ("en" / "zh_cn").
transformed_en_mlogiqa = transform_mlogiqa_dataset(mlogiqa_en, language = "en")
transformed_zh_mlogiqa = transform_mlogiqa_dataset(mlogiqa_zh, language = "zh_cn")

In [74]:
# Stack English rows first, then Chinese, with a fresh 0..n-1 index, and
# export the result as JSON Lines.
combined_mlogiqa = pd.concat([transformed_en_mlogiqa, transformed_zh_mlogiqa], ignore_index=True)
write_jsonl(combined_mlogiqa, output_filename = 'transformed_mlogiqa_data.jsonl')

Data has been successfully saved to transformed_mlogiqa_data.jsonl


## Benchmark for PolyMath
Resources: https://huggingface.co/datasets/Qwen/PolyMath

In [26]:
def transform_polymath_dataset(df, language):
    """Map a raw PolyMath frame onto the evaluation schema without touching the input.

    Args:
        df: Frame with ``question``, ``answer`` and string ``id`` columns.
            The caller's frame is left unmodified.
        language: Stored as-is in ``class3`` for every row.

    Returns:
        A new DataFrame holding only the schema columns. ``questions`` and
        ``ref_answers`` wrap the source value in a one-element list. ``tags``
        holds the id with every ``-xx-`` segment (two ASCII letters between
        hyphens) replaced by ``_``.

    Raises:
        KeyError: if a required input column is missing.
    """
    # Copy first so the schema columns are added to a private frame rather
    # than to the caller's DataFrame.
    out = df.copy()
    n_rows = len(out)

    out['class1'] = '中英文评测'
    out['class2'] = 'PolyMath'
    out['class3'] = language
    out['questions'] = [[question] for question in df['question']]
    out['ref_answers'] = [[answer] for answer in df['answer']]
    out['tags'] = [[re.sub(r'-[a-zA-Z]{2}-', '_', item_id)] for item_id in df['id']]

    # Constant / placeholder fields. Mutable values get a fresh object per row.
    out['multi_media'] = [[] for _ in range(n_rows)]
    out['history_answers'] = [None for _ in range(n_rows)]
    out['is_markdown'] = 1
    out['auto_eval_type'] = 0
    out['param'] = None
    out['assign_tag'] = ""
    out['prompt_elements'] = [[] for _ in range(n_rows)]
    out['question_setter'] = ""
    out['auto_eval_config'] = [{"evaluator": {"name": "", "extra": None}} for _ in range(n_rows)]

    # Return only the schema columns, in the order the exporter expects.
    return out[['class1', 'class2', 'class3', 'questions', 'ref_answers', 'tags', 'multi_media',
                'history_answers', 'is_markdown', 'auto_eval_type', 'param', 'assign_tag', 'prompt_elements',
                'question_setter', 'auto_eval_config']]

In [21]:
# Read the four PolyMath difficulty files (top/high/medium/low) for English
# straight from the Hub via the hf:// filesystem path.
splits = {'top': 'en/top.parquet', 'high': 'en/high.parquet', 'medium': 'en/medium.parquet', 'low': 'en/low.parquet'}
en_top = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["top"])
en_high = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["high"])
en_medium = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["medium"])
en_low = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["low"])

# Same four files for Chinese; `splits` is rebound to the zh paths.
splits = {'top': 'zh/top.parquet', 'high': 'zh/high.parquet', 'medium': 'zh/medium.parquet', 'low': 'zh/low.parquet'}
zh_top = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["top"])
zh_high = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["high"])
zh_medium = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["medium"])
zh_low = pd.read_parquet("hf://datasets/Qwen/PolyMath/" + splits["low"])

In [27]:
# Convert every English difficulty split to the shared schema (class3 = "en").
transformed_en_top = transform_polymath_dataset(en_top, language = "en")
transformed_en_high = transform_polymath_dataset(en_high, language = "en")
transformed_en_medium = transform_polymath_dataset(en_medium, language = "en")
transformed_en_low = transform_polymath_dataset(en_low, language = "en")

# Same for the Chinese splits (class3 = "zh_cn").
transformed_zh_top = transform_polymath_dataset(zh_top, language = "zh_cn")
transformed_zh_high = transform_polymath_dataset(zh_high, language = "zh_cn")
transformed_zh_medium = transform_polymath_dataset(zh_medium, language = "zh_cn")
transformed_zh_low = transform_polymath_dataset(zh_low, language = "zh_cn")

In [28]:
# Stack all eight transformed frames (English splits first, then Chinese)
# with a fresh 0..n-1 index, and export the result as JSON Lines.
combined_polymath = pd.concat([transformed_en_top, transformed_en_high, transformed_en_medium, transformed_en_low,
                         transformed_zh_top, transformed_zh_high, transformed_zh_medium, transformed_zh_low],
                        ignore_index=True)
# combined_polymath.head()
write_jsonl(combined_polymath, output_filename = 'transformed_polyMath_data.jsonl')

Data has been successfully saved to transformed_polyMath_data.jsonl


## Benchmark for MMLU
Resources: https://huggingface.co/datasets/CohereLabs/Global-MMLU/viewer/en?views%5B%5D=en_test

In [None]:
def transform_mmlu_dataset(df, language):
    """Map a raw Global-MMLU frame onto the evaluation schema without touching the input.

    Args:
        df: Frame with ``question``, ``option_a`` .. ``option_d``, ``answer``
            and ``sample_id`` columns. The caller's frame is left unmodified.
        language: Stored as-is in ``class3`` for every row.

    Returns:
        A new DataFrame holding only the schema columns. Each ``questions``
        entry is a one-element list: the question text, a newline, then the
        four options rendered as ``{ "A": [ "..."], "B": [ "..."], ... }``
        (option text inserted verbatim, without escaping). ``ref_answers`` and
        ``tags`` wrap ``answer`` / ``sample_id`` in one-element lists.

    Raises:
        KeyError: if a required input column is missing.
    """
    def render_prompt(question, a, b, c, d):
        # Doubled braces are literal braces in an f-string.
        return f'{question}\n{{ "A": [ "{a}"], "B": [ "{b}"], "C": [ "{c}"], "D": [ "{d}"] }}'

    # Copy first so the schema columns are added to a private frame rather
    # than to the caller's DataFrame.
    out = df.copy()
    n_rows = len(out)

    out['class1'] = '中英文评测'
    out['class2'] = 'Global_MMLU'
    out['class3'] = language
    out['questions'] = [
        [render_prompt(question, a, b, c, d)]
        for question, a, b, c, d in zip(
            df['question'], df['option_a'], df['option_b'], df['option_c'], df['option_d']
        )
    ]
    out['ref_answers'] = [[answer] for answer in df['answer']]
    out['tags'] = [[sample_id] for sample_id in df['sample_id']]

    # Constant / placeholder fields. Mutable values get a fresh object per row.
    out['multi_media'] = [[] for _ in range(n_rows)]
    out['history_answers'] = [None for _ in range(n_rows)]
    out['is_markdown'] = 1
    out['auto_eval_type'] = 0
    out['param'] = None
    out['assign_tag'] = ""
    out['prompt_elements'] = [[] for _ in range(n_rows)]
    out['question_setter'] = ""
    out['auto_eval_config'] = [{"evaluator": {"name": "", "extra": None}} for _ in range(n_rows)]

    # Return only the schema columns, in the order the exporter expects.
    return out[['class1', 'class2', 'class3', 'questions', 'ref_answers', 'tags', 'multi_media',
                'history_answers', 'is_markdown', 'auto_eval_type', 'param', 'assign_tag', 'prompt_elements',
                'question_setter', 'auto_eval_config']]

### Test on Global MMLU
~14.3k examples for each language

In [None]:
# Parquet files of the English Global-MMLU config. Only the test split is read
# here; the dev entry is kept in the same language config as the test entry so
# the mapping stays self-consistent if it is ever used.
splits = {'test': 'en/test-00000-of-00001.parquet', 'dev': 'en/dev-00000-of-00001.parquet'}
MMLU_en = pd.read_parquet("hf://datasets/CohereLabs/Global-MMLU/" + splits["test"])

In [None]:
# NOTE(review): this bare expression is not the last line of the cell, so its
# value is not displayed.
MMLU_en.columns
# Distinct values of the `subject` column (after explode(), which only matters
# if the cells hold list-like values); displayed as the cell output.
unique_tags = MMLU_en['subject'].explode().unique()
unique_tags

In [None]:
# Convert the English Global-MMLU test set to the shared schema (class3 = "en").
transformed_en_df = transform_mmlu_dataset(MMLU_en, language = "en")

In [None]:
# Parquet files of the Chinese Global-MMLU config. Only the test split is read
# here; the dev entry is kept in the same language config as the test entry so
# the mapping stays self-consistent if it is ever used.
splits = {'test': 'zh/test-00000-of-00001.parquet', 'dev': 'zh/dev-00000-of-00001.parquet'}
MMLU_cn = pd.read_parquet("hf://datasets/CohereLabs/Global-MMLU/" + splits["test"])
# Convert to the shared schema (class3 = "zh_cn").
transformed_cn_df = transform_mmlu_dataset(MMLU_cn, language = "zh_cn")

In [None]:
# Stack Chinese rows first, then English, with a fresh 0..n-1 index.
combined_df = pd.concat([transformed_cn_df, transformed_en_df], ignore_index=True)

In [None]:
# Export the combined Global-MMLU frame as JSON Lines.
write_jsonl(combined_df, output_filename = 'transformed_mmlu_data.jsonl')

### Test on Lite MMLU
~400 examples for each language

In [None]:
# English Global-MMLU-Lite: only the test split is read; the dev path is unused here.
splits = {'test': 'en/test-00000-of-00001.parquet', 'dev': 'en/dev-00000-of-00001.parquet'}
en_lite = pd.read_parquet("hf://datasets/CohereLabs/Global-MMLU-Lite/" + splits["test"])

# Chinese Global-MMLU-Lite; `splits` is rebound to the zh paths.
splits = {'test': 'zh/test-00000-of-00001.parquet', 'dev': 'zh/dev-00000-of-00001.parquet'}
zh_lite = pd.read_parquet("hf://datasets/CohereLabs/Global-MMLU-Lite/" + splits["test"])

In [None]:
# Convert both Lite frames with the same transformer used for full Global-MMLU,
# stack English first then Chinese, and export as JSON Lines.
transformed_en_lite = transform_mmlu_dataset(en_lite, language = "en")
transformed_zh_lite = transform_mmlu_dataset(zh_lite, language = "zh_cn")
combined_lite = pd.concat([transformed_en_lite, transformed_zh_lite], ignore_index=True)
write_jsonl(combined_lite, output_filename = 'transformed_mmlu_lite_data.jsonl')

In [None]:
# Deliberate barrier: raising here halts a "Run All" at this point, so the
# cells below only execute when run by hand.
# NOTE(review): presumably the MCLM section below is still work in progress — confirm.
raise KeyboardInterrupt("Stopping execution here")

## Benchmark for MCLM
Resources: https://huggingface.co/datasets/amphora/MCLM

In [None]:
# load datasets
df_imo = pd.read_parquet("hf://datasets/amphora/MCLM/m-imo.parquet")
df_math100 = pd.read_parquet("hf://datasets/amphora/MCLM/mt-math100.parquet")
df_aime2024 = pd.read_parquet("hf://datasets/amphora/MCLM/mt-aime2024.parquet")

In [None]:
# Inspect the available columns of the IMO frame.
df_imo.columns

In [None]:
def transform_dataset(df, input='imo'):
    """Expand every row of ``df`` into a Chinese record followed by an English one.

    ``df`` must provide ``zh-cn``, ``en`` and ``answer`` columns. ``input`` is
    only used inside the generated tag (``mmlu_<input>_<cn|en>_<row index>``);
    ``class2`` is the fixed string ``Global_MMLU_IMO`` regardless of ``input``.
    ``questions`` / ``ref_answers`` carry the raw cell values and
    ``is_markdown`` is the string ``'1'``.

    Returns a new DataFrame with columns class1, class2, class3, questions,
    ref_answers, tags, is_markdown.
    """
    # (source column, which doubles as the class3 value; language code in the tag)
    language_variants = (
        ('zh-cn', 'cn'),
        ('en', 'en'),
    )

    records = [
        {
            'class1': '中英文评测',
            'class2': 'Global_MMLU_IMO',
            'class3': lang_col,
            'questions': row[lang_col],
            'ref_answers': row['answer'],
            'tags': f'mmlu_{input}_{tag_code}_{idx}',
            'is_markdown': '1',
        }
        for idx, row in df.iterrows()
        for lang_col, tag_code in language_variants
    ]

    return pd.DataFrame(
        records,
        columns=['class1', 'class2', 'class3', 'questions', 'ref_answers', 'tags', 'is_markdown'],
    )

In [None]:
# Transform the IMO frame (default input='imo') and preview the first rows.
transformed_df_imo = transform_dataset(df_imo)
transformed_df_imo.head()

In [None]:
def save_to_json(df, filename):
    """Write ``df`` to ``filename`` as a single pretty-printed JSON array.

    Each row becomes one object keyed by column name. The file is UTF-8,
    indented by four spaces, with non-ASCII characters kept verbatim.
    """
    rows = df.to_dict(orient='records')

    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(rows, out_file, indent=4, ensure_ascii=False)

    print(f"Data has been successfully saved to {filename}")

In [None]:
# Export the transformed IMO frame as one pretty-printed JSON array.
save_to_json(transformed_df_imo, "MMLU_dataset.json")

In [None]:
def save_to_jsonl(df, filename):
    """Write ``df`` to ``filename`` as JSON Lines: one JSON object per row.

    Objects are keyed by column name. The file is UTF-8 with non-ASCII
    characters kept verbatim, and every line ends with a newline.
    """
    serialized_rows = (
        json.dumps(row, ensure_ascii=False) + "\n"
        for row in df.to_dict(orient='records')
    )

    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(serialized_rows)

    print(f"Data has been successfully saved to {filename}")


In [None]:
# Export the transformed IMO frame as JSON Lines.
save_to_jsonl(transformed_df_imo, "transformed_data.jsonl")

In [None]:
# Transform the math100 frame; `input` only changes the generated tags.
transformed_df_math100 = transform_dataset(df_math100, input='math100')

In [None]:
# Preview the first rows of the transformed math100 frame.
transformed_df_math100.head()

In [None]:
# Transform the aime2024 frame (`input` only changes the generated tags) and
# preview the first rows.
transformed_df_aime2024 = transform_dataset(df_aime2024, input='aime2024')
transformed_df_aime2024.head()