In [9]:
import os
import shutil

import datasets
from datasets import load_dataset

In [10]:
# Folder that every export cell below writes its parquet files into; the
# relative path is resolved against the notebook's working directory.
save_path = "../data/"
# Create the folder up front so the export cells do not fail on a fresh
# checkout where it is missing; a no-op when the folder already exists.
os.makedirs(save_path, exist_ok=True)

In [11]:
# Report how much disk space the export folder (`save_path`) currently uses.
# `os` is imported in the notebook's top import cell.
total_size = 0
for dirpath, _dirnames, filenames in os.walk(save_path):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        # os.walk also lists dangling symlinks, for which os.path.getsize
        # raises OSError; skip them instead of aborting the whole report.
        if not os.path.exists(file_path):
            continue
        total_size += os.path.getsize(file_path)
# Two divisions by 1024 convert bytes -> KiB -> MiB.
size_in_mb = total_size / 1024 / 1024
print("Size of save folder: {} MB".format(size_in_mb))

Size of save folder: 50.19893836975098 MB


# ARC

In [23]:
# Load the 'ARC-Easy' and 'ARC-Challenge' configurations of the `ai2_arc`
# dataset; the export cell below reads both names.
dataset_easy = load_dataset(path='ai2_arc', name='ARC-Easy')
dataset_challenge = load_dataset(path='ai2_arc', name='ARC-Challenge')

In [24]:
# Export each ARC split to parquet: one file per configuration and split,
# named 'ARC-Easy-<split>.parquet' and 'ARC-Challenge-<split>.parquet'.
splits = ['train', 'validation', 'test']

for split in splits:
    easy = dataset_easy[split].to_pandas()
    challenge = dataset_challenge[split].to_pandas()
    # os.path.join yields the same path as plain concatenation when
    # save_path ends with a slash, and still lands inside the folder when
    # it does not (concatenation would silently write next to it instead).
    easy.to_parquet(
        os.path.join(save_path, 'ARC-Easy-' + split + '.parquet'),
        index=False,
    )
    challenge.to_parquet(
        os.path.join(save_path, 'ARC-Challenge-' + split + '.parquet'),
        index=False,
    )


# HellaSwag

In [25]:
# Load HellaSwag from the "AlekseyKorshuk/hellaswag" repository.
# NOTE: the name `dataset` is rebound by the TruthfulQA section further down,
# so the HellaSwag export cell must run before that section's load cell.
dataset = load_dataset(path="AlekseyKorshuk/hellaswag")

In [26]:
# Export each HellaSwag split to 'hellaswag-<split>.parquet'.
# NOTE: reads the global `dataset`, which the TruthfulQA section rebinds;
# run the HellaSwag load cell directly above before this one.
splits = ['train', 'validation', 'test']

for split in splits:
    ds = dataset[split].to_pandas()
    # os.path.join keeps the file inside save_path whether or not the
    # configured folder string ends with a path separator.
    ds.to_parquet(
        os.path.join(save_path, 'hellaswag-' + split + '.parquet'),
        index=False,
    )

# TruthfulQA

In [27]:
# Load the multiple-choice TruthfulQA data from "EleutherAI/truthful_qa_mc".
# NOTE: this rebinds `dataset`, which previously held HellaSwag; from here on
# the HellaSwag export cell needs its own load cell re-run first.
dataset = load_dataset(path="EleutherAI/truthful_qa_mc")

In [28]:
# Export TruthfulQA to 'truthful_qa_mc-<split>.parquet'; only the
# 'validation' split is written.
# NOTE: reads the global `dataset`, a name shared with the HellaSwag section;
# run the TruthfulQA load cell directly above before this one.
splits = ['validation']

for split in splits:
    ds = dataset[split].to_pandas()
    # os.path.join keeps the file inside save_path whether or not the
    # configured folder string ends with a path separator.
    ds.to_parquet(
        os.path.join(save_path, 'truthful_qa_mc-' + split + '.parquet'),
        index=False,
    )

# MMLU

In [29]:
# MMLU subject configurations to load, one per line so that additions and
# removals show up clearly in diffs. The order is preserved from the
# original single-line list.
options = [
    'high_school_european_history',
    'business_ethics',
    'clinical_knowledge',
    'medical_genetics',
    'high_school_us_history',
    'high_school_physics',
    'high_school_world_history',
    'virology',
    'high_school_microeconomics',
    'econometrics',
    'college_computer_science',
    'high_school_biology',
    'abstract_algebra',
    'professional_accounting',
    'philosophy',
    'professional_medicine',
    'nutrition',
    'global_facts',
    'machine_learning',
    'security_studies',
    'public_relations',
    'professional_psychology',
    'prehistory',
    'anatomy',
    'human_sexuality',
    'college_medicine',
    'high_school_government_and_politics',
    'college_chemistry',
    'logical_fallacies',
    'high_school_geography',
    'elementary_mathematics',
    'human_aging',
    'college_mathematics',
    'high_school_psychology',
    'formal_logic',
    'high_school_statistics',
    'international_law',
    'high_school_mathematics',
    'high_school_computer_science',
    'conceptual_physics',
    'miscellaneous',
    'high_school_chemistry',
    'marketing',
    'professional_law',
    'management',
    'college_physics',
    'jurisprudence',
    'world_religions',
    'sociology',
    'us_foreign_policy',
    'high_school_macroeconomics',
    'computer_security',
    'moral_scenarios',
    'moral_disputes',
    'electrical_engineering',
    'astronomy',
    'college_biology',
]

# Map each subject name to its loaded dataset, in list order.
mmlu_datasets = {
    subject: load_dataset(path="lukaemon/mmlu", name=subject)
    for subject in options
}

In [30]:
# Export every loaded MMLU subject to 'mmlu-<subject>-<split>.parquet'.
splits = ['train', 'validation', 'test']

for split in splits:
    # Iterate the loaded dict rather than the global `options` list: that
    # name is rebound by the Winogrande and GSM8K sections, so after cells
    # are run out of order it may no longer hold the MMLU subjects. The dict
    # preserves insertion order, so files are written in the same order.
    for option, subject_dataset in mmlu_datasets.items():
        ds = subject_dataset[split].to_pandas()
        # os.path.join keeps the file inside save_path whether or not the
        # configured folder string ends with a path separator.
        ds.to_parquet(
            os.path.join(save_path, 'mmlu-' + option + '-' + split + '.parquet'),
            index=False,
        )

# Winogrande

In [5]:
# Winogrande configurations to load.
# NOTE: this rebinds `options`, which the MMLU section also uses.
options = [
    'winogrande_xs',
    'winogrande_s',
    'winogrande_m',
    'winogrande_l',
    'winogrande_xl',
    'winogrande_debiased',
]

# Map each configuration name to its loaded dataset, in list order.
winogrande_datasets = {
    config: load_dataset(path="winogrande", name=config)
    for config in options
}

Downloading data:   0%|          | 0.00/3.40M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/640 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2558 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/10234 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/40398 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/9248 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1267 [00:00<?, ? examples/s]

In [6]:
# Export every loaded Winogrande configuration to
# 'winogrande-<config>-<split>.parquet'. The config names already start with
# 'winogrande_', so the prefix is repeated in the file name; this is kept
# as-is so existing file names do not change.
splits = ['train', 'validation', 'test']

for split in splits:
    # Iterate the loaded dict rather than the global `options` list, which
    # the MMLU and GSM8K sections rebind to their own configuration names.
    # The dict preserves insertion order, so the write order is unchanged.
    for option, config_dataset in winogrande_datasets.items():
        ds = config_dataset[split].to_pandas()
        # os.path.join keeps the file inside save_path whether or not the
        # configured folder string ends with a path separator.
        ds.to_parquet(
            os.path.join(save_path, 'winogrande-' + option + '-' + split + '.parquet'),
            index=False,
        )

# GSM8K

In [7]:
# GSM8K configurations to load.
# NOTE: this rebinds `options`, which the MMLU and Winogrande sections also use.
options = ['main', 'socratic']

# Map each configuration name to its loaded dataset, in list order.
gsm8k_datasets = {
    config: load_dataset(path="gsm8k", name=config)
    for config in options
}

Downloading builder script:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.70k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/242k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/279k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [8]:
# Export every loaded GSM8K configuration to 'gsm8k-<config>-<split>.parquet'.
# Only 'train' and 'test' are exported here.
splits = ['train', 'test']
for split in splits:
    # Iterate the loaded dict rather than the global `options` list, which
    # the MMLU and Winogrande sections rebind to their own configuration
    # names. The dict preserves insertion order, so the write order is
    # unchanged.
    for option, config_dataset in gsm8k_datasets.items():
        ds = config_dataset[split].to_pandas()
        # os.path.join keeps the file inside save_path whether or not the
        # configured folder string ends with a path separator.
        ds.to_parquet(
            os.path.join(save_path, 'gsm8k-' + option + '-' + split + '.parquet'),
            index=False,
        )