In [21]:
import os
from pathlib import Path
from typing import Literal
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import upload_folder

In [22]:
def N(n):
    """Format a count as a compact string, e.g. ``5000`` -> ``'5K'``.

    The value is scaled to thousands (``K``), millions (``M``) or billions
    (``B``), formatted with one decimal place, and a trailing ``.0`` is
    stripped. Values below 1,000 are returned as ``str(n)``.

    A value that rounds up to 1000 of its unit is promoted to the next larger
    unit, so ``999_999`` becomes ``'1M'`` rather than ``'1000K'``.

    Args:
        n: Number to abbreviate.

    Returns:
        The abbreviated string.
    """
    # Largest unit first so the first match is the tightest fit.
    units = ((1_000_000_000, 'B'), (1_000_000, 'M'), (1_000, 'K'))
    for idx, (scale, suffix) in enumerate(units):
        if n >= scale:
            text = f"{n / scale:.1f}"
            # Rounding to one decimal can push e.g. 999.96K up to "1000.0";
            # re-express it in the next larger unit when there is one.
            if text == '1000.0' and idx > 0:
                scale, suffix = units[idx - 1]
                text = f"{n / scale:.1f}"
            return text.rstrip('0').rstrip('.') + suffix
    return str(n)

In [23]:
# --- Run configuration ---
# Corpus/language to preprocess and how many raw examples go to each split.
task = 'wikipedia'
lang = 'ja'
train_size, val_size, test_size = 5000, 1000, 1000
max_seq_length = 64  # handed to the processing functions as the sequence/block length
hf_data_id = 'alxxtexxr/Nero-XLT-Dataset'  # Hub dataset repo that receives the upload

# The directory name encodes the whole configuration: task, language,
# abbreviated split sizes and the sequence length, joined by underscores.
save_dir_name = '_'.join(
    [task, lang]
    + [N(count) for count in (train_size, val_size, test_size)]
    + [str(max_seq_length)]
)
save_dir = Path('data/preprocessed', save_dir_name)
print("[CONFIG] Save directory name:", save_dir_name)
print("[CONFIG] Save directory:", save_dir)

# Tokenizer of the base model; later cells pass it to the processing functions.
base_model_name = 'unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit'
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

[CONFIG] Save directory name: wikipedia_ja_5K_1K_1K_64
[CONFIG] Save directory: data/preprocessed/wikipedia_ja_5K_1K_1K_64


In [24]:
def load_datasets(
    task: Literal['wikipedia', 'gsm8k'],
    lang: str,
    train_size: int = 5000,
    val_size: int = 1000,
    test_size: int = 1000,
):
    """Stream the first examples of a Hub dataset and cut them into three splits.

    Only the source ``train`` split is read, in streaming mode, so the full
    corpus is never downloaded. The first ``train_size`` examples become
    ``train``, the next ``val_size`` become ``val`` and the following
    ``test_size`` become ``test``; the splits are contiguous and disjoint.

    Args:
        task: ``'wikipedia'`` reads ``wikimedia/wikipedia`` from the
            ``20231101.<lang>`` data directory; ``'gsm8k'`` reads
            ``openai/gsm8k`` from the ``main`` data directory.
        lang: Language code used to pick the Wikipedia data directory.
            Ignored for ``'gsm8k'``.
        train_size: Number of examples in the ``train`` split.
        val_size: Number of examples in the ``val`` split.
        test_size: Number of examples in the ``test`` split.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'val'`` and ``'test'``.

    Raises:
        KeyError: If ``task`` is not one of the supported tasks.
        ValueError: If a size is negative, or the stream holds fewer examples
            than ``train_size + val_size + test_size``.
    """
    from itertools import islice  # local import keeps this definition self-contained

    if min(train_size, val_size, test_size) < 0:
        raise ValueError(
            "Split sizes must be non-negative, got "
            f"train_size={train_size}, val_size={val_size}, test_size={test_size}"
        )

    # Set up Hugging Face data configuration
    data_id_map = {
        'wikipedia': 'wikimedia/wikipedia',
        'gsm8k': 'openai/gsm8k',
    }
    data_id = data_id_map[task]
    data_dir = f'20231101.{lang}' if task == 'wikipedia' else 'main'

    # Use streaming to avoid full download
    dataset_stream = load_dataset(
        data_id,
        data_dir=data_dir,
        split='train',
        streaming=True,
    )

    # Take exactly train_size + val_size + test_size examples; islice stops
    # without pulling an extra example from the stream.
    total_size = train_size + val_size + test_size
    sliced_data = list(islice(dataset_stream, total_size))
    if len(sliced_data) < total_size:
        # Fail here with a clear message instead of an index error from select().
        raise ValueError(
            f"Requested {total_size} examples from {data_id!r} ({data_dir!r}) "
            f"but the stream only yielded {len(sliced_data)}"
        )

    # Convert to regular in-memory dataset
    full_dataset = Dataset.from_list(sliced_data)

    # Split into train, validation, and test datasets (contiguous index ranges)
    val_end = train_size + val_size
    return DatasetDict({
        'train': full_dataset.select(range(train_size)),
        'val': full_dataset.select(range(train_size, val_end)),
        'test': full_dataset.select(range(val_end, total_size)),
    })

def _args(x=None):
    return dict(
        batched=True,
        remove_columns=x,
        num_proc=os.cpu_count(),
    )

def process_gsm8k_dataset(dataset, tokenizer, max_seq_length):
    """Render question/answer pairs into prompts and tokenize them to a fixed length.

    Each example is formatted into an instruction prompt containing its
    ``question`` and ``answer`` fields, followed by the tokenizer's EOS token.
    The prompts are then tokenized with truncation and padding to exactly
    ``max_seq_length`` tokens.

    Args:
        dataset: Dataset with ``question`` and ``answer`` columns; it must
            provide ``.map`` and ``.column_names``.
        tokenizer: Callable tokenizer exposing ``eos_token``.
        max_seq_length: Length every tokenized sequence is truncated/padded to.

    Returns:
        The tokenized dataset; the original and intermediate ``text`` columns
        are removed, leaving only what the tokenizer returns.
    """
    eos_token = tokenizer.eos_token

    prompt = """### Instruction:
Solve the following math problem step by step.

### Question: 
{q}

### Answer: 
{a}"""

    def format_batch(examples):
        # The map calls below run with batched=True (see _args), so every column
        # arrives as a list and one prompt has to be produced per row.
        # The EOS token is appended after formatting so it is never parsed as
        # part of the format template.
        return {
            'text': [
                prompt.format(q=question, a=answer) + eos_token
                for question, answer in zip(examples['question'], examples['answer'])
            ]
        }

    def tokenize_batch(examples):
        # Fixed-length output: longer prompts are cut, shorter ones padded.
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=max_seq_length,
        )

    formatted = dataset.map(format_batch, **_args(x=dataset.column_names))
    tokenized = formatted.map(tokenize_batch, **_args(x=formatted.column_names))
    return tokenized

def process_wikipedia_dataset(dataset, tokenizer, block_size):
    """Tokenize the ``text`` column and pack the tokens into fixed-size blocks.

    Both steps run as batched ``map`` calls. Within each batch the token ids
    of all rows are concatenated into one stream, which is then cut into
    consecutive chunks of ``block_size`` tokens; the leftover tokens at the end
    of a batch that do not fill a whole block are dropped.

    Args:
        dataset: Dataset with a ``text`` column; it must provide ``.map`` and
            ``.column_names``.
        tokenizer: Callable tokenizer applied to the batch of texts.
        block_size: Number of tokens in every output row.

    Returns:
        A dataset with ``input_ids`` rows of length ``block_size`` and an
        all-ones ``attention_mask`` of the same length.
    """
    def to_token_ids(batch):
        return tokenizer(batch['text'])

    def pack_into_blocks(batch):
        # One flat token stream for the whole batch.
        stream = [token for row_ids in batch['input_ids'] for token in row_ids]

        # Largest multiple of block_size that fits; the remainder is discarded.
        usable_length = (len(stream) // block_size) * block_size
        blocks = [
            stream[start:start + block_size]
            for start in range(0, usable_length, block_size)
        ]

        # Blocks contain no padding, so every position is attended to.
        return {
            'input_ids': blocks,
            'attention_mask': [[1] * block_size for _ in blocks],
        }

    tokenize_kwargs = _args(x=dataset.column_names)
    tokenized = dataset.map(to_token_ids, **tokenize_kwargs)

    pack_kwargs = _args(x=tokenized.column_names)
    return tokenized.map(pack_into_blocks, **pack_kwargs)

In [25]:
# Sizes are passed by keyword because load_datasets takes them in the order
# (train_size, val_size, test_size); a positional test_size as the fourth
# argument would be bound to val_size and the test split would use the default.
datasets = load_datasets(
    task,
    lang,
    train_size=train_size,
    val_size=val_size,
    test_size=test_size,
)
print(datasets)

README.md: 0.00B [00:00, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 5000
    })
    val: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 1000
    })
})


In [26]:
# Pick the task-specific processing function once, then apply it to each split
# with the same tokenizer and sequence length.
_process_split = process_gsm8k_dataset if task == 'gsm8k' else process_wikipedia_dataset

train_dataset = _process_split(datasets['train'], tokenizer, max_seq_length)
val_dataset = _process_split(datasets['val'], tokenizer, max_seq_length)
test_dataset = _process_split(datasets['test'], tokenizer, max_seq_length)

Map (num_proc=2):   0%|          | 0/5000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (141723 > 131072). Running this sequence through the model will result in indexing errors


Map (num_proc=2):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [27]:
# Write each processed split to its own parquet file inside save_dir,
# then print a summary of every split.
for _split_dataset, _file_name in (
    (train_dataset, 'train.parquet'),
    (val_dataset, 'val.parquet'),
    (test_dataset, 'test.parquet'),
):
    _split_dataset.to_parquet(save_dir / _file_name)

print(train_dataset, val_dataset, test_dataset, sep='\n')

Creating parquet from Arrow format:   0%|          | 0/444 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/66 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 443134
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 71172
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 65808
})


In [28]:
# Push the local parquet folder to the Hub dataset repo, under a sub-folder
# named after the configuration. Left as a bare expression so the returned
# commit info is displayed as the cell output.
upload_folder(
    folder_path=save_dir,
    path_in_repo=save_dir_name,
    repo_id=hf_data_id,
    repo_type='dataset',
    commit_message=f"Upload preprocessed {save_dir_name} dataset folder",
)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

train.parquet:   0%|          | 0.00/48.1M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/7.25M [00:00<?, ?B/s]

val.parquet:   0%|          | 0.00/7.78M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/alxxtexxr/Nero-XLT-Dataset/commit/e3cf25d87a652f29f7d52f9924de906ef5282089', commit_message='Upload preprocessed wikipedia_ja_5K_1K_1K_64 dataset folder', commit_description='', oid='e3cf25d87a652f29f7d52f9924de906ef5282089', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/alxxtexxr/Nero-XLT-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='alxxtexxr/Nero-XLT-Dataset'), pr_revision=None, pr_num=None)

In [29]:
# # Sanity check: uncomment to reload the uploaded folder from the Hub
# from datasets import load_dataset
# load_dataset(hf_data_id, data_dir=save_dir_name)