In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

import random

In [2]:
# Hub id of the model whose tokenizer (and chat template) is used to build
# and tokenize the prompts in this notebook.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [3]:
# Load the tokenizer that belongs to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token so that padding="max_length" in tokenize()
# has a token to pad with.
# NOTE(review): presumably this checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Train dataset

## Build and tokenize the train/test prompts

In [4]:
# Train split of the 'comparisons' configuration of
# openai/summarize_from_feedback. Only each row's `info` column is used by
# the cells below.
summary_prefs_ds = load_dataset(
    'openai/summarize_from_feedback',
    name='comparisons',
    split='train',
)

In [5]:
def text_to_messages(info, target_words):
    """Build the two-turn chat (system instruction + user post) for one example.

    Args:
        info: Mapping that is indexed with ``'post'``; that value becomes the
            content of the user turn.
        target_words: Word count interpolated into the system instruction.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system turn followed
        by the user turn.
    """
    system_turn = {
        "role": "system",
        "content": f"Summarize the text using exactly {target_words} words:",
    }
    user_turn = {"role": "user", "content": info['post']}
    return [system_turn, user_turn]

def create_query(text, seed=12833):
    """Render the chat prompt that asks for a summary of one example.

    A target length of 13, 21 or 34 words is picked for the example and
    written into the system instruction; the resulting messages are rendered
    to a single string with the tokenizer's chat template, ending with the
    generation prompt so a model can continue with its answer.

    Args:
        text: The example's ``info`` mapping (indexed with ``'post'``).
        seed: Base seed for the target-length draw. Together with the post
            text it fully determines the draw.

    Returns:
        ``{"query": prompt}`` where ``prompt`` is the untokenized chat string.
    """
    # Use a private generator seeded from (seed, post) instead of the global
    # `random` state: the draw is then reproducible across kernel restarts
    # and independent of row order or the number of `map` workers. String
    # seeds are hashed deterministically by `random.Random`. Side effect:
    # rows that share a post also share a target length.
    rng = random.Random(f"{seed}:{text['post']}")
    target_words = rng.choice([13, 21, 34])
    prompt = tokenizer.apply_chat_template(text_to_messages(text, target_words),
                                           add_generation_prompt=True,
                                           tokenize=False)
    return {"query": prompt}

def tokenize(text, max_len=1024, add_special_tokens=False):
    """Tokenize one already chat-templated prompt, left-padded to ``max_len``.

    Args:
        text: Prompt string, as produced by ``create_query``.
        max_len: Length the sequence is padded to. No truncation is requested,
            so presumably longer prompts keep their full length — verify if a
            fixed width is required downstream.
        add_special_tokens: Whether the tokenizer should add its own special
            tokens. Defaults to ``False`` because the chat template has
            already written them into ``text``; leaving this on would
            prepend a second BOS token to every prompt.

    Returns:
        ``{"tokens": encoding}`` where ``encoding`` is the tokenizer output
        with PyTorch tensors.
    """
    # NOTE(review): with a single string and return_tensors='pt' the tensors
    # presumably carry a leading batch dimension of 1 — confirm consumers
    # expect that shape.
    return {"tokens": tokenizer(text, padding="max_length", max_length=max_len,
                                return_tensors='pt', padding_side='left',
                                add_special_tokens=add_special_tokens)}


In [6]:
N_TRAIN = 1024
N_TEST = 256
SPLIT_SEED = 12833  # fixes the shuffle so the same rows are selected every run

# Shuffle once and carve both splits out of the same permutation: the first
# N_TRAIN rows become train, the next N_TEST rows become test, so the two
# index ranges cannot overlap.
# NOTE(review): each row is a comparison; the same post may occur in several
# rows, so train and test could still share posts — confirm whether
# de-duplication by post is needed.
shuffled_prefs_ds = summary_prefs_ds.shuffle(seed=SPLIT_SEED)

train_ds = (
    shuffled_prefs_ds
    .select(range(N_TRAIN))
    .map(create_query, input_columns=['info'])
)
test_ds = (
    shuffled_prefs_ds
    .select(range(N_TRAIN, N_TRAIN + N_TEST))
    .map(create_query, input_columns=['info'])
)

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

In [7]:
# Add the `tokens` column to both splits by tokenizing each row's `query`
# (train first, then test).
train_ds, test_ds = [
    split.map(tokenize, input_columns=['query'])
    for split in (train_ds, test_ds)
]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

In [8]:
# Bundle the two splits under the names "train" and "test".
summary_train_ds = DatasetDict({
    "train": train_ds,
    "test": test_ds,
})
# Upload to the Hub repo 'summary_train'. As the last expression of the
# cell, the returned commit info is shown as the cell output.
summary_train_ds.push_to_hub('summary_train')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/summary_train/commit/5745ae2778a4fae5b18d0b97221e15c26cc1f2d3', commit_message='Upload dataset', commit_description='', oid='5745ae2778a4fae5b18d0b97221e15c26cc1f2d3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/summary_train', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/summary_train'), pr_revision=None, pr_num=None)