In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import pandas as pd
# Never truncate long text cells when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [2]:
# Run configuration.
NUM_GPUS = 1      # handed to vLLM as tensor_parallel_size
NUM_SIZE = 1_000  # how many conversations are sampled from the dataset

In [3]:
# Take a fixed-seed random sample of NUM_SIZE rows from the train split.
user_llm_instr_ds = (
    load_dataset('lmsys/lmsys-chat-1m', split='train')
    .shuffle(seed=42)
    .select(range(NUM_SIZE))
)

In [4]:
# System prompt asking the model to produce the instruction/question that the
# supplied text would be an answer to.
instr_generation_sys_prompt = (
    "Output an instruction or question "
    "to which the user provided text is the answer."
)

In [5]:
def get_chosen_rejected(llm, tokenizer, conv_batch, sampling_params=None, system_prompt=None):
    """Build chosen/rejected instruction pairs for a batch of conversations.

    For each conversation, the content of its first message is kept as the
    "chosen" instruction and the content of its second message is shown to the
    model, which is asked to write an instruction for it; that generation
    becomes the "rejected" instruction.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)`` whose
            results provide ``.outputs[0].text`` (a vLLM ``LLM`` in this
            notebook).
        tokenizer: Object exposing ``apply_chat_template`` and, optionally,
            a ``bos_token`` attribute.
        conv_batch: List of conversations; each conversation is a list of
            message dicts with a ``'content'`` key. Assumes message 0 is the
            user turn and message 1 the assistant reply -- TODO confirm for
            the dataset in use. Conversations with fewer than two messages
            raise ``IndexError``.
        sampling_params: Optional sampling settings passed to
            ``llm.generate``. Defaults to
            ``SamplingParams(temperature=0.25, top_p=0.9, max_tokens=512)``.
        system_prompt: Optional system prompt. Defaults to the module-level
            ``instr_generation_sys_prompt``.

    Returns:
        Dict of equal-length lists with keys ``"chosen"``, ``"rejected"``,
        ``"user_input"`` and ``"system_prompt"``.
    """
    if system_prompt is None:
        system_prompt = instr_generation_sys_prompt
    if sampling_params is None:
        sampling_params = SamplingParams(temperature=0.25, top_p=0.9, max_tokens=512)

    # Two comprehensions instead of zip(*...) so an empty batch yields empty
    # lists rather than failing to unpack.
    user_instrs = [conv[0]['content'] for conv in conv_batch]
    assistant_responses = [conv[1]['content'] for conv in conv_batch]

    bos_token = getattr(tokenizer, "bos_token", None)
    prompts = []
    for text in assistant_responses:
        messages = [{"role": "system", "content": system_prompt},
                    {"role": "user", "content": text + "\n\n" + "Instruction:"}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The rendered template already starts with the BOS token, and the
        # engine adds special tokens again when it tokenizes a text prompt.
        # Drop the textual BOS so the model does not see it twice.
        if bos_token and prompt.startswith(bos_token):
            prompt = prompt[len(bos_token):]
        prompts.append(prompt)

    # Skip the engine call entirely when there is nothing to generate.
    outputs = llm.generate(prompts, sampling_params) if prompts else []

    return {
        "chosen": user_instrs,
        "rejected": [output.outputs[0].text.strip() for output in outputs],
        "user_input": assistant_responses,
        "system_prompt": [system_prompt] * len(user_instrs)
    }

In [6]:
# Hugging Face Hub identifier of the model used for tokenization and generation.
model_id = 'meta-llama/Llama-3.1-8B-Instruct'

In [7]:
# Tokenizer for model_id; it is used to render chat-template prompts.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

In [8]:
# Start the vLLM engine for model_id, with the context length capped at 4096
# tokens and the model split across NUM_GPUS devices.
llm = LLM(
    model=model_id,
    tensor_parallel_size=NUM_GPUS,
    max_model_len=4096,
)

INFO 12-23 10:18:02 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-23 10:18:02 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Lla

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-23 10:18:13 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 12-23 10:18:14 worker.py:232] Memory profiling results: total_gpu_memory=23.69GiB initial_memory_usage=15.88GiB peak_torch_memory=16.19GiB memory_usage_post_profile=15.89GiB non_torch_memory=0.90GiB kv_cache_size=4.23GiB gpu_memory_utilization=0.90
INFO 12-23 10:18:14 gpu_executor.py:113] # GPU blocks: 2167, # CPU blocks: 2048
INFO 12-23 10:18:14 gpu_executor.py:117] Maximum concurrency for 4096 tokens per request: 8.46x
INFO 12-23 10:18:16 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-23 10:18:16 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO

In [None]:
# Generate the preference columns in batches of 512 rows; only the
# 'conversation' column is handed to the mapping function.
instr_preference_ds = user_llm_instr_ds.map(
    function=lambda conv_batch: get_chosen_rejected(llm, tokenizer, conv_batch),
    batched=True,
    batch_size=512,
    input_columns=['conversation'],
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Processed prompts:   0%|                                   | 0/512 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A




Processed prompts:   0%|                        | 1/512 [00:07<1:02:54,  7.39s/it, est. speed input: 29.24 toks/s, output: 0.68 toks/s][A
Processed prompts:   1%|▏                         | 3/512 [00:07<16:41,  1.97s/it, est. speed input: 51.52 toks/s, output: 2.52 toks/s][A
Processed prompts:   1%|▎                         | 6/512 [00:07<06:35,  1.28it/s, est. speed input: 79.53 toks/s, output: 6.16 toks/s][A
Processed prompts:   2%|▌                      | 12/512 [00:07<02:31,  3.30it/s, est. speed input: 240.55 toks/s, output: 14.47 toks/s][A
Processed prompts:   3%|▋                      | 16/512 [00:08<01:52,  4.41it/s, est. speed input: 299.81 toks/s, output: 19.71 toks/s][A
Processed prompts:   4%|▉                      | 22/512 [00:08<01:11,  6.90it/s, est. speed input: 442.60 toks/s, output: 28.89 toks/s][A
Processed prompts:   5%|█                      | 25/512 [00:08<01:02,  7.80it/s, est. speed input: 459.60 toks/s, output: 33.42 toks/s][A
Processed prompts:   6%|█▎

In [None]:
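# Optional, disabled by default: hold out 5% of the pairs as a test split and
# upload both splits to the Hugging Face Hub. Uncomment to run.
#train_test_ds = instr_preference_ds.train_test_split(test_size=0.05, shuffle=True)
#train_test_ds.push_to_hub('lmsys_synthetic_instruction_preferences')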