In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

In [2]:
# Open the train split in streaming mode so rows are read lazily rather than
# downloading the whole dataset up front.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading code on this machine; confirm it is still required for this dataset.
dataset = load_dataset(
    "iamtarun/python_code_instructions_18k_alpaca",
    split="train",
    streaming=True,
    trust_remote_code=True,
)

README.md:   0%|          | 0.00/905 [00:00<?, ?B/s]

In [3]:
# Intended number of examples in the sampled subset.
NUM_SAMPLES = 50

In [4]:
# Take the first 1000 streamed rows, shuffle them with a fixed seed so the
# selection is repeatable, and keep NUM_SAMPLES of them.
ds_sample_stream = dataset.take(1000).shuffle(seed=1998).take(NUM_SAMPLES)

# Materialize the streamed rows into a regular in-memory Dataset so that
# batched `.map(...)` and `.push_to_hub(...)` can be used on it later.
ds_sample = Dataset.from_list(list(ds_sample_stream))
ds_sample

Dataset({
    features: ['instruction', 'input', 'output', 'prompt'],
    num_rows: 50
})

In [5]:
# System message placed in front of every user instruction; it asks the model
# for short, self-explanatory, comment-free code and nothing but the code.
SYSTEM_PROMPT = \
"""
Implement the code for the provided spec.

- Your goal is to be efficient and keep the code short.
- The code should be understandable just by the naming and the flow.
- There should be no comments.
- Only output the code, nothing else.
""".strip()

In [6]:
def gen_code(llm, tokenizer, batch, sampling_params=None):
    """Generate one code completion per instruction in ``batch``.

    Args:
        llm: Engine exposing ``generate(prompts, sampling_params)``. Each
            returned item is read through ``.outputs[0].text``.
        tokenizer: Tokenizer exposing ``apply_chat_template``; it is called
            with ``tokenize=False`` and ``add_generation_prompt=True`` so it
            yields a prompt string ready for generation.
        batch: Iterable of user instruction strings. It is materialized once,
            so one-shot iterables are handled correctly.
        sampling_params: Optional sampling configuration forwarded to
            ``llm.generate``. When omitted, the defaults are
            ``SamplingParams(temperature=0.75, top_p=0.9, max_tokens=2048)``.

    Returns:
        dict with two lists: ``"code_prompt"`` (the instructions) and
        ``"generated_code"`` (the whitespace-stripped text of the first
        completion for each prompt, in the order ``llm.generate`` returns them).
    """
    if sampling_params is None:
        sampling_params = SamplingParams(temperature=0.75, top_p=0.9, max_tokens=2048)

    # Materialize once: the instructions are used both to build the prompts and
    # to fill the "code_prompt" column, which would be empty for a one-shot iterable.
    instructions = list(batch)

    conversations = [
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": instruction},
        ]
        for instruction in instructions
    ]
    prompts = [
        tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        for conversation in conversations
    ]

    outputs = llm.generate(prompts, sampling_params)

    return {
        "code_prompt": instructions,
        "generated_code": [output.outputs[0].text.strip() for output in outputs],
    }

In [7]:
# Hugging Face model identifier; the tokenizer and the vLLM engine are both built from it.
model_id = "meta-llama/Llama-3.1-8B-Instruct"

In [8]:
# Load the tokenizer for `model_id`; gen_code uses its chat template to render prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [9]:
# Start the vLLM engine for `model_id`, with the maximum sequence length set to 4096 tokens.
llm = LLM(model=model_id, max_model_len=4096)

INFO 12-08 13:47:30 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-08 13:47:30 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Lla

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-08 13:47:41 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 12-08 13:47:43 worker.py:232] Memory profiling results: total_gpu_memory=23.69GiB initial_memory_usage=16.05GiB peak_torch_memory=16.19GiB memory_usage_post_profile=16.07GiB non_torch_memory=1.07GiB kv_cache_size=4.06GiB gpu_memory_utilization=0.90
INFO 12-08 13:47:43 gpu_executor.py:113] # GPU blocks: 2077, # CPU blocks: 2048
INFO 12-08 13:47:43 gpu_executor.py:117] Maximum concurrency for 4096 tokens per request: 8.11x
INFO 12-08 13:47:45 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-08 13:47:45 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO

In [11]:
def _generate_for_instructions(instructions):
    """Run gen_code on one batch of instruction strings with the notebook's llm and tokenizer."""
    return gen_code(llm, tokenizer, instructions)


# Only the 'instruction' column is handed to the generator, in batches of 32;
# every original column is dropped so the result holds just the columns
# returned by gen_code.
ds_generated = ds_sample.map(
    _generate_for_instructions,
    batched=True,
    batch_size=32,
    input_columns=['instruction'],
    remove_columns=ds_sample.column_names,
)



Map:   0%|          | 0/50 [00:00<?, ? examples/s]


Processed prompts:   0%| | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, [A
Processed prompts:   3%| | 1/32 [00:01<00:45,  1.47s/it, est. speed input: 71.25[A
Processed prompts:   9%| | 3/32 [00:01<00:14,  2.06it/s, est. speed input: 178.7[A
Processed prompts:  12%|▏| 4/32 [00:01<00:09,  2.82it/s, est. speed input: 230.2[A
Processed prompts:  19%|▏| 6/32 [00:02<00:06,  4.24it/s, est. speed input: 307.1[A
Processed prompts:  25%|▎| 8/32 [00:02<00:04,  5.73it/s, est. speed input: 380.5[A
Processed prompts:  31%|▎| 10/32 [00:02<00:03,  6.50it/s, est. speed input: 430.[A
Processed prompts:  38%|▍| 12/32 [00:03<00:04,  4.75it/s, est. speed input: 413.[A
Processed prompts:  41%|▍| 13/32 [00:03<00:03,  5.12it/s, est. speed input: 427.[A
Processed prompts:  47%|▍| 15/32 [00:03<00:03,  4.28it/s, est. speed input: 417.[A
Processed prompts:  50%|▌| 16/32 [00:04<00:03,  4.02it/s, est. speed input: 415.[A
Processed prompts:  53%|▌| 17/32 [00:04<00:03,  3.84it/s, est. speed input:

In [12]:
# Upload the generated sample to the Hugging Face Hub as a dataset repository
# (requires being authenticated with the Hub; the repo is created under that account).
ds_generated.push_to_hub('python_alpaca_generated_sample')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/python_alpaca_generated_sample/commit/588c3541f48174bd5b8c7b22c218f3e978dadbc1', commit_message='Upload dataset', commit_description='', oid='588c3541f48174bd5b8c7b22c218f3e978dadbc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/python_alpaca_generated_sample', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/python_alpaca_generated_sample'), pr_revision=None, pr_num=None)