In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules (`lib.*`) are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<!-- Restyle ipywidget output so it follows the VS Code editor theme variables. -->
<style>
/* Make the background behind ipywidget cell output transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget color and font-size variables at the VS Code editor's values. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from itertools import islice
from lib import models
from lib.nyt_connections import get_connections_games, get_connections_tasks
from lib.pack import packed_tensors_from_tokenized_results, plot_packed_tensors
from lib.recipe import ComponentConfig, TuneRecipeConfig
from lib.tasks import ChatCompletionParams, get_task_results
from lib.tokenize import TaskResultTokenizer
from lib.tune import tune
from lib.vllm import start_vllm, kill_vllm_workers
import openai
import os
import torch
from transformers import AutoTokenizer

# Build the task pool: pull 2 * len(games) tasks from the task generator, then
# carve it into four splits that use non-overlapping index ranges.
games = get_connections_games()
num_games = len(games)
task_stream = get_connections_tasks(games, parse_answers_liberally=False)
tasks = list(islice(task_stream, 2 * num_games))

# Split boundaries, expressed as indices into `tasks`.
distill_end = 256  # an earlier variant of this notebook used the first 436 tasks
val_start, val_end = 436, 508
train_size = 436

distill_tasks = tasks[:distill_end]
val_tasks = tasks[val_start:val_end]
# The test split runs from the end of validation up to the first len(games) tasks.
test_tasks = tasks[val_end:num_games]
# Training tasks come from the second len(games)-sized stretch of the pool.
train_tasks = tasks[num_games : num_games + train_size]

# Show the size of each split as the cell output.
tuple(len(split) for split in (distill_tasks, val_tasks, test_tasks, train_tasks))

(256, 72, 92, 436)

In [5]:
vllm = await start_vllm(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    # env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
    max_concurrent_requests=512,
    named_arguments=dict(
        block_size=32,
        disable_log_requests=True,
        # enable_chunked_prefill=True,
        enable_prefix_caching=True,
        enforce_eager=True,
        gpu_memory_utilization=0.95,
        max_model_len=16384,
        max_num_seqs=512,
        max_num_batched_tokens=16384,
        num_scheduler_steps=8,
        preemption_mode="swap",
        return_tokens_as_token_ids=True,
        swap_space=80,
        tensor_parallel_size=torch.cuda.device_count(),
    ),
    timeout=120 + 15 * torch.cuda.device_count(),
)
vllm

$ vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-70B --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --num-scheduler-steps=8 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=8 --port=8000 --api-key=default
INFO 02-18 17:05:01 __init__.py:190] Automatically detected platform cuda.
INFO 02-18 17:05:02 api_server.py:840] vLLM API server version 0.7.2
INFO 02-18 17:05:02 api_server.py:841] args: Namespace(subparser='serve', model_tag='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_cer

Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/17 [00:00<00:06,  2.43it/s]
Loading safetensors checkpoint shards:  12% Completed | 2/17 [00:00<00:06,  2.33it/s]
Loading safetensors checkpoint shards:  18% Completed | 3/17 [00:01<00:06,  2.31it/s]
Loading safetensors checkpoint shards:  24% Completed | 4/17 [00:01<00:05,  2.30it/s]
Loading safetensors checkpoint shards:  29% Completed | 5/17 [00:02<00:05,  2.26it/s]
Loading safetensors checkpoint shards:  35% Completed | 6/17 [00:02<00:04,  2.22it/s]
Loading safetensors checkpoint shards:  41% Completed | 7/17 [00:03<00:04,  2.21it/s]
Loading safetensors checkpoint shards:  47% Completed | 8/17 [00:03<00:04,  2.21it/s]
Loading safetensors checkpoint shards:  53% Completed | 9/17 [00:04<00:03,  2.19it/s]
Loading safetensors checkpoint shards:  59% Completed | 10/17 [00:04<00:03,  2.19it/s]
Loading safetensors checkpoint shards:  65% Completed | 11/17

[1;36m(VllmWorkerProcess pid=37089)[0;0m INFO 02-18 17:06:40 model_runner.py:1115] Loading model weights took 16.4606 GB
INFO 02-18 17:06:40 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37075)[0;0m INFO 02-18 17:06:40 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37074)[0;0m INFO 02-18 17:06:41 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37073)[0;0m INFO 02-18 17:06:41 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37090)[0;0m INFO 02-18 17:06:41 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37091)[0;0m INFO 02-18 17:06:41 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37072)[0;0m INFO 02-18 17:06:41 model_runner.py:1115] Loading model weights took 16.4606 GB
[1;36m(VllmWorkerProcess pid=37073)[0;0m INFO 02-18 17:06

INFO:     Started server process [36659]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO 02-18 17:07:55 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
INFO:     127.0.0.1:32894 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


vLLM(client=<openai.AsyncOpenAI object at 0x7c06928fbb90>, max_concurrent_tokens=1282539, process=<Process 36659>)

In [6]:
import asyncio

tokenized_results = [
    result
    for results in await get_task_results(
        tasks=train_tasks[: 512 // 16],
        client=vllm.client,
        model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        cache=False,
        log_results=8,
        n=16,
        params=ChatCompletionParams(
            stream_options={
                "include_usage": True,
            },
        ),
        semaphore=asyncio.Semaphore(vllm.max_concurrent_tokens // 3800),
        transform=TaskResultTokenizer(
            AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-70B")
        ),
    )
    for result in results
]

  0%|          | 0/512 [00:00<?, ?it/s]

In [7]:
# Tear down the inference server: first ask the vLLM API server process started
# earlier in this notebook to terminate.
vllm.process.terminate()
# Then clean up the tensor-parallel worker processes (presumably any that
# outlive the parent — confirm against lib.vllm.kill_vllm_workers).
kill_vllm_workers()