In [1]:
# Load IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `lib.*` helpers used below) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Remove the opaque background behind ipywidget cell output. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter-widgets colour and font-size custom properties at the
   --vscode-editor-* ones (presumably so widgets such as progress bars match
   the VS Code editor theme; confirm in the target front end). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import asyncio
from lib.tasks import ChatCompletionParams, get_task_results
from lib.temporal_clue import get_temporal_clue_tasks
from lib.vllm import start_vllm, kill_vllm_workers
import random
import torch

# Materialise the full task stream once so it can be sliced by position.
tasks = list(get_temporal_clue_tasks(reward_power=3))

# Positional split: the first 64 tasks are validation, the next 64 are test,
# and everything after index 128 is training data.
val_tasks, test_tasks, train_tasks = tasks[:64], tasks[64:128], tasks[128:]

# Only the training split is shuffled; seeding the global RNG first keeps the
# resulting order reproducible across runs.
random.seed(42)
random.shuffle(train_tasks)

# Display the size of each split as (val, test, train).
tuple(map(len, (val_tasks, test_tasks, train_tasks)))

(64, 64, 2860)

In [4]:
# Checkpoint path handed to `start_vllm` in the next cell.
model = "./models/044/0098"

# Per-request token estimate; used further down to size the request semaphore
# relative to the server's `max_concurrent_tokens`.
expected_tokens = 1_000

In [6]:
vllm = await start_vllm(
    model,
    max_concurrent_requests=4096,
    env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
    named_arguments=dict(
        block_size=32,
        disable_log_requests=True,
        enable_prefix_caching=True,
        enforce_eager=True,
        gpu_memory_utilization=0.95,
        max_model_len=16384,
        max_num_seqs=4096,
        max_num_batched_tokens=16384,
        num_scheduler_steps=16,
        preemption_mode="swap",
        return_tokens_as_token_ids=True,
        swap_space=80,
        tensor_parallel_size=torch.cuda.device_count(),
    ),
    timeout=360 + 15 * torch.cuda.device_count(),
)

$ vllm serve /home/gcpuser/sky_workdir/experiments/models/044/0098 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-model-len=16384 --max-num-seqs=4096 --max-num-batched-tokens=16384 --num-scheduler-steps=16 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=2 --served-model-name=./models/044/0098 --port=8000 --api-key=default
INFO 03-05 17:51:57 __init__.py:190] Automatically detected platform cuda.
INFO 03-05 17:51:58 api_server.py:840] vLLM API server version 0.7.2
INFO 03-05 17:51:58 api_server.py:841] args: Namespace(subparser='serve', model_tag='/home/gcpuser/sky_workdir/experiments/models/044/0098', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_r

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:04,  1.47it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:01<00:04,  1.49it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:02<00:03,  1.49it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:02<00:01,  2.03it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:02<00:01,  2.00it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:03<00:01,  1.80it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:04<00:00,  1.68it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.61it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.68it/s]



INFO 03-05 17:52:33 model_runner.py:1115] Loading model weights took 13.9281 GB
[1;36m(VllmWorkerProcess pid=19064)[0;0m INFO 03-05 17:52:33 model_runner.py:1115] Loading model weights took 13.9281 GB
[1;36m(VllmWorkerProcess pid=19064)[0;0m INFO 03-05 17:52:38 worker.py:267] Memory profiling takes 3.98 seconds
[1;36m(VllmWorkerProcess pid=19064)[0;0m INFO 03-05 17:52:38 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.95) = 75.15GiB
[1;36m(VllmWorkerProcess pid=19064)[0;0m INFO 03-05 17:52:38 worker.py:267] model weights take 13.93GiB; non_torch_memory takes 1.57GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 58.21GiB.
INFO 03-05 17:52:38 worker.py:267] Memory profiling takes 4.14 seconds
INFO 03-05 17:52:38 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.95) = 75.15GiB
INFO 03-05 17:52:38 worker.py:267] model weigh

INFO:     Started server process [18557]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO 03-05 17:53:27 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
INFO:     127.0.0.1:51156 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


In [9]:
# Set the client's `organization` attribute to "test1".
# NOTE(review): the reason is not visible in this notebook. Presumably it tags
# or separates this run's requests (the next cell calls get_task_results with
# cache=True); confirm against lib.tasks before changing the value.
vllm.client.organization = "test1"

In [10]:
semaphore = asyncio.Semaphore(int(1.3 * vllm.max_concurrent_tokens / expected_tokens))
val_results = await get_task_results(
    tasks=val_tasks,
    client=vllm.client,
    model=vllm.model,
    cache=True,
    log_results=8,
    n=1,
    params=ChatCompletionParams(
        stream_options={
            "include_usage": True,
        }
    ),
    pbar_desc="val",
    semaphore=semaphore,
)

val:   0%|          | 0/64 [00:00<?, ?it/s]

val: 100%|██████████| 64/64 [01:10<00:00,  6.48s/it, completion_tokens=1291, prompt_tokens=1220, reward=0.304, acc=0.594, token_logprobs=82651]
