In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Force the ipywidget output background to be transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget text colour and font size at the VS Code editor
   variables (presumably so widgets match the editor theme). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [9]:
import asyncio
from lib.tasks import ChatCompletionParams, get_task_results
from lib.temporal_clue import get_temporal_clue_tasks
from lib.vllm import start_vllm, kill_vllm_workers
import random
import torch

# Fixed positional split of the task list: the first 64 tasks are used for
# validation, the next 64 for test, and everything after that for training.
tasks = [*get_temporal_clue_tasks()]
val_tasks, test_tasks, train_tasks = tasks[:64], tasks[64:128], tasks[128:]

# Only the training split is shuffled; the global RNG is seeded first so the
# resulting order is reproducible.
random.seed(42)
random.shuffle(train_tasks)

# Split sizes, shown as the cell output.
tuple(map(len, (val_tasks, test_tasks, train_tasks)))

(64, 64, 2860)

In [4]:
# Path of the model that is passed to start_vllm.
model = "./models/044/0098"
# Rough per-request token estimate; it is used to size the request semaphore.
expected_tokens = 1_000

In [50]:
vllm = await start_vllm(
    model,
    max_concurrent_requests=4096,
    env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
    named_arguments=dict(
        block_size=32,
        disable_log_requests=True,
        enable_prefix_caching=True,
        enforce_eager=True,
        gpu_memory_utilization=0.95,
        max_model_len=16384,
        max_num_seqs=4096,
        max_num_batched_tokens=16384,
        num_scheduler_steps=16,
        preemption_mode="swap",
        return_tokens_as_token_ids=True,
        swap_space=80,
        tensor_parallel_size=torch.cuda.device_count(),
    ),
    timeout=360 + 15 * torch.cuda.device_count(),
)

$ vllm serve /home/gcpuser/sky_workdir/experiments/models/044/0098 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-model-len=16384 --max-num-seqs=4096 --max-num-batched-tokens=16384 --num-scheduler-steps=16 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=2 --served-model-name=./models/044/0098 --port=8001 --api-key=default
INFO 03-05 19:12:52 __init__.py:190] Automatically detected platform cuda.
INFO 03-05 19:12:53 api_server.py:840] vLLM API server version 0.7.2
INFO 03-05 19:12:53 api_server.py:841] args: Namespace(subparser='serve', model_tag='/home/gcpuser/sky_workdir/experiments/models/044/0098', config='', host=None, port=8001, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_r

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:04,  1.54it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:01<00:03,  1.58it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:01<00:03,  1.59it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:02<00:01,  2.19it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:02<00:01,  2.16it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:03<00:01,  1.95it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:03<00:00,  1.81it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.75it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.81it/s]



INFO 03-05 19:13:15 model_runner.py:1115] Loading model weights took 13.9281 GB
[1;36m(VllmWorkerProcess pid=28161)[0;0m INFO 03-05 19:13:15 model_runner.py:1115] Loading model weights took 13.9281 GB
[1;36m(VllmWorkerProcess pid=28161)[0;0m INFO 03-05 19:13:18 worker.py:267] Memory profiling takes 2.74 seconds
[1;36m(VllmWorkerProcess pid=28161)[0;0m INFO 03-05 19:13:18 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.95) = 75.15GiB
[1;36m(VllmWorkerProcess pid=28161)[0;0m INFO 03-05 19:13:18 worker.py:267] model weights take 13.93GiB; non_torch_memory takes 1.57GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 58.21GiB.
INFO 03-05 19:13:18 worker.py:267] Memory profiling takes 2.88 seconds
INFO 03-05 19:13:18 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.95) = 75.15GiB
INFO 03-05 19:13:18 worker.py:267] model weigh

INFO:     Started server process [27656]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)


INFO 03-05 19:14:06 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
INFO:     127.0.0.1:40084 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


In [51]:
semaphore = asyncio.Semaphore(int(0.9 * vllm.max_concurrent_tokens / expected_tokens))
val_results = await get_task_results(
    tasks=val_tasks,
    client=vllm.client,
    model=vllm.model,
    cache=False,
    log_results=8,
    n=16,
    params=ChatCompletionParams(
        stream_options={
            "include_usage": True,
        }
    ),
    pbar_desc="val",
    semaphore=semaphore,
)

val:   0%|          | 0/1024 [00:00<?, ?it/s]

val: 100%|██████████| 1024/1024 [06:56<00:00, 27.35s/it, completion_tokens=1363, prompt_tokens=1220, reward=0.594, acc=0.594, token_logprobs=1.4e+6]


In [61]:
from lib.tasks import TaskResult
from openai.types.chat.chat_completion import ChatCompletion


def get_likelihood(completion: ChatCompletion):
    """Return the log-likelihood of a completion's first choice.

    The value is the sum of the per-token log-probabilities stored in
    ``completion.choices[0].logprobs.content`` (so it is a log-probability,
    not a probability). An empty token list yields 0.
    """
    first_choice = completion.choices[0]
    return sum(entry.logprob for entry in first_choice.logprobs.content)


def get_reward(result: TaskResult, completion: ChatCompletion):
    """Look up the reward recorded for the first choice of ``completion``.

    ``result.rewards`` is indexed by ``(completion id, choice index)``; only
    choice index 0 is read here.
    """
    reward_key = (completion.id, 0)
    return result.rewards[reward_key]


# Best-of-n selection by model likelihood: for each validation task keep the
# completion with the highest summed log-probability, then average the rewards
# of those picks. The denominator is the number of tasks actually scored
# rather than a hard-coded 64, so the mean stays correct if the split changes.
best_of_n_rewards = [
    get_reward(result, max(result.chat_completions, key=get_likelihood))
    for result in val_results
]
sum(best_of_n_rewards) / len(best_of_n_rewards)

0.5946428571428571

In [69]:
# Mean reward over every sampled completion. Dividing by the number of rewards
# actually collected (instead of a fixed 64 * 16) keeps the average correct
# when a task ends up with a different number of completions.
all_rewards = [
    get_reward(result, chat_completion)
    for result in val_results
    for chat_completion in result.chat_completions
]
sum(all_rewards) / len(all_rewards)

0.5936918712797619

In [65]:
from lib.temporal_clue import get_temporal_clue_puzzles, TemporalCluePuzzle

# Take as many puzzles as there are validation tasks so they can be paired
# with `val_results` by position.
# NOTE(review): this relies on get_temporal_clue_puzzles() and
# get_temporal_clue_tasks() yielding items in the same order — confirm.
puzzles = get_temporal_clue_puzzles()[: len(val_tasks)]
# Preview a single puzzle instead of dumping the whole list into the output.
puzzles[:1]

[{'num_clues': 72,
  'prompt': 'On a dark winter night, wealthy and enigmatic Mr. John Q. Boddy hosted a small, but lavish, dinner party for some of his closest associates. However, the night ended in tragedy when Mr. Boddy was found dead in one of the rooms of Tudor Mansion in the early hours of the morning. The following persons of interest have been identified as suspects:\n\n• Miss Peach\n• Monsieur Brunette\n• Mr. Green\n• Professor Plum\n• Mrs. White\n• Colonel Mustard\n• Miss Scarlet\n• Mrs. Peacock\n• Sgt. Gray\n• Madame Rose\n\nAnd the following weapons were found on the premises:\n\n• Candlestick\n• Wrench\n• Lead Pipe\n• Revolver\n• Poison\n• Knife\n• Rope\n• Horseshoe\n\nThe murder could only have occured in one of the following rooms:\n\n01. Studio\n02. Gazebo\n03. Lounge\n04. Drawing Room\n05. Library\n06. Trophy Room\n07. Cloak Room\n08. Courtyard\n09. Kitchen\n10. Fountain\n11. Dining Room\n12. Carriage House\n13. Ballroom\n\nThe rooms are laid out as follows:\n\n  NN N

In [74]:
from collections import Counter
import math
import re


def get_consensus(result: TaskResult, puzzle: TemporalCluePuzzle):
    """Score a puzzle by majority vote over all sampled answers.

    For every key in ``puzzle["solution"]``, each choice of each completion in
    ``result.chat_completions`` casts one vote for the text that follows the
    last ``"<key>. "`` occurrence in its message content (stripped and
    lower-cased). A choice in which the key never appears casts no vote.

    Args:
        result: Task result whose ``chat_completions`` hold the sampled
            answers.
        puzzle: Puzzle whose ``"solution"`` entry maps each question key to
            its reference answer.

    Returns:
        The fraction of solution keys whose most-voted answer equals the
        lower-cased reference answer. A key that received no votes counts as
        incorrect; ties go to the answer that was seen first.
    """
    solution = puzzle["solution"]
    # Build each key's pattern once instead of once per completion. The key is
    # escaped so that regex metacharacters in it are matched literally.
    patterns = {
        key: re.compile(rf"{re.escape(str(key))}\. ([A-Za-z \.:-]+)")
        for key in solution
    }
    consensus = {key: Counter() for key in solution}
    for completion in result.chat_completions:
        for choice in completion.choices:
            content = choice.message.content
            assert isinstance(content, str)
            for key, pattern in patterns.items():
                if matches := pattern.findall(content):
                    # The last occurrence is treated as the final answer.
                    consensus[key][matches[-1].strip().lower()] += 1
    num_correct = 0
    for key in solution:
        votes = consensus[key]
        # default=None covers keys nobody answered (empty counter).
        consensus_answer = max(votes, key=votes.get, default=None)
        if consensus_answer == solution[key].lower():
            num_correct += 1
    return num_correct / len(solution)


# Average the majority-vote score over the (result, puzzle) pairs, matched by
# position. The denominator is the number of pairs actually scored rather than
# a hard-coded 64.
consensus_scores = [
    get_consensus(result, puzzle) for result, puzzle in zip(val_results, puzzles)
]
sum(consensus_scores) / len(consensus_scores)

0.6143043154761905