In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's own code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Make the background behind ipywidget output transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Bind the widget text colour and font size to the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import asyncio
import httpx
import json
from lib.rl.episode import Episode, EpisodeCompletion
from lib.rl.ppo import PPOLoss
from lib.rl.recipe import ComponentConfig, TuneRecipeConfig
from lib.rl.trainer import ExploreOptions, Trainer, vLLMConfig
import random
import re
import torch
from torchtune.models.llama3_1 import llama3_1_8b
from typing import Any, AsyncIterable, Literal, Optional

# Few-shot examples used by sample_random_episode; each entry is read there via
# its "prompt", "chain_of_thought" and "answer" keys.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("./data/chain-of-thought-examples.json", encoding="utf-8") as f:
    chain_of_thought_examples: list[dict[str, str]] = json.load(f)


async def sample_random_episode(
    difficulty: Literal["easy", "variable"] = "variable",
    example_probability: float = 0.0,
    max_prompt_characters: int = 8192,
    reward_follow_up_completion: bool = True,
    return_first_solver_as_winner: Optional[bool] = None,
) -> Episode:
    """Fetch fresh episode data from the local episode server and wrap it in an ``Episode``.

    ``GET /new-episode-data`` is requested repeatedly until the returned prompt
    is at most ``max_prompt_characters`` characters long. There is no retry
    limit. The JSON response must provide ``"prompt"``, ``"follow_up"`` and
    ``"solution"``; the solution is used as a mapping of string keys to string
    values.

    Args:
        difficulty: Forwarded to the server as the ``difficulty`` query parameter.
        example_probability: Probability, drawn each time the episode's
            ``examples`` callable is invoked, that one chain-of-thought example
            exchange is returned instead of an empty list.
        max_prompt_characters: Longest prompt (in characters) that is accepted.
        reward_follow_up_completion: When true, the reward is stored on (and the
            follow-up completion is returned in place of) the original
            completion; it also selects the divisor (3 instead of 2) of the
            absent-stop-token penalty.
        return_first_solver_as_winner: Forwarded as a query parameter only when
            it is not ``None``.

    Returns:
        An ``Episode`` with the prompt as its single user message and an
        ``on_sample`` callback that rewards and commits sampled completions.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
            Other ``httpx`` transport errors propagate unchanged.
    """
    params: dict[str, Any] = {"difficulty": difficulty}
    if return_first_solver_as_winner is not None:
        params["return_first_solver_as_winner"] = return_first_solver_as_winner

    # A single client serves every retry, so an over-long prompt does not cost a
    # fresh connection per attempt.
    async with httpx.AsyncClient(timeout=httpx.Timeout(5.0, read=600.0)) as client:
        while True:
            response = await client.get(
                "http://0.0.0.0:2218/new-episode-data",
                params=params,
            )
            response.raise_for_status()
            result = response.json()
            prompt = result["prompt"]
            follow_up = result["follow_up"]
            solution = result["solution"]
            if len(prompt) <= max_prompt_characters:
                break

    async def reward_completion(completion: EpisodeCompletion) -> EpisodeCompletion:
        """Score one completion against ``solution`` and return the rewarded completion.

        A completion with exactly two messages is first extended with the
        ``follow_up`` question (presumably it holds only the prompt and one
        assistant reply -- confirm against ``EpisodeCompletion``); any other
        completion is scored on its own last assistant message.
        """
        if len(completion.messages) == 2:
            rendered_solution = "\n".join(
                f"{key}: {value}" for key, value in solution.items()
            )
            follow_up_completion = await completion.follow_up(
                messages=[
                    {"role": "user", "content": follow_up},
                ],
                # Token budget scales with the length of the expected answer.
                max_tokens=10 + len(rendered_solution) // 2,
            )
        else:
            follow_up_completion = completion
        answer = follow_up_completion.last_assistant_message.get("content")
        assert isinstance(answer, str)
        if reward_follow_up_completion:
            completion = follow_up_completion

        num_correct = 0
        for key, value in solution.items():
            # First "<key>: <text>" occurrence in the answer, case-insensitive.
            # The key is escaped so regex metacharacters in it are matched
            # literally instead of being interpreted as pattern syntax.
            match = re.search(
                rf"{re.escape(key)}: ([A-Za-z \.:-]+)",
                answer,
                re.IGNORECASE,
            )
            if match and match.group(1).strip().lower() == value.strip().lower():
                num_correct += 1
        # Base reward: fraction of solution entries answered correctly.
        completion.reward = num_correct / len(solution)
        # Penalty proportional to all_absent_stop_tokens (semantics defined by
        # EpisodeCompletion -- not visible here), scaled down by the number of
        # solution entries.
        completion.reward -= (
            completion.all_absent_stop_tokens
            / (3 if reward_follow_up_completion else 2)
            / len(solution)
        )
        return completion

    async def on_sample(completions: list[EpisodeCompletion]) -> None:
        """Reward all sampled completions concurrently, then commit each result."""
        for completion in await asyncio.gather(
            *[reward_completion(completion) for completion in completions]
        ):
            completion.commit()

    # The example is picked once per episode; whether it is actually used is
    # decided each time the `examples` callable runs.
    example = random.choice(chain_of_thought_examples)

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        examples=lambda: (
            [
                {"role": "user", "content": example["prompt"]},
                {
                    "role": "assistant",
                    # The answer is appended after a separator only when non-empty.
                    "content": example["chain_of_thought"]
                    + (example["answer"] and f"\n\n---\n\n{example['answer']}"),
                },
            ]
            if random.random() < example_probability
            else []
        ),
        on_sample=on_sample,
    )


# Training episodes per iteration: 64 for every visible CUDA device.
episodes_per_iteration = torch.cuda.device_count() * 64


async def train_episodes() -> AsyncIterable[Episode | BaseException]:
    """Yield training episodes forever, keeping a pool of sampling tasks in flight.

    Before every wait the pool is topped up to ``episodes_per_iteration``
    concurrent ``sample_random_episode()`` tasks. Each finished task is yielded
    as its ``Episode``, or as the exception it raised, so one failed request
    does not end the stream.

    Yields:
        An ``Episode`` on success, otherwise the ``BaseException`` raised by the
        sampling task.
    """
    pending: set[asyncio.Task[Episode | BaseException]] = set()
    try:
        while True:
            pending.update(
                asyncio.create_task(sample_random_episode())
                for _ in range(episodes_per_iteration - len(pending))
            )
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )
            for task in done:
                outcome: Episode | BaseException
                try:
                    outcome = task.result()
                except BaseException as e:
                    outcome = e
                # Yield outside the try block: closing the generator throws
                # GeneratorExit at the yield, and catching it there (then
                # yielding again) raises
                # "RuntimeError: async generator ignored GeneratorExit".
                yield outcome
    finally:
        # Do not leave orphaned HTTP requests running once the consumer is done.
        for task in pending:
            task.cancel()


async def val_episodes() -> AsyncIterable[Episode | BaseException]:
    """Yield a fixed batch of validation episodes in completion order.

    ``64 * torch.cuda.device_count()`` ``sample_random_episode()`` calls are
    started together; each is yielded as its ``Episode``, or as the exception it
    raised, and the generator then finishes.

    Yields:
        An ``Episode`` on success, otherwise the ``BaseException`` raised while
        awaiting that episode.
    """
    for fut in asyncio.as_completed(
        sample_random_episode() for _ in range(64 * torch.cuda.device_count())
    ):
        outcome: Episode | BaseException
        try:
            outcome = await fut
        except BaseException as e:
            outcome = e
        # Yield outside the try block: closing the generator throws
        # GeneratorExit at the yield, and catching it there (then yielding
        # again) raises "RuntimeError: async generator ignored GeneratorExit".
        yield outcome


# Run identifier: names the output directory and the wandb run (name and id).
model_name = "rl33"

trainer = Trainer(
    base_model="NousResearch/Hermes-2-Theta-Llama-3-8B",
    output_dir=f"./models/{model_name}",
    # --- Exploration / sampling ---
    explore_options=ExploreOptions(
        iterations=8,
        num_parents=5,
        branch_factor=3,
        patience=5,
        sample_probability_power=None,
        sampling_kwargs=dict(max_tokens=1024),
        split_method="prob",
        split_point_std_deviation=0.5,
    ),
    # --- Episode streams ---
    train_episodes=train_episodes(),
    episodes_per_iteration=episodes_per_iteration,
    max_mask_sequence_batch_size=1,
    # --- Validation ---
    val_episodes=val_episodes(),
    val_patience=15,
    val_samples_per_episode=3,
    val_sampling_kwargs=dict(max_tokens=1024),
    # --- Fine-tuning recipe ---
    tune_model=llama3_1_8b,
    tune_model_type="LLAMA3",
    tune_recipe_config=TuneRecipeConfig(
        seed=42,
        shuffle=True,
        num_output_chunks=4,
        resume_from_checkpoint=False,
        batch_size=1,
        epochs=1,
        max_steps_per_epoch=32,
        optimizer=ComponentConfig(
            # Commented-out alternatives: bitsandbytes.optim.PagedAdamW8bit,
            # bitsandbytes.optim.AdamW
            "torch.optim.AdamW",
            lr=4e-6,
            fused=True,
        ),
        # Only the tanh_log_policy, entropy_target, kl and weighted_entropy
        # terms carry a non-zero coefficient.
        loss=ComponentConfig(
            PPOLoss,
            policy_coef=0.0,
            clip_epsilon=0.2,
            unclipped_policy_coef=0.0,
            tanh_log_policy_coef=0.9,
            value_coef=0.0,
            entropy_coef=0.0,
            entropy_target=0.75,
            entropy_target_coef=0.15,
            kl_coef=0.25,
            weighted_entropy_coef=0.1,
            weighted_kl_coef=0.0,
            weighted_ce_coef=0.0,
            normalize_values=False,
            normalize_advantages=False,
        ),
        compile=False,
        optimizer_in_bwd=False,
        gradient_accumulation_steps=1,
        enable_activation_checkpointing=True,
        enable_activation_offloading=False,
        custom_sharded_layers=["tok_embeddings", "output"],
        log_every_n_steps=1,
        log_peak_memory_stats=True,
    ),
    # tune_run=False,
    # Same value as the vLLM max_model_len below.
    tune_sequence_length=16384,
    # --- vLLM inference servers ---
    vllm_config=vLLMConfig(
        env=dict(VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"),
        kwargs={
            "block_size": 32,
            "disable_log_requests": True,
            "enable_prefix_caching": True,
            "enforce_eager": True,
            "gpu_memory_utilization": 0.95,
            "max_model_len": 16384,
            "max_num_seqs": 4096,
            "max_num_batched_tokens": 16384,
            "preemption_mode": "swap",
            "return_tokens_as_token_ids": True,
            "swap_space": 32,
        },
        max_concurrent_samples=4096,
        # Grows with the number of GPUs (units presumably seconds -- confirm).
        timeout=120 + 15 * torch.cuda.device_count(),
    ),
    # --- Experiment tracking ---
    wandb_kwargs={
        "name": model_name,
        "id": model_name,
    },
)

INFO 12-14 21:09:59 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='NousResearch/Hermes-2-Theta-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Theta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-Theta-Llama-3-8B, num_scheduler_steps=1, chunked_prefill_enabled=Fal

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Run training with iterations=12 and progress-level logging (verbosity=1).
await trainer.train(iterations=12, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl33/config.yaml


1|29|Loss: 0.0393: 100%|██████████| 29/29 [09:26<00:00, 19.27s/it, entropy=0.7355, entropy_target=0.0145, kl_div=0.1603, policy=0.0036, tanh_log_policy=-0.0027, unclipped_policy=-0.0102, value=2.2541, weighted_ce=-0.0051, weighted_entropy=0.0047, weighted_kl_div=0.0033]   

Saved iteration 5 model files to /home/ubuntu/atreides/experiments/models/rl33/0005
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl33/0005 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-model-len=16384 --max-num-seqs=4096 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl33/config.yaml


1|8|Loss: 0.0603:  28%|██▊       | 8/29 [02:42<06:48, 19.43s/it, entropy=0.6109, entropy_target=0.1391, kl_div=0.1505, policy=0.0083, tanh_log_policy=0.0026, unclipped_policy=0.0022, value=1.6979, weighted_ce=-0.0127, weighted_entropy=0.0059, weighted_kl_div=0.0114]   

In [4]:
# Obtain the trainer's completion sampler; in the recorded run this started a
# vLLM server for the base model and returned a CompletionSamplerPool.
await trainer.get_completion_sampler()

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=2048 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-14 20:59:28 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-14 20:59:28 api_server.py:529] args: Namespace(subparser='serve', model_tag='NousResearch/Hermes-2-Theta-Llama-3-8B', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=True, disable_frontend_multiprocessing=False, enab

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  4.05it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.56it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.28it/s]



INFO 12-14 20:59:44 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-14 20:59:45 gpu_executor.py:122] # GPU blocks: 11578, # CPU blocks: 8192
INFO 12-14 20:59:45 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 22.61x
INFO 12-14 21:00:01 api_server.py:232] vLLM to use /tmp/tmpcu4ii5v7 as PROMETHEUS_MULTIPROC_DIR
INFO 12-14 21:00:01 launcher.py:19] Available routes are:
INFO 12-14 21:00:01 launcher.py:27] Route: /openapi.json, Methods: GET, HEAD
INFO 12-14 21:00:01 launcher.py:27] Route: /docs, Methods: GET, HEAD
INFO 12-14 21:00:01 launcher.py:27] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 12-14 21:00:01 launcher.py:27] Route: /redoc, Methods: GET, HEAD
INFO 12-14 21:00:01 launcher.py:27] Route: /health, Methods: GET
INFO 12-14 21:00:01 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-14 21:00:01 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-14 21:00:01 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-14 21:0

INFO:     Started server process [74481]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)


INFO:     127.0.0.1:33974 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


<lib.rl.completion_sampler.CompletionSamplerPool at 0x75109cb08f50>

In [5]:
# Patch asyncio to allow nested event loops, so asyncio.run(...) can be called
# from inside the notebook's already-running loop (needed by the %%prun cell).
# NOTE(review): this import lives outside the main import cell; consider moving it there.
import nest_asyncio
nest_asyncio.apply()

In [10]:
# Double the concurrency limit on the first sampler's semaphore.
# NOTE(review): not idempotent -- every re-run of this cell doubles the limit
# again -- and it reaches into the private `_completion_sampler` attribute.
# Prefer setting an absolute value through configuration once the intended limit is known.
trainer._completion_sampler.samplers[0].semaphore.max_concurrent_actions *= 2

In [6]:
%%prun
# Profile a single explore pass. asyncio.run() is used because %%prun profiles
# synchronous code; inside a notebook this needs a re-entrant event loop
# (e.g. nest_asyncio.apply()), otherwise asyncio.run() raises RuntimeError.
asyncio.run(trainer.explore(verbosity=0))

 

         276972809 function calls (268721619 primitive calls) in 483.300 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 15707/60   82.604    0.005    1.332    0.022 {method 'poll' of 'select.epoll' objects}
   3427/2   76.658    0.022    0.000    0.000 {method 'acquire' of '_thread.lock' objects}
 4582/506   68.241    0.015   82.788    0.164 {method 'recv' of '_socket.socket' objects}
       77   18.760    0.244   18.760    0.244 {method 'any' of 'torch._C.TensorBase' objects}
  780/778   16.731    0.021  108.403    0.139 threading.py:323(wait)
       76   13.972    0.184   25.251    0.332 pack.py:307(get_mask)
      509   12.864    0.025  100.606    0.198 sock_client.py:242(_read_packet_bytes)
       76   10.767    0.142   23.711    0.312 explore_result.py:181(_write_mask)
 12632079   10.289    0.000   16.383    0.000 utils.py:54(get_token)
42884758/42132634    8.722    0.000   11.009    0.000 {built-in method builtins.i

In [12]:
# Show the exceptions stored on the most recent explore result.
trainer.explore_results[-1].exceptions

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [4]:
# Run training with iterations=1 at the more detailed verbosity=2 logging level.
await trainer.train(iterations=1, verbosity=2)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=2048 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-14 20:11:02 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-14 20:11:02 api_server.py:529] args: Namespace(subparser='serve', model_tag='NousResearch/Hermes-2-Theta-Llama-3-8B', config='', host=None, port=8001, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=True, disable_frontend_multiprocessing=False, enab

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  4.05it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.56it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.28it/s]



INFO 12-14 20:11:19 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-14 20:11:20 gpu_executor.py:122] # GPU blocks: 11578, # CPU blocks: 8192
INFO 12-14 20:11:20 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 22.61x
INFO 12-14 20:11:37 api_server.py:232] vLLM to use /tmp/tmp3n2vbgld as PROMETHEUS_MULTIPROC_DIR
INFO 12-14 20:11:37 launcher.py:19] Available routes are:
INFO 12-14 20:11:37 launcher.py:27] Route: /openapi.json, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /docs, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /redoc, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /health, Methods: GET
INFO 12-14 20:11:37 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-14 20:11:37 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-14 20:11:37 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-14 20:1

INFO:     Started server process [33152]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8001) (Press CTRL+C to quit)


INFO:     127.0.0.1:47054 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val: 0episode [00:00, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl33/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725
  checkpoint_files:
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00004-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00001-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b

CancelledError: 

In [6]:
# Show the exceptions stored on the first explore result.
trainer.explore_results[0].exceptions

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [5]:
# Show the exceptions stored under the "val" key of the trainer's eval_exceptions.
trainer.eval_exceptions["val"]

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [27]:
# Run training with iterations=4.
# NOTE(review): the recorded output of this cell refers to ./models/rl29, while
# model_name in this notebook is "rl33" -- the output predates the current
# configuration and will not be reproduced by Restart & Run All.
await trainer.train(iterations=4, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|185|Loss: 0.0443: 100%|██████████| 185/185 [51:20<00:00, 16.53s/it, entropy=0.6468, entropy_target=0.1032, kl_div=0.0934, policy=-0.0291, tanh_log_policy=0.0026, unclipped_policy=-0.0323, value=1.9048, weighted_ce=0.0188, weighted_entropy=-0.0312, weighted_kl_div=-0.0066]  

Saved iteration 10 model files to /home/ubuntu/atreides/experiments/models/rl29/0010
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0010 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|101|Loss: 0.0376: 100%|██████████| 101/101 [28:08<00:00, 16.61s/it, entropy=0.6686, entropy_target=0.0814, kl_div=0.0844, policy=-0.0282, tanh_log_policy=0.0022, unclipped_policy=-0.0298, value=1.0489, weighted_ce=0.0084, weighted_entropy=-0.0236, weighted_kl_div=-0.0026]

Saved iteration 11 model files to /home/ubuntu/atreides/experiments/models/rl29/0011
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0011 --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|127|Loss: 0.0216: 100%|██████████| 128/128 [35:40<00:00, 16.64s/it, entropy=0.7464, entropy_target=0.0036, kl_div=0.0928, policy=-0.0009, tanh_log_policy=-0.0017, unclipped_policy=-0.0051, value=5.0248, weighted_ce=-0.0075, weighted_entropy=0.0059, weighted_kl_div=0.0063] 

Saved iteration 12 model files to /home/ubuntu/atreides/experiments/models/rl29/0012
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0012 --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|143|Loss: 0.0361: 100%|██████████| 143/143 [39:47<00:00, 16.62s/it, entropy=0.6294, entropy_target=0.1206, kl_div=0.0729, policy=0.0015, tanh_log_policy=-0.0001, unclipped_policy=0.0004, value=1.1154, weighted_ce=-0.0008, weighted_entropy=0.0014, weighted_kl_div=0.0005]   

Saved iteration 13 model files to /home/ubuntu/atreides/experiments/models/rl29/0013
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0013 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)


In [28]:
# Run training with iterations=12.
# NOTE(review): the recorded output of this cell refers to ./models/rl29, while
# model_name in this notebook is "rl33" -- the output predates the current
# configuration and will not be reproduced by Restart & Run All.
await trainer.train(iterations=12, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
Early stopping exploration due to expired patience (0 remaining episodes x 5 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|100|Loss: 0.0312:  45%|████▌     | 100/222 [27:53<33:44, 16.59s/it, entropy=0.8347, entropy_target=0.0847, kl_div=0.0736, policy=-0.0045, tanh_log_policy=-0.0004, unclipped_policy=-0.0056, value=2.1442, weighted_ce=0.0013, weighted_entropy=-0.0045, weighted_kl_div=-0.0014]