In [1]:
# Re-import edited modules automatically before each cell executes, so changes
# to the project's `lib` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Make the ipywidget output container transparent instead of a solid panel. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Take widget text color and font size from the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from lib.langfuse import langfuse
# Switch the shared langfuse object's `enabled` flag off for this notebook;
# presumably this disables Langfuse tracing (confirm against lib.langfuse).
langfuse.enabled = False
# langfuse.auth_check()

In [4]:
import json
from lib.rl.episode import Episode, EpisodeCompletion
import random
import re
from typing import TypedDict


class TemporalCluePuzzle(TypedDict):
    """Shape of one puzzle record loaded from the Temporal Clue JSON files."""

    # Presumably the number of clues contained in the puzzle; not read in this notebook.
    num_clues: int
    # Puzzle text that is sent to the model as the user message.
    prompt: str
    # Maps each answer key to its expected answer; rewards compare against these values.
    solution: dict[str, str]


# Load the puzzle set through a context manager so the file handle is closed
# deterministically (the bare `open()` call leaked it). JSON is UTF-8 by
# specification, so the encoding is stated explicitly rather than inherited
# from the platform default.
with open("./data/temporal-clue-puzzles.json", encoding="utf-8") as puzzles_file:
    temporal_clue_puzzles: list[TemporalCluePuzzle] = json.load(puzzles_file)
# Fixed seed: the shuffled order decides which puzzles land in the first 64
# positions, which are later used as the evaluation split.
random.seed(42)
random.shuffle(temporal_clue_puzzles)

In [5]:
from lib.clue import Clue

# Load the few-shot examples through a context manager so the file handle is
# closed deterministically (the bare `open()` call leaked it).
with open("./data/chain-of-thought-examples.json", encoding="utf-8") as examples_file:
    chain_of_thought_examples: list[dict[str, str]] = json.load(examples_file)
# Exclude the examples at original indices 6 and 3. The higher index is removed
# first so the second removal is not shifted by the first. The reason for
# excluding these two entries is not recorded in the notebook.
del chain_of_thought_examples[6]
del chain_of_thought_examples[3]

def get_episode(puzzle: TemporalCluePuzzle) -> Episode:
    """Build a Temporal Clue episode for one puzzle.

    The episode sends the puzzle prompt (with a slightly stronger instruction
    before the answer format) as the user message, preceded by one randomly
    drawn chain-of-thought example as a few-shot user/assistant pair.

    Each sampled completion is rewarded with the fraction of solution keys
    whose final stated answer matches the expected value, case-insensitively.

    Args:
        puzzle: Puzzle record providing ``prompt`` and ``solution``.

    Returns:
        The configured ``Episode``.
    """
    # One compiled pattern per solution key, paired with the lower-cased
    # expected answer. The key is escaped so that regex metacharacters in a
    # key are matched literally instead of being interpreted as a pattern.
    answer_patterns = [
        (re.compile(rf"{re.escape(key)}\. ([A-Za-z \.:-]+)"), value.lower())
        for key, value in puzzle["solution"].items()
    ]

    def on_sample(completions: list[EpisodeCompletion]) -> None:
        """Score every completion and commit its reward."""
        for completion in completions:
            content = completion.last_assistant_message.get("content")
            assert isinstance(content, str)
            num_correct = 0
            for answer_pattern, expected in answer_patterns:
                matches = answer_pattern.findall(content)
                # Only the last occurrence counts, so answers revised later in
                # the response override earlier attempts.
                if matches and matches[-1].strip().lower() == expected:
                    num_correct += 1
            completion.commit(reward=num_correct / len(answer_patterns))

    # `random.choices(..., k=1)` is kept (instead of `random.choice`) so the
    # draw consumes the global RNG exactly as before.
    example = random.choices(chain_of_thought_examples, k=1)[0]
    example_response = example["chain_of_thought"]
    # Append the final answer only when there is one; the previous
    # `str + (answer and ...)` form raised TypeError for a `None` answer.
    if example["answer"]:
        example_response += f"\n\n---\n\n{example['answer']}"

    return Episode(
        messages=[
            {
                "role": "user",
                "content": puzzle["prompt"].replace(
                    "Fill out your final answers in the following format:",
                    "After verifiably finding all the correct answers, fill out your final answers in the following format:",
                ),
            },
        ],
        on_sample=on_sample,
        examples=[
            {"role": "user", "content": example["prompt"]},
            {"role": "assistant", "content": example_response},
        ],
        # logprobs_mask=Clue.get_logprobs_mask(),
    )


# One episode per shuffled puzzle. Later cells evaluate on the first 64 entries
# and train on (and overwrite) everything from index 64 onward.
temporal_clue_episodes = [get_episode(puzzle) for puzzle in temporal_clue_puzzles]

In [6]:
# Replace everything after the first 64 episodes with episodes built from a
# second puzzle file. The file is opened with a context manager so the handle
# is closed (the bare `open()` call leaked it).
# NOTE: `get_episode` must still be the Temporal Clue builder when this cell
# runs; later cells rebind the same name to other builders.
with open("./data/temporal-clue-puzzles-2.json", encoding="utf-8") as puzzles_file:
    temporal_clue_episodes[64:] = [
        get_episode(puzzle) for puzzle in json.load(puzzles_file)
    ]

In [7]:
import polars as pl

# Read the ZebraLogicBench grid-mode test split from the Hugging Face Hub and
# convert the frame into a list of row dicts.
zebra_grid_questions = pl.read_parquet(
    "hf://datasets/allenai/ZebraLogicBench-private/grid_mode/test-00000-of-00001.parquet"
).to_dicts()
# Shuffles with the global `random` state; there is no reseed here, so the
# order depends on how much randomness earlier cells have already consumed.
random.shuffle(zebra_grid_questions)


def get_episode(question: dict) -> Episode:
    """Build a ZebraLogic grid episode for one question.

    The prompt is the puzzle text followed by an empty markdown table with the
    solution's header and one blank row per solution row. Each completion is
    rewarded with the fraction of grid cells that match the solution
    (case-insensitive, surrounding whitespace ignored).

    Args:
        question: Dataset row with ``puzzle`` and ``solution`` (which holds
            ``header`` and ``rows``).

    Returns:
        The configured ``Episode``.
    """
    header = question["solution"]["header"]
    rows = question["solution"]["rows"]

    # The table rows are built outside the f-strings: the previous version
    # reused double quotes inside a double-quoted f-string, which is only valid
    # syntax from Python 3.12 on. The blank row is identical for every solution
    # row, so it is computed once.
    header_row = " | ".join(header)
    separator_row = " | ".join("-" * len(name) for name in header)
    blank_row = " | ".join(" " * len(name) for name in header)

    prompt = (
        f"{question['puzzle']}\n"
        "Fill in the grid with the correct values:\n"
        "\n"
        f"| {header_row} |\n"
        f"| {separator_row} |\n"
    )
    prompt += f"| {blank_row} |\n" * len(rows)

    # One lazy capture group per column of a markdown table row.
    pattern = re.compile(
        r"\| " + r"\|".join(r"(.*?)" for _ in header) + r" \|"
    )

    def on_sample(completions: list[EpisodeCompletion]):
        """Score every completion against the solution grid and commit its reward."""
        num_cells = sum(len(row) for row in rows)
        for completion in completions:
            message = completion.last_assistant_message
            assert "content" in message and isinstance(message["content"], str)
            # Keep only the last `len(rows)` table lines so the final grid in
            # the response is the one that gets scored.
            answer_rows = pattern.findall(message["content"])[-len(rows) :]
            num_correct = 0
            for answer_row, solution_row in zip(answer_rows, rows):
                for cell, value in zip(answer_row, solution_row):
                    if cell.strip().lower() == value.lower():
                        num_correct += 1
            completion.commit(reward=num_correct / num_cells)

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        on_sample=on_sample,
    )

# NOTE: `get_episode` here is the ZebraLogic builder defined just above; it
# shadows the Temporal Clue builder of the same name from an earlier cell.
zebra_grid_episodes = [get_episode(question) for question in zebra_grid_questions]

In [8]:
from datasets import load_dataset

# Materialize the training split of the MATH dataset as a list so it can be
# shuffled in place.
math_questions = list(
    load_dataset("lighteval/MATH", "all")["train"].to_iterable_dataset()  # type: ignore
)
# Uses the global `random` state (no reseed in this cell).
random.shuffle(math_questions)


# Overwritten by every `get_episode` call below with the raw solution text it
# is currently processing.
question_solution = None
# Captures the content of a LaTeX \boxed{...}. `[^}]+` stops at the first
# closing brace, so nested braces are captured only up to that brace.
pattern = re.compile(r"\\boxed{([^}]+)}")


def get_episode(question: dict) -> Episode:
    r"""Build a MATH episode that is scored on the boxed answers in a response.

    The expected answer is the content of the first ``\boxed{...}`` found in
    ``question["solution"]``. A completion's reward is ``0.9 ** k``, where ``k``
    is the position of the expected answer counted from the LAST boxed answer
    in the response (``k = 0`` for the final one), or 0 if it never appears.

    Args:
        question: Dataset row with ``problem`` and ``solution``.

    Returns:
        The configured ``Episode``.

    Raises:
        AssertionError: If the solution contains no boxed answer.
    """
    # Only the first literal is an f-string. The remaining literals are plain
    # strings, so braces must NOT be doubled there: the previous `{{}}` was sent
    # to the model verbatim as `\boxed{{}}` / `\boxed{{2}}`, and an answer
    # copied in that form is captured as `{2` and never matches.
    prompt = (
        f"{question['problem']}\n\n"
        "Solve this math problem and show your work. Your final answer MUST be "
        "formatted in a LaTeX box using \\boxed{}. For example: "
        "$1+1=\\boxed{2}$\n\n"
        "You can submit multiple attempts. Each attempt should end with a boxed "
        "answer. Your last answer will be weighted the most, but you can get "
        "partial credit if an earlier answer is correct. If after multiple "
        "attempts you decide an earlier answer is the correct one, just submit "
        "it again to get full credit."
    )

    # Module-level side effect kept from the original: records the raw solution
    # most recently processed (presumably to inspect a failing assert below).
    global question_solution
    question_solution = question["solution"]

    # Bind the shared module-level pattern once so extraction from the solution
    # and from the responses cannot drift apart.
    boxed_pattern = pattern
    solution_match = boxed_pattern.search(question["solution"])
    assert solution_match is not None, question["solution"]
    expected_answer = solution_match.group(1)
    # NOTE(review): `[^}]+` stops at the first `}`, so `\boxed{\frac{1}{2}}` is
    # captured as `\frac{1`. Both sides are truncated the same way, which means
    # different answers sharing that prefix compare equal.

    def on_sample(completions: list[EpisodeCompletion]):
        """Score every completion by where the expected answer appears."""
        for completion in completions:
            content = completion.last_assistant_message.get("content")
            assert isinstance(content, str)
            # Newest attempt first, so index 0 is the final boxed answer.
            attempts = boxed_pattern.findall(content)[::-1]
            try:
                reward = 0.9 ** attempts.index(expected_answer)
            except ValueError:
                reward = 0
            completion.commit(reward=reward)

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        on_sample=on_sample,
    )


# Take the first 2048 shuffled questions and keep only those whose solution
# contains a boxed answer (the builder asserts on this), so the result may hold
# fewer than 2048 episodes.
math_episodes = [
    get_episode(question)
    for question in math_questions[:2048]
    if re.search(pattern, question["solution"]) is not None
]

0000.parquet:   0%|          | 0.00/2.99M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
import asyncio
from dataclasses import dataclass, field
from lib.rl.completion import SplitMethod
from lib.rl.completion_sampler import CompletionSampler, SamplingKwargs
from lib.rl.trainer import ExploreImpl, ExploreOptions
from lib.tokenizer import Tokenizer
import numpy as np
from typing import Callable


@dataclass
class DefaultExploreImpl(ExploreImpl):
    """Explore each episode by repeatedly calling ``Episode.sample_completions``.

    All sampling parameters come from ``explore_options``. Episodes are taken
    from ``ready_episodes`` and explored concurrently; each finished episode
    (or the exception it raised) is put on ``done_episodes``.
    """

    explore_options: ExploreOptions

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        """Start one exploration task per ready episode until a falsy item arrives."""
        # The event loop only keeps weak references to tasks, so hold strong
        # references while they run; otherwise a task may be garbage-collected
        # before it finishes. References are held for as long as this coroutine
        # is running.
        running_tasks: set[asyncio.Task[Episode]] = set()

        def done_callback(task: asyncio.Task[Episode]) -> None:
            running_tasks.discard(task)
            try:
                done_episodes.put_nowait(task.result())
            except BaseException as exception:
                done_episodes.put_nowait(exception)

        # Episodes received earlier get a lower priority number.
        priority = 1
        while episode := await ready_episodes.get():
            task = asyncio.create_task(
                self._explore_episode(
                    completion_sampler, tokenizer, episode, update_progress, priority
                )
            )
            running_tasks.add(task)
            task.add_done_callback(done_callback)
            priority += 1

    async def _explore_episode(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        episode: Episode,
        update_progress: Callable[[float], None],
        priority: int,
    ) -> Episode:
        """Run ``explore_options.iterations`` sampling rounds on one episode.

        Progress is reported in equal shares, so a fully explored episode adds
        up to 1. Returns the same episode object that was passed in.
        """
        for _ in range(self.explore_options.iterations):
            await episode.sample_completions(
                completion_sampler=completion_sampler,
                tokenizer=tokenizer,
                num_parents=self.explore_options.num_parents,
                branch_factor=self.explore_options.branch_factor,
                get_recovery_pattern=self.explore_options.get_recovery_pattern,
                # Falls back to `num_parents` when no split-point limit is set.
                max_splits_per_completion=self.explore_options.max_split_points
                or self.explore_options.num_parents,
                priority=priority,
                sample_probability_power=self.explore_options.get_sample_probability_power(),
                sampling_kwargs=self.explore_options.sampling_kwargs,
                split_by=self.explore_options.split_method,
                split_separators=self.explore_options.split_separators,
            )
            update_progress(1 / self.explore_options.iterations)
        return episode


@dataclass
class SimpleExploreImpl(ExploreImpl):
    """Explore each episode with one flat batch of ``num_samples`` completions."""

    # Number of completions sampled per episode (passed as `branch_factor`).
    num_samples: int
    sampling_kwargs: SamplingKwargs | None = None

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        """Start one sampling task per ready episode until a falsy item arrives.

        A finished episode is put on ``done_episodes``; if sampling failed, the
        exception is put there instead.
        """
        # The event loop only keeps weak references to tasks, so hold strong
        # references while they run to keep them from being garbage-collected.
        running_tasks: set[asyncio.Task[bool]] = set()

        while episode := await ready_episodes.get():
            task = asyncio.create_task(
                episode.sample_completions(
                    completion_sampler,
                    tokenizer,
                    num_parents=1,
                    branch_factor=self.num_samples,
                    sampling_kwargs=self.sampling_kwargs,
                )
            )
            running_tasks.add(task)

            # `episode=episode` binds the current episode; a plain closure
            # would see whichever episode the loop variable holds later.
            def done_callback(task: asyncio.Task[bool], episode=episode) -> None:
                running_tasks.discard(task)
                try:
                    # Retrieve the outcome first: previously the task was never
                    # inspected, so a failed sampling run was still reported as
                    # a finished episode and its exception was never retrieved.
                    task.result()
                    done_episodes.put_nowait(episode)
                    update_progress(1)
                except BaseException as e:
                    done_episodes.put_nowait(e)

            task.add_done_callback(done_callback)


@dataclass
class TreeExploreImpl(ExploreImpl):
    """Explore each episode as a tree of completions.

    Root completions are sampled first. Every leaf that is shallower than
    ``depth`` is then split into intermediate parents, and ``branch_factor``
    new completions are sampled from each of them, until no work is pending.
    """

    # Completions sampled from every intermediate split point.
    branch_factor: int
    # Target depth; a leaf at depth d is split into (depth - d + 1) partitions.
    depth: int
    # Completions sampled from the root; falls back to `branch_factor` if unset.
    num_roots: int | None = None
    # Softmax temperature over leaf cumulative rewards, used to pick one
    # existing trajectory to carry over.
    best_leaf_sampling_temperature: float = 0.01
    sampling_kwargs: SamplingKwargs | None = None
    split_method: SplitMethod = "count"
    split_separators: set[str] = field(default_factory=set)

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        """Start one tree expansion per ready episode until a falsy item arrives.

        A finished episode is put on ``done_episodes``; if expansion failed,
        the exception is put there instead.
        """
        model = await completion_sampler.get_model()

        async def expand(episode: Episode, priority: int) -> None:
            num_roots = self.num_roots or self.branch_factor

            # Bound up front: without this, an episode with no existing leaves
            # hit an UnboundLocalError when `best_leaf` was read below.
            best_leaf = None

            # If there are existing trajectories, we'll sample one
            # of the best ones to stabilize and improve training.
            leaves = list(episode.completion.leaves())
            if leaves:
                rewards = [leaf.cumulative_reward() for leaf in leaves]
                # Subtracting the maximum leaves the sampling distribution
                # unchanged but keeps exp() from overflowing to inf at low
                # temperatures (random.choices rejects a non-finite total).
                max_reward = max(rewards)
                best_leaf = random.choices(
                    leaves,
                    weights=[
                        np.exp(
                            (reward - max_reward)
                            / self.best_leaf_sampling_temperature
                        )
                        for reward in rewards
                    ],
                    k=1,
                )[0]
                # NOTE(review): the copy is presumably re-attributed to the
                # current model and merged up until it hangs directly off the
                # root; confirm against the completion class.
                best_leaf = best_leaf.recursive_copy(model=model)
                best_leaf.commit()
                while best_leaf.parent and best_leaf.parent.parent is None:
                    best_leaf = best_leaf.merge()

            pending: set[asyncio.Task] = {
                asyncio.create_task(
                    episode.sample_completions(
                        completion_sampler,
                        tokenizer,
                        num_parents=1,
                        # A carried-over trajectory takes one of the root slots.
                        branch_factor=num_roots - 1 if best_leaf else num_roots,
                        priority=priority,
                        sampling_kwargs=self.sampling_kwargs,
                        split_by=self.split_method,
                        split_separators=self.split_separators,
                    )
                )
            }

            num_leaves = 0
            while pending:
                finished, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED
                )
                for task in finished:
                    try:
                        task.result()
                    except BaseException as e:
                        # Report the failure for this episode and stop
                        # expanding it. Tasks still pending are not cancelled.
                        await done_episodes.put(e)
                        return
                current_num_leaves = 0
                for leaf in episode.completion.leaves(model=model):
                    current_num_leaves += 1
                    num_partitions = self.depth - leaf.depth() + 1
                    if num_partitions > 1:
                        # Split at evenly spaced fractions and drop the last
                        # element of the result. `cache=True` presumably keeps
                        # a leaf from being split again on the next pass.
                        parents = list(
                            leaf.split(
                                by=self.split_method,
                                at=(
                                    split / num_partitions
                                    for split in range(1, num_partitions)
                                ),
                                separators=self.split_separators,
                                cache=True,
                            )
                        )[:-1]
                        for parent in parents:
                            pending.add(
                                asyncio.create_task(
                                    episode._sample_completions(
                                        parent=parent,
                                        model=model,
                                        completion_sampler=completion_sampler,
                                        tokenizer=tokenizer,
                                        branch_factor=self.branch_factor,
                                        fork_decay=1.0,
                                        recovery_pattern=None,
                                        split_separators=self.split_separators,
                                        sampling_kwargs=self.sampling_kwargs
                                        or SamplingKwargs(),
                                        priority=priority,
                                    )
                                )
                            )
                # Progress is the share of newly seen leaves relative to the
                # expected total of num_roots * branch_factor ** (depth - 1).
                update_progress(
                    (current_num_leaves - num_leaves)
                    / (num_roots * (self.branch_factor ** (self.depth - 1)))
                )
                num_leaves = current_num_leaves

            await done_episodes.put(episode)

        # The event loop only keeps weak references to tasks, so hold strong
        # references while the expansions run.
        expand_tasks: set[asyncio.Task[None]] = set()

        def on_expand_done(task: asyncio.Task[None]) -> None:
            expand_tasks.discard(task)
            # Forward failures that `expand` did not report itself (anything
            # raised outside its sampling try-block); previously they were lost
            # and the episode never reached `done_episodes`.
            if not task.cancelled() and (error := task.exception()) is not None:
                done_episodes.put_nowait(error)

        priority = 0
        while episode := await ready_episodes.get():
            priority += 1
            expand_task = asyncio.create_task(expand(episode, priority))
            expand_tasks.add(expand_task)
            expand_task.add_done_callback(on_expand_done)


@dataclass
class IterativeVineExploreImpl(ExploreImpl):
    """Explore each episode as a single vine grown over ``depth`` rounds.

    Round 0 samples ``branch_factor`` completions from the episode's root
    completion. Every later round takes the highest-reward leaf, splits it,
    and samples ``branch_factor`` completions from the first piece.
    """

    branch_factor: int
    depth: int
    sampling_kwargs: SamplingKwargs | None = None
    split_method: SplitMethod = "count"
    split_separators: set[str] = field(default_factory=set)

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        """Start one vine per ready episode until a falsy item arrives.

        A finished episode is put on ``done_episodes``; if a vine fails, its
        exception is put there instead.
        """
        model = await completion_sampler.get_model()

        async def iterate_vine(episode: Episode, priority: int) -> None:

            for depth in range(self.depth):
                if depth == 0:
                    parent = episode.completion
                else:
                    parent = max(
                        episode.completion.leaves(model=model),
                        key=lambda leaf: leaf.reward,
                    )
                    # In round d of D the split fraction is 1 / (D - d + 1);
                    # sampling continues from the first piece of the split.
                    parent = list(
                        parent.split(
                            by=self.split_method,
                            at=[1 / (self.depth - depth + 1)],
                            separators=self.split_separators,
                            cache=True,
                        )
                    )[0]
                await episode._sample_completions(
                    parent=parent,
                    model=model,
                    completion_sampler=completion_sampler,
                    tokenizer=tokenizer,
                    branch_factor=self.branch_factor,
                    fork_decay=1.0,
                    recovery_pattern=None,
                    split_separators=self.split_separators,
                    sampling_kwargs=self.sampling_kwargs or SamplingKwargs(),
                    priority=priority,
                )
                update_progress(1 / self.depth)

            await done_episodes.put(episode)

        # The event loop only keeps weak references to tasks, so hold strong
        # references while the vines run.
        vine_tasks: set[asyncio.Task[None]] = set()

        def on_vine_done(task: asyncio.Task[None]) -> None:
            vine_tasks.discard(task)
            # Forward failures to the consumer; previously an exception inside
            # `iterate_vine` was lost and the episode never reached
            # `done_episodes`.
            if not task.cancelled() and (error := task.exception()) is not None:
                done_episodes.put_nowait(error)

        priority = 0
        while episode := await ready_episodes.get():
            priority += 1
            vine_task = asyncio.create_task(iterate_vine(episode, priority))
            vine_tasks.add(vine_task)
            vine_task.add_done_callback(on_vine_done)

In [10]:
from aioitertools.helpers import maybe_await
import asyncio
from collections import Counter
import itertools as it
from lib import clue
from lib.rl.episode import Episode
from lib.rl.ppo import PPOLoss
from lib.rl.recipe import ComponentConfig, TuneRecipeConfig
from lib.rl.trainer import Eval, ExploreOptions, Trainer, vLLMConfig
import torch
from torchtune.models.llama3_1 import llama3_1_8b
from typing import AsyncIterable


# 32 episodes per visible CUDA device. Evaluates to 0 when no CUDA device is
# visible, because `torch.cuda.device_count()` returns 0 in that case.
episodes_per_iteration = 32 * torch.cuda.device_count()


async def train_episodes(revisit_frequency: float = 0.0) -> AsyncIterable[Episode | BaseException]:
    """Yield training episodes forever, optionally revisiting earlier ones.

    Episodes are drawn in a cycle from ``temporal_clue_episodes[64:]`` and
    resolved concurrently, keeping up to ``episodes_per_iteration`` tasks in
    flight. Once more than ``episodes_per_iteration`` distinct episodes have
    been seen, each pass may first re-yield the least-visited ones.

    Args:
        revisit_frequency: Probability of each additional revisit. Must be
            below 1.0: ``random.random()`` is always < 1, so a value of 1.0 or
            more would keep the revisit loop running forever.

    Yields:
        Resolved episodes, or the exception raised while resolving one.
    """
    pending: set[asyncio.Task[Episode | BaseException]] = set()
    # `zip` interleaves the active episode sources; with one source it simply
    # yields 1-tuples that the inner loop unpacks.
    episodes = (
        maybe_await(episode)
        for episodes in zip(
            # (clue.sample_random_episode() for _ in it.repeat(0)),
            it.cycle(temporal_clue_episodes[64:]),
            # it.cycle(zebra_grid_episodes[64:]),
            # it.cycle(math_episodes[64:]),
        )
        for episode in episodes
    )
    visited_episodes = Counter[Episode]()
    while True:
        # Top the in-flight set back up to `episodes_per_iteration` tasks.
        pending.update(
            asyncio.create_task(next(episodes))
            for _ in range(episodes_per_iteration - len(pending))  # type: ignore
        )
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        if len(visited_episodes) > episodes_per_iteration:
            while random.random() < revisit_frequency:
                episode = min(visited_episodes, key=lambda e: visited_episodes[e])
                visited_episodes[episode] += 1
                yield episode
        for task in done:
            result: Episode | BaseException
            try:
                result = task.result()
                if isinstance(result, Episode):
                    visited_episodes[result] += 1
            except BaseException as e:
                result = e
            # Yield outside the try: with the yield inside it, the
            # GeneratorExit raised by `aclose()` was caught by
            # `except BaseException` and followed by another yield, which makes
            # the generator fail with RuntimeError.
            yield result


async def val_episodes() -> AsyncIterable[Episode | BaseException]:
    """Yield 64 randomly sampled clue episodes in completion order.

    Yields:
        Each sampled episode, or the exception raised while sampling it.
    """
    for fut in asyncio.as_completed(clue.sample_random_episode() for _ in range(64)):
        result: Episode | BaseException
        try:
            result = await fut
        except BaseException as e:
            result = e
        # Yield outside the try: with the yield inside it, the GeneratorExit
        # raised by `aclose()` was caught by `except BaseException` and
        # followed by another yield, which makes the generator fail with
        # RuntimeError.
        yield result


# Exploration settings handed to the Trainer. The active explore_impl below
# (TreeExploreImpl) is configured separately and does not receive this object;
# only the commented-out DefaultExploreImpl would.
explore_options = ExploreOptions(
    iterations=1,
    num_parents=6,
    branch_factor=2,
    patience=60,
    advantage_max_weight=0.15,
    sample_probability_power=None,
    sampling_kwargs={"max_tokens": 4096, "stop": ["://", "<|end_of_text|>"]},
    # split_method="prob",
    # split_point_std_deviation=0.5,
)

# Used for the output directory and as both the wandb run name and run id.
model_name = "rl100"

# Trains on temporal_clue_episodes[64:] (via train_episodes) and evaluates on
# the first 64 episodes, so the two sets do not overlap.
trainer = Trainer(
    base_model="NousResearch/Hermes-2-Theta-Llama-3-8B",
    output_dir=f"./models/{model_name}",
    explore_options=explore_options,
    # explore_impl=DefaultExploreImpl(explore_options),
    # explore_impl=SimpleExploreImpl(
    #     num_samples=8, sampling_kwargs={"max_tokens": 4096}
    # ),
    explore_impl=TreeExploreImpl(
        branch_factor=2,
        depth=5,
        num_roots=4,
        best_leaf_sampling_temperature=0.05,
        sampling_kwargs={"max_tokens": 4096, "stop": ["://", "<|end_of_text|>"]},
    ),
    force_terminate_vllms=True,
    train_episodes=train_episodes(revisit_frequency=0.5),
    episodes_per_iteration=episodes_per_iteration,
    max_mask_sequence_batch_size=1,
    evals=[
        # Eval(
        #     name="variable_clue",
        #     episodes=val_episodes(),
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
        Eval(
            name="temporal_clue",
            episodes=temporal_clue_episodes[:64],
            samples_per_episode=3,
            sampling_kwargs={"max_tokens": 4096, "stop": ["://", "<|end_of_text|>"]},
        ),
        # Eval(
        #     name="zebra_grid",
        #     episodes=zebra_grid_episodes[:64],
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
        # Eval(
        #     name="math",
        #     episodes=math_episodes[:64],
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
    ],
    tune_model=llama3_1_8b,
    tune_model_type="LLAMA3",
    tune_recipe_config=TuneRecipeConfig(
        seed=42,
        shuffle=True,
        num_output_chunks=4,
        resume_from_checkpoint=False,
        batch_size=1,
        epochs=1,
        max_steps_per_epoch=32,
        optimizer=ComponentConfig(
            "torch.optim.AdamW",
            # "bitsandbytes.optim.PagedAdamW8bit",
            # "bitsandbytes.optim.AdamW",
            # params=PLACEHOLDER,
            lr=4e-6,
            fused=True,
        ),
        # Of the `*_coef` weights below, only tanh_log_policy_coef,
        # entropy_target_coef and kl_coef are non-zero.
        loss=ComponentConfig(
            PPOLoss,
            policy_coef=0.0,
            clip_epsilon=0.2,
            unclipped_policy_coef=0.0,
            tanh_log_policy_coef=0.8,
            advantage_prediction_coef=0.0,
            predicted_advantage_weight=0.0,
            value_coef=0.0,
            entropy_coef=0.0,
            entropy_target=0.6,
            entropy_target_coef=0.05,
            kl_coef=0.2,
            weighted_entropy_coef=0.0,
            weighted_kl_coef=0.0,
            weighted_ce_coef=0.0,
            normalize_values=False,
            normalize_value_predictions=False,
            normalize_advantages=False,
        ),
        compile=False,
        optimizer_in_bwd=False,
        gradient_accumulation_steps=1,
        enable_activation_checkpointing=True,
        enable_activation_offloading=False,
        custom_sharded_layers=["tok_embeddings", "output"],
        log_every_n_steps=1,
        log_peak_memory_stats=True,
    ),
    # tune_run=False,
    # Same value as vLLM's max_model_len below.
    tune_sequence_length=16384,
    vllm_config=vLLMConfig(
        env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
        kwargs=dict(
            block_size=32,
            disable_log_requests=True,
            enable_chunked_prefill=True,
            enable_prefix_caching=True,
            enforce_eager=True,
            gpu_memory_utilization=0.9,
            max_model_len=16384,
            max_num_seqs=512,
            max_num_batched_tokens=16384,
            preemption_mode="swap",
            return_tokens_as_token_ids=True,
            swap_space=100,
        ),
        max_concurrent_samples=512,
        min_time_between_requests=0.0,
        # The timeout grows by 15 per visible CUDA device.
        timeout=120 + 15 * torch.cuda.device_count(),
    ),
    wandb_kwargs=dict(
        name=model_name,
        id=model_name,
    ),
)

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

INFO 01-04 15:41:36 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='NousResearch/Hermes-2-Theta-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Theta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-Theta-Llama-3-8B, num_scheduler_steps=1, chunked_prefill_enabled=Fal

tokenizer_config.json:   0%|          | 0.00/56.3k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
# Top-level `await` works here because IPython runs notebook cells inside an
# event loop. Runs 50 training iterations.
await trainer.train(iterations=50, verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 23 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|23|Loss: 0.0276: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [06:58<00:00, 17.74s/it, advantage=0.0000, advantage_prediction=0.9023, entropy=0.5453, entropy_target=0.0547, exploration=0.2679, kl_div=0.1292, policy=0.0492, reinforce=-0.0067, tanh_log_policy=-0.0011, unclipped_policy=0.0183, value=0.0000, weighted_ce=-0.0067, weighted_entropy=0.0130, weighted_kl_div=0.0065] 

Saved iteration 1 model files to /home/ubuntu/atreides/experiments/models/rl100/0001
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0001 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 29 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|29|Loss: 0.0275: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29/29 [09:06<00:00, 18.84s/it, advantage=0.0000, advantage_prediction=1.0234, entropy=0.4485, entropy_target=0.1515, exploration=0.2263, kl_div=0.0907, policy=0.0025, reinforce=-0.0026, tanh_log_policy=0.0022, unclipped_policy=-0.0233, value=0.0000, weighted_ce=-0.0026, weighted_entropy=-0.0031, weighted_kl_div=-0.0021]

Saved iteration 2 model files to /home/ubuntu/atreides/experiments/models/rl100/0002
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0002 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Tuning model on 27 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|27|Loss: 0.0111: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [08:08<00:00, 17.79s/it, advantage=0.0000, advantage_prediction=0.6862, entropy=0.5270, entropy_target=0.0730, exploration=0.1061, kl_div=0.0380, policy=-0.0102, reinforce=-0.0052, tanh_log_policy=-0.0002, unclipped_policy=-0.0153, value=0.0000, weighted_ce=-0.0052, weighted_entropy=0.0007, weighted_kl_div=0.0050]

Saved iteration 3 model files to /home/ubuntu/atreides/experiments/models/rl100/0003
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0003 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 23 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|23|Loss: 0.0170: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [06:57<00:00, 17.79s/it, advantage=0.0000, advantage_prediction=0.4125, entropy=0.5024, entropy_target=0.0976, exploration=0.0931, kl_div=0.0511, policy=0.0360, reinforce=-0.0014, tanh_log_policy=0.0024, unclipped_policy=0.0227, value=0.0000, weighted_ce=-0.0014, weighted_entropy=-0.0127, weighted_kl_div=-0.0016]

Saved iteration 4 model files to /home/ubuntu/atreides/experiments/models/rl100/0004
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0004 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 9 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|9|Loss: 0.0203: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [02:49<00:00, 17.95s/it, advantage=0.0000, advantage_prediction=0.2917, entropy=0.3080, entropy_target=0.2920, exploration=0.0515, kl_div=0.0271, policy=-0.0110, reinforce=-0.0005, tanh_log_policy=0.0004, unclipped_policy=-0.0125, value=0.0000, weighted_ce=-0.0005, weighted_entropy=-0.0040, weighted_kl_div=0.0002]

Saved iteration 5 model files to /home/ubuntu/atreides/experiments/models/rl100/0005
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0005 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0266: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.45s/it, advantage=0.0000, advantage_prediction=0.5318, entropy=0.5872, entropy_target=0.0128, exploration=0.2478, kl_div=0.1272, policy=-0.0068, reinforce=-0.0025, tanh_log_policy=0.0007, unclipped_policy=-0.0241, value=0.0000, weighted_ce=-0.0025, weighted_entropy=0.0024, weighted_kl_div=0.0011]

Saved iteration 6 model files to /home/ubuntu/atreides/experiments/models/rl100/0006
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0006 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0179: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.87s/it, advantage=0.0000, advantage_prediction=1.2403, entropy=0.4418, entropy_target=0.1582, exploration=0.2034, kl_div=0.1080, policy=0.0036, reinforce=-0.0571, tanh_log_policy=-0.0146, unclipped_policy=-0.0261, value=0.0000, weighted_ce=-0.0571, weighted_entropy=0.0313, weighted_kl_div=0.0410]  

Saved iteration 7 model files to /home/ubuntu/atreides/experiments/models/rl100/0007
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0007 --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (2 remaining episodes x 60 patience per episode = 120 seconds)
Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|5|Loss: 0.0340: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.44s/it, advantage=0.0000, advantage_prediction=1.1537, entropy=0.2804, entropy_target=0.3196, exploration=0.2001, kl_div=0.0886, policy=0.0474, reinforce=-0.0067, tanh_log_policy=0.0004, unclipped_policy=0.0313, value=0.0000, weighted_ce=-0.0067, weighted_entropy=0.0045, weighted_kl_div=0.0010]   

Saved iteration 8 model files to /home/ubuntu/atreides/experiments/models/rl100/0008
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0008 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0282: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:20<00:00, 18.92s/it, advantage=0.0000, advantage_prediction=0.6817, entropy=0.3561, entropy_target=0.2439, exploration=0.1834, kl_div=0.0947, policy=0.0477, reinforce=-0.0225, tanh_log_policy=-0.0036, unclipped_policy=0.0216, value=0.0000, weighted_ce=-0.0225, weighted_entropy=0.0278, weighted_kl_div=0.0158] 

Saved iteration 9 model files to /home/ubuntu/atreides/experiments/models/rl100/0009
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0009 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 6 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|5|Loss: 0.0227: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:55<00:00, 18.22s/it, advantage=0.0000, advantage_prediction=1.2379, entropy=0.5613, entropy_target=0.0387, exploration=0.2328, kl_div=0.1351, policy=0.0834, reinforce=-0.0429, tanh_log_policy=-0.0078, unclipped_policy=0.0458, value=0.0000, weighted_ce=-0.0429, weighted_entropy=0.0560, weighted_kl_div=0.0262]  

Saved iteration 10 model files to /home/ubuntu/atreides/experiments/models/rl100/0010
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0010 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 6 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|5|Loss: 0.0277: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:54<00:00, 18.18s/it, advantage=0.0000, advantage_prediction=1.0395, entropy=0.6948, entropy_target=0.0948, exploration=0.2444, kl_div=0.1327, policy=0.0935, reinforce=-0.0215, tanh_log_policy=-0.0045, unclipped_policy=0.0545, value=0.0000, weighted_ce=-0.0215, weighted_entropy=0.0327, weighted_kl_div=0.0096] 

Saved iteration 11 model files to /home/ubuntu/atreides/experiments/models/rl100/0011
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0011 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0181: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.46s/it, advantage=0.0000, advantage_prediction=1.0077, entropy=0.5763, entropy_target=0.0237, exploration=0.1735, kl_div=0.0790, policy=0.0351, reinforce=-0.0112, tanh_log_policy=0.0013, unclipped_policy=0.0208, value=0.0000, weighted_ce=-0.0112, weighted_entropy=0.0171, weighted_kl_div=0.0095]   

Saved iteration 12 model files to /home/ubuntu/atreides/experiments/models/rl100/0012
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0012 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0298: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.40s/it, advantage=0.0000, advantage_prediction=1.2305, entropy=0.2105, entropy_target=0.3895, exploration=0.1501, kl_div=0.0644, policy=0.0276, reinforce=-0.0189, tanh_log_policy=-0.0032, unclipped_policy=0.0149, value=0.0000, weighted_ce=-0.0189, weighted_entropy=0.0271, weighted_kl_div=0.0129]  

Saved iteration 13 model files to /home/ubuntu/atreides/experiments/models/rl100/0013
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0013 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0235: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.40s/it, advantage=0.0000, advantage_prediction=0.3719, entropy=0.6276, entropy_target=0.0276, exploration=0.1961, kl_div=0.1099, policy=0.0070, reinforce=-0.0061, tanh_log_policy=0.0002, unclipped_policy=-0.0096, value=0.0000, weighted_ce=-0.0061, weighted_entropy=0.0042, weighted_kl_div=0.0090]

Saved iteration 14 model files to /home/ubuntu/atreides/experiments/models/rl100/0014
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0014 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 12 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|11|Loss: 0.0245: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [03:42<00:00, 17.78s/it, advantage=0.0000, advantage_prediction=0.4744, entropy=0.3947, entropy_target=0.2053, exploration=0.1632, kl_div=0.0526, policy=0.0383, reinforce=0.0022, tanh_log_policy=0.0047, unclipped_policy=0.0311, value=0.0000, weighted_ce=0.0022, weighted_entropy=-0.0098, weighted_kl_div=-0.0082]  

Saved iteration 15 model files to /home/ubuntu/atreides/experiments/models/rl100/0015
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0015 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 6 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|6|Loss: 0.0213: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:56<00:00, 18.25s/it, advantage=0.0000, advantage_prediction=0.6734, entropy=0.6980, entropy_target=0.0980, exploration=0.2022, kl_div=0.1002, policy=0.0893, reinforce=-0.0311, tanh_log_policy=-0.0046, unclipped_policy=0.0742, value=0.0000, weighted_ce=-0.0311, weighted_entropy=0.0440, weighted_kl_div=0.0179] 

Saved iteration 16 model files to /home/ubuntu/atreides/experiments/models/rl100/0016
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0016 --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 6 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|6|Loss: 0.0297: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:54<00:00, 18.10s/it, advantage=0.0000, advantage_prediction=0.8918, entropy=0.7644, entropy_target=0.1644, exploration=0.1918, kl_div=0.1032, policy=0.0686, reinforce=-0.0037, tanh_log_policy=0.0011, unclipped_policy=0.0590, value=0.0000, weighted_ce=-0.0037, weighted_entropy=0.0244, weighted_kl_div=0.0008]   

Saved iteration 17 model files to /home/ubuntu/atreides/experiments/models/rl100/0017
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0017 --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 6 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|6|Loss: 0.0183: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [01:55<00:00, 18.21s/it, advantage=0.0000, advantage_prediction=0.8723, entropy=0.6398, entropy_target=0.0398, exploration=0.2346, kl_div=0.1019, policy=0.0222, reinforce=-0.0278, tanh_log_policy=-0.0051, unclipped_policy=0.0011, value=0.0000, weighted_ce=-0.0278, weighted_entropy=0.0451, weighted_kl_div=0.0073] 

Saved iteration 18 model files to /home/ubuntu/atreides/experiments/models/rl100/0018
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0018 --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0195: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:38<00:00, 18.54s/it, advantage=0.0000, advantage_prediction=0.5063, entropy=0.6524, entropy_target=0.0524, exploration=0.2092, kl_div=0.0970, policy=0.0449, reinforce=-0.0173, tanh_log_policy=-0.0031, unclipped_policy=0.0336, value=0.0000, weighted_ce=-0.0173, weighted_entropy=0.0325, weighted_kl_div=0.0081]  

Saved iteration 19 model files to /home/ubuntu/atreides/experiments/models/rl100/0019
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0019 --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0338: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.82s/it, advantage=0.0000, advantage_prediction=0.3543, entropy=0.9039, entropy_target=0.3039, exploration=0.1821, kl_div=0.0999, policy=-0.0012, reinforce=-0.0045, tanh_log_policy=-0.0017, unclipped_policy=-0.0138, value=0.0000, weighted_ce=-0.0045, weighted_entropy=0.0106, weighted_kl_div=0.0003]

Saved iteration 20 model files to /home/ubuntu/atreides/experiments/models/rl100/0020
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0020 --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0219: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.51s/it, advantage=0.0000, advantage_prediction=0.6894, entropy=0.3824, entropy_target=0.2176, exploration=0.1774, kl_div=0.0760, policy=-0.0534, reinforce=-0.0341, tanh_log_policy=-0.0053, unclipped_policy=-0.0745, value=0.0000, weighted_ce=-0.0341, weighted_entropy=0.0366, weighted_kl_div=0.0184]

Saved iteration 21 model files to /home/ubuntu/atreides/experiments/models/rl100/0021
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0021 --port=8003 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0118: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.85s/it, advantage=0.0000, advantage_prediction=1.1814, entropy=0.4775, entropy_target=0.1225, exploration=0.1741, kl_div=0.0629, policy=-0.0116, reinforce=-0.0358, tanh_log_policy=-0.0086, unclipped_policy=-0.0333, value=0.0000, weighted_ce=-0.0358, weighted_entropy=0.0629, weighted_kl_div=0.0205]

Saved iteration 22 model files to /home/ubuntu/atreides/experiments/models/rl100/0022
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0022 --port=8004 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0260: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.79s/it, advantage=0.0000, advantage_prediction=0.9356, entropy=0.1598, entropy_target=0.4402, exploration=0.1407, kl_div=0.0451, policy=-0.0312, reinforce=-0.0309, tanh_log_policy=-0.0063, unclipped_policy=-0.0650, value=0.0000, weighted_ce=-0.0309, weighted_entropy=0.0218, weighted_kl_div=0.0199]

Saved iteration 23 model files to /home/ubuntu/atreides/experiments/models/rl100/0023
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0023 --port=8004 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0168: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.78s/it, advantage=0.0000, advantage_prediction=0.5302, entropy=0.4142, entropy_target=0.1858, exploration=0.1373, kl_div=0.0569, policy=0.0042, reinforce=-0.0179, tanh_log_policy=-0.0049, unclipped_policy=-0.0291, value=0.0000, weighted_ce=-0.0179, weighted_entropy=0.0262, weighted_kl_div=0.0113]

Saved iteration 24 model files to /home/ubuntu/atreides/experiments/models/rl100/0024
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0024 --port=8005 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|5|Loss: 0.0207: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:37<00:00, 18.47s/it, advantage=0.0000, advantage_prediction=1.2107, entropy=0.3020, entropy_target=0.2980, exploration=0.1778, kl_div=0.0924, policy=0.0374, reinforce=-0.0703, tanh_log_policy=-0.0159, unclipped_policy=0.0122, value=0.0000, weighted_ce=-0.0703, weighted_entropy=0.0575, weighted_kl_div=0.0610]

Saved iteration 25 model files to /home/ubuntu/atreides/experiments/models/rl100/0025
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0025 --port=8005 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (0 remaining episodes x 60 patience per episode = 0 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0381: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.80s/it, advantage=0.0000, advantage_prediction=2.1065, entropy=0.7336, entropy_target=0.1336, exploration=0.2266, kl_div=0.0991, policy=0.8602, reinforce=-0.2015, tanh_log_policy=0.0145, unclipped_policy=0.8383, value=0.0000, weighted_ce=-0.2015, weighted_entropy=0.0383, weighted_kl_div=0.0148]  

Saved iteration 26 model files to /home/ubuntu/atreides/experiments/models/rl100/0026
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0026 --port=8005 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0225: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:20<00:00, 18.96s/it, advantage=0.0000, advantage_prediction=0.5470, entropy=0.3257, entropy_target=0.2743, exploration=0.1202, kl_div=0.0439, policy=-0.0076, reinforce=-0.0385, tanh_log_policy=-0.0000, unclipped_policy=-0.0140, value=0.0000, weighted_ce=-0.0385, weighted_entropy=0.0430, weighted_kl_div=0.0151]

Saved iteration 27 model files to /home/ubuntu/atreides/experiments/models/rl100/0027
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0027 --port=8005 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (2 remaining episodes x 60 patience per episode = 120 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0126: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:40<00:00, 25.08s/it, advantage=0.0000, advantage_prediction=1.0171, entropy=0.2847, entropy_target=0.3153, exploration=0.1279, kl_div=0.0422, policy=-0.0007, reinforce=-0.0513, tanh_log_policy=-0.0145, unclipped_policy=-0.0173, value=0.0000, weighted_ce=-0.0513, weighted_entropy=0.0647, weighted_kl_div=0.0126]

Saved iteration 28 model files to /home/ubuntu/atreides/experiments/models/rl100/0028
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0028 --port=8006 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0138: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:19<00:00, 18.86s/it, advantage=0.0000, advantage_prediction=0.4488, entropy=0.5201, entropy_target=0.0799, exploration=0.1532, kl_div=0.0624, policy=-0.0246, reinforce=-0.0067, tanh_log_policy=-0.0033, unclipped_policy=-0.0485, value=0.0000, weighted_ce=-0.0067, weighted_entropy=-0.0111, weighted_kl_div=0.0120]

Saved iteration 29 model files to /home/ubuntu/atreides/experiments/models/rl100/0029
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0029 --port=8006 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0236: 100%|██████████| 4/4 [01:20<00:00, 18.94s/it, advantage=0.0000, advantage_prediction=1.1175, entropy=0.3800, entropy_target=0.2200, exploration=0.1560, kl_div=0.0458, policy=0.0775, reinforce=-0.0146, tanh_log_policy=0.0043, unclipped_policy=0.0645, value=0.0000, weighted_ce=-0.0146, weighted_entropy=0.0243, weighted_kl_div=0.0164]

Saved iteration 30 model files to /home/ubuntu/atreides/experiments/models/rl100/0030
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0030 --port=8006 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 3 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|2|Loss: 0.0192: 100%|██████████| 3/3 [01:02<00:00, 19.83s/it, advantage=0.0000, advantage_prediction=1.7090, entropy=0.7681, entropy_target=0.1681, exploration=0.2826, kl_div=0.1153, policy=0.2411, reinforce=-0.0986, tanh_log_policy=-0.0154, unclipped_policy=0.1970, value=0.0000, weighted_ce=-0.0986, weighted_entropy=0.0827, weighted_kl_div=0.0353]

Saved iteration 31 model files to /home/ubuntu/atreides/experiments/models/rl100/0031
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0031 --port=8007 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0005: 100%|██████████| 4/4 [01:40<00:00, 25.21s/it, advantage=0.0000, advantage_prediction=1.2905, entropy=0.6085, entropy_target=0.0085, exploration=0.1791, kl_div=0.0719, policy=0.1545, reinforce=-0.0490, tanh_log_policy=-0.0178, unclipped_policy=0.1220, value=0.0000, weighted_ce=-0.0490, weighted_entropy=0.1024, weighted_kl_div=0.0211]

Saved iteration 32 model files to /home/ubuntu/atreides/experiments/models/rl100/0032
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0032 --port=8008 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


AssertionError: No model checkpoint files found to save in output directory /home/ubuntu/atreides/experiments/models/rl100

In [12]:
# Run the RL training loop on `trainer`, which is defined outside this cell.
# The bare top-level `await` relies on IPython's autoawait support in notebooks.
# NOTE(review): the recorded output shows the preceding run aborting with an
# AssertionError about missing checkpoint files, and this run first serving
# checkpoint 0032 -- so this call presumably resumes from the trainer's latest
# saved iteration. Confirm against the trainer implementation.
# NOTE(review): whether `iterations=50` means "50 more iterations" or "run until
# iteration 50" cannot be determined from this cell -- TODO confirm.
await trainer.train(iterations=50, verbosity=1)

Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0032 --port=8008 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (0 remaining episodes x 60.0 patience per episode = 0.0 seconds)
Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0169: 100%|██████████| 4/4 [01:20<00:00, 18.86s/it, advantage=0.0000, advantage_prediction=0.1185, entropy=0.4293, entropy_target=0.1707, exploration=0.1106, kl_div=0.0382, policy=-0.0164, reinforce=0.0004, tanh_log_policy=0.0009, unclipped_policy=-0.0195, value=0.0000, weighted_ce=0.0004, weighted_entropy=-0.0084, weighted_kl_div=-0.0002]

Saved iteration 33 model files to /home/ubuntu/atreides/experiments/models/rl100/0033
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0033 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 5 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0301: 100%|██████████| 5/5 [01:40<00:00, 18.70s/it, advantage=0.0000, advantage_prediction=0.5536, entropy=0.8303, entropy_target=0.2303, exploration=0.2448, kl_div=0.1020, policy=-0.0261, reinforce=-0.0164, tanh_log_policy=-0.0023, unclipped_policy=-0.0758, value=0.0000, weighted_ce=-0.0164, weighted_entropy=0.0225, weighted_kl_div=0.0041]

Saved iteration 34 model files to /home/ubuntu/atreides/experiments/models/rl100/0034
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0034 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|4|Loss: 0.0262: 100%|██████████| 4/4 [01:40<00:00, 25.13s/it, advantage=0.0000, advantage_prediction=0.6247, entropy=0.2420, entropy_target=0.3580, exploration=0.1213, kl_div=0.0389, policy=0.0166, reinforce=-0.0102, tanh_log_policy=0.0007, unclipped_policy=0.0057, value=0.0000, weighted_ce=-0.0102, weighted_entropy=0.0161, weighted_kl_div=0.0022]

Saved iteration 35 model files to /home/ubuntu/atreides/experiments/models/rl100/0035
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0035 --port=8010 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping exploration due to expired patience (1 remaining episodes x 60 patience per episode = 60 seconds)
Tuning model on 3 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0083: 100%|██████████| 3/3 [01:04<00:00, 20.18s/it, advantage=0.0000, advantage_prediction=0.8034, entropy=0.6283, entropy_target=0.0283, exploration=0.2366, kl_div=0.1018, policy=0.0770, reinforce=-0.0311, tanh_log_policy=-0.0168, unclipped_policy=0.0294, value=0.0000, weighted_ce=-0.0311, weighted_entropy=0.0707, weighted_kl_div=0.0046]

Saved iteration 36 model files to /home/ubuntu/atreides/experiments/models/rl100/0036
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0036 --port=8011 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0222: 100%|██████████| 4/4 [01:21<00:00, 19.13s/it, advantage=0.0000, advantage_prediction=0.1798, entropy=0.1772, entropy_target=0.4228, exploration=0.0439, kl_div=0.0194, policy=-0.0237, reinforce=-0.0082, tanh_log_policy=-0.0035, unclipped_policy=-0.0286, value=0.0000, weighted_ce=-0.0082, weighted_entropy=-0.0004, weighted_kl_div=0.0090]

Saved iteration 37 model files to /home/ubuntu/atreides/experiments/models/rl100/0037
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0037 --port=8011 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Tuning model on 4 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0368: 100%|██████████| 4/4 [01:20<00:00, 18.85s/it, advantage=0.0000, advantage_prediction=0.7281, entropy=0.9175, entropy_target=0.3175, exploration=0.5143, kl_div=0.0991, policy=-0.0271, reinforce=-0.0073, tanh_log_policy=0.0014, unclipped_policy=-0.0344, value=0.0000, weighted_ce=-0.0073, weighted_entropy=-0.0016, weighted_kl_div=0.0015]

Saved iteration 38 model files to /home/ubuntu/atreides/experiments/models/rl100/0038
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0038 --port=8012 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 3 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl100/config.yaml


1|3|Loss: 0.0309: 100%|██████████| 3/3 [01:05<00:00, 20.36s/it, advantage=0.0000, advantage_prediction=0.9381, entropy=0.3396, entropy_target=0.2604, exploration=0.2107, kl_div=0.0969, policy=0.0620, reinforce=-0.0380, tanh_log_policy=-0.0018, unclipped_policy=0.0362, value=0.0000, weighted_ce=-0.0380, weighted_entropy=0.0403, weighted_kl_div=0.0231]

Saved iteration 39 model files to /home/ubuntu/atreides/experiments/models/rl100/0039
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl100/0039 --port=8012 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default


CancelledError: 

In [12]:
# Manual recovery cell: first run a tuning step on the last entry of
# `trainer.explore_results`, then enter the regular training loop.
# This reads state held by the live `trainer` object (defined outside this
# cell), so the result depends on what was executed earlier in the kernel.
# NOTE(review): if `explore_results` is an empty list, `[-1]` raises IndexError
# -- presumably this cell is only meant to be run after an interrupted
# iteration; confirm before relying on it in a Restart-and-Run-All.
# NOTE(review): the output recorded under this cell references `rl90` paths,
# while the surrounding output references `rl100`; it looks like it was
# captured from a different run -- verify.
await trainer.tune(trainer.explore_results[-1], verbosity=2)
# Continue with the normal explore/tune loop once the pending tune step is done.
await trainer.train(iterations=50, verbosity=1)

Tuning model on 18 sequences
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl90/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: lib.rl.mlp_head_checkpointer.MLPHeadCheckpointer
  checkpoint_dir: /home/ubuntu/atreides/experiments/models/rl90/0045
  checkpoint_files:
  - /home/ubuntu/atreides/experiments/models/rl90/0045/hf_model_0001.pt
  - /home/ubuntu/atreides/experiments/models/rl90/0045/hf_model_0002.pt
  - /home/ubuntu/atreides/experiments/models/rl90/0045/hf_model_0003.pt
  - /home/ubuntu/atreides/experiments/models/rl90/0045/hf_model_0004.pt
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl90
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: lib.rl.pack.PackedDataset
  dir: /home/ubuntu/atreides/experiments/models/rl90/

Saved iteration 46 model files to /home/ubuntu/atreides/experiments/models/rl90/0046
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl90/0046 --port=8003 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]

Tuning model on 38 sequences
Experienced the following exception while stopping vLLM servers: <class 'TimeoutError'> 
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl90/config.yaml


1|32|Loss: 0.0036: 100%|██████████| 32/32 [10:45<00:00, 19.90s/it, advantage=0.0000, advantage_prediction=0.1506, entropy=0.5653, entropy_target=0.0347, exploration=0.1700, kl_div=0.1038, policy=-0.0176, reinforce=-0.0034, tanh_log_policy=-0.0041, unclipped_policy=-0.0268, value=0.0000, weighted_ce=-0.0034, weighted_entropy=0.0127, weighted_kl_div=0.0051]

Saved iteration 47 model files to /home/ubuntu/atreides/experiments/models/rl90/0047
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl90/0047 --port=8003 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/32 [00:00<?, ?episode/s]