In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from lib.langfuse import langfuse
# langfuse.enabled = False
langfuse.auth_check()

True

In [4]:
import json
from lib.rl.episode import Episode, EpisodeCompletion
import random
import re
from typing import TypedDict


class TemporalCluePuzzle(TypedDict):
    num_clues: int
    prompt: str
    solution: dict[str, str]


temporal_clue_puzzles: list[TemporalCluePuzzle] = json.load(
    open("./data/temporal-clue-puzzles.json")
)
random.seed(42)
random.shuffle(temporal_clue_puzzles)

In [5]:
chain_of_thought_examples: list[dict[str, str]] = json.load(
    open("./data/chain-of-thought-examples.json")
)
chain_of_thought_examples.pop(6)
chain_of_thought_examples.pop(3)

def get_episode(puzzle: TemporalCluePuzzle) -> Episode:

    def validate(completion: EpisodeCompletion) -> None:
        ...

    def on_sample(completions: list[EpisodeCompletion]) -> None:
        for completion in completions:
            content = completion.last_assistant_message.get("content")
            assert isinstance(content, str)
            num_correct = 0
            for key, value in puzzle["solution"].items():
                if matches := re.findall(rf"{key}\. ([A-Za-z \.:-]+)", content):
                    match = matches[-1]
                    if match.strip().lower() == value.lower():
                        num_correct += 1
            completion.commit(reward=num_correct / len(puzzle["solution"]))
            
    example = random.choices(chain_of_thought_examples, k=1)

    return Episode(
        messages=[
            {
                "role": "user",
                "content": puzzle["prompt"]
                .replace(
                    "Fill out your final answers in the following format:",
                    "After verifiably finding all the correct answers, fill out your final answers in the following format:",
                )
                ,
            },
            # {
            #     "role": "assistant",
            #     "content": "Let's think this through step by step...",
            # },
        ],
        on_sample=on_sample,
        examples=[
            {"role": "user", "content": example[0]["prompt"]},
            {
                "role": "assistant", 
                "content": example[0]["chain_of_thought"]
                + (example[0]["answer"] and f"\n\n---\n\n{example[0]['answer']}"),
            },
            # {"role": "user", "content": example[1]["prompt"]},
            # {
            #     "role": "assistant",
            #     "content": example[1]["chain_of_thought"] 
            #     + (example[1]["answer"] and f"\n\n---\n\n{example[1]['answer']}"),
            # },
        ],
    )


temporal_clue_episodes = [get_episode(puzzle) for puzzle in temporal_clue_puzzles]

In [6]:
import polars as pl

zebra_grid_questions = pl.read_parquet(
    "hf://datasets/allenai/ZebraLogicBench-private/grid_mode/test-00000-of-00001.parquet"
).to_dicts()
random.shuffle(zebra_grid_questions)


def get_episode(question: dict) -> Episode:
    prompt = f"""{question["puzzle"]}
Fill in the grid with the correct values:

| {' | '.join(question["solution"]["header"])} |
| {' | '.join(["-" * len(header) for header in question["solution"]["header"]])} |
"""

    for _ in question["solution"]["rows"]:
        prompt += f"| {' | '.join([" " * len(header) for header in question["solution"]["header"]])} |\n"

    pattern = re.compile(
        r"\| " + r"\|".join(r"(.*?)" for _ in question["solution"]["header"]) + r" \|"
    )

    def on_sample(completions: list[EpisodeCompletion]):
        for completion in completions:
            assert "content" in completion.last_assistant_message and isinstance(
                completion.last_assistant_message["content"], str
            )
            num_cells = sum(len(row) for row in question["solution"]["rows"])
            num_correct = 0
            for match, row in zip(
                re.findall(pattern, completion.last_assistant_message["content"])[
                    -len(question["solution"]["rows"]) :
                ],
                question["solution"]["rows"],
            ):
                for cell, value in zip(match, row):
                    if cell.strip().lower() == value.lower():
                        num_correct += 1
            completion.commit(reward=num_correct / num_cells)

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        on_sample=on_sample,
    )

zebra_grid_episodes = [get_episode(question) for question in zebra_grid_questions]

In [7]:
from datasets import load_dataset

math_questions = list(
    load_dataset("lighteval/MATH", "all")["train"].to_iterable_dataset()  # type: ignore
)
random.shuffle(math_questions)


question_solution = None
pattern = re.compile(r"\\boxed{([^}]+)}")


def get_episode(question: dict) -> Episode:
    prompt = (
        f"{question['problem']}\n\n"
        "Solve this math problem and show your work. Your final answer MUST be "
        "formatted in a LaTeX box using \\boxed{{}}. For example: "
        "$1+1=\\boxed{{2}}$\n\n"
        "You can submit multiple attempts. Each attempt should end with a boxed "
        "answer. Your last answer will be weighted the most, but you can get "
        "partial credit if an earlier answer is correct. If after multiple "
        "attempts you decide an earlier answer is the correct one, just submit "
        "it again to get full credit."
    )

    global question_solution
    question_solution = question["solution"]
    solution = re.search(pattern, question["solution"])
    assert solution is not None, question["solution"]
    solution = solution.group(1)

    def on_sample(completions: list[EpisodeCompletion]):
        for completion in completions:
            content = completion.last_assistant_message.get("content")
            assert isinstance(content, str)
            solutions = [
                match.group(1) for match in re.finditer(r"\\boxed{([^}]+)}", content)
            ][::-1]
            try:
                reward = 0.9 ** solutions.index(solution)
            except ValueError:
                reward = 0
            completion.commit(reward=reward)

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        on_sample=on_sample,
    )


math_episodes = [
    get_episode(question)
    for question in math_questions[:2048]
    if re.search(pattern, question["solution"]) is not None
]

In [8]:
import asyncio
from dataclasses import dataclass, field
from lib.rl.completion import SplitMethod
from lib.rl.completion_sampler import CompletionSampler, SamplingKwargs
from lib.rl.trainer import ExploreImpl, ExploreOptions
from lib.tokenizer import Tokenizer
from typing import Callable


@dataclass
class DefaultExploreImpl(ExploreImpl):
    explore_options: ExploreOptions

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        def done_callback(task: asyncio.Task[Episode]) -> None:
            try:
                done_episodes.put_nowait(task.result())
            except BaseException as exception:
                done_episodes.put_nowait(exception)

        priority = 1
        while episode := await ready_episodes.get():
            asyncio.create_task(
                self._explore_episode(
                    completion_sampler, tokenizer, episode, update_progress, priority
                )
            ).add_done_callback(done_callback)
            priority += 1

    async def _explore_episode(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        episode: Episode,
        update_progress: Callable[[float], None],
        priority: int,
    ) -> Episode:
        for _ in range(self.explore_options.iterations):
            await episode.sample_completions(
                completion_sampler=completion_sampler,
                tokenizer=tokenizer,
                num_parents=self.explore_options.num_parents,
                branch_factor=self.explore_options.branch_factor,
                get_recovery_pattern=self.explore_options.get_recovery_pattern,
                max_splits_per_completion=self.explore_options.max_split_points
                or self.explore_options.num_parents,
                priority=priority,
                sample_probability_power=self.explore_options.get_sample_probability_power(),
                sampling_kwargs=self.explore_options.sampling_kwargs,
                split_by=self.explore_options.split_method,
                split_separators=self.explore_options.split_separators,
            )
            update_progress(1 / self.explore_options.iterations)
        return episode


@dataclass
class SimpleExploreImpl(ExploreImpl):
    num_samples: int
    sampling_kwargs: SamplingKwargs | None = None

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        while episode := await ready_episodes.get():
            task = asyncio.create_task(
                episode.sample_completions(
                    completion_sampler,
                    tokenizer,
                    num_parents=1,
                    branch_factor=self.num_samples,
                    sampling_kwargs=self.sampling_kwargs,
                )
            )

            def done_callback(_: asyncio.Task[bool], episode=episode) -> None:
                try:
                    done_episodes.put_nowait(episode)
                    update_progress(1)
                except BaseException as e:
                    done_episodes.put_nowait(e)

            task.add_done_callback(done_callback)


@dataclass
class TreeExploreImpl(ExploreImpl):
    branch_factor: int
    depth: int
    sampling_kwargs: SamplingKwargs | None = None
    split_method: SplitMethod = "count"
    split_separators: set[str] = field(default_factory=set)

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        model = await completion_sampler.get_model()

        async def expand(episode: Episode, priority: int) -> None:
            pending: set[asyncio.Task] = {
                asyncio.create_task(
                    episode.sample_completions(
                        completion_sampler,
                        tokenizer,
                        num_parents=1,
                        branch_factor=self.branch_factor,
                        priority=priority,
                        sampling_kwargs=self.sampling_kwargs,
                        split_by=self.split_method,
                        split_separators=self.split_separators,
                    )
                )
            }

            num_leaves = 0
            while pending:
                finished, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED
                )
                for task in finished:
                    try:
                        task.result()
                    except BaseException as e:
                        await done_episodes.put(e)
                        return
                _num_leaves = 0
                for leaf in episode.completion.leaves(model=model):
                    _num_leaves += 1
                    num_partitions = self.depth - leaf.depth() + 1
                    if num_partitions > 1:
                        parents = list(
                            leaf.split(
                                by=self.split_method,
                                at=(
                                    split / num_partitions
                                    for split in range(1, num_partitions)
                                ),
                                separators=self.split_separators,
                                cache=True,
                            )
                        )[:-1]
                        for parent in parents:
                            pending.add(
                                asyncio.create_task(
                                    episode._sample_completions(
                                        parent=parent,
                                        model=model,
                                        completion_sampler=completion_sampler,
                                        tokenizer=tokenizer,
                                        branch_factor=self.branch_factor,
                                        fork_decay=1.0,
                                        recovery_pattern=None,
                                        split_separators=self.split_separators,
                                        sampling_kwargs=self.sampling_kwargs
                                        or SamplingKwargs(),
                                        priority=priority,
                                    )
                                )
                            )
                update_progress(
                    (_num_leaves - num_leaves) / (self.branch_factor**self.depth)
                )
                num_leaves = _num_leaves

            await done_episodes.put(episode)

        priority = 0
        while episode := await ready_episodes.get():
            priority += 1
            asyncio.create_task(expand(episode, priority))


@dataclass
class IterativeVineExploreImpl(ExploreImpl):
    branch_factor: int
    depth: int
    sampling_kwargs: SamplingKwargs | None = None
    split_method: SplitMethod = "count"
    split_separators: set[str] = field(default_factory=set)

    async def __call__(
        self,
        completion_sampler: CompletionSampler,
        tokenizer: Tokenizer,
        ready_episodes: asyncio.Queue[Episode],
        done_episodes: asyncio.Queue[Episode | BaseException],
        update_progress: Callable[[float], None],
    ) -> None:
        model = await completion_sampler.get_model()

        async def iterate_vine(episode: Episode, priority: int) -> None:

            for depth in range(self.depth):
                if depth == 0:
                    parent = episode.completion
                else:
                    parent = max(
                        episode.completion.leaves(model=model),
                        key=lambda leaf: leaf.reward,
                    )
                    parent = list(
                        parent.split(
                            by=self.split_method,
                            at=[1 / (self.depth - depth + 1)],
                            separators=self.split_separators,
                            cache=True,
                        )
                    )[0]
                await episode._sample_completions(
                    parent=parent,
                    model=model,
                    completion_sampler=completion_sampler,
                    tokenizer=tokenizer,
                    branch_factor=self.branch_factor,
                    fork_decay=1.0,
                    recovery_pattern=None,
                    split_separators=self.split_separators,
                    sampling_kwargs=self.sampling_kwargs or SamplingKwargs(),
                    priority=priority,
                )
                update_progress(1 / self.depth)

            await done_episodes.put(episode)

        priority = 0
        while episode := await ready_episodes.get():
            priority += 1
            asyncio.create_task(iterate_vine(episode, priority))

In [9]:
from aioitertools.helpers import maybe_await
import asyncio
import itertools as it
from lib import clue
from lib.rl.episode import Episode
from lib.rl.ppo import PPOLoss
from lib.rl.recipe import ComponentConfig, TuneRecipeConfig
from lib.rl.trainer import Eval, ExploreOptions, Trainer, vLLMConfig
import torch
from torchtune.models.llama3_1 import llama3_1_8b
from typing import AsyncIterable


episodes_per_iteration = 64 * torch.cuda.device_count()


async def train_episodes() -> AsyncIterable[Episode | BaseException]:
    pending: set[asyncio.Task[Episode | BaseException]] = set()
    episodes = (
        maybe_await(episode)
        for episodes in zip(
            # (clue.sample_random_episode() for _ in it.repeat(0)),
            it.cycle(temporal_clue_episodes[64:]),
            # it.cycle(zebra_grid_episodes[64:]),
            # it.cycle(math_episodes[64:]),
        )
        for episode in episodes
    )
    while True:
        pending.update(
            asyncio.create_task(next(episodes))
            for _ in range(episodes_per_iteration - len(pending))  # type: ignore
        )
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            try:
                yield task.result()
            except BaseException as e:
                yield e


async def val_episodes() -> AsyncIterable[Episode | BaseException]:
    for fut in asyncio.as_completed(clue.sample_random_episode() for _ in range(64)):
        try:
            yield await fut
        except BaseException as e:
            yield e


explore_options = ExploreOptions(
    iterations=1,
    num_parents=6,
    branch_factor=3,
    patience=60,
    advantage_max_weight=0.2,
    sample_probability_power=None,
    sampling_kwargs={"max_tokens": 4096, "stop": ["://"]},
    # split_method="prob",
    # split_point_std_deviation=0.5,
)

model_name = "rl59"

trainer = Trainer(
    base_model="NousResearch/Hermes-2-Theta-Llama-3-8B",
    output_dir=f"./models/{model_name}",
    explore_options=explore_options,
    # explore_impl=DefaultExploreImpl(explore_options),
    # explore_impl=SimpleExploreImpl(
    #     num_samples=8, sampling_kwargs={"max_tokens": 4096}
    # ),
    explore_impl=TreeExploreImpl(
        branch_factor=3,
        depth=4,
        sampling_kwargs={"max_tokens": 4096, "stop": ["://"]},
    ),
    force_terminate_vllms=True,
    train_episodes=train_episodes(),
    episodes_per_iteration=episodes_per_iteration,
    max_mask_sequence_batch_size=1,
    evals=[
        # Eval(
        #     name="variable_clue",
        #     episodes=val_episodes(),
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
        Eval(
            name="temporal_clue",
            episodes=temporal_clue_episodes[:64],
            samples_per_episode=3,
            sampling_kwargs={"max_tokens": 4096, "stop": ["://"]},
        ),
        # Eval(
        #     name="zebra_grid",
        #     episodes=zebra_grid_episodes[:64],
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
        # Eval(
        #     name="math",
        #     episodes=math_episodes[:64],
        #     samples_per_episode=3,
        #     sampling_kwargs={"max_tokens": 4096},
        # ),
    ],
    tune_model=llama3_1_8b,
    tune_model_type="LLAMA3",
    tune_recipe_config=TuneRecipeConfig(
        seed=42,
        shuffle=True,
        num_output_chunks=4,
        resume_from_checkpoint=False,
        batch_size=1,
        epochs=1,
        # max_steps_per_epoch=32,
        optimizer=ComponentConfig(
            "torch.optim.AdamW",
            # "bitsandbytes.optim.PagedAdamW8bit",
            # "bitsandbytes.optim.AdamW",
            # params=PLACEHOLDER,
            lr=4e-6,
            fused=True,
        ),
        loss=ComponentConfig(
            PPOLoss,
            policy_coef=0.0,
            clip_epsilon=0.2,
            unclipped_policy_coef=0.0,
            tanh_log_policy_coef=0.8,
            value_coef=0.0,
            entropy_coef=0.0,
            entropy_target=0.6,
            entropy_target_coef=0.05,
            kl_coef=0.05,
            weighted_entropy_coef=0.2,
            weighted_kl_coef=0.0,
            weighted_ce_coef=0.0,
            normalize_values=False,
            normalize_advantages=False,
        ),
        compile=False,
        optimizer_in_bwd=False,
        gradient_accumulation_steps=1,
        enable_activation_checkpointing=True,
        enable_activation_offloading=False,
        custom_sharded_layers=["tok_embeddings", "output"],
        log_every_n_steps=1,
        log_peak_memory_stats=True,
    ),
    # tune_run=False,
    tune_sequence_length=16384,
    vllm_config=vLLMConfig(
        env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
        kwargs=dict(
            block_size=32,
            disable_log_requests=True,
            enable_chunked_prefill=True,
            enable_prefix_caching=True,
            enforce_eager=True,
            gpu_memory_utilization=0.9,
            max_model_len=16384,
            max_num_seqs=512,
            max_num_batched_tokens=16384,
            preemption_mode="swap",
            return_tokens_as_token_ids=True,
            swap_space=100,
        ),
        max_concurrent_samples=512,
        min_time_between_requests=0.0,
        timeout=120 + 15 * torch.cuda.device_count(),
    ),
    wandb_kwargs=dict(
        name=model_name,
        id=model_name,
    ),
)

INFO 12-31 15:49:11 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='NousResearch/Hermes-2-Theta-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Theta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-Theta-Llama-3-8B, num_scheduler_steps=1, chunked_prefill_enabled=Fal

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
await trainer.train(iterations=15, verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8019 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)


In [11]:
completion_sampler = await trainer.get_completion_sampler(verbosity=1)

Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl44/0001 --port=8007 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


In [12]:
model = await completion_sampler.get_model()
model

'/home/ubuntu/atreides/experiments/models/rl44/0001'

In [13]:
from lib.langfuse import langfuse

chat_completion = await completion_sampler.samplers[0].client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Hello, world!"}],
)
chat_completion

ChatCompletion(id='chat-e5126fe18c184324ae17be846d07ffe0', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello, world! It's great to meet you. How can I assist you today?", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[]), stop_reason=None)], created=1735422282, model='/home/ubuntu/atreides/experiments/models/rl44/0001', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=19, prompt_tokens=13, total_tokens=32, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)

In [14]:
import os
os.environ["LANGFUSE_PUBLIC_KEY"]

'pk-lf-3598930a-f80e-4074-a6d4-24579d0d2605'

In [15]:
langfuse.auth_check()

True

In [12]:
result = await trainer.explore(verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8005 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


explore:   0%|          | 0/64 [00:00<?, ?episode/s]

In [13]:
await trainer.tune(result, verbosity=2)

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl44/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725
  checkpoint_files:
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00001-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00003-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b

Saved iteration 1 model files to /home/ubuntu/atreides/experiments/models/rl44/0001


In [29]:
trainer.eval_episodes["temporal_clue"] = temporal_clue_episodes[:64]

In [8]:
await trainer.eval("temporal_clue", verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

(0.2249441964285714, [])

In [13]:
list(trainer.eval_episodes["temporal_clue"][35].completion.leaves())[0].all_message_params()

[{'role': 'user',
  'content': 'On a dark winter night, wealthy and enigmatic Mr. John Q. Boddy hosted a small, but lavish, dinner party for some of his closest associates. However, the night ended in tragedy when Mr. Boddy was found dead in one of the rooms of Tudor Mansion in the early hours of the morning. The following persons of interest have been identified as suspects:\n\n• Miss Peach\n• Colonel Mustard\n\nAnd the following weapons were found on the premises:\n\n• Horseshoe\n• Knife\n\nThe murder could only have occured in one of the following rooms:\n\n1. Courtyard\n2. Hall\n\nThe rooms are laid out as follows:\n\n  N N  \nW 1|2 E\n  S S  \n\nEach suspect uniquely had one of the following possible motives for killing Mr. Boddy:\n\n• Ambition\n• Betrayal\n\nFor the murder to occur, the murderer and Mr. Boddy must have been alone in a room with at least one weapon.\n\nThe available clues are as follows:\n\n- The murderer was in the Hall or Colonel Mustard was in the Courtyard\n- 

In [7]:
await trainer.train(iterations=1, verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

In [18]:
list(trainer.eval_episodes["temporal_clue"][1].completion.leaves(model=trainer.model))[0].all_message_params()

[{'role': 'user',
  'content': 'On a dark winter night, wealthy and enigmatic Mr. John Q. Boddy hosted a small, but lavish, dinner party for some of his closest associates. However, the night ended in tragedy when Mr. Boddy was found dead in one of the rooms of Tudor Mansion in the early hours of the morning. The following persons of interest have been identified as suspects:\n\n• Professor Plum\n• Mrs. Peacock\n• Mrs. White\n• Monsieur Brunette\n• Miss Peach\n• Madame Rose\n• Colonel Mustard\n• Mr. Green\n• Sgt. Gray\n• Miss Scarlet\n\nAnd the following weapons were found on the premises:\n\n• Poison\n• Knife\n• Wrench\n• Candlestick\n• Lead Pipe\n• Horseshoe\n• Rope\n\nThe murder could only have occured in one of the following rooms:\n\n01. Lounge\n02. Courtyard\n03. Drawing Room\n04. Dining Room\n05. Trophy Room\n06. Studio\n07. Fountain\n08. Hall\n09. Study\n10. Cloak Room\n11. Library\n12. Billiard Room\n\nThe rooms are laid out as follows:\n\n  NN NN NN  \nW 01|02|03 E\nW 04|05|0

In [14]:
await trainer.eval("temporal_clue", verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8001 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

(0.2222284226190476, [])

In [18]:
list(trainer.eval_episodes["temporal_clue"][3].completion.leaves())[0].all_message_params()

[{'role': 'user',
  'content': 'On a dark winter night, wealthy and enigmatic Mr. John Q. Boddy hosted a small, but lavish, dinner party for some of his closest associates. However, the night ended in tragedy when Mr. Boddy was found dead in one of the rooms of Tudor Mansion in the early hours of the morning. The following persons of interest have been identified as suspects:\n\n• Colonel Mustard\n• Mrs. Peacock\n\nAnd the following weapons were found on the premises:\n\n• Poison\n• Revolver\n\nThe murder could only have occured in one of the following rooms:\n\n1. Fountain\n2. Dining Room\n\nThe rooms are laid out as follows:\n\n  N  \nW 1 E\nW 2 E\n  S  \n\nThe exact time of the murder is a bit uncertain, but it has been narrowed down to one of the following times:\n\n• 11:45 PM\n• 12:00 AM\n\nAt every time the suspects and Mr. Boddy either stayed in their current room or moved to an orthogonally adjacent room (north, south, east, or west). Weapons could be moved by suspects between 

In [10]:
await trainer.train(iterations=5, verbosity=1)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8000 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


variable_clue: 0episode [00:00, ?episode/s]

temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

zebra_grid:   0%|          | 0/64 [00:00<?, ?episode/s]

math:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Early stopping zebra_grid evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
Early stopping math evaluation due to expired patience (1 remaining episodes x 60.0 patience per episode = 60.0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl43/config.yaml


1|16|Loss: -0.0048:  43%|████▎     | 16/37 [05:09<06:37, 18.92s/it, entropy=0.4987, entropy_target=0.1013, kl_div=0.1546, policy=0.0417, reinforce=-0.0308, tanh_log_policy=-0.0226, unclipped_policy=-0.2213, value=0.0000, weighted_ce=-0.0308, weighted_entropy=-0.0023, weighted_kl_div=0.0140]

In [30]:
await asyncio.gather(
    *(
        trainer.eval(eval_name, pbar_position=i, verbosity=1)
        for i, eval_name in enumerate(trainer.evals)
    )
)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8002 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


variable_clue: 0episode [00:00, ?episode/s]

temporal_clue:   0%|          | 0/64 [00:00<?, ?episode/s]

zebra_grid:   0%|          | 0/64 [00:00<?, ?episode/s]

math:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping temporal_clue evaluation due to expired patience (0 remaining episodes x 30.0 patience per episode = 0.0 seconds)
Early stopping zebra_grid evaluation due to expired patience (0 remaining episodes x 30.0 patience per episode = 0.0 seconds)
Early stopping math evaluation due to expired patience (1 remaining episodes x 30.0 patience per episode = 30.0 seconds)


[(0.22010241152992485, []),
 (0.23782242063492062, []),
 (0.45594246031746033, []),
 (0.11433862433862434, [])]

In [7]:
list(trainer.eval_episodes["zebra_grid"][63].completion.leaves())[0].all_message_params()

[{'role': 'user',
  'content': 'There are 3 houses, numbered 1 to 3 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Peter`, `Eric`, `Arnold`\n - People have unique favorite sports: `soccer`, `basketball`, `tennis`\n - Each person lives in a unique style of house: `victorian`, `colonial`, `ranch`\n - Each person has a unique type of pet: `fish`, `dog`, `cat`\n - Each person has a unique birthday month: `sept`, `april`, `jan`\n - The people are of nationalities: `swede`, `dane`, `brit`\n\n## Clues:\n1. The person who loves soccer is the person residing in a Victorian house.\n2. The person who owns a dog is the person whose birthday is in April.\n3. The person who owns a dog is the person residing in a Victorian house.\n4. The person living in a colonial-style house is Arnold.\n5. The person living in a colonial-style house is t

In [20]:
list(list(trainer.eval_episodes["val"])[45].completion.leaves())[2].all_message_params()

[{'role': 'user',
  'content': "On a warm autumn day Gregory, Sean, Delaney, Lydia, Shawn, and Mario sat down to play a competitive deduction game.\n\nThey assembled 3 stacks of cards, each for a separate type of information composed of the following:\n\nSuspect:\n- Mr. Green\n- Professor Plum\n- Miss Scarlet\n- Mrs. Peacock\n- Monsieur Brunette\n- Colonel Mustard\n- Miss Peach\n\nWeapon:\n- Candlestick\n- Poison\n- Knife\n- Lead Pipe\n- Horseshoe\n\nRoom:\n- Hall\n- Gazebo\n- Study\n- Ballroom\n- Kitchen\n- Courtyard\n- Studio\n- Trophy Room\n- Carriage House\n- Drawing Room\n- Conservatory\n- Dining Room\n\nAfter randomly (and blindly) choosing one card from each stack and placing them in the center of the table facedown, they shuffled the remaining cards and dealt out the following to each player:\n\n- Gregory: 3 cards\n- Sean: 3 cards\n- Delaney: 3 cards (Gazebo, Mrs. Peacock, and Studio)\n- Lydia: 4 cards\n- Shawn: 4 cards\n- Mario: 4 cards\n\nThe game proceeded as follows:\n\n1. 

In [10]:
await trainer.train(iterations=5, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


1|19|Loss: 0.0137: 100%|██████████| 19/19 [05:51<00:00, 17.82s/it, entropy=0.4768, entropy_target=0.1232, kl_div=0.1285, policy=-0.0627, tanh_log_policy=0.0041, unclipped_policy=-0.1151, value=2.8588, weighted_ce=-0.1450, weighted_entropy=0.0109, weighted_kl_div=0.1807] 

Saved iteration 39 model files to /home/ubuntu/atreides/experiments/models/rl35/0039
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl35/0039 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


1|20|Loss: -0.1178: 100%|██████████| 20/20 [06:05<00:00, 17.69s/it, entropy=0.9599, entropy_target=0.3599, kl_div=0.2557, policy=-0.6068, tanh_log_policy=-0.0675, unclipped_policy=-1.4084, value=6.7460, weighted_ce=-0.5565, weighted_entropy=0.4731, weighted_kl_div=0.0815]

Saved iteration 40 model files to /home/ubuntu/atreides/experiments/models/rl35/0040
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl35/0040 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


1|14|Loss: 0.0581: 100%|██████████| 14/14 [04:23<00:00, 17.65s/it, entropy=0.6196, entropy_target=0.0196, kl_div=0.1941, policy=-0.1284, tanh_log_policy=0.0604, unclipped_policy=-0.1790, value=2.4163, weighted_ce=0.0416, weighted_entropy=0.0047, weighted_kl_div=-0.1024]  

Saved iteration 41 model files to /home/ubuntu/atreides/experiments/models/rl35/0041
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl35/0041 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


1|22|Loss: 0.0034: 100%|██████████| 22/22 [06:46<00:00, 17.93s/it, entropy=0.5840, entropy_target=0.0160, kl_div=0.1036, policy=0.0085, tanh_log_policy=-0.0008, unclipped_policy=0.0071, value=1.5228, weighted_ce=-0.0038, weighted_entropy=0.0098, weighted_kl_div=0.0029]     

Saved iteration 42 model files to /home/ubuntu/atreides/experiments/models/rl35/0042
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl35/0042 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


1|16|Loss: 0.0051: 100%|██████████| 16/16 [04:57<00:00, 17.51s/it, entropy=0.4877, entropy_target=0.1123, kl_div=0.0803, policy=0.0129, tanh_log_policy=0.0012, unclipped_policy=0.0093, value=3.4596, weighted_ce=-0.0097, weighted_entropy=0.0276, weighted_kl_div=0.0043]  

Saved iteration 43 model files to /home/ubuntu/atreides/experiments/models/rl35/0043
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl35/0043 --port=8009 --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=100 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)


In [5]:
await trainer.tune(trainer.explore_results[-1])

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl35/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/atreides/experiments/models/rl35/0002
  checkpoint_files:
  - /home/ubuntu/atreides/experiments/models/rl35/0002/hf_model_0003_0.pt
  - /home/ubuntu/atreides/experiments/models/rl35/0002/hf_model_0004_0.pt
  - /home/ubuntu/atreides/experiments/models/rl35/0002/hf_model_0001_0.pt
  - /home/ubuntu/atreides/experiments/models/rl35/0002/hf_model_0002_0.pt
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl35
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: lib.rl.pack.PackedDataset
  dir: /home/ubuntu/atr

Saved iteration 3 model files to /home/ubuntu/atreides/experiments/models/rl35/0003


In [8]:
list(trainer.explore_results[0].episodes[0].completion.leaves())[1].all_message_params()

[{'role': 'user',
  'content': "On a cool autumn afternoon Leonardo, Sophie, and Isabelle sat down to play a casual deduction game.\n\nThey assembled 3 groups of cards, each for a separate type of information composed of the following:\n\nSuspect:\n- Colonel Mustard\n- Professor Plum\n- Miss Peach\n- Sgt. Gray\n- Mr. Green\n- Mrs. Peacock\n- Miss Scarlet\n- Madame Rose\n- Monsieur Brunette\n- Mrs. White\n\nWeapon:\n- Knife\n- Lead Pipe\n- Revolver\n- Rope\n- Poison\n- Horseshoe\n\nRoom:\n- Fountain\n- Hall\n- Drawing Room\n- Trophy Room\n- Cloak Room\n- Lounge\n- Library\n- Conservatory\n- Billiard Room\n- Dining Room\n- Carriage House\n\nAfter randomly (and blindly) choosing one card from each stack and placing them in the middle of the table facedown, they shuffled the remaining cards and dealt out the following to each player:\n\n- Leonardo: 8 cards\n- Sophie: 8 cards\n- Isabelle: 8 cards (Miss Peach, Mrs. White, Carriage House, Fountain, Colonel Mustard, Drawing Room, Monsieur Brun

In [7]:
trainer._completion_sampler.samplers[0].semaphore.max_concurrent_actions // 16384

19

In [None]:
await trainer.get_completion_sampler(verbosity=1)

In [6]:
await trainer.stop_vllms()

In [5]:
await trainer.explore()

In [6]:
await trainer.tune(trainer.explore_results[-1])

$ tune run --nnodes=1 --nproc-per-node=8 lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl34/config.yaml
Running with torchrun...


W1214 21:55:46.849000 130384474703680 torch/distributed/run.py:779] 
W1214 21:55:46.849000 130384474703680 torch/distributed/run.py:779] *****************************************
W1214 21:55:46.849000 130384474703680 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1214 21:55:46.849000 130384474703680 torch/distributed/run.py:779] *****************************************
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725
  checkpoint_files:
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Herm

Saved iteration 1 model files to /home/ubuntu/atreides/experiments/models/rl34/0001


In [5]:
import nest_asyncio
nest_asyncio.apply()

In [6]:
trainer.episodes_per_iteration = 128

In [8]:
_ = await trainer.explore(verbosity=1)

explore:   0%|          | 0/128 [00:00<?, ?episode/s]

In [12]:
trainer.explore_results[-1].exceptions

[UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a value"),
 UnboundLocalError("cannot access local variable 'prompt' where it is not associated with a

In [None]:
await trainer.explore(verbosity=1)

In [10]:
trainer._completion_sampler.samplers[0].semaphore.max_concurrent_actions *= 2

In [12]:
trainer.max_mask_sequence_batch_size = 8

In [13]:
%%prun
asyncio.run(trainer.explore(verbosity=0))

 

         116315230 function calls (112477935 primitive calls) in 195.675 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   1312/3   29.919    0.023    0.000    0.000 {method 'acquire' of '_thread.lock' objects}
 2699/193   21.066    0.008   24.471    0.127 {method 'recv' of '_socket.socket' objects}
  3662/32   20.467    0.006    0.337    0.011 {method 'poll' of 'select.epoll' objects}
  298/289   13.027    0.044   51.754    0.179 threading.py:323(wait)
3663/3631    7.332    0.002   20.167    0.006 selectors.py:451(select)
24835537/24395659    4.434    0.000    5.651    0.000 {built-in method builtins.isinstance}
  4087847    3.793    0.000    5.793    0.000 utils.py:54(get_token)
        9    3.539    0.393    4.355    0.484 pack.py:307(get_mask)
      194    3.286    0.017   30.018    0.155 sock_client.py:242(_read_packet_bytes)
        9    3.215    0.357    3.215    0.357 explore_result.py:181(_write_mask)
    21351   

In [9]:
%%prun
asyncio.run(trainer.explore(verbosity=0))

 

         116214773 function calls (112534129 primitive calls) in 191.121 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 2599/193   21.598    0.008   30.502    0.158 {method 'recv' of '_socket.socket' objects}
 3062/187   18.748    0.006    0.927    0.005 {method 'poll' of 'select.epoll' objects}
       64   15.820    0.247   19.118    0.299 pack.py:307(get_mask)
       64   14.367    0.224   24.403    0.381 explore_result.py:181(_write_mask)
   1301/3   11.786    0.009    0.000    0.000 {method 'acquire' of '_thread.lock' objects}
  296/290    6.913    0.023   25.460    0.088 threading.py:323(wait)
3063/3033    4.451    0.001   18.395    0.006 selectors.py:451(select)
23950591/23526667    4.271    0.000    5.438    0.000 {built-in method builtins.isinstance}
      193    4.220    0.022   37.024    0.192 sock_client.py:242(_read_packet_bytes)
       65    4.177    0.064    4.177    0.064 {method 'any' of 'torch._C.TensorBa

In [12]:
trainer.explore_results[-1].exceptions

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [4]:
await trainer.train(iterations=1, verbosity=2)

Starting 1 vLLM servers...
$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=2048 --max-num-batched-tokens=16384 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-14 20:11:02 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-14 20:11:02 api_server.py:529] args: Namespace(subparser='serve', model_tag='NousResearch/Hermes-2-Theta-Llama-3-8B', config='', host=None, port=8001, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=True, disable_frontend_multiprocessing=False, enab

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  4.05it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.56it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.28it/s]



INFO 12-14 20:11:19 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-14 20:11:20 gpu_executor.py:122] # GPU blocks: 11578, # CPU blocks: 8192
INFO 12-14 20:11:20 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 22.61x
INFO 12-14 20:11:37 api_server.py:232] vLLM to use /tmp/tmp3n2vbgld as PROMETHEUS_MULTIPROC_DIR
INFO 12-14 20:11:37 launcher.py:19] Available routes are:
INFO 12-14 20:11:37 launcher.py:27] Route: /openapi.json, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /docs, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /redoc, Methods: HEAD, GET
INFO 12-14 20:11:37 launcher.py:27] Route: /health, Methods: GET
INFO 12-14 20:11:37 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-14 20:11:37 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-14 20:11:37 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-14 20:1

INFO:     Started server process [33152]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8001) (Press CTRL+C to quit)


INFO:     127.0.0.1:47054 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val: 0episode [00:00, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl33/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725
  checkpoint_files:
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00004-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725/model-00001-of-00004.safetensors
  - /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b

CancelledError: 

In [6]:
trainer.explore_results[0].exceptions

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [5]:
trainer.eval_exceptions["val"]

[httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout(''),
 httpx.ReadTimeout('')]

In [27]:
await trainer.train(iterations=4, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|185|Loss: 0.0443: 100%|██████████| 185/185 [51:20<00:00, 16.53s/it, entropy=0.6468, entropy_target=0.1032, kl_div=0.0934, policy=-0.0291, tanh_log_policy=0.0026, unclipped_policy=-0.0323, value=1.9048, weighted_ce=0.0188, weighted_entropy=-0.0312, weighted_kl_div=-0.0066]  

Saved iteration 10 model files to /home/ubuntu/atreides/experiments/models/rl29/0010
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0010 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|101|Loss: 0.0376: 100%|██████████| 101/101 [28:08<00:00, 16.61s/it, entropy=0.6686, entropy_target=0.0814, kl_div=0.0844, policy=-0.0282, tanh_log_policy=0.0022, unclipped_policy=-0.0298, value=1.0489, weighted_ce=0.0084, weighted_entropy=-0.0236, weighted_kl_div=-0.0026]

Saved iteration 11 model files to /home/ubuntu/atreides/experiments/models/rl29/0011
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0011 --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|127|Loss: 0.0216: 100%|██████████| 128/128 [35:40<00:00, 16.64s/it, entropy=0.7464, entropy_target=0.0036, kl_div=0.0928, policy=-0.0009, tanh_log_policy=-0.0017, unclipped_policy=-0.0051, value=5.0248, weighted_ce=-0.0075, weighted_entropy=0.0059, weighted_kl_div=0.0063] 

Saved iteration 12 model files to /home/ubuntu/atreides/experiments/models/rl29/0012
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0012 --port=8001 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|143|Loss: 0.0361: 100%|██████████| 143/143 [39:47<00:00, 16.62s/it, entropy=0.6294, entropy_target=0.1206, kl_div=0.0729, policy=0.0015, tanh_log_policy=-0.0001, unclipped_policy=0.0004, value=1.1154, weighted_ce=-0.0008, weighted_entropy=0.0014, weighted_kl_div=0.0005]   

Saved iteration 13 model files to /home/ubuntu/atreides/experiments/models/rl29/0013
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl29/0013 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=32 --api-key=default
vLLM servers started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)


In [28]:
await trainer.train(iterations=12, verbosity=1)

val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (0 remaining episodes x 15 patience per episode = 0 seconds)
Early stopping exploration due to expired patience (0 remaining episodes x 5 patience per episode = 0 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl29/config.yaml


1|100|Loss: 0.0312:  45%|████▌     | 100/222 [27:53<33:44, 16.59s/it, entropy=0.8347, entropy_target=0.0847, kl_div=0.0736, policy=-0.0045, tanh_log_policy=-0.0004, unclipped_policy=-0.0056, value=2.1442, weighted_ce=0.0013, weighted_entropy=-0.0045, weighted_kl_div=-0.0014]