In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
# from openai import AsyncOpenAI, Timeout

# reference_client = AsyncOpenAI(
#     api_key="default",
#     base_url="http://209.20.157.218:8000/v1",
#     timeout=Timeout(600, connect=60),
# )

In [4]:
# await reference_client.chat.completions.create(
#     messages=[{"role": "user", "content": "Hello!"}],
#     model="NousResearch/Hermes-2-Theta-Llama-3-8B",
# )

In [3]:
import asyncio
import json
from lib.clue import Clue, DeductiveSolver
from lib.rl.episode import Episode, EpisodeCompletion
from lib.rl.ppo import PPOLoss
from lib.rl.recipe import ComponentConfig, TuneRecipeConfig
from lib.rl.trainer import ExploreOptions, Trainer, vLLMConfig
from lib.utils import return_exception
import random
import re
import torch
from torchtune.models.llama3_1 import llama3_1_8b
from typing import Literal, Optional

# Worked chain-of-thought examples that can be prepended to an episode as a
# few-shot demonstration. Each entry is read below via the keys "prompt",
# "chain_of_thought" and "answer".
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not guaranteed to be UTF-8.
with open("./data/chain-of-thought-examples.json", encoding="utf-8") as f:
    chain_of_thought_examples: list[dict[str, str]] = json.load(f)


def get_variable_difficulty_game(
    return_first_solver_as_winner: Optional[bool] = None,
) -> Clue:
    """Create a Clue game of randomized size and play it with a randomized solver.

    The number of players (3-6) is drawn first; the number of weapons,
    suspects and rooms are then drawn relative to it and capped by the lists
    available on ``Clue``. With 10% probability each, a "motive" and/or a
    "time" element is added on top of the three standard elements.

    A difficulty level of ``num_players + randint(-2, 3)`` (so 1..9) decides
    which of the deductive solver's checks are switched on: each threshold
    (> 1, > 2, > 3, > 4) enables one more check. Exhaustive assignment
    testing is always off.

    Args:
        return_first_solver_as_winner: Forwarded to ``Clue.play``. When
            ``None`` a fair coin flip decides the value.

    Returns:
        Whatever ``game.play(...)`` returns.
        NOTE(review): ``get_easy_game`` returns the game object itself after
        calling ``play``; presumably ``play`` returns the game - confirm.
    """
    # Board size: every count is derived from the one drawn before it.
    player_count = random.randint(3, 6)
    weapon_count = max(
        3, min(player_count + random.randint(-1, 5), len(Clue.weapons))
    )
    suspect_count = min(
        weapon_count + random.randint(0, weapon_count - 1), len(Clue.suspects)
    )
    room_count = min(
        suspect_count + random.randint(0, suspect_count - 2), len(Clue.rooms)
    )

    elements = {
        "suspect": random.sample(Clue.suspects, k=suspect_count),
        "weapon": random.sample(Clue.weapons, k=weapon_count),
        "room": random.sample(Clue.rooms, k=room_count),
    }

    # Optional fourth element: motives (at least three of them).
    if random.random() < 0.1:
        motive_count = max(
            3, min(weapon_count + random.randint(-1, 3), len(Clue.motives))
        )
        elements["motive"] = random.sample(Clue.motives, k=motive_count)

    # Optional time element: a window that starts one step before midnight and
    # ends at midnight, then is widened one step at a time on a random side.
    if random.random() < 0.1:
        step_hours = random.choice([0.25, 0.5, 1.0])
        window_start = 24.0 - step_hours
        window_end = 0.0
        widenings = random.randint(1, weapon_count + 1)
        for _ in range(widenings):
            if not random.randint(0, 1):
                window_start -= step_hours
            else:
                window_end += step_hours

        def as_clock(hours: float) -> str:
            """Render fractional hours as a zero-padded ``HH:MM`` string."""
            whole_hours = int(hours)
            minutes = int(60 * (hours - whole_hours))
            return f"{whole_hours:02d}:{minutes:02d}"

        elements["time"] = Clue.get_times(
            as_clock(window_start),
            as_clock(window_end),
            f"{int(step_hours * 60)}min",
        )

    game = Clue(num_players=player_count, elements=elements)

    # Higher levels unlock progressively more of the solver's checks; options
    # not passed here keep their DeductiveSolver defaults.
    difficulty_level = player_count + random.randint(-2, 3)
    solver = DeductiveSolver(
        check_solution_has_one_and_only_one_card_per_element=difficulty_level > 1,
        check_one_of_constraints=difficulty_level > 2,
        check_inverse_one_of_constraints=difficulty_level > 3,
        merge_and_check_disjoint_inverse_one_of_constraints=difficulty_level > 4,
        exhaustively_test_possible_assignments=False,
    )

    # Drawn after the solver is built so the sequence of random draws is kept.
    if return_first_solver_as_winner is None:
        first_solver_wins = bool(random.randint(0, 1))
    else:
        first_solver_wins = return_first_solver_as_winner

    return game.play(
        deductive_solver=solver,
        cp_solver_max_solve_time_per_turn=0.01,
        check_cp_solver_grid=False,
        check_if_deductive_solver_and_cp_solver_grids_match=False,
        return_first_solver_as_winner=first_solver_wins,
        print_playthrough=False,
        max_turns=100,
    )


def get_easy_game(return_first_solver_as_winner: Optional[bool] = None) -> Clue:
    """Create and play the smallest game: 3 players, 3 suspects, 3 weapons, 3 rooms.

    The deductive solver is constructed with all five of its explicitly
    listed checks turned off; options not passed keep their
    ``DeductiveSolver`` defaults.

    Args:
        return_first_solver_as_winner: Forwarded to ``Clue.play``; ``None``
            is passed on as ``False``.

    Returns:
        The ``Clue`` game object after ``play`` has been called on it.
    """
    elements = {
        "suspect": random.sample(Clue.suspects, k=3),
        "weapon": random.sample(Clue.weapons, k=3),
        "room": random.sample(Clue.rooms, k=3),
    }
    game = Clue(num_players=3, elements=elements)

    # Every solver check that the variable-difficulty game may enable is off.
    disabled_checks = (
        "check_solution_has_one_and_only_one_card_per_element",
        "check_one_of_constraints",
        "check_inverse_one_of_constraints",
        "merge_and_check_disjoint_inverse_one_of_constraints",
        "exhaustively_test_possible_assignments",
    )
    basic_solver = DeductiveSolver(**{flag: False for flag in disabled_checks})

    game.play(
        deductive_solver=basic_solver,
        cp_solver_max_solve_time_per_turn=0.01,
        check_cp_solver_grid=False,
        check_if_deductive_solver_and_cp_solver_grids_match=False,
        return_first_solver_as_winner=return_first_solver_as_winner or False,
        print_playthrough=False,
        max_turns=100,
    )
    return game


def _score_answer(answer: str, solution: dict[str, str]) -> float:
    """Return the fraction of solution elements that ``answer`` states correctly.

    For every ``key``/``value`` pair the first occurrence of ``"<key>: "`` in
    ``answer`` is located (case-insensitively) and the run of allowed
    characters after it is compared, stripped and lower-cased, with the
    expected value.

    Args:
        answer: Text of the assistant's final answer.
        solution: Mapping of element name to the expected value.

    Returns:
        Number of correctly stated elements divided by ``len(solution)``.
    """
    correct = 0
    for key, value in solution.items():
        # The capture class used to contain no digits, so a value holding a
        # digit (e.g. a clock time) could never equal the captured text and
        # that element always scored zero. Digits are admitted only for such
        # values, which leaves scoring of every other value unchanged.
        allowed = r"A-Za-z \.:-"
        if re.search(r"[0-9]", value):
            allowed = r"A-Za-z0-9 \.:-"
        # Escape the key so it is always matched literally.
        match = re.search(
            rf"{re.escape(key)}: ([{allowed}]+)",
            answer,
            re.IGNORECASE,
        )
        if match and match.group(1).strip().lower() == value.strip().lower():
            correct += 1
    return correct / len(solution)


@return_exception
def sample_random_episode(
    difficulty: Literal["easy", "variable"] = "variable",
    example_probability: float = 0.0,
    max_prompt_characters: int = 8192,
    reward_follow_up_completion: bool = True,
    return_first_solver_as_winner: Optional[bool] = None,
) -> Episode:
    """Generate one Clue episode whose completions are rewarded by answer accuracy.

    Games are generated until one yields a prompt of at most
    ``max_prompt_characters`` characters; attempts that raise ``ValueError``
    are discarded and retried.

    Args:
        difficulty: ``"easy"`` uses ``get_easy_game``; anything else uses
            ``get_variable_difficulty_game``.
        example_probability: Probability of attaching one randomly chosen
            chain-of-thought example as a user/assistant message pair.
        max_prompt_characters: Upper bound on the prompt length in characters.
        reward_follow_up_completion: If true the reward is stored on (and the
            follow-up completion is committed instead of) the original one.
        return_first_solver_as_winner: Forwarded to the game generator.

    Returns:
        An ``Episode`` with the game prompt as its single user message.
        NOTE(review): the ``return_exception`` decorator presumably turns a
        raised exception into the return value - confirm in ``lib.utils``.
    """
    make_game = get_easy_game if difficulty == "easy" else get_variable_difficulty_game
    while True:
        try:
            game = make_game(
                return_first_solver_as_winner=return_first_solver_as_winner
            )
            prompt, follow_up, solution = game.get_prompt_and_follow_up_and_solution()
        except ValueError:
            # Deliberate best effort: throw the failed game away and retry.
            continue
        if len(prompt) <= max_prompt_characters:
            break

    async def reward_completion(completion: EpisodeCompletion) -> EpisodeCompletion:
        """Score one completion and return the completion that carries the reward."""
        if len(completion.messages) == 2:
            # Only the prompt and one reply so far: ask the follow-up question
            # to obtain the final answer.
            follow_up_completion = await completion.follow_up(
                messages=[
                    {"role": "user", "content": follow_up},
                ]
            )
        else:
            follow_up_completion = completion
        answer = follow_up_completion.last_assistant_message.get("content")
        assert isinstance(answer, str)
        rewarded = follow_up_completion if reward_follow_up_completion else completion
        rewarded.reward = _score_answer(answer, solution)
        return rewarded

    async def on_sample(completions: list[EpisodeCompletion]) -> None:
        """Score all sampled completions concurrently, then commit each one."""
        for completion in await asyncio.gather(
            *[reward_completion(completion) for completion in completions]
        ):
            completion.commit()

    # The probability draw happens before the choice, and only a hit picks one.
    example = (
        random.choice(chain_of_thought_examples)
        if random.random() < example_probability
        else None
    )

    return Episode(
        messages=[{"role": "user", "content": prompt}],
        examples=(
            [
                {"role": "user", "content": example["prompt"]},
                {
                    "role": "assistant",
                    # An empty answer adds nothing; otherwise it follows the
                    # reasoning after a horizontal-rule separator.
                    "content": example["chain_of_thought"]
                    + (example["answer"] and f"\n\n---\n\n{example['answer']}"),
                },
            ]
            if example
            else []
        ),
        on_sample=on_sample,
    )


def train_episodes():
    """Yield an endless stream of episodes, each sampled lazily with default settings."""
    while True:
        episode = sample_random_episode()
        yield episode


model_name = "rl17"

# Sizes shared by several settings below.
num_gpus = torch.cuda.device_count()
episodes_per_batch = 64 * num_gpus
sequence_length = 16384

explore_options = ExploreOptions(
    iterations=4,
    num_parents=6,
    branch_factor=3,
    patience=5,
    sample_probability_power=None,
    sampling_kwargs={"max_tokens": 1024},
)

# Alternatives tried for the optimizer component:
#   "bitsandbytes.optim.PagedAdamW8bit", "bitsandbytes.optim.AdamW"
#   (optionally with params=PLACEHOLDER)
optimizer_config = ComponentConfig(
    "torch.optim.AdamW",
    lr=4e-6,
    fused=True,
)

loss_config = ComponentConfig(
    PPOLoss,
    policy_coef=0.0,
    clip_epsilon=0.2,
    unclipped_policy_coef=0.0,
    tanh_log_policy_coef=0.8,
    value_coef=0.0,
    entropy_coef=0.0,
    entropy_target=0.75,
    entropy_target_coef=0.1,
    kl_coef=0.1,
    weighted_entropy_coef=0.2,
    weighted_kl_coef=0.0,
    weighted_ce_coef=0.0,
    normalize_values=False,
    normalize_advantages=False,
)

recipe_config = TuneRecipeConfig(
    seed=42,
    shuffle=False,
    num_output_chunks=4,
    resume_from_checkpoint=False,
    batch_size=1,
    epochs=1,
    optimizer=optimizer_config,
    loss=loss_config,
    compile=False,
    optimizer_in_bwd=False,
    gradient_accumulation_steps=1,
    enable_activation_checkpointing=True,
    enable_activation_offloading=False,
    custom_sharded_layers=["tok_embeddings", "output"],
    log_every_n_steps=1,
    log_peak_memory_stats=True,
)

vllm_config = vLLMConfig(
    # Allows max_model_len below to exceed the model's configured length.
    env={"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1"},
    kwargs={
        "block_size": 32,
        "disable_log_requests": True,
        "enable_prefix_caching": True,
        "enforce_eager": True,
        "gpu_memory_utilization": 0.9,
        "max_model_len": sequence_length,
        "max_num_seqs": 512,
        "max_num_batched_tokens": sequence_length * 4,
        "return_tokens_as_token_ids": True,
        "swap_space": 32,
    },
    max_concurrent_samples=512,
    # min_time_between_requests=15 / (64 * 24),
    timeout=120 + 15 * num_gpus,
)

trainer = Trainer(
    base_model="NousResearch/Hermes-2-Theta-Llama-3-8B",
    output_dir=f"./models/{model_name}",
    explore_options=explore_options,
    train_episodes=train_episodes(),
    episodes_per_iteration=episodes_per_batch,
    max_mask_sequence_batch_size=1,
    # A finite, lazily evaluated validation set of the same size as a batch.
    val_episodes=(sample_random_episode() for _ in range(episodes_per_batch)),
    val_patience=15,
    val_samples_per_episode=3,
    val_sampling_kwargs={"max_tokens": 1024},
    # NOTE(review): the builder is torchtune's Llama 3.1 8B while the base
    # checkpoint is a Llama 3 model - confirm the architectures (e.g. RoPE
    # settings) agree between training and vLLM sampling.
    tune_model=llama3_1_8b,
    tune_model_type="LLAMA3",
    tune_recipe_config=recipe_config,
    # tune_run=False,
    tune_sequence_length=sequence_length,
    vllm_config=vllm_config,
    # The run name doubles as the wandb run id.
    wandb_kwargs={"name": model_name, "id": model_name},
)

Resuming from /home/ubuntu/atreides/experiments/models/rl17/0007
INFO 12-11 19:52:33 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='NousResearch/Hermes-2-Theta-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Theta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-The

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Run three training iterations with the trainer configured in the cell above.
# Top-level `await` is valid here because this is a notebook cell.
await trainer.train(iterations=3)

Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl17/0007 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-11 19:52:41 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-11 19:52:41 api_server.py:529] args: Namespace(subparser='serve', model_tag='/home/ubuntu/atreides/experiments/models/rl17/0007', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=True, disable_frontend_multiprocessing=False, enab

Loading pt checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards:  25% Completed | 1/4 [00:04<00:12,  4.20s/it]
Loading pt checkpoint shards:  50% Completed | 2/4 [00:05<00:04,  2.29s/it]
Loading pt checkpoint shards:  75% Completed | 3/4 [00:08<00:02,  2.90s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:11<00:00,  2.94s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:11<00:00,  2.95s/it]



INFO 12-11 19:53:05 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-11 19:53:07 gpu_executor.py:122] # GPU blocks: 12296, # CPU blocks: 8192
INFO 12-11 19:53:07 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 24.02x
INFO 12-11 19:53:22 api_server.py:232] vLLM to use /tmp/tmpwrfvwt1s as PROMETHEUS_MULTIPROC_DIR
INFO 12-11 19:53:22 launcher.py:19] Available routes are:
INFO 12-11 19:53:22 launcher.py:27] Route: /openapi.json, Methods: HEAD, GET
INFO 12-11 19:53:22 launcher.py:27] Route: /docs, Methods: HEAD, GET
INFO 12-11 19:53:22 launcher.py:27] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 12-11 19:53:22 launcher.py:27] Route: /redoc, Methods: HEAD, GET
INFO 12-11 19:53:22 launcher.py:27] Route: /health, Methods: GET
INFO 12-11 19:53:22 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-11 19:53:22 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-11 19:53:22 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-11 19:5

INFO:     Started server process [54797]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)


INFO:     127.0.0.1:57540 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val: 0episode [00:00, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl17/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/atreides/experiments/models/rl17/0007
  checkpoint_files:
  - /home/ubuntu/atreides/experiments/models/rl17/0007/hf_model_0003_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0007/hf_model_0004_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0007/hf_model_0001_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0007/hf_model_0002_0.pt
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl17
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: lib.rl.pack.PackedDataset
  dir: /home/ubuntu/atr

Saved iteration 8 model files to /home/ubuntu/atreides/experiments/models/rl17/0008
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl17/0008 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-11 20:10:47 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-11 20:10:47 api_server.py:529] args: Namespace(subparser='serve', model_tag='/home/ubuntu/atreides/experiments/models/rl17/0008', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middlewa

Loading pt checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards:  25% Completed | 1/4 [00:04<00:12,  4.14s/it]
Loading pt checkpoint shards:  50% Completed | 2/4 [00:05<00:04,  2.28s/it]
Loading pt checkpoint shards:  75% Completed | 3/4 [00:09<00:03,  3.15s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.49s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.33s/it]



INFO 12-11 20:11:13 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-11 20:11:16 gpu_executor.py:122] # GPU blocks: 12296, # CPU blocks: 8192
INFO 12-11 20:11:16 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 24.02x
INFO 12-11 20:11:29 api_server.py:232] vLLM to use /tmp/tmp7_k1gu4y as PROMETHEUS_MULTIPROC_DIR
INFO 12-11 20:11:29 launcher.py:19] Available routes are:
INFO 12-11 20:11:29 launcher.py:27] Route: /openapi.json, Methods: GET, HEAD
INFO 12-11 20:11:29 launcher.py:27] Route: /docs, Methods: GET, HEAD
INFO 12-11 20:11:29 launcher.py:27] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 12-11 20:11:29 launcher.py:27] Route: /redoc, Methods: GET, HEAD
INFO 12-11 20:11:29 launcher.py:27] Route: /health, Methods: GET
INFO 12-11 20:11:29 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-11 20:11:29 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-11 20:11:29 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-11 20:1

INFO:     Started server process [61619]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)


INFO:     127.0.0.1:45376 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (1 remaining episodes x 15 patience per episode = 15 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl17/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/atreides/experiments/models/rl17/0008
  checkpoint_files:
  - /home/ubuntu/atreides/experiments/models/rl17/0008/hf_model_0003_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0008/hf_model_0004_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0008/hf_model_0001_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0008/hf_model_0002_0.pt
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl17
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: lib.rl.pack.PackedDataset
  dir: /home/ubuntu/atr

Saved iteration 9 model files to /home/ubuntu/atreides/experiments/models/rl17/0009
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl17/0009 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-11 20:25:58 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-11 20:25:58 api_server.py:529] args: Namespace(subparser='serve', model_tag='/home/ubuntu/atreides/experiments/models/rl17/0009', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middlewa

Loading pt checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards:  25% Completed | 1/4 [00:04<00:12,  4.21s/it]
Loading pt checkpoint shards:  50% Completed | 2/4 [00:05<00:04,  2.30s/it]
Loading pt checkpoint shards:  75% Completed | 3/4 [00:09<00:03,  3.13s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.39s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.27s/it]



INFO 12-11 20:26:21 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-11 20:26:24 gpu_executor.py:122] # GPU blocks: 12296, # CPU blocks: 8192
INFO 12-11 20:26:24 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 24.02x
INFO 12-11 20:26:39 api_server.py:232] vLLM to use /tmp/tmp4cghg_tk as PROMETHEUS_MULTIPROC_DIR
INFO 12-11 20:26:39 launcher.py:19] Available routes are:
INFO 12-11 20:26:39 launcher.py:27] Route: /openapi.json, Methods: HEAD, GET
INFO 12-11 20:26:39 launcher.py:27] Route: /docs, Methods: HEAD, GET
INFO 12-11 20:26:39 launcher.py:27] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 12-11 20:26:39 launcher.py:27] Route: /redoc, Methods: HEAD, GET
INFO 12-11 20:26:39 launcher.py:27] Route: /health, Methods: GET
INFO 12-11 20:26:39 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-11 20:26:39 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-11 20:26:39 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-11 20:2

INFO:     Started server process [67204]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)


INFO:     127.0.0.1:42082 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]

explore:   0%|          | 0/64 [00:00<?, ?episode/s]

Early stopping val evaluation due to expired patience (1 remaining episodes x 15 patience per episode = 15 seconds)
$ tune run lib.rl.recipe.TuneRecipe --config /home/ubuntu/atreides/experiments/models/rl17/config.yaml


DEBUG:torchtune.utils._logging:Training is not distributed. If you want to train on multiple GPUs and are using the tune CLI, specify --nnodes 1 and --nproc_per_node [num_gpus]
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.checkpointing._checkpointer.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/atreides/experiments/models/rl17/0009
  checkpoint_files:
  - /home/ubuntu/atreides/experiments/models/rl17/0009/hf_model_0003_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0009/hf_model_0004_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0009/hf_model_0001_0.pt
  - /home/ubuntu/atreides/experiments/models/rl17/0009/hf_model_0002_0.pt
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl17
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: lib.rl.pack.PackedDataset
  dir: /home/ubuntu/atr

Saved iteration 10 model files to /home/ubuntu/atreides/experiments/models/rl17/0010
Starting 1 vLLM servers...
$ vllm serve /home/ubuntu/atreides/experiments/models/rl17/0010 --port=8000 --block-size=32 --disable-log-requests --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.9 --max-model-len=16384 --max-num-seqs=512 --max-num-batched-tokens=65536 --return-tokens-as-token-ids --swap-space=32 --api-key=default
INFO 12-11 20:44:11 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 12-11 20:44:11 api_server.py:529] args: Namespace(subparser='serve', model_tag='/home/ubuntu/atreides/experiments/models/rl17/0010', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middlew

Loading pt checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
  state = torch.load(bin_file, map_location="cpu")
Loading pt checkpoint shards:  25% Completed | 1/4 [00:04<00:12,  4.25s/it]
Loading pt checkpoint shards:  50% Completed | 2/4 [00:05<00:04,  2.32s/it]
Loading pt checkpoint shards:  75% Completed | 3/4 [00:09<00:03,  3.17s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.54s/it]
Loading pt checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.37s/it]



INFO 12-11 20:44:38 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 12-11 20:44:41 gpu_executor.py:122] # GPU blocks: 12296, # CPU blocks: 8192
INFO 12-11 20:44:41 gpu_executor.py:126] Maximum concurrency for 16384 tokens per request: 24.02x
INFO 12-11 20:44:58 api_server.py:232] vLLM to use /tmp/tmpmk2v773c as PROMETHEUS_MULTIPROC_DIR
INFO 12-11 20:44:58 launcher.py:19] Available routes are:
INFO 12-11 20:44:58 launcher.py:27] Route: /openapi.json, Methods: GET, HEAD
INFO 12-11 20:44:58 launcher.py:27] Route: /docs, Methods: GET, HEAD
INFO 12-11 20:44:58 launcher.py:27] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 12-11 20:44:58 launcher.py:27] Route: /redoc, Methods: GET, HEAD
INFO 12-11 20:44:58 launcher.py:27] Route: /health, Methods: GET
INFO 12-11 20:44:58 launcher.py:27] Route: /tokenize, Methods: POST
INFO 12-11 20:44:58 launcher.py:27] Route: /detokenize, Methods: POST
INFO 12-11 20:44:58 launcher.py:27] Route: /v1/models, Methods: GET
INFO 12-11 20:4

INFO:     Started server process [73989]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)


INFO:     127.0.0.1:49378 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/64 [00:00<?, ?episode/s]