In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Force a transparent background on ipywidget output cells. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the widget colour and font-size variables at the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from itertools import permutations

import openai
from dotenv import load_dotenv

import art
from art.local import LocalBackend

# Load environment variables (python-dotenv) before any backend or client is created.
load_dotenv()

# Backend and trainable model shared by the rest of this cell.
backend = LocalBackend()
model = art.TrainableModel(
    name="009",
    project="yes-no-maybe",
    base_model="Qwen/Qwen2.5-7B-Instruct",
    # Optional internal engine overrides, currently disabled:
    # _internal_config=art.dev.InternalModelConfig(
    #     _decouple_vllm_and_unsloth=True,
    #     engine_args=art.dev.EngineArgs(gpu_memory_utilization=0.7),
    # ),
)
# Top-level await: only valid in a notebook / IPython session with a running event loop.
# NOTE(review): the captured output of this cell ends in
# "RuntimeError: Unsloth: Cannot get internal vLLM states ..." right after engine
# start-up; presumably raised during this registration call -- confirm.
await model.register(backend)


async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Sample one completion for ``prompt`` and score it.

    ``prompt`` is sent as a single user message to the chat-completions
    endpoint (``model=model.name``, ``max_tokens=100``, ``timeout=100``).
    The reply is rewarded by exact, case-sensitive string match:
    ``"maybe"`` -> 1.0, ``"no"`` -> 0.75, ``"yes"`` -> 0.5, anything else -> 0.0.

    Args:
        client: Async OpenAI client used to request the completion.
        prompt: Text sent as the sole user message.

    Returns:
        An ``art.Trajectory`` whose ``messages_and_choices`` is the user
        message followed by the sampled choice, with the computed reward.

    Raises:
        AssertionError: If the completion's message content is not a string.
    """
    conversation: art.Messages = [{"role": "user", "content": prompt}]
    response = await client.chat.completions.create(
        messages=conversation,
        model=model.name,
        max_tokens=100,
        timeout=100,
    )
    first_choice = response.choices[0]
    reply = first_choice.message.content
    assert isinstance(reply, str)

    # Only an exact match earns a reward; every other reply scores 0.0.
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}
    reward = reward_by_reply.get(reply, 0.0)

    return art.Trajectory(
        messages_and_choices=[*conversation, first_choice],
        reward=reward,
    )


def with_quotes(w: str) -> str:
    """Return ``w`` wrapped in single quotes, e.g. ``yes`` -> ``'yes'``."""
    return "'{}'".format(w)


def _describe_options(words: list[str], use_quotes: bool) -> str:
    """Render the answer options that follow "<prefix> with " in a prompt.

    Three words are joined with ", " and single-quoted when ``use_quotes`` is
    set. Otherwise the first word is used, followed by " or <second word>"
    when a second word exists; ``use_quotes`` is ignored on this path.
    """
    if len(words) == 3:
        rendered = [with_quotes(w) if use_quotes else w for w in words]
        return ", ".join(rendered)
    alternative = f" or {words[1]}" if len(words) > 1 else ""
    return f"{words[0]}" + alternative


# Every prompt is "<prefix> with <options>", built from each combination of
# prefix, quoting flag, and ordering of three or two of the words
# yes / no / maybe (all 3-word orderings first, then all 2-word orderings).
# Quoting only changes 3-word prompts, so each 2-word prompt occurs twice
# per prefix. 48 prompts in total.
prompts = [
    f"{prefix} with {_describe_options(words, use_quotes)}"
    for prefix in ["respond", "just respond"]
    for use_quotes in [True, False]
    for words in (
        list(ordering)
        for size in [3, 2]
        for ordering in permutations(["yes", "no", "maybe"], size)
    )
]

openai_client = model.openai_client()
for _ in range(await model.get_step(), 1_000):
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(rollout(openai_client, prompt) for _ in range(32))
            for prompt in prompts
        )
    )
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=1e-4),
        # _config=art.dev.TrainConfig(
        #     precalculate_logprobs=True,
        # ),
    )



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.



Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore # noqa: F401


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 12-29 15:57:42 [vllm_utils.py:702] Unsloth: Patching vLLM v1 graph capture
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3. vLLM: 0.12.0.
   \\   /|    NVIDIA H200. Num GPUs = 1. Max memory: 139.811 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 78.66%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 139.81 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Sequences = 256.
Unsloth: vLLM's KV Cache can use up to 104.11 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `use_cud

[0;36m(EngineCore_DP0 pid=1722586)[0;0m   PydanticSerializationUnexpectedValue(Expected `enum` - serialized value may not be as expected [field_name='mode', input_value=3, input_type=int])
[0;36m(EngineCore_DP0 pid=1722586)[0;0m   return self.serializer.to_python(


[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:57:53 [core.py:93] Initializing a V1 LLM engine (v0.12.0) with config: model='unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit', speculative_config=None, tokenizer='unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=bitsandbytes, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, col

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:00<00:00, 118.08it/s]
[0;36m(EngineCore_DP0 pid=1722586)[0;0m 
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.33it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.82it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.72it/s]
[0;36m(EngineCore_DP0 pid=1722586)[0;0m 


[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:57:57 [punica_selector.py:20] Using PunicaWrapperGPU.
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:57:57 [gpu_model_runner.py:3549] Model loading took 6.7342 GiB memory and 2.412322 seconds
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:04 [backends.py:655] Using cache directory: /root/.cache/vllm/torch_compile_cache/ab34b90906/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:04 [backends.py:715] Dynamo bytecode transform time: 6.21 s


[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.540000 1722586 torch/_inductor/remote_cache.py:362] [0/0] Unable to create a remote cache
[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.540000 1722586 torch/_inductor/remote_cache.py:362] [0/0] Traceback (most recent call last):
[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.540000 1722586 torch/_inductor/remote_cache.py:362] [0/0]   File "/root/art/.venv/lib/python3.10/site-packages/torch/_inductor/remote_cache.py", line 359, in create_cache
[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.540000 1722586 torch/_inductor/remote_cache.py:362] [0/0]     return cache_cls(key)
[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.540000 1722586 torch/_inductor/remote_cache.py:362] [0/0]   File "/root/art/.venv/lib/python3.10/site-packages/torch/_inductor/remote_cache.py", line 303, in __init__
[0;36m(EngineCore_DP0 pid=1722586)[0;0m [rank0]:W1229 15:58:04.54000

[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:06 [backends.py:216] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.819 s
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:07 [monitor.py:34] torch.compile takes 8.02 s in total
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:08 [gpu_worker.py:359] Available KV cache memory: 101.57 GiB
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:08 [kv_cache_utils.py:1286] GPU KV cache size: 1,901,760 tokens
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:08 [kv_cache_utils.py:1291] Maximum concurrency for 32,768 tokens per request: 58.04x
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:08 [kernel_warmup.py:65] Warming up FlashInfer attention.


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/102 [00:00<?, ?it/s]



Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 102/102 [00:11<00:00,  8.82it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 70/70 [00:06<00:00, 11.30it/s]


[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:27 [gpu_model_runner.py:4466] Graph capturing finished in 18 secs, took 3.25 GiB
[0;36m(EngineCore_DP0 pid=1722586)[0;0m INFO 12-29 15:58:27 [core.py:254] init engine (profile, create kv cache, warmup model) took 29.59 seconds


  weights = llm.collective_rpc("get_weight_ipc_handles", args = tuple())[0]


RuntimeError: Unsloth: Cannot get internal vLLM states with error = 'coroutine' object is not subscriptable