In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from lib.recipes.full_finetune import ComponentConfig, FullFinetuneConfig, recipe_main
import subprocess
from torch.optim.adamw import AdamW
from torchtune.datasets import alpaca_dataset
from torchtune.models.llama3 import llama3_tokenizer
from torchtune.models.llama3_1 import llama3_1_8b
from torchtune.modules.loss import CEWithChunkedOutputLoss
from torchtune.training import FullModelHFCheckpointer
from torchtune.training.metric_logging import DiskLogger
from typing import Any

PLACEHOLDER: Any = None

checkpoint_dir = subprocess.run(
    "HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download NousResearch/Hermes-2-Theta-Llama-3-8B",
    shell=True,
    capture_output=True,
    text=True,
).stdout.strip()

recipe_main(
    FullFinetuneConfig(
        # Tokenizer
        tokenizer=ComponentConfig(
            llama3_tokenizer,  # type: ignore
            path="/home/ubuntu/atreides/.venv/lib/python3.12/site-packages/llama_models/llama3/api/tokenizer.model",
            max_seq_len=None,
        ),
        # Dataset
        dataset=ComponentConfig(alpaca_dataset, tokenizer=PLACEHOLDER, packed=False),
        seed=None,
        shuffle=True,
        # Model
        model=ComponentConfig(llama3_1_8b),
        # Checkpointer
        checkpointer=ComponentConfig(
            FullModelHFCheckpointer,
            checkpoint_dir=checkpoint_dir,
            checkpoint_files=[
                "model-00001-of-00004.safetensors",
                "model-00002-of-00004.safetensors",
                "model-00003-of-00004.safetensors",
                "model-00004-of-00004.safetensors",
            ],
            recipe_checkpoint=None,
            output_dir="/tmp/Hermes-2-Theta-Llama-3-8B/",
            model_type="LLAMA3",
        ),
        resume_from_checkpoint=False,
        # Fine-tuning arguments
        batch_size=12,
        epochs=3,
        optimizer=ComponentConfig(AdamW, params=PLACEHOLDER, lr=2e-5, fused=True),
        loss=ComponentConfig(CEWithChunkedOutputLoss),
        max_steps_per_epoch=None,
        compile=False,
        optimizer_in_bwd=False,
        gradient_accumulation_steps=1,
        # Training env
        device="cuda",
        # Memory management
        enable_activation_checkpointing=True,
        enable_activation_offloading=False,
        custom_sharded_layers=["tok_embeddings", "output"],
        # Reduced precision
        dtype="bf16",
        # Logging
        metric_logger=ComponentConfig(
            DiskLogger, log_dir="/home/ubuntu/atreides/experiments/logs"
        ),
        output_dir="/home/ubuntu/atreides/experiments/logs",
        log_every_n_steps=16,
        log_peak_memory_stats=True,
    )
)

In [1]:
from lib.recipes.rl import ComponentConfig, RLConfig, recipe_main
from lib.rl.trajectory import Trajectories
from lib.rl.ppo import PPOLoss
import os
import subprocess
from torch.optim.adamw import AdamW
from torchtune.models.llama3 import llama3_tokenizer
from torchtune.models.llama3_1 import llama3_1_8b
from torchtune.training import FullModelHFCheckpointer
from torchtune.training.metric_logging import DiskLogger
from typing import Any

PLACEHOLDER: Any = None

checkpoint_dir = subprocess.run(
    "HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download NousResearch/Hermes-2-Theta-Llama-3-8B",
    shell=True,
    capture_output=True,
    text=True,
).stdout.strip()

checkpoint_output_dir = "/home/ubuntu/atreides/experiments/models/rl"
os.makedirs(checkpoint_output_dir, exist_ok=True)

recipe_main(
    RLConfig(
        # Tokenizer
        tokenizer=ComponentConfig(
            llama3_tokenizer,  # type: ignore
            path="/home/ubuntu/atreides/.venv/lib/python3.12/site-packages/llama_models/llama3/api/tokenizer.model",
            max_seq_len=None,
        ),
        # Dataset
        dataset=ComponentConfig(
            Trajectories, dir="./data/trajectories", rows=64, seqlen=8192
        ),
        seed=None,
        shuffle=False,
        # Model
        model=ComponentConfig(llama3_1_8b),
        num_output_chunks=8,
        # Checkpointer
        checkpointer=ComponentConfig(
            "torchtune.training.FullModelHFCheckpointer",
            checkpoint_dir=checkpoint_dir,
            checkpoint_files=[
                "model-00001-of-00004.safetensors",
                "model-00002-of-00004.safetensors",
                "model-00003-of-00004.safetensors",
                "model-00004-of-00004.safetensors",
            ],
            recipe_checkpoint=None,
            output_dir="/home/ubuntu/atreides/experiments/models/rl",
            model_type="LLAMA3",
        ),
        resume_from_checkpoint=False,
        # Fine-tuning arguments
        batch_size=4,
        epochs=4,
        optimizer=ComponentConfig(
            # AdamW,
            "bitsandbytes.optim.PagedAdamW8bit",
            params=PLACEHOLDER,
            lr=2e-5,
            # fused=True,
        ),
        loss=ComponentConfig(PPOLoss, policy_coef=0.0, entropy_coef=0.0, kl_coef=1.0),
        max_steps_per_epoch=None,
        compile=False,
        optimizer_in_bwd=False,
        gradient_accumulation_steps=1,
        # Training env
        device="cuda",
        # Memory management
        enable_activation_checkpointing=True,
        enable_activation_offloading=False,
        custom_sharded_layers=["tok_embeddings", "output"],
        # Reduced precision
        dtype="bf16",
        # Logging
        metric_logger=ComponentConfig(
            DiskLogger, log_dir="/home/ubuntu/atreides/experiments/logs"
        ),
        output_dir="/home/ubuntu/atreides/experiments/logs",
        log_every_n_steps=1,
        log_peak_memory_stats=True,
    )
)

No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION
INFO:torchtune.utils._logging:Running FullFinetuneRecipe with resolved config:

batch_size: 4
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725
  checkpoint_files:
  - model-00001-of-00004.safetensors
  - model-00002-of-00004.safetensors
  - model-00003-of-00004.safetensors
  - model-00004-of-00004.safetensors
  model_type: LLAMA3
  output_dir: /home/ubuntu/atreides/experiments/models/rl
  recipe_checkpoint: null
compile: false
custom_sharded_layers:
- tok_embeddings
- output
dataset:
  _component_: !!python/name:lib.rl.trajectory.Trajectories ''
  dir: ./data/trajectories
  rows: 64
  seqlen: 8192
device: cuda
dtype: bf16
enable_activation_checkpointing: true
enable_activation_offloading: false
epochs: 4
gradient_accumulation_s

Writing logs to /home/ubuntu/atreides/experiments/logs/log_1731726234.txt


INFO:torchtune.utils._logging:FSDP is enabled. Instantiating model and loading checkpoint on Rank 0 ...
INFO:torchtune.utils._logging:Instantiating model and loading checkpoint took 2.87 secs
INFO:torchtune.utils._logging:Memory stats after model init:
	GPU peak memory allocation: 15.02 GiB
	GPU peak memory reserved: 15.14 GiB
	GPU peak memory active: 15.02 GiB
INFO:torchtune.utils._logging:Optimizer is initialized.
INFO:torchtune.utils._logging:Loss is initialized.
INFO:torchtune.utils._logging:Dataset and Sampler are initialized.
INFO:torchtune.utils._logging: Profiler config after instantiation: {'enabled': False}
  0%|          | 0/16 [00:00<?, ?it/s]

0.9796332716941833
0.4069623649120331
0.2428489774465561
0.21742001175880432
0.1409558802843094
0.05768474563956261
0.11080523580312729
0.02910163626074791


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
1|1|Loss: 0.5732055306434631:   6%|▋         | 1/16 [00:11<02:46, 11.08s/it]

0.7007660269737244
0.2073778361082077
0.1726682186126709
0.18785500526428223
0.17818306386470795
0.09780827164649963
0.18233007192611694
0.08450165390968323


1|2|Loss: 0.9761295318603516:  12%|█▎        | 2/16 [00:19<02:13,  9.55s/it]

0.7382146120071411
0.2733241021633148
0.16987130045890808
0.11096843332052231
0.1245405524969101
0.1456393450498581
0.08356813341379166
0.053677719086408615


1|3|Loss: 0.954547107219696:  19%|█▉        | 3/16 [00:28<01:57,  9.05s/it] 

0.852419376373291
0.4596032500267029
0.2477872520685196
0.16790050268173218
0.17584165930747986
0.14888666570186615
0.03408844769001007
0.12233440577983856


1|4|Loss: 0.5631217956542969:  25%|██▌       | 4/16 [00:36<01:45,  8.83s/it]

0.8264569044113159
0.26612284779548645
0.10914016515016556
0.2141476571559906
0.14685174822807312
0.14411896467208862
0.11655472218990326
0.15910430252552032


1|5|Loss: 0.490572065114975:  31%|███▏      | 5/16 [00:44<01:35,  8.70s/it] 

0.8184608221054077
0.22658227384090424
0.1504455953836441
0.24943718314170837
0.17534524202346802
0.17088118195533752
0.0848056823015213
0.11278649419546127


1|6|Loss: 0.4599432051181793:  38%|███▊      | 6/16 [00:53<01:26,  8.63s/it]

0.8514193296432495
0.29765111207962036
0.21367309987545013
0.20665451884269714
0.13137389719486237
0.09455224126577377
0.11017384380102158
0.07322470098733902


1|7|Loss: 0.424756795167923:  44%|████▍     | 7/16 [01:01<01:17,  8.59s/it] 

0.8836952447891235
0.33418747782707214
0.1484004706144333
0.20883631706237793
0.23512962460517883
0.21830639243125916
0.12820032238960266
0.07203420251607895


1|8|Loss: 0.41359126567840576:  50%|█████     | 8/16 [01:10<01:08,  8.56s/it]

0.8658627271652222
0.33858659863471985
0.1120242103934288
0.17850235104560852
0.2169209122657776
0.15585964918136597
0.12817494571208954
0.1354888379573822


1|9|Loss: 0.42370614409446716:  56%|█████▋    | 9/16 [01:19<00:59,  8.55s/it]

0.8762689828872681
0.2747672200202942
0.2659517526626587
0.22416619956493378
0.19021129608154297
0.15314172208309174
0.18659624457359314
0.16077107191085815


1|10|Loss: 0.38965141773223877:  62%|██████▎   | 10/16 [01:27<00:51,  8.54s/it]

0.9085795283317566
0.2850969433784485
0.13537360727787018
0.15171530842781067
0.1805664747953415
0.2122650444507599
0.14927813410758972
0.10628407448530197


1|11|Loss: 0.3994769752025604:  69%|██████▉   | 11/16 [01:36<00:42,  8.54s/it] 

0.887200653553009
0.2456841766834259
0.13437752425670624
0.2433578222990036
0.32327958941459656
0.08836952596902847
0.18314141035079956
0.1404028981924057


1|12|Loss: 0.35594600439071655:  75%|███████▌  | 12/16 [01:44<00:34,  8.53s/it]

0.8650654554367065
0.4375864267349243
0.25822216272354126
0.21933135390281677
0.18941274285316467
0.10738907009363174
0.10803672671318054
0.13299255073070526


KeyboardInterrupt: 

In [21]:
from lib.rl.trajectory import Trajectories
import torch

tensors = Trajectories(dir="./data/trajectories", rows=256, seqlen=8192).tensors
advantages = tensors["advantages"]
torch.where(tensors["advantages"].view(-1) == 0.0)

(tensor([ 218019,  218020,  218021,  ..., 2097149, 2097150, 2097151]),)

In [28]:
8192 * 254

2080768

In [11]:
llama3_tokenizer(
    path="/home/ubuntu/atreides/.venv/lib/python3.12/site-packages/llama_models/llama3/api/tokenizer.model",
    max_seq_len=None,
).decode([68818])

'elix'

In [2]:
from lib.tokenizer import Tokenizer

tokenizer = Tokenizer(model="NousResearch/Hermes-2-Theta-Llama-3-8B")

No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


INFO 10-31 00:23:16 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='NousResearch/Hermes-2-Theta-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Theta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-Theta-Llama-3-8B, use_v2_block_manager=True, num_scheduler_steps=1, chunked_

In [3]:
from torchtune.models.llama3_1 import llama3_1_8b

model = llama3_1_8b()

In [4]:
import glob
import subprocess

model_dir = subprocess.run(
    "HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download NousResearch/Hermes-2-Theta-Llama-3-8B",
    shell=True,
    capture_output=True,
    text=True,
).stdout.strip()

print(model_dir)

/home/ubuntu/.cache/huggingface/hub/models--NousResearch--Hermes-2-Theta-Llama-3-8B/snapshots/57a73110702e7b05ba3f39fef36297454c680725


In [5]:
import os
from torchtune.training.checkpointing import FullModelHFCheckpointer

output_dir = "./models/test"
os.makedirs(output_dir, exist_ok=True)

checkpointer = FullModelHFCheckpointer(
    checkpoint_dir=model_dir,
    checkpoint_files=glob.glob(f"{model_dir}/*.safetensors")
    + glob.glob(f"{model_dir}/*.pt"),
    output_dir=output_dir,
    model_type="LLAMA3",  # type: ignore
)
state_dict = checkpointer.load_checkpoint()
state_dict.keys()

dict_keys(['model'])

In [6]:
model.load_state_dict(state_dict["model"])

<All keys matched successfully>

In [7]:
# Check if model is compiled
is_compiled = hasattr(model, '_orig_mod')
print(is_compiled)

False


In [8]:
from lib.rl import Completion

root = Completion.model_validate_json(open("./data/completions.json").read())

In [10]:
max_length = 0
for leaf in root.leaves():
    length = len(tokenizer.encode(leaf.all_message_params()))
    if length > max_length:
        max_length = length
        clear_output()
        print(max_length)

In [19]:
len(tokenizer.encode([{"role": "user", "content": "Hello, how are you?"},]))

12

In [10]:
import os
import shutil

epoch = 1

# Save the model
checkpointer.save_checkpoint(dict(model=model.state_dict()), epoch)

# Create target directory if it doesn't exist
os.makedirs(f"{output_dir}/{epoch:04d}", exist_ok=True)

# Copy all non-safetensors files from model_dir to target
for file in os.listdir(model_dir):
    if not file.endswith(".safetensors") and not file.endswith(".pt"):
        src = os.path.join(model_dir, file)
        dst = os.path.join(f"{output_dir}/{epoch:04d}", file)
        shutil.copy2(src, dst)

# Move all .pt files from ./models/test to ./models/test/0000
for file in os.listdir(output_dir):
    if file.endswith(".pt"):
        src = os.path.join(output_dir, file)
        dst = os.path.join(f"{output_dir}/{epoch:04d}", file)
        shutil.move(src, dst)

INFO:torchtune.utils._logging:Model checkpoint of size 9.95 GB saved to models/test/hf_model_0001_1.pt
INFO:torchtune.utils._logging:Model checkpoint of size 10.00 GB saved to models/test/hf_model_0002_1.pt
INFO:torchtune.utils._logging:Model checkpoint of size 9.83 GB saved to models/test/hf_model_0003_1.pt
INFO:torchtune.utils._logging:Model checkpoint of size 2.34 GB saved to models/test/hf_model_0004_1.pt
INFO:torchtune.utils._logging:Saving final epoch checkpoint.
INFO:torchtune.utils._logging:The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference.


In [7]:
def get_tokens(messages: list[dict]) -> list[int]:
    generate = llm.generate

    def get_tokens(prompts: list[dict], *args: object, **kwargs: object) -> list[int]:
        return llm.get_tokenizer().encode(prompts[0]["prompt"])

    llm.generate = get_tokens  # type: ignore
    tokens = llm.chat(messages)  # type: ignore
    llm.generate = generate  # type: ignore
    return tokens  # type: ignore


get_tokens([dict(role="user", content=prompt)])

[128000,
 128000,
 128002,
 882,
 198,
 1966,
 264,
 8369,
 10683,
 1938,
 19367,
 11,
 480,
 285,
 6853,
 323,
 58280,
 7731,
 1523,
 311,
 1514,
 264,
 16736,
 23347,
 1847,
 382,
 7009,
 35105,
 220,
 18,
 30881,
 315,
 7563,
 11,
 1855,
 369,
 264,
 8821,
 955,
 315,
 2038,
 24306,
 315,
 279,
 2768,
 1473,
 78524,
 1002,
 512,
 12,
 9083,
 81818,
 198,
 12,
 4491,
 13,
 7997,
 198,
 12,
 18083,
 13,
 5929,
 271,
 29314,
 512,
 12,
 73997,
 30133,
 198,
 12,
 62302,
 198,
 12,
 30982,
 28905,
 271,
 14330,
 512,
 12,
 11166,
 198,
 12,
 50767,
 198,
 12,
 39190,
 10637,
 271,
 6153,
 27716,
 320,
 438,
 89447,
 8,
 19301,
 832,
 3786,
 505,
 1855,
 1912,
 323,
 25012,
 1124,
 304,
 279,
 6278,
 315,
 279,
 2007,
 17011,
 785,
 11,
 814,
 75371,
 279,
 9861,
 7563,
 323,
 27023,
 704,
 279,
 2768,
 311,
 1855,
 2851,
 1473,
 12,
 19367,
 25,
 220,
 17,
 7563,
 198,
 12,
 480,
 285,
 6853,
 25,
 220,
 17,
 7563,
 4417,
 43,
 26645,
 518,
 364,
 36412,
 81818,
 1329,
 12,
 58280,
 25,

In [9]:
import gc
import torch
from vllm.distributed.parallel_state import destroy_model_parallel

destroy_model_parallel()
del llm.llm_engine.model_executor.driver_worker  # type: ignore
gc.collect()
torch.cuda.empty_cache()

In [10]:
model.to("cuda")

TransformerDecoder(
  (tok_embeddings): Embedding(128256, 4096)
  (layers): ModuleList(
    (0-31): 32 x TransformerSelfAttentionLayer(
      (attn): MultiHeadAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (pos_embeddings): Llama3ScaledRoPE()
      )
      (mlp): FeedForward(
        (w1): Linear(in_features=4096, out_features=14336, bias=False)
        (w2): Linear(in_features=14336, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=14336, bias=False)
        (activation): SiLU()
      )
      (sa_norm): RMSNorm()
      (mlp_norm): RMSNorm()
      (sa_scale): Identity()
      (mlp_scale): Identity()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=128256

In [11]:
next(model.parameters()).device

device(type='cuda', index=0)

In [16]:
import torch
from torchtune.generation import generate

result = generate(
    model,
    torch.tensor([get_tokens([dict(role="user", content=prompt)])], device="cuda"),
    max_generated_tokens=10,
)

In [17]:
print(llm.get_tokenizer().decode(result[0].squeeze().tolist()))

<|begin_of_text|><|begin_of_text|><|im_start|>user
On a warm spring day Summer, Giselle and Connor sat down to play a casual mystery game.

They assembled 3 decks of cards, each for a separate type of information composed of the following:

Suspect:
- Miss Scarlet
- Mr. Green
- Mrs. White

Weapon:
- Candlestick
- Knife
- Lead Pipe

Room:
- Hall
- Lounge
- Dining Room

After randomly (and blindly) choosing one card from each group and placing them in the middle of the table facedown, they shuffled the remaining cards and dealt out the following to each player:

- Summer: 2 cards
- Giselle: 2 cards ('Lounge', 'Miss Scarlet')
- Connor: 2 cards

The game proceeded as follows:

1. On their turn, a player asked about a set of exactly 3 cards, one from each of the game's categories. (Note: Players could ask about any cards, including those in their own hand.)
2. The player directed this question to the other players in clockwise order, starting with the player to their left.
3. If a player ha

In [22]:
torch.tensor([get_tokens([dict(role="user", content=prompt)])] * 2, device="cuda").shape

torch.Size([2, 585])

In [23]:
model.forward(torch.tensor([get_tokens([dict(role="user", content=prompt)])] * 2, device="cuda"))

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 37.38 MiB is free. Including non-PyTorch memory, this process has 39.33 GiB memory in use. Of the allocated memory 38.62 GiB is allocated by PyTorch, and 166.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)