In [9]:
from think_model import ThinkModelConfig, ThinkTransformer
from train import TrainerConfig, SimpleDataLoader, Trainer

from transformers import AutoTokenizer

import torch

In [10]:
# Hugging Face Hub repository id handed to AutoTokenizer.from_pretrained.
tokenizer_id = "HuggingFaceTB/SmolLM2-135M"

In [11]:
# Load the tokenizer identified by `tokenizer_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Reuse the end-of-sequence token as the padding token, so that
# `tokenizer.pad_token_id` is defined for anything that reads it afterwards.
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Hyper-parameters for the ThinkTransformer built from this config.
model_config = ThinkModelConfig(
    # Use len(tokenizer) rather than tokenizer.vocab_size: on Hugging Face
    # tokenizers `vocab_size` is the base vocabulary only, while len() also
    # counts added/special tokens. Sizing from the full length keeps every id
    # the tokenizer can emit inside the embedding table.
    vocab_size=len(tokenizer),
    #
    # Generate model
    d_model=576,  # equals n_attn_heads * d_head (9 * 64)
    d_head=64,
    d_mlp_proj=1536,
    n_generate_layers=16,
    n_kv_heads=3,  # 9 attention heads over 3 KV heads; presumably grouped-query attention -- confirm
    n_attn_heads=9,
    n_cross_attn_heads=9,
    # NOTE(review): 0.002 here vs. 0.02 for think_initializer_range below --
    # confirm the 10x difference is intentional and not a typo.
    generate_initializer_range=0.002,
    #
    # Think model
    think_d_model=576,  # equals n_think_attn_heads * think_d_head (9 * 64)
    think_d_head=64,
    think_d_mlp_proj=1536,
    n_think_kv_heads=3,
    n_think_attn_heads=9,
    n_think_layers=16,
    think_initializer_range=0.02,
    #
    # Others
    think_seq_prefix_ratio=0.25,
    thought_embedding_init_normal=False,
    train_recurrence=1,
    rms_norm_eps=1e-5,
    rope_theta=100000.0,
    # Padding id comes from the tokenizer; it must already have a pad token set.
    padding_idx=tokenizer.pad_token_id,
)

In [14]:
# Settings consumed by SimpleDataLoader and Trainer in the cells that follow.
train_config = TrainerConfig(
    per_device_train_batch_size=8,
    max_seq_len=512,
    num_epochs=8,
    eval_interval_steps=25,
    learning_rate=1e-3,
    grad_clip_norm=1.0,
    val_size=0.1,
    log_dir="runs/shakespeare_think_test",
    warmup_ratio=0.1,
)

In [15]:
# Read the whole training corpus into memory as one string.
# The path is relative to the kernel's working directory, so the notebook must
# be started from the directory that contains `data/`.
# The encoding is pinned to UTF-8 so the text decodes identically on every
# platform instead of depending on the locale's default encoding.
with open("data/complete_shakespeare.txt", encoding="utf-8") as f:
    text = f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'data/complete_shakespeare.txt'

In [None]:
# Instantiate the model from the architecture config defined earlier.
model = ThinkTransformer(model_config)

# Wrap the raw corpus text together with the tokenizer and training settings.
dataloader = SimpleDataLoader(train_config, tokenizer, text=text)

# The trainer takes the training settings and the model to optimise.
trainer = Trainer(train_config, model)

In [None]:
# Run training on the dataloader built above, using the settings in `train_config`.
trainer.train(dataloader)

In [None]:
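# Optional: uncomment to save the trained weights.
# NOTE(review): the argument looks like an output directory -- confirm against Trainer.save_checkpoint.
#trainer.save_checkpoint("think_shakespeare")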

In [None]:
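# Optional: uncomment to restore previously saved weights instead of training in this session.
# NOTE(review): the checkpoint filename is timestamped and the device is hard-coded to "cuda";
# adjust both to match your own run and hardware.
# state_dict = torch.load("think_shakespeare/model.checkpoint.2025-02-22--23-04-54.pt", weights_only=True)
# model = ThinkTransformer(model_config)
# model.load_state_dict(state_dict)
# model.to("cuda")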


In [None]:
# Prompt used to sample a continuation from the model.
input_text = """
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.
""".strip()

# Place the prompt on whichever device the model's weights currently live on,
# rather than hard-coding "cuda". This keeps the cell working on CPU-only
# machines and on any other accelerator the model was moved to.
# Assumes `model` exposes torch.nn.Module's `parameters()` -- confirm.
device = next(model.parameters()).device

# Tokenise a batch of one prompt and keep only the token ids.
input_ids = tokenizer([input_text], return_tensors="pt")["input_ids"].to(device)

# Sample up to 64 new tokens with the given sampling settings.
idx = model.generate(input_ids, temperature=0.01, top_k=5, max_new_tokens=64, think_r=256)

# Decode the first (and only) sequence in the batch back to text.
print(tokenizer.batch_decode(idx)[0])