In [1]:
from benchmark import SmolEvalWrapper, run_benchmarks
from think_model import ThinkModelConfig, ThinkTransformer
from transformers import AutoTokenizer

import torch

In [2]:
# Prefer the first CUDA device, but fall back to CPU so the notebook still
# runs (slowly) on a machine without a GPU instead of failing at model.to().
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Hugging Face Hub id of the checkpoint whose tokenizer is loaded here.
hf_checkpoint = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
# Use the EOS token as the padding token; the model config below takes its
# padding_idx from tokenizer.pad_token_id.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# List the checkpoint files saved in this experiment's directory.
!ls think_fineweb-edu_chkpts_exp11

model.checkpoint.2025-02-21--13-21-57.pt
model.checkpoint.2025-02-22--01-09-25.pt
model.checkpoint.2025-02-22--12-57-09.pt


In [5]:
# Checkpoint to evaluate (the most recent file in the listing above).
checkpoint_path = "think_fineweb-edu_chkpts_exp11/model.checkpoint.2025-02-22--12-57-09.pt"
# map_location="cpu" keeps the load independent of whichever device the
# tensors were saved from and avoids holding a second copy of the weights on
# the GPU; the model itself is moved to `device` in a later cell.
# weights_only=True restricts unpickling to tensors and plain containers.
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)

In [6]:
# Hyper-parameters are grouped per sub-network and merged into a single
# keyword set when the config is constructed.

# Generate model
_generate_kwargs = dict(
    d_model=960,
    d_head=64,
    d_mlp_proj=2560,
    n_generate_layers=12,
    n_kv_heads=5,
    n_attn_heads=15,
    n_cross_attn_heads=15,
    generate_initializer_range=0.002,
)

# Think model
_think_kwargs = dict(
    think_d_model=960,
    think_d_head=64,
    think_d_mlp_proj=2560,
    n_think_kv_heads=5,
    n_think_attn_heads=15,
    n_think_layers=32,
    think_initializer_range=0.02,
)

# Settings that are not specific to either sub-network
_other_kwargs = dict(
    encode_interval=8,
    rms_norm_eps=1e-5,
    rope_theta=100000.0,
    padding_idx=tokenizer.pad_token_id,
)

model_config = ThinkModelConfig(
    vocab_size=tokenizer.vocab_size,
    **_generate_kwargs,
    **_think_kwargs,
    **_other_kwargs,
)
model = ThinkTransformer(model_config)


In [7]:
# Copy the checkpoint tensors into the freshly built model; the displayed
# return value reports whether all keys matched.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [8]:
# Move the model to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

ThinkTransformer(
  (think_network): ThinkNetwork(
    (embed_tokens): Embedding(49152, 960)
    (layers): ModuleList(
      (0-31): 32 x EncoderLayer(
        (self_attn): GroupedQueryAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): GatedMlp(
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (silu): SiLU()
        )
        (input_layernorm): RMSNorm((960,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): RMSNorm((960,), eps=1e-05, elementwise_affine=True)
      )
    )
    (norm): RMSNorm((960,

In [9]:
# Put the model in evaluation mode before benchmarking and generation.
model.eval()

ThinkTransformer(
  (think_network): ThinkNetwork(
    (embed_tokens): Embedding(49152, 960)
    (layers): ModuleList(
      (0-31): 32 x EncoderLayer(
        (self_attn): GroupedQueryAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): GatedMlp(
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (silu): SiLU()
        )
        (input_layernorm): RMSNorm((960,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): RMSNorm((960,), eps=1e-05, elementwise_affine=True)
      )
    )
    (norm): RMSNorm((960,

In [10]:
# Wrap the model, tokenizer and device for run_benchmarks, with batch_size=8.
# NOTE(review): presumably an adapter for the evaluation harness seen in the
# benchmark log output — confirm in the benchmark module.
eval_wrapper = SmolEvalWrapper(model, tokenizer, device, batch_size=8)

In [11]:
# Benchmark task name; passed to run_benchmarks and used to index its results.
task = "hellaswag"

In [12]:
# Run the benchmark for the selected task with limit=100.
results = run_benchmarks(eval_wrapper, [task], limit=100)

# Per-task result entry; everything except the 'alias' key is a metric.
task_results = results['results'][task]
metric_keys = set(task_results) - {'alias'}
metric_values = {name: task_results[name] for name in metric_keys}
metric_values

2025-02-22:11:43:31,031 INFO     [__init__.py:459] The tag 'kobest' is already registered as a group, this tag will not be registered. This may affect tasks you want to call.
2025-02-22:11:43:31,032 INFO     [__init__.py:459] The tag 'kobest' is already registered as a group, this tag will not be registered. This may affect tasks you want to call.
2025-02-22:11:43:33,509 INFO     [task.py:420] Building contexts for hellaswag on rank 0...
100%|██████████| 100/100 [00:00<00:00, 8314.94it/s]
2025-02-22:11:43:33,529 INFO     [evaluator.py:513] Running loglikelihood requests
100%|██████████| 400/400 [16:24<00:00,  2.46s/it]


{'acc,none': 0.39,
 'acc_norm,none': 0.44,
 'acc_norm_stderr,none': 0.04988876515698589,
 'acc_stderr,none': 0.04902071300001975}

In [13]:
# Qualitative check: sample a continuation for a short bullet-list prompt.
input_text = """
Ideas for the weekend
- Hike in the redwood trees
- Picnic near the lake
-
""".strip()

# Place the prompt on the same `device` object the model was moved to, rather
# than a bare "cuda" string that can resolve to a different GPU.
input_ids = tokenizer([input_text], return_tensors="pt")['input_ids'].to(device)

# Fix the seed so the sampled text is reproducible across runs.
# NOTE(review): assumes model.generate samples via torch's global RNG — confirm.
torch.manual_seed(0)

# Generation is inference only, so no autograd graph needs to be recorded.
with torch.no_grad():
    idx = model.generate(input_ids, temperature=0.25, top_k=50, max_new_tokens=64, think_r=128)
print(tokenizer.batch_decode(idx)[0])

Ideas for the weekend
- Hike in the redwood trees
- Picnic near the lake
- The 1960s: This is a time when the hippie movement was gaining momentum, which created a wave of hippie artists who were trying to create a new kind of art. Artists such as Dadaist, Pop, and Minimalism were among the first to take up the movement. But the
