<a href="https://colab.research.google.com/github/abzb1/UoS_AI_20026/blob/main/01_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install uv
!uv pip install transformers[torch] datasets accelerate

Collecting uv
  Downloading uv-0.8.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.8.13
[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m3 packages[0m [2min 535ms[0m[0m


In [2]:
import os

import torch
from transformers import (
    AutoTokenizer,
    LlamaConfig, LlamaForCausalLM,
    Trainer, TrainingArguments, DataCollatorWithFlattening
)
from datasets import load_dataset

In [3]:
# Borrow the tokenizer published with SmolLM2; only the model weights are
# trained from scratch in this notebook.

pretrained_model_name = "HuggingFaceTB/SmolLM2-1.7B"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Build a small Llama-style model from a hand-written configuration.
# The weights are randomly initialized here; nothing pretrained is loaded.

# Seed torch's global RNG before constructing the model so the random initial
# weights (and the untrained sample generated in the next cell) are the same
# on every fresh run of the notebook.
SEED = 42
torch.manual_seed(SEED)

# Raw hyperparameters, kept under their own name so that `config` only ever
# refers to the LlamaConfig object built from them.
config_kwargs = {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": False,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "head_dim": 16,
  "hidden_act": "silu",
  "hidden_size": 144,
  "initializer_range": 0.041666666666666664,
  "intermediate_size": 384,
  "is_llama_config": True,
  "max_position_embeddings": 8192,
  "mlp_bias": False,
  "model_type": "llama",
  "num_attention_heads": 9,
  "num_hidden_layers": 8,
  "num_key_value_heads": 3,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": False,
  "rope_scaling": None,
  "rope_theta": 100000,
  "tie_word_embeddings": True,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.3",
  "use_cache": True,
  "vocab_size": 49152
}

config = LlamaConfig(**config_kwargs)
model = LlamaForCausalLM(config)
print(model)

# Count every parameter tensor once. The 49152 x 144 token embedding accounts
# for most of the total (roughly 8.8M parameters overall).
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 144)
    (layers): ModuleList(
      (0-7): 8 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=144, out_features=144, bias=False)
          (k_proj): Linear(in_features=144, out_features=48, bias=False)
          (v_proj): Linear(in_features=144, out_features=48, bias=False)
          (o_proj): Linear(in_features=144, out_features=144, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=144, out_features=384, bias=False)
          (up_proj): Linear(in_features=144, out_features=384, bias=False)
          (down_proj): Linear(in_features=384, out_features=144, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((144,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((144,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((144,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding

In [5]:
# Sanity check before training: generate a short continuation with the
# randomly initialized model. The output is not expected to be meaningful yet.

test_prompt = """
John and Mary went to the store. Mary bought
""".strip()

# Put the model in inference mode and place the inputs on the model's own
# device, so this cell also works when it is re-run after training has moved
# the model off the CPU.
model.eval()
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=8)
# `generate` returns prompt + continuation; keep only the new tokens.
generated = output[0, inputs['input_ids'].shape[1]:]

print("Input Text:")
print(test_prompt)
print("Model Generated text:")
print(tokenizer.decode(generated, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Input Text:
John and Mary went to the store. Mary bought
Model Generated text:
 hepat hepat hepat hepat hepat hepat hepat hepat


In [6]:
# Load HellaSwag: the whole train split for training, plus a fixed
# 30-example sample of the validation split (shuffled with seed 42) so the
# periodic evaluation during training stays cheap.

train_raw = load_dataset("Rowan/hellaswag", split="train")

valid_raw = (
    load_dataset("Rowan/hellaswag", split="validation")
    .shuffle(seed=42)
    .select(list(range(30)))
)

# Worker processes for the `map` calls below: at most 4, falling back to 1
# when the CPU count cannot be determined.
num_workers = min(4, os.cpu_count() or 1)

def to_text(example):
    """Join a HellaSwag context with its correct ending into one training string.

    Args:
        example: Mapping with "ctx" (context string), "endings" (sequence of
            candidate endings) and "label" (index of the correct ending, as
            an int or anything `int()` accepts).

    Returns:
        A dict with the single key "text": the context followed by the
        stripped correct ending. One space is inserted between the two unless
        the ending begins with a comma.
    """
    ending = example["endings"][int(example["label"])].strip()
    separator = "" if ending.startswith(",") else " "
    return {"text": example["ctx"] + separator + ending}

# Collapse every example to a single "text" column; all original columns are
# dropped in the same pass.
train_txt = train_raw.map(
    to_text,
    remove_columns=train_raw.column_names,
    num_proc=num_workers,
    desc="Prepare train text",
)
valid_txt = valid_raw.map(
    to_text,
    remove_columns=valid_raw.column_names,
    num_proc=num_workers,
    desc="Prepare valid text",
)

def tokenize(batch):
    """Tokenize a batch of texts and attach causal-LM labels.

    Args:
        batch: Mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer's output (each sequence truncated to at most 32 tokens)
        with an added "labels" entry that is a copy of "input_ids".
    """
    encoded = tokenizer(
        batch["text"],
        add_special_tokens=True,
        truncation=True,
        max_length=32,
    )
    # Labels mirror the inputs; a separate list object is stored so the two
    # entries are not aliases of each other.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize in batches; the raw "text" column is dropped once it is encoded.
train_tok = train_txt.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    num_proc=num_workers,
    desc="Tokenize train",
)
valid_tok = valid_txt.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    num_proc=num_workers,
    desc="Tokenize valid",
)

In [7]:
# Training hyperparameters (the Trainer itself is built in the next cell).

args = TrainingArguments(
    # Checkpoints: written to ./my_sllm every 30 steps, with a limit of one
    # retained checkpoint.
    output_dir="./my_sllm",
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=30,
    save_total_limit=1,
    # Schedule and batching: a single epoch, 128 sequences per device.
    num_train_epochs=1,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=1,
    # Optimization.
    learning_rate=1e-3,
    weight_decay=0.1,
    warmup_ratio=0.03,
    # Logging and evaluation share the same 30-step cadence; no external
    # experiment tracker is used.
    logging_steps=30,
    eval_strategy="steps",
    eval_steps=30,
    report_to="none",
    # Precision and memory.
    bf16=True,
    gradient_checkpointing=False,
)

In [8]:
# Build the Trainer and run training.

# Use the scaled-dot-product attention implementation.
model.config._attn_implementation = "sdpa"
# The key/value cache only helps generation, so it is switched off for training.
model.config.use_cache = False

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    data_collator=DataCollatorWithFlattening(),
    # `tokenizer=` is deprecated in Trainer.__init__ and triggers a
    # FutureWarning; `processing_class=` is the replacement argument.
    processing_class=tokenizer,
)
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
30,8.6212,6.318599
60,5.7372,5.309923
90,5.2342,4.961137
120,4.9301,4.730784
150,4.7735,4.598074
180,4.6336,4.492377
210,4.5544,4.408179
240,4.4668,4.358603
270,4.439,4.319183
300,4.3807,4.29886


TrainOutput(global_step=312, training_loss=5.14679785263844, metrics={'train_runtime': 229.5669, 'train_samples_per_second': 173.827, 'train_steps_per_second': 1.359, 'total_flos': 13272347041920.0, 'train_loss': 5.14679785263844, 'epoch': 1.0})

In [9]:
# Repeat the sanity-check prompt now that the model has been trained.

model.eval()
inputs = tokenizer(test_prompt, return_tensors="pt")
inputs = inputs.to(trainer.model.device)
output = model.generate(**inputs, max_new_tokens=8)
# Skip the prompt tokens so that only the continuation is decoded.
prompt_length = inputs['input_ids'].shape[1]
generated = output[0, prompt_length:]

print("Input Text:", test_prompt, sep="\n")
print("Post-train:", tokenizer.decode(generated, skip_special_tokens=True), sep="\n")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Input Text:
John and Mary went to the store. Mary bought
Post-train:
 is a large ball in a large table
