<a href="https://colab.research.google.com/github/abzb1/UoS_AI_20026/blob/main/01_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `uv` package manager with pip, then use it to install the
# libraries this notebook imports: transformers (with its `torch` extra) and datasets.
# NOTE(review): no versions are pinned here, so a fresh run may resolve different
# releases than the ones that produced the saved outputs.
!pip install uv
!uv pip install transformers[torch] datasets

Collecting uv
  Downloading uv-0.8.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.8.13
[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m2 packages[0m [2min 190ms[0m[0m


In [None]:
from transformers import LlamaConfig, LlamaForCausalLM
from transformers import AutoTokenizer

In [None]:
# Load the pretrained tokenizer published with SmolLM2.
# Only the tokenizer is fetched from this checkpoint name; the model itself is
# built from a config with LlamaForCausalLM(config) further down.

pretrained_model_name = "HuggingFaceTB/SmolLM2-1.7B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Initialize a small Llama-architecture model from a hand-written configuration.
# The model is constructed directly from the config (no checkpoint is loaded),
# so its weights come from random initialisation.

from transformers import set_seed

# Seed the random number generators before building the model so the initial
# weights -- and therefore everything computed from them below -- are the same
# on every "Restart & Run All".
set_seed(42)

# Keyword arguments for LlamaConfig. Kept under its own name so that `config`
# only ever refers to the LlamaConfig object.
# Shape sanity: hidden_size (144) == num_attention_heads (9) * head_dim (16).
model_config_kwargs = {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": False,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 0,
  "head_dim": 16,
  "hidden_act": "silu",
  "hidden_size": 144,
  "initializer_range": 0.041666666666666664,
  "intermediate_size": 384,
  "is_llama_config": True,
  "max_position_embeddings": 8192,
  "mlp_bias": False,
  "model_type": "llama",
  "num_attention_heads": 9,
  "num_hidden_layers": 16,
  "num_key_value_heads": 3,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": False,
  "rope_scaling": None,
  "rope_theta": 100000,
  "tie_word_embeddings": True,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.3",
  "use_cache": True,
  "vocab_size": 49152
}

config = LlamaConfig(**model_config_kwargs)
model = LlamaForCausalLM(config)
print(model)

# Count every parameter tensor's elements to report the model size.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}") # 10M model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 144)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=144, out_features=144, bias=False)
          (k_proj): Linear(in_features=144, out_features=48, bias=False)
          (v_proj): Linear(in_features=144, out_features=48, bias=False)
          (o_proj): Linear(in_features=144, out_features=144, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=144, out_features=384, bias=False)
          (up_proj): Linear(in_features=144, out_features=384, bias=False)
          (down_proj): Linear(in_features=384, out_features=144, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((144,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((144,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((144,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbeddi

In [None]:
# Sanity-check the freshly initialised model: feed it a short prompt and print
# the 8 new tokens it produces.

test_prompt = "John and Mary went to the store. Mary bought"

inputs = tokenizer(test_prompt, return_tensors="pt")
output = model.generate(max_new_tokens=8, **inputs)
# generate() returns prompt + continuation; drop the prompt tokens so only the
# newly generated part is decoded.
generated = output[0][inputs["input_ids"].size(1):]

print("Input Text:", test_prompt, sep="\n")
print("Model Generated text:", tokenizer.decode(generated, skip_special_tokens=True), sep="\n")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Input Text:
John and Mary went to the store. Mary bought
Model Generated text:
lieroidsroidsroids temple temple temple temple


In [None]:
# prepare train dataset

from datasets import load_dataset

# Fetch the HellaSwag training split and show its schema / row count.
ds = load_dataset("Rowan/hellaswag", split="train")

print("hellaswag train set:", ds, sep="\n")

def preprocess_sample(sample):
    """Join a sample's context with its labelled ending into one text sequence.

    Args:
        sample: Mapping with a ``"ctx"`` string, an ``"endings"`` list of
            strings, and a ``"label"`` that ``int()`` can convert to an index
            into ``"endings"``.

    Returns:
        ``{"sequence": text}`` where ``text`` is the context followed by the
        whitespace-stripped labelled ending.
    """
    label_index = int(sample["label"])
    ending = sample["endings"][label_index].strip()

    # An ending that begins with a comma is attached directly to the context;
    # any other ending is separated from it by a single space.
    separator = "" if ending.startswith(",") else " "

    return {"sequence": sample["ctx"] + separator + ending}

from multiprocessing import cpu_count

# Worker-process count shared by both preprocessing passes: at most 4.
# Computed once so the train and validation splits are processed with the
# same setting (the same expression is used for the trainer's dataloader workers).
preprocess_num_proc = min(4, cpu_count() or 1)

# Replace every original column of the train split with the single
# "sequence" column produced by preprocess_sample.
ds = ds.map(
    preprocess_sample,
    remove_columns=ds.column_names,
    num_proc=preprocess_num_proc,
)
print("processed train set:")
print(ds)

train_ds = ds

# Load the validation split and apply the same preprocessing.
valid_ds = load_dataset("Rowan/hellaswag", split="validation")
valid_ds = valid_ds.map(
    preprocess_sample,
    remove_columns=valid_ds.column_names,
    num_proc=preprocess_num_proc,
)

# If the tokenizer defines no padding token, reuse its end-of-sequence token,
# then record the resulting pad token id on the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Token budget used twice below: as the truncation limit in map_tokenize and as
# the fixed block length in sequence_packing.
max_sequence_length = 64
def map_tokenize(examples):
    """Tokenize the ``"sequence"`` column of a batch.

    Args:
        examples: Batch mapping that contains a ``"sequence"`` entry holding
            the texts to tokenize.

    Returns:
        The tokenizer's output for those texts, with special tokens enabled and
        each text truncated to at most ``max_sequence_length`` tokens.
    """
    texts = examples["sequence"]
    encoded = tokenizer(
        texts,
        max_length=max_sequence_length,
        truncation=True,
        add_special_tokens=True,
    )
    return encoded

# Tokenize both splits batch-wise; the raw text column is removed so only the
# tokenizer's outputs remain.
tokenized_train = train_ds.map(
    map_tokenize,
    desc="Tokenizing train",
    remove_columns=train_ds.column_names,
    batched=True,
)
tokenized_valid = valid_ds.map(
    map_tokenize,
    desc="Tokenizing valid",
    remove_columns=valid_ds.column_names,
    batched=True,
)

def sequence_packing(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-length blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists. It
            must contain an ``"input_ids"`` column; every column is flattened
            and re-chunked the same way.
        block_size: Length of each output block. When omitted, the
            notebook-level ``max_sequence_length`` is used.

    Returns:
        A dict with the same columns as ``examples``, each holding a list of
        ``block_size``-long lists, plus a ``"labels"`` column that is a copy of
        the ``"input_ids"`` column. Trailing tokens that do not fill a complete
        block are dropped.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    from itertools import chain

    if block_size is None:
        block_size = max_sequence_length
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each column in linear time. sum(list_of_lists, []) builds a new
    # list for every example and is quadratic in the batch's token count.
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])

    # Keep only as many tokens as fill whole blocks.
    usable_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, usable_length, block_size)]
        for k, t in concatenated.items()
    }

    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized splits into fixed-length blocks with sequence_packing.
train_packed = tokenized_train.map(
    sequence_packing,
    desc="Packing train",
    batched=True,
)
valid_packed = tokenized_valid.map(
    sequence_packing,
    desc="Packing valid",
    batched=True,
)

hellaswag train set:
Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 39905
})


processed train set:
Dataset({
    features: ['sequence'],
    num_rows: 39905
})


In [None]:
# prepare trainer

import torch
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Model settings for training: turn off the generation cache and select the
# "sdpa" attention implementation.
model.config.use_cache = False
model.config._attn_implementation = "sdpa"

# Language-modeling collator with masked-LM disabled.
collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="outputs/smollm2-10m-hellaswag-pretrain",
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    # --- schedule / batch sizes ---
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=1,
    # --- optimisation ---
    learning_rate=3e-4,
    weight_decay=0.1,
    warmup_ratio=0.01,
    # --- logging / evaluation ---
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=100,
    report_to="none",
    # --- precision / memory ---
    bf16=False,
    fp16=False,
    gradient_checkpointing=False,
    # --- data loading ---
    dataloader_num_workers=min(4, cpu_count() or 1),
)

# Only hand the Trainer an evaluation set when packing produced at least one block.
eval_split = valid_packed if len(valid_packed) > 0 else None

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    processing_class=tokenizer,
    train_dataset=train_packed,
    eval_dataset=eval_split,
)

In [None]:
# Run training and keep the value trainer.train() returns in `train_result`.

train_result = trainer.train()

# Persist the trained model via the trainer, then write the tokenizer files
# into the same output directory configured in training_args.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

Step,Training Loss,Validation Loss
100,6.5816,6.417482
200,5.5604,5.583872
300,5.1418,5.217977
400,4.9081,5.006247
500,4.7571,4.858179
600,4.6138,4.754251
700,4.5356,4.67873
800,4.4729,4.615255
900,4.3705,4.576144
1000,4.3526,4.539477


('outputs/smollm2-10m-hellaswag-pretrain/tokenizer_config.json',
 'outputs/smollm2-10m-hellaswag-pretrain/special_tokens_map.json',
 'outputs/smollm2-10m-hellaswag-pretrain/vocab.json',
 'outputs/smollm2-10m-hellaswag-pretrain/merges.txt',
 'outputs/smollm2-10m-hellaswag-pretrain/added_tokens.json',
 'outputs/smollm2-10m-hellaswag-pretrain/tokenizer.json')

In [None]:
# Repeat the prompt check after training, reusing `test_prompt` from the
# earlier generation cell.

# Move the encoded prompt to whichever device the trainer's model is on.
inputs = tokenizer(test_prompt, return_tensors="pt").to(trainer.model.device)
output = model.generate(max_new_tokens=8, **inputs)
# Drop the prompt tokens so only the continuation is decoded.
generated = output[0][inputs["input_ids"].size(1):]

print("Input Text:", test_prompt, sep="\n")
print("Post-train:", tokenizer.decode(generated, skip_special_tokens=True), sep="\n")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Input Text:
John and Mary went to the store. Mary bought
Post-train:
, the man is shown in the water


In [None]:
# calculate perplexity

import math

# Newest entry in the trainer's log history that carries an "eval_loss",
# or None when no evaluation has been logged.
last_eval = next(
    (entry for entry in reversed(trainer.state.log_history) if "eval_loss" in entry),
    None,
)

if valid_packed and len(valid_packed) > 0:
    # Reuse the logged evaluation when there is one; otherwise evaluate now.
    metrics = last_eval if last_eval else trainer.evaluate()

    # Perplexity is exp(eval loss); report infinity if the loss is missing or
    # too large for exp() to represent.
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except (OverflowError, KeyError):
        perplexity = float("inf")

    print("Eval perplexity:", perplexity)

Eval perplexity: 89.5370716686078
