In [1]:
import sys, platform
import transformers, datasets, torch
# Report interpreter, OS and library versions so the run can be reproduced.
python_version = sys.version.split()[0]
os_name = platform.system()
cuda_ready = torch.cuda.is_available()

print("Python:", python_version, "| OS:", os_name)
print("Transformers:", transformers.__version__, "| datasets:", datasets.__version__)
print("PyTorch:", torch.__version__, "| CUDA:", cuda_ready)


  from .autonotebook import tqdm as notebook_tqdm


Python: 3.13.5 | OS: Windows
Transformers: 4.57.1 | datasets: 4.3.0
PyTorch: 2.9.0+cpu | CUDA: False


In [2]:
# Hub checkpoint id; the same value is passed to both the tokenizer loader and
# the model loader later in the notebook. Swap it for a Llama checkpoint later
# if a GPU is available.
MODEL_NAME = "sshleifer/tiny-gpt2"
# Number of tokens per training example once the token stream is cut into
# fixed-size blocks (read by group_texts).
BLOCK_SIZE = 128


In [3]:
from datasets import load_dataset

# WikiText-2 (raw variant): a small plain-text corpus that is quick to fetch.
raw = load_dataset("wikitext", "wikitext-2-raw-v1")

# Only the leading rows of each split are kept so CPU training stays fast.
N_TRAIN_ROWS, N_VALID_ROWS = 1000, 200
train_raw = raw["train"].select(range(N_TRAIN_ROWS))
valid_raw = raw["validation"].select(range(N_VALID_ROWS))

# Split sizes plus the first training record, as a quick sanity check.
(len(train_raw), len(valid_raw), train_raw[0])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating test split: 100%|██████████| 435

(1000, 200, {'text': ''})

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer published with MODEL_NAME, requesting the fast implementation.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# When the tokenizer defines no padding token, reuse its end-of-sequence token
# so that a pad token is always set.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

def tokenize(examples):
    """Tokenize the ``"text"`` column of a batch with the notebook tokenizer.

    The special-tokens mask is explicitly not requested.
    """
    texts = examples["text"]
    encoded = tok(texts, return_special_tokens_mask=False)
    return encoded

# Tokenize both splits in batches; the source columns are removed so only the
# tokenizer's outputs remain.
train_tok, valid_tok = (
    split.map(tokenize, batched=True, remove_columns=split.column_names)
    for split in (train_raw, valid_raw)
)

def group_texts(examples, block_size=None):
    """Pack a batch of tokenized rows into fixed-length blocks.

    Every column in ``examples`` is flattened into one long sequence and cut
    into consecutive chunks of ``block_size`` items. The trailing remainder
    that does not fill a whole block is dropped, so a batch shorter than one
    block yields no rows at all.

    Args:
        examples: Mapping of column name -> list of per-row lists. Must
            contain an ``"input_ids"`` column, whose flattened length decides
            how many blocks are produced.
        block_size: Chunk length. Defaults to the notebook-level
            ``BLOCK_SIZE`` when omitted.

    Returns:
        A dict with the same columns as ``examples``, each a list of
        ``block_size``-long lists, plus a ``"labels"`` column that is a
        shallow copy of the ``"input_ids"`` column.
    """
    from itertools import chain

    if block_size is None:
        block_size = BLOCK_SIZE

    # Linear-time flatten; sum(rows, []) would re-copy the accumulator for
    # every row and grow quadratically with the batch size.
    flat = {
        name: list(chain.from_iterable(examples[name])) for name in examples.keys()
    }

    # Largest multiple of block_size that fits; the remainder is discarded.
    usable = (len(flat["input_ids"]) // block_size) * block_size

    result = {
        name: [
            tokens[start : start + block_size]
            for start in range(0, usable, block_size)
        ]
        for name, tokens in flat.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Pack each tokenized split into fixed-length blocks.
train_blocks, valid_blocks = (
    tokens.map(group_texts, batched=True) for tokens in (train_tok, valid_tok)
)

# Block counts for both splits and the first packed training example.
(len(train_blocks), len(valid_blocks), train_blocks[0])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5100.22 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 6006.58 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2686.63 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2639.70 examples/s]


(482,
 104,
 {'input_ids': [796,
   569,
   18354,
   7496,
   17740,
   6711,
   796,
   220,
   198,
   2311,
   73,
   13090,
   645,
   569,
   18354,
   7496,
   513,
   1058,
   791,
   47398,
   17740,
   357,
   4960,
   1058,
   10545,
   230,
   99,
   161,
   254,
   112,
   5641,
   44444,
   9202,
   25084,
   24440,
   12675,
   11839,
   18,
   837,
   6578,
   764,
   569,
   18354,
   7496,
   286,
   262,
   30193,
   513,
   1267,
   837,
   8811,
   6412,
   284,
   355,
   569,
   18354,
   7496,
   17740,
   6711,
   2354,
   2869,
   837,
   318,
   257,
   16106,
   2597,
   2488,
   12,
   31,
   2712,
   2008,
   983,
   4166,
   416,
   29490,
   290,
   6343,
   13,
   44206,
   329,
   262,
   14047,
   44685,
   764,
   28728,
   287,
   3269,
   2813,
   287,
   2869,
   837,
   340,
   318,
   262,
   2368,
   983,
   287,
   262,
   569,
   18354,
   7496,
   2168,
   764,
   12645,
   278,
   262,
   976,
   21748,
   286,
   16106,
   290,
   1103,
  

In [5]:
from transformers import DataCollatorForLanguageModeling
# Batch collator built around the notebook tokenizer. mlm=False switches off
# masked-LM masking (presumably leaving plain next-token targets — confirm
# against the collator documentation).
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)


In [6]:
from transformers import AutoModelForCausalLM
# Load the causal-LM weights for MODEL_NAME, then show which concrete class
# the Auto* factory instantiated.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
type(model).__name__


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


'GPT2LMHeadModel'

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
from transformers import TrainingArguments, Trainer
import numpy as np

args = TrainingArguments(
    # Output directory; checkpoint saving and experiment reporting are off.
    output_dir="outputs/llama_like_tiny_causal_lm",
    save_strategy="no",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch; log every 50 steps.
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    # Fixed seed for repeatable runs.
    seed=42,
)

# Wire the model, training arguments, packed datasets and collator together.
# The tokenizer is handed over as `processing_class`: the `tokenizer=` keyword
# of Trainer is deprecated in recent transformers releases and emits a
# FutureWarning when used.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_blocks,
    eval_dataset=valid_blocks,
    data_collator=collator,
    processing_class=tok,
)

# Run training with the settings held in `args`.
trainer.train()
# Evaluate with the Trainer and keep the returned metrics; the bare name on
# the last line makes the notebook display them.
eval_res = trainer.evaluate()
eval_res


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,10.7333,10.726302


{'eval_loss': 10.726302146911621,
 'eval_runtime': 0.6959,
 'eval_samples_per_second': 149.437,
 'eval_steps_per_second': 18.68,
 'epoch': 1.0}

In [8]:
from transformers import pipeline
# Non-sampled generation on CPU (device=-1) as a quick smoke test of the model.
prompt = "The quick brown fox"
pipe = pipeline("text-generation", model=model, tokenizer=tok, device=-1)
out = pipe(prompt, do_sample=False, max_new_tokens=30)
out[0]["generated_text"]


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'The quick brown fox stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs'