# 🚀 Train a Small GPT2 Language Model on Hugging Face Datasets (Colab-Ready)

In [None]:
!pip install transformers datasets accelerate

from datasets import load_dataset
def _has_nonblank_text(example):
    """Return True when the row's 'text' field is present and not whitespace-only."""
    text = example['text']
    if text is None:
        return False
    return text.strip() != ""


# Load the raw WikiText-2 configuration, then drop rows whose text is missing
# or blank so they never reach the tokenizer.
raw_splits = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset = raw_splits.filter(_has_nonblank_text)

from transformers import GPT2Tokenizer
# Load the pretrained GPT-2 vocabulary and register a dedicated padding token.
# add_special_tokens() both sets tokenizer.pad_token and adds '[PAD]' to the
# vocabulary (growing len(tokenizer) by one), so assigning tokenizer.pad_token
# by hand beforehand is redundant and would briefly point at a token that has
# no id yet.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(examples):
    """Tokenize a batch of rows for language-model training.

    Args:
        examples: Batch dict supplied by ``dataset.map(..., batched=True)``;
            only its ``"text"`` column is read.

    Returns:
        The tokenizer's batch encoding, with every sequence truncated to at
        most 512 tokens and left unpadded.
    """
    # Padding is deliberately NOT applied here. Padding every row to 512
    # tokens up front stores and trains on mostly-pad sequences; leaving rows
    # at their natural length lets the data collator pad each batch only up
    # to its longest member (rounded by its pad_to_multiple_of setting).
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )

# Tokenize every split in batches; the raw "text" column is dropped so only
# the tokenizer's output columns remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Spot-check the first 100 training rows: every token id must be smaller than
# the tokenizer's vocabulary size, otherwise the offending row is printed.
vocab_size = len(tokenizer)
sample_rows = tokenized_dataset["train"].select(range(100))
for row in sample_rows:
    token_ids = row["input_ids"]
    if not all(token_id < vocab_size for token_id in token_ids):
        print("❌ Token ID out of range:", token_ids)

from transformers import DataCollatorForLanguageModeling
# Batch collator for causal language modeling: mlm=False turns off
# masked-token prediction, and padded batch lengths are rounded up to a
# multiple of 8.
collator_options = {
    "tokenizer": tokenizer,
    "mlm": False,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorForLanguageModeling(**collator_options)

from transformers import GPT2Config, GPT2LMHeadModel
# A deliberately small GPT-2: 4 layers, 4 attention heads, 256-dim embeddings,
# and a 512-token context that matches max_length used during tokenization.
config = GPT2Config(
    # Sized from the tokenizer so the embedding table already covers the added
    # '[PAD]' token; no resize_token_embeddings() call is needed afterwards.
    vocab_size=len(tokenizer),
    # n_positions alone defines the context length. The legacy n_ctx argument
    # is no longer read by the GPT-2 implementation, so it is not passed.
    n_positions=512,
    n_embd=256,
    n_layer=4,
    n_head=4,
    # Record the padding id on the model config so utilities that consult
    # config.pad_token_id (e.g. generation) see the tokenizer's value.
    pad_token_id=tokenizer.pad_token_id,
)
# Built from the config only (no from_pretrained), so weights start untrained.
model = GPT2LMHeadModel(config)

# Report both vocabulary sizes side by side; they are expected to be equal.
for label, size in (
    ("Tokenizer vocab size:", len(tokenizer)),
    ("Model vocab size:", model.config.vocab_size),
):
    print(label, size)


from transformers import Trainer, TrainingArguments
import torch

# Hyper-parameters for a single-epoch run. Evaluation happens every 500
# steps, checkpoints are written every 1000 steps, and metrics are logged
# every 100 steps.
training_args = TrainingArguments(
    output_dir="./small_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    logging_steps=100,
    learning_rate=5e-5,
    # fp16 mixed precision needs a CUDA device; requesting it unconditionally
    # makes TrainingArguments raise on a CPU-only runtime, so enable it only
    # when a GPU is actually present.
    fp16=torch.cuda.is_available(),
)

# Wire the model, hyper-parameters, data splits and collator into a Trainer.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    data_collator=data_collator,
)

# Start training; evaluation, logging and checkpoint cadence come from
# training_args.
trainer.train()



Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Tokenizer vocab size: 50258
Model vocab size: 50258


Step,Training Loss,Validation Loss
500,7.087,7.141818
1000,6.7777,6.905735
1500,6.7116,6.775604
2000,6.6393,6.675565
2500,6.5763,6.599555
3000,6.5533,6.538086
3500,6.4461,6.491924


Step,Training Loss,Validation Loss
500,7.087,7.141818
1000,6.7777,6.905735
1500,6.7116,6.775604
2000,6.6393,6.675565
2500,6.5763,6.599555
3000,6.5533,6.538086
3500,6.4461,6.491924
