In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset

# ----------------------------
# Load model + tokenizer
# ----------------------------
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 1: switch gradients off for every parameter in the model
for param in model.parameters():
    param.requires_grad = False

# Step 2: switch gradients back on for the attention QKV (c_attn) and
# projection (c_proj) layers of each transformer block
for block in model.transformer.h:
    for attn_layer in (block.attn.c_attn, block.attn.c_proj):
        for weight in attn_layer.parameters():
            weight.requires_grad = True



2025-09-14 10:14:22.107691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757844862.130755     230 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757844862.137775     230 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# ----------------------------
# Dataset + tokenization
# ----------------------------
# Tunable knobs for this cell, gathered in one place.
MAX_LENGTH = 128     # tokens per example after truncation / padding
TRAIN_SUBSET = 500   # number of training rows used for fine-tuning
VAL_SUBSET = 100     # number of validation rows used for evaluation
BATCH_SIZE = 2
SEED = 42            # seeds the subset shuffle and the train-loader batch order

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")


def has_text(example):
    """Return True when the example's "text" field is not blank."""
    return example["text"].strip() != ""


def tokenize(batch):
    """Tokenize a batch of raw text, truncating/padding to MAX_LENGTH tokens."""
    return tokenizer(
        batch["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH
    )


def take_subset(split, size):
    """Shuffle `split` with SEED and keep at most `size` rows.

    The size is clamped to the split length so a small split does not make
    `select` raise an out-of-range error.
    """
    return split.shuffle(seed=SEED).select(range(min(size, len(split))))


# Drop blank lines BEFORE tokenizing: empty rows are never tokenized or padded,
# and the predicate reads "text" while no output format is restricting the
# visible columns. One DatasetDict.filter call covers train/validation/test.
dataset_nonempty = dataset.filter(has_text)

tokenized_dataset = dataset_nonempty.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Collator for LM tasks (mlm=False -> causal language modelling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

train_loader = DataLoader(
    take_subset(tokenized_dataset["train"], TRAIN_SUBSET),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
    # Dedicated seeded generator so the batch order is reproducible across runs.
    generator=torch.Generator().manual_seed(SEED),
)

val_loader = DataLoader(
    take_subset(tokenized_dataset["validation"], VAL_SUBSET),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)


Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

In [7]:

# ----------------------------
# Training setup
# ----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Hand the optimizer only the parameters whose requires_grad flag is set.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-6)

num_epochs = 10
log_interval = 20


def _to_device(batch):
    """Return a copy of `batch` with every tensor moved to `device`."""
    return {name: tensor.to(device) for name, tensor in batch.items()}


# ----------------------------
# Training loop
# ----------------------------
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader):
        loss = model(**_to_device(batch)).loss

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Report the current batch loss every `log_interval` steps (never at step 0).
        if step > 0 and step % log_interval == 0:
            print(f"Epoch {epoch+1} | Step {step} | Loss {loss.item():.4f}")

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Avg Train Loss {avg_loss:.4f}")

    # ------------------------
    # Validation
    # ------------------------
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**_to_device(batch))
            val_loss += outputs.loss.item()
    val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss {val_loss:.4f}")



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1 | Step 20 | Loss 4.1175
Epoch 1 | Step 40 | Loss 4.6899
Epoch 1 | Step 60 | Loss 3.7422
Epoch 1 | Step 80 | Loss 3.8952
Epoch 1 | Step 100 | Loss 4.4378
Epoch 1 | Step 120 | Loss 4.1841
Epoch 1 | Step 140 | Loss 4.7822
Epoch 1 | Step 160 | Loss 3.8671
Epoch 1 | Step 180 | Loss 4.4331
Epoch 1 | Step 200 | Loss 4.0598
Epoch 1 | Step 220 | Loss 4.0144
Epoch 1 | Step 240 | Loss 4.3399
Epoch 1 | Avg Train Loss 4.4187
Epoch 1 | Validation Loss 4.0927
Epoch 2 | Step 20 | Loss 3.1978
Epoch 2 | Step 40 | Loss 4.4394
Epoch 2 | Step 60 | Loss 6.5381
Epoch 2 | Step 80 | Loss 4.6796
Epoch 2 | Step 100 | Loss 4.1046
Epoch 2 | Step 120 | Loss 4.2617
Epoch 2 | Step 140 | Loss 4.0186
Epoch 2 | Step 160 | Loss 4.2912
Epoch 2 | Step 180 | Loss 5.2949
Epoch 2 | Step 200 | Loss 3.6905
Epoch 2 | Step 220 | Loss 4.2822
Epoch 2 | Step 240 | Loss 4.0733
Epoch 2 | Avg Train Loss 4.1680
Epoch 2 | Validation Loss 3.8921
Epoch 3 | Step 20 | Loss 6.1193
Epoch 3 | Step 40 | Loss 5.5229
Epoch 3 | Step 60 | Lo

In [8]:
# ----------------------------
# Save fine-tuned model
# ----------------------------
# The model and the tokenizer are written to the same directory.
save_dir = "./gpt2_finetuned_attention"
model.save_pretrained(save_directory=save_dir)
# Kept as the final expression so the cell displays the tokenizer's return value.
tokenizer.save_pretrained(save_directory=save_dir)


('./gpt2_finetuned_attention/tokenizer_config.json',
 './gpt2_finetuned_attention/special_tokens_map.json',
 './gpt2_finetuned_attention/vocab.json',
 './gpt2_finetuned_attention/merges.txt',
 './gpt2_finetuned_attention/added_tokens.json')

In [9]:

# ----------------------------
# Inference
# ----------------------------
GENERATION_SEED = 42  # fixes the sampling RNG so the generated text is reproducible

prompt = "Artificial intelligence in 2025"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# do_sample=True draws from torch's global RNG; seed it right before generating
# so re-running this cell yields the same continuation.
torch.manual_seed(GENERATION_SEED)

model.eval()
with torch.no_grad():
    outputs = model.generate(
        **inputs,                 # input_ids + attention_mask from the tokenizer
        max_length=50,            # total length in tokens, prompt included
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        # Pass the pad id explicitly; otherwise generate() falls back to
        # eos_token_id on its own and logs a warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

print("Generated Text:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 Artificial intelligence in 2025

In order to solve the problem of artificial intelligence in 2025 , there are two main approaches that can be used:

1) Develop a system that can detect human actions and predict future actions

2) Use
