In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
import torch
from torch.nn import LayerNorm

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Cast small parameters (e.g., LayerNorm) to float32
for module in model.modules():
    if isinstance(module, LayerNorm):
        module.float()

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Trainable parameters: {count_trainable_parameters(model)}')


Trainable parameters: 0


In [3]:
model.gradient_checkpointing_enable()

In [4]:
import torch.nn as nn

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)


In [7]:
from datasets import load_dataset

def format_prompt(context, question, answer=None):
    if answer:
        return f"Context: {context}\nQuestion: {question}\nAnswer: {answer}"
    else:
        return f"Context: {context}\nQuestion: {question}\nAnswer:"

dataset = load_dataset('squad')

# Ensure padding and truncation
def tokenize_function(examples):
    prompts = [
        format_prompt(c, q, a['text'][0] if len(a['text']) > 0 else None)
        for c, q, a in zip(examples['context'], examples['question'], examples['answers'])
    ]
    return tokenizer(prompts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
)

trainer.train()


In [None]:
def generate_answer(context, question):
    prompt = format_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(**inputs)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [None]:
context = "The capital of France is Paris."
question = "What is the capital of France?"
print(generate_answer(context, question))
