<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/GPT2_QA_Finetune_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import transformers
print(transformers.__version__)

4.47.1


Huggingface trainer and generate

In [39]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch
import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tie lm_head weights if missing
if model.lm_head.weight.shape[0] != model.transformer.wte.weight.shape[0]:
    model.tie_weights()

# Add padding token to tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Example QA dataset
qa_data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Convert dataset to Huggingface Dataset object
dataset = Dataset.from_list(qa_data)
tokenized_dataset = dataset.map(preprocess_data, remove_columns=["question", "answer"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_qa_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=[],  # Disable W&B or any reporting
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./gpt2_qa_finetuned")
tokenizer.save_pretrained("./gpt2_qa_finetuned")

# Test the model
def generate_answer(question, model, tokenizer, device="cuda"):
    model.to(device)  # Move the model to the specified device
    model.eval()  # Set the model to evaluation mode
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)  # Move inputs to the same device
    outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load the fine-tuned model and tokenizer for testing
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Test generation
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,3.200552
2,No log,1.67013
3,No log,0.855163
4,No log,0.744531
5,No log,0.700154
6,No log,0.646618
7,No log,0.592543
8,No log,0.55156
9,No log,0.524085
10,1.811600,0.510525


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Question: What is the capital of France?
Answer: The capital of France is Paris.


Beam Search HF:

In [14]:
# Test the model
def generate_answer(question, model, tokenizer, device="cuda"):
    model.to(device)  # Move the model to the specified device
    model.eval()  # Set the model to evaluation mode
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)  # Move inputs to the same device
    outputs = model.generate(inputs, max_length=30, num_beams=5, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load the fine-tuned model and tokenizer for testing
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Test generation
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)

Question: What is the capital of France?
Answer: The capital of France is Paris.


Greedy Search HF:

In [20]:
def generate_answer(question, model, tokenizer, max_length=30, device="cuda"):
    # Move the model to the specified device
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    # Prepare the input text
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate the answer using greedy search
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,  # Only one sequence is returned
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,  # Disable n-gram blocking
    )

    # Decode the generated tokens into text
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

# Load the fine-tuned model and tokenizer for testing
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Test generation
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)

Question: What is the capital of France?
Answer: The capital of Paris is Paris.


Greedy Search from Scratch:

In [22]:
def generate_answer(question, model, tokenizer, max_length=50, device="cuda"):
    """
    Generate an answer using greedy search.
    Args:
        question (str): The input question.
        model (GPT2LMHeadModel): The GPT-2 model.
        tokenizer (GPT2Tokenizer): The tokenizer.
        max_length (int): Maximum length of the generated text.
        device (str): Device to run the model on ('cuda' or 'cpu').

    Returns:
        str: The generated answer.
    """
    # Move the model to the correct device
    model.to(device)
    model.eval()  # Set model to evaluation mode

    # Prepare input
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Initialize the generated token IDs with the input
    generated_ids = input_ids

    for _ in range(max_length):
        # Forward pass to get logits for the next token
        outputs = model(input_ids=generated_ids)
        logits = outputs.logits

        # Select the token with the highest probability (greedy search)
        next_token_id = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)

        # Append the predicted token to the sequence
        generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

        # Stop if the end-of-sequence (EOS) token is generated
        if next_token_id.item() == tokenizer.eos_token_id:
            break

    # Decode the generated token IDs to a text string
    answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return answer

# Load the fine-tuned model and tokenizer for testing
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"

question = "What is the capital of France?"
answer = generate_answer(question, model, tokenizer)
print(answer)

Question: What is the capital of France?
Answer: The capital of France is Paris.


Beam Search From Scratch:

In [23]:
import torch
import torch.nn.functional as F

def generate_answer(question, model, tokenizer, max_length=30, num_beams=5, device="cuda"):
    """
    Generate an answer using manual beam search.

    Args:
        question (str): The input question.
        model (GPT2LMHeadModel): The fine-tuned GPT-2 model.
        tokenizer (GPT2Tokenizer): The tokenizer.
        max_length (int): Maximum length of the generated text.
        num_beams (int): Number of beams for beam search.
        device (str): The device to use ('cuda' or 'cpu').

    Returns:
        str: The generated answer.
    """
    # Move the model to the specified device and set to eval mode
    model.to(device)
    model.eval()

    # Prepare the input text
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)  # Shape: [1, seq_len]

    # Initialize beams
    beams = [(input_ids, 0)]  # List of (sequence, score)
    completed_sequences = []

    # Beam search loop
    for _ in range(max_length):
        new_beams = []

        for seq, score in beams:
            # Stop expanding if sequence ends with EOS
            if seq[0, -1] == tokenizer.eos_token_id:
                completed_sequences.append((seq, score))
                continue

            # Forward pass
            with torch.no_grad():
                outputs = model(seq)
                logits = outputs.logits[:, -1, :]  # Get logits for the last token
                probs = F.log_softmax(logits, dim=-1)  # Convert logits to log probabilities

            # Get top-k tokens and their log probabilities
            top_k_probs, top_k_tokens = torch.topk(probs, num_beams, dim=-1)

            # Expand each beam
            for prob, token in zip(top_k_probs[0], top_k_tokens[0]):
                new_seq = torch.cat([seq, token.unsqueeze(0).unsqueeze(0)], dim=1)
                new_score = score + prob.item()
                new_beams.append((new_seq, new_score))

        # Sort new beams by score and keep top-k
        new_beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:num_beams]
        beams = new_beams

        # Break if all beams end with EOS
        if all(seq[0, -1] == tokenizer.eos_token_id for seq, _ in beams):
            break

    # Add remaining beams to completed sequences
    completed_sequences.extend(beams)

    # Select the sequence with the highest score
    best_sequence, _ = max(completed_sequences, key=lambda x: x[1])

    # Decode the tokens to text
    answer = tokenizer.decode(best_sequence[0], skip_special_tokens=True)
    return answer

# Load the fine-tuned model and tokenizer for testing
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"

question = "What is the capital of France?"
answer = generate_answer(question, model, tokenizer)
print(answer)

Question: What is the capital of France?
Answer: The capital of France is Paris.


#Trainer from Scratch using Loss inside Model

In [37]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import os

import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tie lm_head weights if missing
if model.lm_head.weight.shape[0] != model.transformer.wte.weight.shape[0]:
    model.tie_weights()

# Add padding token to tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Example QA dataset
qa_data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Convert dataset to Huggingface Dataset object
dataset = Dataset.from_list(qa_data)
tokenized_dataset = dataset.map(preprocess_data, remove_columns=["question", "answer"])

# Define collation function
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Split dataset into training and validation sets
# train_size = int(0.8 * len(tokenized_dataset))
# train_dataset = tokenized_dataset.select(range(train_size))
# val_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Data loaders
batch_size = 2
train_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define optimizer, criterion, and device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, save_dir="./gpt2_qa_finetuned"):
    best_val_loss = float("inf")
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            print('input_ids', input_ids.shape)
            print('labels', labels.shape)
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # Save the model if validation loss improves
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            print(f"Saved best model with Validation Loss: {avg_val_loss:.4f}")

# Train the model
train_model(model, train_loader, val_loader, optimizer, criterion, device)

# Generate an answer
def generate_answer(question, model, tokenizer, max_length=50, device="cuda"):
    model.to(device)
    model.eval()
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=max_length, pad_token_id=tokenizer.pad_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Test generation
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch 1/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.16it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 6.7348


Validating: 100%|██████████| 1/1 [00:00<00:00, 59.43it/s]

Validation Loss: 3.5652





Saved best model with Validation Loss: 3.5652
Epoch 2/10


Training: 100%|██████████| 1/1 [00:00<00:00,  7.64it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 3.1363


Validating: 100%|██████████| 1/1 [00:00<00:00, 37.52it/s]


Validation Loss: 1.1642
Saved best model with Validation Loss: 1.1642
Epoch 3/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.14it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.7745


Validating: 100%|██████████| 1/1 [00:00<00:00, 51.18it/s]

Validation Loss: 0.7907





Saved best model with Validation Loss: 0.7907
Epoch 4/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.09it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.9105


Validating: 100%|██████████| 1/1 [00:00<00:00, 42.28it/s]

Validation Loss: 0.7772





Saved best model with Validation Loss: 0.7772
Epoch 5/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.64it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8983


Validating: 100%|██████████| 1/1 [00:00<00:00, 49.90it/s]


Validation Loss: 0.7979
Epoch 6/10


Training:   0%|          | 0/1 [00:00<?, ?it/s]

input_ids torch.Size([2, 60])
labels torch.Size([2, 60])


Training: 100%|██████████| 1/1 [00:00<00:00, 11.15it/s]


Training Loss: 0.8771


Validating: 100%|██████████| 1/1 [00:00<00:00, 71.27it/s]


Validation Loss: 0.8017
Epoch 7/10


Training:   0%|          | 0/1 [00:00<?, ?it/s]

input_ids torch.Size([2, 60])
labels torch.Size([2, 60])


Training: 100%|██████████| 1/1 [00:00<00:00, 11.11it/s]


Training Loss: 0.9955


Validating: 100%|██████████| 1/1 [00:00<00:00, 43.26it/s]


Validation Loss: 0.7646
Saved best model with Validation Loss: 0.7646
Epoch 8/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.59it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.0038


Validating: 100%|██████████| 1/1 [00:00<00:00, 47.07it/s]

Validation Loss: 0.7027





Saved best model with Validation Loss: 0.7027
Epoch 9/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.72it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8026


Validating: 100%|██████████| 1/1 [00:00<00:00, 49.40it/s]

Validation Loss: 0.6361





Saved best model with Validation Loss: 0.6361
Epoch 10/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.65it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.6835


Validating: 100%|██████████| 1/1 [00:00<00:00, 46.14it/s]

Validation Loss: 0.5731





Saved best model with Validation Loss: 0.5731
Question: What is the capital of France?
Answer: The capital of France is Paris


#Trainer and Loss from Scratch

In [38]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import os

import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tie lm_head weights if missing
if model.lm_head.weight.shape[0] != model.transformer.wte.weight.shape[0]:
    model.tie_weights()

# Add padding token to tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Example QA dataset
qa_data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Convert dataset to Huggingface Dataset object
dataset = Dataset.from_list(qa_data)
tokenized_dataset = dataset.map(preprocess_data, remove_columns=["question", "answer"])

# Define collation function
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Split dataset into training and validation sets
# train_size = int(0.8 * len(tokenized_dataset))
# train_dataset = tokenized_dataset.select(range(train_size))
# val_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Data loaders
batch_size = 2
train_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define optimizer, criterion, and device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, save_dir="./gpt2_qa_finetuned"):
    best_val_loss = float("inf")
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            print('input_ids', input_ids.shape)
            print('labels', labels.shape)
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Logits shape: (batch_size, seq_length, vocab_size)

            # Shift logits and labels for causal language modeling
            shift_logits = logits[..., :-1, :].contiguous()  # Remove the last token
            shift_labels = labels[..., 1:].contiguous()      # Remove the first token

            # Reshape for CrossEntropyLoss
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                # Shift logits and labels for causal language modeling
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()

                # Reshape for CrossEntropyLoss
                loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # Save the model if validation loss improves
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            print(f"Saved best model with Validation Loss: {avg_val_loss:.4f}")

# Train the model
train_model(model, train_loader, val_loader, optimizer, criterion, device)

# Generate an answer
def generate_answer(question, model, tokenizer, max_length=50, device="cuda"):
    model.to(device)
    model.eval()
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=max_length, pad_token_id=tokenizer.pad_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Test generation
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch 1/10


Training: 100%|██████████| 1/1 [00:00<00:00,  8.68it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 3.3350


Validating: 100%|██████████| 1/1 [00:00<00:00, 42.22it/s]

Validation Loss: 2.1785





Saved best model with Validation Loss: 2.1785
Epoch 2/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.59it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 2.3493


Validating: 100%|██████████| 1/1 [00:00<00:00, 47.31it/s]

Validation Loss: 1.7007





Saved best model with Validation Loss: 1.7007
Epoch 3/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.80it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.9128


Validating: 100%|██████████| 1/1 [00:00<00:00, 51.88it/s]

Validation Loss: 1.3272





Saved best model with Validation Loss: 1.3272
Epoch 4/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.10it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.5536


Validating: 100%|██████████| 1/1 [00:00<00:00, 46.66it/s]

Validation Loss: 1.0289





Saved best model with Validation Loss: 1.0289
Epoch 5/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.66it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.1126


Validating: 100%|██████████| 1/1 [00:00<00:00, 51.25it/s]

Validation Loss: 0.8119





Saved best model with Validation Loss: 0.8119
Epoch 6/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.66it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8002


Validating: 100%|██████████| 1/1 [00:00<00:00, 50.76it/s]

Validation Loss: 0.6255





Saved best model with Validation Loss: 0.6255
Epoch 7/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.43it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8276


Validating: 100%|██████████| 1/1 [00:00<00:00, 52.18it/s]

Validation Loss: 0.4551





Saved best model with Validation Loss: 0.4551
Epoch 8/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.37it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.4953


Validating: 100%|██████████| 1/1 [00:00<00:00, 49.13it/s]

Validation Loss: 0.3170





Saved best model with Validation Loss: 0.3170
Epoch 9/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.70it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.5933


Validating: 100%|██████████| 1/1 [00:00<00:00, 51.98it/s]

Validation Loss: 0.2148





Saved best model with Validation Loss: 0.2148
Epoch 10/10


Training: 100%|██████████| 1/1 [00:00<00:00,  6.21it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.3313


Validating: 100%|██████████| 1/1 [00:00<00:00, 25.93it/s]


Validation Loss: 0.1480
Saved best model with Validation Loss: 0.1480
Question: What is the capital of France?
Answer: The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of
