## Task Description

In this exercise, you will implement Supervised Finetuning (SFT) for the pretrained GPT-2 model. You should use the `transformers` library to load the pretrained model and tokenizer. You will finetune the model on the `Alpaca` dataset, which is a collection of instruction-following examples. The dataset can be found [here](https://huggingface.co/datasets/tatsu-lab/alpaca).
Your implementation should contain the four parts specified below.

In [1]:
!pip install datasets



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import AdamW
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm

In [3]:
# Run configuration shared by the cells below.
MODEL_NAME = "gpt2-medium"  # Checkpoint id passed to from_pretrained() for both the tokenizer and the model
NUM_EPOCHS = 1  # Number of passes the training loop makes over the DataLoader
BATCH_SIZE = 1  # Examples per DataLoader batch, i.e. per forward/backward pass
GRAD_ACCUM_STEPS = 2  # Batches whose gradients are accumulated before one optimizer step; the loss is divided by this value
MAX_LENGTH = 256  # Token cap used with truncation=True, both for training examples and for generation prompts
LEARNING_RATE = 5e-5  # Learning rate handed to AdamW

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Alpaca prompt templates. "prompt_input" is used for records that carry extra
# context in their `input` field, "prompt_no_input" for all other records.
# Both end with "### Response:" so the answer is what follows the prompt.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


class AlpacaDataset(Dataset):
    """Map-style dataset turning Alpaca records into tokenized SFT examples.

    Every example is the rendered prompt, a single space, the reference
    output and the tokenizer's EOS token. The labels are a copy of the token
    ids in which the prompt positions are replaced by -100, so only the
    response part (and EOS) is left as a training target.

    Args:
        data_points: Iterable of mapping-like records with the keys
            ``instruction`` and ``output`` and an optional ``input`` key.
        tokenizer: Callable tokenizer exposing ``eos_token``; it is called
            with ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors`` keyword arguments.
        max_length: Token cap applied (with truncation) to every sequence.
    """

    def __init__(self, data_points, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Keep each field in its own list so that lookups by index are cheap.
        self.instructions = [record['instruction'] for record in data_points]
        self.inputs = [record.get('input', "") for record in data_points]
        self.outputs = [record['output'] for record in data_points]

    def __len__(self):
        return len(self.instructions)

    def _encode(self, text):
        """Tokenize ``text`` into plain Python lists, truncated but unpadded."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding=False,  # padding is left to the collator
            return_tensors=None,  # tensors are built by the collator
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` lists for one record."""
        task = self.instructions[idx]
        context = self.inputs[idx]
        answer = self.outputs[idx]

        if not context:
            prompt = PROMPT_DICT["prompt_no_input"].format(instruction=task)
        else:
            prompt = PROMPT_DICT["prompt_input"].format(instruction=task, input=context)

        completion = " " + answer + self.tokenizer.eos_token
        encoded_example = self._encode(prompt + completion)
        # The prompt is tokenized on its own only to learn how many leading
        # tokens of the full sequence belong to it.
        encoded_prompt = self._encode(prompt)

        token_ids = encoded_example["input_ids"]

        # Never mask more positions than the (possibly truncated) example has.
        # When the prompt alone fills max_length, every label ends up as -100.
        masked = min(len(encoded_prompt["input_ids"]), len(token_ids))
        labels = [-100] * masked + list(token_ids[masked:])

        return {
            "input_ids": token_ids,
            "attention_mask": encoded_example["attention_mask"],
            "labels": labels,
        }

In [6]:
class DataCollatorForSFT:
    """Collate variable-length SFT examples into right-padded LongTensors.

    Each example is a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` as Python lists. All three fields are padded on the right to
    the longest ``input_ids`` in the batch: ids with the tokenizer's
    ``pad_token_id``, the attention mask with 0 and the labels with -100.

    Args:
        tokenizer: Object exposing ``pad_token_id``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Return a dict of three ``torch.long`` tensors, one row per example."""
        # The widest example decides the width of the whole batch.
        width = max(len(example['input_ids']) for example in batch)
        # The amount of padding is always derived from the length of input_ids.
        pad_counts = [width - len(example['input_ids']) for example in batch]
        pad_id = self.tokenizer.pad_token_id

        def stack(field, fill):
            # Right-pad every row of `field` with `fill`, then tensorize.
            rows = [
                example[field] + [fill] * missing
                for example, missing in zip(batch, pad_counts)
            ]
            return torch.tensor(rows, dtype=torch.long)

        return {
            "input_ids": stack('input_ids', pad_id),
            "attention_mask": stack('attention_mask', 0),
            "labels": stack('labels', -100),
        }

In [7]:
# Tokenizer for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The collator needs a pad token id; when the tokenizer defines no padding
# token, reuse the end-of-sequence token for that role.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal LM, place it on the selected device and tell its
# config which token id is used for padding.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Load the "train" split of the tatsu-lab/alpaca instruction-following dataset.
dataset = load_dataset("tatsu-lab/alpaca", split="train")

In [9]:
# Train on the first 2000 records only to keep the run short. The size is
# clamped so the selection cannot index past the end of the dataset.
subset_alpaca_data = dataset.select(range(min(2000, len(dataset))))

train_dataset = AlpacaDataset(subset_alpaca_data, tokenizer, MAX_LENGTH)
data_collator = DataCollatorForSFT(tokenizer)
# Shuffle the training examples so that every epoch visits them in a fresh
# random order instead of replaying the fixed on-disk order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

In [10]:
# Supervised fine-tuning loop with bfloat16 autocast and gradient accumulation.
torch.set_float32_matmul_precision('high')
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
model.train()
# Start from clean gradients in case this cell is re-run after an interrupted run.
optimizer.zero_grad(set_to_none=True)
for epoch in range(NUM_EPOCHS):
    # Number of backward passes accumulated since the last optimizer step.
    pending_micro_batches = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        # A causal LM predicts token t+1 from token t, so the first label is
        # never a target. If every remaining label is -100 (the prompt filled
        # the whole truncated sequence) the mean loss is NaN and one backward
        # pass would poison all weights, so such batches are skipped.
        if not (batch["labels"][:, 1:] != -100).any():
            continue

        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
            outputs = model(**batch)
            loss = outputs.loss
        # Scale so that the accumulated gradient is the mean over the window.
        loss = loss / GRAD_ACCUM_STEPS
        loss.backward()
        pending_micro_batches += 1

        if pending_micro_batches == GRAD_ACCUM_STEPS:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            pending_micro_batches = 0

    # Apply the gradients of a trailing, incomplete accumulation window instead
    # of dropping them or leaking them into the next epoch. This last step is
    # slightly down-weighted because each loss was divided by GRAD_ACCUM_STEPS.
    if pending_micro_batches:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

Epoch 1:   0%|          | 0/2000 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [11]:
def generate_answer(instruction, input_text=""):
    """Sample a response from the fine-tuned model for one instruction.

    The prompt is rendered with the same templates used for training:
    ``prompt_input`` when ``input_text`` is non-empty, ``prompt_no_input``
    otherwise.

    Args:
        instruction: The task description inserted into the template.
        input_text: Optional extra context for the task. Defaults to an empty
            string, which selects the template without an input section.

    Returns:
        The decoded continuation only (prompt tokens and special tokens are
        removed). Sampling is enabled, so repeated calls can return different
        text.
    """
    if input_text:
        prompt_text = PROMPT_DICT["prompt_input"].format(instruction=instruction, input=input_text)
    else:
        prompt_text = PROMPT_DICT["prompt_no_input"].format(instruction=instruction)

    # NOTE(review): a prompt longer than MAX_LENGTH tokens is truncated here;
    # presumably that cuts off the trailing "### Response:" marker - confirm
    # the tokenizer's truncation side before relying on very long prompts.
    inputs = tokenizer(prompt_text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    # No gradients are needed for generation.
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=70,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

    # Decode only the newly generated part of the sequence
    generated_tokens = output_sequences[0][prompt_len:]
    generated_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return generated_answer

In [12]:
# Switch to inference mode before sampling answers.
model.eval()
# (instruction, generated answer) tuples, one per prompt below.
generated_pairs = []


generation_prompts_data = [
    {"instruction": "Write a short story about a robot learning to paint.", "input": ""},
    {"instruction": "Explain the difference between 'less' and 'fewer' with examples.", "input": ""},
    {"instruction": "Provide three tips for improving concentration while studying.", "input": ""}
]

for pair_number, item in enumerate(generation_prompts_data, start=1):
    instruction = item['instruction']
    generated_answer = generate_answer(instruction)
    # Keep the result so it can be inspected after the cell has run; the list
    # was previously created but never filled.
    generated_pairs.append((instruction, generated_answer))

    print(f"\nPair {pair_number}:")
    print(f"Question: {instruction}")
    print(f"Generated Answer: {generated_answer}")


Pair 1:
Question: Write a short story about a robot learning to paint.
Generated Answer:  The robot was learning to paint by using its mind to learn and master the patterns it had seen. It began to paint by the patterns it could see, and the patterns it could see were becoming more and more accurate.

Pair 2:
Question: Explain the difference between 'less' and 'fewer' with examples.
Generated Answer:  Less is 'less' than 'fewer'.

Pair 3:
Question: Provide three tips for improving concentration while studying.
Generated Answer:  1. Practice focusing on one task at a time.
2. Practice practicing concentration on a single task in one task.
3. Practice a few simple exercises such as breathing, stretching, and concentrating on one word or phrase.
