# Fine-tuning for Classification

In this notebook we will fine-tune a pre-trained model for spam classification. Essentially, given a set of text messages, we will classify them into spam and not spam.

First step is preparing the data.

In [1]:
import requests
import zipfile
import os
import pandas as pd
from pathlib import Path

# Download location of the zipped SMS Spam Collection dataset.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local filename the downloaded archive is written to.
zip_path = "sms_spam_collection.zip"
# Directory the archive is extracted into.
extracted_path = "sms_spam_collection"
# Final location of the data file once it has been renamed with a .tsv suffix.
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the zipped dataset, extract it and rename the data file.

    The function does nothing (apart from printing a notice) when
    ``data_file_path`` already exists.

    Args:
        url: Address of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final data file; the extracted
            ``SMSSpamCollection`` file is renamed to this path.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Download the zip file. The status check keeps an HTTP error page from
    # being saved as the archive (which would only fail later, inside ZipFile),
    # and the timeout keeps a stalled connection from hanging the notebook.
    with requests.get(url, timeout=60) as response:
        response.raise_for_status()
        with open(zip_path, "wb") as out_file:
            out_file.write(response.content)

    # Unzip the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The archive member has no extension; give it the .tsv name used by the caller.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch the dataset; this is skipped when the .tsv file is already present.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

# The file is tab-separated and has no header row, so column names are supplied here.
df = pd.read_csv(
    data_file_path, sep="\t", header=None, names=["Label", "Text"]
)
df

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [2]:
# Show how many messages carry each label.
print(df["Label"].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


Let's create a balanced dataset that contains equal numbers of spam and not-spam messages.

In [3]:
def create_balanced_dataset(df):
    """Return a frame with as many "ham" rows as there are "spam" rows.

    Args:
        df: DataFrame with a ``"Label"`` column holding ``"ham"`` / ``"spam"``.

    Returns:
        A new DataFrame: the randomly sampled ham rows followed by all spam
        rows. Original index values are kept.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]
    # A fixed seed makes the selected ham rows reproducible between runs.
    sampled_ham = ham_rows.sample(len(spam_rows), random_state=123)
    return pd.concat([sampled_ham, spam_rows])
# Build the balanced frame and confirm both labels now have the same count.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


Next we will convert the ham and spam labels to 0 and 1 respectively. This is similar to assigning a token id to a word in a vocabulary. Here, instead, we have a vocabulary of two words - ham and spam - with ids 0 and 1.

In [4]:
# Replace the string labels with integer class ids: ham -> 0, spam -> 1.
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

A simple function that splits the data into training, validation and test sets.


In [5]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train, validation and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.

    Returns:
        ``(train_df, validation_df, test_df)``; the test part receives
        whatever rows remain after the first two parts.
    """
    # Shuffle with a fixed seed and renumber the rows from 0.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    validation_end = train_end + int(n_rows * validation_frac)
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validation_end],
        shuffled.iloc[validation_end:],
    )

# 70% of the rows go to training and 10% to validation; the remainder is the test set.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Save each split to a CSV file; the datasets below are built from these files.
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)


Next we create data loaders. While training an LLM we used a sliding-window approach to create batches. However, in this case the SMS Spam Collection contains text messages of variable length. So we will use padding to bring all the text messages to the same length: we find the longest message in the dataset and pad the shorter ones with a special padding token.

The pad token we will use will be the ``<|endoftext|>`` token in the GPT-2 tokenizer.

In [6]:
import torch
from torch.utils.data import Dataset
import tiktoken
# GPT-2 encoding from tiktoken; used below to turn message text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

class SpamDataset(Dataset):
    """Dataset of tokenized messages, all truncated/padded to one length.

    The CSV file must provide a ``"Text"`` column (message strings) and a
    ``"Label"`` column (integer class ids).

    Args:
        csv_file: Path of the CSV file to read.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            token ids.
        max_length: Length every sequence is brought to. ``None`` means "use
            the longest tokenized message in this file".
        pad_token_id: Token id appended to sequences shorter than
            ``max_length``.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        # Tokenize every message once, up front.
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        # Decide the common sequence length.
        self.max_length = (
            self._longest_encoded_length() if max_length is None else max_length
        )

        # First cut sequences that are too long, then right-pad the short ones.
        target_len = self.max_length
        fitted = []
        for token_ids in self.encoded_texts:
            clipped = token_ids[:target_len]
            padding = [pad_token_id] * (target_len - len(clipped))
            fitted.append(clipped + padding)
        self.encoded_texts = fitted

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as ``torch.long`` tensors."""
        token_ids = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        label = torch.tensor(self.data.iloc[index]["Label"], dtype=torch.long)
        return token_ids, label

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the token count of the longest sequence in ``encoded_texts``."""
        return max([len(token_ids) for token_ids in self.encoded_texts])


In [7]:
# max_length=None makes the dataset use its longest tokenized message as the
# common sequence length.
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)
print(train_dataset.max_length)

# Create the validation and test dataset
# Both reuse the training length, so any longer message in them is truncated.
val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

120


Let's create data loaders for the datasets.

In [8]:
from torch.utils.data import DataLoader
# Loader settings shared by all three splits.
num_workers = 0
batch_size = 8
# Seed the RNG before the shuffling training loader is used.
torch.manual_seed(123)
# Training batches are shuffled; drop_last=True discards a final incomplete batch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
# Validation and test loaders keep every example (drop_last=False) and do not shuffle.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [9]:
# Run through the training loader once; afterwards input_batch / target_batch
# hold the last batch, whose shapes are printed below.
for input_batch, target_batch in train_loader:
    pass
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions", target_batch.shape)
# len() of a loader is its number of batches.
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")


Input batch dimensions: torch.Size([8, 120])
Label batch dimensions torch.Size([8])
130 training batches
19 validation batches
38 test batches


We will now copy everything we had in chapters 4 and 5 to create the model, training and evaluation functions.

The code is simply copied to one cell and can be run as is.

In [10]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention followed by an output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible by
            ``num_heads``.
        context_length: Side length of the square causal mask, i.e. the
            longest sequence the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out features.
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        # Linear layer applied to the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        # Ones above the diagonal mark the (future) positions that get masked out.
        # Registered as a buffer so it is part of the module state without being trained.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
            return per_head.transpose(1, 2)

        keys = split_heads(self.W_key(x))
        values = split_heads(self.W_value(x))
        queries = split_heads(self.W_query(x))

        # Dot product of every query with every key, per head:
        # (b, num_heads, num_tokens, num_tokens)
        scores = queries @ keys.transpose(-2, -1)

        # The 2D mask broadcasts over the batch and head dimensions; masked
        # scores become -inf so softmax gives them zero weight.
        future = self.mask.bool()[:num_tokens, :num_tokens]
        scores = scores.masked_fill(future, -torch.inf)
        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context = (weights @ values).transpose(1, 2)
        # Merge the heads back into a single d_out = num_heads * head_dim axis.
        context = context.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(context)


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        coeff = torch.sqrt(torch.tensor(2 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the input.
        eps: Small constant added to the variance before the square root.
    """

    def __init__(self, emb_dim, eps = 1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))   # (emb_dim,)
        self.shift = nn.Parameter(torch.zeros(emb_dim))  # (emb_dim,)

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N-1, i.e. no Bessel's correction.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - mean) / torch.sqrt(variance + self.eps)
        return normalized * self.scale + self.shift


class FeedForward(nn.Module):
    """Two-layer feed-forward network: expand, apply GELU, project back.

    Args:
        cfg: Configuration mapping; only ``cfg["emb_dim"]`` is read.
        hidden_layer_dim_factor: The hidden layer is this many times wider
            than ``emb_dim``.
    """

    def __init__(self, cfg, hidden_layer_dim_factor = 4):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = hidden_layer_dim_factor * emb_dim
        stages = [
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, in_batch):
        """Run ``in_batch`` through the layers; the last dimension stays ``emb_dim``."""
        return self.layers(in_batch)

class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer normalizes its input first, applies dropout to its output
    and adds the result to the sub-layer input (shortcut connection).

    Args:
        cfg: Configuration mapping providing ``emb_dim``, ``context_length``,
            ``drop_rate``, ``n_heads`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=drop_rate,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        # The same dropout module is used after both sub-layers.
        self.dropout_shortcut = nn.Dropout(drop_rate)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.ff = FeedForward(cfg)

    def forward(self, x):
        """Apply the block to ``x``; the output has the same shape as ``x``."""
        # Attention sub-layer with shortcut connection.
        attended = self.att(self.norm1(x))
        x = self.dropout_shortcut(attended) + x

        # Feed-forward sub-layer with shortcut connection.
        transformed = self.ff(self.norm2(x))
        return self.dropout_shortcut(transformed) + x


class GPTModel(nn.Module):
    """GPT-style model: embeddings, transformer blocks, final norm, output head.

    Args:
        cfg: Configuration mapping providing ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` (plus the keys
            read by ``TransformerBlock``).
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return the output of ``out_head`` for every position of ``in_idx``.

        ``in_idx`` is a (batch_size, seq_len) tensor of token ids.
        """
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

# Key into ``model_configs`` selecting which GPT-2 size to build.
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Settings shared by every model size.
BASE_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    drop_rate=0.0,
    qkv_bias=True,
)

# Size-specific settings: embedding width, number of blocks, number of heads.
model_configs = {
    model_name: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for model_name, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

import requests
import os
# Location of the helper module that downloads and loads the GPT-2 weights.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
# Fetch the helper only when it is not already next to the notebook.
if not os.path.exists(filename):
    response = requests.get(url, timeout=60)
    # Without this check an HTTP error page would be saved as gpt_download.py
    # and then fail (or misbehave) on import.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)


# NOTE(review): this imports and runs code that was downloaded from the
# network; pin the URL to a specific commit if that is a concern.
import gpt_download
import importlib
# Reload in case an older copy of the module was imported earlier in the session.
importlib.reload(gpt_download)
settings, params = gpt_download.download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

def assign(left, right):
    """Return ``right`` as a new trainable parameter after a shape check.

    Args:
        left: Existing parameter/tensor; only its shape is used.
        right: Array-like with the new values (e.g. a NumPy array).

    Returns:
        A ``torch.nn.Parameter`` holding a copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        # Both shapes must be interpolated; previously the second half of the
        # message was a plain string and printed "{right.shape}" literally.
        raise ValueError(
            f"Shape mismatch. Left: {left.shape}, Right: {right.shape}"
        )
    return torch.nn.Parameter(torch.tensor(right))

import numpy as np
def load_weights_into_gpt(gpt: "GPTModel", params):
    """Copy pretrained GPT-2 weights from ``params`` into ``gpt``.

    Args:
        gpt: Model whose parameters are replaced. It must have at least as
            many transformer blocks as ``params["blocks"]`` has entries.
        params: Nested mapping of arrays with the keys read below: ``"wte"``,
            ``"wpe"``, ``"g"``, ``"b"`` and ``"blocks"``; each block provides
            ``"attn"``, ``"mlp"``, ``"ln_1"`` and ``"ln_2"``.

    Raises:
        ValueError: From ``assign`` when a source array does not have the
            shape of the parameter it replaces.
    """
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])

    for b, block in enumerate(params["blocks"]):
        transformer_block = gpt.trf_blocks[b]
        att = transformer_block.att
        attn_params = block["attn"]

        # c_attn stores query, key and value side by side along the last axis;
        # split it into three equal parts. Weight matrices are transposed
        # because nn.Linear keeps weights as (out_features, in_features).
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # Output projection of the attention module.
        att.out_proj.weight = assign(att.out_proj.weight, attn_params["c_proj"]["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, attn_params["c_proj"]["b"])

        # The two layer norms of the block.
        transformer_block.norm1.scale = assign(transformer_block.norm1.scale, block["ln_1"]["g"])
        transformer_block.norm1.shift = assign(transformer_block.norm1.shift, block["ln_1"]["b"])
        transformer_block.norm2.scale = assign(transformer_block.norm2.scale, block["ln_2"]["g"])
        transformer_block.norm2.shift = assign(transformer_block.norm2.shift, block["ln_2"]["b"])

        # Feed-forward layers: index 0 is the expanding Linear, index 2 the
        # projecting Linear (index 1 is the activation).
        ff_layers = transformer_block.ff.layers
        mlp_params = block["mlp"]
        ff_layers[0].weight = assign(ff_layers[0].weight, mlp_params["c_fc"]["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, mlp_params["c_fc"]["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, mlp_params["c_proj"]["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, mlp_params["c_proj"]["b"])

    # Model-level parameters are set once, after the loop. Previously these
    # lines sat inside the loop, so they were repeated for every block and
    # skipped entirely when params["blocks"] was empty.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head starts from the token-embedding matrix; assign() copies
    # the data, so the two parameters are independent afterwards.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

# Merge the size-specific settings of the chosen model into the shared config.
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
torch.manual_seed(123)
# Build the model, overwrite its parameters with the pretrained weights and
# switch it to evaluation mode.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

import tiktoken

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` tokens to ``idx``.

    Args:
        model: Callable mapping a (batch_size, seq_len) tensor of token ids to
            scores of shape (batch_size, seq_len, vocab_size).
        idx: (batch_size, seq_len) tensor holding the starting token ids.
        max_new_tokens: Number of tokens to generate.
        context_size: At most this many trailing tokens are passed to ``model``.

    Returns:
        Tensor of shape (batch_size, seq_len + max_new_tokens).
    """
    for _ in range(max_new_tokens):
        # Only the most recent context_size tokens are fed to the model.
        window = idx[:, -context_size:]
        with torch.no_grad():  # generation needs no gradients
            scores = model(window)              # (batch_size, window_len, vocab_size)
        # The scores at the last position describe the next token.
        last_scores = scores[:, -1, :]          # (batch_size, vocab_size)
        probabilities = torch.softmax(last_scores, dim=-1)
        next_token = probabilities.argmax(dim=-1, keepdim=True)  # (batch_size, 1)
        idx = torch.cat([idx, next_token], dim=-1)
    return idx


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the token ids as a (1, num_tokens) tensor.

    ``'<|endoftext|>'`` is passed to the tokenizer as an allowed special token.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Add a leading batch dimension.
    return torch.unsqueeze(torch.tensor(token_ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids with a leading batch dimension of 1.

    The batch dimension is squeezed away and the ids are handed to
    ``tokenizer.decode`` as a plain list.
    """
    flat_ids = token_ids.squeeze(0)
    return tokenizer.decode(flat_ids.tolist())



File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [11]:
# NOTE(review): start_context is not used below; text_1 holds the same prompt.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Generate 15 tokens after the prompt to check that the loaded weights give
# coherent text.
text_1 = "Every effort moves you"
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


Before we train, let's test the model on one sample spam message.

In [12]:
# Ask the model directly, as an instruction, whether a sample message is spam.
text_2 = (
            "Is the following text 'spam'? Answer with 'yes' or 'no':"
            " 'You are a winner you have been specially"
            " selected to receive $1000 cash or a $2000 award.'"
)
# Generate 23 tokens after the prompt and print prompt plus continuation.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_2, tokenizer),
    max_new_tokens=23,
    context_size=BASE_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))

Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner


Because the model is not instruction fine-tuned, it can't follow the instructions.

For classification, we will replace the output layer, which maps hidden states to the vocabulary size (50257), with a layer that maps hidden states to two classes - spam and not spam. Before we do that, let's print the model and confirm the output layer (``out_head``) is indeed ``Linear(in_features=768, out_features=50257, bias=False)``

In [13]:
# Print the module tree to inspect the layers, in particular out_head.
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout_shortcut): Dropout(p=0.0, inplace=False)
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_feature

To train the model, we need to freeze all of the existing model weights, replace the output layer with a new layer, and train only that layer.

In [14]:
# Freeze every existing parameter so none of them receives gradient updates.
for param in model.parameters():
    param.requires_grad = False

torch.manual_seed(123)
num_classes = 2
# Swap in a new head that maps each hidden state to num_classes outputs.
# It is created after the freeze, so its parameters are trainable
# (requires_grad defaults to True for new nn.Linear parameters).
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG["emb_dim"],
    out_features=num_classes
)

While training just the output layer is generally sufficient, we can also unfreeze the last transformer block and train that as well.

We make the parameters of the last transformer block and the final layer norm trainable.

In [16]:
# Make the last transformer block trainable again.
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
# Likewise for the final layer norm that sits in front of the output head.
for param in model.final_norm.parameters():
    param.requires_grad = True

In [24]:
# Encode a short sample text and add a batch dimension: shape (1, num_tokens).
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

Inputs: tensor([[5211,  345,  423,  640]])
Inputs dimensions: torch.Size([1, 4])


Let's pass this to the model and see the output dimensions.

In [25]:
# Forward pass without gradient tracking; only the output values and shape are inspected.
with torch.no_grad():
    outputs = model(inputs)
print("Outputs:\n", outputs)
print("Outputs dimensions:", outputs.shape)

Outputs:
 tensor([[[-1.5854,  0.9904],
         [-3.7235,  7.4548],
         [-2.2661,  6.6049],
         [-3.5983,  3.9902]]])
Outputs dimensions: torch.Size([1, 4, 2])


Notice the output dimensions are (1, 4, 2), where 1 is the batch size, 4 is the number of tokens in the input and 2 is the number of classes (spam and not spam). In the absence of this new output layer, the output dimensions would have been (1, 4, 50257), where 50257 is the vocabulary size of GPT-2.

For finetuning, we are interested in the output corresponding to the last token only. So we will slice the output tensor to get the last token outputs. The reason we are interested in last token output is because the last token contains information about all the previous tokens in the input sequence due to the self attention mechanism.

![Test](./Prediction.png)

### Calculating the classification loss and accuracy

We will now use the cross-entropy loss to calculate the loss for classification. But first, let's define a function that calculates the accuracy.

In [26]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Compute classification accuracy over (part of) a data loader.

    The model is switched to evaluation mode. For every example the predicted
    class is the argmax of the model output at the last sequence position.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``
            tensor pairs.
        model: Callable returning a (batch_size, seq_len, num_classes) tensor;
            must provide ``eval()``.
        device: Device the batches are moved to before the forward pass.
        num_batches: Maximum number of batches to evaluate; ``None`` evaluates
            the whole loader.

    Returns:
        Fraction of correctly classified examples, or ``float("nan")`` when no
        example was evaluated (empty loader or ``num_batches`` of 0), where the
        division previously raised ``ZeroDivisionError``.
    """
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    correct_predictions, num_examples = 0, 0
    model.eval()

    # zip() stops as soon as range() is exhausted, so no batch beyond
    # num_batches is fetched from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)
        with torch.no_grad():
            logits = model(input_batch)[:, -1, :]  # (batch_size, num_classes)
        predicted_labels = torch.argmax(logits, dim=-1)  # (batch_size,)
        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    if num_examples == 0:
        return float("nan")
    return correct_predictions / num_examples



In [31]:
import torch
# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

torch.manual_seed(123)
device = torch.device(device)
model = model.to(device)
# NOTE(review): the seed is set twice in this cell; the second call looks redundant.
torch.manual_seed(123)
# Accuracy before any fine-tuning, estimated on at most 10 batches per split.
train_accuracy = calc_accuracy_loader(
    train_loader, model, device, num_batches=10
)
val_accuracy = calc_accuracy_loader(
    val_loader, model, device, num_batches=10
)
test_accuracy = calc_accuracy_loader(
    test_loader, model, device, num_batches=10
)
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 46.25%
Validation accuracy: 45.00%
Test accuracy: 48.75%
