In [1]:
!pip install torchmetrics



In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision

In [3]:
FILL = "FILL"

In [4]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
!unzip -o smsspamcollection.zip

--2024-04-14 21:53:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.14’

smsspamcollection.z     [ <=>                ] 198.65K  --.-KB/s    in 0.1s    

2024-04-14 21:53:54 (1.35 MB/s) - ‘smsspamcollection.zip.14’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [5]:
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [6]:
!head -10 SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [7]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
!unzip -o smsspamcollection.zip

--2024-04-14 21:53:55--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.15’

smsspamcollection.z     [ <=>                ] 198.65K  --.-KB/s    in 0.1s    

2024-04-14 21:53:55 (1.35 MB/s) - ‘smsspamcollection.zip.15’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [8]:
file_path = 'SMSSpamCollection'
df = pd.DataFrame({'label':int(), 'text':str()}, index = [])
with open(file_path) as f:
    for line in f.readlines():
        split = line.split('\t')
        df = pd.concat([
                df,
                pd.DataFrame.from_dict({
                    'label': [1 if split[0] == 'spam' else 0],
                    'text': [split[1]]
                })
            ],
            ignore_index=True
        )
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
text = df.text.values
labels = df.label.values

In [10]:
# Set to the GPT2Tokenizer and set lower case to True
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# Set the padding to 'left' or 'right'?
# Remember we want to use the last token's embedding to represent the entire sentence
tokenizer.padding_side = 'left'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text)-1)
    print(text[index])
    table = np.array([tokenizer.tokenize(text[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

cThen i thk shd b enuff.. Still got conclusion n contents pg n references.. I'll b doing da contents pg n cover pg..

╒═════════════╤═════════════╕
│ Tokens      │   Token IDs │
╞═════════════╪═════════════╡
│ c           │          66 │
├─────────────┼─────────────┤
│ Then        │        6423 │
├─────────────┼─────────────┤
│ Ġi          │        1312 │
├─────────────┼─────────────┤
│ Ġth         │         294 │
├─────────────┼─────────────┤
│ k           │          74 │
├─────────────┼─────────────┤
│ Ġsh         │         427 │
├─────────────┼─────────────┤
│ d           │          67 │
├─────────────┼─────────────┤
│ Ġb          │         275 │
├─────────────┼─────────────┤
│ Ġen         │         551 │
├─────────────┼─────────────┤
│ uff         │        1648 │
├─────────────┼─────────────┤
│ ..          │         492 │
├─────────────┼─────────────┤
│ ĠStill      │        7831 │
├─────────────┼─────────────┤
│ Ġgot        │        1392 │
├─────────────┼─────────────┤
│ Ġconclusio

In [12]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  # Use the tokenizer and the encode_plus methods to return the right data we'll need
  # Set max_length = 32 and return_tokens = 'pt'
  # Set other fields to the appropriate booleans needed
  return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=32,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        return_attention_mask=True
    )


for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])


# Gather all the torch_id, attention masks, and labels
token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [13]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)
    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    print(tokens)
    token_ids = [i.numpy() for i in token_id[index]]
    attention = [i.numpy() for i in attention_masks[index]]
    table = np.array([tokens, token_ids, attention]).T
    print(
        tabulate(
            table,
            headers = ['Tokens', 'Token IDs', 'Attention Mask'],
            tablefmt = 'fancy_grid')
    )

print_rand_sentence_encoding()

['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', 'Hey', 'Ġare', 'Ġwe', 'Ġgoing', 'Ġfor', 'Ġthe', 'Ġlo', 'Ġlesson', 'Ġor', 'Ġgym', '?', 'Ġ', 'Ċ']
╒═══════════════╤═════════════╤══════════════════╕
│ Tokens        │   Token IDs │   Attention Mask │
╞═══════════════╪═════════════╪══════════════════╡
│ <|endoftext|> │       50256 │                0 │
├───────────────┼─────────────┼──────────────────┤
│ <|endoftext|> │       50256 │                0 │
├───────────────┼─────────────┼──────────────────┤
│ <|endoftext|> │       50256 │                0 │
├───────────────┼─────────────┼──────────────────┤
│ <|endoftext|> │       50256 │                0 │
├───────────────┼─────────────┼──────────────────┤
│ <|endoftext|> │  

In [14]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
# Use train_test_split
train_idx, val_idx = train_test_split(
    range(len(labels)),  # generate indices from the length of the labels
    test_size=val_ratio, # size of validation set
    stratify=labels,     # stratify by labels to ensure balanced classes
    random_state=42
)

# Train and validation sets
# Set to TensorDataset
train_set = TensorDataset(token_id[train_idx], attention_masks[train_idx], labels[train_idx])

val_set = TensorDataset(token_id[val_idx], attention_masks[val_idx], labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

validation_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

### Load specific versions of the model

In [15]:
# Load the BertForSequenceClassification model
# Do not ouput the attentions and all hidden states

config = GPT2Config.from_pretrained('gpt2',
                                    output_attentions=False,
                                    output_hidden_states=False,
                                    num_labels=2)

# Set to 'gpt2' (the smallest GPT2 which is 120 M parameters)
# Use the config above and set other labels as needed
model = GPT2ForSequenceClassification(config)

# Set the pad token id to the eos token id
model.config.pad_token_id = model.config.eos_token_id

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5
# See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-08
)

### Set the model to the right device

In [16]:
# If on GPU, do as below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
model = model.to(device)

# Recommended number of epochs: See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

In [18]:
# Print all the layers of this GPT2 model and the number of parameters per layer
# If this is False, fine tune just the classifier layer and leave all other GPT2 parameters alone
# If this is True, fine tune everything
fine_tune = True

# Turn off gradients using the above
total_parameters = 0

# Iterate through all model parameters
for name, param in model.named_parameters():
    # Print the parameter's layer and its total parameter count
    param_count = param.numel()
    print(f"{name}: {param_count}")
    total_parameters += param_count

    # Depending on the fine_tune flag, disable gradients for parameters not in the classifier
    if not fine_tune and 'score' not in name:
        param.requires_grad = False

assert(total_parameters == 124441344)

transformer.wte.weight: 38597376
transformer.wpe.weight: 786432
transformer.h.0.ln_1.weight: 768
transformer.h.0.ln_1.bias: 768
transformer.h.0.attn.c_attn.weight: 1769472
transformer.h.0.attn.c_attn.bias: 2304
transformer.h.0.attn.c_proj.weight: 589824
transformer.h.0.attn.c_proj.bias: 768
transformer.h.0.ln_2.weight: 768
transformer.h.0.ln_2.bias: 768
transformer.h.0.mlp.c_fc.weight: 2359296
transformer.h.0.mlp.c_fc.bias: 3072
transformer.h.0.mlp.c_proj.weight: 2359296
transformer.h.0.mlp.c_proj.bias: 768
transformer.h.1.ln_1.weight: 768
transformer.h.1.ln_1.bias: 768
transformer.h.1.attn.c_attn.weight: 1769472
transformer.h.1.attn.c_attn.bias: 2304
transformer.h.1.attn.c_proj.weight: 589824
transformer.h.1.attn.c_proj.bias: 768
transformer.h.1.ln_2.weight: 768
transformer.h.1.ln_2.bias: 768
transformer.h.1.mlp.c_fc.weight: 2359296
transformer.h.1.mlp.c_fc.bias: 3072
transformer.h.1.mlp.c_proj.weight: 2359296
transformer.h.1.mlp.c_proj.bias: 768
transformer.h.2.ln_1.weight: 768
trans

### Train the model

In [19]:
import torchmetrics

In [20]:
# Use torchmetrics to set up accuracy, recall, precision, and auroc
accuracy = torchmetrics.Accuracy(num_classes=2, average='macro', task='binary').to(device)
recall = torchmetrics.Recall(num_classes=2, average='macro', task='binary').to(device)
precision = torchmetrics.Precision(num_classes=2, average='macro', task='binary').to(device)
auroc = torchmetrics.AUROC(num_classes=2, average='macro', task='binary').to(device)

In [21]:
# Main training / validation loop
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # Put each element of batch onto the device
        batch = tuple(e.to(device) for e in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # Set gradients to zero
        optimizer.zero_grad()

        # Forward pass
        train_output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # Backward pass
        loss = train_output.loss
        logits = train_output.logits
        loss.backward()
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_auroc = []

    for batch in validation_dataloader:
        batch = tuple(e.to(device) for e in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
          # Forward pass
            eval_output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = eval_output.logits

        # Calculate validation metrics
        labels = b_labels
        predicted_labels = torch.argmax(logits, axis=1)

        val_accuracy.append(accuracy(predicted_labels, labels))
        val_recall.append(precision(predicted_labels, labels))
        val_precision.append(recall(predicted_labels, labels))
        val_auroc.append(auroc(logits.softmax(dim=1)[:, 1], labels))

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    print('\t - Validation AUROC: {:.4f}\n'.format(sum(val_auroc)/len(val_auroc)))

Epoch:  50%|█████     | 1/2 [00:39<00:39, 39.63s/it]


	 - Train loss: 0.1546
	 - Validation Accuracy: 0.9812
	 - Validation Precision: 0.7724
	 - Validation Recall: 0.8060
	 - Validation AUROC: 0.9036



Epoch: 100%|██████████| 2/2 [01:16<00:00, 38.45s/it]


	 - Train loss: 0.0326
	 - Validation Accuracy: 0.9875
	 - Validation Precision: 0.8031
	 - Validation Recall: 0.8571
	 - Validation AUROC: 0.9065






### Test on a specific sentence, see the outcome

In [22]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
    output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Predicted Class:  Spam


### Questions

Question 1: Run the above by fine tuning GPT2 and the classfier head and by not doing this (using GPT2 as a feature encoder). What is the gap between this? What are the metrics we get in each case?

Solution: FILL Paste your output here

#### No Fune Tune:

In [23]:
config = GPT2Config.from_pretrained('gpt2',
                                    output_attentions=False,
                                    output_hidden_states=False,
                                    num_labels=2)

# Set to 'gpt2' (the smallest GPT2 which is 120 M parameters)
# Use the config above and set other labels as needed
model = GPT2ForSequenceClassification(config)

# Set the pad token id to the eos token id
model.config.pad_token_id = model.config.eos_token_id

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5
# See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-08
)

model = model.to(device)
# Print all the layers of this GPT2 model and the number of parameters per layer
# If this is False, fine tune just the classifier layer and leave all other GPT2 parameters alone
# If this is True, fine tune everything
fine_tune = False

# Turn off gradients using the above
# total_parameters = 0

# Iterate through all model parameters
for name, param in model.named_parameters():
    # Print the parameter's layer and its total parameter count
    param_count = param.numel()
    # print(f"{name}: {param_count}")
    total_parameters += param_count

    # Depending on the fine_tune flag, disable gradients for parameters not in the classifier
    if not fine_tune and 'score' not in name:
        param.requires_grad = False


# Main training / validation loop
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # Put each element of batch onto the device
        batch = tuple(e.to(device) for e in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # Set gradients to zero
        optimizer.zero_grad()

        # Forward pass
        train_output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # Backward pass
        loss = train_output.loss
        logits = train_output.logits
        loss.backward()
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_auroc = []

    for batch in validation_dataloader:
        batch = tuple(e.to(device) for e in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
          # Forward pass
            eval_output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = eval_output.logits

        # Calculate validation metrics
        labels = b_labels
        predicted_labels = torch.argmax(logits, axis=1)

        val_accuracy.append(accuracy(predicted_labels, labels))
        val_recall.append(precision(predicted_labels, labels))
        val_precision.append(recall(predicted_labels, labels))
        val_auroc.append(auroc(logits.softmax(dim=1)[:, 1], labels))

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    print('\t - Validation AUROC: {:.4f}\n'.format(sum(val_auroc)/len(val_auroc)))

Epoch:  50%|█████     | 1/2 [00:10<00:10, 10.64s/it]


	 - Train loss: 0.4012
	 - Validation Accuracy: 0.8599
	 - Validation Precision: 0.0267
	 - Validation Recall: 0.0643
	 - Validation AUROC: 0.7705



Epoch: 100%|██████████| 2/2 [00:21<00:00, 10.89s/it]


	 - Train loss: 0.3012
	 - Validation Accuracy: 0.8670
	 - Validation Precision: 0.1352
	 - Validation Recall: 0.2119
	 - Validation AUROC: 0.7957






So the comparison of not fine tune and fine tune all is as follows:

**Not Fine Tune:**

Epoch:  50%|█████     | 1/2 [00:10<00:10, 10.64s/it]
	 - Train loss: 0.4012
	 - Validation Accuracy: 0.8599
	 - Validation Precision: 0.0267
	 - Validation Recall: 0.0643
	 - Validation AUROC: 0.7705

Epoch: 100%|██████████| 2/2 [00:21<00:00, 10.89s/it]
	 - Train loss: 0.3012
	 - Validation Accuracy: 0.8670
	 - Validation Precision: 0.1352
	 - Validation Recall: 0.2119
	 - Validation AUROC: 0.7957


**Fine Tune:**

Epoch:  50%|█████     | 1/2 [00:39<00:39, 39.63s/it]
	 - Train loss: 0.1546
	 - Validation Accuracy: 0.9812
	 - Validation Precision: 0.7724
	 - Validation Recall: 0.8060
	 - Validation AUROC: 0.9036

Epoch: 100%|██████████| 2/2 [01:16<00:00, 38.45s/it]
	 - Train loss: 0.0326
	 - Validation Accuracy: 0.9875
	 - Validation Precision: 0.8031
	 - Validation Recall: 0.8571
	 - Validation AUROC: 0.9065