In [1]:
# https://brainyx.co/journal/journal4/


# One of the most common uses of BERT is to download a model that has been pre-trained with a large amount of text and fine tuning it with a small amount of data. In this article, we will show you how to download a pre-trained model from hugginfface and fine tune it with sample code.


from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score 
from sklearn.metrics import precision_score, f1_score
 
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
 
import torch
import pandas as pd
import numpy as np

In [2]:
raw_datasets = load_dataset("imdb")
# raw_datasets = load_dataset("imdb", split=['train', 'test'])
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
pd.DataFrame(raw_datasets['train']) # 0 indicates a negative review and 1 indicates a positive review.

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [4]:
# Select Samples for Train and Test

sample_train_val = raw_datasets['train'].shuffle().select(range(0,500)).to_pandas() #2000
sample_test = raw_datasets['test'].shuffle().select(range(0,500)).to_pandas()

sample_train_val

Unnamed: 0,text,label
0,DarkWolf tells the tale of a young waitress na...,0
1,This movie is just not worth your time. Its re...,0
2,It's actually a good thing Sean Connery retire...,1
3,"The Wooden Horse was one of the first ""great"" ...",1
4,Rudy does it again with this hot off the stree...,1
...,...,...
495,"Evidently, not many people have seen this movi...",1
496,A huge disappointment from writer Hamm and dir...,0
497,Robert Siodmak does a fabulous job with this B...,1
498,"Now, I won't deny that when I purchased this o...",0


In [5]:
# Define Pretrained Tokenizer and Model

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
}

In [6]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Preprocess Dataset
# Define a simple class inherited from torch dataset
# This is a custom dataset class for PyTorch, which is often used when you want to load your own data into a PyTorch model.

class Dataset(torch.utils.data.Dataset): # The class Dataset is inheriting from torch.utils.data.Dataset. This is a base class provided by PyTorch for representing a dataset.
    
    def __init__(self, encodings, labels=None): # The __init__ method is the initializer for the class. It takes as input encodings and labels. encodings are the encoded representations of your text data (for example, tokenized and converted to IDs), and labels are the corresponding labels for your data.
        self.encodings = encodings
        self.labels = labels
 
    def __getitem__(self, idx): # The __getitem__ method is used to get the data (encodings and labels) for a specific item in the dataset. idx is the index of the item. This method returns a dictionary where the keys are the names of the encoding components (like “input_ids”, “attention_mask”, etc.) and the values are the corresponding tensors. If labels are provided, a “labels” key is also added to the dictionary.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
 
    def __len__(self): # The __len__ method returns the number of items in the dataset
        return len(self.encodings["input_ids"])
 

# The item dictionary would look like this:
# {
#     'input_ids': tensor([101, 2001, 2019, 6207, 102]),
#     'token_type_ids': tensor([0, 0, 0, 0, 0]),
#     'attention_mask': tensor([1, 1, 1, 1, 1]),
#     'labels': tensor(0)
# }

In [8]:
sample_x = list(sample_train_val["text"])
sample_y = list(sample_train_val["label"])
 
sample_x[0], sample_y[0]

("DarkWolf tells the tale of a young waitress named Josie (Samaire Armstrong) who had been leading a pretty ordinary life until her friend Mary (Tippi Hedren) is killed by a Werewolf, you see Werewolves actually exist in modern day America & there is even a special organisation within the police force to fight the Werewolf threat headed up by Detective Steve Turley (Ryan Olosio) who has the difficult task of telling Josie that she is in fact a pure blooded Werewolf herself & that a so-called 'dark prince' Werewolf (Kane Hodder) wants to mate with her & create a new breed of pure blood Werewolves that will take over the entire world, or something like that. Understandably Josie has a hard time believing it, that is until she sees the evidence with her own eyes. It's up to Werewolf cop Steve to save Josie, the day & the world...<br /><br />Co-executive produced & directed Richard Friedman I thought DarkWolf was a pretty bad low budget shot on a digital camcorder horror film that didn't r

In [9]:
X_train, X_val, Y_train, Y_val = train_test_split(sample_x, sample_y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)


In [10]:
X_train_tokenized['input_ids'][0][:10]

[101, 1045, 2034, 3191, 7247, 1055, 10131, 1005, 1055, 21459]

In [11]:
X_train_tokenized['token_type_ids'][0][:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
X_train_tokenized['attention_mask'][0][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [13]:
input_train = Dataset(X_train_tokenized, Y_train)
input_val = Dataset(X_val_tokenized, Y_val)

In [14]:
input_train.__getitem__(0)

{'input_ids': tensor([  101,  1045,  2034,  3191,  7247,  1055, 10131,  1005,  1055, 21459,
          3117,  1999,  2026,  6619,  3694,  2381,  2465,  1010,  1998,  1045,
          5632,  2296, 26162,  3931,  1997,  2009,  1012,  2009,  2001,  2471,
         13418,  2008,  5365,  2052,  2131,  2907,  1997,  2009,  1010,  1998,
          6195,  2008,  2009,  2001,  2081,  1999,  4347,  1010,  1996,  3463,
          2024,  6581,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
          1028,  3056,  2477,  2031,  2000,  2022,  3970,  1024,  1999,  4347,
          2045,  2001,  2053,  3160,  1997,  9179,  4004,  5889,  1999,  1037,
          2350,  5365,  2143,  1012,  1999,  1037,  2126,  2023, 17552,  2015,
          1996,  2203,  4031,  2738,  2062,  5875,  2084,  2065,  2027,  2018,
          2042,  2583,  2000,  2224,  1037,  2062, 14469,  1011,  2559,  3459,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  2007,
          2008, 18355,  2000,  9462,  1

In [15]:
# Define Evaluation Metrics

def compute_metrics(p):
    # pred, labels = p
    # pred = np.argmax(pred, axis=1)
    labels = p.label_ids
    pred = p.predictions.argmax(-1)
    print(classification_report(labels, pred))
 
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred) 
    f1 = f1_score(y_true=labels, y_pred=pred)
 
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [16]:
# Check if a GPU is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define Training Arguments for fine-tuning BERT
args = TrainingArguments(
    output_dir="output",                   # Directory to save the model and logs
    evaluation_strategy="steps",           # Evaluate the model every 'eval_steps'
    eval_steps=15,                        # Number of steps between evaluations
    save_strategy="steps",                 # Save the model every 'save_steps'
    save_steps=15,                        # Number of steps between saving the model
    per_device_train_batch_size=16,        # Batch size for training
    per_device_eval_batch_size=16,         # Batch size for evaluation
    num_train_epochs=3,                    # Number of training epochs
    seed=0,                                # Seed for reproducibility
    load_best_model_at_end=True            # Load the best model at the end of training
)

# The Trainer from the transformers library abstracts away the need to manually create DataLoader objects.


# Define Trainer
trainer = Trainer(
    model=model.to(device),                # Move the model to GPU if available
    args=args,                             # Pass the training arguments
    train_dataset=input_train,             # Training dataset
    eval_dataset=input_val,                # Evaluation dataset
    compute_metrics=compute_metrics,       # Metrics for evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping callback to stop training if no improvement
)

# Fine-tune the pre-trained BERT model
trainer.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
15,No log,0.647931,0.66,0.827586,0.45283,0.585366
30,No log,0.369284,0.85,0.88,0.830189,0.854369
45,No log,0.31113,0.9,0.921569,0.886792,0.903846
60,No log,0.318277,0.88,0.872727,0.90566,0.888889
75,No log,0.463904,0.82,0.769231,0.943396,0.847458


              precision    recall  f1-score   support

           0       0.59      0.89      0.71        47
           1       0.83      0.45      0.59        53

    accuracy                           0.66       100
   macro avg       0.71      0.67      0.65       100
weighted avg       0.72      0.66      0.64       100

              precision    recall  f1-score   support

           0       0.82      0.87      0.85        47
           1       0.88      0.83      0.85        53

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100

              precision    recall  f1-score   support

           0       0.88      0.91      0.90        47
           1       0.92      0.89      0.90        53

    accuracy                           0.90       100
   macro avg       0.90      0.90      0.90       100
weighted avg       0.90      0.90      0.90       100

              preci

TrainOutput(global_step=75, training_loss=0.3780446116129557, metrics={'train_runtime': 3766.8056, 'train_samples_per_second': 0.319, 'train_steps_per_second': 0.02, 'total_flos': 315733266432000.0, 'train_loss': 0.3780446116129557, 'epoch': 3.0})

In [17]:
len(input_train) # global_step=75 (above) makes sense because 400 samples divided into batches of 16 results in 75 batches. 

400

In [18]:
# Load Fine-tuned BERT and Run Prediction

# Load test data
X_test = list(sample_test["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
 
# Create torch dataset
test_dataset = Dataset(X_test_tokenized)
 
test_dataset.__getitem__(0)

{'input_ids': tensor([  101,  2004,  1045,  2031,  2056,  2077,  1010,  1045,  2572,  2025,
          1037,  5470,  1997,  1056, 28394,  3723,  1012,  2002,  1005,  1055,
          2074,  2061, 12943, 17643, 26477,  1045,  4299, 20016,  2052,  2074,
          4521,  2032,  1998,  2131,  2009,  2058,  2007,  1012,  1026,  7987,
          1013,  1028,  1026,  7987,  1013,  1028,  1999,  2023,  9476, 18230,
          2003, 11573,  2320,  2153,  1998,  2003,  2067,  2000,  8521,  2041,
          1997, 13044, 18484,  1012,  2002,  7516,  1037,  8592,  2911,  2975,
          3417,  3518,  1998,  7288,  2000,  6154,  7548,  2043,  2002,  8645,
          2015,  1056, 28394,  3723,  2003,  2028,  1997,  1996,  5467,  1012,
          1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,  2911,
          3640,  2019, 11706,  2754,  2005,  1996,  2206,  7632, 14642,  5705,
          1998, 18230,  9652,  5363,  2000,  4608,  1996, 15703,  4743,  1998,
          4468, 11915,  6799,  2791,  1

In [21]:
# Load trained model
model_path = "output/checkpoint-60"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
 
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [22]:
# Define test trainer
test_trainer = Trainer(model)
 
# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)
 
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

y_pred

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


array([1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,

In [23]:
y_true = sample_test.label.to_numpy()

y_true

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,

In [24]:
# Compute confusion matrix
conf_mat = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(conf_mat)


Confusion Matrix:
[[212  39]
 [ 27 222]]


In [25]:
# Compute classification report
class_report = classification_report(y_true, y_pred)
print('\nClassification Report:')
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.87       251
           1       0.85      0.89      0.87       249

    accuracy                           0.87       500
   macro avg       0.87      0.87      0.87       500
weighted avg       0.87      0.87      0.87       500

