In [1]:
# https://brainyx.co/journal/journal4/


# One of the most common uses of BERT is to download a model that has been pre-trained with a large amount of text and fine tuning it with a small amount of data. In this article, we will show you how to download a pre-trained model from hugginfface and fine tune it with sample code.


from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score 
from sklearn.metrics import precision_score, f1_score
 
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
 
import torch
import pandas as pd
import numpy as np

In [2]:
raw_datasets = load_dataset("imdb")
# raw_datasets = load_dataset("imdb", split=['train', 'test'])
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
pd.DataFrame(raw_datasets['train']) # 0 indicates a negative review and 1 indicates a positive review.

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [4]:
# Select Samples for Train and Test

sample_train_val = raw_datasets['train'].shuffle().select(range(0,500)).to_pandas() #2000
sample_test = raw_datasets['test'].shuffle().select(range(0,500)).to_pandas()

sample_train_val

Unnamed: 0,text,label
0,An extra is called upon to play a general in a...,1
1,"Please Don't hate me but i have to be honest, ...",0
2,"""GEORGE LOPEZ,"" in my opinion, is an absolute ...",1
3,Gillian Holroyd (Kim Novak) is a witch. Secret...,1
4,This is one of may all-time favourite films. P...,1
...,...,...
495,Only the Antichrist could have been behind suc...,0
496,"This movie has it all, action, fighting, danci...",1
497,These two men went thru hell and beyond and ha...,1
498,"America. A land of freedom, of hope and of dre...",0


In [5]:
# Define Pretrained Tokenizer and Model

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
}

In [6]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Preprocess Dataset
# Define a simple class inherited from torch dataset
# This is a custom dataset class for PyTorch, which is often used when you want to load your own data into a PyTorch model.

class Dataset(torch.utils.data.Dataset): # The class Dataset is inheriting from torch.utils.data.Dataset. This is a base class provided by PyTorch for representing a dataset.
    
    def __init__(self, encodings, labels=None): # The __init__ method is the initializer for the class. It takes as input encodings and labels. encodings are the encoded representations of your text data (for example, tokenized and converted to IDs), and labels are the corresponding labels for your data.
        self.encodings = encodings
        self.labels = labels
 
    def __getitem__(self, idx): # The __getitem__ method is used to get the data (encodings and labels) for a specific item in the dataset. idx is the index of the item. This method returns a dictionary where the keys are the names of the encoding components (like “input_ids”, “attention_mask”, etc.) and the values are the corresponding tensors. If labels are provided, a “labels” key is also added to the dictionary.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
 
    def __len__(self): # The __len__ method returns the number of items in the dataset
        return len(self.encodings["input_ids"])
 

# The item dictionary would look like this:
# {
#     'input_ids': tensor([101, 2001, 2019, 6207, 102]),
#     'token_type_ids': tensor([0, 0, 0, 0, 0]),
#     'attention_mask': tensor([1, 1, 1, 1, 1]),
#     'labels': tensor(0)
# }

In [8]:
sample_x = list(sample_train_val["text"])
sample_y = list(sample_train_val["label"])
 
sample_x[0], sample_y[0]

('An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.',
 1)

In [9]:
X_train, X_val, Y_train, Y_val = train_test_split(sample_x, sample_y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)


In [10]:
X_train_tokenized['input_ids'][0][:10]

[101, 2066, 4147, 1037, 2606, 3797, 1012, 13567, 1010, 7078]

In [11]:
X_train_tokenized['token_type_ids'][0][:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
X_train_tokenized['attention_mask'][0][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [13]:
input_train = Dataset(X_train_tokenized, Y_train)
input_val = Dataset(X_val_tokenized, Y_val)

In [14]:
input_train.__getitem__(0)

{'input_ids': tensor([  101,  2066,  4147,  1037,  2606,  3797,  1012, 13567,  1010,  7078,
          1010,  2302,  1037,  5192,  1997,  1037,  4797,  2028,  1997,  1996,
          5409,  5691,  2412,  1012,  5760,  8639,  1012,  5717,  3340,  2041,
          1997,  2702,  1012,  2028,  2146,  1010,  6945,  6313,  1010,  4450,
          2098,  1010,  3653,  6528, 20771,  1010,  2969,  1011,  9715,  1010,
          8900,  1010,  1998,  2599,  2368,  2135,  2840,  2100,  3496,  2044,
          2178,  1012,  3832,  2000,  2022,  3959, 10359,  1998,  8605,  6553,
          1010,  1996,  3969, 23485,  1010,  2009,  2003,  1010,  2612,  1010,
         22822,  9232, 14163,  4095,  1012,  1026,  7987,  1013,  1028,  1026,
          7987,  1013,  1028,  2431,  1011,  6248,  1010,  2269,  1998,  2365,
         24665, 17635,  1998,  7204,  2000,  2169,  2060,  2066, 10205,  1012,
          1000, 24004,  1011, 14253,  1000,  2003,  1996,  2391,  1010,  5189,
          1998,  3154,  1012,  7929,  1

In [16]:
# Define Evaluation Metrics

def compute_metrics(p):
    # pred, labels = p
    # pred = np.argmax(pred, axis=1)
    labels = p.label_ids
    pred = p.predictions.argmax(-1)
    print(classification_report(labels, pred))
 
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred) 
    f1 = f1_score(y_true=labels, y_pred=pred)
 
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [18]:
# Check if a GPU is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define Training Arguments for fine-tuning BERT
args = TrainingArguments(
    output_dir="output",                   # Directory to save the model and logs
    evaluation_strategy="steps",           # Evaluate the model every 'eval_steps'
    eval_steps=15,                        # Number of steps between evaluations
    save_strategy="steps",                 # Save the model every 'save_steps'
    save_steps=15,                        # Number of steps between saving the model
    per_device_train_batch_size=16,        # Batch size for training
    per_device_eval_batch_size=16,         # Batch size for evaluation
    num_train_epochs=3,                    # Number of training epochs
    seed=0,                                # Seed for reproducibility
    load_best_model_at_end=True            # Load the best model at the end of training
)

# The Trainer from the transformers library abstracts away the need to manually create DataLoader objects.


# Define Trainer
trainer = Trainer(
    model=model.to(device),                # Move the model to GPU if available
    args=args,                             # Pass the training arguments
    train_dataset=input_train,             # Training dataset
    eval_dataset=input_val,                # Evaluation dataset
    compute_metrics=compute_metrics,       # Metrics for evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping callback to stop training if no improvement
)

# Fine-tune the pre-trained BERT model
trainer.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss


TrainOutput(global_step=25, training_loss=0.6735159301757813, metrics={'train_runtime': 904.9081, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.028, 'total_flos': 105244422144000.0, 'train_loss': 0.6735159301757813, 'epoch': 1.0})

In [19]:
len(input_train) # global_step=25 (above) makes sense because 400 samples divided into batches of 16 results in 25 batches. 

400

In [25]:
# Load Fine-tuned BERT and Run Prediction

# Load test data
X_test = list(sample_test["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
 
# Create torch dataset
test_dataset = Dataset(X_test_tokenized)
 
test_dataset.__getitem__(0)

{'input_ids': tensor([  101,  2023,  3185,  2001,  2428,  5236,  1998,  1045,  2245,  2008,
          2009,  2347,  1005,  1056,  2061,  2919,  1998,  1045,  2071, 19242,
          1037,  3185,  2055,  1037,  2793,  5983,  2111,  1012,  2059,  1996,
          2112,  2379,  1996,  2203,  2073,  1996,  3124,  2038, 13526,  2398,
          3092,  2039,  2108,  1996,  9115,  2006,  2327,  1997,  1037,  2919,
          3185,  1012,  1045,  2071,  2156,  1996, 11224,  2015,  1999,  1996,
          6081, 13526,  2398,  2005, 15003,  8739,  2015,  1012,  1996,  2567,
          2001,  2145,  4142,  1998,  3048,  2043,  2010,  2398,  2020,  6436,
          5944,  1012,  1996,  6057,  2518,  2001,  2008,  2002,  2071,  2145,
          2693,  2010,  2398,  2008,  2001,  2074,  2025,  2157,  1012,  2302,
          6650,  1010,  2017,  2428,  2064,  1005,  1056,  2693,  2115,  2398,
          2021,  2002,  2106,  1012,  1996,  2567,  2323,  2031, 23919,  2000,
          2331,  2130,  2077,  2002,  2

In [26]:
# Load trained model
model_path = "output/checkpoint-1"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
 
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [27]:
# Define test trainer
test_trainer = Trainer(model)
 
# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)
 
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

y_pred

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [28]:
y_true = sample_test.label.to_numpy()

y_true

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,

In [29]:
# Compute confusion matrix
conf_mat = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(conf_mat)


Confusion Matrix:
[[268   0]
 [232   0]]


In [30]:
# Compute classification report
class_report = classification_report(y_true, y_pred)
print('\nClassification Report:')
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70       268
           1       0.00      0.00      0.00       232

    accuracy                           0.54       500
   macro avg       0.27      0.50      0.35       500
weighted avg       0.29      0.54      0.37       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
