In [26]:
# https://brainyx.co/journal/journal4/


# One of the most common uses of BERT is to download a model that has been pre-trained on a large amount of text and fine-tune it with a small amount of data. In this article, we show how to download a pre-trained model from Hugging Face and fine-tune it, with sample code.


from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score 
from sklearn.metrics import precision_score, f1_score
 
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
 
import torch
import pandas as pd
import numpy as np

In [2]:

# Load the IMDB dataset by name; the printed summary lists the available splits,
# their features and their row counts.
raw_datasets = load_dataset("imdb")
# Alternative kept for reference: request only the listed splits.
# raw_datasets = load_dataset("imdb", split=['train', 'test'])
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# Preview the training split as a DataFrame. `to_pandas()` is the conversion this
# notebook already uses when sampling, and it avoids building the frame by
# iterating over every row as a Python dict.
raw_datasets['train'].to_pandas()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [4]:
# Select samples for train/validation and test.
# A fixed shuffle seed keeps the sampled subsets identical across kernel restarts;
# the value matches seed=0 passed to TrainingArguments.
SAMPLE_SEED = 0
N_SAMPLES = 500  # small subset for a quick demo; raise (e.g. to 2000) for a longer run

sample_train_val = (
    raw_datasets['train'].shuffle(seed=SAMPLE_SEED).select(range(N_SAMPLES)).to_pandas()
)
sample_test = (
    raw_datasets['test'].shuffle(seed=SAMPLE_SEED).select(range(N_SAMPLES)).to_pandas()
)

sample_train_val

Unnamed: 0,text,label
0,Tony Scott has never been a very good director...,0
1,I've noticed over the years that when a rock s...,0
2,"Being a music student myself, I thought a movi...",0
3,This film was on last week and although at tha...,1
4,I actually quite enjoyed this show. Even as a ...,1
...,...,...
495,This video was my first exposure to Eddie Izza...,1
496,"""After World War I, an expedition representing...",0
497,"""The Garden of Allah"" is a prime example of ""p...",1
498,We all know what's like when we have a bad day...,0


In [5]:
# Define Pretrained Tokenizer and Model

# Name of the pretrained checkpoint; the same name is reused when the
# classification model is loaded.
model_name = "bert-base-uncased"
# Load the tokenizer published under that checkpoint name.
tokenizer = BertTokenizer.from_pretrained(model_name)
# Display the tokenizer configuration (vocabulary size, max length, special tokens).
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
}

In [6]:
# Load pretrained BERT with a 2-label sequence-classification head. The load log
# reports that the classifier weights are newly initialized, so the model has to be
# fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Preprocess Dataset
# Define a simple class inherited from torch dataset
# This is a custom dataset class for PyTorch, which is often used when you want to load your own data into a PyTorch model.

class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from encoding name (it must contain "input_ids") to an
            indexable collection holding one entry per example.
        labels: Optional indexable collection with one label per example
            (list, NumPy array, tensor, ...). Omit it for unlabeled data such as
            a prediction-only test set.
    """

    def __init__(self, encodings, labels=None):
        """Store the encodings and the optional labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per key in ``encodings`` plus a "labels" tensor
        when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:`. Truthiness of a
        # multi-element NumPy array or tensor raises "truth value is ambiguous".
        # An empty label collection is still treated as "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" encoding."""
        return len(self.encodings["input_ids"])
 

# The item dictionary would look like this:
# {
#     'input_ids': tensor([101, 2001, 2019, 6207, 102]),
#     'token_type_ids': tensor([0, 0, 0, 0, 0]),
#     'attention_mask': tensor([1, 1, 1, 1, 1]),
#     'labels': tensor(0)
# }

In [8]:
# Pull the review texts and their labels out of the sampled frame as plain lists.
sample_x = sample_train_val["text"].tolist()
sample_y = sample_train_val["label"].tolist()

# Show the first (text, label) pair as a sanity check.
(sample_x[0], sample_y[0])

('Tony Scott has never been a very good director, but every film he\'s made after "Crimson Tide" seems to bring him one step closer to being the inarguable worst working today (Michael Bay may fall into the same category, but at least his big, dumb, delusional epics entertain on some primally perverse level). And like other overblown Hollywood biopics ("De-Lovely" and "Confessions of a Dangerous Mind," for instance) chronicling the lives of pretentious, overrated, or outright shallow ciphers given an aura of "mystique" by a society that thrives on the juicy behind-the-scenes details, "Domino" is a film that begins with little potential, and dashes that infinitesimal amount before the sixty-minute mark. With an already-distended running time of 128 minutes, the film feels twice as long, and spending time with characters this obnoxiously superficial and forgettable (unlike the superior "Rules of Attraction," Scott\'s attempts to tinge the proceedings with irony via Domino\'s smug, self-a

In [9]:
# Hold out 20% of the sampled reviews for validation. random_state pins the split so
# the train/validation membership is the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    sample_x, sample_y, test_size=0.2, random_state=0
)
# Tokenize each split: pad within the split and truncate reviews to at most 512 tokens
# (the tokenizer's model_max_length).
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)


In [10]:
# First 10 token ids of the first training example; 101 is the id of [CLS] in this
# tokenizer's vocabulary.
X_train_tokenized['input_ids'][0][:10]

[101, 1996, 2087, 13576, 1997, 1996, 21014, 4942, 1011, 11541]

In [11]:
# First 10 token_type_ids of the first training example.
X_train_tokenized['token_type_ids'][0][:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
# First 10 attention_mask entries of the first training example.
X_train_tokenized['attention_mask'][0][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [13]:
# Wrap the tokenized splits together with their labels in the custom Dataset class;
# these two objects are what the Trainer receives as train and eval datasets.
input_train = Dataset(X_train_tokenized, Y_train)
input_val = Dataset(X_val_tokenized, Y_val)

In [14]:
# Inspect the first training item (dict of tensors, including its label).
input_train[0]

{'input_ids': tensor([  101,  1996,  2087, 13576,  1997,  1996, 21014,  4942,  1011, 11541,
          2003,  1996,  2061,  2170,  1000,  1996,  2307,  6456,  1997,  1996,
          2882, 26458, 26745,  2140,  1000,  1024,  3409,  5469,  3152,  2029,
          4117,  2058,  1011,  1996,  1011,  2327, 11463,  7716, 14672,  2007,
          7788, 16959,  2015,  1998,  2467,  5652,  2011, 28223,  1998,  2471,
          6404,  3883,  2013,  5365,  3585,  2287,  1999,  4895, 10258, 20097,
          2075,  4395,  1997,  2593,  2146,  6114,  5694,  2030,  8040,  9910,
          8450,  4763, 14601,  3111,  1012,  2023,  6907,  3024,  2068,  2007,
          2019,  5866,  3772, 13398,  2008,  3039,  2358, 22134,  2037,  4933,
          2006,  1996,  3898,  2320,  2153,  1998,  2663,  2047,  8213,  1997,
          4599,  2012, 10961,  1997,  2037,  1043, 10278, 25373,  4871,  2013,
          7483,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,
          1000,  2054,  1005,  1055,  1

In [15]:
# Define Evaluation Metrics

def compute_metrics(p):
    """Score one evaluation pass.

    Reads the true labels from ``p.label_ids`` and takes the arg-max over the last
    axis of ``p.predictions`` as the predicted class. Prints a classification
    report as a side effect and returns a dict with the keys "accuracy",
    "precision", "recall" and "f1".
    """
    y_true = p.label_ids
    y_hat = p.predictions.argmax(-1)
    print(classification_report(y_true, y_hat))

    # Same call convention for every scorer, so drive them from one table.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=y_true, y_pred=y_hat) for name, scorer in scorers.items()}


In [16]:
# Fine-tune BERT

# Define Training Arguments
args = TrainingArguments( # Hugging Face class that holds the training settings.
    output_dir="output", # directory where checkpoints and training/evaluation logs are written.
    evaluation_strategy="steps", # evaluate on a step schedule, every eval_steps steps.
    eval_steps=40, # run evaluation every 40 training steps.
    save_strategy="steps",
    save_steps=40, # save a checkpoint every 40 steps, the same cadence as evaluation. NOTE(review): load_best_model_at_end presumably needs the save and eval schedules to line up; confirm in the TrainingArguments docs.
    # The batch size for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1, # number of full passes over the training data (previously 2).
    seed=0, # random seed for reproducibility.
    load_best_model_at_end=True, # reload the best checkpoint when training finishes. No metric_for_best_model is set, so the library default decides what "best" means (presumably evaluation loss; confirm).
)
 
# Define Trainer
trainer = Trainer( # Hugging Face class that runs the training/evaluation loop.
    model=model,
    args=args,
    train_dataset=input_train,
    eval_dataset=input_val,
    compute_metrics=compute_metrics, # called by the Trainer at each evaluation; the argument it passes exposes .predictions and .label_ids, which is how compute_metrics reads it.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # stop early once the tracked metric has failed to improve for 3 evaluations (presumably consecutive; confirm).
)
 
# Fine-tune pre-trained BERT
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
40,No log,0.374241,0.83,0.947368,0.705882,0.808989


              precision    recall  f1-score   support

           0       0.76      0.96      0.85        49
           1       0.95      0.71      0.81        51

    accuracy                           0.83       100
   macro avg       0.85      0.83      0.83       100
weighted avg       0.85      0.83      0.83       100



TrainOutput(global_step=50, training_loss=0.524146728515625, metrics={'train_runtime': 933.5043, 'train_samples_per_second': 0.428, 'train_steps_per_second': 0.054, 'total_flos': 105244422144000.0, 'train_loss': 0.524146728515625, 'epoch': 1.0})

In [17]:
# The number of global steps is determined by the number of batches in the training data and the number of epochs.

# The training cell sets num_train_epochs=1, so the training set is passed through the model once.

# per_device_train_batch_size=8 means 8 samples are fed into the model at a time (one batch).

# The training output reports global_step=50; with 1 epoch that is 50 batches per epoch.

# So the training set should hold 50 * per_device_train_batch_size = 400 samples (assuming a single device; TODO confirm). The length below checks this.

len(input_train)

# eval_steps=40 means the model is evaluated every 40 steps.

# With only 50 steps in total, a single evaluation happens (at step 40), before the model has finished the epoch. That is why the metrics table printed during training has only one row.

400

In [18]:
# Load Fine-tuned BERT and Run Prediction

# Load test data
X_test = sample_test["text"].tolist()
X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)

# Wrap the encodings without labels: this dataset is only used for prediction.
test_dataset = Dataset(X_test_tokenized)

# Inspect the first test item.
test_dataset[0]

{'input_ids': tensor([  101,  2019,  7172,  2544,  1997,  1037,  4323,  2029,  2038,  2042,
          2589,  2077,  1012,  2096,  2008,  1999,  1998,  1997,  2993,  2003,
          2025,  2919,  1010,  2023,  3185,  2987,  1005,  1056,  3362,  1996,
          3614,  2066,  1996,  2060,  1000, 16112,  1998,  5760,  1000,  4763,
          3924,  2079,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
          1028, 21425,  1010, 12479,  3535,  2008,  4212,  2460,  1997,  1996,
          2928,  1012,  2025,  4276,  3564,  2083,  2005,  1996,  5458,  9530,
         18886,  7178,  4566,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [19]:
# Load trained model
# Use the checkpoint the Trainer recorded as best instead of a hard-coded step number,
# so this cell keeps working when eval_steps / save_steps or the sample size change.
# Fall back to the original path if no best checkpoint was recorded.
model_path = trainer.state.best_model_checkpoint or "output/checkpoint-40"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
# Define test trainer
test_trainer = Trainer(model=model)

# Run inference; only the raw per-class scores are needed from the prediction result.
raw_pred = test_trainer.predict(test_dataset).predictions

# Predicted class = index of the highest score in each row.
y_pred = raw_pred.argmax(axis=1)

y_pred

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,

In [25]:
# Ground-truth labels of the sampled test reviews, in the same order as the predictions.
y_true = sample_test["label"].to_numpy()

y_true

array([0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,

In [27]:
# Compute the confusion matrix of the test predictions against the true labels.
# Arguments are passed as (y_true, y_pred); rows index the true class and columns
# the predicted class (the row sums equal the per-class support in the report).
conf_mat = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(conf_mat)


Confusion Matrix:
[[248   7]
 [ 81 164]]


In [28]:
# Compute the per-class precision / recall / F1 report for the test predictions.
class_report = classification_report(y_true, y_pred)
# The leading newline separates the heading from any earlier output in this cell.
print('\nClassification Report:')
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.97      0.85       255
           1       0.96      0.67      0.79       245

    accuracy                           0.82       500
   macro avg       0.86      0.82      0.82       500
weighted avg       0.85      0.82      0.82       500

