In [1]:
# https://brainyx.co/journal/journal4/


# One of the most common uses of BERT is to download a model that has been pre-trained with a large amount of text and fine tuning it with a small amount of data. In this article, we will show you how to download a pre-trained model from hugginfface and fine tune it with sample code.


from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score 
from sklearn.metrics import precision_score, f1_score
 
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
 
import torch
import pandas as pd
import numpy as np

In [2]:

raw_datasets = load_dataset("imdb")
# raw_datasets = load_dataset("imdb", split=['train', 'test'])
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
pd.DataFrame(raw_datasets['train'])

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [4]:
# Select Samples for Train and Test

sample_train_val = raw_datasets['train'].shuffle().select(range(0,2000)).to_pandas()
sample_test = raw_datasets['test'].shuffle().select(range(0,500)).to_pandas()

sample_train_val

Unnamed: 0,text,label
0,*** REVIEW MAY CONTAIN SOME SPOILERS *** I'll ...,0
1,this was a real guilt pleasure ... i saw the t...,1
2,I first saw this as a child living in East Lon...,1
3,It's interesting that all who (so far) seemed ...,0
4,"Jeopardy is a tense, satisying thriller, a cut...",1
...,...,...
1995,This movie re-wrote film history in every way....,1
1996,"Two women, sick of their controlling husbands,...",1
1997,If you see this turkey listed in your TV guide...,0
1998,"Brutal, emotionless Michael Myers stabs his si...",1


In [5]:
# Define Pretrained Tokenizer and Model

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
}

In [6]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Preprocess Dataset
# Define a simple class inherited from torch dataset
# This is a custom dataset class for PyTorch, which is often used when you want to load your own data into a PyTorch model.

class Dataset(torch.utils.data.Dataset): # The class Dataset is inheriting from torch.utils.data.Dataset. This is a base class provided by PyTorch for representing a dataset.
    
    def __init__(self, encodings, labels=None): # The __init__ method is the initializer for the class. It takes as input encodings and labels. encodings are the encoded representations of your text data (for example, tokenized and converted to IDs), and labels are the corresponding labels for your data.
        self.encodings = encodings
        self.labels = labels
 
    def __getitem__(self, idx): # The __getitem__ method is used to get the data (encodings and labels) for a specific item in the dataset. idx is the index of the item. This method returns a dictionary where the keys are the names of the encoding components (like “input_ids”, “attention_mask”, etc.) and the values are the corresponding tensors. If labels are provided, a “labels” key is also added to the dictionary.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
 
    def __len__(self): # The __len__ method returns the number of items in the dataset
        return len(self.encodings["input_ids"])
 

# The item dictionary would look like this:
# {
#     'input_ids': tensor([101, 2001, 2019, 6207, 102]),
#     'token_type_ids': tensor([0, 0, 0, 0, 0]),
#     'attention_mask': tensor([1, 1, 1, 1, 1]),
#     'labels': tensor(0)
# }

In [8]:
sample_x = list(sample_train_val["text"])
sample_y = list(sample_train_val["label"])
 
sample_x[0], sample_y[0]

('*** REVIEW MAY CONTAIN SOME SPOILERS *** I\'ll make this review short and sweet. I bought this movie from Best Buy because it sounded interested and had some top actors in it like Kevin Spacey and Morgan Freeman. How bad could it be, right? Well, it\'s pretty bad. Justin Timberlake plays Pollack, a wannabe journalist who stumbles across a case that may lead to corrupt cops at Edison\'s Police Force. LL Cool J is Deed, a cop within the force on a special force team called F.R.A.T. (First Response Assault Tactics). He\'s teamed with an "on-the-edge" bad cop named Lazerov (Dylan McDermott). In the opening scene we see Lazerov & Deed taking on some bank robbers, but at night they are busting a couple of guys doing drugs. I don\'t want to give to much away, but things turn bad for the guys doing the drugs. Pollack, who works for Ashford (Morgan Freeman) goes to a trial involving Deeds & Lazerov. He suspect foul play and with the help of Ashford, does some investigate that turns ugly. Wall

In [9]:
X_train, X_val, Y_train, Y_val = train_test_split(sample_x, sample_y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)


In [10]:
X_train_tokenized['input_ids'][0][:10]

[101, 2956, 1999, 3982, 1005, 1055, 1996, 2522, 2497, 8545]

In [11]:
X_train_tokenized['token_type_ids'][0][:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [12]:
X_train_tokenized['attention_mask'][0][:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [13]:
input_train = Dataset(X_train_tokenized, Y_train)
input_val = Dataset(X_val_tokenized, Y_val)

In [14]:
input_train.__getitem__(0)

{'input_ids': tensor([  101,  2956,  1999,  3982,  1005,  1055,  1996,  2522,  2497,  8545,
          2497,  2003,  2019,  2035,  2732,  3459,  7478,  2013,  4333,  3898,
          8003, 19344, 21025,  4095,  2000,  5889,  2996,  4013, 17487,  6294,
          2358,  8180,  4059,  1012,  2275,  2012,  2019,  7262, 13691,  2902,
          1010,  2054,  2003,  2023,  3185,  2055,  2017,  4687,  1012,  1012,
          1012,  1012,  1012,  1012,  2152,  3689,  1029,  3460,  1004,  5776,
          6550,  1029,  5213,  7242,  3949,  1029,  2053,  1010,  2023, 22912,
          2121,  2003,  2055,  2040,  3599,  2097,  2131,  2000,  4060,  1996,
         23641,  3111,  2005,  1037, 13691,  2902,   999,  2017,  2228,  1045,
          1005,  1049, 12489,  1029,  2017,  2180,  1005,  1056,  2903,  2115,
          2159,  2004,  2017,  1005,  2128,  3666,  2023, 23653,  9994,  2008,
          2001,  2357,  2046,  1037,  3185,   999,  6555,  2132, 22802,  2852,
          1012, 11338, 16402,  1006,  2

In [15]:
# Define Evaluation Metrics

def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    print(classification_report(labels, pred))
 
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred) 
    f1 = f1_score(y_true=labels, y_pred=pred)
 
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1_score}


In [16]:
# Fine-tune BERT

# Define Training Arguments
args = TrainingArguments( # Hugging Face class used to define the settings for training a model.
    output_dir="models", # directory where the model and training/evaluation logs are saved.
    evaluation_strategy="steps", # The model will be evaluated every eval_steps.
    eval_steps=100, # The model will be evaluated every 100 steps.
    # The batch size for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1, # number of times the training loop will iterate over all the training data (they were 2 epochs before).
    seed=0, # random seed for reproducibility.
    load_best_model_at_end=True, # The best model according to the evaluation metric will be loaded at the end of training.
)
 
# Define Trainer
trainer = Trainer( # class provided by Hugging Face for training Transformer models
    model=model,
    args=args,
    train_dataset=input_train,
    eval_dataset=input_val,
    compute_metrics=compute_metrics, # you don’t need to worry about passing the predictions and labels to compute_metrics, as an argument. The Trainer takes care of this for you. The p in compute_metrics(p) will be a tuple where the first element is the predictions and the second element is the labels. These are automatically provided by the Trainer during evaluation.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # stop training if the evaluation metric does not improve for 3 evaluation steps.
)
 
# Fine-tune pre-trained BERT
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,0.345854,0.85,0.784141,0.941799,
200,No log,0.283729,0.8875,0.928571,0.825397,
300,No log,0.424361,0.905,0.887179,0.915344,
400,No log,0.393441,0.905,0.887179,0.915344,


Trainer is attempting to log a value of "<function f1_score at 0x000001D40A053F70>" of type <class 'function'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


              precision    recall  f1-score   support

           0       0.94      0.77      0.84       211
           1       0.78      0.94      0.86       189

    accuracy                           0.85       400
   macro avg       0.86      0.85      0.85       400
weighted avg       0.86      0.85      0.85       400



Trainer is attempting to log a value of "<function f1_score at 0x000001D40A053F70>" of type <class 'function'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


              precision    recall  f1-score   support

           0       0.86      0.94      0.90       211
           1       0.93      0.83      0.87       189

    accuracy                           0.89       400
   macro avg       0.89      0.88      0.89       400
weighted avg       0.89      0.89      0.89       400



Trainer is attempting to log a value of "<function f1_score at 0x000001D40A053F70>" of type <class 'function'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


              precision    recall  f1-score   support

           0       0.92      0.90      0.91       211
           1       0.89      0.92      0.90       189

    accuracy                           0.91       400
   macro avg       0.90      0.91      0.90       400
weighted avg       0.91      0.91      0.91       400



Trainer is attempting to log a value of "<function f1_score at 0x000001D40A053F70>" of type <class 'function'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


              precision    recall  f1-score   support

           0       0.92      0.90      0.91       211
           1       0.89      0.92      0.90       189

    accuracy                           0.91       400
   macro avg       0.90      0.91      0.90       400
weighted avg       0.91      0.91      0.91       400



TrainOutput(global_step=400, training_loss=0.296733512878418, metrics={'train_runtime': 55360.4223, 'train_samples_per_second': 0.058, 'train_steps_per_second': 0.007, 'total_flos': 841955377152000.0, 'train_loss': 0.296733512878418, 'epoch': 2.0})

In [21]:
# The number of global steps is determined by the number of batches in your training data and the number of epochs you train for. 

# You have set num_train_epochs=2, which means the entire training dataset will be passed through the model twice.

# The per_device_train_batch_size=8 means that 8 samples from your training data will be fed into the model at a time (this is one batch)

# So, if you have 400 global steps, this means you have 400 / num_train_epochs = 200 batches of data in your training dataset

# In other words, your training dataset must have 200 * per_device_train_batch_size = 1600 samples

len(input_train)

# The eval_steps=100 means that the model’s performance is evaluated every 100 steps

# So, if you evaluate every 100 steps, the first evaluation will happen after the model has seen the first 100 batches, the second evaluation after 200 batches, and so on. These evaluations tell us how well the model is learning as it sees more data, even though it hasn’t seen the entire dataset yet.

1600

In [24]:
# Load Fine-tuned BERT and Run Prediction

# Load test data
X_test = list(sample_test["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
 
# Create torch dataset
test_dataset = Dataset(X_test_tokenized)
 
test_dataset.__getitem__(0)

{'input_ids': tensor([  101,  6583,  6776,  1037,  2062,  6851,  3319,  2064,  2022,  4663,
          5973,  2842,  1999,  1996,  4773,  1012,  2023,  2028,  2003,  1037,
          2204, 13954,  1010,  2348,  1045,  2079,  2025,  5993,  2007,  2009,
          4498,  1012, 10225,  2078,  2003,  1037,  3293,  3124,  1998,  6516,
          5328,  2013,  2010,  6466,  2069,  1012,  1026,  7987,  1013,  1028,
          1026,  7987,  1013,  1028,  8223,  2175, 12952, 13075,  2863,  1006,
          1054,  2290,  2615,  1010,  3849,  2066,  1037,  2576,  2283,  1007,
          2038,  2580,  1037,  8348,  1999,  2846, 10581,  1010,  2061,  2023,
          2028,  3849,  2066,  1037,  5122,  9185,  1997,  2008,  2028,  1999,
          2070,  3033,  1012,  1012,  1012,  1045,  1005,  1049,  2025,  2130,
          2183,  2000, 12826, 24471,  4328,  2721,  1998, 14405, 11077,  1012,
          1012,  2119,  2024,  2204,   999,  1996,  2280,  2038,  2488,  3772,
          5848,  1010,  2348,  1996,  3

In [25]:
# Load trained model
model_path = "models/checkpoint-100"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
 
model

OSError: models/checkpoint-100 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# Define test trainer
test_trainer = Trainer(model)
 
# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)
 
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)