In [1]:

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

# # to avoid OutOfMemoryError

In [2]:
# We know how fine-tuning with NSP and MLM works, but how exactly do we apply that in code?

from transformers import BertTokenizer, BertForPreTraining, BertModel, DistilBertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')


with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(len(text))
text[:3]

507


['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [4]:
'''
To prepare our data for NSP, we need to create a mix of non-random sentences (where the two sentences were originally together) — and random sentences.
For this, we’ll create a bag of sentences extracted from text which we can then randomly select a sentence from when creating a random NotNextSentence pair.
'''
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)
bag_size

1372

In [5]:
text[14]

'From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining. I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious. He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved. I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man. He had also the

In [6]:
bag[14:19]

['From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining',
 ' I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious',
 ' He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved',
 ' I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man',
 

In [7]:
'''
After creating our bag we can go ahead and create our 50/50 random/non-random NSP training data. For this, we will create a list of sentence As, sentence Bs, and their respective IsNextSentence or NotNextSentence labels.
'''

'\nAfter creating our bag we can go ahead and create our 50/50 random/non-random NSP training data. For this, we will create a list of sentence As, sentence Bs, and their respective IsNextSentence or NotNextSentence labels.\n'

In [8]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [9]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

0
 I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious
---
 He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved

0
 His secrets were not but very few and very rare, and these only about public matters; and he showed prudence and economy in the exhibition of the public spectacles and the construction of public buildings, his donations to the people, and in such things, for he was a man who looked to what ought to be done, not to the reputation which is got by a man's acts
---
 He did not take the bath at un

In [10]:
# We can now tokenize our data. As is typical with BERT models, we truncate/pad our sequences to a length of 512 tokens.

In [11]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', #PyTorch tensors
                   max_length=512, truncation=True, padding='max_length')

In [12]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
'''
 Because we tokenized two sentences, our tokenizer automatically applied 0 values to sentence A and 1 values to sentence B in the token_type_ids tensor
 In the input_ids tensor, the tokenizer automatically placed a SEP token (102) between these two sentences — marking the boundary between them both.
'''

inputs

{'input_ids': tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
        [  101,  2010,  7800,  ...,     0,     0,     0],
        [  101,  2582,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [14]:
# Our NSP labels must be placed within a tensor called next_sentence_label
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs.next_sentence_label[:10]

tensor([[0],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1]])

In [15]:
# For MLM we need to clone our current input_ids tensor to create a MLM labels tensor — then we move onto masking ~15% of tokens in the input_ids tensor.
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [16]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.9443, 0.9597, 0.9078,  ..., 0.1551, 0.3861, 0.8601],
        [0.0504, 0.3356, 0.1775,  ..., 0.0284, 0.2632, 0.6820],
        [0.6033, 0.3898, 0.1229,  ..., 0.3231, 0.8703, 0.2570],
        ...,
        [0.8861, 0.5028, 0.2513,  ..., 0.8347, 0.7820, 0.6361],
        [0.5911, 0.6831, 0.8805,  ..., 0.4685, 0.8818, 0.0522],
        [0.0803, 0.3528, 0.6854,  ..., 0.0636, 0.2178, 0.6776]])

In [17]:
# create mask array
# we are ensuring that we don’t mask any special tokens — such as CLS (101), SEP (102), and PAD (0) tokens.

mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_arr.shape)
mask_arr

torch.Size([317, 512])


tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [18]:
# The inputs.input_ids is a 2D tensor that contains the tokenized versions of your input sentences. Each row corresponds to a sentence, and each column corresponds to a token in that sentence.

inputs.input_ids.shape

torch.Size([317, 512])

In [19]:
print(type(mask_arr[0].nonzero()))
mask_arr[0].nonzero()

<class 'torch.Tensor'>


tensor([[  5],
        [  6],
        [ 11],
        [ 12],
        [ 16],
        [ 19],
        [ 24],
        [ 37],
        [ 45],
        [ 53],
        [ 78],
        [ 92],
        [ 97],
        [102],
        [104]])

In [20]:
torch.flatten(mask_arr[0].nonzero())

tensor([  5,   6,  11,  12,  16,  19,  24,  37,  45,  53,  78,  92,  97, 102,
        104])

In [21]:
torch.flatten(mask_arr[0].nonzero()).tolist()

[5, 6, 11, 12, 16, 19, 24, 37, 45, 53, 78, 92, 97, 102, 104]

In [22]:
# And now take the indices of each True value within each vector.
# Flattening the tensor with torch.flatten() turns it into a 1D tensor, and then .tolist() converts this tensor to a Python list.

selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
    
print(len(selection))
selection[1]

317


[4, 5, 9, 25, 32, 33, 61, 69, 70, 73, 76, 77, 87, 104, 109, 110, 122]

In [23]:
# Then apply these indices to each row in input_ids, assigning each value at these indices a value of 103.
# The number 103 corresponds to the special [MASK] token in BERT and some other transformer-based models. 

for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [24]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [25]:
inputs.input_ids[0]

tensor([  101,  1045,  5159,  2008,  7955,   103,   103,  2002,  2245,  2004,
         2002,   103,   103,  1998,  2008,  1999,   103,  2008,  2002,   103,
         2002,  2196,  2018,  2151,   103,  6808,  1025,  1998,  2002,  2196,
         3662, 21606,  1998,  4474,  1010,  1998,  2001,   103,  1999,  1037,
         9241,  1010,  1998,  2196,  2404,   103,  2725,  1037,  2518,  1010,
         4496,  2001,  2566,   103,  2098,  4496,  2139, 24455,  1010,  4496,
         2106,  2002,  2412,  4756,  2000, 14249,  2010,  2310, 18684,  3508,
         1010,  4496,  1010,  2006,  1996,  2060,  2192,  1010,   103,  2002,
         2412, 13459,  2030, 10027,   102,  2002,  2001, 17730,  2000,  2079,
         4490,  1997,   103, 12879,  6610,  5897,  1010,   103,  2001,  3201,
         2000,  9641,   103,  1998,   103,  2489,  2013,  2035,  6270,  9021,
         1025,  1998,  2002,  3591,  1996,  3311,  1997,  1037,  2158,  2040,
         2071,  2025,  2022, 18356,  2013,  2157,  2738,  2084, 

In [26]:
'''
Dataloader
All of our input and label tensors are ready — all we need to do now is format them into a PyTorch dataset object so that it can be loaded into a PyTorch Dataloader — which will feed batches of data into our model during training.
'''

'\nDataloader\nAll of our input and label tensors are ready — all we need to do now is format them into a PyTorch dataset object so that it can be loaded into a PyTorch Dataloader — which will feed batches of data into our model during training.\n'

In [27]:
# We create a PyTorch dataset from our data.

'''
This is a custom PyTorch Dataset class named OurDataset. It’s designed to handle the encodings (or preprocessed input data) for a language model like BERT. Here’s a brief explanation of its methods:

__init__(self, encodings): This is the initializer method that’s called when you create a new instance of OurDataset. It takes one argument, encodings, which should be a dictionary containing the preprocessed input data. This dictionary is stored in the instance variable self.encodings.

__getitem__(self, idx): This method is used to get the item at a specific index, idx. It returns a dictionary where each value is a tensor containing the data for one input feature (like input IDs, attention mask, etc.) at the given index. This method is called when you access an item in the dataset like this: dataset[i].

__len__(self): This method returns the number of items in the dataset. It’s implemented by returning the length of the input_ids in self.encodings, assuming that all input features have the same length. This method is called when you use the len() function on the dataset: len(dataset).

This class allows PyTorch to handle the dataset in a way that’s optimized for machine learning tasks. You can use it with a DataLoader to easily generate batches of data for training or evaluation.
'''

class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [28]:
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

In [29]:
# Initialize our data using the OurDataset class.

dataset = OurDataset(inputs)

In [30]:
'''
if you have a simple use case and don’t want to create a custom subclass, you might consider using torch.utils.data.TensorDataset. This is a utility class that wraps tensors into a dataset. For example, if inputs is a tensor of input features and labels is a tensor of labels, you can create a dataset like this:

dataset = torch.utils.data.TensorDataset(inputs, labels)

In the custom case above, the dataloader expects the __len__ method for checking the total number of samples within our dataset, and the __getitem__ method for extracting samples.
'''

'\nif you have a simple use case and don’t want to create a custom subclass, you might consider using torch.utils.data.TensorDataset. This is a utility class that wraps tensors into a dataset. For example, if inputs is a tensor of input features and labels is a tensor of labels, you can create a dataset like this:\n\ndataset = torch.utils.data.TensorDataset(inputs, labels)\n\nIn the custom case above, the dataloader expects the __len__ method for checking the total number of samples within our dataset, and the __getitem__ method for extracting samples.\n'

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [32]:
model = model.to(device)
model.device

device(type='cuda', index=0)

In [33]:
# Free up GPU memory: Make sure you’re freeing up GPU memory whenever possible. You can do this by calling torch.cuda.empty_cache() to release cache that PyTorch is holding onto.

torch.cuda.empty_cache() # batch size reduced as well

In [35]:
from tqdm import tqdm  # for our progress bar
import torch.optim as optim

# And initialize the dataloader, which we'll be using to load our data into the model during training.
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop: # The inner loop iterates over the DataLoader object (loader), which yields batches of data.
        # initialize calculated gradients (from prev step)
        optimizer.zero_grad() # For each batch, it first zeros out any previously calculated gradients.
        # pull all tensor batches required for training
        # It then moves all the tensor batches to the device where the computations will be performed (usually a GPU if available).
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        # Forward Pass: The model processes the inputs and returns the outputs.
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids, # not used in DistilBertForSequenceClassification
                        next_sentence_label=next_sentence_label, # not used in DistilBertForSequenceClassification
                        labels=labels)
        # extract loss from the outputs
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update (backward propagation)
        loss.backward()
        # update parameters, based on the calculated gradients
        optimizer.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:29<00:00,  1.35it/s, loss=2.29]
Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:33<00:00,  1.20it/s, loss=1.72]
