In [106]:

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

# # to avoid OutOfMemoryError

In [107]:
# We know how fine-tuning with NSP and MLM works, but how exactly do we apply that in code?

from transformers import BertTokenizer, BertForPreTraining, BertModel, DistilBertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')


with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')

In [109]:
print(len(text))
text[:3]

507


['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [110]:
'''
To prepare our data for NSP, we need to create a mix of non-random sentences (where the two sentences were originally together) — and random sentences.
For this, we’ll create a bag of sentences extracted from text which we can then randomly select a sentence from when creating a random NotNextSentence pair.
'''
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)
bag_size

1372

In [111]:
bag[14] # just the first sentence (before the ".") in text

'From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining'

In [112]:
text[14]

'From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining. I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious. He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved. I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man. He had also the

In [113]:
bag[14:19]

['From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining',
 ' I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious',
 ' He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved',
 ' I observed, too, that no man could ever think that he was despised by Maximus, or ever venture to think himself a better man',
 

In [114]:
'''
After creating our bag we can go ahead and create our 50/50 random/non-random NSP training data. For this, we will create a list of sentence As, sentence Bs, and their respective IsNextSentence or NotNextSentence labels.
'''

'\nAfter creating our bag we can go ahead and create our 50/50 random/non-random NSP training data. For this, we will create a list of sentence As, sentence Bs, and their respective IsNextSentence or NotNextSentence labels.\n'

In [127]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence (sentence_b consecutive to sentence_a)
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence (sentence_b random)
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [121]:
len(sentence_a), len(sentence_b), len(label)

(317, 317, 317)

In [128]:
for i in range(3):
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n---')
    print(label[i])
    print('////////////////////')

 I observed that everybody believed that he thought as he spoke, and that in all that he did he never had any bad intention; and he never showed amazement and surprise, and was never in a hurry, and never put off doing a thing, nor was perplexed nor dejected, nor did he ever laugh to disguise his vexation, nor, on the other hand, was he ever passionate or suspicious
---
 He was accustomed to do acts of beneficence, and was ready to forgive, and was free from all falsehood; and he presented the appearance of a man who could not be diverted from right rather than of a man who had been improved
---
0
////////////////////
 There was in him nothing harsh, nor implacable, nor violent, nor, as one may say, anything carried to the sweating point; but he examined all things severally, as if he had abundance of time, and without confusion, in an orderly way, vigorously and consistently
---
 But this is altogether a mark of the most common sort of men, for it is in thy power whenever thou shalt c

In [10]:
# We can now tokenize our data. As is typical with BERT models, we truncate/pad our sequences to a length of 512 tokens.

In [129]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', #PyTorch tensors
                   max_length=512, truncation=True, padding='max_length')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [130]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [131]:
'''
 Because we tokenized two sentences, our tokenizer automatically applied 0 values to sentence A and 1 values to sentence B in the token_type_ids tensor
 In the input_ids tensor, the tokenizer automatically placed a SEP token (102) between these two sentences — marking the boundary between them both.
'''

inputs

{'input_ids': tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
        [  101,  2045,  2001,  ...,     0,     0,     0],
        [  101,  2582,  1010,  ...,  2402,  1010,   102],
        ...,
        [  101,  3459,  2185,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [132]:
# Our NSP labels must be placed within a tensor called next_sentence_label
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs.next_sentence_label[:10]

tensor([[0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1]])

In [133]:
inputs[:10]

{'input_ids': tensor([[  101,  1045,  5159,  ...,     0,     0,     0],
         [  101,  2045,  2001,  ...,     0,     0,     0],
         [  101,  2582,  1010,  ...,  2402,  1010,   102],
         ...,
         [  101,  1998, 15223,  ...,     0,     0,     0],
         [  101,  2296,  2158,  ...,     0,     0,     0],
         [  101,  2021,  2059,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'next_sentence_label': tensor([[0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
        

In [135]:
# For MLM we need to clone our current input_ids tensor to create a MLM labels tensor — then we move onto masking ~15% of tokens in the input_ids tensor.
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [136]:
inputs[:5]

{'input_ids': tensor([[ 101, 1045, 5159,  ...,    0,    0,    0],
         [ 101, 2045, 2001,  ...,    0,    0,    0],
         [ 101, 2582, 1010,  ..., 2402, 1010,  102],
         [ 101, 4088, 1996,  ...,    0,    0,    0],
         [ 101, 2156, 1996,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'next_sentence_label': tensor([[0],
         [1],
         [0],
         [0],
         [0]]),
 'labels': tensor([[ 101, 1045, 5159,  ...,    0,    0,    0],
         [ 101, 2045, 2001,  ...,    0,    0,    0],
         [ 101, 2582, 1010,  ..., 2402, 1010,  102],
         [ 101, 4088, 1996,  ...,    0,    0,    0],
         [ 10

In [137]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.7189, 0.8634, 0.1762,  ..., 0.8440, 0.0596, 0.7085],
        [0.4003, 0.0113, 0.5288,  ..., 0.5583, 0.0384, 0.9262],
        [0.3399, 0.5835, 0.7057,  ..., 0.2381, 0.4659, 0.6749],
        ...,
        [0.4027, 0.5887, 0.5198,  ..., 0.6664, 0.5911, 0.7619],
        [0.0421, 0.7242, 0.0786,  ..., 0.8145, 0.8941, 0.3929],
        [0.7927, 0.1855, 0.8645,  ..., 0.7465, 0.0325, 0.3125]])

In [138]:
# create mask array
# we are ensuring that we don’t mask any special tokens — such as CLS (101), SEP (102), and PAD (0) tokens.

mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
print(mask_arr.shape)
mask_arr

torch.Size([317, 512])


tensor([[False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [139]:
# The inputs.input_ids is a 2D tensor that contains the tokenized versions of your input sentences. Each row corresponds to a sentence, and each column corresponds to a token in that sentence.

inputs.input_ids.shape

torch.Size([317, 512])

In [140]:
print(type(mask_arr[0].nonzero()))
mask_arr[2].nonzero() # non zero == true == masked indices

<class 'torch.Tensor'>


tensor([[  4],
        [  9],
        [ 18],
        [ 22],
        [ 28],
        [ 35],
        [ 45],
        [ 48],
        [ 51],
        [ 57],
        [ 59],
        [ 60],
        [ 61],
        [ 62],
        [ 75],
        [ 82],
        [ 84],
        [ 89],
        [ 95],
        [109],
        [119],
        [121],
        [123],
        [126],
        [127],
        [138],
        [143],
        [167],
        [176],
        [190],
        [191],
        [192],
        [205],
        [221],
        [226],
        [230],
        [232],
        [241],
        [243],
        [246],
        [248],
        [260],
        [262],
        [276],
        [283],
        [286],
        [288],
        [301],
        [317],
        [318],
        [319],
        [327],
        [328],
        [329],
        [344],
        [347],
        [354],
        [355],
        [356],
        [366],
        [373],
        [374],
        [383],
        [392],
        [393],
        [394],
        [3

In [141]:
torch.flatten(mask_arr[2].nonzero())

tensor([  4,   9,  18,  22,  28,  35,  45,  48,  51,  57,  59,  60,  61,  62,
         75,  82,  84,  89,  95, 109, 119, 121, 123, 126, 127, 138, 143, 167,
        176, 190, 191, 192, 205, 221, 226, 230, 232, 241, 243, 246, 248, 260,
        262, 276, 283, 286, 288, 301, 317, 318, 319, 327, 328, 329, 344, 347,
        354, 355, 356, 366, 373, 374, 383, 392, 393, 394, 396, 397, 400, 402,
        403, 431, 436, 438, 441, 445, 446, 448, 465, 466, 493, 499])

In [142]:
# And now take the indices of each True value within each vector.
# Flattening the tensor with torch.flatten() turns it into a 1D tensor, and then .tolist() converts this tensor to a Python list.

selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
    
print(len(selection))
selection[2][:10]

317


[4, 9, 18, 22, 28, 35, 45, 48, 51, 57]

In [143]:
# Then apply these indices to each row in input_ids, assigning each value at these indices a value of 103.
# The number 103 corresponds to the special [MASK] token in BERT and some other transformer-based models. 

for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [144]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [145]:
inputs.input_ids[2]

tensor([  101,  2582,  1010,  1045,   103, 18836,  2000,  1996,  5932,   103,
         1045,  2001,  2025,  2936,  2716,  2039,  2007,  2026,   103,  1005,
         1055,  9530,   103, 16765,  1010,  1998,  2008,  1045,   103,  1996,
         6546,  1997,  2026,  3360,  1010,   103,  2008,  1045,  2106,  2025,
         2191,  6947,  1997,  2026,  6819,   103,  3012,  2077,   103,  5372,
         2161,   103,  2021,  2130, 13366, 28849,  2094,   103,  2051,   103,
          103,   103,   103, 13532,  2000,  1037,  7786,  1998,  1037,  2269,
         2040,  2001,  2583,  2000,  2202,   103,  2035,  6620,  2013,  2033,
         1010,  1998,   103,  3288,   103,  2000,  1996,  3716,  2008,   103,
         2003,  2825,  2005,  1037,  2158,   103,  2444,  1999,  1037,  4186,
         2302,  5782,  2593,  4932,  2030, 23590, 14464,  1010,  2030,   103,
         1998, 11342,  1010,  1998,  2107,  1011,  2066,  2265,  1025,   103,
         2008,   103,  2003,   103,  2107,  1037,   103,   103, 

In [26]:
'''
Dataloader
All of our input and label tensors are ready — all we need to do now is format them into a PyTorch dataset object so that it can be loaded into a PyTorch Dataloader — which will feed batches of data into our model during training.
'''

'\nDataloader\nAll of our input and label tensors are ready — all we need to do now is format them into a PyTorch dataset object so that it can be loaded into a PyTorch Dataloader — which will feed batches of data into our model during training.\n'

In [146]:
# We create a PyTorch dataset from our data.

'''
This is a custom PyTorch Dataset class named OurDataset. It’s designed to handle the encodings (or preprocessed input data) for a language model like BERT. Here’s a brief explanation of its methods:

__init__(self, encodings): This is the initializer method that’s called when you create a new instance of OurDataset. It takes one argument, encodings, which should be a dictionary containing the preprocessed input data. This dictionary is stored in the instance variable self.encodings.

__getitem__(self, idx): This method is used to get the item at a specific index, idx. It returns a dictionary where each value is a tensor containing the data for one input feature (like input IDs, attention mask, etc.) at the given index. This method is called when you access an item in the dataset like this: dataset[i].

__len__(self): This method returns the number of items in the dataset. It’s implemented by returning the length of the input_ids in self.encodings, assuming that all input features have the same length. This method is called when you use the len() function on the dataset: len(dataset).

This class allows PyTorch to handle the dataset in a way that’s optimized for machine learning tasks. You can use it with a DataLoader to easily generate batches of data for training or evaluation.
'''

class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [147]:
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

In [148]:
# Initialize our data using the OurDataset class.

dataset = OurDataset(inputs)

In [30]:
'''
if you have a simple use case and don’t want to create a custom subclass, you might consider using torch.utils.data.TensorDataset. This is a utility class that wraps tensors into a dataset. For example, if inputs is a tensor of input features and labels is a tensor of labels, you can create a dataset like this:

dataset = torch.utils.data.TensorDataset(inputs, labels)

In the custom case above, the dataloader expects the __len__ method for checking the total number of samples within our dataset, and the __getitem__ method for extracting samples.
'''

'\nif you have a simple use case and don’t want to create a custom subclass, you might consider using torch.utils.data.TensorDataset. This is a utility class that wraps tensors into a dataset. For example, if inputs is a tensor of input features and labels is a tensor of labels, you can create a dataset like this:\n\ndataset = torch.utils.data.TensorDataset(inputs, labels)\n\nIn the custom case above, the dataloader expects the __len__ method for checking the total number of samples within our dataset, and the __getitem__ method for extracting samples.\n'

In [149]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [150]:
model = model.to(device)
model.device

device(type='cuda', index=0)

In [151]:
# Free up GPU memory: Make sure you’re freeing up GPU memory whenever possible. You can do this by calling torch.cuda.empty_cache() to release cache that PyTorch is holding onto.

torch.cuda.empty_cache() # batch size reduced as well

In [162]:
from tqdm import tqdm  # for our progress bar
import torch.optim as optim

# And initialize the dataloader, which we'll be using to load our data into the model during training.
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
epochs = 1

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop: # The inner loop iterates over the DataLoader object (loader), which yields batches of data.
        # initialize calculated gradients (from prev step)
        optimizer.zero_grad() # For each batch, it first zeros out any previously calculated gradients.
        # pull all tensor batches required for training
        # It then moves all the tensor batches to the device where the computations will be performed (usually a GPU if available).
        input_ids = batch['input_ids'].to(device) # original sentence with some token masked
        token_type_ids = batch['token_type_ids'].to(device) # 0:tokens in sentence_a , 1:tokens in sentence_b
        attention_mask = batch['attention_mask'].to(device) # 1:real tokens , 0:padding tokens
        next_sentence_label = batch['next_sentence_label'].to(device) # 0:consecutive sentence , 1:random sentence
        labels = batch['labels'].to(device) # clone of input_ids: contains the correct labels (words) that were masked. Helps the model learn to predict masked tokens.
        # process
        # Forward Pass: The model processes the inputs and returns the training outputs (the evaluation outputs are in the next section)
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids, 
                        next_sentence_label=next_sentence_label, 
                        labels=labels)
        # extract loss from the outputs
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update (backward propagation)
        loss.backward()
        # update parameters, based on the calculated gradients
        optimizer.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        
        # Extract MLM predictions at training

        prediction_logits = outputs.prediction_logits
        mlm_predictions = torch.argmax(prediction_logits, dim=-1)

        # Extract NSP predictions
        seq_relationship_logits = outputs.seq_relationship_logits
        nsp_predictions = torch.argmax(seq_relationship_logits, dim=1)

        print(f"MLM Predictions: {mlm_predictions}")
        print(f"NSP Predictions: {nsp_predictions}")
        print(f"Real Labels (NSP): {next_sentence_label}")
        print(f"Loss: {loss.item()}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0:   2%|█▌                                                             | 1/40 [00:09<06:20,  9.76s/it, loss=7.82]

MLM Predictions: tensor([[1996, 1996, 1996,  ..., 1996, 1996, 1996],
        [1996, 1996, 1996,  ..., 1996, 1996, 1996],
        [1996, 1996, 1996,  ..., 1996, 1996, 1996],
        ...,
        [1996, 1996, 1996,  ..., 1996, 1996, 1996],
        [1996, 1996, 1996,  ..., 1996, 1996, 1996],
        [1996, 1996, 1996,  ..., 1996, 1996, 1996]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 7.815109729766846


Epoch 0:   5%|███▏                                                           | 2/40 [00:16<05:11,  8.21s/it, loss=15.4]

MLM Predictions: tensor([[18738, 18738, 18738,  ..., 18738, 18738, 18738],
        [18738, 18738, 18738,  ..., 18738, 18738, 18738],
        [18738, 18738, 18738,  ..., 18738, 18738, 18738],
        ...,
        [18738, 18738, 18738,  ..., 18738, 18738, 18738],
        [18738, 18738, 18738,  ..., 18738, 18738, 18738],
        [18738, 18738, 18738,  ..., 18738, 18738, 18738]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [1]], device='cuda:0')
Loss: 15.434216499328613


Epoch 0:   8%|████▋                                                          | 3/40 [00:23<04:44,  7.70s/it, loss=2.67]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.667572021484375


Epoch 0:  10%|██████▎                                                        | 4/40 [00:31<04:29,  7.48s/it, loss=2.43]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.425849437713623


Epoch 0:  12%|███████▉                                                       | 5/40 [00:38<04:18,  7.39s/it, loss=2.33]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 2.3294835090637207


Epoch 0:  15%|█████████▍                                                     | 6/40 [00:45<04:09,  7.35s/it, loss=2.93]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.92909574508667


Epoch 0:  18%|███████████                                                    | 7/40 [00:53<04:03,  7.38s/it, loss=2.63]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [1]], device='cuda:0')
Loss: 2.633181571960449


Epoch 0:  20%|████████████▌                                                  | 8/40 [01:00<03:54,  7.34s/it, loss=2.25]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.2549805641174316


Epoch 0:  22%|██████████████▍                                                 | 9/40 [01:07<03:46,  7.32s/it, loss=2.8]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.800774574279785


Epoch 0:  25%|███████████████▌                                              | 10/40 [01:14<03:38,  7.28s/it, loss=3.72]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 3.7215635776519775


Epoch 0:  28%|█████████████████                                             | 11/40 [01:22<03:31,  7.28s/it, loss=2.74]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1]], device='cuda:0')
Loss: 2.7436749935150146


Epoch 0:  30%|██████████████████▌                                           | 12/40 [01:29<03:22,  7.25s/it, loss=1.96]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]], device='cuda:0')
Loss: 1.9624648094177246


Epoch 0:  32%|████████████████████▏                                         | 13/40 [01:36<03:15,  7.22s/it, loss=2.85]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.8547191619873047


Epoch 0:  35%|█████████████████████▋                                        | 14/40 [01:43<03:07,  7.21s/it, loss=2.65]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.6545639038085938


Epoch 0:  38%|███████████████████████▎                                      | 15/40 [01:50<02:59,  7.20s/it, loss=2.45]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1]], device='cuda:0')
Loss: 2.4531941413879395


Epoch 0:  40%|████████████████████████▊                                     | 16/40 [01:58<02:53,  7.23s/it, loss=2.88]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [1]], device='cuda:0')
Loss: 2.8757522106170654


Epoch 0:  42%|██████████████████████████▊                                    | 17/40 [02:05<02:45,  7.21s/it, loss=4.4]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [1],
        [1],
        [0],
        [1],
        [0],
        [0]], device='cuda:0')
Loss: 4.400176048278809


Epoch 0:  45%|███████████████████████████▉                                  | 18/40 [02:12<02:38,  7.20s/it, loss=2.98]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.9819884300231934


Epoch 0:  48%|█████████████████████████████▍                                | 19/40 [02:19<02:30,  7.19s/it, loss=2.26]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [1],
        [1]], device='cuda:0')
Loss: 2.259904146194458


Epoch 0:  50%|███████████████████████████████                               | 20/40 [02:26<02:24,  7.22s/it, loss=2.19]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 2.1859617233276367


Epoch 0:  52%|████████████████████████████████▌                             | 21/40 [02:34<02:19,  7.33s/it, loss=2.27]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.2663822174072266


Epoch 0:  55%|██████████████████████████████████                            | 22/40 [02:42<02:13,  7.43s/it, loss=2.72]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.722405433654785


Epoch 0:  57%|███████████████████████████████████▋                          | 23/40 [02:49<02:05,  7.41s/it, loss=2.34]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.339925765991211


Epoch 0:  60%|█████████████████████████████████████▏                        | 24/40 [02:56<01:58,  7.40s/it, loss=2.31]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 2.305657386779785


Epoch 0:  62%|██████████████████████████████████████▊                       | 25/40 [03:04<01:50,  7.35s/it, loss=2.74]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1]], device='cuda:0')
Loss: 2.7423248291015625


Epoch 0:  65%|████████████████████████████████████████▎                     | 26/40 [03:11<01:42,  7.30s/it, loss=2.02]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.0201072692871094


Epoch 0:  68%|█████████████████████████████████████████▊                    | 27/40 [03:18<01:34,  7.27s/it, loss=1.96]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 1.9632925987243652


Epoch 0:  70%|███████████████████████████████████████████▍                  | 28/40 [03:25<01:26,  7.24s/it, loss=2.33]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1]], device='cuda:0')
Loss: 2.333892822265625


Epoch 0:  72%|█████████████████████████████████████████████▋                 | 29/40 [03:32<01:19,  7.22s/it, loss=1.8]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 1.8013050556182861


Epoch 0:  75%|██████████████████████████████████████████████▌               | 30/40 [03:39<01:12,  7.20s/it, loss=2.11]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 2.1063594818115234


Epoch 0:  78%|████████████████████████████████████████████████              | 31/40 [03:47<01:04,  7.19s/it, loss=2.06]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1]], device='cuda:0')
Loss: 2.061734199523926


Epoch 0:  80%|█████████████████████████████████████████████████▌            | 32/40 [03:54<00:57,  7.21s/it, loss=1.98]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 1.976759433746338


Epoch 0:  82%|███████████████████████████████████████████████████▏          | 33/40 [04:01<00:50,  7.19s/it, loss=2.94]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0]], device='cuda:0')
Loss: 2.9387779235839844


Epoch 0:  85%|████████████████████████████████████████████████████▋         | 34/40 [04:08<00:43,  7.20s/it, loss=2.27]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1]], device='cuda:0')
Loss: 2.2708487510681152


Epoch 0:  88%|██████████████████████████████████████████████████████▎       | 35/40 [04:16<00:36,  7.23s/it, loss=2.68]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0]], device='cuda:0')
Loss: 2.6831271648406982


Epoch 0:  90%|███████████████████████████████████████████████████████▊      | 36/40 [04:23<00:28,  7.22s/it, loss=2.04]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [0],
        [1],
        [0],
        [1],
        [1],
        [0],
        [1]], device='cuda:0')
Loss: 2.038661003112793


Epoch 0:  92%|█████████████████████████████████████████████████████████▎    | 37/40 [04:30<00:21,  7.20s/it, loss=2.04]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0]], device='cuda:0')
Loss: 2.0434813499450684


Epoch 0:  95%|██████████████████████████████████████████████████████████▉   | 38/40 [04:37<00:14,  7.20s/it, loss=1.86]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0]], device='cuda:0')
Loss: 1.857365369796753


Epoch 0:  98%|████████████████████████████████████████████████████████████▍ | 39/40 [04:44<00:07,  7.18s/it, loss=1.99]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1]], device='cuda:0')
Loss: 1.9866505861282349


Epoch 0: 100%|██████████████████████████████████████████████████████████████| 40/40 [04:52<00:00,  7.31s/it, loss=1.63]

MLM Predictions: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')
NSP Predictions: tensor([0, 0, 0, 0, 0], device='cuda:0')
Real Labels (NSP): tensor([[0],
        [1],
        [0],
        [0],
        [0]], device='cuda:0')
Loss: 1.6349890232086182





In [163]:
# model = BertForPreTraining.from_pretrained('path_to_save_trained_model')
model.eval()

# Prepare your test data
test_sentences = [
    ("The sun is shining.", "It's a beautiful day."),
    ("Random text here.", "More random text.")
]

for sentence_a, sentence_b in test_sentences:
    encoding = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    token_type_ids = encoding['token_type_ids'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
    
    seq_relationship_logits = outputs.seq_relationship_logits
    predicted_next_sentence_label = torch.argmax(seq_relationship_logits, dim=1)
    print(f"Sentence A: '{sentence_a}'")
    print(f"Sentence B: '{sentence_b}'")
    print("Predicted Next Sentence Label:", predicted_next_sentence_label.item())
    print()


Sentence A: 'The sun is shining.'
Sentence B: 'It's a beautiful day.'
Predicted Next Sentence Label: 0

Sentence A: 'Random text here.'
Sentence B: 'More random text.'
Predicted Next Sentence Label: 0

