In [40]:
from transformers import BertTokenizer, BertForPreTraining
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

In [41]:
import pandas as pd 

data = pd.read_csv('df_2013.csv')

In [42]:
df = data.copy()

In [43]:
text = df['reviewText']

In [44]:
text[0]

"I remember reading my parents' copy of this book until it fell apart. The story still makes me giggle. I just wanted to share it with my kids, who appreciated it even though most of them are teenagers now. The story is light-hearted and silly, but with a very powerful message about learning to live with the problems you have instead of trading them for larger problems.\n\nThe artwork is hysterical, too."

# Preparing For NSP
To prepare our data for NSP, we need to create a mix of non-random sentences (where the two sentences were originally together) — and random sentences.
For this, we’ll create a bag of sentences extracted from text which we can then randomly select a sentence from when creating a random NotNextSentence pair.

In [45]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)

In [46]:
bag_size

3196

In [47]:
bag[0:6]

["I remember reading my parents' copy of this book until it fell apart",
 ' The story still makes me giggle',
 ' I just wanted to share it with my kids, who appreciated it even though most of them are teenagers now',
 ' The story is light-hearted and silly, but with a very powerful message about learning to live with the problems you have instead of trading them for larger problems',
 '\n\nThe artwork is hysterical, too',
 "I am very happy with the book!!!  It is one of my children's favorite books and I was so pleased I could order it here!!!"]

After creating our bag we can go ahead and create our 50/50 random/non-random NSP training data. For this, we will create a list of sentence As, sentence Bs, and their respective IsNextSentence or NotNextSentence labels.

In [48]:
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [49]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

1
 The story still makes me giggle
---
  It is never to late "What The Mind of Man Can Conceive and Believe, The Mind Can Achieve

0
  You'll treasure it for your children-and for yourself
---
 If you can find a copy, get one and enjoy all it has to offer with your kids or by yourself

1
A must for every home, with or without children
---
 But nobody would believe her when she told them that she read it and she was labeled 'sick', 'twisted' and 'evil'



We can see in the console output that we have label 1 representing random sentences (NotNextSentence) and 0 representing non-random sentences (IsNextSentence).
# Tokenization
We can now tokenize our data. As is typical with BERT models, we truncate/pad our sequences to a length of 512 tokens.

In [50]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length')

In [51]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [52]:
inputs

{'input_ids': tensor([[  101,  1996,  2466,  ...,     0,     0,     0],
        [  101,  2017,  1005,  ...,     0,     0,     0],
        [  101,  1037,  2442,  ...,     0,     0,     0],
        ...,
        [  101, 20052,  9106,  ...,     0,     0,     0],
        [  101,  2023,  2001,  ...,     0,     0,     0],
        [  101,  2023,  2338,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

There are a few things we should take note of here. Because we tokenized two sentences, our tokenizer automatically applied 0 values to sentence A and 1 values to sentence B in the token_type_ids tensor. The trailing zeros are aligned to the padding tokens.
Secondly, in the input_ids tensor, the tokenizer automatically placed a SEP token (102) between these two sentences — marking the boundary between them both.
BERT needs to see both of these when performing NSP.
# NSP Labels
Our NSP labels must be placed within a tensor called next_sentence_label. We create this easily by taking our label variable, and converting it into a torch.LongTensor — which must also be transposed using .T:

In [53]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T

In [54]:
inputs.next_sentence_label[:10]

tensor([[1],
        [0],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [1],
        [0]])

# Masking For MLM
For MLM we need to clone our current input_ids tensor to create a MLM labels tensor — then we move onto masking ~15% of tokens in the input_ids tensor.

In [55]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [56]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

Now that we that clone for our labels, we mask tokens in input_ids.

In [57]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

And now take the indices of each True value within each vector.

In [58]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [59]:
selection[:2]

[[2, 6, 13, 19, 29], [17, 20, 33]]

Then apply these indices to each row in input_ids, assigning each value at these indices a value of 103.

In [60]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [61]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [62]:
inputs.input_ids

tensor([[  101,  1996,   103,  ...,     0,     0,     0],
        [  101,  2017,  1005,  ...,     0,     0,     0],
        [  101,  1037,  2442,  ...,     0,     0,     0],
        ...,
        [  101, 20052,  9106,  ...,     0,     0,     0],
        [  101,   103,  2001,  ...,     0,     0,     0],
        [  101,   103,  2338,  ...,     0,     0,     0]])

Note that there are a few rules we’ve added here, by adding the additional logic when creating mask_arr — we are ensuring that we don’t mask any special tokens — such as CLS (101), SEP (102), and PAD (0) tokens.
# Dataloader
All of our input and label tensors are ready — all we need to do now is format them into a PyTorch dataset object so that it can be loaded into a PyTorch Dataloader — which will feed batches of data into our model during training.

In [63]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [64]:
dataset = OurDataset(inputs)

In [65]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [66]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [67]:
model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [68]:
from transformers import AdamW

# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)



In [69]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0:   2%|▏         | 1/46 [00:18<13:39, 18.21s/it, loss=20.6]