## **FINE-TUNING BERT WITH MLM**

## **Libraries**

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from tqdm.auto import tqdm
import torch

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **Import text data**

In [None]:
data_path = 'the_fire_flower.txt'
with open(data_path, 'r') as f:
    data = f.read().split('\n')

## **Text cleaning process**

In [None]:
len(data)

3949


In [6]:
for sentence in data:
    if len(sentence) < 50:
        data.remove(sentence)

In [None]:
len(data)

3078


## **Tokenizing the text data**

In [8]:
inputs = tokenizer(
    data,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

In [9]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
inputs['labels'] = inputs['input_ids'].detach().clone()
inputs

{'input_ids': tensor([[  101,  1996,  2622,  ...,     0,     0,     0],
        [  101,  2023, 26885,  ...,     0,     0,     0],
        [  101,  2087,  2060,  ...,     0,     0,     0],
        ...,
        [  101,  8756,  3192,  ...,     0,     0,     0],
        [  101,  4942, 29234,  ...,     0,     0,     0],
        [  101,   102,     0,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0]]), 'labels': tensor([[  101,  1996,  2622,  ...,     0,     0,     0],
        [  101,  2023, 26885,  ...,     0,     0,     0],
        [  101,  2087, 

## **Masking the input_ids**

In [11]:
random_tensor = torch.rand(inputs['input_ids'].shape)

In [12]:
random_tensor.shape

torch.Size([3078, 512])

In [None]:
random_tensor

tensor([[0.6974, 0.2318, 0.5568,  ..., 0.6542, 0.9087, 0.5681],
        [0.3298, 0.9107, 0.3362,  ..., 0.5691, 0.3070, 0.0999],
        [0.7851, 0.3420, 0.4899,  ..., 0.9055, 0.7023, 0.8014],
        ...,
        [0.8422, 0.2042, 0.4212,  ..., 0.2519, 0.5127, 0.1659],
        [0.7554, 0.4178, 0.4344,  ..., 0.4360, 0.1397, 0.8117],
        [0.8325, 0.0738, 0.8135,  ..., 0.4894, 0.0600, 0.1216]])

In [None]:
masked_tensor = (random_tensor < 0.15)*(inputs['input_ids'] != 101)*(inputs['input_ids'] != 102)*(inputs['input_ids'] != 0)

In [None]:
nonzeros_indices = []
for i in range(len(masked_tensor)):
    nonzeros_indices.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [None]:
for i in range(len(inputs['input_ids'])):
    inputs['input_ids'][i, nonzeros_indices[i]] = 103

## **Pytorch Dataset and Dataloader**

In [17]:
class BookDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        input_ids = self.encodings['input_ids'][index]
        labels = self.encodings['labels'][index]
        attention_mask = self.encodings['attention_mask'][index]
        token_type_ids = self.encodings['token_type_ids'][index]
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

In [18]:
dataset = BookDataset(inputs)

In [19]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=16,
    shuffle=True
)

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
model.to(device)

## **Model parameters**

In [22]:
epochs = 2
optimizer = AdamW(model.parameters(), lr=1e-5)

## **Training Loop**

In [None]:
model.train()

for epoch in range(epochs):
    loop = tqdm(dataloader)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backwards()
        optimizer.step()

        loop.set_description("Epoch: {}".format(epoch))
        loop.set_postfix(loss=loss.item())

  0%|          | 0/193 [00:00<?, ?it/s]