In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, BertForPreTraining, BertConfig, AdamW, get_linear_schedule_with_warmup

# NOTE: both `transformers` and `pytorch_lightning` export a `Trainer`. The Lightning import
# must stay BELOW the `transformers` import so that `Trainer` refers to the Lightning class.
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger

The seed_everything(42, workers=True) function call sets the seed for the random number generators used in the experiment to the integer value of 42. <br>
Setting the random seed to a fixed value can help to ensure that the results of the experiment are reproducible, meaning that they can be repeated with the same results. <br>
The workers=True argument specifies that the random seed should also be set for any worker processes that may be used in parallel processing, which can further improve reproducibility.

In [None]:
import wandb
# Fix the global seed (42) before any model, dataset or dataloader is created so that
# a Restart & Run All reproduces the same run; `workers=True` extends this to dataloader workers.
seed_everything(42, workers=True)

In [None]:
# Weights & Biases project / run identifiers.
project = "TrainBERTscratchMLMandNSP"
run_rame = "run_01"

# Start the W&B run. Every tunable of the experiment lives in this config so that it is
# logged together with the metrics.
wandb.init(
    project= project,
    config=dict(
        # --- data ---
        csv_path='~/data/MLM+NSP_BERT_Data_for_Dataset.csv',
        batch_size=16,
        max_epochs=100,
        validation_size=2000,
        shuffle_train=True,
        # --- optimisation ---
        optimizer=AdamW,
        learning_rate=1e-4,
        weight_decay=0.01,
        warmup_function='Linear',
        warmup_steps=0.1,  # used as a FRACTION of the total number of training steps
        # --- early stopping ---
        early_stop=False,
        patience=10,
        # --- bookkeeping ---
        run_name=run_rame,
        model_name='~/models/BERT after MLM+NSP',
        model_save_path='~/models/BERT after MLM+NSP/' + run_rame,
    ),
)

# Short alias for the run configuration plus the Lightning logger bound to the same project.
config_wandb = wandb.config
wandb_logger = WandbLogger(name=config_wandb.run_name, project=project)


<div>
<p>
This code block creates an instance of the `BertConfig` class from the `transformers` library. `BertConfig` is a configuration class that is used to define the architecture and parameters of a BERT (Bidirectional Encoder Representations from Transformers) model. 
</p>
<p>
The constructor of `BertConfig` takes several arguments, each of which corresponds to a different aspect of the model's architecture or hyperparameters. In this case, the following arguments are specified:
</p>
<ul>
    <li>`vocab_size`: This sets the size of the model's vocabulary. Here, it is set to 30,000.</li>
    <li>`hidden_size`: This sets the number of hidden units in each layer of the model. Here, it is set to 768.</li>
    <li>`num_hidden_layers`: This sets the number of layers in the model. Here, it is set to 6.</li>
    <li>`num_attention_heads`: This sets the number of attention heads used in the multi-head attention mechanism of the model. Here, it is set to 12.</li>
    <li>`max_position_embeddings`: This sets the maximum length of the input sequences that the model can handle. Here, it is set to 512.</li>
</ul>
<p>
By creating an instance of `BertConfig` with these values, the model architecture and hyperparameters are defined, and can be used to initialize a BERT model for natural language processing tasks.
</p>
</div>


In [None]:
# Architecture of the BERT model that is trained from scratch (no pretrained weights involved).
config = BertConfig(
    vocab_size=30000,  # NOTE(review): presumably has to cover the custom tokenizer's vocabulary -- confirm
    hidden_size=768, 
    num_hidden_layers=6, 
    num_attention_heads=12,  # 768 / 12 = 64 dimensions per attention head
    max_position_embeddings=512
)

<div>
<p>
This code block initializes a BERT model for pretraining, using a custom tokenizer and a specified device (either CPU or GPU, depending on availability).
</p>
<p>
The first line sets the device to be used for training and inference. If a GPU is available, it is set to "cuda"; otherwise, it defaults to "cpu".
</p>
<p>
The second line initializes a tokenizer object using the `AutoTokenizer` class from the `transformers` library. The `from_pretrained()` method is used to load a custom tokenizer located at the specified path ('~/Custom_Tokenizer'), and the `use_fast=True` argument enables the use of a fast tokenizer implementation for improved performance.
</p>
<p>
Finally, the third line initializes a `BertForPreTraining` model object using the `config` object created in the previous code block, and sends it to the specified device using the `to()` method. `BertForPreTraining` is a variant of the BERT model that is designed for pretraining, and includes additional prediction tasks to improve the model's ability to learn general representations of language.
</p>
</div>


In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `from_pretrained` does not expand "~" by itself, so the home-relative path is expanded
# explicitly; otherwise the local tokenizer directory is not found.
tokenizer = AutoTokenizer.from_pretrained(os.path.expanduser('~/Custom_Tokenizer'), use_fast=True)

# Fail early with a readable message: a token id >= vocab_size would index outside the
# embedding matrix and only surface later as an opaque indexing / CUDA error.
if len(tokenizer) > config.vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but the model config only allows "
        f"vocab_size={config.vocab_size}."
    )

# Randomly initialised BERT with both pre-training heads (MLM + NSP), placed on `device`.
bert_model = BertForPreTraining(config).to(device)

This function assigns a new sequential `ID` value (0, 1, 2, ...) to the addresses, such that consecutive rows sharing the same `Porta_ENT` receive the same `ID`. This `ID` is what makes it possible to do `data[idx]` when creating the dataset class. It has to be recomputed every time the program runs because the data is split randomly into training and testing sets: only after this split can the new ID of each address be calculated. If the IDs were computed beforehand, they would be scattered randomly between training and testing, and doing `data[idx]` later would be problematic.

In [None]:
def add_new_ID(data):
    """Add a sequential ``ID`` column that numbers consecutive runs of equal ``Porta_ENT``.

    The first row gets ID 0 and the ID increases by one every time ``Porta_ENT`` differs
    from the previous row. Rows are compared by position, so the result does not depend
    on the DataFrame index.

    Args:
        data: DataFrame with a ``Porta_ENT`` column. Rows that belong to the same door are
            expected to be adjacent (e.g. the frame is sorted by ``Porta_ENT``).

    Returns:
        The same DataFrame object, modified in place with the new ``ID`` column.
    """
    door_ids = data['Porta_ENT']

    # True wherever a row differs from the one before it. The first row is compared with
    # the NaN introduced by `shift`, so it always opens group 0.
    starts_new_group = door_ids.ne(door_ids.shift()).to_numpy()

    # The running count of group starts is 1-based; subtract 1 to obtain 0-based IDs.
    data['ID'] = starts_new_group.cumsum() - 1
    return data

When I perform the split, it may happen that addresses that are only written in two ways (normalized and unnormalized) get separated. This is a problem because then I can't pair them in the dataloader batches. This cell will ensure that addresses are grouped by their ID value, and therefore, in the split, addresses that share the same Porta_ENT are placed in the same group. I will try to maintain an approximate ratio of 90% training and 10% testing.

In [None]:
def get_train_test_data(data, train_size_proportion):
    """Split ``data`` into train/test so that all rows of one ``Porta_ENT`` stay together.

    Each distinct ``Porta_ENT`` value is assigned as a whole to the train split with
    probability ``train_size_proportion`` and to the test split otherwise, so the final
    row ratio is only approximate.

    Args:
        data: DataFrame with a ``Porta_ENT`` column.
        train_size_proportion: Probability in [0, 1] of a group landing in the train split.

    Returns:
        ``(train, test)``: two DataFrames sorted by ``Porta_ENT``, with a fresh 0..n-1 index
        and an ``ID`` column added by ``add_new_ID``. A split that receives no group is
        returned as an empty frame.

    Raises:
        ValueError: If ``train_size_proportion`` is outside [0, 1].
    """
    if not 0 <= train_size_proportion <= 1:
        raise ValueError(
            f"train_size_proportion must be in [0, 1], got {train_size_proportion!r}"
        )

    # One entry per distinct Porta_ENT value (rows with a missing Porta_ENT are ignored by groupby).
    group_keys = list(data.groupby('Porta_ENT').groups.keys())

    # Fixed seed so that the same groups end up in the same split on every run.
    # Note that this reseeds NumPy's global generator.
    np.random.seed(42)

    # 0 -> train, 1 -> test. The test probability is derived from the argument instead of
    # being hard-coded, so proportions other than 0.9 are valid as well.
    split_ids = np.random.choice(
        [0, 1],
        size=len(group_keys),
        p=[train_size_proportion, 1 - train_size_proportion],
    )
    split_of_row = data['Porta_ENT'].map(dict(zip(group_keys, split_ids)))

    def _take_split(split_id):
        # Boolean selection (instead of pd.concat over groups) also works when a split is empty.
        selected = data[split_of_row == split_id]
        return selected.sort_values(by=['Porta_ENT'], kind='stable').reset_index(drop=True)

    train = add_new_ID(_take_split(0))
    test = add_new_ID(_take_split(1))

    return train, test

The idea is to create a dataset that, when called to create batches, for each index that I request, returns a pair of addresses. Each address can be written in at least 2 ways: normalized + unnormalized. Therefore, when I do dataset[idx], it is always possible to return something.

In [None]:
class MatchDataset(torch.utils.data.Dataset):
    """Dataset that yields, for each ``ID``, two distinct spellings of the same address.

    The addresses are grouped by ``ID`` once at construction time, so that indexing does
    not have to scan the whole DataFrame for every sample.

    Args:
        data: DataFrame with an ``ID`` column (the index passed to ``__getitem__`` is looked
            up in it) and an ``Address`` column. Every ID needs at least two rows.
    """

    def __init__(self, data):
        self.data = data
        # ID -> array with all addresses that share that ID.
        self._addresses_by_id = {
            group_id: addresses.to_numpy()
            for group_id, addresses in data.groupby('ID')['Address']
        }

    def __len__(self):
        # Number of unique IDs.
        return len(self._addresses_by_id)

    def __getitem__(self, idx):
        """Return a positive pair ``{'Address_1', 'Address_2', 'Label': 0}`` for ID ``idx``.

        Raises:
            IndexError: If no row has ``ID == idx``.
            ValueError: If the ID has fewer than two addresses to pair up.
        """
        try:
            candidates = self._addresses_by_id[idx]
        except KeyError:
            raise IndexError(f"No address with ID {idx!r} in the dataset") from None

        if len(candidates) < 2:
            raise ValueError(
                f"ID {idx!r} has only {len(candidates)} address(es); at least 2 are needed to build a pair"
            )

        # Two different rows of the group, drawn without replacement from NumPy's global RNG.
        first, second = np.random.choice(len(candidates), size=2, replace=False)
        return {'Address_1': candidates[first], 'Address_2': candidates[second], 'Label': 0}

<h3>Function: collate_MLM_NSP(batch)</h3>
<p>This function takes in a batch of examples, where each example is a dictionary containing two addresses and a label. It then tokenizes the addresses, creates negative samples for the next sentence prediction (NSP) task, and creates a masked language model (MLM) mask. Finally, it returns a dictionary containing the tensors that will go into the model.</p>
<p><strong>Input:</strong> a batch of examples, where each example is a dictionary containing two addresses and a label.</p>
<p><strong>Output:</strong> a dictionary containing the tensors that will go into the model.</p>
<h4>Steps:</h4>
<ol>
    <li>Create empty lists for address_1, address_2, and next_sentence_label</li>
    <li>Loop through the examples in the batch, and append the address_1, address_2, and next_sentence_label of each example to their respective lists</li>
    <li>Convert the batch labels for NSP task to tensor with dtype long and shape (batch_size,)</li>
    <li>Assign a uniformly generated probability value between [0-1] to each element of the batch. This will determine which addresses will be swapped to create negative pairs for the NSP task.</li>
    <li>Mark cases where the probability was >0.5 as cases where a swap will occur</li>
    <li>Find indices where [mask] == True</li>
    <li>Create a copy of the list that contains the addresses that will be swapped. This is to avoid swapping a vector with itself.</li>
    <li>Loop through the indices where a swap will occur, and choose an address for the swap</li>
    <li>Place the new addresses, in the correct positions in the initial list</li>
    <li>Swap labels in positions where there was a swap</li>
    <li>Tokenize address pairs using the tokenizer</li>
    <li>Create the MLM mask by looping through the tokenized input_ids, and assigning a mask token with a probability of 15% to valid tokens. Valid tokens are those that are not [CLS, PAD or SEP].</li>
    <li>Change the labels tensor in order to fill with -100 all the entries where there are no [MASK] tokens.</li>
    <li>Create a dictionary containing the tensors that will go into the model, and send them to the GPU. Pytorch Lightning will not do this by default in this case.</li>
    <li>Return the dictionary of inputs.</li>
</ol>

In [None]:
def _make_nsp_negatives(second_addresses):
    """Randomly replace about half of the second addresses to create negative NSP pairs.

    Returns:
        ``(addresses, swapped)``: the new list of second addresses and a boolean NumPy
        array marking the positions that were replaced.
    """
    n_examples = len(second_addresses)
    new_addresses = list(second_addresses)

    # With fewer than two examples there is no other address to swap in.
    if n_examples < 2:
        return new_addresses, np.zeros(n_examples, dtype=bool)

    # Each position is swapped when its uniform draw exceeds 0.5.
    swapped = np.random.uniform(0, 1, n_examples) > 0.5

    positions = np.arange(n_examples)
    for i in np.flatnonzero(swapped):
        # Replacements are always read from the ORIGINAL list and never from position i itself,
        # so an address cannot end up paired with itself through a chain of swaps.
        donor = np.random.choice(np.delete(positions, i))
        new_addresses[i] = second_addresses[donor]

    return new_addresses, swapped


def _mask_tokens_for_mlm(input_ids, special_token_ids, mask_token_id, mask_probability=0.15):
    """Replace random non-special tokens by the mask token (in place) and build the MLM labels.

    Returns:
        ``(input_ids, labels)`` where ``labels`` holds the original token id at the masked
        positions and -100 everywhere else.
    """
    # Clone first so that the labels keep the original ids of the tokens that get masked.
    labels = input_ids.detach().clone()

    # [CLS], [SEP] and [PAD] must never be masked.
    maskable = torch.ones_like(input_ids, dtype=torch.bool)
    for special_id in special_token_ids:
        if special_id is not None:
            maskable &= input_ids != special_id

    selected = (torch.rand(input_ids.shape, device=input_ids.device) < mask_probability) & maskable

    input_ids[selected] = mask_token_id
    labels[~selected] = -100
    return input_ids, labels


def collate_MLM_NSP(batch):
    """Collate dataset examples into model inputs for joint MLM + NSP pre-training.

    Args:
        batch: List of dictionaries ``{'Address_1': str, 'Address_2': str, 'Label': int}``.

    Returns:
        Dictionary with ``input_ids``, ``token_type_ids``, ``attention_mask``, ``labels``
        (MLM targets, -100 where no token was masked) and ``next_sentence_label``
        (0 = original pair, 1 = second address was swapped), all moved to ``device``.
    """
    # The batch arrives as a list of per-example dictionaries; regroup it field by field.
    address_1 = [example['Address_1'] for example in batch]
    address_2 = [example['Address_2'] for example in batch]

    # NSP labels: dtype long, shape (batch_size,).
    next_sentence_label = torch.tensor([example['Label'] for example in batch], dtype=torch.long)

    # Build the negative pairs and flip the label wherever a swap happened.
    address_2, swapped = _make_nsp_negatives(address_2)
    next_sentence_label[torch.from_numpy(swapped)] = 1

    # Tokenize the address pairs.
    tokenized = tokenizer(address_1, address_2, return_tensors='pt',  add_special_tokens=True, padding='longest')

    # Put [MASK] on ~15% of the valid tokens and derive the MLM labels from the original ids.
    input_ids, labels = _mask_tokens_for_mlm(
        tokenized['input_ids'],
        special_token_ids=(tokenizer.sep_token_id, tokenizer.cls_token_id, tokenizer.pad_token_id),
        mask_token_id=tokenizer.mask_token_id,
    )

    # All tensors are sent to `device` here, as the original implementation did.
    inputs = {'input_ids': input_ids.to(device),
              'token_type_ids': tokenized['token_type_ids'].to(device),
              'attention_mask': tokenized['attention_mask'].to(device),
              'labels': labels.to(device),
              'next_sentence_label': next_sentence_label.to(device)}

    return inputs
               


<div>
<p>This is a Python class called <code>MatchModel</code> that extends the <code>LightningModule</code> class. It defines a neural network model and its training and validation steps.</p>
<h3>Constructor</h3>
<p>The constructor initializes various instance variables including the model, csv_path, batch_size, epochs, warmup_steps, best_val_loss, and softmax function.</p>
<h3>Setup</h3>
<p>The <code>setup</code> method is called by the <code>Trainer</code> before training starts (not when the class is instantiated). It reads a CSV file from the path specified in <code>csv_path</code> and splits the data into train and test sets. It then initializes two instances of the <code>MatchDataset</code> class for the train and test sets.</p>
<h3>Data Loaders</h3>
<p>The <code>train_dataloader</code> and <code>val_dataloader</code> methods return <code>DataLoader</code> instances for the train and validation datasets, respectively. They use the <code>collate_MLM_NSP</code> function to collate batches.</p>
<h3>Forward Method</h3>
<p>The <code>forward</code> method takes an input dictionary (the return of the collate_fn of the dataloader) and passes it to the <code>model</code> instance defined in the constructor.</p>
<h3>Training Step</h3>
<p>The <code>training_step</code> method is called for each batch during the training phase. It calls the <code>forward</code> method to obtain model predictions and calculates the loss. The <code>log</code> function is used to log the loss value.</p>
<h3>Validation Step</h3>
<p>The <code>validation_step</code> method is called for each batch during the validation phase. It calls the <code>forward</code> method to obtain model predictions and calculates the loss, as well as the accuracy for both the MLM and NSP tasks.</p>
<h3>Validation Epoch End</h3>
<p>The <code>validation_epoch_end</code> method is called at the end of each validation epoch. It calculates the average validation loss and accuracies over all validation batches and logs them. If the current epoch loss is lower than the best loss obtained so far, the model is saved to disk.</p>
<h3>Configure Optimizers</h3>
<p>The <code>configure_optimizers</code> method initializes the optimizer and learning rate scheduler for the model.</p>
</div>


In [None]:
class MatchModel(LightningModule):
    """Lightning wrapper that pre-trains a BERT model on the MLM and NSP tasks.

    Hyperparameters and paths are read from the module-level ``config_wandb``. Batches are
    the dictionaries produced by ``collate_MLM_NSP``.

    Args:
        model: The model to train. It is called as ``model(**batch)`` and must return an
            output with ``loss``, ``prediction_logits`` and ``seq_relationship_logits``.
    """

    def __init__(self, model):
        super().__init__()

        self.model = model
        self.csv_path = config_wandb.csv_path
        self.batch_size = config_wandb.batch_size
        self.epochs = config_wandb.max_epochs
        # Despite the name this is a fraction of the total number of training steps.
        self.warmup_steps = config_wandb.warmup_steps

        # Start at +inf so that the first real validation epoch is always saved.
        self.best_val_loss = float('inf')
        # Explicit dim: the implicit-dim form of Softmax is deprecated.
        self.softmax = torch.nn.Softmax(dim=-1)

    def setup(self, stage = None):
        """Read the CSV and build the train / validation datasets (90% / 10% of the groups)."""
        df = pd.read_csv(self.csv_path)

        train, test = get_train_test_data(data = df, train_size_proportion=0.9)
        self.train_dataset = MatchDataset(train)
        self.val_dataset = MatchDataset(test)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=config_wandb.shuffle_train,
            collate_fn=collate_MLM_NSP,
        )

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=collate_MLM_NSP)

    def forward(self, input):
        return self.model(**input)

    def training_step(self, batch, batch_idx):
        model_predictions = self.forward(batch)
        loss = model_predictions.loss
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the loss and the NSP / MLM accuracy (plus raw counts) for one batch."""
        model_predictions = self.forward(batch)

        # Combined MLM + NSP loss; both terms are cross-entropy losses.
        loss = model_predictions.loss

        # NSP accuracy. The argmax of the logits equals the argmax of their softmax,
        # so no softmax is needed here.
        nsp_predicted = model_predictions['seq_relationship_logits'].argmax(dim=-1)
        nsp_correct = (nsp_predicted == batch['next_sentence_label']).sum()
        nsp_total = torch.tensor(len(batch['next_sentence_label']), device=nsp_correct.device)
        acc_batch_NSP = nsp_correct / nsp_total

        # MLM accuracy, evaluated only where a label exists (-100 marks "not masked").
        masked_positions = batch['labels'] != -100
        mlm_predicted = model_predictions['prediction_logits'].argmax(dim=-1)
        mlm_correct = (mlm_predicted[masked_positions] == batch['labels'][masked_positions]).sum()
        mlm_total = masked_positions.sum()
        # clamp avoids a division by zero when a batch happens to contain no masked token.
        acc_batch_MLM = mlm_correct / mlm_total.clamp(min=1)

        # This is not a single value: one such dictionary is produced per validation batch.
        return {'val_loss': loss,
                'val_acc_NSP': acc_batch_NSP,
                'val_acc_MLM': acc_batch_MLM,
                'nsp_correct': nsp_correct,
                'nsp_total': nsp_total,
                'mlm_correct': mlm_correct,
                'mlm_total': mlm_total}

    def validation_epoch_end(self, outputs):
        """Aggregate the validation metrics, log them and save the model when the loss improved."""
        # Loss: mean over the validation batches.
        current_epoch_val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        # Accuracies: total correct / total evaluated, so that batches with few (or no)
        # masked tokens and a smaller last batch do not distort the average.
        nsp_total = torch.stack([x['nsp_total'] for x in outputs]).sum()
        mlm_total = torch.stack([x['mlm_total'] for x in outputs]).sum()
        current_epoch_val_acc_NSP = torch.stack([x['nsp_correct'] for x in outputs]).sum() / nsp_total.clamp(min=1)
        current_epoch_val_acc_MLM = torch.stack([x['mlm_correct'] for x in outputs]).sum() / mlm_total.clamp(min=1)

        # Log the metrics
        self.log('val_loss', current_epoch_val_loss, prog_bar=True)
        self.log('val_acc_NSP', current_epoch_val_acc_NSP, prog_bar=True)
        self.log('val_acc_MLM', current_epoch_val_acc_MLM, prog_bar=True)

        # The sanity check runs only a couple of batches on the untrained model: its loss
        # must neither become the "best" loss nor trigger a save.
        if self.trainer.sanity_checking:
            return

        # Check if model should be saved by comparing the current loss with the best loss obtained so far
        current_loss = current_epoch_val_loss.item()
        print('\nChecking if model is to be saved: ')
        print('Current Epoch Loss = {}, and best Loss so far = {}'.format(current_loss, self.best_val_loss))
        if current_loss < self.best_val_loss:
            self.best_val_loss = current_loss
            # save_pretrained does not expand "~"; expand it so the model is written to the
            # home directory instead of a literal "~" folder.
            self.model.save_pretrained(os.path.expanduser(config_wandb.model_save_path))
            print('Model Saved')

    def configure_optimizers(self):
        """AdamW with a linear warmup followed by a linear decay, stepped after every batch."""
        optimizer = AdamW(
            params=self.model.parameters(),
            lr=config_wandb.learning_rate,
            weight_decay=config_wandb.weight_decay,
        )

        total_steps = self.trainer.estimated_stepping_batches
        # The configured value is a fraction of the total steps; the scheduler expects a step count.
        num_warmup_steps = int(config_wandb.warmup_steps * total_steps)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=total_steps,
        )
        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))


<p>The first line creates an instance of the <code>LearningRateMonitor</code> class and assigns it to the variable <code>lr_monitor</code>. This callback monitors the learning rate of the optimizer and logs it at a specified interval during training.</p>
<p>The second line creates an instance of the <code>EarlyStopping</code> class and assigns it to the variable <code>early_stopping_callback</code>. This callback monitors the validation loss and stops training early if the loss does not improve for a certain number of epochs specified by the <code>patience</code> argument. The <code>monitor</code> argument specifies which metric to monitor, in this case, the validation loss. The <code>mode</code> argument specifies whether to minimize or maximize the monitored metric, in this case, we want to minimize the validation loss.</p>

In [None]:
# Logs the current learning rate at every optimizer step.
lr_monitor = LearningRateMonitor(logging_interval='step')
# Created unconditionally; it is only handed to the Trainer when `config_wandb.early_stop` is enabled.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=config_wandb.patience, mode='min')

<div>
<p>The code initializes a <code>MatchModel</code> with a <code>bert_model</code> and a <code>Trainer</code> with various arguments:</p>
<ul>
<li><code>accelerator</code> is set to 'gpu'</li>
<li><code>devices</code> is set to 1</li>
<li><code>callbacks</code> is set to a list containing <code>lr_monitor</code> and <code>early_stopping_callback</code> if <code>config_wandb.early_stop</code> is <code>True</code>, otherwise only <code>lr_monitor</code> is included</li>
<li><code>check_val_every_n_epoch</code> is set to 1</li>
<li><code>max_epochs</code> is set to <code>config_wandb.max_epochs</code></li>
<li><code>enable_checkpointing</code> is set to <code>False</code></li>
<li><code>accumulate_grad_batches</code> is set to 1</li>
<li><code>logger</code> is set to <code>wandb_logger</code></li>
<li><code>log_every_n_steps</code> is set to 1</li>
<li><code>deterministic</code> is set to <code>True</code></li>
<li><code>precision</code> is set to 16</li>
</ul>
<p>Finally, the <code>trainer</code> is used to fit the <code>model</code>.</p>
</div>

In [None]:
# Wrap the BERT model in the Lightning module that defines the training / validation logic.
model = MatchModel(bert_model)

trainer = Trainer(
    # hardware
    accelerator='gpu',
    devices=1,
    precision = 16,
    deterministic=True,
    # schedule
    max_epochs=config_wandb.max_epochs,
    check_val_every_n_epoch=1,
    accumulate_grad_batches=1,
    # callbacks: the learning-rate monitor always, early stopping only when enabled in the config
    callbacks = [lr_monitor] + ([early_stopping_callback] if config_wandb.early_stop == True else []),
    # logging; Lightning checkpointing is disabled, the module saves the best model itself
    enable_checkpointing=False,
    logger = wandb_logger,
    log_every_n_steps=1,
)

trainer.fit(model)

In [None]:
# Notify that the run is over. The run name lives in the W&B config (`config_wandb`);
# `config` is the BertConfig and has no `run_name` attribute.
wandb.alert(
    title=f"Finish {config_wandb.run_name}",
    text="Run is over.",
)

# Close the W&B run so that all pending data is uploaded.
wandb.finish()