<a href="https://colab.research.google.com/github/daliarod96/DAPT-MLM-BERT/blob/main/pre_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Transformers library
# I'm using an old version because newer versions were not compatible with
# the Docker container that I used to run this program
!pip install transformers==2.3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import necessary libraries 
import pandas as pd 
import torch
import torch.nn as nn
import transformers

In [3]:
# creat a custom Tweets Dataset class 
class TweetsDataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings['input_ids'])


In [4]:
# Import and process dataset
# profanities dataset of 5.5 million tweets containing commonly used slurs in Spanish
text = pd.read_csv("/content/drive/MyDrive/myModels/profanitydatasetprocessed")
text = text['content']
text = text.dropna()
text = text.astype('str')
text = text.tolist()

#I'm just using 1000 samples to expedite processing 
text = text[:100]


In [9]:
# create the tokenizer and load the pre-trained model

from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", do_lower_case=True)
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

In [6]:
# tokenize text for training

result = tokenizer.batch_encode_plus(text)
chunk_size = 128

def group_texts(examples):
  # concatenate all texts
  concatenated_examples = {'input_ids':[], 'token_type_ids':[], 'attention_mask':[]}
  for i in range(len(examples['input_ids'])):
    concatenated_examples['input_ids']+=examples['input_ids'][i]
  for i in range(len(examples['token_type_ids'])):
    concatenated_examples['token_type_ids']+=examples['token_type_ids'][i]
  for i in range(len(examples['attention_mask'])):
    concatenated_examples['attention_mask']+=examples['attention_mask'][i]

  # compute length of concatenated texts
  total_length = len(concatenated_examples['input_ids'])

  # We drop the last chunk if it's smaller than chunk_size
  total_length = (total_length // chunk_size) * chunk_size

  # Split by chunks of max_len
  result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

  # Create a new labels column
  result["labels"] = result["input_ids"].copy()


  result['input_ids'] = torch.tensor(result['input_ids'])
  result['token_type_ids'] = torch.tensor(result['token_type_ids'])
  result['attention_mask'] = torch.tensor(result['attention_mask'])
  result['labels'] = torch.tensor(result['labels'])

  
  return result

inputs = group_texts(result)




"""
We want to create our mask. 
Each token that is not a special token has a 15% chance of being masked.
We don't want to mask [CLS], [SEP], or padding tokens. 
Correspond to the numbers 4, 5, and 1.
"""

rand = torch.rand(inputs['input_ids'].shape)

mask_arr = (rand < 0.15) * (inputs['input_ids'] != 4) * (inputs['input_ids'] != 5)* (inputs['input_ids'] != 1)

# masked tokens

selection = []

for i in range(mask_arr.shape[0]):
  selection.append(
    torch.flatten(mask_arr[i].nonzero()).tolist()
  )

for i in range(mask_arr.shape[0]):
  inputs['input_ids'][i,selection[i]] = 0

dataset = TweetsDataset(inputs)

# pass dataset into DataLoader
dataloader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle=True)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

model.train()

from torch.optim import AdamW

#optimizer
optim = AdamW(model.parameters(), lr=5e-5, weight_decay = 0.01)

In [7]:
# Training loop

from tqdm import tqdm

epochs = 1

for epoch in range(epochs):
  loop = tqdm(dataloader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels= batch['labels'].to(device)
    

    outputs = model(input_ids, attention_mask=attention_mask,
                   lm_labels=labels)
    loss, prediction_scores = outputs[:2]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it, loss=11.6]


In [8]:
# save your model
save_directory = "/content/drive/MyDrive/myModels"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)