In [9]:
# https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6
# follows roughly this tutorial

In [66]:
# pip install datasets transformers torch torchvision torchaudio tensorboard

import os, gc
import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizerFast
from transformers import RobertaConfig
from transformers import RobertaModel
from transformers import RobertaForMaskedLM
from torch.optim import AdamW
from torch.utils import tensorboard
from transformers import pipeline

In [67]:
max_length = 512

In [68]:
dataset = load_dataset('yelp_review_full')

Reusing dataset yelp_review_full (/home/carlsonp/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

In [69]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [65]:
dataset['train']['text'][:3]

["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patient

In [70]:
if not os.path.isdir(f'./data/text/yelp_review_full/'):
    text_data = []
    file_count = 0

    for sample in tqdm(dataset['train']):
        sample = sample['text'].replace('\n', '')
        text_data.append(sample)
        if len(text_data) == 10_000:
            # once we get to the 10K mark, save to file
            with open(f'./data/text/yelp_review_full/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(text_data))
            text_data = []
            file_count += 1
    # after saving in 10K chunks, we save whatever is left
    with open(f'./data/text/yelp_review_full/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

In [71]:
paths = [str(x) for x in Path('./data/text/yelp_review_full').glob('**/*.txt')]

In [72]:
paths[:5]

['data/text/yelp_review_full/text_0.txt',
 'data/text/yelp_review_full/text_1.txt',
 'data/text/yelp_review_full/text_10.txt',
 'data/text/yelp_review_full/text_11.txt',
 'data/text/yelp_review_full/text_12.txt']

In [73]:
tokenizer = ByteLevelBPETokenizer()

In [74]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])






In [75]:
tokenizer.save_model('model')

['model/vocab.json', 'model/merges.txt']

In [76]:
# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizerFast.from_pretrained('model', max_len=max_length)

In [77]:
# test our tokenizer on a simple sentence
tokens = tokenizer('hello, how are you?')

In [78]:
tokens

{'input_ids': [0, 15185, 16, 693, 381, 337, 35, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [79]:
tokens.input_ids

[0, 15185, 16, 693, 381, 337, 35, 2]

In [80]:
def mlm(tensor):
    rand = torch.rand(tensor.shape) # [0, 1]
    # mask roughly 15% of tokens
    mask_arr = (rand < 0.15) * (tensor > 2) # special tokens are 0, 1, and 2 so we grab everything above
    # result of mask_arr is an array of True or False values
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist() # flatten removes the list within a list [[]]
        tensor[i, selection] = 4 # the <mask> index token from the ./model/vocab.json is 4
    return tensor

In [81]:
input_ids = []
mask = []
labels = []

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    sample = tokenizer(lines, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))
        

  0%|          | 0/66 [00:00<?, ?it/s]

In [82]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [83]:
input_ids[0][:10]

tensor([   0, 5530,   18, 4725, 7581, 3558, 1070,  398,  686,  324])

In [84]:
labels[0][:10]

tensor([   0, 5530,   18, 4725, 7581, 3558, 1070,  398,  686,  324])

In [85]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

In [86]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [87]:
dataset = Dataset(encodings)

In [88]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [89]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fafd82e20b0>

In [90]:
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings= max_length + 2,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [91]:
model = RobertaForMaskedLM(config)

In [92]:
# try not to use the CPU, it takes forever
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# and move our model over to the selected device
model.to(device)

cpu


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [93]:
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

In [94]:
# use tensorboard for visualizing results
writer = torch.utils.tensorboard.SummaryWriter()

In [95]:
# clear the memory from the GPU
gc.collect()
torch.cuda.empty_cache()

In [96]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [97]:
epochs = 1
step = 0

# one epoch takes about 8 hours to train on GPU

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # write data for tensorboard
        writer.add_scalar('Loss/train', loss, step)
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        step += 1

  0%|          | 0/81251 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [42]:
# pip install tensorboard
# $: tensorboard --logdir=runs
# http://localhost:6006

In [43]:
model.save_pretrained('./model')

In [45]:
fill = pipeline('fill-mask', model='./model/', tokenizer='./model/')

In [46]:
fill(f'how {fill.tokenizer.mask_token} you?')

[{'score': 0.7861970663070679,
  'token': 381,
  'token_str': ' are',
  'sequence': 'how are you?'},
 {'score': 0.03877374157309532,
  'token': 325,
  'token_str': ' is',
  'sequence': 'how is you?'},
 {'score': 0.031562838703393936,
  'token': 488,
  'token_str': ' can',
  'sequence': 'how can you?'},
 {'score': 0.022986553609371185,
  'token': 376,
  'token_str': ' were',
  'sequence': 'how were you?'},
 {'score': 0.020253395661711693,
  'token': 436,
  'token_str': ' do',
  'sequence': 'how do you?'}]

In [47]:
fill(f'what {fill.tokenizer.mask_token} you think?')

[{'score': 0.7091174721717834,
  'token': 436,
  'token_str': ' do',
  'sequence': 'what do you think?'},
 {'score': 0.22297416627407074,
  'token': 488,
  'token_str': ' can',
  'sequence': 'what can you think?'},
 {'score': 0.01868179254233837,
  'token': 602,
  'token_str': ' could',
  'sequence': 'what could you think?'},
 {'score': 0.007553635630756617,
  'token': 457,
  'token_str': ' would',
  'sequence': 'what would you think?'},
 {'score': 0.006485422607511282,
  'token': 464,
  'token_str': ' did',
  'sequence': 'what did you think?'}]

In [54]:
fill(f'what {fill.tokenizer.mask_token} is it?')

[{'score': 0.2606864869594574,
  'token': 1015,
  'token_str': ' else',
  'sequence': 'what else is it?'},
 {'score': 0.06429751962423325,
  'token': 1163,
  'token_str': ' hard',
  'sequence': 'what hard is it?'},
 {'score': 0.03606041520833969,
  'token': 566,
  'token_str': ' more',
  'sequence': 'what more is it?'},
 {'score': 0.035212013870477676,
  'token': 407,
  'token_str': ' good',
  'sequence': 'what good is it?'},
 {'score': 0.029709896072745323,
  'token': 2568,
  'token_str': ' worse',
  'sequence': 'what worse is it?'}]