In [None]:
!pip install datasets

In [None]:
import datasets

In [None]:
# Fetch the list of available datasets and preview the first five entries.
# NOTE(review): list_datasets() looks to be deprecated in newer `datasets`
# releases — confirm it still exists in the version installed above.
all_ds = datasets.list_datasets()
all_ds[:5]

In [None]:
# Load the 'unshuffled_deduplicated_bn' configuration of the 'oscar' dataset
# (presumably the Bengali subset, going by the 'bn' suffix — confirm).
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_bn')

In [None]:
# Display the loaded dataset object to inspect what it contains.
dataset

In [None]:
# Peek at the first record of the 'train' split.
dataset['train'][0]

In [None]:
from tqdm.auto import tqdm

# Dump the corpus to disk as text_0.txt, text_1.txt, ... with one sample per
# line and at most 6,000 samples per file.
text_data = []
file_count = 0

for sample in tqdm(dataset['train']):
    # strip embedded newlines so that one output line == one sample
    sample = sample['text'].replace('\n', '')
    text_data.append(sample)
    if len(text_data) == 6_000:
        # once we hit the 6K mark, save to file and start a fresh chunk
        with open(f'./text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1
# Flush the samples left over after the last full chunk. The guard avoids
# creating an empty trailing file when the corpus size is an exact multiple
# of the chunk size.
if text_data:
    with open(f'./text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

In [None]:
from pathlib import Path
# Collect only the corpus chunks written above (text_<n>.txt in the working
# directory). A recursive '**/*.txt' would also match any unrelated .txt file
# below the working directory, and glob order is arbitrary, so the result is
# sorted to make slices such as paths[:10] reproducible.
paths = sorted(str(x) for x in Path('./').glob('text_*.txt'))

In [None]:
# Preview the first ten collected file paths.
paths[:10]

In [None]:
!pip install transformers

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Create a byte-level BPE tokenizer with default settings; it is trained on
# the corpus files in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Learn the BPE vocabulary from the first ten corpus files.
# The masking cell further down hard-codes the id of '<mask>' as 4, i.e. it
# assumes the special tokens receive ids 0..4 in the order listed here.
tokenizer.train(
    files=paths[:10],
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
    min_frequency=2,
    vocab_size=30_522,
)

In [None]:
import os

# Make sure the output directory exists. exist_ok=True keeps this cell
# re-runnable: a bare os.mkdir raises FileExistsError on the second run.
os.makedirs('./bert-bn', exist_ok=True)

# Write the trained tokenizer's files into that directory.
tokenizer.save_model('bert-bn')

In [None]:
from transformers import RobertaTokenizer

# Reload the tokenizer files saved in 'bert-bn' as a RoBERTa tokenizer.
# `model_max_length` is the documented name for the sequence-length limit;
# `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizer.from_pretrained('bert-bn', model_max_length=512)

In [None]:
# Smoke test: encode a sample Bengali sentence with the reloaded tokenizer.
tokens = tokenizer('আমার জন্য এটি একটি স্মরণীয় অর্জন')

In [None]:
# Show the encoding produced for the first sample sentence.
print(tokens)

In [None]:
# Encode a second sample sentence.
token = tokenizer('আমি ইচ্ছা করলে প্রত্যেককে সঠিক দিক নির্দেশ দিতাম')

In [None]:
# Show the encoding produced for the second sample sentence.
print(token)

In [None]:
# Display just the token ids of the second encoding.
token.input_ids

In [None]:
# Read the chunk file text_1.txt and split it back into one string per line.
# `lines` is what gets tokenised for training further down.
with open('./text_1.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [None]:
# Read the chunk file text_0.txt the same way.
# NOTE(review): the names are easy to mix up — `lines_1` holds text_0.txt,
# while `lines` (previous cell) holds text_1.txt.
with open('./text_0.txt', 'r', encoding='utf-8') as fp:
    lines_1 = fp.read().split('\n')

In [None]:
# Peek at the first line read from text_0.txt.
lines_1[0]

In [None]:
# Tokenise every line of `lines`, padding each sequence to 512 tokens and
# truncating anything longer.
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)
# NOTE(review): len() of the tokenizer output presumably counts its fields
# (input_ids, attention_mask, ...) rather than the number of samples — confirm.
len(batch)

In [None]:
import torch

# Convert the tokenizer output into tensors. The unmodified token ids serve
# as the prediction targets (`labels`); `mask` is the attention mask.
labels = torch.tensor(batch['input_ids'])
mask = torch.tensor(batch['attention_mask'])

In [None]:
# Display the label tensor.
labels

In [None]:
# Build the masked-LM inputs: start from a copy of the label ids and replace
# a random ~15% of the ordinary tokens with the mask token.
input_ids = labels.detach().clone()
# one uniform random draw in [0, 1) per token position
rand = torch.rand(input_ids.shape)
# A position is eligible for masking only if its id is above 2. The tokenizer
# was trained with special_tokens ['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
# assumed to map to ids 0..4 in that order, so this leaves <s> (0), <pad> (1)
# and </s> (2) untouched.
mask_arr = (rand < .15) & (input_ids > 2)
# Boolean-mask assignment updates every selected position in one vectorised
# step; no per-row Python loop is needed.
input_ids[mask_arr] = 4  # id of the '<mask>' token (3 is '<unk>')

In [None]:
# Check the dimensions of the masked input tensor.
input_ids.shape

In [None]:
# Inspect the first 200 ids of the first row to see where masks were placed.
input_ids[0][:200]

In [None]:
# Bundle the masked inputs, the attention mask and the unmasked targets under
# the keys that the training loop later reads from each batch.
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves rows from a dict of pre-built tensors."""

    def __init__(self, encodings):
        """Keep a reference to ``encodings``.

        Args:
            encodings: mapping of field name -> indexable tensor. It must
                contain ``'input_ids'``, whose first dimension defines the
                dataset length.
        """
        self.encodings = encodings

    def __len__(self):
        """Return the number of rows, taken from ``encodings['input_ids']``."""
        ids = self.encodings['input_ids']
        return ids.shape[0]

    def __getitem__(self, i):
        """Return a dict holding element ``i`` of every field in ``encodings``."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[i]
        return sample

In [None]:
# Wrap the encodings in the torch Dataset defined above.
# NOTE(review): this rebinds `dataset`, which an earlier cell used for the
# object returned by datasets.load_dataset — the earlier cells that read
# dataset['train'] will not work if re-run after this one.
dataset = Dataset(encodings)

In [None]:
# Serve the dataset in shuffled mini-batches of 16 samples.
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
from transformers import RobertaConfig

# Architecture hyper-parameters for the model that is trained from scratch.
config = RobertaConfig(
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    # presumably 512 usable positions plus 2 reserved by RoBERTa — confirm
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=30_522,  # same value as passed to tokenizer.train
)

In [None]:
from transformers import RobertaForMaskedLM

# Build a masked-LM model from the config alone (no pretrained checkpoint is
# loaded here).
model = RobertaForMaskedLM(config)

In [None]:
# Prefer the GPU when torch reports one as available, otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the selected device; as the last expression of the cell,
# the returned module is also displayed.
model.to(device)

In [None]:
# Use torch's own AdamW: the implementation exported by transformers is
# deprecated in favour of torch.optim.AdamW.
from torch.optim import AdamW

# activate training mode
model.train()
# Initialize the optimizer. eps and weight_decay are spelled out to mirror
# the defaults of the transformers implementation this replaces (torch's
# defaults are eps=1e-8, weight_decay=1e-2) — verify if exact parity matters.
optim = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

In [None]:
epochs = 2

for epoch in range(epochs):
    # wrap the dataloader in a progress bar labelled with the current epoch
    loop = tqdm(loader, leave=True, desc=f'Epoch {epoch}')
    # Loop-local names carry a prefix so they do not overwrite the
    # notebook-level `batch`, `input_ids` and `labels` built by earlier cells;
    # re-running those cells after training would otherwise silently operate
    # on a single mini-batch.
    for train_batch in loop:
        # reset gradients accumulated during the previous step
        optim.zero_grad()
        # move the tensors needed for this step to the training device
        batch_input_ids = train_batch['input_ids'].to(device)
        batch_attention_mask = train_batch['attention_mask'].to(device)
        batch_labels = train_batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        loss = outputs.loss
        # back-propagate: compute gradients for every trainable parameter
        loss.backward()
        # update parameters
        optim.step()
        # show the current loss on the progress bar
        loop.set_postfix(loss=loss.item())

In [None]:
# Save the trained model into the same directory that already holds the
# tokenizer files, so both can be loaded from 'bert-bn'.
model.save_pretrained('./bert-bn')

In [None]:
from transformers import pipeline

In [None]:
# Build a fill-mask pipeline from the model and tokenizer saved in 'bert-bn'.
fill = pipeline('fill-mask', model='bert-bn', tokenizer='bert-bn')

In [None]:
# Ask the model to fill in the final word of a Bengali phrase; the mask token
# is taken from the pipeline's tokenizer rather than hard-coded.
fill(f'বুড়ি মরেছে ভালোই {fill.tokenizer.mask_token}')