In [None]:
import os
# Restrict this process to GPU index 0. Set before the torch import in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch
# Cell output: name of the current CUDA device.
torch.cuda.get_device_name()

In [None]:
import wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wordpiece-bert-project-epoch24",
    
    # track hyperparameters and run metadata
    # NOTE(review): these values are logged metadata only; nothing below reads
    # them back. "epochs": 24 does not match num_train_epochs = 6 in the
    # TrainingArguments cell -- confirm which one is intended.
    config={
    "learning_rate": 2e-5,
    "weight_decay" : 0.01,
    "architecture": "wordpiece-bert-base",
    "dataset": "bdnews24",
    "epochs": 24,
    "batch size": 64,
    }
)


In [None]:
from datasets import load_dataset

# Load the raw corpus file with the "text" loader as a single "train" split.
bdnews_dataset = load_dataset("text", data_files="datasets/Bangla BDnews.txt", split="train")

In [None]:
# Temporarily switch the output format to pandas for the exploratory statistics below.
bdnews_dataset.set_format("pandas")

In [None]:
# Materialize the whole dataset (a pandas object while the pandas format is active).
bdnews_df = bdnews_dataset[:]

In [None]:
# Length of every text entry, in characters.
bdnews_df_lens = bdnews_df['text'].str.len()

In [None]:
# Cell output: length of the longest entry, in characters.
max(bdnews_df_lens)

In [None]:
# pretraining_df_lens = pretraining_df['text'].str.len()
# Number of entries per whitespace-separated word count.
count = bdnews_df['text'].str.split().apply(len).value_counts()

In [None]:
# Relabel the index as "<n> words:". The index is made of strings after this,
# so sort_index orders it lexicographically ("10 words:" sorts before "2 words:").
count.index = count.index.astype(str) + ' words:'
count.sort_index(inplace=True)

In [None]:
count

In [None]:
# Undo the pandas format set above before the dataset is used for tokenization.
bdnews_dataset.reset_format()

In [None]:
bdnews_dataset

In [None]:
bdnews_dataset[:5]

In [None]:
def get_training_corpus():
    """Yield the corpus as successive lists of up to 1000 text entries.

    Reads from the notebook-level `bdnews_dataset`; each yielded value is the
    "text" column of one 1000-row slice, which is the batch shape consumed by
    `tokenizer.train_from_iterator`.
    """
    total_rows = len(bdnews_dataset)
    start = 0
    while start < total_rows:
        yield bdnews_dataset[start : start + 1000]["text"]
        start += 1000

In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [None]:
# Start from an empty WordPiece model; out-of-vocabulary pieces map to "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [None]:
# Normalization is NFKC only (no other normalizers are in the sequence).
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKC()]
)

In [None]:
# Sanity check: show the normalizer's output on a sample sentence.
print(tokenizer.normalizer.normalize_str('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।'))

In [None]:
# Split text with the BERT pre-tokenizer before WordPiece is applied.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [None]:
# Sanity check: show the pre-tokenizer's output on the same sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।')

In [None]:
# Special tokens handed to the trainer, with a target vocabulary of 30522 entries.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30522, special_tokens=special_tokens
)

In [None]:
# Train the vocabulary on the corpus, streamed in batches by get_training_corpus().
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
# Look up the trained ids of [CLS]/[SEP]; the post-processor template below needs them.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

In [None]:
# Wrap encodings as "[CLS] A [SEP]" (type id 0) and, for pairs,
# "[CLS] A [SEP] B [SEP]" with the second segment on type id 1.
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [None]:
# Sanity check: encode a sentence pair and print its tokens and segment type ids.
encoding = tokenizer.encode("এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে প্রচারণা ।","সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ ।")
print(encoding.tokens)
print(encoding.type_ids)

In [None]:
# Decoder configured with the "##" continuation prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
# Sanity check: decode the pair encoded above.
tokenizer.decode(encoding.ids)

In [None]:
# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# NOTE(review): this save is commented out, but the PreTrainedTokenizerFast cell
# below reads "wordpiece_tokenizer_bdnews.json" from disk. On a fresh checkout
# that file must already exist from an earlier session.
# tokenizer.save("wordpiece_tokenizer_bdnews.json")

In [None]:
# NOTE(review): the file name says "unigram" although this notebook builds a
# WordPiece tokenizer -- presumably left over from a sibling notebook.
# tokenizer = Tokenizer.from_file("unigram_tokenizer_bdnews.json")

In [None]:
# from transformers import BertTokenizerFast

# tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the tokenizer for use with transformers, loading it from the JSON file
# on disk rather than from the in-memory object trained above. This rebinds
# the name `tokenizer`.
tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="wordpiece_tokenizer_bdnews.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    # NOTE(review): confirm this constructor kwarg has an effect; it is more
    # commonly passed when the tokenizer is called.
    return_special_tokens_mask = True,
    model_max_length = 512,
)

In [None]:
# tokenizer.save_pretrained("wordpiece_tokenizer_bdnews")

In [None]:
# # from transformers import BertTokenizerFast
from transformers import PreTrainedTokenizerFast

# Rebind `tokenizer` again, this time to whatever tokenizer is stored in the
# model output directory. That directory must already contain a saved tokenizer.
# tokenizer = PreTrainedTokenizerFast.from_pretrained("wordpiece_tokenizer_bdnews")
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [None]:
from transformers import BertConfig, BertForMaskedLM


# Set a configuration for our BERT model. Only pad_token_id is overridden;
# every other field keeps the BertConfig default.
# NOTE(review): vocab_size is not set explicitly -- confirm the default matches
# the size of the loaded tokenizer's vocabulary.
# NOTE(review): the variable is named "unigram_..." although this notebook uses
# a WordPiece tokenizer.
unigram_bert_config = BertConfig(pad_token_id=tokenizer.pad_token_id)

# Building the model from the config
# Model is randomly initialized
model = BertForMaskedLM(unigram_bert_config)

print(unigram_bert_config)

In [None]:
# Prompt with a single [MASK] slot, used by the mask-filling smoke test below.
text = "এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে [MASK] ।"

In [None]:
import torch 

# Encode the prompt as PyTorch tensors (kept on the CPU, like the model at this point).
inputs = tokenizer(text, return_tensors="pt")
# inputs.to("cuda")

# This is inference only: switch the model to eval mode so dropout is disabled
# (a model built from a config starts in training mode, which makes the
# predictions vary between runs) and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Print the prompt once per candidate, with [MASK] replaced by the decoded token.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rows to fixed-length (128 token) encodings.

    Args:
        examples: a batch mapping with a "text" column.

    Returns:
        The tokenizer's batch encoding, padded/truncated to 128 tokens. When
        the notebook-level `tokenizer` is a fast tokenizer, a "word_ids" entry
        is added holding `word_ids(i)` for every row of the batch.
    """
    encoded = tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True)
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" column is dropped; only the tokenizer outputs are kept.
tokenized_datasets = bdnews_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

In [None]:
# NOTE(review): the result is only displayed, never assigned, so
# `tokenized_datasets` still contains "token_type_ids" afterwards (compare the
# assignment form used later for "word_ids"). Do not "fix" this in isolation:
# the eval-set cell further down renames "masked_token_type_ids" and relies on
# the column still being present.
tokenized_datasets.remove_columns("token_type_ids")

In [None]:
tokenized_datasets[0]

In [None]:
def group_texts(examples):
    """Add a "labels" column that is a copy of the batch's "input_ids".

    Despite the name, no grouping or chunking happens here. The batch is
    modified in place and also returned. The copy is shallow: the outer
    container is new, the per-row sequences are shared with "input_ids".
    """
    token_id_rows = examples["input_ids"]
    examples["labels"] = token_id_rows.copy()
    return examples

In [None]:
# Add the "labels" column (a copy of "input_ids") to every row.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
# Sanity check: decode one row's inputs ...
tokenizer.decode(lm_datasets[1]["input_ids"])

In [None]:
# ... and its labels, which are identical at this stage (masking happens in the collator).
tokenizer.decode(lm_datasets[1]["labels"])

In [None]:
# Cell output: id of the [MASK] token.
tokenizer.mask_token_id

In [None]:
import collections
import numpy as np
# NOTE(review): the bare `np.random` expression below has no effect (it is not
# even the cell's last expression, so nothing is displayed); it can be removed.
np.random
from transformers import default_data_collator

# Probability with which bangla_data_collator selects each token position for
# masking. The "wwm" prefix refers to whole-word masking, but the word-grouping
# code in the collator is commented out, so selection is per token.
wwm_probability = 0.15


def bangla_data_collator(features):
    """Apply random token-level [MASK] replacement and collate a batch.

    Every position of every feature is selected independently with
    probability `wwm_probability` (a draw from NumPy's global `np.random`
    state). A selected position that does not hold a special token
    ([UNK], [PAD], [CLS], [SEP], [MASK]) has its input id replaced by the
    mask token id and its label set to the original label; all other label
    positions are set to -100.

    Whole-word masking is not performed: no "word_ids" information is used.

    Args:
        features: list of mappings, each with at least "input_ids" and
            "labels" sequences of equal length. The mappings and their
            sequences are not modified; masked copies are collated instead.

    Returns:
        The result of `default_data_collator` on the masked copies.
    """
    mask_token_id = tokenizer.mask_token_id
    # Positions holding these ids are never masked and never scored.
    # Loop-invariant, so built once per batch rather than once per feature.
    special_token_ids = (
        tokenizer.unk_token_id,
        tokenizer.pad_token_id,
        tokenizer.cls_token_id,
        tokenizer.sep_token_id,
        mask_token_id,
    )

    masked_features = []
    for feature in features:
        # Work on a copy so the caller's sample is not altered as a side effect.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        selected = np.random.binomial(1, wwm_probability, (len(input_ids),))

        new_labels = [-100] * len(labels)
        for idx in np.where(selected)[0]:
            if input_ids[idx] not in special_token_ids:
                new_labels[idx] = labels[idx]
                input_ids[idx] = mask_token_id

        # Assign the labels unconditionally. Doing this inside the loop above
        # left the full, unmasked labels in place whenever no position was drawn.
        masked_feature = dict(feature)
        masked_feature["input_ids"] = input_ids
        masked_feature["labels"] = new_labels
        masked_features.append(masked_feature)

    return default_data_collator(masked_features)

In [None]:
from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# The custom collator does not use "word_ids", so the column is dropped before
# batching. Guarded so that re-running this cell, or running with a tokenizer
# that never produced the column, does not raise.
if "word_ids" in lm_datasets.column_names:
    lm_datasets = lm_datasets.remove_columns(["word_ids"])
data_collator = bangla_data_collator

In [None]:
# Demo: run the collator on the first row and decode the masked inputs.
samples = [lm_datasets[i] for i in range(1)]
# for sample in samples:
#     _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Demo: same row again, showing the raw input ids next to the labels.
# Masking is random, so the positions generally differ from the cell above.
samples = [lm_datasets[i] for i in range(1)]

chunk = data_collator(samples)
print(chunk["input_ids"])
print(chunk["labels"])

In [None]:
# train_size = 10_000
# test_size = int(0.1 * train_size)

# Despite the name, nothing is downsampled: the full dataset is split with
# 80% of the rows going to "train", using a fixed seed.
downsampled_dataset = lm_datasets.train_test_split(
    train_size=0.8, seed=42
)
downsampled_dataset

In [None]:
# Print the token length of the first three training rows.
for idx, sample in enumerate(downsampled_dataset["train"]["input_ids"][:3]):
    print(f"'>>> Article {idx} length: {len(sample)}'")

In [None]:
#  disable weights and biases logging
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch. Clamped to at least 1 so that a
# training split smaller than one batch does not produce logging_steps == 0.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)


# NOTE(review): the wandb config at the top of the notebook records 24 epochs,
# while this run trains for 6 -- confirm which value is intended.
training_args = TrainingArguments(
    num_train_epochs = 6,
    #report_to = None,
    output_dir="models/wordpiece/bert-base-pretrained-bdnews24",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    # Evaluation and checkpointing both happen once per epoch.
    load_best_model_at_end=True,
    save_strategy = "epoch",
)

In [None]:
from transformers import Trainer

# NOTE: this rebinds `trainer`, which earlier held the tokenizers
# WordPieceTrainer used to train the vocabulary.
# The collator masks randomly on every call, so training and evaluation
# batches both get fresh masks each time they are drawn.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
import math

# Baseline evaluation before trainer.train() is called. The first line reports
# the raw evaluation loss (it was previously mislabelled as perplexity); the
# second reports the perplexity, exp(loss).
eval_results = trainer.evaluate()
print(f">>> Loss: {eval_results['eval_loss']:.2f}")
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Run the training configured in TrainingArguments.
trainer.train()

In [None]:
# Re-evaluate after training and report the perplexity, exp(eval loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Save the trainer's model; with no argument it uses the trainer's configured output_dir.
trainer.save_model()

In [None]:
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

In [None]:
# Reload the model from the output directory, rebinding `model`.
model = BertForMaskedLM.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [None]:
# Reload the tokenizer from the same directory, rebinding `tokenizer`.
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [None]:
def insert_random_mask(batch):
    """Mask a batch once and return the result under "masked_"-prefixed keys.

    Args:
        batch: column-oriented mapping (column name -> list of row values),
            as passed by a batched `Dataset.map`.

    Returns:
        A dict with one "masked_<name>" NumPy array per entry produced by the
        notebook-level `data_collator`.
    """
    column_names = list(batch)
    # Turn the column-oriented batch into one dict per row for the collator.
    features = [dict(zip(column_names, row)) for row in zip(*batch.values())]
    collated = data_collator(features)
    return {"masked_" + name: tensor.numpy() for name, tensor in collated.items()}

In [None]:
# downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Build an evaluation set whose masks are drawn once and then stored: every
# original column is replaced by its "masked_" counterpart from the collator.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names the model expects.
# NOTE: the "masked_token_type_ids" entry requires "token_type_ids" to still be
# present in the test split.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
        'masked_token_type_ids' : 'token_type_ids'
    }
)

In [None]:
eval_dataset[0]

In [None]:
# Display-only check of ten rows.
# NOTE(review): `data_collator` is the random masking collator, so these
# already-masked rows are masked a second time before being decoded. The
# printed text therefore shows more [MASK] tokens than eval_dataset stores.
samples = [eval_dataset[i] for i in range(10)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Consistency check: in every stored evaluation row, the positions holding the
# [MASK] token must be exactly the positions that carry a real label
# (label != -100). Prints the index of every row where the two sets differ.
# The mask id is taken from the tokenizer instead of being hard-coded, and each
# row is fetched once instead of materialising the whole "input_ids" column.
mask_token_id = tokenizer.mask_token_id
for idx, example in enumerate(eval_dataset):
    indexes = [i for i, x in enumerate(example['input_ids']) if x == mask_token_id]
    references = [i for i, x in enumerate(example['labels']) if x != -100]
    if indexes != references:
        print(idx)

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# NOTE: rebinds the notebook-level `batch_size` (64 in the TrainingArguments cell).
batch_size = 4
# Training batches are masked randomly on the fly by the custom collator.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Evaluation rows are already masked, so they are only collated, not re-masked.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
downsampled_dataset["train"]

In [None]:
downsampled_dataset["test"]

In [None]:
# from transformers import BertConfig, BertForMaskedLM

# # Building the config
# config = BertConfig()

# # Building the model from the config
# # Model is randomly initialized
# model = BertForMaskedLM(config)

In [None]:
# Reload tokenizer and model from the Trainer's output directory, so the manual
# loop below starts from whatever is saved there.
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [None]:
model = BertForMaskedLM.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [None]:
from torch.optim import AdamW

# NOTE: learning rate here is 5e-5, not the 2e-5 used in TrainingArguments.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# from accelerate import Accelerator

# accelerator = Accelerator()
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [None]:
from transformers import get_scheduler

# One optimizer step per training batch, for a single epoch.
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup over all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Separate output directory for the manual loop ("-static" suffix), so the
# Trainer's output directory is not overwritten.
output_dir = "models/wordpiece/bert-base-pretrained-bdnews24-static"

In [None]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        #accelerator.backward(loss)

        # Update the weights, advance the LR schedule, then clear the
        # gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        #losses.append(accelerator.gather(loss.repeat(batch_size)))
        # The batch loss is repeated batch_size times so that, after trimming
        # below, each evaluation row contributes one entry to the mean.
        losses.append(loss.repeat(batch_size))

    losses = torch.cat(losses)
    # Drop the surplus entries contributed by a final batch shorter than batch_size.
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp overflows for very large losses; report infinity instead.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Loss: {torch.mean(losses)}")
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    # Model and tokenizer are written to output_dir at the end of every epoch.
    #accelerator.wait_for_everyone()
    #unwrapped_model = accelerator.unwrap_model(model)
    #unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    #if accelerator.is_main_process:
        #tokenizer.save_pretrained(output_dir)

In [None]:
import torch
# Ask PyTorch to release the GPU memory it is caching but no longer using.
torch.cuda.empty_cache()