# Description
This notebook loads the charter dataset, tokenizes and encodes it, fine-tunes a pretrained masked language model on it, evaluates the result, and exports the trained model and tokenizer.
Adapted from https://huggingface.co/docs/transformers/main/notebooks

In [None]:
from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

import math
import torch

In [None]:
# Check whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

In [None]:
# Number of CUDA devices PyTorch reports.
torch.cuda.device_count()

In [None]:
# Index of the currently selected CUDA device, or None when CUDA is unavailable
# (querying the current device without CUDA raises instead of returning).
torch.cuda.current_device() if torch.cuda.is_available() else None

In [None]:
# Name of CUDA device 0; guarded so the cell still runs on a machine without CUDA.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device available"

In [None]:
# IPython shell escape: run nvidia-smi and show its report for this machine.
!nvidia-smi

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# later model.to(device) call does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preparing the dataset

In [None]:
# Paths to the pre-split training and validation JSON files.
train_data_file = "../data-push/0d-sampling/train-validate/charters-main-train-data.json"
val_data_file = "../data-push/0d-sampling/train-validate/charters-main-val-data.json"

# Map each split name to its file, then load both splits with the "json" loader.
data_files = {"train": train_data_file, "validation": val_data_file}
datasets = load_dataset("json", data_files=data_files)

# Show the first training record as a quick sanity check.
print(datasets["train"][:1])

## Masked language modeling

In [None]:
# Pretrained checkpoint to start from; the tokenizer is loaded from the same
# checkpoint id so that it matches the model loaded later in the notebook.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Number of tokens in each fixed-size chunk produced by group_texts.
block_size = 256

In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example sequences
            (the batched form). Must contain an "input_ids" column.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items (``block_size`` is read from module
        scope), plus a "labels" entry that is a shallow copy of the
        "input_ids" chunk list. Trailing items that do not fill a complete
        block are dropped.
    """
    # Local import keeps the function self-contained when it is shipped to
    # worker processes.
    from itertools import chain

    # Flatten each column in linear time; sum(list_of_lists, []) builds a new
    # list for every addition and is therefore quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a multiple of block_size so that every chunk is full.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook's tokenizer.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    # NOTE(review): truncation=True presumably cuts each text at the
    # tokenizer's maximum length before the texts are grouped into blocks;
    # confirm that dropping the remainder of long texts is intended.
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
# Regroup the tokenized examples into fixed-size blocks. group_texts is applied
# to batches of 16 examples, so the leftover tokens of each batch that do not
# fill a whole block are dropped by it.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=16, num_proc=4)

In [None]:
# Sanity check: decode the second grouped training block back into text.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

In [None]:
# Load the pretrained checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Collator handed to the Trainer; mlm_probability=0.15 is presumably the
# fraction of tokens selected for masking in each batch (confirm in the docs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Bare model name: everything after the last "/" of the checkpoint id.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration: evaluate once per epoch, 15 epochs in total.
training_args = TrainingArguments(
    output_dir=f"../models/custom/{model_name}-mhg-charter-mlm-v1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,
)

# Move the model to the selected device before handing it to the Trainer.
model_on_device = model.to(device)

trainer = Trainer(
    model=model_on_device,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the validation split and report perplexity, i.e. exp(eval loss).
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Save the trained model to the same directory that was used as output_dir.
trainer.save_model(f"../models/custom/{model_name}-mhg-charter-mlm-v1")

In [None]:
# Export the tokenizer and the model side by side in one directory.
# NOTE(review): the model is also written to this path by the earlier
# trainer.save_model call; this second save looks redundant, confirm.
export_dir = f"../models/custom/{model_name}-mhg-charter-mlm-v1"
tokenizer.save_pretrained(export_dir)
model.save_pretrained(export_dir)