# Description
The goal of this notebook is to load the dataset, tokenize and encode the data, train the model, evaluate it, and export the model.
Adapted from https://huggingface.co/docs/transformers/main/notebooks

In [17]:
from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

import math
import torch

In [18]:
torch.cuda.is_available()

True

In [19]:
torch.cuda.device_count()

1

In [20]:
torch.cuda.current_device()

0

In [21]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1660 Ti'

In [22]:
!nvidia-smi

Tue Mar 28 20:51:58 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 531.18       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 Ti      On | 00000000:01:00.0  On |                  N/A |
| N/A   51C    P5                9W /  N/A|    415MiB /  6144MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [23]:
device = torch.device("cuda")

## Preparing the dataset

In [24]:
train_data_file = "../data-push/0d-sampling/train-validate/charters-main-train-data.json"
val_data_file = "../data-push/0d-sampling/train-validate/charters-main-val-data.json"
# train_data_file = "../data-push/0d-sampling/train-validate/charters-main-SAMPLE-train-data.json"
# val_data_file = "../data-push/0d-sampling/train-validate/charters-main-SAMPLE-val-data.json"

datasets = load_dataset("json", data_files={"train": train_data_file, "validation": val_data_file})

print(datasets["train"][:1])

Found cached dataset json (/home/atzenhofer/.cache/huggingface/datasets/json/default-a1523408e295402e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/2 [00:00<?, ?it/s]

{'text': ['Wir Graf Hainreich von Schavnberg veriehen Offenlich an disem brief vnd , di in sehent, Hornt oder lesent, Das fur vns vnd fur vnsern brueder chomen ist der Erwirdig abpt datz wilhering vnd Her wernhart chelner da selben vnd der Erwerig wolbeschaiden Peter vnd hat der selb Peter pope nach vnserm rat vnd nach seiner besten vreunt rat vnd willen vnd wort aller seiner Erben dem vor geschriben erwirdigen Herren abpt dvrch sein recht notdurft vnd durch sein leibnar sein Hueb ze Strashaim , di Sein rechtz Erbaigen gewesen ist vnd di in von seinen vodern also Herman vnd das Gotzhaus ze wilhering di vorgenanten Hueb mit allen den rechten vnd gelegen sind, versuecht vnd vnuersuecht, durch di lieb vnd durch ze wilhering den vorgeschriben peter poppen begnat haben mit vntz an seinen tod in dem chloster vnd auch Hausfrawen frawen Ofmein vnd Jansen seins Suns. wer auch, das iemand , So geit der oft genant Peter Poppe Gotzhaus ze wilhering Sechtzich Phunt wienner Phenning auf der oft des 

## Masked language modeling

In [25]:
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)#, use_fast=True)

In [26]:
block_size = 256

In [27]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [28]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [29]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Loading cached processed dataset at /home/atzenhofer/.cache/huggingface/datasets/json/default-a1523408e295402e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-d303bdc26f349b55_*_of_00004.arrow
Loading cached processed dataset at /home/atzenhofer/.cache/huggingface/datasets/json/default-a1523408e295402e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-c719ab6830cb5940_*_of_00004.arrow


In [30]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=8, 
    num_proc=4,
)

Loading cached processed dataset at /home/atzenhofer/.cache/huggingface/datasets/json/default-a1523408e295402e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-5d5957b7622f6692_*_of_00004.arrow
Loading cached processed dataset at /home/atzenhofer/.cache/huggingface/datasets/json/default-a1523408e295402e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-c5821ccb797a2bab_*_of_00004.arrow


In [31]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'nd durch ze wilhering den vorgeschriben peter poppen begnat haben mit vntz an seinen tod in dem chloster vnd auch Hausfrawen frawen Ofmein vnd Jansen seins Suns. wer auch, das iemand, So geit der oft genant Peter Poppe Gotzhaus ze wilhering Sechtzich Phunt wienner Phenning auf der oft des selben an aller der stat, ob der Ens, vnd wort geschehen vnd vertaidingt an disem brief verschriben stet vnd lob laisten, stat haben vnd ze volfuren geuarde vnd dar vber zu gib ich disen brief dem oft geschriben Abbt Herman vnd dem Gotzhaus ze gnadigen Herren Graf gundem Insigel vnd mit wernhart des Hager vnd der Alhartinger. Das</s><s>Wier Niclas von Gots genaden abbt ze Garsten verjehen, umb die schaden di wir mit mue und mit zerung gen'

In [32]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [33]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [34]:
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir = f"../models/custom/{model_name}-mhg-charter-mlm-v1",
    evaluation_strategy = "epoch",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

In [35]:
trainer.train()

***** Running training *****
  Num examples = 10471
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13090
  Number of trainable parameters = 82170201
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.537,2.112094
2,2.0534,1.838937
3,1.9003,1.706654
4,1.7662,1.60797
5,1.6692,1.53234
6,1.6191,1.490333
7,1.5713,1.476035
8,1.5431,1.428958
9,1.5171,1.423216
10,1.5083,1.408235


Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-500
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-500/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1000
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1000/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 8
Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1500
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1500/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-1500/pyt

Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-12500
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-12500/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-12500/pytorch_model.bin
Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-13000
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-13000/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/checkpoint-13000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2615
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13090, training_loss=1.7973431872082994, metrics={'train_runtime': 6318.5307, 'train_samples_per_second': 16.572, 'train_steps_per_second': 2.072, 'total_flos': 6943414684124160.0, 'train_loss': 1.7973431872082994, 'epoch': 10.0})

In [36]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 2615
  Batch size = 8


Perplexity: 4.07


In [37]:
trainer.save_model(f"../models/custom/{model_name}-mhg-charter-mlm-v1")

Saving model checkpoint to ../models/custom/distilroberta-base-mhg-charter-mlm-v1
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/pytorch_model.bin


In [38]:
tokenizer.save_pretrained(f"../models/custom/{model_name}-mhg-charter-mlm-v1")
model.save_pretrained(f"../models/custom/{model_name}-mhg-charter-mlm-v1")

tokenizer config file saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/tokenizer_config.json
Special tokens file saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/special_tokens_map.json
Configuration saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/config.json
Model weights saved in ../models/custom/distilroberta-base-mhg-charter-mlm-v1/pytorch_model.bin
