# **DO NOT TRY TO RUN THE TRAINING UNLESS YOU HAVE A LOT OF FREE SPACE IN YOUR DRIVE (40+ GB PER MODEL), GPU AVAILABLE AND 3H+ TIME TO SPARE (200H+ WITHOUT GPU)**

# Drive setup

Necessary files will be stored in Google Drive.

The code assumes that you have a folder called "Colab Notebooks" with a subfolder "data" inside it.

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive "data" folder; the relative paths used in later cells resolve against it.
%cd /content/drive/MyDrive/Colab Notebooks/data/

/content/drive/MyDrive/Colab Notebooks/data


# Setup

Get the necessary files (based on code by Marko Nippula)

In [None]:
# @title Get resources from GitHub egy-gaps
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from IPython.display import clear_output
from tqdm.notebook import tqdm, tqdm_notebook
import requests
import os

EGY_PATH='https://raw.githubusercontent.com/annasahola/egy-gaps/main/'

def handle_tokenizer_directory(dir):
  print('Creating directory...')
  !mkdir {dir}
  print('Downloading added_tokens.json...')
  !curl -s -L {EGY_PATH}/tokenizers/m-bert/{dir}/added_tokens.json > ./{dir}/added_tokens.json
  print('Downloading special_tokens_map.json...')
  !curl -s -L {EGY_PATH}/tokenizers/m-bert/{dir}/special_tokens_map.json > ./{dir}/special_tokens_map.json
  print('Downloading tokenizer_config.json...')
  !curl -s -L {EGY_PATH}/tokenizers/m-bert/{dir}/tokenizer_config.json > ./{dir}/tokenizer_config.json
  print('Downloading vocab.txt...')
  !curl -s -L {EGY_PATH}/tokenizers/m-bert/{dir}/vocab.txt > ./{dir}/vocab.txt

for x in tqdm_notebook(range(3), desc="Downloading"):
  if x == 0:
    print('== Tokenizers ==')

    if os.path.exists('./m-bert-aes-harmonized-tokenizer') != True:
      print('Downloading ./m-bert-aes-harmonized-tokenizer...')
      handle_tokenizer_directory('m-bert-aes-harmonized-tokenizer')
    else:
      print('./m-bert-aes-harmonized-tokenizer already exists')

    if os.path.exists('./m-bert-ramses-tokenizer') != True:
      print('Downloading ./m-bert-ramses-tokenizer...')
      handle_tokenizer_directory('m-bert-ramses-tokenizer')
    else:
      print('./m-bert-ramses-tokenizer already exists')

    if os.path.exists('./m-bert-combined-tokenizer') != True:
      print('Downloading ./m-bert-combined-tokenizer...')
      handle_tokenizer_directory('m-bert-combined-tokenizer')
    else:
      print('./m-bert-combined-tokenizer already exists')

  if x == 1:
    print('== Training files ==')

    if os.path.exists('all_train_id.txt') != True:
      print('Downloading all_train_id.txt...')
      !curl -s -O -L {EGY_PATH}/preprocessing/final_files/intact/dev/harmonized/all_train_id.txt
    else:
      print('all_train_id.txt already exists')

    if os.path.exists('ramses_train.txt') != True:
      print('Downloading ramses_train.txt...')
      !curl -s -L {EGY_PATH}/data/marete-ramses/aligned/aligned_transliterations_intact_train.txt > ramses_train.txt
    else:
      print('ramses_train.txt already exists')

    if os.path.exists('combined_train.txt') != True:
      print('Downloading combined_train.txt...')
      !curl -s -O -L {EGY_PATH}/data/marete-ramses/aligned/combined_train.txt
    else:
      print('combined_train.txt already exists')

  if x == 2:
    print('== Validation files ==')

    if os.path.exists('all_val_id.txt') != True:
      print('Downloading all_val_id.txt...')
      !curl -s -O -L {EGY_PATH}/preprocessing/final_files/intact/dev/harmonized/all_val_id.txt
    else:
      print('all_val_id.txt already exists')

    if os.path.exists('ramses_val.txt') != True:
      print('Downloading ramses_val.txt...')
      !curl -s -L {EGY_PATH}/data/marete-ramses/aligned/aligned_transliterations_intact_val.txt > ramses_val.txt
    else:
      print('ramses_val.txt already exists')

    if os.path.exists('combined_val.txt') != True:
      print('Downloading combined_val.txt...')
      !curl -s -O -L {EGY_PATH}/data/marete-ramses/aligned/combined_val.txt
    else:
      print('combined_val.txt already exists')

Downloading:   0%|          | 0/3 [00:00<?, ?it/s]

== Tokenizers ==
./m-bert-aes-harmonized-tokenizer already exists
./m-bert-ramses-tokenizer already exists
./m-bert-combined-tokenizer already exists
== Training files ==
all_train_id.txt already exists
Downloading ramses_train.txt...
combined_train.txt already exists
== Validation files ==
all_val_id.txt already exists
Downloading ramses_val.txt...
combined_val.txt already exists


GPU

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr 10 20:33:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install accelerate>=0.21.0 transformers

In [None]:
import torch
from transformers import BertTokenizer, Trainer, TrainingArguments, AutoModelForMaskedLM, pipeline
import random

# M-BERT

# Train

## Custom Dataset

In [None]:
class AESDataset(torch.utils.data.Dataset):
  """Masked-language-modelling dataset built from a one-sentence-per-line text file.

  The whole file is tokenized once in the constructor and a masking pass is
  applied up front: about 15 % of the non-[CLS]/[SEP]/[PAD] positions are
  selected; of those, 80 % become [MASK], 10 % are replaced by a random token
  from the tokenizer's added vocabulary and 10 % are left unchanged.

  Attributes:
    encodings: The tokenizer output, with `input_ids` already corrupted.
    labels: Tensor of the original (uncorrupted) token ids.
  """

  def __init__(self, file_name, tokenizer, id_included = True, loss_on_masked_only = False):
    """Read, tokenize and mask the sentences in `file_name`.

    Args:
      file_name: Path of a UTF-8 text file with one sentence per line.
      tokenizer: Tokenizer that is callable on a list of strings and provides
        `convert_tokens_to_ids` and `get_added_vocab`.
      id_included: If True, the first whitespace-separated token of every
        line is treated as an ID and dropped from the sentence.
      loss_on_masked_only: If True, labels of positions that were not
        selected for masking are set to -100, the index ignored by the
        cross-entropy loss, so that only selected positions contribute.
        The default (False) keeps the original behaviour: the label of every
        position, padding included, is its original token id.
    """
    # Read inside a context manager so the handle is closed; str.split()
    # already discards the trailing newline.
    lines = []
    with open(file_name, "r", encoding="utf-8") as file:
      for line in file:
        line_parts = line.split()
        if id_included:
          # Drop the leading ID token (a blank line simply yields an empty sentence).
          line_parts = line_parts[1:]
        lines.append(" ".join(line_parts))

    tokenized_input = tokenizer(lines, truncation=True, padding=True, return_tensors='pt')
    input_ids = tokenized_input.input_ids

    cls_token, sep_token, mask_token, pad_token = tokenizer.convert_tokens_to_ids(tokens=["[CLS]", "[SEP]", "[MASK]", "[PAD]"])
    # Labels are the ids before any corruption.
    self.labels = input_ids.clone()

    # Select ~15 % of the positions, never a special or padding token.
    # NOTE(review): torch's RNG is not seeded in this class, so the selected
    # positions differ between runs even though `random` is seeded below.
    rand = torch.rand(input_ids.shape)
    mask_array = (rand < 0.15) & (input_ids != cls_token) & (input_ids != sep_token) & (input_ids != pad_token)

    # Loop-invariant: candidate ids for the "random replacement" branch.
    added_token_ids = list(tokenizer.get_added_vocab().values())

    # Reseeds Python's global `random` module.
    random.seed(42)

    for row, selected in enumerate(mask_array):
      for col in torch.flatten(selected.nonzero()).tolist():
        mask_rand = random.random()
        if mask_rand < 0.8:
          # mask
          input_ids[row, col] = mask_token
        elif mask_rand < 0.9:
          # replace with a random token from the added vocab
          input_ids[row, col] = random.sample(added_token_ids, 1)[0]
        # else leave unchanged

    if loss_on_masked_only:
      self.labels[~mask_array] = -100

    self.encodings = tokenized_input

  def __len__(self):
    """Return the number of sentences."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return a dict with a copy of every encoding tensor for sentence `idx`, plus its `labels`."""
    item = { key: val[idx].clone().detach() for key, val in self.encodings.items() }
    item['labels'] = self.labels[idx].clone().detach()
    return item

## Load pre-trained tokenizer

In [None]:
# Load the tokenizer from the local directory fetched in the setup cell.
aes_tokenizer = BertTokenizer.from_pretrained('./m-bert-aes-harmonized-tokenizer') # M-BERT harmonized AES tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the tokenizer from the local directory fetched in the setup cell.
ramses_tokenizer = BertTokenizer.from_pretrained('./m-bert-ramses-tokenizer') # M-BERT Ramses tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Load the tokenizer from the local directory fetched in the setup cell.
combined_tokenizer = BertTokenizer.from_pretrained('./m-bert-combined-tokenizer') # combined tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Turn datasets into instances of AESDataset

In [None]:
# AES files: id_included=True makes AESDataset drop the first token of each line (the ID).
# Tokenization and masking of the whole file happen here, in the constructor.
train_dataset_aes = AESDataset("all_train_id.txt", aes_tokenizer, id_included=True)
val_dataset_aes = AESDataset("all_val_id.txt", aes_tokenizer, id_included=True)

In [None]:
# Ramses files: id_included=False, so every line is used as a sentence in full.
train_dataset_ramses = AESDataset("ramses_train.txt", ramses_tokenizer, id_included=False)
val_dataset_ramses = AESDataset("ramses_val.txt", ramses_tokenizer, id_included=False)

In [None]:
# Combined files: id_included=False, so every line is used as a sentence in full.
train_dataset_combined = AESDataset("combined_train.txt", combined_tokenizer, id_included=False)
val_dataset_combined = AESDataset("combined_val.txt", combined_tokenizer, id_included=False)

## Training config

### AES

In [None]:
# Only the paths and two flags are set explicitly: evaluation is off (do_eval=False)
# and an existing output directory may be overwritten.
# NOTE(review): logging_dir="logs" is the same for all three models in this notebook — confirm that is intended.
targs_aes = TrainingArguments(output_dir="aes-model-output", do_eval=False, overwrite_output_dir=True, logging_dir="logs")
# Start from the pretrained multilingual BERT checkpoint.
model_aes = AutoModelForMaskedLM.from_pretrained('bert-base-multilingual-cased')
# Resize the embedding matrix to the size of the AES tokenizer's vocabulary.
model_aes.resize_token_embeddings(len(aes_tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(150383, 768)

In [None]:
# Only the paths and two flags are set explicitly: evaluation is off (do_eval=False)
# and an existing output directory may be overwritten.
# NOTE(review): logging_dir="logs" is the same for all three models in this notebook — confirm that is intended.
targs_ramses = TrainingArguments(output_dir="ramses-model-output", do_eval=False, overwrite_output_dir=True, logging_dir="logs")
# Start from the pretrained multilingual BERT checkpoint.
model_ramses = AutoModelForMaskedLM.from_pretrained('bert-base-multilingual-cased')
# Resize the embedding matrix to the size of the Ramses tokenizer's vocabulary.
model_ramses.resize_token_embeddings(len(ramses_tokenizer))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(128309, 768)

In [None]:
# Only the paths and two flags are set explicitly: evaluation is off (do_eval=False)
# and an existing output directory may be overwritten.
# NOTE(review): logging_dir="logs" is the same for all three models in this notebook — confirm that is intended.
targs_combined = TrainingArguments(output_dir="combined-model-output", do_eval=False, overwrite_output_dir=True, logging_dir="logs")
# Start from the pretrained multilingual BERT checkpoint.
model_combined = AutoModelForMaskedLM.from_pretrained('bert-base-multilingual-cased')
# Resize the embedding matrix to the size of the combined tokenizer's vocabulary.
model_combined.resize_token_embeddings(len(combined_tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(155688, 768)

## GPU setup

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### model to cuda

In [None]:
# Move the model to the device selected above; unlike .cuda(), this also works on a CPU-only runtime.
model_aes.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(150383, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [None]:
# Move the model to the device selected above; unlike .cuda(), this also works on a CPU-only runtime.
model_ramses.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128309, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [None]:
# Move the model to the device selected above; unlike .cuda(), this also works on a CPU-only runtime.
model_combined.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(155688, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

## Training

### AES

In [None]:
# Fine-tune the AES model from the start (no checkpoint resume) and save the result to ./aes-model.
# NOTE(review): targs_aes sets do_eval=False, so eval_dataset is presumably not used during training — confirm.
aes_trainer = Trainer(model=model_aes, args=targs_aes, tokenizer=aes_tokenizer, train_dataset=train_dataset_aes, eval_dataset=val_dataset_aes)
aes_trainer.train(resume_from_checkpoint=False)
aes_trainer.save_model("aes-model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.2204
1000,0.0594
1500,0.0586
2000,0.0572
2500,0.0585
3000,0.0566
3500,0.0576
4000,0.0568
4500,0.0552
5000,0.051


### Ramses

In [None]:
# Fine-tune the Ramses model from the start (no checkpoint resume) and save the result to ./ramses-model.
# NOTE(review): targs_ramses sets do_eval=False, so eval_dataset is presumably not used during training — confirm.
ramses_trainer = Trainer(model=model_ramses, args=targs_ramses, tokenizer=ramses_tokenizer, train_dataset=train_dataset_ramses, eval_dataset=val_dataset_ramses)
ramses_trainer.train(resume_from_checkpoint=False)
ramses_trainer.save_model("ramses-model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.2014
1000,0.0459
1500,0.0454
2000,0.0441
2500,0.0431
3000,0.042
3500,0.0396
4000,0.0335
4500,0.0339
5000,0.0322


### Combined

In [None]:
# Fine-tune the combined model and save the result to ./combined-model.
# Training resumes from the newest checkpoint in the output directory when one
# exists (e.g. after an interrupted session) and starts from scratch otherwise;
# resume_from_checkpoint=True would raise if no checkpoint had been written yet.
import os
from transformers.trainer_utils import get_last_checkpoint

combined_output_dir = targs_combined.output_dir
# get_last_checkpoint lists the folder, so only call it when the folder exists.
last_checkpoint = get_last_checkpoint(combined_output_dir) if os.path.isdir(combined_output_dir) else None

combined_trainer = Trainer(model=model_combined, args=targs_combined, tokenizer=combined_tokenizer, train_dataset=train_dataset_combined, eval_dataset=val_dataset_combined)
combined_trainer.train(resume_from_checkpoint=last_checkpoint)
combined_trainer.save_model("combined-model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


Step,Training Loss
20500,0.0393
21000,0.0394
21500,0.0385
22000,0.0382
22500,0.0369
23000,0.0372
23500,0.0378
24000,0.0385
24500,0.0378
25000,0.038


## Loading pretrained models

In [None]:
# Load the fine-tuned AES model from the ./aes-model directory.
aes_model = AutoModelForMaskedLM.from_pretrained('./aes-model')

In [None]:
# Load the fine-tuned Ramses model from the ./ramses-model directory.
ramses_model = AutoModelForMaskedLM.from_pretrained('./ramses-model')

In [None]:
# Load the fine-tuned combined model from the ./combined-model directory.
combined_model = AutoModelForMaskedLM.from_pretrained('./combined-model')