# Drive setup

Necessary files will be stored in Google Drive.

The code assumes that you have a folder called "Colab Notebooks" with a subfolder "data" inside it.

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# Drive folder used by the rest of the notebook is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work inside the Drive data folder: every relative path used below (token
# files, saved tokenizer directories) resolves against this directory.
%cd /content/drive/MyDrive/Colab Notebooks/data/

/content/drive/MyDrive/Colab Notebooks/data


# Setup

Get the necessary files (based on code by Marko Nippula)

In [6]:
# @title Get resources from GitHub egy-gaps
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from IPython.display import clear_output
from tqdm.notebook import tqdm, tqdm_notebook
import requests
import os

# Base URL of the raw files. No trailing slash, so joining with "/" below does
# not produce a double slash in the request URL.
EGY_PATH = 'https://raw.githubusercontent.com/annasahola/egy-gaps/main'

# (path below EGY_PATH, local file name). The harmonized list is also called
# tokens.txt upstream, so it is stored under a distinct local name.
TOKEN_FILES = [
  ('preprocessing/tokens/tokens.txt', 'tokens.txt'),  # AES written form tokens
  ('preprocessing/tokens/harmonized/tokens.txt', 'tokens_harmonized.txt'),  # AES MdC harmonized tokens
  ('preprocessing/tokens/ramses_tokens.txt', 'ramses_tokens.txt'),  # Ramses tokens
  ('preprocessing/tokens/combined_tokens.txt', 'combined_tokens.txt'),  # combined tokens (Ramses + AES MdC harmonized)
]

for remote_path, local_name in tqdm_notebook(TOKEN_FILES, desc="Downloading"):
  # Files already present in the working directory are never re-downloaded.
  if os.path.exists(local_name):
    print(f'{local_name} already exists')
    continue

  print(f'Downloading {local_name}...')
  response = requests.get(f'{EGY_PATH}/{remote_path}', timeout=60)
  # Fail loudly on HTTP errors. Otherwise an error page would be saved as the
  # token file and the exists-check above would keep it on every later run.
  response.raise_for_status()
  # The file is only created after the whole body has been received, so a
  # failed request leaves nothing behind.
  with open(local_name, 'wb') as out_file:
    out_file.write(response.content)


Downloading:   0%|          | 0/1 [00:00<?, ?it/s]

tokens.txt already exists
tokens_harmonized.txt already exists
ramses_tokens.txt already exists
combined_tokens.txt already exists


In [None]:
!pip install accelerate>=0.21.0 transformers

In [None]:
from transformers import BertTokenizer

# Tokenizer creation

## Setup

In [None]:
def load_tokens(fn):
  """Read a token list file and return its lines as a list of strings.

  Args:
    fn: Path of a UTF-8 text file holding one token per line.

  Returns:
    A list with one entry per line of the file, in file order, with the
    newline removed. Blank lines are kept as empty strings; an empty file
    yields an empty list.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file content is not valid UTF-8.
  """
  print("Opening file:", fn)
  # The token files contain non-ASCII transliteration characters, so the
  # encoding is fixed instead of depending on the platform default. The
  # with-block closes the file even if reading raises.
  with open(fn, "r", encoding="utf-8") as f:
    return [line.replace("\n", "") for line in f]

Extract tokens from tokens.txt, tokens_harmonized.txt, ramses_tokens.txt and combined_tokens.txt

In [None]:
# Load the four token lists from the working directory. These module-level
# names are reused by the add_tokens(...) cells further down.
tokens = load_tokens("./tokens.txt")  # AES written form
harmonized_tokens = load_tokens("./tokens_harmonized.txt")  # AES, MdC harmonized
ramses_tokens = load_tokens("./ramses_tokens.txt")  # Ramses
combined_tokens = load_tokens("./combined_tokens.txt")  # Ramses + AES MdC harmonized

Opening file: ./tokens.txt
Opening file: ./tokens_harmonized.txt
Opening file: ./ramses_tokens.txt
Opening file: ./combined_tokens.txt


In [None]:
# First five AES written-form tokens, as a quick check of the loaded list.
tokens[:5]

['Nfr-ḫpr-Rꜥw-wꜥ-n-Rꜥw', 'mḥ', '4', 'm-mj,tt', 'šꜣꜥ-m']

In [None]:
# First five MdC-harmonized AES tokens, as a quick check of the loaded list.
harmonized_tokens[:5]

['Nfr-xpr-Raw-wa-n-Raw', 'mH', '4', 'm-my.tt', 'SAa-m']

In [None]:
# First five Ramses tokens, as a quick check of the loaded list.
ramses_tokens[:5]

['ns-imn', 'sA', 'kr', 'n', 'pA']

In [None]:
# First five combined (Ramses + AES harmonized) tokens, as a quick check.
combined_tokens[:5]

['Nfr-xpr-Raw-wa-n-Raw', 'mH', '4', 'm-my.tt', 'SAa-m']

## Tokenizer following AES tokenization

Tokenizer following the AES tokenization

### Written form

Only AES tokens

In [None]:
# Build a BERT tokenizer whose vocabulary is tokens.txt. Lower-casing, basic
# tokenization and accent stripping are switched off so the input text is not
# normalized before vocabulary lookup.
# NOTE(review): with do_basic_tokenize=False the input is presumably expected
# to be whitespace-separated tokens already — confirm against the corpus.
egy_bert_aes_tokenizer = BertTokenizer(vocab_file="./tokens.txt", do_lower_case=False, do_basic_tokenize=False, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', strip_accents=False)

In [None]:
# Sanity check: encode a sample sentence to ids, then map the ids back to
# tokens to see how the sentence was split.
aes_tokenizer_tokens = egy_bert_aes_tokenizer.encode("jꜣ,w n kꜣ =k n stw,t =k")
print(egy_bert_aes_tokenizer.convert_ids_to_tokens(aes_tokenizer_tokens))

['[CLS]', 'jꜣ,w', 'n', 'kꜣ', '=k', 'n', 'stw,t', '=k', '[SEP]']


In [None]:
# Write the tokenizer files to the directory egy-bert-aes-tokenizer, relative
# to the current working directory.
egy_bert_aes_tokenizer.save_pretrained("egy-bert-aes-tokenizer")

('egy-bert-aes-tokenizer/tokenizer_config.json',
 'egy-bert-aes-tokenizer/special_tokens_map.json',
 'egy-bert-aes-tokenizer/vocab.txt',
 'egy-bert-aes-tokenizer/added_tokens.json')

M-BERT + AES tokens

In [None]:
# Start from the tokenizer of the bert-base-multilingual-cased checkpoint.
m_bert_aes_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Extend the M-BERT vocabulary with the AES written-form tokens. The displayed
# value is add_tokens' return value (per the transformers docs, the number of
# tokens actually added).
m_bert_aes_tokenizer.add_tokens(tokens)

34999

In [None]:
# Sanity check: show how the sample sentence is split after adding the tokens.
print(m_bert_aes_tokenizer.tokenize("jꜣ,w n kꜣ =k n stw,t =k"))

['jꜣ,w', 'n', 'kꜣ', '=k', 'n', 'stw,t', '=k']


In [None]:
# Write the extended tokenizer to the directory m-bert-aes-tokenizer, relative
# to the current working directory.
m_bert_aes_tokenizer.save_pretrained("m-bert-aes-tokenizer")

('m-bert-aes-tokenizer/tokenizer_config.json',
 'm-bert-aes-tokenizer/special_tokens_map.json',
 'm-bert-aes-tokenizer/vocab.txt',
 'm-bert-aes-tokenizer/added_tokens.json')

### Ramses MdC harmonized

Only AES tokens

In [None]:
# Same construction as the written-form tokenizer, but with the MdC-harmonized
# AES token list (tokens_harmonized.txt) as vocabulary; no lower-casing, basic
# tokenization or accent stripping is applied.
egy_bert_aes_harmonized_tokenizer = BertTokenizer(vocab_file="./tokens_harmonized.txt", do_lower_case=False, do_basic_tokenize=False, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', strip_accents=False)

In [None]:
# Sanity check: encode a sample sentence in harmonized form and map the ids
# back to tokens.
aes_harmonized_tokenizer_tokens = egy_bert_aes_harmonized_tokenizer.encode("yA.w n kA =k n stw.t =k")
print(egy_bert_aes_harmonized_tokenizer.convert_ids_to_tokens(aes_harmonized_tokenizer_tokens))

['[CLS]', 'yA.w', 'n', 'kA', '=k', 'n', 'stw.t', '=k', '[SEP]']


In [None]:
# Write the tokenizer files to the directory egy-bert-aes-harmonized-tokenizer,
# relative to the current working directory.
egy_bert_aes_harmonized_tokenizer.save_pretrained("egy-bert-aes-harmonized-tokenizer")

('egy-bert-aes-harmonized-tokenizer/tokenizer_config.json',
 'egy-bert-aes-harmonized-tokenizer/special_tokens_map.json',
 'egy-bert-aes-harmonized-tokenizer/vocab.txt',
 'egy-bert-aes-harmonized-tokenizer/added_tokens.json')

M-BERT + AES tokens

In [None]:
# Fresh copy of the bert-base-multilingual-cased tokenizer, so the tokens added
# below do not mix with those added to the other M-BERT tokenizers.
m_bert_aes_harmonized_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Extend the M-BERT vocabulary with the MdC-harmonized AES tokens; the
# displayed value is add_tokens' return value.
m_bert_aes_harmonized_tokenizer.add_tokens(harmonized_tokens)

30836

In [None]:
# Sanity check: show how the harmonized sample sentence is split.
print(m_bert_aes_harmonized_tokenizer.tokenize("yA.w n kA =k n stw.t =k"))

['yA.w', 'n', 'kA', '=k', 'n', 'stw.t', '=k']


In [None]:
# Write the extended tokenizer to the directory m-bert-aes-harmonized-tokenizer,
# relative to the current working directory.
m_bert_aes_harmonized_tokenizer.save_pretrained("m-bert-aes-harmonized-tokenizer")

('m-bert-aes-harmonized-tokenizer/tokenizer_config.json',
 'm-bert-aes-harmonized-tokenizer/special_tokens_map.json',
 'm-bert-aes-harmonized-tokenizer/vocab.txt',
 'm-bert-aes-harmonized-tokenizer/added_tokens.json')

## Ramses tokenization

Only Ramses tokens

In [None]:
# BERT tokenizer with the Ramses token list (ramses_tokens.txt) as vocabulary;
# no lower-casing, basic tokenization or accent stripping is applied.
egy_bert_ramses_tokenizer = BertTokenizer(vocab_file="./ramses_tokens.txt", do_lower_case=False, do_basic_tokenize=False, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', strip_accents=False)

In [None]:
# Sanity check: encode a Ramses sample sentence and map the ids back to tokens.
ramses_tokenizer_tokens = egy_bert_ramses_tokenizer.encode("sw 10 8 smd.t wsf")
print(egy_bert_ramses_tokenizer.convert_ids_to_tokens(ramses_tokenizer_tokens))

['[CLS]', 'sw', '10', '8', 'smd.t', 'wsf', '[SEP]']


In [None]:
# Write the tokenizer files to the directory egy-bert-ramses-tokenizer,
# relative to the current working directory.
egy_bert_ramses_tokenizer.save_pretrained("egy-bert-ramses-tokenizer")

('egy-bert-ramses-tokenizer/tokenizer_config.json',
 'egy-bert-ramses-tokenizer/special_tokens_map.json',
 'egy-bert-ramses-tokenizer/vocab.txt',
 'egy-bert-ramses-tokenizer/added_tokens.json')

M-BERT + Ramses

In [None]:
# Fresh copy of the bert-base-multilingual-cased tokenizer for the Ramses
# variant.
m_bert_ramses_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Extend the M-BERT vocabulary with the Ramses tokens; the displayed value is
# add_tokens' return value.
m_bert_ramses_tokenizer.add_tokens(ramses_tokens)

8762

In [None]:
# Sanity check: show how the Ramses sample sentence is split.
print(m_bert_ramses_tokenizer.tokenize("sw 10 8 smd.t wsf"))

['sw', '10', '8', 'smd.t', 'wsf']


In [None]:
# Write the extended tokenizer to the directory m-bert-ramses-tokenizer,
# relative to the current working directory.
m_bert_ramses_tokenizer.save_pretrained("m-bert-ramses-tokenizer")

('m-bert-ramses-tokenizer/tokenizer_config.json',
 'm-bert-ramses-tokenizer/special_tokens_map.json',
 'm-bert-ramses-tokenizer/vocab.txt',
 'm-bert-ramses-tokenizer/added_tokens.json')

## Ramses + AES tokenization (MdC harmonized)

Only Ramses + AES

In [None]:
# BERT tokenizer with the combined token list (combined_tokens.txt) as
# vocabulary; no lower-casing, basic tokenization or accent stripping is
# applied.
egy_bert_combined_tokenizer = BertTokenizer(vocab_file="./combined_tokens.txt", do_lower_case=False, do_basic_tokenize=False, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', strip_accents=False)

In [None]:
# Sanity check with the AES (harmonized) sample sentence.
combined_tokenizer_tokens = egy_bert_combined_tokenizer.encode("yA.w n kA =k n stw.t =k")
print(egy_bert_combined_tokenizer.convert_ids_to_tokens(combined_tokenizer_tokens))

['[CLS]', 'yA.w', 'n', 'kA', '=k', 'n', 'stw.t', '=k', '[SEP]']


In [None]:
# Sanity check with the Ramses sample sentence. This reuses (overwrites) the
# combined_tokenizer_tokens variable from the previous check.
combined_tokenizer_tokens = egy_bert_combined_tokenizer.encode("sw 10 8 smd.t wsf")
print(egy_bert_combined_tokenizer.convert_ids_to_tokens(combined_tokenizer_tokens))

['[CLS]', 'sw', '10', '8', 'smd.t', 'wsf', '[SEP]']


In [None]:
# Write the tokenizer files to the directory egy-bert-combined-tokenizer,
# relative to the current working directory.
egy_bert_combined_tokenizer.save_pretrained("egy-bert-combined-tokenizer")

('egy-bert-combined-tokenizer/tokenizer_config.json',
 'egy-bert-combined-tokenizer/special_tokens_map.json',
 'egy-bert-combined-tokenizer/vocab.txt',
 'egy-bert-combined-tokenizer/added_tokens.json')

M-BERT + combined

In [None]:
# Fresh copy of the bert-base-multilingual-cased tokenizer for the combined
# variant.
m_bert_combined_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Extend the M-BERT vocabulary with the combined Ramses + AES tokens; the
# displayed value is add_tokens' return value.
m_bert_combined_tokenizer.add_tokens(combined_tokens)

36141

In [None]:
# Sanity check with the AES (harmonized) sample sentence.
print(m_bert_combined_tokenizer.tokenize("yA.w n kA =k n stw.t =k"))

['yA.w', 'n', 'kA', '=k', 'n', 'stw.t', '=k']


In [None]:
# Sanity check with the Ramses sample sentence.
print(m_bert_combined_tokenizer.tokenize("sw 10 8 smd.t wsf"))

['sw', '10', '8', 'smd.t', 'wsf']


In [None]:
# Write the extended tokenizer to the directory m-bert-combined-tokenizer,
# relative to the current working directory.
m_bert_combined_tokenizer.save_pretrained("m-bert-combined-tokenizer")

('m-bert-combined-tokenizer/tokenizer_config.json',
 'm-bert-combined-tokenizer/special_tokens_map.json',
 'm-bert-combined-tokenizer/vocab.txt',
 'm-bert-combined-tokenizer/added_tokens.json')

# Coptic

### AES harmonized

In [None]:
# Start from the tokenizer of the lgessler/microbert-coptic-m checkpoint.
coptic_microbert_aes_tokenizer = BertTokenizer.from_pretrained('lgessler/microbert-coptic-m')

tokenizer_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/118k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/239k [00:00<?, ?B/s]

In [None]:
# Extend the Coptic vocabulary with the MdC-harmonized AES tokens; the
# displayed value is add_tokens' return value.
coptic_microbert_aes_tokenizer.add_tokens(harmonized_tokens)

27912

In [None]:
# Write the extended tokenizer to the directory coptic-microbert-aes-tokenizer,
# relative to the current working directory.
coptic_microbert_aes_tokenizer.save_pretrained("coptic-microbert-aes-tokenizer")

('coptic-microbert-aes-tokenizer/tokenizer_config.json',
 'coptic-microbert-aes-tokenizer/special_tokens_map.json',
 'coptic-microbert-aes-tokenizer/vocab.txt',
 'coptic-microbert-aes-tokenizer/added_tokens.json')

### Ramses

In [None]:
# Fresh copy of the lgessler/microbert-coptic-m tokenizer for the Ramses
# variant.
coptic_microbert_ramses_tokenizer = BertTokenizer.from_pretrained('lgessler/microbert-coptic-m')

In [None]:
# Extend the Coptic vocabulary with the Ramses tokens; the displayed value is
# add_tokens' return value.
coptic_microbert_ramses_tokenizer.add_tokens(ramses_tokens)

8482

In [None]:
# Write the extended tokenizer to the directory
# coptic-microbert-ramses-tokenizer, relative to the current working directory.
coptic_microbert_ramses_tokenizer.save_pretrained("coptic-microbert-ramses-tokenizer")

('coptic-microbert-ramses-tokenizer/tokenizer_config.json',
 'coptic-microbert-ramses-tokenizer/special_tokens_map.json',
 'coptic-microbert-ramses-tokenizer/vocab.txt',
 'coptic-microbert-ramses-tokenizer/added_tokens.json')

### Combined

In [None]:
# Fresh copy of the lgessler/microbert-coptic-m tokenizer for the combined
# variant.
coptic_microbert_combined_tokenizer = BertTokenizer.from_pretrained('lgessler/microbert-coptic-m')

In [None]:
# Extend the Coptic vocabulary with the combined Ramses + AES tokens; the
# displayed value is add_tokens' return value.
coptic_microbert_combined_tokenizer.add_tokens(combined_tokens)

32716

In [None]:
# Write the extended tokenizer to the directory
# coptic-microbert-combined-tokenizer, relative to the current working
# directory.
coptic_microbert_combined_tokenizer.save_pretrained("coptic-microbert-combined-tokenizer")

('coptic-microbert-combined-tokenizer/tokenizer_config.json',
 'coptic-microbert-combined-tokenizer/special_tokens_map.json',
 'coptic-microbert-combined-tokenizer/vocab.txt',
 'coptic-microbert-combined-tokenizer/added_tokens.json')

# Loading pre-trained tokenizers

The tokenizers saved above can be loaded again with `from_pretrained`, pointing it at the directory that was passed to `save_pretrained`:

In [None]:
# Reload the AES written-form tokenizer from the directory that
# egy_bert_aes_tokenizer.save_pretrained(...) wrote earlier in this notebook.
aes_tokenizer = BertTokenizer.from_pretrained('./egy-bert-aes-tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
