# **RUNNING THE PREDICTION GENERATION CAN TAKE ABOUT 1-2H PER MODEL**
(pregenerated prediction files can be found on GitHub)

# Drive setup

Necessary files will be stored in Google Drive.

The code assumes that you have a folder called "Colab Notebooks" with a subfolder "data" inside it.

In [1]:
# Mount Google Drive at /content/drive; the working directory selected in the
# following cell lives on this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work inside "Colab Notebooks/data" on the mounted Drive. The relative paths used
# by the later cells are resolved against this directory.
%cd /content/drive/MyDrive/Colab Notebooks/data/

/content/drive/MyDrive/Colab Notebooks/data


# Setup

In [12]:
# @title Get resources from GitHub egy-gaps
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from IPython.display import clear_output
from tqdm.notebook import tqdm, tqdm_notebook
import requests
import os

EGY_PATH='https://raw.githubusercontent.com/annasahola/egy-gaps/main/'

def handle_tokenizer_directory(dir, top_dir):
  """Download the four files of one tokenizer from the egy-gaps repository.

  Args:
    dir: Name of the tokenizer folder. It is used both as the remote folder name
      and as the local directory, created relative to the working directory.
    top_dir: Parent folder of ``dir`` under ``tokenizers/`` in the repository.

  Raises:
    urllib.error.URLError: If a file cannot be fetched (``HTTPError`` for an HTTP
      error status). No directory is created when a download fails.
  """
  from urllib.request import urlopen  # function-scope import keeps the cell self-contained

  tokenizer_files = ('added_tokens.json', 'special_tokens_map.json',
                     'tokenizer_config.json', 'vocab.txt')
  # EGY_PATH already ends with '/'; strip it so the URL has no doubled slash.
  base_url = f"{EGY_PATH.rstrip('/')}/tokenizers/{top_dir}/{dir}"

  # Fetch everything before touching the disk: the calling cell treats an existing
  # directory as "already downloaded", so a half-filled directory must not be left behind.
  payloads = {}
  for file_name in tokenizer_files:
    print(f'Downloading {file_name}...')
    with urlopen(f'{base_url}/{file_name}') as response:
      payloads[file_name] = response.read()

  print('Creating directory...')
  os.makedirs(dir, exist_ok=True)
  for file_name, payload in payloads.items():
    with open(os.path.join(dir, file_name), 'wb') as out_file:
      out_file.write(payload)

for x in tqdm_notebook(range(2), desc="Downloading"):
  if x == 0:
    print('== Tokenizers ==')

    if os.path.exists('./m-bert-aes-harmonized-tokenizer') != True:
      print('Downloading ./m-bert-aes-harmonized-tokenizer...')
      handle_tokenizer_directory('m-bert-aes-harmonized-tokenizer', 'm-bert')
    else:
      print('./m-bert-aes-harmonized-tokenizer already exists')

    if os.path.exists('./m-bert-ramses-tokenizer') != True:
      print('Downloading ./m-bert-ramses-tokenizer...')
      handle_tokenizer_directory('m-bert-ramses-tokenizer', 'm-bert')
    else:
      print('./m-bert-ramses-tokenizer already exists')

    if os.path.exists('./m-bert-combined-tokenizer') != True:
      print('Downloading ./m-bert-combined-tokenizer...')
      handle_tokenizer_directory('m-bert-combined-tokenizer', 'm-bert')
    else:
      print('./m-bert-combined-tokenizer already exists')

    if os.path.exists('./coptic-microbert-combined-tokenizer') != True:
      print('Downloading ./coptic-microbert-combined-tokenizer...')
      handle_tokenizer_directory('coptic-microbert-combined-tokenizer', 'coptic')
    else:
      print('./coptic-microbert-combined-tokenizer already exists')

  if x == 1:
    print('== Test files ==')

    if os.path.exists('all_test_harmonized.txt') != True:
      print('Downloading all_test_harmonized.txt...')
      !curl -s -L {EGY_PATH}/preprocessing/final_files/intact/test/harmonized/all_test.txt > all_test_harmonized.txt
    else:
      print('all_test_harmonized.txt already exists')

    if os.path.exists('ramses_test.txt') != True:
      print('Downloading ramses_test.txt...')
      !curl -s -L {EGY_PATH}/data/marete-ramses/aligned/aligned_transliterations_intact_test.txt > ramses_test.txt
    else:
      print('ramses_test.txt already exists')

    if os.path.exists('combined_test.txt') != True:
      print('Downloading combined_test.txt...')
      !curl -s -O -L {EGY_PATH}/data/marete-ramses/aligned/combined_test.txt # AES: rows 1-16465; Ramses: rows 16466-25377
    else:
      print('combined_test.txt already exists')

    if os.path.exists('all_test_masked.txt') != True:
      print('Downloading all_test_masked.txt...')
      !curl -s -O -L {EGY_PATH}/predictions/all_test_masked.txt
    else:
      print('all_test_masked.txt already exists')

Downloading:   0%|          | 0/2 [00:00<?, ?it/s]

== Tokenizers ==
./m-bert-aes-harmonized-tokenizer already exists
./m-bert-ramses-tokenizer already exists
./m-bert-combined-tokenizer already exists
./coptic-microbert-combined-tokenizer already exists
== Test files ==
all_test_harmonized.txt already exists
ramses_test.txt already exists
combined_test.txt already exists
all_test_masked.txt already exists


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Apr 11 19:00:21 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
!pip install accelerate>=0.21.0 transformers

In [6]:
import torch
from transformers import BertTokenizer, pipeline, AutoModelForMaskedLM
import random
import json
import numpy as np
import pandas as pd

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Load tokenizers

In [7]:
# Load the tokenizer from the local ./m-bert-aes-harmonized-tokenizer directory.
aes_tokenizer = BertTokenizer.from_pretrained('./m-bert-aes-harmonized-tokenizer') # M-BERT harmonized AES tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Load the tokenizer from the local ./m-bert-ramses-tokenizer directory.
ramses_tokenizer = BertTokenizer.from_pretrained('./m-bert-ramses-tokenizer') # M-BERT Ramses tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Load the tokenizer from the local ./m-bert-combined-tokenizer directory.
combined_tokenizer = BertTokenizer.from_pretrained('./m-bert-combined-tokenizer') # combined tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Load the tokenizer from the local ./coptic-microbert-combined-tokenizer directory.
coptic_tokenizer = BertTokenizer.from_pretrained('./coptic-microbert-combined-tokenizer') # Coptic tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Metrics

### Functions for calculating metrics

In [None]:
def check_for_hit(k, correct_prediction, predictions):
  """Return 1 when ``correct_prediction`` is among the first ``k`` predictions, else 0.

  Args:
    k: Number of leading predictions to inspect.
    correct_prediction: The token that was masked out.
    predictions: Sequence of dicts, each with a 'token_str' entry, in ranked order.
  """
  top_k_tokens = [candidate['token_str'] for candidate in predictions[:k]]
  return int(correct_prediction in top_k_tokens)

def get_prediction_mrr_k(k, correct_prediction, predictions):
  """Return the reciprocal rank of ``correct_prediction`` within the first ``k`` predictions.

  Args:
    k: Number of leading predictions to inspect.
    correct_prediction: The token that was masked out.
    predictions: Sequence of dicts, each with a 'token_str' entry, in ranked order.

  Returns:
    ``1 / rank`` (rank counted from 1, first occurrence) when the token is found,
    otherwise 0.
  """
  top_k_tokens = [candidate['token_str'] for candidate in predictions[:k]]
  try:
    # list.index raises ValueError when the token is not among the top k.
    return 1 / (top_k_tokens.index(correct_prediction) + 1)
  except ValueError:
    return 0

def calculate_metrics(sentences):
  """Average hit@k and MRR@k over a list of sentences with predictions.

  Args:
    sentences: Sequence of records in which index 2 holds the masked-out token and
      index 3 holds the ranked predictions for the masked position.

  Returns:
    Tuple ``(hit@1, hit@5, hit@10, MRR@5, MRR@10)``, each divided by the number of
    sentences.
  """
  sentence_count = len(sentences)
  hit_cutoffs = (1, 5, 10)
  mrr_cutoffs = (5, 10)
  hit_totals = [0] * len(hit_cutoffs)
  mrr_totals = [0] * len(mrr_cutoffs)

  for sentence in sentences:
    target, candidates = sentence[2], sentence[3]
    # hit@k
    for slot, cutoff in enumerate(hit_cutoffs):
      hit_totals[slot] += check_for_hit(cutoff, target, candidates)
    # mrr@k
    for slot, cutoff in enumerate(mrr_cutoffs):
      mrr_totals[slot] += get_prediction_mrr_k(cutoff, target, candidates)

  return tuple(total / sentence_count for total in hit_totals + mrr_totals)

### Calculate metrics

#### Generate test sentences with gaps

In [None]:
def generate_test_sentences(fn, tokenizer):
  """Mask one randomly chosen token in every line of a test file.

  Args:
    fn: Path of a text file with one sentence per line.
    tokenizer: Object whose ``tokenize(text)`` returns a list of token strings.

  Returns:
    List of ``(original sentence, masked sentence, masked token)`` tuples. The masked
    sentence is the token list joined with single spaces, with the chosen token
    replaced by "[MASK]". Lines that produce no tokens are skipped.
  """
  masked_sentences = []

  print("Opening file:", fn)
  # The context manager closes the file even if reading fails.
  with open(fn, "r") as test_file:
    lines = test_file.readlines()

  # Fixed seed so that the same positions are masked on every run.
  random.seed(0)

  for line in lines:
    sentence = line.replace("\n", "")
    tokens = tokenizer.tokenize(sentence)
    if not tokens:
      # A blank line has nothing to mask; random.randrange(0, 0) would raise ValueError.
      continue
    mask_index = random.randrange(0, len(tokens))
    masked_token = tokens[mask_index]
    tokens[mask_index] = "[MASK]"
    masked_sentences.append((sentence, " ".join(tokens), masked_token))

  return masked_sentences

# Test-set files, one sentence per line.
aes_tfn, ramses_tfn, all_tfn = "all_test_harmonized.txt", "ramses_test.txt", "combined_test.txt"

# Mask every test set with its matching tokenizer (AES, Ramses, combined - in that order).
aes_sentences, ramses_sentences, combined_sentences = [
  generate_test_sentences(test_fn, test_tokenizer)
  for test_fn, test_tokenizer in (
    (aes_tfn, aes_tokenizer),
    (ramses_tfn, ramses_tokenizer),
    (all_tfn, combined_tokenizer),
  )
]

In [None]:
# Preview the first five (original sentence, masked sentence, masked token) triples.
aes_sentences[:5]

[('Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn',
  'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t [MASK] m Pr-Jtn m Ax.t-Jtn',
  'nb-tA'),
 ('Hw.t-Hr.w nb.t-Hw.t', 'Hw.t-Hr.w [MASK]', 'nb.t-Hw.t'),
 ('aA =y m tm m ra =y wDA nHi.n =f yAw nfr mri anx',
  '[MASK] =y m tm m ra =y wDA nHi.n =f yAw nfr mri anx',
  'aA'),
 ('xr yr Xnw pA 4 wD SAa-m pA Dw yAb.ty n Ax.t-Jtn nfr.yt-r pA Dw ymn.ty n Ax.t-Jtn Ax.t-Jtn r-Ha.w =s tAy',
  'xr yr Xnw pA 4 wD SAa-m pA [MASK] yAb.ty n Ax.t-Jtn nfr.yt-r pA Dw ymn.ty n Ax.t-Jtn Ax.t-Jtn r-Ha.w =s tAy',
  'Dw'),
 ('bn yri.y HyHy =f', 'bn yri.y HyHy [MASK]', '=f')]

In [None]:
# Preview the first five (original sentence, masked sentence, masked token) triples.
ramses_sentences[:5]

[('ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty ir.w r-Hry',
  'ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA [MASK] nb.w nty ir.w r-Hry',
  'sS.w'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w [MASK] pHty', 'sxm'),
 ('iw =i Hr Ssp tA aDd.t Sri.t', '[MASK] =i Hr Ssp tA aDd.t Sri.t', 'iw'),
 ('iw xm.w-sk Hr sSA xaw =k', 'iw xm.w-sk [MASK] sSA xaw =k', 'Hr'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w sxm [MASK]', 'pHty')]

In [None]:
# Slice across the AES/Ramses boundary of the combined test set (the download cell
# notes AES as rows 1-16465 and Ramses as rows 16466-25377).
combined_sentences[16460:16470]

[('wD Hm =f n yr.y-pa.t HA.ty-a Hr.y-sStA-n-pr-nswt Hr.y-tp-n-tA-r-Dr=f xtm.ty-by.ty sbA-tA.wy-HAw.ty ym.y-ra-smr.w ym.y-ra-xtm.ty Wsr-NSm.t',
  'wD Hm =f n yr.y-pa.t HA.ty-a Hr.y-sStA-n-pr-nswt [MASK] xtm.ty-by.ty sbA-tA.wy-HAw.ty ym.y-ra-smr.w ym.y-ra-xtm.ty Wsr-NSm.t',
  'Hr.y-tp-n-tA-r-Dr=f'),
 ('tw=k qAi.T Sma.T', 'tw=k [MASK] Sma.T', 'qAi.T'),
 ('PA-kAmn 100', '[MASK] 100', 'PA-kAmn'),
 ('qd.n =y pr Sad.n =y S Hm-nTr MnT.w-Htp',
  'qd.n =y [MASK] Sad.n =y S Hm-nTr MnT.w-Htp',
  'pr'),
 ('Dd =y n m myn', 'Dd =y [MASK] m myn', 'n'),
 ('ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty ir.w r-Hry',
  'ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty [MASK] r-Hry',
  'ir.w'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w sxm [MASK]', 'pHty'),
 ('iw =i Hr Ssp tA aDd.t Sri.t', 'iw =i [MASK] Ssp tA aDd.t Sri.t', 'Hr'),
 ('iw xm.w-sk Hr sSA xaw =k', '[MASK] xm.w-sk Hr sSA xaw =k', 'iw'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw [MASK] xaw.w sxm pHty', 'Dsr')]

##### Save sentences with gaps into a file

In [None]:
def save_masked_sentences(fn, sl):
  """Write masked-sentence records to a text file, one record per line.

  Args:
    fn: Path of the file to create or overwrite.
    sl: Iterable of string sequences; the fields of each record are joined
      with " | ".
  """
  print("Opening file:", fn)
  # The context manager closes (and flushes) the file even if a record fails to join.
  with open(fn, "w") as out_file:
    for record in sl:
      out_file.write(" | ".join(record) + "\n")

# Output files for the masked sentences. Note that writing all_test_masked.txt
# overwrites the copy of that file fetched by the download cell.
aes_masked_fn, ramses_masked_fn, all_masked_fn = "aes_test_masked.txt", "ramses_test_masked.txt", "all_test_masked.txt"

for target_fn, sentence_list in zip(
    (aes_masked_fn, ramses_masked_fn, all_masked_fn),
    (aes_sentences, ramses_sentences, combined_sentences)):
  save_masked_sentences(target_fn, sentence_list)

Opening file: aes_test_masked.txt
Opening file: ramses_test_masked.txt
Opening file: all_test_masked.txt


### Load generated test sentences from file

In [None]:
def load_masked_sentences(fn):
  """Read masked-sentence records from a " | "-separated text file.

  Args:
    fn: Path of a file with one record per line.

  Returns:
    List of ``(original sentence, masked sentence, correct prediction)`` tuples.
    Blank lines are skipped.

  Raises:
    ValueError: If a non-blank line does not consist of exactly three fields.
  """
  sl = []
  print("Opening file:", fn)
  # The context manager closes the file even if a malformed line raises.
  with open(fn, "r") as masked_file:
    for line_number, raw_line in enumerate(masked_file, start=1):
      line = raw_line.replace(u'\xa0', u' ') # account for non-breaking spaces?
      line = line.replace("\n", "")
      if not line.strip():
        continue
      fields = line.split(" | ")
      if len(fields) != 3:
        raise ValueError(
          f"{fn}:{line_number}: expected 3 fields separated by ' | ', found {len(fields)}")
      sl.append(tuple(fields))

  return sl

# Files holding the masked test sentences.
aes_masked_fn, ramses_masked_fn, all_masked_fn = "aes_test_masked.txt", "ramses_test_masked.txt", "all_test_masked.txt"

# Load the three sets in the order AES, Ramses, combined.
aes_sentences, ramses_sentences, combined_sentences = [
  load_masked_sentences(source_fn)
  for source_fn in (aes_masked_fn, ramses_masked_fn, all_masked_fn)
]

Opening file: aes_test_masked.txt
Opening file: ramses_test_masked.txt
Opening file: all_test_masked.txt


In [None]:
# Preview the first five records read back from the file.
aes_sentences[:5]

[('Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn',
  'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t [MASK] m Pr-Jtn m Ax.t-Jtn',
  'nb-tA'),
 ('Hw.t-Hr.w nb.t-Hw.t', 'Hw.t-Hr.w [MASK]', 'nb.t-Hw.t'),
 ('aA =y m tm m ra =y wDA nHi.n =f yAw nfr mri anx',
  '[MASK] =y m tm m ra =y wDA nHi.n =f yAw nfr mri anx',
  'aA'),
 ('xr yr Xnw pA 4 wD SAa-m pA Dw yAb.ty n Ax.t-Jtn nfr.yt-r pA Dw ymn.ty n Ax.t-Jtn Ax.t-Jtn r-Ha.w =s tAy',
  'xr yr Xnw pA 4 wD SAa-m pA [MASK] yAb.ty n Ax.t-Jtn nfr.yt-r pA Dw ymn.ty n Ax.t-Jtn Ax.t-Jtn r-Ha.w =s tAy',
  'Dw'),
 ('bn yri.y HyHy =f', 'bn yri.y HyHy [MASK]', '=f')]

In [None]:
# Preview the first five records read back from the file.
ramses_sentences[:5]

[('ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty ir.w r-Hry',
  'ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA [MASK] nb.w nty ir.w r-Hry',
  'sS.w'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w [MASK] pHty', 'sxm'),
 ('iw =i Hr Ssp tA aDd.t Sri.t', '[MASK] =i Hr Ssp tA aDd.t Sri.t', 'iw'),
 ('iw xm.w-sk Hr sSA xaw =k', 'iw xm.w-sk [MASK] sSA xaw =k', 'Hr'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w sxm [MASK]', 'pHty')]

In [None]:
# Slice across the AES/Ramses boundary of the combined test set (the download cell
# notes AES as rows 1-16465 and Ramses as rows 16466-25377).
combined_sentences[16460:16470]

[('wD Hm =f n yr.y-pa.t HA.ty-a Hr.y-sStA-n-pr-nswt Hr.y-tp-n-tA-r-Dr=f xtm.ty-by.ty sbA-tA.wy-HAw.ty ym.y-ra-smr.w ym.y-ra-xtm.ty Wsr-NSm.t',
  'wD Hm =f n yr.y-pa.t HA.ty-a Hr.y-sStA-n-pr-nswt [MASK] xtm.ty-by.ty sbA-tA.wy-HAw.ty ym.y-ra-smr.w ym.y-ra-xtm.ty Wsr-NSm.t',
  'Hr.y-tp-n-tA-r-Dr=f'),
 ('tw=k qAi.T Sma.T', 'tw=k [MASK] Sma.T', 'qAi.T'),
 ('PA-kAmn 100', '[MASK] 100', 'PA-kAmn'),
 ('qd.n =y pr Sad.n =y S Hm-nTr MnT.w-Htp',
  'qd.n =y [MASK] Sad.n =y S Hm-nTr MnT.w-Htp',
  'pr'),
 ('Dd =y n m myn', 'Dd =y [MASK] m myn', 'n'),
 ('ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty ir.w r-Hry',
  'ink i.di n =k pA XAr 20 2 1/2 bd.t Hna nA sS.w nb.w nty [MASK] r-Hry',
  'ir.w'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw Dsr xaw.w sxm [MASK]', 'pHty'),
 ('iw =i Hr Ssp tA aDd.t Sri.t', 'iw =i [MASK] Ssp tA aDd.t Sri.t', 'Hr'),
 ('iw xm.w-sk Hr sSA xaw =k', '[MASK] xm.w-sk Hr sSA xaw =k', 'iw'),
 ('Hr-nbw Dsr xaw.w sxm pHty', 'Hr-nbw [MASK] xaw.w sxm pHty', 'Dsr')]

### Save predictions

In [None]:
from tqdm.notebook import tqdm, tqdm_notebook
def save_predictions_to_file(k, model, tokenizer, filename, test_sentences):
  """Run a fill-mask pipeline over the masked sentences and write the results to a file.

  Each output line is the fields of the input record joined with " | ", followed by
  " | " and ``str()`` of the pipeline output for the masked sentence.

  Args:
    k: Number of predictions to request per masked sentence (``top_k``).
    model: Model handed to the fill-mask pipeline.
    tokenizer: Tokenizer handed to the fill-mask pipeline.
    filename: Path of the predictions file to create or overwrite.
    test_sentences: Sequence of string tuples whose element 1 is the masked sentence.
  """
  print("Opening file:", filename)

  # Build the pipeline before opening the output file, so a failure during setup
  # does not truncate predictions written by an earlier run.
  # NOTE(review): no `device` is passed, so the device the pipeline uses depends on the
  # installed transformers version - confirm it matches where the caller put the model.
  model_unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=k)

  # The context manager closes the file even if a prediction raises part-way through.
  with open(filename, "w") as predictions_file:
    for sentence in tqdm_notebook(test_sentences, desc="Predicting", unit='sentence',
                                  mininterval=1.0, postfix={'filename':filename}):
      predictions = model_unmasker(sentence[1])
      line = " | ".join(sentence) + " | " + str(predictions)
      predictions_file.write(line + "\n")

In [None]:
# @title AES model (on AES) predictions
def regenerate_all_test_aes_predictions():
  """Write the AES model's top-10 predictions for the AES test sentences.

  Loads the model from ./aes-model; when that directory is missing, only prints
  instructions for obtaining it.
  """
  if not os.path.exists("aes-model"):
    print("Model not found: unable to regenerate predictions")
    print("get aes-model from huggingface and copy it to ./aes-model.")
    print("For details see: https://huggingface.co/egy-snlp-project/aes-model/tree/main")
    print("(only for people enrolled to the project)")
    return

  aes_model = AutoModelForMaskedLM.from_pretrained('./aes-model')
  aes_model.cuda()
  save_predictions_to_file(10, aes_model, aes_tokenizer, "all_test_aes_predictions.txt", aes_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_aes_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_aes_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_aes_predictions.txt",
              "all_test_aes_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_aes_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_aes = interact_manual.options(manual_name="Regenerate")
my_interact_aes(regenerate_all_test_aes_predictions)

Generating predictions, takes a long time.
Opening file: all_test_aes_predictions.txt


Predicting:   0%|          | 0/16465 [00:00<?, ?sentence/s, filename=all_test_aes_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title AES model (on all) predictions
def regenerate_all_test_aes_all_predictions():
  """Write the AES model's top-10 predictions for the combined test sentences.

  Loads the model from ./aes-model and resizes its token embeddings to the size of
  the combined tokenizer; when the directory is missing, only prints instructions
  for obtaining the model.
  """
  if not os.path.exists("aes-model"):
    print("Model not found: unable to regenerate predictions")
    print("get aes-model from huggingface and copy it to ./aes-model.")
    print("For details see: https://huggingface.co/egy-snlp-project/aes-model/tree/main")
    print("(only for people enrolled to the project)")
    return

  aes_model = AutoModelForMaskedLM.from_pretrained('./aes-model')
  aes_model.resize_token_embeddings(len(combined_tokenizer))
  aes_model.cuda()
  save_predictions_to_file(10, aes_model, combined_tokenizer, "all_test_aes_all_predictions.txt", combined_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_aes_all_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_aes_all_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_aes_all_predictions.txt",
              "all_test_aes_all_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_aes_all_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_aes_all = interact_manual.options(manual_name="Regenerate")
my_interact_aes_all(regenerate_all_test_aes_all_predictions)

Generating predictions, takes a long time.
Opening file: all_test_aes_all_predictions.txt


Predicting:   0%|          | 0/25377 [00:00<?, ?sentence/s, filename=all_test_aes_all_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title Ramses (on Ramses) model predictions
def regenerate_all_test_ramses_predictions():
  """Write the Ramses model's top-10 predictions for the Ramses test sentences.

  Loads the model from ./ramses-model; when that directory is missing, only prints
  instructions for obtaining it.
  """
  if not os.path.exists("ramses-model"):
    print("Model not found: unable to regenerate predictions")
    print("get ramses-model from huggingface and copy it to ./ramses-model.")
    print("For details see: https://huggingface.co/egy-snlp-project/ramses-model/tree/main")
    print("(only for people enrolled to the project)")
    return

  ramses_model = AutoModelForMaskedLM.from_pretrained('./ramses-model')
  ramses_model.cuda()
  save_predictions_to_file(10, ramses_model, ramses_tokenizer, "all_test_ramses_predictions.txt", ramses_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_ramses_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_ramses_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_ramses_predictions.txt",
              "all_test_ramses_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_ramses_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_ramses = interact_manual.options(manual_name="Regenerate")
my_interact_ramses(regenerate_all_test_ramses_predictions)

Generating predictions, takes a long time.
Opening file: all_test_ramses_predictions.txt


Predicting:   0%|          | 0/8912 [00:00<?, ?sentence/s, filename=all_test_ramses_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title Ramses (on all) model predictions
def regenerate_all_test_ramses_all_predictions():
  """Write the Ramses model's top-10 predictions for the combined test sentences.

  Loads the model from ./ramses-model and resizes its token embeddings to the size
  of the combined tokenizer; when the directory is missing, only prints instructions
  for obtaining the model.
  """
  if not os.path.exists("ramses-model"):
    print("Model not found: unable to regenerate predictions")
    print("get ramses-model from huggingface and copy it to ./ramses-model.")
    print("For details see: https://huggingface.co/egy-snlp-project/ramses-model/tree/main")
    print("(only for people enrolled to the project)")
    return

  ramses_model = AutoModelForMaskedLM.from_pretrained('./ramses-model')
  ramses_model.resize_token_embeddings(len(combined_tokenizer))
  ramses_model.cuda()
  save_predictions_to_file(10, ramses_model, combined_tokenizer, "all_test_ramses_all_predictions.txt", combined_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_ramses_all_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_ramses_all_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_ramses_all_predictions.txt",
              "all_test_ramses_all_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_ramses_all_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_ramses_all = interact_manual.options(manual_name="Regenerate")
my_interact_ramses_all(regenerate_all_test_ramses_all_predictions)

Generating predictions, takes a long time.
Opening file: all_test_ramses_all_predictions.txt


Predicting:   0%|          | 0/25377 [00:00<?, ?sentence/s, filename=all_test_ramses_all_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title Combined model predictions
def regenerate_all_test_combined_predictions():
  """Write the combined model's top-10 predictions for the combined test sentences.

  Loads the model from ./combined-model; when that directory is missing, only prints
  instructions for obtaining it.
  """
  if not os.path.exists("combined-model"):
    print("Model not found: unable to regenerate predictions")
    print("get combined-model from huggingface and copy it to ./combined-model.")
    print("For details see: https://huggingface.co/egy-snlp-project/combined-model/tree/main")
    print("(only for people enrolled to the project)")
    return

  combined_model = AutoModelForMaskedLM.from_pretrained('./combined-model')
  combined_model.cuda()
  save_predictions_to_file(10, combined_model, combined_tokenizer, "all_test_combined_predictions.txt", combined_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_combined_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_combined_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_combined_predictions.txt",
              "all_test_combined_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_combined_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_combined = interact_manual.options(manual_name="Regenerate")
my_interact_combined(regenerate_all_test_combined_predictions)

Generating predictions, takes a long time.
Opening file: all_test_combined_predictions.txt


Predicting:   0%|          | 0/25377 [00:00<?, ?sentence/s, filename=all_test_combined_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title Multilingual-Bert (base) predictions
def regenerate_all_test_mbert_predictions():
  """Write base multilingual BERT's top-10 predictions for the combined test sentences.

  Loads 'bert-base-multilingual-cased' and resizes its token embeddings to the size
  of the combined tokenizer before predicting.
  """
  combined_vocab_size = len(combined_tokenizer)
  mbert_model = AutoModelForMaskedLM.from_pretrained('bert-base-multilingual-cased')
  mbert_model.resize_token_embeddings(combined_vocab_size)
  # Module.cuda() returns the module itself, so it can be passed on directly.
  save_predictions_to_file(10, mbert_model.cuda(), combined_tokenizer, "all_test_mbert_predictions.txt", combined_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_mbert_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_mbert_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_mbert_predictions.txt",
              "all_test_mbert_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_mbert_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_mbert = interact_manual.options(manual_name="Regenerate")
my_interact_mbert(regenerate_all_test_mbert_predictions)

Generating predictions, takes a long time.


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Opening file: all_test_mbert_predictions.txt


Predicting:   0%|          | 0/25377 [00:00<?, ?sentence/s, filename=all_test_mbert_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# @title Coptic predictions
def regenerate_all_test_coptic_predictions():
  """Write the Coptic MicroBERT model's top-10 predictions for the combined test sentences.

  Loads 'lgessler/microbert-coptic-m' and resizes its token embeddings to the size of
  the Coptic tokenizer before predicting.

  NOTE(review): the output saved with this cell warns that the cls.predictions.*
  weights were newly initialized for this checkpoint - confirm that predictions
  from it are meaningful.
  """
  coptic_vocab_size = len(coptic_tokenizer)
  coptic_model = AutoModelForMaskedLM.from_pretrained("lgessler/microbert-coptic-m")
  coptic_model.resize_token_embeddings(coptic_vocab_size)
  # Module.cuda() returns the module itself, so it can be passed on directly.
  save_predictions_to_file(10, coptic_model.cuda(), coptic_tokenizer, "all_test_coptic_predictions.txt", combined_sentences)

import shutil  # needed by the copy branch below; no earlier cell imports it

# Reuse existing predictions when possible: a file in the working directory wins, then a
# copy from snlp-project/predictions; only otherwise run the slow generation.
if os.path.exists("all_test_coptic_predictions.txt"):
  print("Old predictions already exist.")
elif os.path.exists("snlp-project/predictions/all_test_coptic_predictions.txt"):
  print("Old predictions loaded from repository")
  shutil.copy("snlp-project/predictions/all_test_coptic_predictions.txt",
              "all_test_coptic_predictions.txt")
else:
  print("Generating predictions, takes a long time.")
  regenerate_all_test_coptic_predictions()

# "Regenerate" button: reruns the generation on demand, overwriting the predictions file.
from ipywidgets import interact_manual
my_interact_coptic = interact_manual.options(manual_name="Regenerate")
my_interact_coptic(regenerate_all_test_coptic_predictions)

Generating predictions, takes a long time.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at lgessler/microbert-coptic-m and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Opening file: all_test_coptic_predictions.txt


Predicting:   0%|          | 0/25377 [00:00<?, ?sentence/s, filename=all_test_coptic_predictions.txt]

interactive(children=(Button(description='Regenerate', style=ButtonStyle()), Output()), _dom_classes=('widget-…

In [None]:
# KoichiYasuoka/roberta-base-coptic <-- RemBertTokenizerFast?
# KoichiYasuoka/deberta-base-coptic <-- DebertaV2TokenizerFast

### Load saved predictions

In [None]:
def get_sentences_with_predictions(file_name):
  """Parse a saved predictions file into a list of records.

  Each non-blank line of the file is expected to hold four fields separated
  by " | ": the original sentence, the masked sentence, the correct token and
  a Python literal (a list of prediction dicts). Non-breaking spaces are
  normalised to ordinary spaces before the line is split.

  Args:
    file_name: path of the predictions file to read.

  Returns:
    A list of [original_sentence, masked_sentence, correct_prediction,
    predictions] lists, one per non-blank line, where predictions is a list.

  Raises:
    ValueError: if a line has fewer than four fields or its last field is not
      a plain Python literal.
    SyntaxError: if the last field is not syntactically valid Python.
  """
  # Imported locally so this cell does not rely on a notebook-level import.
  import ast

  predictions_sentences = []

  print("Opening file:", file_name)
  # `with` guarantees the handle is closed even when a line fails to parse.
  with open(file_name, "r") as predictions_sentences_file:
    for sentence in predictions_sentences_file:
      line = sentence.replace(u'\xa0', u' ') # account for non-breaking spaces?
      line = line.replace("\n", "")
      if not line.strip():
        # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
        continue
      # maxsplit=3 keeps any " | " inside the predictions literal intact.
      original_sentence, masked_sentence, correct_prediction, predictions_str = line.split(" | ", 3)
      # literal_eval only accepts literals, so file content can never run code
      # (a bare eval() would execute whatever expression the file contains).
      predictions = list(ast.literal_eval(predictions_str))
      predictions_sentences.append([original_sentence, masked_sentence, correct_prediction, predictions])

  return predictions_sentences

In [None]:
# Score every model's saved predictions on the full test set and gather the
# metrics into one table with one row per model.
metric_columns = ['name','hit@1','hit@5','hit@10','MRR@5','MRR@10']

pred_models = [
      ("AES", "all_test_aes_predictions.txt"),
      ("AES (all)", "all_test_aes_all_predictions.txt"),
      ("Ramses", "all_test_ramses_predictions.txt"),
      ("Ramses (all)", "all_test_ramses_all_predictions.txt"),
      ("Combined", "all_test_combined_predictions.txt"),
      ("M-BERT", "all_test_mbert_predictions.txt"),
      ("Coptic", "all_test_coptic_predictions.txt")
    ]

res_rows = []
for model_name, predictions_file in pred_models:
  ps = get_sentences_with_predictions(predictions_file)
  print(ps[:1])  # sanity check: first parsed record
  hit1, hit5, hit10, mrr5, mrr10 = calculate_metrics(ps)
  res_obj = {'name': model_name, 'hit@1': hit1, 'hit@5': hit5, 'hit@10': hit10, 'MRR@5': mrr5, 'MRR@10': mrr10 }
  nr = pd.DataFrame(res_obj, index=[0])
  print(nr)
  res_rows.append(nr)

# Concatenate once, after the loop: this avoids re-copying the growing frame on
# every iteration, avoids concatenating with an empty frame, and gives every
# model its own row label instead of a repeated 0.
if res_rows:
  res_df = pd.concat(res_rows, ignore_index=True)
else:
  res_df = pd.DataFrame(columns=metric_columns)

Opening file: all_test_aes_predictions.txt
[['Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn', 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t [MASK] m Pr-Jtn m Ax.t-Jtn', 'nb-tA', [{'score': 0.9032711982727051, 'token': 119586, 'token_str': 'nb-tA', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn'}, {'score': 0.04545396938920021, 'token': 119585, 'token_str': 'nb-p.t', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-p.t m Pr-Jtn m Ax.t-Jtn'}, {'score': 0.0054491497576236725, 'token': 120212, 'token_str': 'ym.y-HAb-sd', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t ym.y-HAb-sd m Pr-Jtn

In [None]:
res_df

Unnamed: 0,name,hit@1,hit@5,hit@10,MRR@5,MRR@10
0,AES,0.234983,0.401822,0.457091,0.29855,0.305906
0,AES (all),0.188123,0.331324,0.374512,0.242211,0.248065
0,Ramses,0.262118,0.461849,0.535907,0.33745,0.347382
0,Ramses (all),0.026323,0.061473,0.074122,0.039388,0.041121
0,Combined,0.250108,0.431335,0.4947,0.31865,0.32722
0,M-BERT,0.009891,0.017772,0.023683,0.012697,0.013463
0,Coptic,0.0,0.000197,0.00063,4.5e-05,0.000101


## Get multitoken predictions (better reflects the gap-filling use case, i.e., the masked word has some surrounding context)

In [None]:
# For each predictions file, write "<name>_multitoken.txt" containing only the
# lines whose original sentence has more than two whitespace-separated tokens.
file_names = [
        "all_test_aes_predictions.txt", "all_test_aes_all_predictions.txt",
        "all_test_ramses_predictions.txt", "all_test_ramses_all_predictions.txt",
        "all_test_combined_predictions.txt",
        "all_test_mbert_predictions.txt",
        "all_test_coptic_predictions.txt"
    ]

for fn in file_names:
    target_name = fn.split(".")[0] + "_multitoken.txt"
    # `with` closes both handles even if a malformed line raises below.
    with open(fn, "r") as source_file, open(target_name, "w") as target_file:
        for l in source_file:
            line = l.replace(u'\xa0', u' ') # account for non-breaking spaces?
            line = line.replace("\n", "")
            # Only the first field is needed; unpacking all four keeps the
            # ValueError on lines that do not have the expected layout.
            original_sentence, _masked, _correct, _predictions = line.split(" | ", 3)
            if len(original_sentence.split()) > 2:
                # Copy the untouched line so the output keeps the input format.
                target_file.write(l)

In [None]:
# Score every model on the multitoken subset and gather the metrics into one
# table with one row per model.
mt_metric_columns = ['name','hit@1','hit@5','hit@10','MRR@5','MRR@10']

mt_pred_models = [
      ("AES", "all_test_aes_predictions_multitoken.txt"),
      ("AES (all)", "all_test_aes_all_predictions_multitoken.txt"),
      ("Ramses", "all_test_ramses_predictions_multitoken.txt"),
      ("Ramses (all)", "all_test_ramses_all_predictions_multitoken.txt"),
      ("Combined", "all_test_combined_predictions_multitoken.txt"),
      ("M-BERT", "all_test_mbert_predictions_multitoken.txt"),
      ("Coptic", "all_test_coptic_predictions_multitoken.txt")
    ]

mt_res_rows = []
for model_name, predictions_file in mt_pred_models:
  ps = get_sentences_with_predictions(predictions_file)
  print(ps[:1])  # sanity check: first parsed record
  hit1, hit5, hit10, mrr5, mrr10 = calculate_metrics(ps)
  res_obj = {'name': model_name, 'hit@1': hit1, 'hit@5': hit5, 'hit@10': hit10, 'MRR@5': mrr5, 'MRR@10': mrr10 }
  nr = pd.DataFrame(res_obj, index=[0])
  print(nr)
  mt_res_rows.append(nr)

# Concatenate once, after the loop: this avoids re-copying the growing frame on
# every iteration, avoids concatenating with an empty frame, and gives every
# model its own row label instead of a repeated 0.
if mt_res_rows:
  mt_res_df = pd.concat(mt_res_rows, ignore_index=True)
else:
  mt_res_df = pd.DataFrame(columns=mt_metric_columns)

Opening file: all_test_aes_predictions_multitoken.txt
[['Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn', 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t [MASK] m Pr-Jtn m Ax.t-Jtn', 'nb-tA', [{'score': 0.9032711982727051, 'token': 119586, 'token_str': 'nb-tA', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-tA m Pr-Jtn m Ax.t-Jtn'}, {'score': 0.04545396938920021, 'token': 119585, 'token_str': 'nb-p.t', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t nb-p.t m Pr-Jtn m Ax.t-Jtn'}, {'score': 0.0054491497576236725, 'token': 120212, 'token_str': 'ym.y-HAb-sd', 'sequence': 'Raw-Hr.w-Ax.ty-Hai-m-Ax.t M-rn=f-m-Sw-n.ty-m-Jtn Di anx D.t nHH Jtn anx wr ym.y-HAb-sd nb-Snn.t-nb-Jtn nb-p.t ym.y-HAb-

In [None]:
mt_res_df

Unnamed: 0,name,hit@1,hit@5,hit@10,MRR@5,MRR@10
0,AES,0.240144,0.417762,0.477617,0.307374,0.315386
0,AES (all),0.191821,0.342933,0.388983,0.248794,0.255059
0,Ramses,0.278501,0.488897,0.561593,0.358092,0.367861
0,Ramses (all),0.029392,0.068002,0.082151,0.043684,0.045622
0,Combined,0.261649,0.454657,0.521701,0.334476,0.343519
0,M-BERT,0.011455,0.020583,0.027384,0.014706,0.015587
0,Coptic,0.0,0.000228,0.000685,5.2e-05,0.00011


# Get sawmedizin predictions

In [None]:
# For each predictions file, copy a fixed range of lines into
# "<name>_sawmedizin.txt". Note that "all_test_ramses_predictions.txt" is not
# in this list.
file_names = [
        "all_test_aes_predictions.txt", "all_test_aes_all_predictions.txt",
        "all_test_ramses_all_predictions.txt",
        "all_test_combined_predictions.txt",
        "all_test_mbert_predictions.txt",
        "all_test_coptic_predictions.txt"
    ]

# 0-based, end-exclusive range of lines copied from every predictions file.
# NOTE(review): these hard-coded offsets presumably mark where the sawmedizin
# sentences sit in the test set -- re-check them if the test set or the
# prediction files are regenerated.
sawmedizin_lines = slice(575, 2018)

for fn in file_names:
    # `with` closes each handle even if reading or writing fails.
    with open(fn, "r") as source_file:
        lines = source_file.readlines()[sawmedizin_lines]
    with open(fn.split(".")[0] + "_sawmedizin.txt", "w") as target_file:
        target_file.writelines(lines)

In [None]:
# Score every model on the sawmedizin subset and gather the metrics into one
# table with one row per model.
sm_metric_columns = ['name','hit@1','hit@5','hit@10','MRR@5','MRR@10']

sm_pred_models = [
      ("AES", "all_test_aes_predictions_sawmedizin.txt"),
      ("AES (all)", "all_test_aes_all_predictions_sawmedizin.txt"),
      ("Ramses (all)", "all_test_ramses_all_predictions_sawmedizin.txt"),
      ("Combined", "all_test_combined_predictions_sawmedizin.txt"),
      ("M-BERT", "all_test_mbert_predictions_sawmedizin.txt"),
      ("Coptic", "all_test_coptic_predictions_sawmedizin.txt")
    ]

sm_res_rows = []
for model_name, predictions_file in sm_pred_models:
  ps = get_sentences_with_predictions(predictions_file)
  print(ps[:1])  # sanity check: first parsed record
  hit1, hit5, hit10, mrr5, mrr10 = calculate_metrics(ps)
  res_obj = {'name': model_name, 'hit@1': hit1, 'hit@5': hit5, 'hit@10': hit10, 'MRR@5': mrr5, 'MRR@10': mrr10 }
  nr = pd.DataFrame(res_obj, index=[0])
  print(nr)
  sm_res_rows.append(nr)

# Concatenate once, after the loop: this avoids re-copying the growing frame on
# every iteration, avoids concatenating with an empty frame, and gives every
# model its own row label instead of a repeated 0.
if sm_res_rows:
  sm_res_df = pd.concat(sm_res_rows, ignore_index=True)
else:
  sm_res_df = pd.DataFrame(columns=sm_metric_columns)

Opening file: all_test_aes_predictions_sawmedizin.txt
[['my.tyt', '[MASK]', 'my.tyt', [{'score': 0.3099483549594879, 'token': 121108, 'token_str': 'Dd-mdw', 'sequence': 'Dd-mdw'}, {'score': 0.08297460526227951, 'token': 120032, 'token_str': 'k.t', 'sequence': 'k.t'}, {'score': 0.014134310185909271, 'token': 122971, 'token_str': 'atx', 'sequence': 'atx'}, {'score': 0.01332409679889679, 'token': 121109, 'token_str': 'Xr.y-HAb.t', 'sequence': 'Xr.y-HAb.t'}, {'score': 0.01192709244787693, 'token': 119771, 'token_str': 'nD', 'sequence': 'nD'}, {'score': 0.01156017929315567, 'token': 137709, 'token_str': 'ym.y-xt-Hm.w-kA', 'sequence': 'ym.y-xt-Hm.w-kA'}, {'score': 0.008484387770295143, 'token': 136663, 'token_str': 'ywi=f-pw', 'sequence': 'ywi=f-pw'}, {'score': 0.0072834668681025505, 'token': 133206, 'token_str': 'ymAx.w-xr-nb=f', 'sequence': 'ymAx.w-xr-nb=f'}, {'score': 0.006853368133306503, 'token': 129620, 'token_str': 'Hm-kA', 'sequence': 'Hm-kA'}, {'score': 0.006628577131778002, 'token'

In [None]:
sm_res_df

Unnamed: 0,name,hit@1,hit@5,hit@10,MRR@5,MRR@10
0,AES,0.262647,0.482328,0.519751,0.353523,0.358516
0,AES (all),0.262647,0.471933,0.50797,0.350416,0.355237
0,Ramses (all),0.025641,0.053361,0.066528,0.036059,0.037961
0,Combined,0.270963,0.499653,0.539155,0.364749,0.370109
0,M-BERT,0.019404,0.025641,0.030492,0.021853,0.022408
0,Coptic,0.0,0.0,0.0,0.0,0.0


# Get predictions for some sentences with actual lacunae

In [None]:
# Load the masked-LM stored in ./aes-model and wrap it in a fill-mask pipeline
# that returns the 10 highest-scoring candidates per mask.
aes_model = AutoModelForMaskedLM.from_pretrained('./aes-model')
aes_unmasker = pipeline(
    'fill-mask',
    model=aes_model,
    tokenizer=aes_tokenizer,
    top_k=10,
)

In [None]:
# Load the masked-LM stored in ./combined-model and wrap it in a fill-mask
# pipeline that returns the 10 highest-scoring candidates per mask.
combined_model = AutoModelForMaskedLM.from_pretrained('./combined-model')
combined_unmasker = pipeline(
    'fill-mask',
    model=combined_model,
    tokenizer=combined_tokenizer,
    top_k=10,
)

##### [IBUCEEy6AYQxaExmhmbFmeHXoyc] jr ḥm-nṯr nb wꜥb nb jri̯ =sn zp m rʾ-pr [LACUNAE_WORD] st nn rḏi̯.t ḫpr rd ḥr zꜣṯ nn rḏi̯.t smn jwꜥ ḥr-sꜣ =sn ḥr-n,tj n ꜥpr ḥw,t-nṯr m wḫꜣ =s

In [None]:
# AES model: candidate tokens for the [MASK] position in this sentence.
[candidate['token_str'] for candidate in aes_unmasker("yr Hm-nTr nb wab nb yri =sn sp m ra-pr [MASK] st nn rDi.t xpr rd Hr sAT nn rDi.t smn ywa Hr-sA =sn Hr-n,ty n apr Hw,t-nTr m wxA =s")]

['=f', 'nb.t', 'pn', '=sn', 'Raw', 'n.ty', 'nTr', '=k', 'tw', 'nb']

In [None]:
# Combined model: candidate tokens for the [MASK] position in this sentence.
[candidate['token_str'] for candidate in combined_unmasker("yr Hm-nTr nb wab nb yri =sn sp m ra-pr [MASK] st nn rDi.t xpr rd Hr sAT nn rDi.t smn ywa Hr-sA =sn Hr-n,ty n apr Hw,t-nTr m wxA =s")]

['n', 'm', '=f', 'nb', 'r', 'nb.t', '=sn', '=k', 'n.y', 'pn']

##### [IBYCkyH6L5gQckCKmWhjhDYdFWQ] [LACUNAE_WORD] mrḥ,t ḥfꜣ,t km.t wꜥwy,t gmm.t m ḥs

In [None]:
# AES model: candidate tokens for a sentence-initial [MASK].
[candidate['token_str'] for candidate in aes_unmasker("[MASK] mrH,t HfA,t km.t wawy,t gmm.t m Hs")]

['y', 'ynk', 'wn.yn', 'nn', 'm', 'aHa.n', 'ms', 'psi', 'dr', 'nA']

In [None]:
# Combined model: candidate tokens for a sentence-initial [MASK].
[candidate['token_str'] for candidate in combined_unmasker("[MASK] mrH,t HfA,t km.t wawy,t gmm.t m Hs")]

['y', 't', 'm', 'n', 'pA', 'yw', 'nA', 'yr.y-yx.t-nswt', 'yr', 'nn']

##### [IBcAiNtHQLUgb001jZfbAhEi21k] ḫpi̯.w pw jri̯.n [LACUNAE_WORD] =sn

In [None]:
# AES model: candidate tokens for the [MASK] position in this sentence.
[candidate['token_str'] for candidate in aes_unmasker("xpi.w pw yri.n [MASK] =sn")]

['n', 'rn', 'yri.n', 's.t', 'ym.y', 'ym', 'Hr', 'y:m', 'sDm', 'kA']

In [None]:
# Combined model: candidate tokens for the [MASK] position in this sentence.
[candidate['token_str'] for candidate in combined_unmasker("xpi.w pw yri.n [MASK] =sn")]

['ym', 'n', 'Hr', 'r', 'yb', 'yri.n', 'rn', 'Hm', 'kA', 'nb']

##### [LACUNAE_WORD] 1

In [None]:
# AES model: candidate tokens for a [MASK] followed only by the numeral 1.
[candidate['token_str'] for candidate in aes_unmasker("[MASK] 1")]

['xpS', 'mnw.t', 'mys.t', 'sxn', 'by.t', 'Trp', 's.t', 'ta-rtH', 'Ha', 'nnSm']

In [None]:
# Combined model: candidate tokens for a [MASK] followed only by the numeral 1.
[candidate['token_str'] for candidate in combined_unmasker("[MASK] 1")]

['ywa', 'mw', 'nbs', 'Sns', 'sfT', 'ta-wt', 'nnSm', 'fAi.t', 'ra', 'mnDm']

##### [IBUCKcMIMTcSFkC2uRPsTtB79yw] mrḥ,t ḥḏ.t 1 mrḥ,t [LACUNAE_WORD] 1 mrḥ,t db 1 mrḥ,t ꜥꜣ 1 njꜣ,w ꜥḏ 1 jḥ ꜥḏ 1 ꜥš ẖpꜣ 1 pr,t ꜥnw 1 ꜥnt,w nḏm 1 sft 1

In [None]:
# AES model: candidate tokens for the [MASK] item inside this list of entries.
[candidate['token_str'] for candidate in aes_unmasker("mrH,t HD.t 1 mrH,t [MASK] 1 mrH,t db 1 mrH,t aA 1 nyA,w aD 1 yH aD 1 aS XpA 1 pr,t anw 1 ant,w nDm 1 sft 1")]

['Snd.t',
 'Hnq.t',
 'sntr',
 'HD.t',
 'Sns',
 'sAT',
 'fAi.t',
 'mys.t',
 'msdm.t',
 'rnp.t-sp']

In [None]:
# Combined model: candidate tokens for the [MASK] item inside this list of entries.
[candidate['token_str'] for candidate in combined_unmasker("mrH,t HD.t 1 mrH,t [MASK] 1 mrH,t db 1 mrH,t aA 1 nyA,w aD 1 yH aD 1 aS XpA 1 pr,t anw 1 ant,w nDm 1 sft 1")]

['aA', 'ps', 'XAr', 'sw.t', 'HD.t', 'ta-rtH', 'yA.tt', 'nbs', 'wAD.t', 'sntr']

##### [IBUBdw5JznewxU5xjhSj8hk35BA] [LACUNAE_WORD] =f Ttj-rsu̯ wḥm-ꜥnḫ

In [None]:
# AES model: candidate tokens for a sentence-initial [MASK].
# NOTE(review): "rsu̯" still carries a combining diacritic, whereas the other
# inputs in this section are plain ASCII (e.g. "yri", "xpi.w") -- presumably it
# should be "rsu"; confirm against the transliteration used for training.
[candidate['token_str'] for candidate in aes_unmasker("[MASK] =f Tty-rsu̯ wHm-anx")]

['sA', 'sn', 'Hm.t', 'sA.t', 'bw.t', 'xpi', 'wnn', 'DAi', 'stp', 'maw.t']

In [None]:
# Combined model: candidate tokens for a sentence-initial [MASK].
# NOTE(review): "rsu̯" still carries a combining diacritic, whereas the other
# inputs in this section are plain ASCII (e.g. "yri", "xpi.w") -- presumably it
# should be "rsu"; confirm against the transliteration used for training.
[candidate['token_str'] for candidate in combined_unmasker("[MASK] =f Tty-rsu̯ wHm-anx")]

['sA', 'sA.t', 'Hm.t', 'sn', 'Dd', 'iw', 'sn.t', 'mtw', 'in', 'yw']

##### [IBYCN98wpLk6MUaJmYQ1uf7R8KE] stp.n sw yty =f Jmn m [MASK] m rx.n =f yb =f r qd Hw,t-nTr r smnx ra-pr

In [None]:
# Combined model: candidate tokens for the [MASK] position in this sentence.
[candidate['token_str'] for candidate in combined_unmasker("stp.n sw yty =f Jmn m [MASK] m rx.n =f yb =f r qd Hw,t-nTr r smnx ra-pr")]

['p.t', '=f', 'Jwn.w', 'nTr', 'tA', 'Xr-nTr', 'ra', 'hrw', 'Raw', 'Ax.t']

##### [IBYCkyH6L5gQckCKmWhjhDYdFWQ] [MASK] mrH,t HfA,t km.t wawy,t gmm.t m Hs

In [None]:
# Combined model: candidate tokens for a sentence-initial [MASK].
# This is the same input that was already passed to combined_unmasker earlier
# in this section.
[candidate['token_str'] for candidate in combined_unmasker("[MASK] mrH,t HfA,t km.t wawy,t gmm.t m Hs")]

['y', 't', 'm', 'n', 'pA', 'yw', 'nA', 'yr.y-yx.t-nswt', 'yr', 'nn']

In [None]:
# AES model: candidate tokens for a lone [MASK] with no surrounding words
# (presumably a no-context baseline).
[candidate['token_str'] for candidate in aes_unmasker("[MASK]")]

In [None]:
# Combined model: candidate tokens for a lone [MASK] with no surrounding words
# (presumably a no-context baseline).
[candidate['token_str'] for candidate in combined_unmasker("[MASK]")]