This file trains ngram models and their aggregation for task of filling in gaps of Ancient Egyptian sentences.

The code is hidden by default for brevity. Click button `Show code` (or equivalent in your language) to delve in to the details.

# Setup

In [1]:
# @title Setup work space and download files { display-mode: "form" }

# Create working directory and move to it.
!echo -n "Populating working directory: "
!mkdir -p /content/data-pos
%cd /content/data-pos

from google.colab import userdata # Needed if using private repository
from tqdm.notebook import tqdm, tqdm_notebook
import requests
import os

rt_url = 'https://zenodo.org/records/4954597/files/ramses-trl_2021_05_29.zip?download=1'
aes_filenames = [
    "_aes_bbawamarna.json", "_aes_bbawhistbiospzt.json",
    "_aes_sawmedizin.json", "_aes_bbawarchive.json",
    "_aes_bbawpyramidentexte.json", "_aes_bbawbriefe.json",
    "_aes_bbawramessiden.json", "_aes_smaek.json",
    "_aes_bbawfelsinschriften.json", "_aes_bbawtempelbib.json", "_aes_tb.json",
    "_aes_bbawgrabinschriften.json", "_aes_bbawtotenlit.json",
    "_aes_tuebingerstelen.json", "_aes_bbawgraeberspzt.json",
    "_aes_sawlit.json" ]
misc_files = [ 'preprocessing/final_files/gaps/all_gaps_id.txt',
               'data/aed-tei/dictionary.json',
               'translations/ge_en_dictionary.json',
               'data/marete-ramses/aligned/combined_dev.txt',
               'data/marete-ramses/aligned/combined_test.txt',
               'preprocessing/final_files/gaps/all_gaps_id_harmonized.txt' ]
aes_path = 'https://raw.githubusercontent.com/simondschweitzer/aes/main/files/aes/'
egy_path='https://raw.githubusercontent.com/annasahola/egy-gaps/main/'
models = [ '2024-03-13a-W2V-AES-alt.model', '2024-03-13a-W2V.RTC-optimized.model',
           '2024-03-13a-W2V-AES-TR-alt.model', '2024-03-13a-W2V-TR-combined-2.01.model' ]
egy_release='https://github.com/annasahola/egy-gaps/releases/download/release-1.0pre1/'

for x in tqdm_notebook(range(3), desc="Downloading"):
  if x == 0 and os.path.exists('ramses-trl/data/src-train.txt') != True:
    print(f"Loading {rt_url}")
    response = requests.get(rt_url)
    open("ramses-trl_2021_05_29.zip", "wb").write(response.content)
    !unzip -q ramses-trl_2021_05_29.zip
  if x == 1 and os.path.exists('_aes_bbawamarna.json') != True:
    for file in aes_filenames:
      print(f"Loading {file}")
      response = requests.get(aes_path + file)
      with open(file, "wb") as file_out:
        file_out.write(response.content)
  if x == 2:
    for path_file in misc_files:
      filename = os.path.basename(path_file)
      if not os.path.exists(filename):
        print(f"Downloading {filename}")
        response = requests.get(egy_path + path_file)
        if not response.ok:
          raise(Exception(f"Failed to download {egy_path + path_file}"))
        with open(filename, "wb") as file_out:
          file_out.write(response.content)
  if x == 3:
    for model in models:
      if not os.path.exists("models/" + model):
        !mkdir -p models
        print(f"Downloading models/{model}")
        response = requests.get(egy_release + model)
        if not response.ok:
          raise(Exception(f"Failed to download {egy_release + model}"))
        with open(f"models/{model}", "wb") as file_out:
          file_out.write(response.content)
    if not os.path.exists('models/2024-03-13a-W2V RTC-optimized.model'):
      !cp 'models/2024-03-13a-W2V.RTC-optimized.model' 'models/2024-03-13a-W2V RTC-optimized.model'



Populating working directory: /content/data-pos


Downloading:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# @title Check files after downloading { display-mode: "form" }
import os

aes_filenames = [
    "_aes_bbawamarna.json", "_aes_bbawhistbiospzt.json",
    "_aes_sawmedizin.json", "_aes_bbawarchive.json",
    "_aes_bbawpyramidentexte.json", "_aes_bbawbriefe.json",
    "_aes_bbawramessiden.json", "_aes_smaek.json",
    "_aes_bbawfelsinschriften.json", "_aes_bbawtempelbib.json", "_aes_tb.json",
    "_aes_bbawgrabinschriften.json", "_aes_bbawtotenlit.json",
    "_aes_tuebingerstelen.json", "_aes_bbawgraeberspzt.json",
    "_aes_sawlit.json" ]
gaps_file = "all_gaps_id.txt"
gaps_file2 = "all_gaps_id_harmonized.txt"
aed_dictionary = "dictionary.json"
ge_en_dict = "ge_en_dictionary.json"
combined_files = [ "combined_dev.txt", "combined_dev.txt" ]
preprocessed_files = [aed_dictionary, ge_en_dict, gaps_file, gaps_file2] + combined_files

for fi in aes_filenames + preprocessed_files:
  if os.path.exists(fi) != True:
    print(f"file {fi} does not exist on file system")
    raise Exception("Missing file(s)")

# Make sure ramses-trl directory exists.
for di in [ 'ramses-trl', 'ramses-trl/data' ]:
  if os.path.exists(di) != True:
    os.mkdir(di)

if os.path.exists('ramses-trl/data/src-train.txt') != True:
  rt_url = 'https://zenodo.org/records/4954597/files/ramses-trl_2021_05_29.zip?download=1'
  import requests
  print(f"Loading {rt_url}")
  response = requests.get(rt_url)
  open("ramses-trl_2021_05_29.zip", "wb").write(response.content)
  !unzip ramses-trl_2021_05_29.zip

rtc_path = 'ramses-trl/data/'
rtc_ext = '.txt'
rtc_src_files = [ 'src-train', 'src-val' ] # Use -train for training/gaps and -val for testing/gaps.
rtc_tgt_files = [ 'tgt-train', 'tgt-val' ]
rtc_files = [rtc_path+fname+rtc_ext for fname in rtc_src_files + rtc_tgt_files]
all_files = aes_filenames + [aed_dictionary] + rtc_files
for fi in all_files:
  if os.path.exists(fi) != True:
    print(f"file {fi} does not exist on Google Drive")
    raise Exception("Missing file(s)")
print("\U0001f44D All required files located.")

👍 All required files located.


In [3]:
# @title Install required libraries { display-mode: "form" }
#%pip install pympler
#%pip install Levenshtein
%pip install unidecode
#%pip install geneticalgorithm2
#%pip install func_timeout
%pip install gardiner2unicode
%pip install wikitextparser



In [4]:
# @title Load required libraries { display-mode: "form" }
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import json
import io
import re
import string
import random
from tqdm.notebook import tqdm, tqdm_notebook
import numpy as np
import collections
import gensim.models
#import Levenshtein
import heapq
from unidecode import unidecode
#from geneticalgorithm2 import geneticalgorithm2 as ga2
from collections import Counter
import time
from tabulate import tabulate
from gardiner2unicode import GardinerToUnicodeMap
from ipywidgets import interact, interactive, fixed, interact_manual, widgets
from gensim.models.callbacks import CallbackAny2Vec
from IPython.display import clear_output

Import libraries (all libraries should be installed above)

# Data extraction

In [5]:
# @title Parse AES Files into `aes_file_contents`, `aes_sentence_dictionary` { display-mode: "form" }

aes_print_sentences = []
written_form_to_lemma_id = dict()
written_form_to_mdc = dict()
def read_aes_files():
  aes_file_contents = []
  aes_sentence_dictionary = {}
  global aes_print_sentences

  notebook = tqdm_notebook(aes_filenames,desc="Loading...")
  for filename in notebook:
    # go through each file
    notebook.desc = filename
    with open(filename, "r") as file:
      data = json.load(file)
      sentences = []

      # go through all elements (sentences) in the JSON data
      for e in data:
        sentence = []
        aes_sentence_dictionary[e] = {}

        sentence_object = data[e]
        token_list = sentence_object["token"]

        for token in token_list:
          # written_form and mdc exist for all tokens
          cotext_translation = token["cotext_translation"] if "cotext_translation" in token else ""
          lemma_form = token["lemma_form"] if "lemma_form" in token else ""
          lemma_id = token["lemmaID"] if "lemmaID" in token else "-1"
          pos = token["pos"] if "pos" in token else ""
          token_object = (token["mdc"], cotext_translation, lemma_form, lemma_id, pos)
          if 'written_form' in token:
            written_form_to_lemma_id[token['written_form']] = lemma_id
          if 'written_form' in token:
            written_form_to_mdc[token['written_form']] = token["mdc"]
          sentence.append(token_object)

        sentences.append(sentence)

        aes_sentence_dictionary[e]["text"] = sentence_object["text"]
        aes_sentence_dictionary[e]["translation"] = sentence_object["sentence_translation"]
        aes_sentence_dictionary[e]["corpus"] = sentence_object["corpus"]
        aes_sentence_dictionary[e]["date"] = sentence_object["date"]
        aes_sentence_dictionary[e]["tokens"] = sentence
        aes_sentence_dictionary[e]["token_list"] = "<s> " + " ".join([token[3] for token in sentence]) + " </s>" # list of token lemma IDs separated by whitespace
        aes_sentence_dictionary[e]["sentence_transliteration"] = " ".join([token[0] for token in sentence])
        # You may add here search term for interesting sentences. I search for sentence with numbers and fractions.
        if len(aes_print_sentences) < 20: # First 20
            aes_print_sentences.append(aes_sentence_dictionary[e])
        if len(aes_print_sentences) < 100 and sum(["5" in token["mdc"] for token in token_list]) and sum(["1/8" in token["mdc"] for token in token_list]):
            aes_print_sentences.append(aes_sentence_dictionary[e])

      aes_file_contents.append(sentences)
  notebook.desc = "Loading AES"
  return (aes_file_contents, aes_sentence_dictionary)
aes_file_contents, aes_sentence_dictionary = read_aes_files()
print(f"AES Loaded: {len(aes_file_contents)} files with {len(aes_sentence_dictionary.items())} sentences")

# Show interesting sentences. The reason to use subset is that all
# sentences could be too much.
aes_print_sentences_compact = [[sentence['text'], sentence['token_list'], sentence['sentence_transliteration']] for sentence in aes_print_sentences[:100]]
pd.set_option('display.max_rows', 5)
aes_print_dataframe = pd.DataFrame(aes_print_sentences_compact,
                         columns=['Text', 'Token list', 'Transliteration'])
aes_print_dataframe


Loading...:   0%|          | 0/16 [00:00<?, ?it/s]

AES Loaded: 16 files with 101796 sentences


Unnamed: 0,Text,Token list,Transliteration
0,25IKMZZWRBARVG5WSM7JVGVBDE,<s> -1 856314 -1 </s>,"[...] [nfr-xpr,pl]-raw-wa-[n-raw] [...]"
1,25IKMZZWRBARVG5WSM7JVGVBDE,<s> -1 73330 850814 64830 550077 851446 51990 ...,"[...] [mH] 4 [m-mj,tt] ⸢SAa⸣-m pA [wD] [rs,j-j..."
...,...,...,...
22,S36FKQSICNFDLADBECABA64XXE,<s> 26870 10100 69410 110440 850815 850815 850...,"jni.n =sn 〈m〉fkA,t HqA,t 14 1/4 1/8 1/32 (w)DA..."
23,S36FKQSICNFDLADBECABA64XXE,<s> 26870 10100 112330 178610 850814 850814 -1...,jni.n =sn HD dbn 1676 1/2 [...] ⸢Hzmn⸣ ⸢dbn⸣ ⸢...


In [6]:
# @title Create test set of AES sentences { display-mode: "form" }

# Divide randomically 20% of sentences to test.
def is_test(token_list):
  h = hash(token_list)
  return True if (h % 100) < 20 else False # Select 20% inputs

def get_aes(ae_dictionary):
  aecorpus = collections.namedtuple('AncientEgyptianCorpus', 'aes aes_1 aes_t aes_g aesen aesen_1 aesen_t aed rtc rtc_1 rtc_t rtc_g')

  aes_g = []
  aes = []
  aes_1 = []
  aes_t = []
  c = 0
  if c > 0:
    print("Examples of sentences with gaps:")
  for id, sentence_all in ae_dictionary.items():
    tokens = sentence_all["tokens"]
    gaps = True
    try:
      # Consider unreadable as gap.
      gaps = any(token[3] == "-1" for token in tokens)
      if gaps == True and c > 0:
        c -= 1
        print(sentence_all["token_list"], "    ", sentence_all["sentence_transliteration"])
    except:
      #print("Gaps check fail: ", sentence)
      pass
    if gaps:
      aes_g.append(sentence_all)
    else:
      cl = is_test(sentence_all["token_list"]) # classification is per token sequence.
      aes.append(sentence_all)
      if cl == False:
        aes_1.append(sentence_all)
      else:
        aes_t.append(sentence_all)

  aecorpus.aes_g = aes_g
  aecorpus.aes = aes
  aecorpus.aes_t = aes_t
  aecorpus.aes_1 = aes_1
  return aecorpus

aecorpus = get_aes(aes_sentence_dictionary)
# For debugging you may enable following prints
print(f"AES partitioned:\n  num sentences={len(aecorpus.aes)} including:\n    training={len(aecorpus.aes_1)}\n    test={len(aecorpus.aes_t)}\n    gapped={len(aecorpus.aes_g)}")

AES partitioned:
  num sentences=79357 including:
    training=63606
    test=15751
    gapped=22439


In [7]:
# @title Read Ancient Egyptian Dictionary { display-mode: "form" }

def read_aed():
  aed_dict = {}

  with open(aed_dictionary, "r") as dictionary_file:
    dictionary_data = json.load(dictionary_file)
  dictionary_entries = dictionary_data["TEI"]["text"]["body"]["entry"]

  for entry in dictionary_entries:
    xml_id = entry["_xml:id"]
    lemma_id = xml_id[3:]
    entry_translations = [(e["quote"], e["_xml:lang"]) for e in entry["sense"]["cit"] if "quote" and "_xml:lang" and "_type" in e and e["_type"] == "translation"]
    aed_dict[lemma_id] = {
        "form": entry["form"]["orth"],
        "grammarGroup": entry["gramGrp"]["term"],
        "translations": entry_translations
    }

  print('Example entry in the dictionary:')
  print(aed_dict["10"])

  print('The size of the dictionary is:', len(aed_dict))

  unique_lemma_ids = set([token[3] for file in aes_file_contents for sentence in file for token in sentence])
  missing_vocabulary = [lemma_id for lemma_id in unique_lemma_ids if lemma_id not in aed_dict.keys()]
  missing_vocabulary_count = len(missing_vocabulary) - 1 # account for lemma_id -1 for lacunae
  print('The number of missing vocabulary is:', missing_vocabulary_count)

  vocabulary_items_with_en_translations = {k: v for k, v in aed_dict.items() if any(translation[1] == 'en' for translation in v["translations"])}
  print('The size of the English-only dictionary:', len(vocabulary_items_with_en_translations))
  print('Portion of vocabulary in the dictionary with English translations:', len(vocabulary_items_with_en_translations)/len(aed_dict))
  return aed_dict
aecorpus.aed = read_aed()

Example entry in the dictionary:
{'form': 'ꜣj.wj', 'grammarGroup': 'substantive/substantive_masc', 'translations': [('Doppelverband, Kreuzverband (?) (med.)', 'de'), ('pair of bandages (med.)', 'en')]}
The size of the dictionary is: 35052
The number of missing vocabulary is: 0
The size of the English-only dictionary: 16971
Portion of vocabulary in the dictionary with English translations: 0.484166381376241


Create English translations where applicable and attach them to AES corpus.

In [8]:
# @title Attach additional English translations to Ancient Egyptian Sentences { display-mode: "form" }
from textblob import TextBlob
from textblob.exceptions import NotTranslated, TranslatorError
import json
ge_en = dict() # A small dictionary for mappings texts to English
ge_en_changed = False
ge_en_new_entries = 0

if os.path.exists("ge_en_dictionary.json"):
  with open('ge_en_dictionary.json') as f:
    ge_en = json.load(f)

def new_translation_from_de(de):
  blob = TextBlob(de)
  try:
    res = blob.translate(from_lang='de', to='en')
  except NotTranslated:
    res = de
  except TranslatorError:
    res=" "

  global ge_en
  ge_en[de] = str(res)
  global ge_en_changed
  ge_en_changed = True
  global ge_en_new_entries
  ge_en_new_entries += 1
  # If added over 2000 entries it is good to store the database.
  if ge_en_new_entries >= 2000:
    with open('ge_en_dictionary.json', 'w') as f:
      json.dump(ge_en, f)
      print(f"Stored updated ge/en dictionary: {len(ge_en)} entries")
    ge_en_new_entries = 0
    ge_en_changed = False
  return str(res)

def add_english_translations(aes_dictionary, aed_dictionary):
  translations = 0
  translations_full = 0
  translations_full_lookup = 0
  translations_part = 0
  translations_part_lookup = 0
  en_translations = dict()
  de_translations = dict()
  for id,ent in aed_dictionary.items():
    t = list(filter(lambda translation: translation[1] == 'en', ent["translations"]))
    if len(t) > 0:
      en_translations[id] = str(t[0][0])
      translations += 1
    t_de = list(filter(lambda translation: translation[1] == 'de', ent["translations"]))
    if len(t) > 0:
      de_translations[id] = str(t_de[0][0])
  print(f"Using dictionary with {len(en_translations.items())} translations")
  translations = 0
  for id, sentence in tqdm_notebook(aes_sentence_dictionary.items(),desc="Translating"):
    if 'translation' in sentence and len(sentence['translation']) > 1:
      # Found german translation, using it.
      de_translations = sentence['translation']
      if de_translations in ge_en:
        sentence["en"] = ge_en[de_translations]
        if translations_full_lookup < 5:
          print('FULL', sentence["text"], sentence["sentence_transliteration"], sentence["en"])
        translations_full_lookup +=1
      else:
        res = new_translation_from_de(de_translations)
        sentence["en"] = res
        if translations_full < 5:
          print('FULL', sentence["text"], sentence["sentence_transliteration"], sentence["en"])
        translations_full += 1
      continue # Translated full sentence
    for token in sentence["tokens"]:
      lemma_id = token[3]
      if lemma_id not in en_translations:
        if lemma_id in de_translations:
          de_translation = de_translations[lemma_id]
          if de_translation in ge_en:
            en_translations[lemma_id] = ge_en[de_translation]
            if translations_part_lookup < 5:
              print('PART', sentence["text"], sentence["sentence_transliteration"], "PART:", en_translations[lemma_id])
            translations_part_lookup += 1
            continue
          else:
            res = new_translation_from_de(de_translation)
            en_translations[lemma_id] = res
            if translations_part < 5:
              translations_part += 1
              print('PART', sentence["text"], sentence["sentence_transliteration"], "PART:", en_translations[lemma_id])
        break
    else:
      translated_tokens = [en_translations[token[3]] for token in sentence["tokens"]]
      sentence["en"] = "  ".join(translated_tokens)
      if translations < 5:
        print('CONSTRUCTED', sentence["text"], sentence["sentence_transliteration"], sentence["en"])
      translations += 1
  print(f"AES: Used {translations_full + translations_full_lookup} English sentence translation from de")
  print(f"AES: Constructed {translations_part + translations_part_lookup} English sentence part translations from de")
  print(f"AES: Constructed {translations} English translations from dictionary")

add_english_translations(aes_sentence_dictionary, aecorpus.aed)

aecorpus.aesen = list([sentence for sentence in aecorpus.aes if "en" in sentence and "translation" in sentence])
aecorpus.aesen_1 = list([sentence for sentence in aecorpus.aes_1 if "en" in sentence and "translation" in sentence])
aecorpus.aesen_t = list([sentence for sentence in aecorpus.aes_t if "en" in sentence and "translation" in sentence])
aecorpus_readable = list([[sentence['text'], sentence['sentence_transliteration'], sentence['en'], sentence['translation']] for sentence in aecorpus.aesen])

pd.DataFrame(aecorpus_readable, columns=['sentence id', 'transliteration', 'translation', 'de translation'])

# Store updated ge/en dictionary if it has been updated
if ge_en_new_entries >= 5:
  #print(list(ge_en.items())[:10])
  with open('ge_en_dictionary.json', 'w') as f:
    json.dump(ge_en, f)
    print(f"Updated ge/en dictionary: {len(ge_en)} entries")

Using dictionary with 16971 translations


Translating:   0%|          | 0/101796 [00:00<?, ?it/s]

FULL 25IKMZZWRBARVG5WSM7JVGVBDE [...] [nfr-xpr,pl]-raw-wa-[n-raw] [...] ... [Nefer-cheper] -re-Wa [en-re] ...
FULL 25IKMZZWRBARVG5WSM7JVGVBDE [...] [mH] 4 [m-mj,tt] ⸢SAa⸣-m pA [wD] [rs,j-jmn,tj] [...] ... 4 Ellen, [equally] from the [southwestern stele] ...
FULL 25IKMZZWRBARVG5WSM7JVGVBDE [...] jri.w [n] ⸢jtr,w⸣ [6] [...] ... it makes [6] JTr, W lengths ...
FULL 25IKMZZWRBARVG5WSM7JVGVBDE [xr] [jr] ⸢Xnw⸣ pA 4 wD ⸢SAa-m⸣ pA [Dw] [jAb,tj] [...] [As for the] area within these four steles [concerns], starting with the [eastern mountain] ...
FULL 25IKMZZWRBARVG5WSM7JVGVBDE [jw] =[s] [n] [jt(j)] =[j] [raw-Hr,w-Ax,tj-Hai-m-Ax,t] [m-rn≡f-m-Sw-n,tj-m-jtn] [...] [m] ⸢Dw.pl⸣ [m] ⸢xAs,t.pl⸣ m sx,t.pl m mAw,t.pl [...] m StA.pl [m] [(j)x,t] [nb.t] jri.w pA jtn ⸢pAy⸣ =j jt(j) ⸢sxpr⸣ =sn r nHH [D,t] [And it belongs to my father] "[Re-Harachten, who cheers sunlight in his name, which is in the sun]" (= aton), ... [consisting of] mountains and deserts, from Weideland, new territory and ... from tree pla

# RTC


In [9]:
# @title Partial Lemmatization for RTC texts { display-mode: "form" }

# @markdown Used to read RTC sentences and form lemmatized tokens, similar to AES.
# @markdown The lemmatization is unable to handle all tokens.
verbose_lemmatize=False # @param {type:"boolean"}

repl_dict = { 'A': 'ꜣ', 'H': 'ḥ', 'S': 'š', ',': '.', 'D': 'ḏ', 'x': 'ḫ', 'a': 'ꜥ', 's': 'z', 'i': 'i̯' }
rem_list = [ '⸢', '⸣', '[', ']', '(', ')', '{', '}', '<', '>', '⸮', '?' ]
lemmatize_dict = dict()
lemmatize_dict_id = dict()
lemma_id_dict = dict()
def lemmatize(txt):
  global repl_dict
  global rem_list
  global lemmatize_dict
  if txt in lemmatize_dict:
    return lemmatize_dict[txt]
  remtxt = ''
  for char in txt: # Remove '⸢', '⸣', '[', ']' etc.
    if char not in rem_list:
      remtxt += char
  if remtxt in lemmatize_dict:
    return lemmatize_dict[remtxt]
  if remtxt.isnumeric():
    return '1...n'
  out = []
  for char in remtxt:
    if char in repl_dict:
      out.append(repl_dict[char])
    else:
      repl_dict[char] = char
      out.append(char)
  out_str = "".join(out)
  # Some replacements I've seen. Not complete list.
  out_str = out_str.replace('jtn', 'Jtn')
  out_str = out_str.replace('ꜣḫ', 'Ꜣḫ')
  out_str = out_str.replace('.tt', '.tjt')
  lemmatize_dict[txt] = out_str
  return out_str

def get_lemma_id(lemma):
  global lemma_id_dict
  if lemma in lemma_id_dict:
     return lemma_id_dict[lemma]
  remtxt = ''
  for char in lemma: # Remove '⸢', '⸣', '[', ']' etc.
    if char not in rem_list:
      remtxt += char
  if remtxt in lemma_id_dict:
     return lemma_id_dict[remtxt]
  return None

mismatch_c = 0

def add_to_lemmatize_dict(txt, lemma, lemmaid, id, transl, transl2):
  global mismatch_c
  global lemmaize_dict
  if txt in lemmatize_dict:
    if lemma != lemmatize_dict[txt] and mismatch_c < 10 and verbose_lemmatize:
      print('Lemmatization mismatch: ', id, "v", lemmatize_dict_id[txt], txt, lemma, lemmatize_dict[txt])
      print(transl)
      print(transl2)
      mismatch_c += 1
    return
  lemmatize_dict[txt] = lemma
  lemma_id_dict[txt] = lemmaid
  lemmatize_dict_id[txt] = id
  remtxt = ''
  for char in txt:
    if char not in rem_list:
      remtxt += char
  if remtxt in lemmatize_dict:
    return
  lemmatize_dict[remtxt] = lemma
  lemma_id_dict[remtxt] = lemmaid
  lemmatize_dict_id[remtxt] = id

print("Lemmatization db")
add_to_lemmatize_dict('LACUNA', 'LACUNA', -1, 'LACUNA', 'LACUNA', 'LACUNA')
add_to_lemmatize_dict('1...n', '1...n', "850814", '1...n', '1...n', '1...n')

# Processing numbers
def add_numbers_to_lemmatize():
  digits = list([digit for digit in range(1,10)])
  magnitudes = list([pow(10,magnitude) for magnitude in range(7)])
  for num in [digit * magnitude for magnitude in magnitudes for digit in digits]:
    add_to_lemmatize_dict(str(num), '1...n', "850814", '1...n', '1...n', '1...n')
  for sfraction in range(1,9):
    add_to_lemmatize_dict("1/" + str(pow(2, sfraction)), '1...n', "850814", '1...n', '1...n', '1...n')
  for fraction in range(3,9):
    add_to_lemmatize_dict(f"1/{fraction}", '1...n', "850814", '1...n', '1...n', '1...n')
  add_to_lemmatize_dict("2/3", '1...n', "850814", '1...n', '1...n', '1...n')
  add_to_lemmatize_dict("3/4", '1...n', "850814", '1...n', '1...n', '1...n')
  add_to_lemmatize_dict("3/8", '1...n', "850814", '1...n', '1...n', '1...n')
  for fraction in range(1,10):
    add_to_lemmatize_dict("1/" + str(10*fraction), '1...n', "850814", '1...n', '1...n', '1...n')

add_numbers_to_lemmatize()

tr = set()
for sentence in aecorpus.aes:
  for token in sentence['tokens']:
    triplet = (token[3], token[0], token[2])
    tr.add(triplet)
    add_to_lemmatize_dict(triplet[1], triplet[2], triplet[0], sentence['text'], sentence['sentence_transliteration'], [token[2] for token in sentence['tokens']])

print("Built lemmatization database")

Lemmatization db
Built lemmatization database


In [10]:
# @title Load Ramses Transliteration Corpus (RTC)<br>[*Ramses Automated Transliterator*](https://gitlab.cnam.fr/gitlab/rosmorse/ramses-trl) by [**Université de Liège/Projet Ramsès**](http://ramses.ulg.ac.be/) & [**Serge Rosmorduc**](http://cedric.cnam.fr/lab/en/author/Rosmorduc/) is licensed under `CC BY-NC-SA` 4.0. { display-mode: "form" }

rtc_cannot_lemmatize = {}
def process_src_line(src):
  chars = src.rstrip('\n')
  return (chars, chars.split(" "))
def process_tgt_line(dst):
  dst = dst.rstrip('\n')
  chars = dst.split()
  chars = "".join(chars)
  chars = chars.replace('_', ' ').rstrip(' ')
  return (chars, chars.split())
count_err = 0
def lemma_id_translate(token):
  s = ""
  if token is None:
    return s
  # There is difference between j/i
  if token in aecorpus.aed:
    t = aecorpus.aed[token]
    if 'translations' in t:
      tr = t['translations']
      if len(tr) > 0 and len(tr[0]) > 0:
        tr1 = tr[:1][:1]
        tr0 = list(filter(lambda translation: translation[1] == 'en', tr))
        if len(tr0) > 0:
          tr1 = tr0
        s += "/" + str(tr1[0][0])
  return s
g2u_map = GardinerToUnicodeMap()
g2u_alt_map={ # Replacements for characters missing somewhere.
    'MISSING': "\uFFFD",
    '': "\u25A1",
    'Ff1': g2u_map.to_unicode_char('Z14')
}
g2u_alt_entries = len(g2u_alt_map)
def g2u(gardiner_list):
  u = []
  for g in gardiner_list.split(" "):
    if g in g2u_alt_map:
      out_char = g2u_alt_map[g]
    else:
      out_char = g2u_map.to_unicode_char(g)
    if out_char is None:
      out_char = "\uFFFD"
      g2u_alt_map[g] = out_char
    else:
      u.append(out_char)
  return " ".join(u)

def read_rtc():
  rtc = []
  rtc_g = []
  rtc_t = []
  rtc_1 = []

  c = 0
  c_found = 0
  model_sentences = []
  full_count = 0
  gaps_count = 0
  total_count = 0
  for i in range(len(rtc_src_files)):
    sentences = []
    print("Processing " + rtc_src_files[i])
    with open(rtc_path + rtc_src_files[i] + rtc_ext, "r") as src_f:
      src_lines = src_f.readlines()
    with open(rtc_path + rtc_tgt_files[i] + rtc_ext, "r") as tgt_f:
      tgt_lines = tgt_f.readlines()
    assert len(src_lines) == len(tgt_lines)
    for src_, tgt_ in zip(src_lines, tgt_lines):
      src, src_list = process_src_line(src_)
      tgt, tgt_list = process_tgt_line(tgt_)
      sentence = {
          'text': tgt,
          'translation': '', # No full sentence translations
          'gardiner': src,
          'gardiner_list': src_list,
          'corpus': rtc_src_files[i],
          'tokens': [],
          'token_list': [], # TODO
          'sentence_transliteration': tgt,
          'gaps': False
      }
      num_found = 0
      tokens_found = True
      print_sentence=False
      # Mapping between gardiner and transliteration would be good.
      # For now the tokens are based on transliteration.
      for idx,tr in enumerate(tgt_list):
        if tr == "//":
          continue # Skip spaces
        # Construct tokens similar to ones with AES.
        tr_ = tr.replace("i", "j").replace("I", "J") # Common difference
        if tr == "---" or tr == "": # Missing symbols have different notation.
          tr_ = "LACUNA"
        trl = lemmatize(tr_)
        trid = get_lemma_id(tr)
        if trid is None:
          trid = get_lemma_id(tr_)
        if trid is None:
          trid = get_lemma_id(trl)

        if len(tr) > 0 and trid is None:
            pattern = r'[^0-9\/[]'
            if re.search(pattern, tr):
              pass
            else:
              # All numbers should be handled
              print(f"Number {tr} not handled")
              global count_err
              count_err += 1
              print_sentence = True
              if count_err > 5:
                raise "Too many errors"
        if trid is None:
          rtc_cannot_lemmatize[tr] = trl
          #print('Cannot lemmatize', tr, trl, trid)
          #print('Closest matches', lemma_id_closest(tr))
          c += 1
        else:
          num_found += 1
        token = (
            tr,
            lemma_id_translate(trid),
            trl, # Rough lemmatization.
            "" if trid is None else trid, # Use lemma id if found
            "" # TODO: Find pos.
        )
        if trid is None:
          tokens_found = False
        sentence["tokens"].append(token)
      sentence["token_list"] = list([token[3] for token in sentence["tokens"]])

      # Check if there are gaps in sentence
      # TODO: Consider partially readable symbols etc.
      if 'LACUNA' in src_list or 'LACUNA' in tgt_list:
        sentence['gaps'] = True
        tokens_found = False
        gaps_count += 1
      sentence['tokens_found'] = tokens_found
      full_count += 1 if tokens_found else 0
      total_count += 1
      if sentence['gaps'] == False and tokens_found: # previously: (tokens_found or num_found > 3):
        if c_found < 5:
          model_sentences.append(sentence)
        if len(model_sentences) < 7 and '5' in sentence['sentence_transliteration'] and '1/8' in sentence['sentence_transliteration']:
          model_sentences.append(sentence)
      c_found += 1
      if sentence['gaps']:
        rtc_g.append(sentence)
      elif i == 1: # Validation set
        rtc.append(sentence)
        rtc_t.append(sentence)
      else:
        rtc.append(sentence)
        rtc_1.append(sentence)
      if print_sentence:
        print("Debug sentence:")
        print("  Text:", sentence["sentence_transliteration"])
        print("  Token_list", sentence["token_list"])
        print("  Gardiner:", sentence["gardiner"])
        print("  Unicode:", g2u(sentence["gardiner"]))
        print_sentence = False

  print("Model sentences")
  for idx, sentence in enumerate(model_sentences):
    print(idx+1)
    print("  Text:", sentence["sentence_transliteration"])
    print("  Token_list", sentence["token_list"])
    #print("  Tokens", sentence["tokens"])
    print("  Gardiner:", sentence["gardiner"])
    print("  Unicode:", g2u(sentence["gardiner"]))

  if g2u_alt_entries < len(g2u_alt_map):
    print(f"Unmapped Gardiner symbols: {len(g2u_alt_map) - g2u_alt_entries}")


  print()

  print(f"All Sentences: {total_count}; fully lemmatized and no gaps: {full_count}")
  #print(f"gapped: {gaps_count}") (already on the next line)
  return (rtc, rtc_1, rtc_t, rtc_g)

print("Reading and processing RTC.")
rtc = read_rtc()
aecorpus.rtc = rtc[0]
aecorpus.rtc_1 = rtc[1]
aecorpus.rtc_t = rtc[2]
aecorpus.rtc_g = rtc[3]
print(f"RTC partitioned: usable sentences={len(aecorpus.rtc)} incl: training={len(aecorpus.rtc_1)} test={len(aecorpus.rtc_t)} gapped: {len(aecorpus.rtc_g)}")

Reading and processing RTC.
Processing src-train
Processing src-val
Model sentences
1
  Text: iw iw =i r swr m =f
  Token_list ['21881', '21881', '10030', '91900', '130360', '64360', '10050']
  Gardiner: M17 Z7 M17 Z7 A1 D21 S29 G36 D21 N35A A2 M17 G17 I9 
  Unicode: 𓇋 𓏲 𓇋 𓏲 𓀀 𓂋 𓋴 𓅨 𓂋 𓈗 𓀁 𓇋 𓅓 𓆑 □
2
  Text: pA hrw n ms pA inw
  Token_list ['851446', '99060', '78870', '74700', '851446', '850830']
  Gardiner: G41 G1 O4 D21 Z7 N5 Z1 N35 MISSING G41 G1 W25 N35 W24 Z7 Y1 Z2 
  Unicode: 𓅯 𓄿 𓉔 𓂋 𓏲 𓇳 𓏤 𓈖 � 𓅯 𓄿 𓏎 𓈖 𓏌 𓏲 𓏛 𓏥 □
3
  Text: pA sS XAr 1/8 1/32 1/40 r 5 1/3
  Token_list ['851446', '144360', '122580', '850814', '850814', '850814', '91900', '850814', '850814']
  Gardiner: G41 G1 Y3 A1 V19 D11 F16 D21 V20 V20 V20 V20 D21 Z2 2 D21 Z2 
  Unicode: 𓅯 𓄿 𓏞 𓀀 𓎅 𓂁 𓄏 𓂋 𓎆 𓎆 𓎆 𓎆 𓂋 𓏥 𓂋 𓏥 □
4
  Text: s 50 6 wa nb XAr ir.w n XAr 10 2 1/8
  Token_list ['147350', '850814', '850814', '400101', '81650', '122580', '851809', '78870', '122580', '850814', '850814', '850814']
  Gardiner: O34 A1 V20 V20 V20 V20 V




# Use MaReTe

There are combined transliterations, using both AES and Ramses sentences.
These use slightly different transliteration than AES MdC so use different models.

In [11]:
# @title Marete Transliteration: Read training and test vectors

# Remove exact same sentences from testing data than
# are present in training data. Default is false as
# some overlap is to be expected.
filter_duplicates=False # @ param {type:"boolean"}
w2v_tr_combined_variants=1 # @ param{type:"int"}

def sentence_transform_in(sentence):
  return {"sentence_transliteration": sentence}

def token_seq_out(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers has not been implemented
    token_seq = sentence["sentence_transliteration"].split(" ")
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

# Read test materials
with open("combined_dev.txt", "r") as train_file:
  marete_train_file_lines = train_file.readlines()
  marete_train_file_lines = [x.replace("\n", "") for x in marete_train_file_lines]

with open("combined_test.txt", "r") as test_file:
  marete_test_file_lines = test_file.readlines()
  marete_test_file_lines = [x.replace("\n", "") for x in marete_test_file_lines]

marete_train_sentences_in = [sentence_transform_in(sentence) for sentence in marete_train_file_lines]
if filter_duplicates:
  marete_test_sentences_in = [sentence_transform_in(sentence) for sentence in marete_test_file_lines
                                if sentence not in marete_train_file_lines]
else:
  marete_test_sentences_in = [sentence_transform_in(sentence) for sentence in marete_test_file_lines]

# Quality check for marete training and test files.
for x in marete_train_file_lines:
  if '\n' in x:
    print("Found linefeed in marete_train_file_lines")
  if '\xA0' in x:
    print("Found nonbreaking space in marete_train_file_lines")
for x in marete_test_file_lines:
  if '\n' in x:
    print("Found linefeed in marete_test_file_lines")
  if '\xA0' in x:
    print("Found nonbreaking space in marete_test_file_lines")

filtered_tests = len(marete_test_file_lines) - len(marete_test_sentences_in)
print("marete combined sentences: AES and RTC")
print(f"Using {len(marete_train_sentences_in)} training sentences and {len(marete_test_sentences_in)} test sentences. {filtered_tests} filtered out.")





marete combined sentences: AES and RTC
Using 101466 training sentences and 25377 test sentences. 0 filtered out.


In [12]:
# @title Set of Helper functions for processing tokens. { display-mode: "form" }
# @markdown Helper functions are useful to make actual model generation functions easy to read.
# @markdown `print_sentence`, `token_to_text`, `start_pad`, `end_pad`, `is_pad`, `has_pad`, `token_seq`, `token_seq_tr`, `token_seq_rtc`
def print_sentence(sentence):
  print(f'{sentence["text"]}')
  print(f'. tokens: {sentence["token_list"]}')
  print(f'. transliteration: {sentence["sentence_transliteration"]}')
  print(f'. translation: {sentence["translation"]}')
  # Optional English translation (just one sentence here)
  # The sentence dictionary for German to English translations can be used for more.
  if sentence["sentence_transliteration"] == "bn ftt =tw =f":
    print('. It (the inscription on the stele) should not be erased.') # Google Translate (from de)
  elif 'en' in sentence: # Does dictionary lookup translation exist?
    print(f'. en: {sentence["en"]}')

def token_to_text(token):
  s = token
  if token in aecorpus.aed:
    t = aecorpus.aed[token]
    if 'form' in t:
      s += "/" + t['form']
    if 'translations' in t:
      tr = t['translations']
      if len(tr) > 0 and len(tr[0]) > 0:
        tr1 = tr[:1][:1]
        tr0 = list(filter(lambda translation: translation[1] == 'en', tr))
        if len(tr0) > 0:
          tr1 = tr0
        s += "/" + str(tr1[0][0])
  return s

def start_pad(padlen=2):
  if padlen == 2:
    return ['<s0>', '<s1>']
  return [f'<s{i}>' for i in range(padlen)]

def end_pad(padlen=2):
  if padlen == 2:
    return ['</s1>', '</s0>']
  return [f'</s{padlen-i-1}>' for i in range(padlen)]

def is_pad(x):
  return x in start_pad(9) + end_pad(9)

def has_pad(a):
  for x in a:
    if is_pad(x):
      return True
  return False

def token_seq(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers is no-op here. replacement has already been performed
    token_seq = [token[3] for token in sentence['tokens']] # With lemma id
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

def token_seq_tr(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers is no-op here. replacement has already been performed
    token_seq = sentence["sentence_transliteration"].split(" ") # Before lemmatization
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

re_fraction=re.compile('^[0-9]+\/[0-9]+$')
def token_seq_rtc(sentence, replace_numbers=False, padlen=2, nopad=False):
    try:
      token_seq = [token[0] for token in sentence['tokens']] # With text - for RTC currently best pick
    except:
      print("Error with: ", sentence)
      raise "Errorred"
    if replace_numbers:
      # Handle numbers
      global re_fraction
      for idx in range(len(token_seq)):
        if token_seq[idx].isnumeric():
          token_seq[idx] = "1...n"
        if re_fraction.match(token_seq[idx]):
          token_seq[idx] = "1...n"
    #token_seq = [token[3] for token in sentence['tokens']] # With lemma id
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)


In [13]:
# @title Prediction statistics -- `PredictionStats`. { display-mode: "form" }
# @markdown
class PredictionStats:
    """Collect statistics regarding correctness of tests"""

    def __init__(self, name):
      self.name = name
      self.c = Counter()
      self.fails = 0
      self.start = time.time()
      self.end = float("nan")
      self.include_fails_in_n = True

    def start_testing(self):
      self.start = time.time()
      return self
    def end_testing(self):
      self.end = time.time()
      return self
    def miss(self):
      self.c[-1] += 1
    def fail(self):
      self.fails += 1   # Count failures outside N
    def predicted(self, item, predictions):
      if item in predictions:
        idx = predictions.index(item) + 1
        self.c[idx] += 1
      else:
        self.miss()
        idx = -1
      return idx
    def get_name(self):
      return self.name
    def get_n(self):
      n = self.c.total()
      # Failures are occasionally counted in n.
      if self.include_fails_in_n:
        n = n + self.fails
      return n
    def get_heading(self, include_mrr5=False):
      if include_mrr5:
        return ['hit', 'hit@5', 'hit@10', 'missed', 'MRR@5', 'MRR@10', 'N', 'untestable', 'ms/test']
      return ['hit', 'hit@5', 'hit@10', 'missed', 'MRR', 'N', 'untestable', 'ms/test']
    def get_stats(self, include_mrr5=False):
      accu = 0
      accu_ = 0
      accu5 = 0
      accu10 = 0
      for r in range(1, 6):
        accu_ += self.c[r]/r
      for r in range(1, 11):
        accu += self.c[r]/r
        accu5 += self.c[r] if r <= 5 else 0
        accu10 += self.c[r]
      n = self.get_n()
      if n < 1:
        n = 1
      if include_mrr5:
        return [
          self.c[1] / n,
          accu5 / n,
          accu10 / n,
          self.c[-1] / n,
          accu_ / n,
          accu / n,
          n,
          self.fails,
          ((self.end - self.start) * 1000) / n
        ]
      return [
          self.c[1] / n,
          accu5 / n,
          accu10 / n,
          self.c[-1] / n,
          accu / n,
          n,
          self.fails,
          ((self.end - self.start) * 1000) / n
      ]

# These variables collect statistics
prs = [None] * 2
stats = dict()

In [14]:
# @title Prediction Function `predict_words`. Use model(s) to predict word. { display-mode: "form" }
# @markdown Prediction function has restrictions as follows:
# @markdown
# @markdown - It can only predict one part of text.
# @markdown - It can only work on single word missing parts.

def predict_words(tokens,model,model_score=None,lacuna="LACUNA",verbose=False,omitted=None,vocab=None,algo=0,topn=10,padlen=2,stat_append=None,stat_id=None,nopad=False):
  """Predict missing word(s) from sentence.

    Parameters
    ------------
        tokens: list of str
            Sentence (token sequence) to predict (including start_pad() and end_pad())
        model: Word2Vec model
            Model used for predicting with predict_output_word
        model_score: Word2Vec model
            Model used for scoring with score
        lacuna: str
            Placeholder for missing word
        verbose: bool
            Perform debug prints (default=False)
        omitted: str
            Provide omitted token (only needed for verbose mode debug prints)
        vocab: list of str or None
            Possible words - if none, all words are possible. (only for algo=2)
        algo: int [-1, 2]
            Select algorithm to perform prediction
        topn: int
            Select how many result to return (default=10)

    Return
    -----------
        predictions: list of tuples (str, float)
            List of predictions in the best first order with weights.

        May raise exceptions on exceptional situations.

    Bugs
    ------------
        current implementation is able to predict only one missing word.
  """
  if vocab is None:
    if hasattr(model, 'wv'):
      vocab = model.wv.index_to_key

  try:
    idx = tokens.index(lacuna)
  except ValueError:
    return [(1.0), " ".join(tokens)]*topn

  predict = predict_words_internal(idx, tokens,model,model_score,vocab,algo,topn,padlen,nopad)
  if verbose:
    print(f"OMITTED = {omitted}")
    for i, p in enumerate(predict):
      if p[0] == omitted:
        print(f"PREDICTED {i} {token_to_text(p[0])} (prob: {p[1]}) match [rank={i+1}]")
      else:
        if hasattr(model, 'wv'):
          similarity = model.wv.n_similarity(omitted, p[0])
        else:
          similarity = 1.0 - i / topn # No similarity function
        print(f"PREDICTED {i} {token_to_text(p[0])} (prob: {p[1]}) similarity with orig: {similarity}")
  if stat_append is not None:
    tokens_ = filter(lambda t: not has_pad(t), tokens)
    simi=[]
    hit_idx = None
    for i, p in enumerate(predict):
      if hasattr(model, 'wv'):
        similarity = model.wv.n_similarity(omitted, p[0])
      else:
        similarity = 1.0 - i / topn # No similarity function
      simi.append([p[0],p[1],similarity])
      if similarity == 1:
        hit_idx = i + 1
    stat_append.append((stat_id, tokens_, omitted, hit_idx, tuple([p[0] for p in predict]), simi))
  return predict

# Underlying worker function
def predict_words_internal(idx, tokens,model,model_score=None,vocab=None,algo=0,topn=10,padlen=2,nopad=False):
  token_seq_before = tokens[:idx]
  token_seq_after = tokens[idx + 1:]

  if algo == -1:
    # Simply predict top vocabulary words.
    i = 0
    while is_pad(vocab[i]):
      i = i + 1
    return [(vocab[i+l], 1/(l+1)) for l in range(0, 10)]
  if algo == 0:
    if len(token_seq_before) > 0 and len(token_seq_after) > 0:
      if len(token_seq_before) > len(token_seq_after):
        token_seq_before = token_seq_before[-len(token_seq_after):]
      elif len(token_seq_before) < len(token_seq_after):
        token_seq_after = token_seq_after[:len(token_seq_before)]
      maxpad=4
      predict = model.predict_output_word(token_seq_before + token_seq_after,topn=topn+maxpad)
      # Filter predictions, remove padding symbols
      predict_out = []
      for x in predict:
        if not is_pad(x[0]):
          predict_out.append(x)
        if len(predict_out) == topn:
          break
      return predict_out
    else:
      return ValueError("Input not supported: no before or after tokens")
  if algo == 1:
    if len(token_seq_before) > 0 and len(token_seq_after) > 0:
      if len(token_seq_before) > len(token_seq_after):
        token_seq_before = token_seq_before[-len(token_seq_after):]
      elif len(token_seq_before) < len(token_seq_after):
        token_seq_after = token_seq_after[:len(token_seq_before)]
      predict = model.predict_output_word(token_seq_before + token_seq_after,topn=topn*10)
      sentences = []
      for pr in predict:
        tokens = token_seq_before + [pr[0]] + token_seq_after
        if nopad:
          tokens = tokens[len(start_pad(padlen)):len(tokens) - len(end_pad(padlen))]
        sentences.append(" ".join(tokens))
      scores = model_score.score(sentences)
      adjusted_predictions=[(predict[i][0], scores[i]) for i in range(len(scores))]
      adjusted_predictions.sort(key = lambda x: -x[1])
      predict = adjusted_predictions[:topn]
      return predict
    else:
      return ValueError("Input not supported: no before or after tokens")
  if algo == 2:
    sentences = []
    for token in vocab:
      tokens = token_seq_before + [token] + token_seq_after
      if nopad:
        tokens = tokens[len(start_pad(padlen)):len(tokens) - len(end_pad(padlen))]
      sentences.append(" ".join(tokens))
    scores = model_score.score(sentences)
    adjusted_predictions=[(vocab[i], scores[i]) for i in range(len(scores))]
    # TODO: Quicker sorting here
    adjusted_predictions.sort(key = lambda x: -x[1])
    predict = adjusted_predictions[:topn]
    return predict
  raise ValueError("algo is not supported")



In [15]:
# @title Define Model Testing Functions. { display-mode: "form" }
# @markdown `test_w2v` for testing AES with lemma id. Use `seq` parameter to override `token_seq`to test AES/TR or RTC.
# @markdown Alternatively you may use convenience wrappers `test_w2v_tr` and `test_w2v_rtc`.
def test_w2v(name, w2v, sentences, silent=False, padlen=2, algo=0, seq=token_seq, omitted_indexes=None):
  pr = PredictionStats(name)
  spadlen = padlen
  fpadlen = padlen * 2
  random.seed(42)
  if silent == False:
    print("Processing test sentences with single hidden word")
  c=0
  idx=0
  for sentence in sentences:
    tokens = seq(sentence,padlen=padlen)
    if len(tokens) == fpadlen:
      pr.fail() # Failed processing
      continue
    if omitted_indexes is None:
      token_omitted = random.randrange(len(tokens) - fpadlen) + spadlen
    else:
      token_omitted = omitted_indexes[idx] + spadlen
      idx = idx + 1
    token_orig = tokens[token_omitted]
    tokens[token_omitted] = "LACUNA"
    verbose = False
    if c < 5 and silent == False:
      verbose = True
    if c == 5 and silent == False:
        print("Processing remaining sentences quietly")
    c += 1

    try:
      predict = predict_words(tokens, w2v, None, verbose=verbose, omitted=token_orig, algo=algo, padlen=padlen)
      if len(predict) == 0:
        raise Exception("No predictions")
      try:
        pr.predicted(token_orig, [x[0] for x in predict])
      except:
        pr.fail() # Failed processing predictions
    except:
      pr.fail() # Failed processing predicting

  return pr.end_testing()

def test_w2v_tr(name, w2v, sentences, silent=False, padlen=2, algo=0):
  return test_w2v(name, w2v, sentences, silent, padlen, algo, seq=token_seq_tr)

def test_w2v_rtc(name, w2v, w2v_hs=None, algo=0, sentences=aecorpus.rtc_t, padlen=2, silent=False, nopad=False):
  return test_w2v(name, w2v, sentences, silent, padlen, algo, seq=token_seq_rtc)



# Ngrams

In [16]:
# @title Function for training Ngram-based models
# @markdown The Ngrams are organized in dict to predict middle word.
# @markdown
# @markdown These take a few minutes to test.

# Ngram model
# This class has been carefully defined to be compatible with Word2Vec's
# models from perspective of (the few used) API functions.
# The class needs to provide __init__ and predict_output_word.
# For predict_output_word, ngrams are split and placed in dictionary
# left-most ngrams, right-most ngrams. In case n is odd, right side is
# larger (the context word to be found is on the right side).
#
# This can be used with test_w2v and underlying predict_words with algo=0

# Optionally build models: we test POS so keep it as no.
build_ngrams = False

from nltk.util import ngrams
class NgramModel:
  def __init__(self, tokens, seq=token_seq, n=3):
    lr_to_m = dict()
    c = Counter()
    for s in tokens:
      for ng in ngrams(seq(s), n):
        c[ng] += 1
    # middle value dictionary; (left_side + right_side) => (middle, counter)
    self.right = (n + 1) // 2 # Right side context length
    self.left = n - self.right # Left side context length
    nopad = set()
    pad = set()
    for k in c.most_common():
      key = list(k[0])
      m = key.pop(self.left)
      key = tuple(key)
      # Filter padding characters from results
      if not m in nopad:
        if m in pad:
          continue
        if is_pad(m):
          pad.add(m)
          continue
        else:
          nopad.add(m)
      if key not in lr_to_m:
        lr_to_m[key] = []
      lr_to_m[key].append((m, k[1]))
    self.n = n
    self.seq = seq
    self.total = c.total()
    self.lr_to_m = lr_to_m
    top_ngram = c.most_common(1)
    self.top_ngram_count = top_ngram[0][1] if len(top_ngram) >= 1 else 1

  def predict_output_word(self, seq, topn, verbose=False):
    # Function may be called with varying amounts of context.
    # We need to use exactly n - 1 words of context.
    # Find a context with size n - 1
    if len(seq) < self.n - 1:
      return []
    while len(seq) >= self.n + 1:
      seq = seq[1:len(seq)-1] # Keep middle of context
    while len(seq) >= self.n:
      seq = seq[0:len(seq)-1] # Dismiss last
    left = tuple(seq[:self.left])
    right = tuple(seq[self.left:])
    key = tuple(left + right)

    # Search for results: match left side with dictionary lookup, ...
    r = []
    for n in self.lr_to_m.get(key, []):
      # Add results
      r.append((n[0], n[1]/self.top_ngram_count))
      # If there are extraneous results, throw
      if len(r) >= topn:
        break
    if verbose:
      print(r)
    return r

def store_stats(pr):
  stats[pr.get_name()] = pr
  values = pr.get_stats()
  return values[4]

build_ngrams_aes = build_ngrams
build_ngrams_aes_tr = build_ngrams
build_ngrams_rtc = build_ngrams
build_ngram_tr_combined = build_ngrams

for x in tqdm_notebook(range(4), desc="Train", leave=False):
  if x == 0 and build_ngrams_aes:
    ngram_aes = [None]*10
    for n in tqdm_notebook(range(1, 10), desc="AES / Lemmatized", leave=False):
      ngram_aes[n] = NgramModel(aecorpus.aes_1, n=n)
      pr = test_w2v(f'{n}-ngrams AES', ngram_aes[n], sentences=aecorpus.aes_t, silent=True)
      store_stats(pr)

  if x==1 and build_ngrams_aes_tr:
    ngram_aes_tr = [None]*10
    for n in tqdm_notebook(range(1, 10), desc="AES / TR", leave=False):
      ngram_aes_tr[n] = NgramModel(aecorpus.aes_1, seq=token_seq_tr, n=n)
      pr = test_w2v(f'{n}-ngrams AES TR', ngram_aes_tr[n], sentences=aecorpus.aes_t, seq=token_seq_tr, silent=True)
      store_stats(pr)

  if x==2 and build_ngrams_rtc:
    ngram_rtc = [None]*10
    for n in tqdm_notebook(range(1, 10), desc="RTC", leave=False):
      ngram_rtc[n] = NgramModel(aecorpus.rtc_1, seq=token_seq_rtc, n=n)
      pr = test_w2v(f'{n}-ngrams RTC', ngram_rtc[n], sentences=aecorpus.rtc_t, seq=token_seq_rtc, silent=True)
      store_stats(pr)

  if x==3 and build_ngram_tr_combined:
    # Train and test ngram models (for combined)
    ngram_tr_combined = [None]*10
    for n in tqdm_notebook(range(1, 10), desc="Combined", leave=False):
      ngram_tr_combined[n] = NgramModel(marete_train_sentences_in, seq=token_seq_out, n=n)
      pr = test_w2v(f'{n}-ngrams TR combined', ngram_tr_combined[n], sentences=marete_test_sentences_in, seq=token_seq_out, silent=True)
      store_stats(pr)



Train:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# @title Function for n-grams results aggregation. { display-mode: "form" }

# @markdown bigrams are not very accurate, 7-grams are rarely found. Combine
# @markdown x-ngrams these into new model, that takes results from the highest ngrams
# @markdown available.
# @markdown

class NgramAggregate:
  def __init__(self, ngrams):
    self.ngrams = tuple(reversed(ngrams))
  def predict_output_word(self, seq, topn, verbose=False):
    results = []
    filter_results = set()
    for ng in self.ngrams:
      if verbose == False:
        new_results = ng.predict_output_word(seq, topn)
      else:
        new_results = ng.predict_output_word(seq, topn, verbose)
      # Only append unique results.
      for a in new_results:
        if a[0] not in filter_results:
          results.append(a) # Ignore scoring
          filter_results.add(a[0])
          if len(results) >= topn:
            break
      if len(results) >= topn:
        break
    # Replace scoring (to remove inconsistencies between different n-grams in the aggregate)
    for idx in range(len(results)):
      results[idx] = (results[idx][0], 1 / (idx + 1))
    return results

# Build combined NgramAggregate (only if n-grams trained)
for x in tqdm_notebook(range(4), desc="Test Model", leave=False):
  if x == 0 and build_ngrams_aes:
    nga = NgramAggregate(ngram_aes[1:])
    pr = test_w2v('1,9-ngrams AES', nga, sentences=aecorpus.aes_t, silent=True)
    store_stats(pr)
  if x == 1 and build_ngrams_aes_tr:
    nga_tr = NgramAggregate(ngram_aes_tr[1:])
    pr = test_w2v('1,9-nngrams AES TR', nga_tr, sentences=aecorpus.aes_t, seq=token_seq_tr, silent=True)
    store_stats(pr)
  if x == 2 and build_ngrams_rtc:
    nga_rtc = NgramAggregate(ngram_rtc[1:])
    pr = test_w2v('1,9-ngrams RTC', nga_rtc, sentences=aecorpus.rtc_t, seq=token_seq_rtc, silent=True)
    store_stats(pr)
  if x == 3 and build_ngram_tr_combined:
    # Construct and test aggregation of ngram models (for combined)
    # Takes 5 minutes
    nga_tr_combined = NgramAggregate(ngram_tr_combined[1:])
    pr = test_w2v('1,9-ngrams TR combined', nga_tr_combined,
                  sentences=tqdm_notebook(marete_test_sentences_in, desc="Sentence", leave=False), seq=token_seq_out, silent=True)
    store_stats(pr)

Test Model:   0%|          | 0/4 [00:00<?, ?it/s]

# [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)

Generate Word2Vec and FastText models, test them and allow user to view some results.

The models are implemented using gensim libraries.

In [18]:
# @title Define Model Training function: `get_gensim_gen`

# @markdown The function automates storage and retrieval of models.
# @markdown
# @markdown Update model id when inputs or model settings are updated.
#
# @markdown
# @markdown Also defines Model Generation Progress Class
# @markdown (which provides progress bar for model training).
gensim_model_prefix="2024-03-13a" # @param {type:"string"}
gensim_model_store = True # @param {type:"boolean"}
all_models_pregenerated = False # @param {type:"boolean"}


class TqdmModelProgress(CallbackAny2Vec):
     '''Callback to log information about training'''

     def __init__(self, epochs):
         self.epoch = 0
         self.epochs = epochs
         self.tqdm = tqdm_notebook(desc="Starting", total=self.epochs,
                                   unit="epoch", leave=False)

     def on_epoch_begin(self, model):
         if self.epoch == 0:
             self.tqdm.desc = "Training"

     def on_epoch_end(self, model):
         self.epoch += 1
         if self.epoch == self.epochs:
             self.tqdm.description = "Finished"
         loss = model.get_latest_training_loss()
         if loss is not None:
             self.tqdm.set_postfix({'loss=': loss})
         self.tqdm.update(1)

     def close(self):
         self.tqdm.close()

def get_gensim_gen(corpus, vector_size=200, epochs=100, padlen=2,
                   silent=False, token_seq_func = token_seq, nobar=False,
                   model = gensim.models.Word2Vec, id=None,
                   noload=False, nosave=False, test=None, testlimit=0.1,
                   prev_model = None,
                   **kwargs):
  '''Load or train Word2Vec or FastText model'''
  global gensim_model_id
  global gensim_model_store
  if id is None:
    id = model.__class__.__name__
  filepath = f"models/{gensim_model_prefix}-{id}.model"
  # If all_models_pregenerated is set we expect to
  # find all models encountered in processing and
  # raise error if model is not found.
  if all_models_pregenerated == True:
    if not os.path.exists(filepath):
      raise(Exception(f"Model path {filepath} does not exist"))
  if gensim_model_store and not noload and os.path.exists(filepath):
    model_out = model.load(filepath)
    if test is not None:
      if test(model_out) < testlimit:
        raise Exception("Loaded Model failed to validate")
    return model_out

  c, cn, cn2 = (0, 0, 0)
  sentences_list = []
  token_kwargs = {key:kwargs[key] for key in ['nopad', 'replace_numbers'] if key in kwargs}
  for sentence in corpus:
    tokens = token_seq_func(sentence, padlen=padlen, **token_kwargs)
    if silent == False:
      if c < 5:
        print(tokens)
        c += 1
      elif cn < 2 and '1...n' in tokens:
        print(tokens)
        cn += 1
      elif cn2 < 2 and '123' in tokens:
        print(tokens)
        cn2 += 1
    sentences_list.append(tokens)
  in_kwargs = dict(kwargs)
  if 'window' not in in_kwargs:
    in_kwargs['window']=5
  if 'min_count' not in in_kwargs:
    in_kwargs['min_count']=1
  if 'replace_numbers' in in_kwargs:
    del in_kwargs['replace_numbers']
  if 'nopad' in in_kwargs:
    del in_kwargs['nopad']
  callbacks = []
  if not nobar:
    callbacks = [TqdmModelProgress(epochs)]

  if prev_model is None:
    model_out = model(sentences=sentences_list, vector_size=vector_size,
                      workers=10, epochs=epochs, callbacks=callbacks,
                      **in_kwargs)
  else:
    #model_out = prev_model.train(sentences=sentences_list, vector_size=vector_size,
    #                  workers=10, epochs=epochs-prev_model.epochs, callbacks=callbacks,
    #                  **in_kwargs)
    # Some arguments only apply in creation of model.
    if 'vector_size' in in_kwargs:
      del in_kwargs['vector_size']
    model_out = prev_model.train(corpus_iterable=sentences_list,
                      epochs=epochs, callbacks=callbacks,
                      **in_kwargs)

  if test is not None:
    if test(model_out) < testlimit:
      raise Exception("Trained model failed to validate")
    if gensim_model_store and not nosave:
      model_out.save(filepath)
  callbacks[0].close()
  return model_out

def get_gensim_ft_rtc(corpus, **kwargs):
  return get_gensim_gen(corpus, model=gensim.models.FastText,
                        token_seq_func = token_seq_rtc, **kwargs)

def get_gensim_ft_aes(corpus, **kwargs):
  return get_gensim_gen(corpus, model=gensim.models.FastText,
                        token_seq_func = token_seq_tr, **kwargs)


In [19]:
# @title Train POS models and perform testing.  { display-mode: "form" }
# @markdown Two kinds of models are trained, Word2Vec and n-grams.
# @markdown
# @markdown These POS models work on lemmatized AES sentences.
# @markdown
# @markdown When all models are retrained expect execution of the cell to take
# @markdown around 5 minutes with epoch of 15.
# @markdown The results in report are using maximum epoch of 150 (training time of around half an hour).

train_pos_models=True

pos_train_max_epoch=15 # @param {type:"integer"}

# Ensure direcgtory for models exists
!mkdir -p models

# Helper function: get sequence of POS from lemmatized AES sentence
def token_seq_pos(sentence, replace_numbers=False, padlen=2, nopad=False):
    # Note: replace numbers is no-op here. replacement has already been performed
    token_seq = [token[4] for token in sentence['tokens']] # With lemma id
    if nopad:
      return token_seq
    return start_pad(padlen) + token_seq + end_pad(padlen)

pos_stats = dict()
def pos_store_stats(pr):
  pos_stats[pr.get_name()] = pr
  values = pr.get_stats()
  return values[4]
for x in tqdm_notebook(range(5), desc="Model", leave=False):
  if x == 0 and train_pos_models:
    padlen = 2
    epochs = min(pos_train_max_epoch, 150)
    optimized_w2v_sg_opts = {'ns_exponent': 0.05, 'negative': 21, 'sg': 1, 'vector_size': 200,
                         'window': 2, 'sample': 0.01, 'shrink_windows': True,
                         'compute_loss': True, 'epochs': epochs, 'compute_loss': True}
    w2vpos = get_gensim_gen(aecorpus.aes_1, token_seq_func = token_seq_pos,
                           silent=True, padlen=padlen, id=f"W2V-AES-pos-sg-{epochs}",
                           test=lambda model: pos_store_stats(test_w2v(f"W2V-pos AES {epochs}", model, aecorpus.aes_t, padlen=padlen, silent=True, seq=token_seq_pos)),
                           **optimized_w2v_sg_opts) # Takes around 2 minutes
  if x == 1 and train_pos_models:
    epochs = min(pos_train_max_epoch, 150)
    opts_known_good = {'ns_exponent': 0.21821168418246245, 'negative': 21, 'sg': 1,
                       'vector_size': 227, 'window': 2, #'min_count': 4,
                       'shrink_windows': False, 'compute_loss': True,
                       'epochs': epochs}
    w2vpos2 = get_gensim_gen(aecorpus.aes_1, token_seq_func = token_seq_pos,
                           silent=True, padlen=padlen, id=f"W2V-AES-pos-{epochs}",
                           test=lambda model: pos_store_stats(test_w2v(f"W2V-pos AES (default) {epochs}", model, aecorpus.aes_t, padlen=padlen, silent=True, seq=token_seq_pos)),
                           **opts_known_good) # Takes around 2 minutes
  if x == 2 and train_pos_models:
    # Provide alternative without hand-tuned options.
    padlen = 2
    epochs = min(pos_train_max_epoch, 150)
    opts = { 'sg': 1, 'compute_loss': True, 'epochs': epochs }
    w2vpos_alt = get_gensim_gen(aecorpus.aes_1, token_seq_func = token_seq_pos,
                           silent=True, padlen=padlen, id=f"W2V-AES-pos-default-{epochs}",
                           test=lambda model: pos_store_stats(test_w2v(f"W2V-pos-default AES {epochs}", model, aecorpus.aes_t, padlen=padlen, silent=True, seq=token_seq_pos)),
                           **opts) # Takes around 2 minutes
  if x == 3 and train_pos_models:
    ngram_aes_pos = [None]*10
    for n in tqdm_notebook(range(1, 10), desc="Ngrams", leave=False):
      ngram_aes_pos[n] = NgramModel(aecorpus.aes_1, seq=token_seq_pos, n=n)
      pr = test_w2v(f'{n}-grams AES POS', ngram_aes_pos[n], sentences=aecorpus.aes_t, silent=True, seq=token_seq_pos)
      pos_store_stats(pr)
  if x == 4 and train_pos_models:
    nga_pos = NgramAggregate(ngram_aes_pos[1:])
    pr = test_w2v('1,9-grams AES POS', nga_pos, sentences=aecorpus.aes_t, silent=True, seq=token_seq_pos)
    pos_store_stats(pr)

if train_pos_models:
  text = "Trained POS Models are now available: " + (", ".join(pos_stats.keys()))
  widget = widgets.HTML(value= '<style>p{word-wrap: break-word}</style> <p>'+ text +' </p>')
  display(widget)


Model:   0%|          | 0/5 [00:00<?, ?it/s]

Ngrams:   0%|          | 0/9 [00:00<?, ?it/s]

HTML(value='<style>p{word-wrap: break-word}</style> <p>Trained POS Models are now available: W2V-pos AES 15, W…

In [20]:
# @title Vocabulary viewer { display-mode: "form" }
# @markdown Examine vocabularies of the models interactively.

@interact(model=[("Word2Vec / AES POS",
                  (w2vpos, "POS"))])
def view_vocabulary(model):
  clear_output()
  fields = list(model)[1:]
  model = model[0]
  if model is not None and model != "":
      vocab = list(model.wv.index_to_key)
      d = dict()
      for f in fields:
        if f == "text":
          d[f.capitalize()] = [token_to_text(token) for token in vocab]
        else:
          d[f.capitalize()] = [token for token in vocab if not is_pad(token)]
      print(f"Vocabulary size: {len(vocab)} (incl. padding)")
      pd.set_option('display.min_rows', 25)
      pd.set_option('display.max_rows', 25)
      vocabulary_entries_df = pd.DataFrame(d)
      display(vocabulary_entries_df)



interactive(children=(Dropdown(description='model', options=(('Word2Vec / AES POS', (<gensim.models.word2vec.W…

# Results

In [21]:
# @title Display statistics collected from model testing. { display-mode: "form" }

for k,i in pos_stats.items():
  stats[k] = i

def stats_update():
  row_names = stats.keys()
  column_names = list(stats.values())[0].get_heading(include_mrr5=True)
  ps = [pr.get_stats(include_mrr5=True) for pr in stats.values()]
  dfdata = np.row_stack([(f'{x[0]:.4f}', f'{x[1]:.4f}', f'{x[2]:.4f}', f'{x[3]:.4f}', f'{x[4]:.4f}' , f'{x[5]:.4f}', f'{x[6]}', f'{x[7]}', f'{x[8]:.4f}') for x in ps])
  pd.set_option('display.max_rows', 40)
  pd.set_option('display.min_rows', 40)
  text_stats_df = pd.DataFrame(dfdata, columns=column_names, index=row_names)
  text_stats_df.index = row_names
  display(text_stats_df)

def stats_update_text():
  """Textual statistics for cut&paste"""
  row_names = stats.keys()
  column_names = list(stats.values())[0].get_heading(include_mrr5=True)
  ps = [pr.get_stats(include_mrr5=True) for pr in stats.values()]
  dfdata = np.row_stack([(f'{x[0]:.4f}', f'{x[1]:.4f}', f'{x[2]:.4f}', f'{x[3]:.4f}', f'{x[4]:.4f}' , f'{x[5]:.4f}', f'{x[6]}', f'{x[7]}', f'{x[8]:.4f}') for x in ps])
  pd.set_option('display.max_rows', 90)
  pd.set_option('display.min_rows', 90)
  text_stats_df = pd.DataFrame(dfdata, columns=column_names, index=row_names)
  text_stats_df.index = row_names
  pd.set_option('display.width', 1000)
  print(text_stats_df)
def stats_update_latex():
  """Latex format statistics for cut&paste"""
  row_names = stats.keys()
  column_names = [ '&', 'hit@1', '&', 'hit@5', '&', 'hit@10', '&', 'MRR@10', '&', 'Processed sentences', '\\\\' ]

  #list(stats.values())[0].get_heading(include_mrr5=True)
  ps = [pr.get_stats(include_mrr5=True) for pr in stats.values()]
  dfdata = np.row_stack([('&', f'{x[0]:.6f}', '&', f'{x[1]:.6f}', '&', f'{x[2]:.6f}', '&', f'{x[5]:.6f}', '&', f'{x[6]}', '\\\\') for x in ps])
  pd.set_option('display.max_rows', 90)
  pd.set_option('display.min_rows', 90)
  text_stats_df = pd.DataFrame(dfdata, columns=column_names, index=row_names)
  text_stats_df.index = row_names
  pd.set_option('display.width', 1000)
  print(text_stats_df)

def on_button_clicked(stats_button):
    with stats_output:
        clear_output(wait = True)
        global stats_format
        if stats_format == 'Table':
          stats_update()
        elif stats_format == 'Text':
          stats_update_text()
        else:
          stats_update_latex()

stats_button = widgets.Button(description="Refresh Stats")
stats_output = widgets.Output()
stats_format = 'Table'
with stats_output:
  on_button_clicked(None)
display_type = widgets.Dropdown(
    options=['Table', 'Text', 'Latex'],
    value='Table',
    description='Format:',
    disabled=False,
)

def on_change(change):
  if change['type'] == 'change' and change['name'] == 'value':
    global stats_format
    stats_format = change['new']
    on_button_clicked(None)

display_type.observe(on_change)

exclude_untestable = widgets.Dropdown(
    options=['Nothing', 'Untestable'],
    value='Nothing',
    description='Exclude',
    disabled=False,
)

def on_change_exclude(change):
  #print(change)
  if change['type'] == 'change' and change['name'] == 'value' and change['new'] == 'Nothing':
    #print("Nothing")
    for k in stats.keys():
      stats[k].include_fails_in_n = True
  if change['type'] == 'change' and change['name'] == 'value' and change['new'] == 'Untestable':
    #print("Ecluding untestabke")
    for k in stats.keys():
      stats[k].include_fails_in_n = False
  on_button_clicked(None)

exclude_untestable.observe(on_change_exclude)
display(stats_button, display_type, exclude_untestable, stats_output)
stats_button.on_click(on_button_clicked)



Button(description='Refresh Stats', style=ButtonStyle())

Dropdown(description='Format:', options=('Table', 'Text', 'Latex'), value='Table')

Dropdown(description='Exclude', options=('Nothing', 'Untestable'), value='Nothing')

Output()

In [28]:
# @title Download the models

import pickle

with open('models/1,9-grams.pickle', 'wb') as out:
  out.write(pickle.dumps(nga_pos))

from google.colab import files
print("Models available for downloading:")
buttons = []
for file in os.listdir('models'):
  def create_button(filename):
    layout = widgets.Layout(width='auto', height='40px') #set width and height
    download_button = widgets.Button(
        description=f"Download {filename}",
        display='flex',
        flex_flow='column',
        align_items='stretch',
        layout=layout)
    download_button.on_click(lambda button: files.download(f"models/{filename}"))
    return download_button
  buttons.append(create_button(file))
display(*buttons)


Models available for downloading:


Button(description='Download 2024-03-13a-W2V-AES-pos-15.model', layout=Layout(height='40px', width='auto'), st…

Button(description='Download 2024-03-13a-W2V-AES-pos-sg-15.model', layout=Layout(height='40px', width='auto'),…

Button(description='Download 2024-03-13a-W2V-AES-pos-default-15.model', layout=Layout(height='40px', width='au…

Button(description='Download 1,9-grams.model', layout=Layout(height='40px', width='auto'), style=ButtonStyle()…

Button(description='Download 1,9-grams.pickle', layout=Layout(height='40px', width='auto'), style=ButtonStyle(…