# Tools for Using Cuneiform Word Embeddings
In order to use the cuneiform word embeddings, this notebook creates dictionaries where the cuneiform (utf-8) words are the keys with a set of transliterations or lemmatizations as value.

In [1]:
import pickle
import pandas as pd
import tqdm

# Open DataFrame with sux corpus
This corpus was produced by 2-parse_oracc_sux.ipynb and is a dataframe with four columns: `transliteration`, `words` (a list in which the signs of the transliterated text have been separated ## needs better name), `names` (same as `words` but now in sign names), `utf-8` (the word in cuneiform), and `lemm` (lemmatized form of the word)

In [2]:
with open("corpus/sux_df.p", "rb") as p:
    sux = pickle.load(p)

In [3]:
sux

Unnamed: 0,transliteration,words,names,utf-8,lemm
0,Startepsd2/admin/ed3a/P011046,[Startepsd2/admin/ed3a/P011046],[Startepsd2/admin/ed3a/P011046],Startepsd2/admin/ed3a/P011046,Startepsd2/admin/ed3a/P011046
1,1(barig@c),[1(barig@c)],[DIŠ],𒁹,1(barig@c)
2,še,[še],[ŠE],𒊺,še[barley]N
3,ba-lul,"[ba, lul]","[BA, LUL]",𒁀𒈜,ba-lul
4,nagar,[nagar],[NAGAR],𒉄,nagar[carpenter]N
5,1(barig@c),[1(barig@c)],[DIŠ],𒁹,1(barig@c)
6,nig₂-du₇,"[nig₂, du₇]","[GAR, |U.GUD|]",𒃻𒌌,niŋdu[appropriate-thing]N
7,ag₂,[ag₂],[|NINDA₂×NE|],𒉘,aŋ[measure]V/t
8,hur-sag-še₃-mah,"[hur, sag, še₃, mah]","[|HI×AŠ₂|, SAG, EŠ₂, MAH]",𒄯𒊕𒂠𒈤,hur-sag-še₃-mah
9,sa₁₂-du₅,"[sa₁₂, du₅]","[SAG, DUN₃]",𒊕𒂅,saŋ.DUN₃[recorder]N


# Collect all transliterations/lemmas that represent one utf-8 sequence of cuneiform signs
Match a sequence of cuneiform signs with all the transliterations and lemmas currently in the database. The value of each key is a *set*, containing only unique values. This results in two dictionaries, one containing cuneiform characters sequences paired with transliterations, the other containing cuneiform character sequences paired with lemmatizations. Currently, the `sux` dataframe has more than 4 million entries, and going through them to build the dictionaries may take almost 10 minutes.

In [5]:
utf8_tl_d = {}
utf8_lemm_d = {}
for idx, item in tqdm.tqdm(enumerate(sux["utf-8"])):
    if item[:5] == "Start": # this indicates the start of a new text
        continue
    if item in utf8_tl_d:
        utf8_tl_d[item].add(sux.iloc[idx]["transliteration"])
    else:
        utf8_tl_d[item] = {sux.iloc[idx]["transliteration"]}
    if item in utf8_lemm_d:
        utf8_lemm_d[item].add(sux.iloc[idx]["lemm"])
    else:
        utf8_lemm_d[item] = {sux.iloc[idx]["lemm"]}

4434468it [13:59, 5285.30it/s]


# Save
Pickle the dictyionaries for use in other notebooks.

In [38]:
with open("output/utf8_translit_d.p", "wb") as w:
    pickle.dump(utf8_tl_d, w)
with open("output/utf8_lemm_d.p", "wb") as w2:
    pickle.dump(utf8_lemm_d, w2)

# Search for sign sequence

In [8]:
with open("output/ogsl.p", "rb") as p:
    o = pickle.load(p)
d = dict(zip(o["value"], o["utf8"]))

In [9]:
def cun(text): 
    """transform transliterated input into cuneiform. Use unicode subscript numbers and separate all signs with hyphens; separate words with blanks
    Examples: 'ma-an-gi₄'; 'd-en-lil₂ nibru-ki'.
    Transliteration style (sugal₇ vs. sukkal; dug₄ vs. du₁₁; gen vs. ŋen; etc.) and capitalization are unimportant.
    """
    cun_line = []
    words = text.lower().split()
    for word in words: 
        signs = word.lower().split('-')
        seq = [d[s] if s in d else s for s in signs]
        seq = ''.join(seq)
        cun_line.append(seq)
    line = ' '.join(cun_line)
    return(line)

In [10]:
def translit(cuneiform):
    if cuneiform in utf8_tl_d:
        return [cuneiform, utf8_tl_d[cuneiform]]
    else:
        return 'not found'

In [11]:
def lemmat(cuneiform):
    if cuneiform in utf8_lemm_d:
        return [cuneiform, utf8_lemm_d[cuneiform]]
    else:
        return 'not found'

In [13]:
translit(cun("lugal"))

['𒈗',
 {'LUGAL',
  'bišeba',
  'haniš₂',
  'lillan',
  'lillan(LUGAL)',
  'lillia',
  'lugal',
  'rab₃',
  'sag₄',
  'saŋ₄',
  'šar₃'}]

In [37]:
lemmat(cun("na₄"))

['𒉌𒌓',
 {'DAG₃',
  'NA₄',
  'ah[dry]V/i',
  'bar₄',
  'dag₃',
  'ia[oh!]J',
  'i₃-UD',
  'kur(NA₄)',
  'na[stone]N',
  'na₄',
  'ni-tam',
  'za[bead]N',
  '|NI+UD|',
  '|NI.UD|'}]

In [36]:
translit("𒐢")

['𒐢', {'5(GEŠʾU)', '5(gešʾu)', '5(gešʾu@c)'}]