In [None]:
# --- EvaCun Environment bootstrap: run me first ---
# Supports Google Colab, VS Code, and local Jupyter.
import os, sys, subprocess, shutil
from pathlib import Path

# Detect Colab: the import below only succeeds inside a Colab runtime.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

# Fork-first strategy:
# Point EVACUN_REPO_URL at your fork, e.g. https://github.com/<your-username>/Akkademia.git
# While the URL still contains the "<your-username>" placeholder, upstream is used instead.
FORK_PLACEHOLDER = "https://github.com/<your-username>/Akkademia.git"
EVACUN_REPO_URL = os.getenv("EVACUN_REPO_URL", FORK_PLACEHOLDER).strip()
UPSTREAM_URL = os.getenv("EVACUN_UPSTREAM_URL", "https://github.com/gaigutherz/Akkademia.git").strip()

if "<your-username>" in EVACUN_REPO_URL:
    repo_url = UPSTREAM_URL
else:
    repo_url = EVACUN_REPO_URL

# Name of the clone directory (usually 'Akkademia'); EVACUN_REPO_DIR overrides the full location.
EVACUN_REPO_NAME = os.getenv("EVACUN_REPO_NAME", "Akkademia")
REPO_DIR = Path(os.environ.get("EVACUN_REPO_DIR", Path.cwd() / EVACUN_REPO_NAME)).resolve()

# Data directory lives inside the checkout. Only the path is computed here:
# creating it before the clone check would make REPO_DIR non-empty, so the
# "missing or empty" test below would never trigger the clone.
DATA_DIR = REPO_DIR / "data"

# Clone if the target directory is missing or empty
if not REPO_DIR.exists() or not any(REPO_DIR.iterdir()):
    print(f"Cloning {repo_url} into {REPO_DIR} ...")
    subprocess.check_call(["git", "clone", "--depth", "1", repo_url, str(REPO_DIR)])

# Safe to create now that the clone decision has been made.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Install the project when it ships a build file (pyproject.toml or setup.py).
pyproject = REPO_DIR / "pyproject.toml"
setup_py = REPO_DIR / "setup.py"
if any(build_file.exists() for build_file in (pyproject, setup_py)):
    print("Installing project (editable mode if possible)...")
    _pip_install = [sys.executable, "-m", "pip", "install"]
    try:
        subprocess.check_call([*_pip_install, "-e", str(REPO_DIR)])
    except subprocess.CalledProcessError:
        # Editable mode is optional; a failure of the plain install still propagates.
        print("Editable install failed; falling back to standard install.")
        subprocess.check_call([*_pip_install, str(REPO_DIR)])

# Make the checkout importable and use it as the working directory.
if sys.path.count(str(REPO_DIR)) == 0:
    sys.path.insert(0, str(REPO_DIR))
os.chdir(REPO_DIR)
print("Working directory set to:", REPO_DIR)
print("EvaCun data directory:", DATA_DIR)

# Optional: install extra dependencies from whichever requirement files exist.
for req_name in ("requirements-colab.txt", "requirements.txt"):
    req_path = REPO_DIR / req_name
    if not req_path.exists():
        continue
    print(f"Installing dependencies from {req_name} ...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", str(req_path)])
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and keep going.
        print(f"Warning: Failed to install from {req_name}: {e}")

print("EvaCun bootstrap complete. Replace any old Google Drive paths with INPUT_DIR / OUTPUT_DIR helpers.")


In [None]:
# --- Path helpers ---
from pathlib import Path
import os

# DATA_DIR (and REPO_DIR) come from the bootstrap cell, which must run first.
# Convention: raw inputs go under data/input, generated files under data/outputs.
INPUT_DIR, OUTPUT_DIR = DATA_DIR / "input", DATA_DIR / "outputs"
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def in_input(*parts):
    """Return the path obtained by joining ``parts`` onto INPUT_DIR."""
    return Path(INPUT_DIR, *parts)

def in_output(*parts):
    """Return the path obtained by joining ``parts`` onto OUTPUT_DIR."""
    return Path(OUTPUT_DIR, *parts)

# Show the resolved locations so a reader can confirm where files will land.
print(f"INPUT_DIR: {INPUT_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

In [None]:
# --- IO helpers ---
import pandas as pd
from pathlib import Path

def read_csv(path, **kwargs):
    """Read a CSV file into a DataFrame, printing the path being read.

    Args:
        path: Location of the CSV file (string or path-like).
        **kwargs: Passed through unchanged to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    csv_path = Path(path)
    print("Reading CSV: " + str(csv_path))
    return pd.read_csv(csv_path, **kwargs)

def write_csv(df, path, **kwargs):
    """Write ``df`` to a CSV file, creating the parent directory if needed.

    Args:
        df: DataFrame to write.
        path: Destination file (string or path-like).
        **kwargs: Passed through to ``DataFrame.to_csv``. ``index`` defaults to
            ``False`` but may be overridden by the caller.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a file destination.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Writing CSV: {path}")
    # setdefault instead of a hard-coded keyword: passing index=... used to raise
    # "got multiple values for keyword argument 'index'".
    kwargs.setdefault("index", False)
    return df.to_csv(path, **kwargs)

#Starting Page

1. First, add a shortcut for '[Akkademia-master](https://drive.google.com/drive/folders/11ktSfHQbcjM2uLT2BHHyZyLH0VAw9YNm?usp=drive_link)' to your 'My Drive' folder in Google Drive: https://drive.google.com/drive/folders/11ktSfHQbcjM2uLT2BHHyZyLH0VAw9YNm?usp=drive_link

2. From there you should be able to mount the notebook and run all following cells. Note that it will overwrite the previous instance of any model builds, etc. If you don't want that, you can change the directory paths or download Akkademia-master from GitHub and put it in your folder (then change the directory to reflect the new location).

In [None]:
# (Disabled) Google Drive mount cell — not needed when running from GitHub.
# from google.colab import drive
# drive.mount('/content/drive')

## Create a shortcut in your drive to put Akkademia_master in your "MyDrive"

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# NOTE: Replace absolute Google Drive paths with paths under DATA_DIR / INPUT_DIR / OUTPUT_DIR.
# Example: OLD: '/content/drive/MyDrive/some_folder/file.csv'
#          NEW: in_input('some_folder', 'file.csv')
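# (Disabled) Drive-specific setup, not a mount cell: the sys.path entry below pointed at Google Drive.
# Disabling it also turns off the sentencepiece install/import and the Path/shutil imports listed below.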
# import sys
# sys.path.append('/content/drive/MyDrive/Akkademia-master/akkadian')
# 
# !pip install sentencepiece
# import sentencepiece
# from pathlib import Path
# import shutil

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


The following doesn't work because the paths aren't specified correctly. In the tokenization step, we copy + paste code to edit the file paths

In [None]:
# The packaged module is only importable when the Akkademia sources are on sys.path.
# Catch the failure so that "Run All" can continue past this cell.
try:
    import translation_tokenize as tt
except ImportError as err:
    tt = None
    print(f"Skipping packaged tokenizer training: {err}")
else:
    tt.train_tokenizer()

ModuleNotFoundError: No module named 'translation_tokenize'

#I. Data Input Method

##1 Read in the CSV data for CDLI and ORACC into a data frame for English.

In [None]:
clean_oracc_en_path = 'DataFiles/clean_oracc_en.csv'
clean_oracc_en_df = pd.read_csv(clean_oracc_en_path)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# the frame when pandas Copy-on-Write is active.
clean_oracc_en_df['genre'] = clean_oracc_en_df['genre'].fillna("Uncertain")
clean_oracc_en_df

Unnamed: 0,txt_id,id_text,Title,object_type,language,period,dialect,genre,subgenre,English,Original_GS_Text
0,X150289,X150289,saao-saa15-X150289.txt,envelope,Akkadian,Neo-Assyrian,Neo-Assyrian,Administrative Letter,,to the deputy governor my lord seal impres...,"(1) [To the deputy (governor)], my lord: (seal..."
1,X210106,X210106,saao-saa21-X210106.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Administrative Letter,,the king s word to hundaru king of dilmun ...,"(1) The king's word to Hunda[ru, king of Dilmu..."
2,X211891,X211891,saao-saa21-X211891.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Administrative Letter,,i went to my and appealed to assurb...,(Beginning destroyed)(1') I went [to] my [......
3,P237223,P237223,caspo-P237223.txt,,,,,Uncertain,,of the goddesses of the world ...,[. . .] of the goddesses[. . .] of the world[....
4,P237219,P237219,ccpo-P237219.txt,,,,,Uncertain,,means it rains and the rainbow arches ...,[... means ...] it rains and (the rainbow) arc...
...,...,...,...,...,...,...,...,...,...,...,...
8033,Q000960,Q000960,etcsri-Q000960.txt,brick,Sumerian,Ur III,,Royal Inscription,,šulgi the powerful man king of urim king ...,"(1) Šulgi, the powerful man, king of Urim, kin..."
8034,Q000958,Q000958,etcsri-Q000958.txt,cone,Sumerian,Ur III,,Royal Inscription,,for nanna his master ur namma the powerfu...,"(1) For Nanna, his master, Ur-Namma, the power..."
8035,Q000965,Q000965,etcsri-Q000965.txt,"canephor, foundation tablet",Sumerian,Ur III,,Royal Inscription,,for ninĝirsu the powerful warrior of enlil ...,"(1) For Ninĝirsu, the powerful warrior of Enli..."
8036,Q000957,Q000957,etcsri-Q000957.txt,cone,Sumerian,Ur III,,Royal Inscription,,for enlil king of all lands his master ur...,"(1) For Enlil, king of all lands, his master, ..."


In [None]:
# Count of missing genres and the distinct genre labels.
# NOTE(review): 466 was recorded here, but the load cell already replaces missing
# genres with "Uncertain" — confirm whether 466 is a pre-fill figure.
nan_count = clean_oracc_en_df['genre'].isnull().sum() #466
unique_genres = pd.unique(clean_oracc_en_df['genre'])
unique_genres

array(['Administrative Letter', 'Uncertain', 'Grant', 'Decree',
       'Astrological Report', 'Literary Work', 'Extispicy Query',
       'Extispicy Report', 'Administrative Record', 'Scholarly Letter',
       'Legal Transaction', 'Prophecy', 'Appointment', 'Priestly Letter',
       'Royal Ritual', 'Votive Donation', 'Eponym List', 'Treaty', 'Gift',
       'School Text', 'Lexical Text', 'Astronomical Diary',
       'Royal Inscription', 'Blessings', 'letter'], dtype=object)

##2 Split the data into 80 / 10 / 10 and save to files: .tr / .en / .ak

In [None]:
# Genres whose rows always go into the training set.
training_genres = [
    'Extispicy Report', 'Grant', 'Royal Ritual', 'Literary Work', 'Uncertain',
    'Decree', 'Treaty', 'Prophecy', 'Votive Donation', 'Appointment', 'Eponym List',
    'Lexical Text', 'Gift', 'Blessings', 'School Text',
]

# Start from every row of those genres, then top up with a seeded random sample
# of the other rows until the training set holds 80% of the table.
specified_rows = clean_oracc_en_df.loc[clean_oracc_en_df['genre'].isin(training_genres)]
rows_needed = int(0.8 * len(clean_oracc_en_df)) - specified_rows.shape[0]
remaining_rows = clean_oracc_en_df.drop(index=specified_rows.index)
additional_rows = remaining_rows.sample(n=rows_needed, random_state=30)
training = pd.concat([specified_rows.copy(), additional_rows])
training['English'] # 80% of the rows; 8038 x 0.8 = 6430

3             of the goddesses      of the world      ...
4           means     it rains and the rainbow arches ...
5                              house  šà means  center...
6                      sin of the gods  whose face is ...
7                    the bišši plant  mustard         ...
                              ...                        
1280               any human being  whether male or fe...
6624      itur ilum  governor of babylon  iṣur ilum  t...
4941      seal of arbailāya   seal of mutakkil aššur  ...
7381      to          dedicated this object for his ow...
4453      if on the 14th day the moon and sun are seen...
Name: English, Length: 6430, dtype: object

In [None]:
# Everything not used for training is split in half (seeded) into test and validation.
testing_and_validation = clean_oracc_en_df.drop(index=training.index)
testing = testing_and_validation.sample(frac=0.5, random_state=30)
validation = testing_and_validation.loc[~testing_and_validation.index.isin(testing.index)]
#testing.head(10)
#validation.head(10)
testing_and_validation.head(10)

Unnamed: 0,txt_id,id_text,Title,object_type,language,period,dialect,genre,subgenre,English,Original_GS_Text
0,X150289,X150289,saao-saa15-X150289.txt,envelope,Akkadian,Neo-Assyrian,Neo-Assyrian,Administrative Letter,,to the deputy governor my lord seal impres...,"(1) [To the deputy (governor)], my lord: (seal..."
2,X211891,X211891,saao-saa21-X211891.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Administrative Letter,,i went to my and appealed to assurb...,(Beginning destroyed)(1') I went [to] my [......
301,P336327,P336327,saao-saa04-P336327.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Extispicy Query,military and political,šamaš great lord give me a firm positive a...,"(1) Šamaš, great lord, [give me a firm positiv..."
306,P336340,P336340,saao-saa04-P336340.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Extispicy Query,military and political,will they do battle is it decreed...,(Beginning destroyed)(2) will they do [battle...
307,P336341,P336341,saao-saa04-P336341.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Extispicy Query,military and political,šamaš great lord give me a firm positive a...,"(1) Šamaš, great lord, [give me a firm positiv..."
331,P336368,P336368,saao-saa07-P336368.txt,tablet,Akkadian,Neo-Assyrian,,Administrative Record,,a thigh a shoulder outer cuts 2 cuts of ...,"(1) A thigh, a shoulder, outer cuts;(2) 2 cuts..."
341,P336378,P336378,saao-saa08-P336378.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Astrological Report,planetary,if the moon becomes visible on the 1st day ...,(1) If the moon becomes visible on the 1st day...
343,P336381,P336381,saao-saa08-P336381.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Astrological Report,unclassifiable,if the moon at its appearance wears a crown ...,(1) If the moon at its appearance wears a crow...
348,P336386,P336386,saao-saa08-P336386.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Astrological Report,lunar,if the moon becomes visible on the 1st day ...,(1) [If] the moon becomes visible on the 1st d...
350,P336388,P336388,saao-saa08-P336388.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Astrological Report,lunar,if the moon at its appearance wears a crown ...,(1) [If the moon] at its appearance wears a cr...


##3 Do the same for transliteration and Akkadian and then save files to NMT_input

Path("/content/drive/MyDrive/Akkademia-master/NMT_input")

In [None]:
# Show up to 1000 columns when displaying the wide frames in this notebook.
pd.set_option('display.max_columns', 1000)

# Table holding the `form` (transliteration) and `unicode_stripped` (cuneiform) columns used later.
akk_tr_cun_path = 'DataFiles/akk_tr_cun.csv'
akk_tr_cun_df = pd.read_csv(filepath_or_buffer=akk_tr_cun_path)

  akk_tr_cun_df = pd.read_csv(akk_tr_cun_path)


In [None]:
# Attach the English table to the transliteration table; only texts present in both are kept.
aligned = akk_tr_cun_df.merge(clean_oracc_en_df, how='inner', on='id_text')
aligned

Unnamed: 0.1,Unnamed: 0,lang,form,id_word,label,id_text,delim,gdl,pos,cf,gw,sense,norm,epos,headform,contrefs,norm0,base,morph,stem,cont,syntax_ub-after,morph2,aform,id_line,form_stripped,unicode_stripped,unicode,roman,txt_id,Title,object_type,language,period,dialect,genre,subgenre,English,Original_GS_Text
0,18,akk-x-neoass,1/2,P522597.2.1,o 1,P522597,,"[{'n': 'n', 'sexified': '1/2(disz)', 'form': '...",n,,,,,,,,,,,,,,,,P522597.1,1/2,𒈦,𒈦,False,P522597,atae-tilbarsip-P522597.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Legal Transaction,debt note (silver),half a mina of silver belonging to hanni at...,"(1) Half a mina of silver belonging to Hanni, ..."
1,19,akk-x-neoass,MA.NA,P522597.2.2,o 1,P522597,,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,manû,unit,a unit of weight,manā,N,,,,,,,,,,,P522597.2,MA NA,𒈠 𒈾,ma.na,False,P522597,atae-tilbarsip-P522597.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Legal Transaction,debt note (silver),half a mina of silver belonging to hanni at...,"(1) Half a mina of silver belonging to Hanni, ..."
2,20,akk-x-neoass,KU₃.BABBAR,P522597.2.3,o 1,P522597,,"[{'gg': 'logo', 'gdl_type': 'logo', 'group': [...",N,ṣarpu,silver,silver,ṣarpu,N,,,,,,,,,,,P522597.3,KU₃ BABBAR,𒆬 𒌓,ku₃.babbar,False,P522597,atae-tilbarsip-P522597.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Legal Transaction,debt note (silver),half a mina of silver belonging to hanni at...,"(1) Half a mina of silver belonging to Hanni, ..."
3,21,akk-x-neoass,ša₂,P522597.3.1,o 2,P522597,,"[{'v': 'ša₂', 'id': 'P522597.3.1.0'}]",DET,ša,of,of,ša,DET,,,,,,,,,,,P522597.1,ša₂,𒃻,𒃻,False,P522597,atae-tilbarsip-P522597.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Legal Transaction,debt note (silver),half a mina of silver belonging to hanni at...,"(1) Half a mina of silver belonging to Hanni, ..."
4,22,akk-x-neoass,{m}ha-an-ni-i,P522597.3.2,o 2,P522597,,"[{'det': 'semantic', 'pos': 'pre', 'seq': [{'v...",PN,Hanni,1,1,Hanni,PN,,,,,,,,,,,P522597.2,m ha an ni i,𒁹 𒄩 𒀭 𒉌 𒄿,{m}ha-an-ni-i,False,P522597,atae-tilbarsip-P522597.txt,tablet,Akkadian,Neo-Assyrian,Neo-Assyrian,Legal Transaction,debt note (silver),half a mina of silver belonging to hanni at...,"(1) Half a mina of silver belonging to Hanni, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
759148,2736265,akk-x-neobab,x,P313663.14.5,13',P313663,,"[{'v': 'x', 'utf8': 'x', 'id': 'P313663.14.5.0...",u,,,,,,,,,,,,,,,,P313663.5,,,,False,P313663,saao-saa21-P313663.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Administrative Letter,,did he return this statue in which ...,(Beginning destroyed)(1') Did he retur[n] thi...
759149,2736266,akk-x-neobab,x,P313663.14.6,13',P313663,,"[{'v': 'x', 'utf8': 'x', 'id': 'P313663.14.6.0...",u,,,,,,,,,,,,,,,,P313663.6,,,,False,P313663,saao-saa21-P313663.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Administrative Letter,,did he return this statue in which ...,(Beginning destroyed)(1') Did he retur[n] thi...
759150,2736267,akk-x-neobab,x,P313663.14.7,13',P313663,,"[{'v': 'x', 'utf8': 'x', 'id': 'P313663.14.7.0...",u,,,,,,,,,,,,,,,,P313663.7,,,,False,P313663,saao-saa21-P313663.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Administrative Letter,,did he return this statue in which ...,(Beginning destroyed)(1') Did he retur[n] thi...
759151,2736268,akk-x-neobab,x,P313663.14.8,13',P313663,,"[{'v': 'x', 'utf8': 'x', 'id': 'P313663.14.8.0...",u,,,,,,,,,,,,,,,,P313663.8,,,,False,P313663,saao-saa21-P313663.txt,tablet,Akkadian,Neo-Assyrian,Neo-Babylonian,Administrative Letter,,did he return this statue in which ...,(Beginning destroyed)(1') Did he retur[n] thi...


In [None]:
# Peek at the first ten transliterated forms.
akk_tr_cun_df.loc[:, 'form'].head(10)

0                1/2
1              MA.NA
2         KU₃.BABBAR
3                ša₂
4      {m}ha-an-ni-i
5                ina
6                IGI
7    {m}DINGIR-ba-di
8                ina
9                IGI
Name: form, dtype: object

In [None]:
def _join_as_text(values):
    """Join a group's values into one space-separated string, treating missing values as ''."""
    return ' '.join(values.fillna('').astype(str))


# One row per text: concatenate its signs and forms, keep its first English value.
aggregation_functions = {
    'unicode_stripped': _join_as_text,
    'form': _join_as_text,
    'English': 'first',
}

aligned_unique = aligned.groupby('id_text').agg(aggregation_functions).reset_index()
aligned_unique

Unnamed: 0,id_text,unicode_stripped,form,English
0,P223388,𒌋𒌋 𒉡 𒁹 𒃻 𒋫 𒆸 𒌑 𒇻 𒋫 𒆸 𒀀 𒉌 𒉡 𒀀 𒈾 𒀭 𒀝 𒋳 𒆷 𒀀 𒉌 𒀀 𒈾...,man-nu ana ša₂ ta-kil-u₂ lu ta-kil a-ni-nu a-n...,may anyone trust in whomever he trusts as f...
1,P224946,𒉌𒌓 𒈩 𒁹 𒄩 𒆷 𒁉 𒂊 𒋛 𒇽 𒌣 𒍏 𒀀 𒁹 𒅋 𒆷 𒀀 𒀀 𒂗 𒊩 𒋧 𒀀 𒉌 𒊩...,{NA₄}KIŠIB {1}ha-la-be₂-e-si {LU₂}SIMUG-URUDU ...,seal of hallabeše copper smith son of illa...
2,P224947,+ 𒆬 𒌓 + 𒊭 𒁹 𒀭 𒈦 𒌋𒌋 𒉽 𒀸 𒅆 𒁹 𒋢 𒌋𒐊 𒌉 𒁹 𒋢 𒌋 𒀀 𒀸 ...,x+x x x KUG.UD x+x ša {1}{d}MAŠ-MAN-PAB ina IG...,silver belonging to inurta šarru uṣur a...
3,P224948,𒉌𒌓 𒈩 𒁹 𒀭 𒆷 𒋢 𒀀 𒁹 𒀀 𒉣 𒅀 𒀀 𒌋 𒋫 𒊮 𒌷 𒍝 𒀭 𒁀 𒀀 𒀀 220...,{na₄}KIŠIB {m}DINGIR-la-SU A {m}a-ṣil-ia-a-u T...,seal of ila eriba son of aṣil iau from the...
4,P224949,𒉌𒌓 𒈩 𒁹 𒀴 𒀭 𒉺 𒌉 𒁹 𒀊 𒁲 𒀭 𒆳 𒀀 𒐋 𒂆 𒆬 𒌓 𒊭 𒁹 𒀭 𒈦 𒌋𒌋 ...,{NA₄}KIŠIB {1}ARAD-{d}PA DUMU {1}ab-di-{d}kur-...,seal of urdu nabû son of abdi kura 6 shek...
...,...,...,...,...
5505,X301613,𒁹 𒉏 𒁺 𒁺 𒈪 𒑑 𒌍 𒀸 𒅆 𒋼 𒁺 𒈪 𒑔 𒊕 𒈪 𒌍 𒀸 𒅆 𒋼 𒌓...,x ana NIM GUB x x x x GIN GE₆ 3 sin ina IGI MU...,stood to the east blew ...
5506,X301620,𒈬 𒁹 𒈨 𒐏𒑆 𒄰 𒁹 𒀭 𒈗 𒉈 𒌍 𒌋 𒍑 𒈾 𒋢 𒈠 𒁹 𒋝 𒀀 𒆚 𒈲 𒈪 𒈫 𒌍...,MU 1 ME 49.KAM {m}an LUGAL NE 30 10 UŠ na-su m...,year 149 king antiochus month v the 1st o...
5507,X301631,𒌗 𒄞 𒌍 𒌋𒐈 𒈾 𒋢 𒋛𒀀 𒈲 𒋛𒀀 𒀭 𒍝 𒌍 𒀸 𒅆 𒄞 𒌓 𒁹 𒋙 𒁺...,{ITU}GU₄ 30 13 na-su DIR muš DIR AN ZA x x x x...,month ii the 1st of which was identical wit...
5508,X301632,𒀳 𒋛𒀀 𒀭 𒍝 𒍇 𒌋 𒆳 𒁺 𒈨𒌍 𒁹 𒋛𒀀 𒈪 𒑑 𒌍 𒂊 𒀸 𒁁 𒈫 𒌑 𒌍 ...,APIN x DIR AN ZA ULU₃ u KUR GIN{MEŠ} 1 DIR x x...,month viii clouds were in the sky the ...


In [None]:
!pip install sklearn.utils
from sklearn.model_selection import train_test_split

training_akk, temp_df = train_test_split(aligned_unique['unicode_stripped'], test_size=0.2, random_state=30)

testing_akk, validation_akk = train_test_split(temp_df, test_size=0.5, random_state=30)

training_akk = training_akk.to_frame()
testing_akk = testing_akk.to_frame()
validation_akk = validation_akk.to_frame()
#training_akk.head(30)

training_tr, temp_df2 = train_test_split(aligned_unique['form'], test_size=0.2, random_state=30)

testing_tr, validation_tr = train_test_split(temp_df2, test_size=0.5, random_state=30)

training_tr = training_tr.to_frame()
testing_tr = testing_tr.to_frame()
validation_tr = validation_tr.to_frame()


Collecting sklearn.utils
  Downloading sklearn_utils-0.0.15.tar.gz (26 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting pyfunctional (from sklearn.utils)
  Downloading pyfunctional-1.5.0-py3-none-any.whl.metadata (40 kB)
Collecting scipy (from sklearn.utils)
  Downloading scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting scikit-learn (from sklearn.utils)
  Downloading scikit_learn-1.5.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting statsmodels (from sklearn.utils)
  Downloading statsmodels-0.14.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting seaborn (from sklearn.utils)
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting dill>=0.2.5 (from pyfunctional->sklearn.utils)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting tabulate<=1.0.0 (from pyfunctional->sklear

In [None]:
def save_df_to_file(df, filename, folder_path='DataFiles/NMT_input/'):
    """Write each row of ``df`` as one line of text.

    The values of a row are converted with ``str`` and joined with single
    spaces; every row is terminated by a newline.

    Args:
        df: DataFrame whose rows are written in order.
        filename: Name of the file to create inside ``folder_path``.
        folder_path: Destination directory; created if it does not exist.
            Defaults to the location used throughout this notebook.
    """
    os.makedirs(folder_path, exist_ok=True)
    # Explicit UTF-8: the data contains cuneiform and diacritics, and the
    # tokenization step reads these files back as UTF-8.
    with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(' '.join(str(value) for value in row) + '\n')

# The .en, .ak and .tr files of a split form a parallel corpus, so line i of
# each file must describe the same text. The English side is therefore taken
# from the same aligned_unique rows (same order) as the .ak/.tr splits.
# The genre-based `training` / `testing` / `validation` frames are not used
# here: they come from a different table with a different number of rows, and
# writing them would dump every metadata column into the .en files.
save_df_to_file(aligned_unique.loc[training_akk.index, ['English']], 'train.en')
save_df_to_file(aligned_unique.loc[testing_akk.index, ['English']], 'test.en')
save_df_to_file(aligned_unique.loc[validation_akk.index, ['English']], 'valid.en')
save_df_to_file(training_akk, 'train.ak')
save_df_to_file(testing_akk, 'test.ak')
save_df_to_file(validation_akk, 'valid.ak')
save_df_to_file(training_tr, 'train.tr')
save_df_to_file(testing_tr, 'test.tr')
save_df_to_file(validation_tr, 'valid.tr')

#II. Tokenization:

## 1. I copy in the initialization steps and the functions `train_and_move` and `train_tokenizer`. I also edited the paths so that the functions run properly.

In [None]:
# NOTE: Replace absolute Google Drive paths with paths under DATA_DIR / INPUT_DIR / OUTPUT_DIR.
# Example: OLD: '/content/drive/MyDrive/some_folder/file.csv'
#          NEW: in_input('some_folder', 'file.csv')
#
# This cell is live again: the cells that follow call train_and_move and
# train_tokenizer and read BASE_DIR / TOKEN_DIR, so leaving it commented out
# makes them fail with NameError on a fresh kernel. Only the Google Drive
# path variants stay disabled.
import shutil
from pathlib import Path

import sentencepiece

# Kept for compatibility with the original script; both settings currently use
# the same local directory.
DIVIDED_BY_THREE_DOTS = False

# Directory the split files (train/valid/test .ak/.tr/.en) are written to.
#   Drive variant: Path("/content/drive/MyDrive/Akkademia-master/NMT_input")
BASE_DIR = Path("DataFiles/NMT_input/") #Christian & Adam

# Trained models and tokenized files live in a sub-directory of BASE_DIR.
TOKEN_DIR = BASE_DIR / Path("tokenization")

TRAIN_AK = Path("train.ak")
TRAIN_TR = Path("train.tr")
TRAIN_EN = Path("train.en")
VALID_AK = Path("valid.ak")
VALID_TR = Path("valid.tr")
VALID_EN = Path("valid.en")
TEST_AK = Path("test.ak")
TEST_TR = Path("test.tr")
TEST_EN = Path("test.en")
FOR_TRANSLATION_TR = Path("for_translation.tr")


def train_and_move(input_file, model_type, model_prefix, vocab_size):
    """Train a SentencePiece model and move its files into TOKEN_DIR.

    Args:
        input_file: Text file to train on.
        model_type: SentencePiece model type, e.g. "char" or "bpe".
        model_prefix: Prefix of the generated ``.model`` / ``.vocab`` files.
        vocab_size: Vocabulary size passed to the trainer.
    """
    sentencepiece.SentencePieceTrainer.train(
        f'--input={input_file} --model_type={model_type} --model_prefix={model_prefix} --vocab_size={vocab_size}'
    )

    # The trainer writes into the working directory; make sure the target
    # directory exists before moving the two generated files into it.
    TOKEN_DIR.mkdir(parents=True, exist_ok=True)
    for suffix in (".model", ".vocab"):
        f = model_prefix + suffix
        shutil.move(f, str(TOKEN_DIR / f))
    print('done')


def train_tokenizer():
    """Train the three models used by the pipeline: signs, transliteration, translation."""
    train_and_move(BASE_DIR / TRAIN_AK, "char", "signs_char", 400)
    # train_and_move(BASE_DIR / TRAIN_AK, "bpe", "signs_bpe", 400)
    train_and_move(BASE_DIR / TRAIN_TR, "bpe", "transliteration_bpe", 1000)
    train_and_move(BASE_DIR / TRAIN_EN, "bpe", "translation_bpe", 10000)

In [None]:
# Trial run: train a character-level model on train.ak under the separate prefix "my_signs_char".
# Requires train_and_move, BASE_DIR and TRAIN_AK to be defined in the kernel.
train_and_move(BASE_DIR / TRAIN_AK, "char", "my_signs_char", 400)

done


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=DataFiles/NMT_input/train.ak --model_type=char --model_prefix=my_signs_char --vocab_size=400
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: DataFiles/NMT_input/train.ak
  input_format: 
  model_prefix: my_signs_char
  model_type: CHAR
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_

In [None]:
# Train the signs_char, transliteration_bpe and translation_bpe models (one "done" is printed per model).
train_tokenizer()

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=DataFiles/NMT_input/train.ak --model_type=char --model_prefix=signs_char --vocab_size=400
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: DataFiles/NMT_input/train.ak
  input_format: 
  model_prefix: signs_char
  model_type: CHAR
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0


done
done


178) LOG(INFO) Running command: --input=DataFiles/NMT_input/train.tr --model_type=bpe --model_prefix=transliteration_bpe --vocab_size=1000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: DataFiles/NMT_input/train.tr
  input_format: 
  model_prefix: transliteration_bpe
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_i

done


(178) LOG(INFO) Running command: --input=DataFiles/NMT_input/train.en --model_type=bpe --model_prefix=translation_bpe --vocab_size=10000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: DataFiles/NMT_input/train.en
  input_format: 
  model_prefix: translation_bpe
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1


## 2. Next I copy in and edit the paths in the rest of the file and test out the functions. These functions mainly create new files with tokenized or detokenized information.

In [None]:
def tokenize(model_prefix, file, should_remove_prefix=False, token_dir=TOKEN_DIR, base_dir=BASE_DIR, output_dir=TOKEN_DIR):
    """Tokenize every line of ``base_dir / file`` with a trained SentencePiece model.

    Args:
        model_prefix: Prefix of the ``.model`` file to load from ``token_dir``.
        file: File name, relative to ``base_dir``; the output keeps the same name.
        should_remove_prefix: When true, drop everything up to and including the
            first ": " of each line before tokenizing.
        token_dir: Directory containing the trained model.
        base_dir: Directory containing the input file.
        output_dir: Directory the tokenized file is written to (created if missing).
    """
    sp = sentencepiece.SentencePieceProcessor()
    sp.load(str(token_dir / (model_prefix + ".model")))

    with open(base_dir / file, "r", encoding="utf8") as fin:
        data = fin.readlines()

    tokenized_data = []
    for line in data:
        if should_remove_prefix:
            # A line without the ": " separator (e.g. a blank line) is tokenized
            # as-is instead of raising IndexError.
            _, separator, remainder = line.partition(": ")
            if separator:
                line = remainder
        tokenized_data.append(" ".join(sp.encode_as_pieces(line)))
    #print('\n'.join(tokenized_data))

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_file = output_dir / file
    with open(output_file, "w", encoding="utf8") as fout:
        for line in tokenized_data:
            fout.write(line + "\n")

def detokenize_atae_translated(input_path="DataFiles/atae_translated.txt",
                               output_path="DataFiles/atae_translated_detokenized.txt"):
    """Detokenize a tab-separated translation output file.

    Lines starting with 'S' have the text after the first tab decoded with the
    transliteration model; lines starting with 'H' or 'D' have the text after
    the second tab decoded with the translation model. All other lines, and
    tagged lines that lack the expected tab-separated fields, are copied
    unchanged.

    Args:
        input_path: File to read; defaults to the path this notebook uses.
        output_path: File to write; defaults to the path this notebook uses.
    """
    sp1 = sentencepiece.SentencePieceProcessor()
    sp1.load(str(TOKEN_DIR / "transliteration_bpe.model"))

    sp2 = sentencepiece.SentencePieceProcessor()
    sp2.load(str(TOKEN_DIR / "translation_bpe.model"))

    with open(Path(input_path), "r", encoding="utf8") as fin:
        data = fin.readlines()

    detokenized_data = []
    for line in data:
        tag = line[:1]
        if tag == 'S':
            parts = line.split("\t", 1)
            if len(parts) == 2:
                detokenized_data.append(parts[0] + "\t" + sp1.decode_pieces(parts[1].split(" ")))
                continue
        elif tag == 'H' or tag == 'D':
            parts = line.split("\t", 2)
            if len(parts) == 3:
                detokenized_data.append(parts[0] + "\t" + parts[1] + "\t" + sp2.decode_pieces(parts[2].split(" ")))
                continue
        # Untagged line, or a tagged line without the expected fields: keep verbatim.
        detokenized_data.append(line)

    with open(Path(output_path), "w", encoding="utf8") as fout:
        fout.writelines(detokenized_data)

def detokenize_best_run_test_data_translated(only_core_data):
    """Detokenize DataFiles/best_run_test_data_translated.txt.

    Input lines are dispatched on their first character: 'S' lines are decoded
    with the transliteration model, 'T', 'H' and 'D' lines with the translation
    model; underscores in decoded text become spaces.

    Args:
        only_core_data: When true, only "<gold>: " lines (from 'T') and
            "<predicted>: " lines (from 'H', followed by an extra newline) are
            written. When false, every line is written, with tagged lines
            keeping their leading tab-separated fields.
    """
    translit_sp = sentencepiece.SentencePieceProcessor()
    translit_sp.load(str(TOKEN_DIR / "transliteration_bpe.model"))

    transl_sp = sentencepiece.SentencePieceProcessor()
    transl_sp.load(str(TOKEN_DIR / "translation_bpe.model"))

    def restore(processor, pieces_text):
        # Decode space-separated pieces, then turn underscores into spaces.
        return processor.decode_pieces(pieces_text.split(" ")).replace("_", " ")

    with open(Path("DataFiles/best_run_test_data_translated.txt"), "r", encoding="utf8") as src:
        lines = src.readlines()

    keep_all = not only_core_data
    converted = []
    for raw in lines:
        tag = raw[0]
        if tag == 'S':
            if keep_all:
                fields = raw.split("\t", 1)
                converted.append(fields[0] + "\t" + restore(translit_sp, fields[1]))
        elif tag == 'T':
            fields = raw.split("\t", 1)
            label = "<gold>: " if only_core_data else fields[0] + "\t"
            converted.append(label + restore(transl_sp, fields[1]))
        elif tag in ('H', 'D'):
            fields = raw.split("\t", 2)
            if keep_all:
                converted.append(fields[0] + "\t" + fields[1] + "\t" + restore(transl_sp, fields[2]))
            elif tag == 'H':
                converted.append("<predicted>: " + restore(transl_sp, fields[2]) + "\n")
        elif keep_all:
            converted.append(raw)

    with open(Path("DataFiles/best_run_test_data_translated_detokenized.txt"), "w", encoding="utf8") as dst:
        dst.writelines(converted)


def run_tokenizer():
    """Tokenize the train, valid and test files of each of the three models."""
    # TODO: Compare signs_chars to signs_bpe
    jobs = (
        ("signs_char", (TRAIN_AK, VALID_AK, TEST_AK)),
        ("transliteration_bpe", (TRAIN_TR, VALID_TR, TEST_TR)),
        ("translation_bpe", (TRAIN_EN, VALID_EN, TEST_EN)),
    )
    for model_prefix, split_files in jobs:
        for split_file in split_files:
            tokenize(model_prefix, split_file)


def tokenize_transliteration_for_translation():
    """Tokenize FOR_TRANSLATION_TR with the transliteration model, dropping each line's prefix."""
    tokenize("transliteration_bpe", FOR_TRANSLATION_TR, should_remove_prefix=True)


def main():
    """Train the tokenizer models, then tokenize the train/valid/test files."""
    train_tokenizer()
    run_tokenizer()
    # Optional follow-up steps, disabled by default:
    # tokenize_transliteration_for_translation()
    # detokenize_atae_translated()
    # detokenize_best_run_test_data_translated(True)

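`run_tokenizer()` works properly: it runs `tokenize` 9 times (train/valid/test for each of the three models) and writes the tokenized files into the `tokenization` folder under the configured base directory (`DataFiles/NMT_input/tokenization` with the local paths used in this notebook; `NMT_input/not_divided_by_three_dots/tokenization` in the original layout).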

In [None]:
# Tokenize the train/valid/test files for the signs, transliteration and translation models.
run_tokenizer()

`detokenize_atae_translated()` reads `DataFiles/atae_translated.txt` and writes the detokenized result to `DataFiles/atae_translated_detokenized.txt`.

In [None]:
# The input path inside the function was changed from '../atae_translated.txt',
# which could not be found from this working directory.
detokenize_atae_translated()

In [None]:
# Also works. Passing True keeps only the core data: "<gold>: " lines (from 'T' lines)
# and "<predicted>: " lines (from 'H' lines) are written.
detokenize_best_run_test_data_translated(True)

Next I'm just running a quick test to make sure the code works by populating an empty file I randomly called "requirements" + this worked and filled the file.

In [None]:
def detokenize_atae_translated_test():
    """Smoke-test detokenization by writing to ``DataFiles/requirements.txt``.

    Reads ``DataFiles/atae_translated.txt`` and rewrites each line:

    * lines starting with ``S``: the text after the first tab is decoded with
      the ``transliteration_bpe`` model;
    * lines starting with ``H`` or ``D``: the text after the second tab is
      decoded with the ``translation_bpe`` model;
    * every other line is copied unchanged.
    """
    source_decoder = sentencepiece.SentencePieceProcessor()
    source_decoder.load(str(TOKEN_DIR / "transliteration_bpe.model"))

    target_decoder = sentencepiece.SentencePieceProcessor()
    target_decoder.load(str(TOKEN_DIR / "translation_bpe.model"))

    with open(Path("DataFiles/atae_translated.txt"), "r", encoding="utf8") as fin:
        raw_lines = fin.readlines()

    def _detokenize(raw_line):
        # Rebuild a single line, keeping its leading tab-separated fields as-is.
        if raw_line.startswith("S"):
            fields = raw_line.split("\t", 1)
            return fields[0] + "\t" + source_decoder.decode_pieces(fields[1].split(" "))
        if raw_line.startswith(("H", "D")):
            fields = raw_line.split("\t", 2)
            return fields[0] + "\t" + fields[1] + "\t" + target_decoder.decode_pieces(fields[2].split(" "))
        return raw_line

    # Decode everything before opening the output so a failure leaves it untouched.
    rebuilt_lines = [_detokenize(raw_line) for raw_line in raw_lines]

    with open(Path("DataFiles/requirements.txt"), "w", encoding="utf8") as fout:
        fout.writelines(rebuilt_lines)

In [None]:
# Run the smoke test; it writes its result to DataFiles/requirements.txt.
detokenize_atae_translated_test()

# III. Parameterization:

The code below is taken from Akkademia/run_slurm.sh, translated to Python with some edited paths. It currently creates the directories in your 'Files' tab (not in your Drive), but the path for OUT_DIR can easily be edited so that the directories are created inside Akkademia-master. For example, you could change the code to something like `OUT_DIR = f"/content/drive/MyDrive/Akkademia-master/result.LR_{LR}.MAX_TOKENS_{MAX_TOKENS}"` if you wanted the directories to be added to Akkademia-master.

In [None]:
import os

def run(LR, MAX_TOKENS):
    """Write the SLURM script for one hyper-parameter pair and submit it.

    The template ``DataFiles/akkadian_fairseq.template`` is read, its
    ``LR_PLACEHOLDER`` and ``MAX_TOKENS_PLACEHOLDER`` markers are replaced, and
    the result is written to
    ``result.LR_<LR>.MAX_TOKENS_<MAX_TOKENS>/akkadian_fairseq.slurm``.
    The directory name is printed, and the script is then submitted with
    ``sbatch`` from inside that directory.

    Args:
        LR: learning rate substituted for ``LR_PLACEHOLDER``.
        MAX_TOKENS: value substituted for ``MAX_TOKENS_PLACEHOLDER``.

    Raises:
        FileNotFoundError: if the template file is missing. Nothing is
            created on disk in that case.
    """
    # Read the template first so a missing template fails before any output
    # directory or empty .slurm file is left behind.
    with open("DataFiles/akkadian_fairseq.template", "r") as template_file:  # edited path to template file
        slurm_script = template_file.read()
    slurm_script = slurm_script.replace("LR_PLACEHOLDER", str(LR))
    slurm_script = slurm_script.replace("MAX_TOKENS_PLACEHOLDER", str(MAX_TOKENS))

    OUT_DIR = f"result.LR_{LR}.MAX_TOKENS_{MAX_TOKENS}"
    os.makedirs(OUT_DIR, exist_ok=True)
    with open(os.path.join(OUT_DIR, "akkadian_fairseq.slurm"), "w") as slurm_file:
        slurm_file.write(slurm_script)
    print(OUT_DIR)

    # sbatch only exists on SLURM clusters (not on Colab or a laptop), so
    # report that clearly instead of letting the shell fail.
    if shutil.which("sbatch") is None:
        print(f"sbatch not found on PATH; wrote {OUT_DIR}/akkadian_fairseq.slurm but did not submit it.")
        return

    # cwd= runs the submission inside OUT_DIR without changing the notebook's
    # own working directory, so an error cannot leave us in the wrong folder.
    completed = subprocess.run(["sbatch", "akkadian_fairseq.slurm"], cwd=OUT_DIR)
    if completed.returncode != 0:
        print(f"sbatch exited with status {completed.returncode} for {OUT_DIR}")

# Submit one job per (learning rate, max tokens) combination, LR-major order.
for LR, MAX_TOKENS in [(lr, max_tokens) for lr in [0.05, 0.1] for max_tokens in [4000, 8000]]:
    run(LR, MAX_TOKENS)

result.LR_0.05.MAX_TOKENS_4000
result.LR_0.05.MAX_TOKENS_8000
result.LR_0.1.MAX_TOKENS_4000
result.LR_0.1.MAX_TOKENS_8000


sh: sbatch: command not found
sh: sbatch: command not found
sh: sbatch: command not found
sh: sbatch: command not found


One concern is that the generated akkadian_fairseq.slurm files are themselves Linux/bash scripts (I believe), so I don't think they can be run directly from the notebook either. We might need to change some of the files so that the akkadian_fairseq.sh file runs properly from the notebook in the Translation step.

# IV. Translation

In this section we will prepare some texts for machine translation. The following format will allow for an iterative approach to the data input:

|id_text|lang|transliteration|cuneiform|translation|
|-------|----|:--------------|:--------|:----------|
|P123456|sux|1 ma-na ku3.babbar| 𒁹	𒈠𒈾 𒆬𒌓| (empty)|

And get the desired result:

|id_text|lang|transliteration|cuneiform|translation|
|-------|----|:--------------|:--------|:----------|
|P123456|sux|1 ma-na ku3.babbar| 𒁹	𒈠𒈾 𒆬𒌓| 1 mina silver|

In [None]:
!git clone https://github.com/gaigutherz/Akkademia.git
!cd Akkademia/trans_result.LR_0.1.MAX_TOKENS_4000 && cat checkpoint_best.pt.* > checkpoint_best.pt
!cd Akkademia/not_divided_by_three_dots_result.LR_0.1.MAX_TOKENS_4000 && cat checkpoint_best.pt.* > checkpoint_best.pt
!pip install sentencepiece
!git clone https://github.com/pytorch/fairseq
!cd fairseq && pip install ./
!chmod +x fairseq/fairseq_cli/interactive.py
!sed -i 's/#!\/usr\/bin\/env python3 -u/#!\/usr\/bin\/env python3/g' fairseq/fairseq_cli/interactive.py

Cloning into 'Akkademia'...
remote: Enumerating objects: 10676, done.[K
remote: Counting objects: 100% (170/170), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 10676 (delta 107), reused 152 (delta 93), pack-reused 10506 (from 1)[K
Receiving objects: 100% (10676/10676), 3.28 GiB | 4.04 MiB/s, done.
Resolving deltas: 100% (9878/9878), done.
Updating files: 100% (7381/7381), done.
Cloning into 'fairseq'...
remote: Enumerating objects: 35209, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 35209 (delta 68), reused 88 (delta 52), pack-reused 35083 (from 1)[K
Receiving objects: 100% (35209/35209), 25.23 MiB | 4.13 MiB/s, done.
Resolving deltas: 100% (25558/25558), done.
Processing /Users/christiankarren/Desktop/research_repo/FGCuneiform_MachineTranslation/fairseq
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparin

## 4.1 Load the Models

In [None]:
# Both tokenizer model files live in the same directory, relative to the working directory.
_TOKENIZATION_DIR = "DataFiles/NMT_input/tokenization"
TRANSLITERATION_MODEL_PATH = _TOKENIZATION_DIR + "/transliteration_bpe.model"
TRANSLATION_MODEL_PATH = _TOKENIZATION_DIR + "/translation_bpe.model"

In [None]:
def translation(line):
    """Tell whether ``line`` is a translation (hypothesis) line of the raw output.

    Args:
        line: one line of the translator's decoded stdout.

    Returns:
        ``True`` if the line starts with ``'H'``, ``False`` otherwise
        (including for an empty line).
    """
    if len(line) < 1:
        return False

    # Previously the function fell through and returned None for every
    # non-empty line, so callers filtering on it never kept anything.
    # 'H' lines are the ones treated as predictions elsewhere in this notebook.
    return line[0] == 'H'

def translate_transliteration(sentence):
    """Translate one transliterated sentence and return the detokenized text.

    The sentence is written to the temporary file ``transliteration.tmp``,
    passed to ``translate_transliteration_base``, and every output line
    accepted by ``translation`` is detokenized with the model at
    ``TRANSLATION_MODEL_PATH``.

    Args:
        sentence: transliteration text to translate.

    Returns:
        The concatenated detokenized translations; an empty string when no
        output line is accepted.
    """
    tmp_file = "transliteration.tmp"
    # utf8 matches the rest of the notebook's file I/O and keeps the
    # diacritics of transliterations intact regardless of the platform default.
    with open(tmp_file, "w", encoding="utf8") as f:
        f.write(sentence)

    try:
        # The second argument (True) is interpreted by
        # translate_transliteration_base; its meaning is not visible here.
        raw_result = translate_transliteration_base(tmp_file, True).stdout
    finally:
        # Always clean up the temporary file, even if translation fails.
        os.remove(tmp_file)

    return "".join(
        detokenize_translation(line, TRANSLATION_MODEL_PATH)
        for line in raw_result.decode().split('\n')
        if translation(line)
    )


#if __name__ == '__main__':
    # sentence = input("Please enter a transliteration sentence for translation\n")
   # print(translate_transliteration(sentence))

In [None]:
# List the tokenization directory to check which model files are present.
ls DataFiles/NMT_input/tokenization/

CODE_OF_CONDUCT.md         setup.py
CONTRIBUTING.md            signs_char.model
LICENSE                    signs_char.vocab
MANIFEST.in                t2e_translation_memory.en
README.md                  test.ak
RELEASE.md                 test.en
[34mdocs[m[m/                      test.tr
[34mexamples[m[m/                  [34mtests[m[m/
[34mfairseq[m[m/                   train.ak
[34mfairseq_cli[m[m/               train.en
fg.csv                     train.py
for_translation.tr         train.tr
hubconf.py                 translation_bpe.model
[34mhydra_plugins[m[m/             translation_bpe.vocab
my_signs_char.model        transliteration_bpe.model
my_signs_char.vocab        transliteration_bpe.vocab
pyproject.toml             valid.ak
release_utils.py           valid.en
[34mscripts[m[m/                   valid.tr
setup.cfg


# V. Final Assessment of the Translation

Here we use the outcome of the test / train / valid files to determine how well the Machine Translation worked for this group of texts.