In [1]:
import cltk
import os

from cltk.corpus.utils.importer import CorpusImporter
from collections import Counter
from cltk.corpus.akkadian.file_importer import FileImport
from cltk.corpus.akkadian.cdli_corpus import CDLICorpus
from cltk.corpus.akkadian.tokenizer import Tokenizer
from cltk.stem.akkadian.atf_converter import ATFConverter
from cltk.tokenize.word import WordTokenizer

In [2]:
# Download the CDLI corpus (a no-op if it is already present) and confirm
# that the ATF file is where we expect it.
#
# CLTK keeps imported corpora in a `cltk_data` directory under the user's home
# directory, so the path is anchored there rather than to the current working
# directory; otherwise the check below only passes when the notebook happens
# to be started from $HOME.
# NOTE(review): if your CLTK install is configured with a custom data
# directory, adjust the base path accordingly.

Akkadian = CorpusImporter('akkadian')
Akkadian.import_corpus('cdli_corpus')
file = os.path.join(os.path.expanduser('~'), 'cltk_data', 'akkadian', 'atf',
                    'cdli_corpus', 'cdliatf_unblocked.atf')
os.path.isfile(file)

True

In [3]:
# Load the corpus: read the raw ATF file into memory first, then hand the
# resulting lines to CDLICorpus for parsing. The parsed texts are looked up
# through `cc.catalog` in the following cell.

fi = FileImport(file)
fi.read_file()

cc = CDLICorpus()
cc.parse_file(fi.file_lines)

In [4]:
# Choose the text to analyse by its catalog key.
# 'P462830' is the Taylor Prism (RINAP 3.1.22); swap in another key to
# analyse a different text.

catalog_entry = cc.catalog['P462830']
selected_text = catalog_entry['transliteration']
# Uncomment to inspect the transliteration before analysing it:
# print(selected_text)

In [5]:
# Sign-frequency analysis over every sign value in the selected text.
# Each sign keeps the reading type given by CDLI
# (i.e. {x} = determinative, _x_ = Sumerian, x = Akkadian), so the same value
# used in different ways is counted separately
# (e.g. ('ma', 'sumerian') != ('ma', 'akkadian')).

atf = ATFConverter()
tk = Tokenizer()
wtk = WordTokenizer('akkadian')

# One tokenizer result per converted transliteration line.
lines = [tk.string_tokenizer(text, include_blanks=False)
         for text in atf.process(selected_text)]

# Only the first element of each result is tokenized into words. Results that
# came back empty (presumably for blank transliteration lines -- TODO confirm)
# are skipped, because indexing line[0] on them would raise IndexError.
words = [wtk.tokenize(line[0]) for line in lines if line]

# Flatten line -> word -> sign into a single list, preserving text order so
# that ties in the frequency table are broken the same way on every run.
toto_signs = [sign
              for line_words in words
              for word in line_words
              for sign in wtk.tokenize_sign(word)]

# The 15 most frequent (sign, type) pairs with their counts.
frequency_analysis = Counter(toto_signs).most_common(15)
print(frequency_analysis)

[(('a', 'akkadian'), 276), (('na', 'akkadian'), 258), (('ma', 'akkadian'), 196), (('meš', 'sumerian'), 187), (('ti', 'akkadian'), 171), (('u2', 'akkadian'), 144), (('i', 'akkadian'), 128), (('iri', 'determinative'), 113), (('la', 'akkadian'), 107), (('nu', 'akkadian'), 98), (('ri', 'akkadian'), 92), (('e', 'akkadian'), 92), (('ša', 'akkadian'), 87), (('šu2', 'akkadian'), 82), (('ni', 'akkadian'), 80)]
