In [2]:
from collections import Counter
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
from cltk.prosody.latin.macronizer import Macronizer
import cltk.corpus.latin

# Shared CLTK helper objects, created once at notebook level and reused below.
# Lemmatizer configured with the 'latin' language argument.
lemmatizer = LemmaReplacer('latin')
# NOTE(review): `macronizer` is not referenced by any cell visible here — confirm
# whether it is still needed before paying its construction cost.
macronizer = Macronizer('tag_ngram_123_backoff')
# Applied to the cleaned text before it is lower-cased and lemmatized.
jv_replacer = JVReplacer()

# Characters removed from the raw corpus text: ASCII digits and punctuation.
# The backslash is spelled as an explicit `\\` escape because a bare backslash
# followed by `:` is not a valid escape sequence and triggers a warning on
# newer Python versions; the resulting string value is backslash + colon.
skip_chars = '0123456789`~!@#$%^&*()))_+-={}[]|\\:;"\'<>?,./'

def clean_text(text):
    """Return `text` with every character listed in `skip_chars` deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string; characters not in `skip_chars` (including whitespace)
        are kept unchanged and in their original order.
    """
    # The three-argument form of str.maketrans builds a deletion-only table.
    return text.translate(str.maketrans('', '', skip_chars))

# Lower-cased abbreviations that clean_entry renders as "Xx." instead of
# treating them as ordinary words.
nonword_name_abbreviations = [
    abbreviation.lower()
    for abbreviation in 'Ap,A,D,F,C,Cn,H,L,Mai,Mam,M,Min,N,Oct,P,Q,Seq,Ser,Sp,St,T,Ti,V,Vol,Vop'.split(',')
]

def clean_entry(entry):
    """Format a single table entry for display.

    An entry found in `nonword_name_abbreviations` is returned capitalised
    with a trailing period; any other entry is returned with every
    non-alphabetic character removed.
    """
    if entry not in nonword_name_abbreviations:
        return ''.join(ch for ch in entry if ch.isalpha())
    return entry.capitalize() + '.'

def get_corpus_files(author):
    """Return absolute paths of the Latin Library files belonging to `author`.

    A file belongs to the author when its corpus file id starts with the
    `author` string.
    """
    library = cltk.corpus.latin.latinlibrary
    return [
        library.abspath(fileid)
        for fileid in library.fileids()
        if fileid.startswith(author)
    ]

def merge_file_contents(paths):
    """Read every file in `paths` and return their contents concatenated.

    Args:
        paths: Iterable of file paths, read in iteration order with the
            platform default text encoding.

    Returns:
        A single string holding the files' contents back to back (no
        separator is inserted); an empty string when `paths` is empty.
    """
    chunks = []
    # Iterate the argument itself, not a notebook-level variable, so the
    # function works regardless of which cells have been run.
    for path in paths:
        # `with` closes the handle even if reading raises.
        with open(path) as f:
            chunks.append(f.read())
    # Joining once avoids repeatedly copying a growing string.
    return ''.join(chunks)


In [3]:
# Author prefixes to analyse. Each one is matched against corpus file ids with
# startswith() and the tuple is also joined with '+' to name the report file.
authors = (
    'caesar',
)

# Maps each author to a Counter of that author's lemmatized tokens.
data = {}
for author in authors:
    # NOTE(review): `freq_table` is assigned here but never read in the visible cells.
    freq_table = []

    # Load and concatenate the author's corpus files, then strip digits/punctuation.
    files = get_corpus_files(author)
    text = merge_file_contents(files)
    text = clean_text(text)

    # j/v replacement -> lower-casing -> lemmatization.
    # Presumably `lemmatize` returns a list of lemma strings (it is passed to
    # len(), max(..., key=len) and Counter below) — TODO confirm against CLTK.
    tokens = lemmatizer.lemmatize(jv_replacer.replace(text).lower())
    # NOTE(review): `N` and `longest_token` are not read again in the visible
    # cells; max() raises ValueError if `tokens` is empty.
    N = len(tokens)
    longest_token = max(tokens, key=len)
    freq_count = Counter(tokens)
    data[author] = freq_count

In [12]:
def normalize(freq_table):
    """Return a copy of `freq_table` with each value divided by the table total.

    The input mapping is left untouched; the result has the same type as the
    input because it starts life as `freq_table.copy()`.
    """
    total = sum(freq_table.values())
    scaled = freq_table.copy()
    for lemma, count in freq_table.items():
        scaled[lemma] = count / total
    return scaled

def normalize_all(data):
    """Return a shallow copy of `data` whose per-author tables are normalized.

    Each value is replaced by `normalize(value)`; the mapping passed in is not
    modified.
    """
    normalized = data.copy()
    for author, table in data.items():
        normalized[author] = normalize(table)
    return normalized

def merge_all(data):
    """Add together every per-author table in `data` into one Counter.

    Tables are accumulated with Counter's in-place `+=`, in the mapping's
    iteration order.
    """
    combined = Counter()
    for table in data.values():
        combined += table
    return combined

def percent(n, decimal=2):
    """Format the fraction `n` as a percentage string.

    `n` is multiplied by 100, rendered in fixed-point notation with `decimal`
    digits after the point, and suffixed with '%'.
    """
    scaled = n * 100
    number = '{:.{decimal}f}'.format(scaled, decimal=decimal)
    return ''.join((number, '%'))

def word_use_detail(word, author):
    """Describe how often `author` uses `word`, e.g. 'cae:1.23%'.

    The label is the first three characters of `author`; the figure is the
    word's count divided by the author's total count in the notebook-level
    `data` mapping, formatted with two decimals.
    """
    counts = data[author]
    total = sum(counts.values())
    share = percent(counts[word] / total, 2)
    return ''.join([author[0:3], ':', str(share)])

# Report settings read by print_counter_data.
# Row 1 is always shown; after that only rows whose rank is a multiple of PERIOD.
PERIOD = 1
# Maximum number of entries requested from Counter.most_common().
LAST_WORD = 2000
# When True, each row is followed by the per-author usage details.
SHOW_DETAILS = False


def print_counter_data(counter):
    """Print a ranked frequency report and write the same lines to a text file.

    For the `LAST_WORD` most common entries of `counter`, each reported row
    shows the rank, the cleaned entry, its share of the table total and the
    cumulative coverage so far. Row 1 and every `PERIOD`-th row are reported;
    coverage still accumulates over skipped rows. When `SHOW_DETAILS` is true
    the per-author breakdown from `word_use_detail` is appended.

    Shares are computed as value / table total, so both raw-count and
    already-normalized tables produce percentages.

    Args:
        counter: A Counter-like object providing `values()` and
            `most_common(n)`.

    Side effects:
        Prints each row and (over)writes '<authors joined by "+">.txt' in the
        current working directory, using the notebook-level `authors` tuple.
    """
    total = sum(counter.values())
    coverage = 0
    # `with` guarantees the report file is closed even if a row fails to format.
    with open('+'.join(authors)+'.txt', 'w') as f:
        for index, (word, freq) in enumerate(counter.most_common(LAST_WORD), start=1):
            share = freq / total
            coverage += share
            if index != 1 and index % PERIOD != 0:
                continue

            # Only pay for the per-author lookup when it will actually be shown.
            if SHOW_DETAILS:
                details = ' '.join(word_use_detail(word, author) for author in authors)
            else:
                details = ''
            # The final field is kept even when empty so the row layout
            # (including its trailing space) does not change.
            line = ' '.join((
                '%5s' % index,
                '%-15s' % clean_entry(word),
                '%6s' % percent(share, 3),
                '(%5s)' % percent(coverage, 1),
                details,
            ))
            print(line)
            f.write(line+'\n')
# Pool every author's counts into one table, convert it to relative
# frequencies, then print the report and write it to the output file.
normalized_combined_freq_table = normalize(merge_all(data))
print_counter_data(normalized_combined_freq_table)

    1 qui             3.303% ( 3.3%) 
    2 in              2.327% ( 5.6%) 
    3 et              2.042% ( 7.7%) 
    4 is              1.549% ( 9.2%) 
    5 ad              1.540% (10.8%) 
    6 sui             1.266% (12.0%) 
    7 cum             1.253% (13.3%) 
    8 atque           1.251% (14.5%) 
    9 ab              1.193% (15.7%) 
   10 edo             1.146% (16.9%) 
   11 hic             1.127% (18.0%) 
   12 sum             1.099% (19.1%) 
   13 ex              1.050% (20.1%) 
   14 ut              0.891% (21.0%) 
   15 magnus          0.721% (21.8%) 
   16 non             0.664% (22.4%) 
   17 omne            0.661% (23.1%) 
   18 facio           0.623% (23.7%) 
   19 eo              0.593% (24.3%) 
   20 possum          0.578% (24.9%) 
   21 neque           0.546% (25.4%) 
   22 caesar          0.522% (25.9%) 
   23 legio           0.499% (26.4%) 
   24 hostis          0.472% (26.9%) 
   25 ito             0.468% (27.4%) 
   26 noster          0.455% (27.8%) 
   27 suo   

  658 inferus         0.025% (79.0%) 
  659 suscipio        0.025% (79.0%) 
  660 potior          0.025% (79.0%) 
  661 vetus           0.025% (79.0%) 
  662 sentio          0.025% (79.1%) 
  663 judicium        0.025% (79.1%) 
  664 initio          0.025% (79.1%) 
  665 permaneo        0.025% (79.1%) 
  666 legatio         0.025% (79.2%) 
  667 insidiae        0.025% (79.2%) 
  668 inimicus        0.025% (79.2%) 
  669 aequus          0.025% (79.2%) 
  670 fama            0.025% (79.3%) 
  671 attribuo        0.025% (79.3%) 
  672 commemoro       0.025% (79.3%) 
  673 quartum         0.025% (79.3%) 
  674 thos            0.025% (79.4%) 
  675 impetro         0.025% (79.4%) 
  676 varus           0.025% (79.4%) 
  677 pedes           0.025% (79.4%) 
  678 hiemo           0.025% (79.5%) 
  679 trans           0.025% (79.5%) 
  680 turma           0.025% (79.5%) 
  681 mora            0.024% (79.5%) 
  682 romanum         0.024% (79.6%) 
  683 familia         0.024% (79.6%) 
  684 succed