In [73]:
from collections import Counter
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
from cltk.prosody.latin.macronizer import Macronizer
import cltk.corpus.latin

# CLTK helper objects, created once at module level.
# lemmatizer and jv_replacer are used when the per-author token lists are built.
lemmatizer = LemmaReplacer('latin')
# NOTE(review): macronizer is not referenced anywhere in the visible cells — confirm it is still needed.
macronizer = Macronizer('tag_ngram_123_backoff')
jv_replacer = JVReplacer()

# Name abbreviations that should not be treated as ordinary words.
# They are stored lower-cased; clean_entry matches entries against them verbatim.
nonword_name_abbreviations = [
    abbreviation.lower()
    for abbreviation in 'Ap,A,D,F,C,Cn,H,L,Mai,Mam,M,Min,N,Oct,P,Q,Seq,Ser,Sp,St,T,Ti,V,Vol,Vop'.split(',')
]

def clean_entry(entry):
    """Return a display form of ``entry``.

    An entry that exactly matches one of ``nonword_name_abbreviations`` is
    capitalised and given a trailing period (``'cn'`` -> ``'Cn.'``).
    Any other entry is returned with every non-alphabetic character removed.
    """
    if entry not in nonword_name_abbreviations:
        return ''.join(char for char in entry if char.isalpha())
    return entry.capitalize() + '.'

def get_corpus_files(author):
    """Return absolute paths of the Latin Library files for ``author``.

    A file is selected when its file id starts with the ``author`` string.
    The result is a list in the order the corpus reader yields the ids.
    """
    library = cltk.corpus.latin.latinlibrary
    return [
        library.abspath(fileid)
        for fileid in library.fileids()
        if fileid.startswith(author)
    ]

def merge_file_contents(paths):
    """Read every file in ``paths`` and return their contents concatenated.

    Args:
        paths: iterable of file paths, read in iteration order.

    Returns:
        A single string holding the text of all files back to back
        (no separator is inserted); ``''`` when ``paths`` is empty.

    Raises:
        OSError: if any of the files cannot be opened or read.
    """
    chunks = []
    # Iterate over the argument, not a notebook-level variable, so the
    # function works no matter what the calling cell named its list.
    for path in paths:
        # The context manager closes the handle even if read() raises.
        with open(path) as handle:
            chunks.append(handle.read())
    return ''.join(chunks)

In [188]:
# Authors whose corpus files are processed; the commented-out entries are skipped.
authors = (
    'caesar',
    'livy',
#     'cato',
#     'ovid',
#     'more',
)

# Maps each author name to a Counter of the tokens returned by the lemmatizer.
data = {}
for author in authors:
    freq_table = []  # NOTE(review): never read in this cell — looks like a leftover

    files = get_corpus_files(author)
    text = merge_file_contents(files)
    # NOTE(review): clean_text is not defined in the visible cells — presumably
    # defined elsewhere in the notebook; confirm it exists on a fresh kernel.
    text = clean_text(text)

    # Apply the JV replacer and lower-case the text before lemmatizing.
    tokens = lemmatizer.lemmatize(jv_replacer.replace(text).lower())
    N = len(tokens)  # not used again in this cell
    longest_token = max(tokens, key=len)  # not used again in this cell; max() raises ValueError if tokens is empty
    freq_count = Counter(tokens)
    data[author] = freq_count

In [189]:
def normalize(freq_table):
    """Return a copy of ``freq_table`` with every value divided by the sum of all values.

    The input mapping is left untouched; the result has the same type as
    whatever ``freq_table.copy()`` produces.
    """
    total = sum(freq_table.values())
    scaled = freq_table.copy()
    for lemma, count in freq_table.items():
        scaled[lemma] = count / total
    return scaled

def normalize_all(data):
    """Return a shallow copy of ``data`` in which each author's table has been passed through ``normalize``.

    ``data`` itself and the tables it holds are not modified.
    """
    normalized = data.copy()
    for author, table in data.items():
        normalized[author] = normalize(table)
    return normalized

def merge_all(data):
    """Add up the per-author tables in ``data`` into one new ``Counter``.

    Uses in-place Counter addition, so entries whose running total is not
    positive are dropped from the result.
    """
    combined = Counter()
    for table in data.values():
        combined += table
    return combined

def percent(n, decimal=2):
    """Format the fraction ``n`` as a percentage string.

    ``n`` is multiplied by 100, rendered with ``decimal`` digits after the
    decimal point, and suffixed with a percent sign.
    """
    scaled = n * 100
    number = '{:.{decimal}f}'.format(scaled, decimal=decimal)
    return number + '%'

def word_use_detail(word, author):
    """Return ``'<author>:<percentage>'`` for ``word`` in that author's table.

    The percentage is the word's value in ``data[author]`` divided by the sum
    of all values in that table, formatted with two decimal places.
    """
    counts = data[author]
    total = sum(counts.values())
    share = counts[word] / total
    return ':'.join([author, str(percent(share, 2))])

    

# print_counter_data prints rank 1 and then every PERIOD-th rank.
PERIOD = 100
# Number of most common entries print_counter_data walks through.
LAST_WORD = 10000

def print_counter_data(counter):
    """Print a sampled frequency table for ``counter``.

    Walks the ``LAST_WORD`` most common entries and prints a row for rank 1
    and for every ``PERIOD``-th rank. Each row shows the rank, the cleaned
    entry, the entry's share of the total, the cumulative share of all
    entries up to that rank, and the per-author detail for every name in
    ``authors``.

    Args:
        counter: a ``Counter`` of words to counts or to relative
            frequencies. Shares are computed against the sum of its values,
            so both raw and normalised tables print correct percentages.
    """
    total = sum(counter.values())
    coverage = 0
    for rank, (word, freq) in enumerate(counter.most_common(LAST_WORD), start=1):
        # Use the share of the total for the frequency column as well as for
        # coverage; formatting the raw value is only right when the counter
        # has already been normalised.
        share = freq / total
        coverage += share
        if rank == 1 or rank % PERIOD == 0:
            details = ' '.join(word_use_detail(word, author) for author in authors)
            print(
                '%5s' % rank,
                '%-15s' % clean_entry(word),
                '%6s' % percent(share, 3),
                '(%5s)' % percent(coverage, 1),
                details,
            )

# Pool the tables of all authors, scale the pooled table by its total, and print the sampled rows.
normalized_combined_freq_table = normalize(merge_all(data))
print_counter_data(normalized_combined_freq_table)

    1 qui             2.744% ( 2.7%) caesar:3.30% livy:2.62%
  100 copia           0.147% (41.8%) caesar:0.35% livy:0.10%
  200 deus            0.080% (52.7%) caesar:0.01% livy:0.09%
  300 libertas        0.056% (59.3%) caesar:0.03% livy:0.06%
  400 hispania        0.040% (64.1%) caesar:0.02% livy:0.04%
  500 quiesco         0.033% (67.7%) caesar:0.01% livy:0.04%
  600 semper          0.027% (70.7%) caesar:0.03% livy:0.03%
  700 rursus          0.023% (73.2%) caesar:0.06% livy:0.02%
  800 dives           0.020% (75.3%) caesar:0.00% livy:0.02%
  900 effero          0.017% (77.2%) caesar:0.02% livy:0.02%
 1000 licinus         0.015% (78.8%) caesar:0.00% livy:0.02%
 1100 initio          0.014% (80.2%) caesar:0.02% livy:0.01%
 1200 contemno        0.012% (81.6%) caesar:0.01% livy:0.01%
 1300 consulto        0.011% (82.7%) caesar:0.00% livy:0.01%
 1400 sperno          0.010% (83.7%) caesar:0.00% livy:0.01%
 1500 vinum           0.009% (84.6%) caesar:0.00% livy:0.01%
 1600 admoneo         0.