In [None]:
from collections import Counter
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
from cltk.prosody.latin.macronizer import Macronizer
import cltk.corpus.latin

# Shared CLTK helper objects, built once and reused by the cells below.
# Lemmatizer constructed for 'latin'; its lemmatize() is applied to each author's text.
lemmatizer = LemmaReplacer('latin')
# Macronizer built with 'tag_ngram_123_backoff' (presumably the tagger to use --
# confirm against the CLTK docs). Not referenced in the cells visible here.
macronizer = Macronizer('tag_ngram_123_backoff')
# JVReplacer instance; its replace() is run on the text before lemmatizing.
jv_replacer = JVReplacer()

# Characters removed from raw text by clean_text: ASCII digits and punctuation.
# The backslash is spelled `\\` because `\:` is not a valid escape sequence
# and makes the interpreter emit a warning when the cell is compiled.
skip_chars = '0123456789`~!@#$%^&*()))_+-={}[]|\\:;"\'<>?,./'

def clean_text(text):
    """Return ``text`` with every character listed in ``skip_chars`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string; characters not in ``skip_chars`` (letters, whitespace,
        non-ASCII punctuation, ...) are kept unchanged and in order.
    """
    # skip_chars is looked up at call time, so rebinding it in another cell
    # takes effect on the next call.
    return text.translate(str.maketrans('', '', skip_chars))

# Lower-cased abbreviations that clean_entry renders as a capitalized
# abbreviation with a trailing period instead of as an ordinary word.
nonword_name_abbreviations = [
    abbreviation.lower()
    for abbreviation in 'Ap,A,D,F,C,Cn,H,L,Mai,Mam,M,Min,N,Oct,P,Q,Seq,Ser,Sp,St,T,Ti,V,Vol,Vop'.split(',')
]

def clean_entry(entry):
    """Format a table entry for display.

    Args:
        entry: The entry string (matched case-sensitively against the
            lower-cased abbreviation list).

    Returns:
        ``entry`` capitalized with a trailing ``'.'`` when it is one of
        ``nonword_name_abbreviations``; otherwise ``entry`` with every
        non-alphabetic character dropped.
    """
    if entry not in nonword_name_abbreviations:
        return ''.join(ch for ch in entry if ch.isalpha())
    return entry.capitalize() + '.'

def get_corpus_files(author):
    """List the Latin Library corpus files belonging to ``author``.

    Args:
        author: Prefix that a corpus file id must start with to be selected.

    Returns:
        A list with the corpus reader's ``abspath()`` result for every file id
        starting with ``author``, in the order ``fileids()`` yields them.
    """
    reader = cltk.corpus.latin.latinlibrary
    return [
        reader.abspath(fileid)
        for fileid in reader.fileids()
        if fileid.startswith(author)
    ]

def merge_file_contents(paths):
    """Read every file in ``paths`` and return their contents concatenated.

    Files are opened in text mode with the platform default encoding.

    Args:
        paths: Iterable of file paths, read in iteration order.

    Returns:
        One string holding the files' text back to back with no separator
        inserted; ``''`` when ``paths`` is empty.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    contents = []
    # Iterate the `paths` argument (not a notebook-level variable) so the
    # result depends only on what the caller passed in.
    for path in paths:
        # `with` closes the handle even if read() raises.
        with open(path) as f:
            contents.append(f.read())
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(contents)


In [None]:
# Author name prefixes; get_corpus_files selects the corpus files whose id
# starts with each of these.
authors = (
    'caesar',
    'livy',
    'ovid',
    'cicero',
    'galileo',
    'bacon',
    'more',
)

# data maps author -> Counter of that author's lemmatized tokens.
data = {}
for author in authors:
    # NOTE(review): freq_table, N and longest_token below are assigned but not
    # read by any cell visible here; they remain as notebook-level names.
    freq_table = []

    # Load and concatenate the author's files, then strip digits/punctuation.
    files = get_corpus_files(author)
    text = merge_file_contents(files)
    text = clean_text(text)

    # Pipeline: jv_replacer.replace() -> lower-case -> lemmatize.
    tokens = lemmatizer.lemmatize(jv_replacer.replace(text).lower())
    N = len(tokens)
    # max() raises ValueError when tokens is empty (e.g. no text was loaded
    # for this author).
    longest_token = max(tokens, key=len)
    freq_count = Counter(tokens)
    data[author] = freq_count

In [None]:
def normalize(freq_table):
    """Return a copy of ``freq_table`` with each value divided by the total.

    Args:
        freq_table: Mapping of lemma -> count (e.g. a ``Counter``).

    Returns:
        A new mapping produced by ``freq_table.copy()`` whose values are the
        original values divided by their sum. The input is left untouched.
    """
    total = sum(freq_table.values())
    normalized = freq_table.copy()
    for lemma, count in freq_table.items():
        normalized[lemma] = count / total
    return normalized

def normalize_all(data):
    """Normalize every author's frequency table.

    Args:
        data: Mapping of author -> frequency table.

    Returns:
        A shallow copy of ``data`` in which each table has been replaced by
        ``normalize(table)``. ``data`` itself is not modified.
    """
    normalized = data.copy()
    for author, table in data.items():
        normalized[author] = normalize(table)
    return normalized

def merge_all(data):
    """Add all per-author tables together into one ``Counter``.

    Args:
        data: Mapping of author -> frequency table.

    Returns:
        A new ``Counter`` built by in-place adding (``+=``) every table in
        ``data`` to an empty ``Counter``; empty when ``data`` is empty.
    """
    combined = Counter()
    for table in data.values():
        combined += table
    return combined

def percent(n, decimal=2):
    """Format the fraction ``n`` as a percentage string.

    Args:
        n: Number to format; it is multiplied by 100 (so 1.0 -> 100%).
        decimal: Number of digits after the decimal point.

    Returns:
        The fixed-point rendering of ``n * 100`` followed by ``'%'``.
    """
    scaled = n * 100
    formatted = '{:.{decimal}f}'.format(scaled, decimal=decimal)
    return formatted + '%'

def word_use_detail(word, author):
    """Describe how often ``author`` uses ``word``.

    Reads the notebook-level ``data`` mapping.

    Args:
        word: Key to look up in the author's table.
        author: Key into ``data``.

    Returns:
        The first three characters of ``author``, a colon, and the word's
        share of the author's total formatted by ``percent`` with 2 decimals.
    """
    counts = data[author]
    total = sum(counts.values())
    share = counts[word] / total
    return author[:3] + ':' + str(percent(share, 2))

# Print every PERIOD-th ranked entry (rank 1 is always printed).
PERIOD = 1
# Number of most common entries print_counter_data walks through.
LAST_WORD = 2000
# When true, each printed row also carries the per-author breakdown.
SHOW_DETAILS = True

def print_counter_data(counter):
    """Print a ranked frequency table for the most common entries of ``counter``.

    Walks ``counter.most_common(LAST_WORD)`` and, for rank 1 and every rank
    divisible by ``PERIOD``, prints one row: the rank, the entry passed
    through ``clean_entry``, its value formatted by ``percent``, and the
    cumulative coverage so far. When ``SHOW_DETAILS`` is true the row also
    carries the per-author breakdown from ``word_use_detail``.

    Args:
        counter: ``Counter`` mapping word -> frequency. Values are handed to
            ``percent`` unchanged, so they are presumably already normalized
            fractions -- confirm for callers other than the one in this cell.

    Returns:
        None. Output goes to stdout via ``print``.
    """
    total = sum(counter.values())
    coverage = 0
    for rank, (word, freq) in enumerate(counter.most_common(LAST_WORD), start=1):
        # Coverage accumulates on every row, including rows that are skipped.
        coverage += freq / total
        if rank != 1 and rank % PERIOD != 0:
            continue

        # Build the breakdown only when it will be shown: every
        # word_use_detail call looks up and totals a whole author table.
        if SHOW_DETAILS:
            details = ' '.join([word_use_detail(word, author) for author in authors])
        else:
            details = ''
        print(
            '%5s' % rank,
            '%-15s' % clean_entry(word),
            '%6s' % percent(freq, 3),
            '(%5s)' % percent(coverage, 1),
            details,
        )

# Pool the per-author counts in `data`, convert the pooled counts to relative
# frequencies, and print the ranked table.
normalized_combined_freq_table = normalize(merge_all(data))
print_counter_data(normalized_combined_freq_table)