In [73]:
from collections import Counter
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
from cltk.prosody.latin.macronizer import Macronizer
import cltk.corpus.latin

# Shared CLTK helper objects, constructed once and reused by the cells below.
lemmatizer = LemmaReplacer('latin')
# NOTE(review): macronizer is not referenced in the cells visible here -- confirm it is still needed.
macronizer = Macronizer('tag_ngram_123_backoff')
jv_replacer = JVReplacer()

# Lower-cased abbreviations that clean_entry() renders as "Xx." instead of
# stripping down to letters. Presumably Roman praenomen abbreviations -- confirm.
nonword_name_abbreviations = 'Ap,A,D,F,C,Cn,H,L,Mai,Mam,M,Min,N,Oct,P,Q,Seq,Ser,Sp,St,T,Ti,V,Vol,Vop'.lower().split(',')

def clean_entry(entry):
    """Return a display form of a table entry.

    Entries listed in ``nonword_name_abbreviations`` come back capitalized
    with a trailing period; any other entry comes back with every
    non-alphabetic character removed.
    """
    if entry not in nonword_name_abbreviations:
        # Ordinary entry: keep only the alphabetic characters.
        return ''.join(ch for ch in entry if ch.isalpha())
    return entry.capitalize() + '.'

def get_corpus_files(author):
    """Return absolute paths of the Latin Library files whose id starts with ``author``.

    The result is a list, in the order the corpus reader reports its file ids.
    """
    reader = cltk.corpus.latin.latinlibrary
    return [
        reader.abspath(fileid)
        for fileid in reader.fileids()
        if fileid.startswith(author)
    ]

def merge_file_contents(paths):
    """Read every file in ``paths`` and return their contents concatenated.

    Parameters
    ----------
    paths : iterable of path-like
        Files to read, in order. Each is opened in text mode with the
        platform default encoding.

    Returns
    -------
    str
        The file contents joined with no separator; '' when ``paths`` is empty.
    """
    chunks = []
    # Iterate the argument itself rather than a notebook-level global, so the
    # function works regardless of which cells have been run.
    for path in paths:
        # The context manager closes the handle even if read() raises.
        with open(path) as f:
            chunks.append(f.read())
    return ''.join(chunks)

In [196]:
# Author prefixes used to select Latin Library files (matched with
# str.startswith against the corpus file ids in get_corpus_files).
authors = (
    'caesar',
    'livy',
    'ovid',
    'cicero',
    'galileo',
    'bacon',
    'more',
)

# Maps each author prefix to a Counter of lemma frequencies.
# Every name assigned inside the loop is a notebook-level global and stays
# visible to later cells after the loop finishes.
data = {}
for author in authors:
    # NOTE(review): freq_table is assigned but never used in this cell.
    freq_table = []

    files = get_corpus_files(author)
    text = merge_file_contents(files)
    # NOTE(review): clean_text is not defined in any cell visible here; this
    # cell fails on a fresh kernel unless another cell defines it -- confirm.
    text = clean_text(text)

    # Apply the j/v replacer, lower-case, then lemmatize.
    tokens = lemmatizer.lemmatize(jv_replacer.replace(text).lower())
    # NOTE(review): N and longest_token are computed but not used in this cell.
    N = len(tokens)
    # max() raises ValueError when tokens is empty (e.g. no files matched).
    longest_token = max(tokens, key=len)
    freq_count = Counter(tokens)
    data[author] = freq_count

In [198]:
def normalize(freq_table):
    """Return a copy of ``freq_table`` with every value divided by the sum of all values.

    The input mapping is left unmodified; the result has the same type as
    whatever ``freq_table.copy()`` produces.
    """
    total = sum(freq_table.values())
    scaled = freq_table.copy()
    for key, count in freq_table.items():
        scaled[key] = count / total
    return scaled

def normalize_all(data):
    """Return a shallow copy of ``data`` in which each table has been passed through normalize().

    ``data`` itself and the tables it holds are not modified.
    """
    normalized = data.copy()
    for author, table in data.items():
        normalized[author] = normalize(table)
    return normalized

def merge_all(data):
    """Add up every per-author table in ``data`` into one new Counter.

    Uses Counter's in-place addition, so entries whose summed count is not
    positive are dropped from the result.
    """
    combined = Counter()
    for table in data.values():
        combined += table
    return combined

def percent(n, decimal=2):
    """Format the fraction ``n`` as a percentage string with ``decimal`` digits after the point.

    For example, 0.5 becomes '50.00%'.
    """
    spec = '.{}f'.format(decimal)
    return format(n * 100, spec) + '%'

def word_use_detail(word, author):
    """Return '<author>:<pct>' where pct is ``word``'s share of that author's total count.

    Reads the per-author table from the notebook-level ``data`` mapping and
    formats the share with two decimals via percent().
    """
    counts = data[author]
    total = sum(counts.values())
    share = counts[word] / total
    return author + ':' + str(percent(share, 2))

    

# Settings read by print_counter_data():
# rank 1 and every PERIOD-th rank after it get a printed row.
PERIOD = 100
# How many of the most common entries are walked in total.
LAST_WORD = 5000
# When True, each printed row also carries the per-author breakdown.
SHOW_DETAILS = False

def print_counter_data(counter):
    """Print a sampled rank/frequency table for ``counter``.

    Walks the ``LAST_WORD`` most common entries and prints a row for rank 1
    and for every rank divisible by ``PERIOD``. Each row shows the rank, the
    cleaned entry, the entry's share of the table total, the cumulative
    share of all entries up to that rank, and -- when ``SHOW_DETAILS`` is
    true -- the per-author usage from word_use_detail().

    Parameters
    ----------
    counter : collections.Counter
        Frequency table; values may be raw counts or already-normalized
        fractions, since every value is divided by the table total here.

    Raises
    ------
    ZeroDivisionError
        If the table is non-empty but its values sum to zero.
    """
    total = sum(counter.values())
    coverage = 0
    for rank, (word, freq) in enumerate(counter.most_common(LAST_WORD), start=1):
        # Use the same normalized share for the row and for the running
        # coverage so raw-count tables print sensible percentages too.
        share = freq / total
        coverage += share
        if rank != 1 and rank % PERIOD != 0:
            continue

        # Only build the per-author breakdown when it will actually be shown.
        details = ''
        if SHOW_DETAILS:
            details = ' '.join(word_use_detail(word, author) for author in authors)
        print(
            '%5s' % rank,
            '%-15s' % clean_entry(word),
            '%6s' % percent(share, 3),
            '(%5s)' % percent(coverage, 1),
            details,
        )

# Pool every author's counts into one table, scale it so the values sum to 1,
# and print the sampled rank/coverage table for the combined corpus.
normalized_combined_freq_table = normalize(merge_all(data))
print_counter_data(normalized_combined_freq_table)

    1 qui             3.461% ( 3.5%) 
  100 locus           0.126% (45.1%) 
  200 annus           0.073% (54.5%) 
  300 summum          0.047% (60.3%) 
  400 romanis         0.037% (64.5%) 
  500 opinio          0.030% (67.7%) 
  600 intersum        0.025% (70.5%) 
  700 traho           0.021% (72.7%) 
  800 ignosco         0.018% (74.6%) 
  900 pulcher         0.015% (76.3%) 
 1000 caelo           0.014% (77.7%) 
 1100 descendo        0.012% (79.0%) 
 1200 spatior         0.011% (80.2%) 
 1300 labor           0.010% (81.2%) 
 1400 innocens        0.009% (82.1%) 
 1500 cora            0.008% (83.0%) 
 1600 municipium      0.007% (83.8%) 
 1700 opportunus      0.007% (84.5%) 
 1800 vicis           0.006% (85.2%) 
 1900 sollicitus      0.006% (85.8%) 
 2000 circumdo        0.006% (86.4%) 
 2100 radix           0.005% (86.9%) 
 2200 delectatio      0.005% (87.4%) 
 2300 uoluisse        0.004% (87.8%) 
 2400 matrona         0.004% (88.3%) 
 2500 subjecto        0.004% (88.7%) 
 2600 fundit