In [52]:
from collections import Counter
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
import cltk.corpus.latin

# Shared JVReplacer instance. Within this notebook it is only referenced by a
# commented-out tokenization line in the corpus-processing cell.
# NOTE(review): presumably normalizes j/v spellings - confirm against CLTK docs.
jv_replacer = JVReplacer()

# Characters stripped from raw corpus text: ASCII digits and punctuation.
# The backslash is spelled as an explicit `\\` escape; a bare backslash before
# `:` is an invalid escape sequence that newer Python versions warn about.
skip_chars = '0123456789`~!@#$%^&*()))_+-={}[]|\\:;"\'<>?,./'

def clean_text(text):
    """Return `text` with every character listed in `skip_chars` removed.

    All other characters (letters, whitespace, non-ASCII symbols) are kept.
    The deletion table is built on each call, so reassigning `skip_chars`
    in a later cell takes effect without redefining this function.
    """
    return text.translate(str.maketrans('', '', skip_chars))

# Lower-cased name abbreviations; clean_entry() renders a matching entry
# capitalized with a trailing period instead of as a plain word.
nonword_name_abbreviations = [
    abbreviation.lower()
    for abbreviation in 'Ap,A,D,F,C,Cn,H,L,Mai,Mam,M,Min,N,Oct,P,Q,Seq,Ser,Sp,St,T,Ti,V,Vol,Vop'.split(',')
]

def clean_entry(entry):
    """Format a table entry for display.

    An entry found in `nonword_name_abbreviations` is returned capitalized
    with a trailing '.'; any other entry is returned with all non-alphabetic
    characters removed.
    """
    if entry not in nonword_name_abbreviations:
        return ''.join(ch for ch in entry if ch.isalpha())
    return entry.capitalize() + '.'

def get_corpus_files(author):
    """List the Latin Library files belonging to `author`.

    A file belongs to the author when its file id starts with `author`.
    Returns a list with the result of the corpus reader's `abspath()` for
    each matching file id, in the order `fileids()` yields them.
    """
    corpus = cltk.corpus.latin.latinlibrary
    return [
        corpus.abspath(fileid)
        for fileid in corpus.fileids()
        if fileid.startswith(author)
    ]

def merge_file_contents(paths):
    """Read every file in `paths` and return the contents joined in order.

    Files are opened in text mode with the platform's default encoding.
    Returns an empty string when `paths` is empty.
    """
    chunks = []
    for path in paths:
        # `with` closes the handle even if read() raises (e.g. a decode error).
        with open(path) as f:
            chunks.append(f.read())
    # A single join avoids repeatedly copying a growing string.
    return ''.join(chunks)


In [53]:
# We need to import a data model to train the lemmatizer.

import os
from cltk.utils.file_operations import open_pickle

# Locate the pickled training sentences inside the user's cltk_data directory.
rel_path = '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff'
path = os.path.expanduser(rel_path)
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)

# Load the training sentences; if the pickle is missing, report it and
# continue with an empty list.
if not os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
else:
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)

# Build the CLTK Latin backoff lemmatizer from the sentences loaded above.
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

In [61]:
from cltk.tokenize.word import WordTokenizer

# Authors to analyse. Each name is matched as a prefix of the corpus file ids
# by get_corpus_files().
authors = (
    'caesar',
)
word_tokenizer = WordTokenizer('latin')

# Maps each author to a Counter of lemma occurrences over that author's texts.
data = {}
for author in authors:
    freq_table = []  # not read in this cell; kept in case another cell relies on it

    files = get_corpus_files(author)
    text = merge_file_contents(files)
    text = clean_text(text)

    # j/v replacement is currently disabled; the alternative tokenization was:
    #     tokens = word_tokenizer.tokenize(jv_replacer.replace(text).lower())
    tokens = word_tokenizer.tokenize(text.lower())
    lemmataPairs = lemmatizer.lemmatize(tokens)
    # Element 1 of each pair is taken as the lemma.
    lemmata = [pair[1] for pair in lemmataPairs]
    N = len(lemmata)
    # default='' stops max() raising ValueError when an author yields no
    # lemmata (for example, when no corpus file matches the author prefix).
    longest_token = max(lemmata, key=len, default='')
    freq_count = Counter(lemmata)
    data[author] = freq_count

In [62]:
def percent(n, decimal=2):
    """Scale the fraction `n` by 100 and round it to `decimal` places."""
    scaled = n * 100
    return round(scaled, decimal)

def normalize(freq_table):
    """Return a copy of `freq_table` with each value divided by the sum of all values.

    The input mapping is left unmodified; the copy comes from its own
    `copy()` method, so the returned object has the type that method produces.
    """
    total = sum(freq_table.values())
    result = freq_table.copy()
    for lemma, count in freq_table.items():
        result[lemma] = count / total
    return result

def normalize_all(data):
    """Return a shallow copy of `data` in which every author's table has been
    replaced by the result of normalize() on it."""
    result = data.copy()
    result.update({author: normalize(table) for author, table in data.items()})
    return result

def merge_all(data):
    """Add together the per-author tables in `data` and return one Counter."""
    merged = Counter()
    for table in data.values():
        merged += table
    return merged

def word_use_detail(word, author):
    """Describe how much of `author`'s table is taken up by `word`.

    Returns the first three characters of the author name, a colon, and the
    word's share of that author's total as a percentage with two decimals.
    Reads the module-level `data` mapping.
    """
    counts = data[author]
    total = sum(counts.values())
    share = percent(counts[word] / total, 2)
    return author[:3] + ':' + str(share)

# print_counter_data() emits rank 1 and every rank divisible by PERIOD;
# 1 therefore emits every rank.
PERIOD = 1
# Number of most-common entries to report; also embedded in the output file name.
LAST_WORD = 3000
# When True, each emitted row carries the per-author usage details as its last field.
SHOW_DETAILS = False

def print_counter_data(counter):
    """Print a ranked frequency table for `counter` and save it to a text file.

    The `LAST_WORD` most common entries are ranked from 1. Rank 1 and every
    rank divisible by `PERIOD` are printed as an aligned line and written as
    a tab-separated row with these fields: rank, cleaned entry, the entry's
    share of the total (percent, 3 decimals), cumulative coverage (percent,
    1 decimal), and per-author details (empty unless `SHOW_DETAILS`).

    The file is named '<authors joined by +>-<LAST_WORD>.txt' and is created
    in the current working directory. Reads the module-level `authors`,
    `LAST_WORD`, `PERIOD` and `SHOW_DETAILS`.
    """
    out_name = '+'.join(authors) + '-' + str(LAST_WORD) + '.txt'
    N = sum(counter.values())
    coverage = 0
    # `with` closes the report even if formatting a row raises.
    with open(out_name, 'w') as f:
        for index, (word, freq) in enumerate(counter.most_common(LAST_WORD), start=1):
            # Use the same share for both columns, so raw counts and
            # already-normalized frequencies produce consistent output.
            share = freq / N
            coverage += share
            if index != 1 and index % PERIOD != 0:
                continue

            # Only compute the per-author breakdown when it will be shown.
            details = ''
            if SHOW_DETAILS:
                details = ' '.join(word_use_detail(word, author) for author in authors)
            segs = (
                index,
                clean_entry(word),
                percent(share, 3),
                percent(coverage, 1),
                details,
            )
            line = ' '.join((
                '%5s' % segs[0],
                '%-15s' % segs[1],
                '%6s' % segs[2],
                '(%5s)' % segs[3],
                segs[4],
            ))
            print(line)
            f.write('\t'.join(str(seg) for seg in segs) + '\n')
# Pool every author's counts, convert them to relative frequencies, then
# print the ranked table and save it to disk.
normalized_combined_freq_table = normalize(merge_all(data))
print_counter_data(normalized_combined_freq_table)

    1 qui              2.892 (  2.9) 
    2 que              2.357 (  5.2) 
    3 in               2.274 (  7.5) 
    4 sum              2.182 (  9.7) 
    5 is               2.013 ( 11.7) 
    6 et               1.993 ( 13.7) 
    7 ad               1.503 ( 15.2) 
    8 cum              1.278 ( 16.5) 
    9 sui              1.277 ( 17.8) 
   10 atque            1.222 ( 19.0) 
   11 ab               1.164 ( 20.2) 
   12 hic              1.092 ( 21.2) 
   13 ex                0.99 ( 22.2) 
   14 omnis             0.96 ( 23.2) 
   15 ut                0.87 ( 24.1) 
   16 suus              0.84 ( 24.9) 
   17 res              0.803 ( 25.7) 
   18 magnus           0.665 ( 26.4) 
   19 castrum          0.659 ( 27.0) 
   20 non              0.648 ( 27.7) 
   21 locus            0.583 ( 28.3) 
   22 caesaris         0.562 ( 28.8) 
   23 possum            0.56 ( 29.4) 
   24 facio            0.541 ( 29.9) 
   25 neque            0.533 ( 30.5) 
   26 caesar            0.51 ( 31.0) 
   27 legio 

  858 profectio        0.017 ( 83.1) 
  859 pararis          0.017 ( 83.1) 
  860 xv               0.017 ( 83.2) 
  861 fabius           0.017 ( 83.2) 
  862 afranius         0.017 ( 83.2) 
  863 neu              0.017 ( 83.2) 
  864 unde             0.017 ( 83.2) 
  865 poena            0.017 ( 83.2) 
  866 despicio         0.017 ( 83.3) 
  867 species          0.017 ( 83.3) 
  868 obsidibus        0.017 ( 83.3) 
  869 ariovistus       0.017 ( 83.3) 
  870 incendium        0.016 ( 83.3) 
  871 vinea            0.016 ( 83.3) 
  872 servus           0.016 ( 83.4) 
  873 dissensio        0.016 ( 83.4) 
  874 natio            0.016 ( 83.4) 
  875 comperio         0.016 ( 83.4) 
  876 contumelia       0.016 ( 83.4) 
  877 cado             0.016 ( 83.4) 
  878 onus             0.016 ( 83.5) 
  879 occulo           0.016 ( 83.5) 
  880 eicio            0.016 ( 83.5) 
  881 scutum           0.016 ( 83.5) 
  882 utrimque         0.016 ( 83.5) 
  883 nobilis          0.016 ( 83.5) 
  884 longit

 2483 arrogantia       0.003 ( 94.7) 
 2484 setius           0.003 ( 94.7) 
 2485 luna             0.003 ( 94.7) 
 2486 paco             0.003 ( 94.7) 
 2487 strepitus        0.003 ( 94.8) 
 2488 ramus            0.003 ( 94.8) 
 2489 aulerci          0.003 ( 94.8) 
 2490 exsisto          0.003 ( 94.8) 
 2491 aquitaniae       0.003 ( 94.8) 
 2492 aulercis         0.003 ( 94.8) 
 2493 colo             0.003 ( 94.8) 
 2494 cibus            0.003 ( 94.8) 
 2495 ubiorum          0.003 ( 94.8) 
 2496 sugambri         0.003 ( 94.8) 
 2497 succido          0.003 ( 94.8) 
 2498 essedarius       0.003 ( 94.8) 
 2499 aridus           0.003 ( 94.8) 
 2500 expeditio        0.003 ( 94.8) 
 2501 britanni         0.003 ( 94.8) 
 2502 triginta         0.003 ( 94.8) 
 2503 atrebatem        0.003 ( 94.8) 
 2504 missus           0.003 ( 94.8) 
 2505 aduatucos        0.003 ( 94.8) 
 2506 succendo         0.003 ( 94.8) 
 2507 vorenus          0.003 ( 94.8) 
 2508 sextium          0.003 ( 94.8) 
 2509 luteti