In [15]:
# Translate verbal POS string like '3PIIA'
# @see https://github.com/cltk/latin_treebank_perseus

# Person code (gloss position 0) -> Latin ordinal.
personaTable = {
    '1': 'prima',    # first person
    '2': 'secunda',  # second person
    '3': 'tertia',   # third person
}

# Number code (gloss position 1); '-' is passed through unchanged.
numerusTable = {
    'S': 'singularis',  # singular
    'P': 'pluralis',    # plural
    '-': '-',
}

# Tense code (gloss position 2) -> (tempus, aspectus): every traditional
# tense is decomposed into a time reference and an aspect.
tempusAspectusqueTable = {
    'P': ('praesens', 'imperfectivus'),     # present
    'I': ('praeteritus', 'imperfectivus'),  # imperfect
    'R': ('praesens', 'perfectivus'),       # perfect
    'L': ('praeteritus', 'perfectivus'),    # pluperfect
    'T': ('futurus', 'perfectivus'),        # future perfect
    'F': ('futurus', 'imperfectivus'),      # future
    '-': ('-', '-'),
}

# Mood code (gloss position 3); '-' is passed through unchanged.
modusTable = {
    'I': 'indicativus',   # indicative
    'S': 'coniunctivus',  # subjunctive
    'M': 'imperativus',   # imperative
    'N': 'infinitivus',   # infinitive
    '-': '-',
}

# Voice code (gloss position 4).
genusTable = {
    'A': 'activum',   # active
    'P': 'passivum',  # passive
}

def translate_verb_gloss_to_status(s):
    """Expand a verbal gloss such as '3PIIA' into named Latin features.

    The first five items of ``s`` are read positionally as person, number,
    tense, mood and voice, and each is looked up in the matching
    module-level table. The tense code yields two features at once
    (``tempus`` and ``aspectus``).

    Args:
        s: Indexable sequence of one-character codes; only ``s[0]``..``s[4]``
            are consulted.

    Returns:
        dict with the keys 'persona', 'numerus', 'tempus', 'aspectus',
        'modus' and 'vox', inserted in that order.

    Raises:
        KeyError: A code is missing from its lookup table.
        IndexError: ``s`` has fewer than five items.
    """
    status = {}
    # Lookups run strictly left to right so the first offending position is
    # the one whose error surfaces.
    status['persona'] = personaTable[s[0]]
    status['numerus'] = numerusTable[s[1]]
    status['tempus'], status['aspectus'] = tempusAspectusqueTable[s[2]]
    status['modus'] = modusTable[s[3]]
    status['vox'] = genusTable[s[4]]
    return status

def get_status_string(status):
    """Render a status dict as a space-separated string of its marked features.

    Values listed in the module-level ``implicit_status`` are omitted; the
    remaining values are joined in the dict's own order.

    Args:
        status: Mapping of feature name to feature value, e.g. the dict
            returned by ``translate_verb_gloss_to_status``.

    Returns:
        The retained values joined by single spaces ('' if none remain).
    """
    explicit = []
    for feature in status.values():
        # implicit_status is resolved at call time, so it only has to exist
        # by the time this function is first called.
        if feature in implicit_status:
            continue
        explicit.append(feature)
    return ' '.join(explicit)

In [19]:
import os
from collections import Counter
from os import walk

POS_OUTPUT_ROOT = './pos/'

human_output = True     # print a formatted frequency table to stdout
machine_output = True   # also write the rows to <POS_OUTPUT_ROOT>/pos_stat.csv

# --- Collect tags ----------------------------------------------------------
# Every line containing '|' contributes its second '|'-separated field.
tags = []

for (dirpath, dirnames, filenames) in walk(POS_OUTPUT_ROOT):
    for filename in filenames:
        if filename.endswith('txt'):
            # Join with dirpath rather than POS_OUTPUT_ROOT: walk() descends
            # into subdirectories, and files found there do not live directly
            # under the root.
            with open(os.path.join(dirpath, filename)) as pos_file:
                for pos_line in pos_file.read().split('\n'):
                    if '|' in pos_line:
                        tags.append(pos_line.split('|')[1])

# --- Count finite-verb glosses ---------------------------------------------
# Keep tags that start with 'V' and whose person slot is not '-'; the five
# characters after the 'V' form the gloss that is counted. Tags too short to
# carry all five codes are skipped, because they cannot be translated
# (indexing them would raise IndexError).
verb_tags = [
    tag[1:6]
    for tag in tags
    if len(tag) >= 6 and tag[0] == 'V' and tag[1] != '-'
]
status_counts = Counter(verb_tags)

# Feature values treated as the unmarked default. get_status_string() reads
# this name as a global, so it must be defined before that function is called.
implicit_status = 'singularis praesens imperfectivus indicativus activum'.split(' ')


def percent_string(n, decimal=2):
    """Format the fraction ``n`` as a percentage string.

    Args:
        n: Fraction to format (1.0 corresponds to '100.00%').
        decimal: Number of digits after the decimal point.

    Returns:
        ``n * 100`` in fixed-point notation followed by '%'.
    """
    return '{:.{decimal}f}'.format(n * 100, decimal=decimal) + '%'


N = sum(status_counts.values())

# --- Build the report rows -------------------------------------------------
# Each row: (rank, count, share of N, cumulative share, translated status).
# When status_counts is empty the loop body never runs, so N == 0 cannot
# cause a division by zero.
rows = []
coverage = 0
for (index, (gloss, count)) in enumerate(status_counts.most_common()):
    coverage += count / N
    rows.append((
        index + 1,
        count,
        count / N,
        coverage,
        translate_verb_gloss_to_status(gloss),
    ))

# --- Emit ------------------------------------------------------------------
print(' '*27+'Implicit:', ' '.join(implicit_status))

if human_output:
    for row in rows:
        print(
            '%2s' % str(row[0]),
            '%6s' % row[1],
            '%+6s' % percent_string(row[2]),
            '(%+7s)' % percent_string(row[3]),
            get_status_string(row[4])
        )

if machine_output:
    # The context manager closes the file even if a write fails, and a
    # dedicated name keeps the output handle distinct from the input files
    # read above.
    with open(os.path.join(POS_OUTPUT_ROOT, 'pos_stat.csv'), 'w') as stat_file:
        for row in rows:
            stat_file.write(';'.join([str(field) for field in row])+'\n')


                           Implicit: singularis praesens imperfectivus indicativus activum
 1 347922 24.38% ( 24.38%) tertia
 2 154310 10.82% ( 35.20%) tertia passivum
 3 146920 10.30% ( 45.50%) tertia perfectivus
 4 106081  7.43% ( 52.93%) tertia pluralis
 5  59033  4.14% ( 57.07%) tertia coniunctivus
 6  53286  3.73% ( 60.80%) tertia praeteritus coniunctivus
 7  52257  3.66% ( 64.47%) prima
 8  50967  3.57% ( 68.04%) prima perfectivus
 9  39460  2.77% ( 70.80%) tertia praeteritus
10  39378  2.76% ( 73.56%) tertia pluralis praeteritus
11  38158  2.67% ( 76.24%) tertia perfectivus coniunctivus
12  32171  2.25% ( 78.49%) tertia pluralis praeteritus coniunctivus
13  29515  2.07% ( 80.56%) tertia pluralis perfectivus
14  27390  1.92% ( 82.48%) tertia praeteritus perfectivus
15  25030  1.75% ( 84.24%) secunda
16  22136  1.55% ( 85.79%) prima pluralis
17  20557  1.44% ( 87.23%) tertia futurus
18  19586  1.37% ( 88.60%) tertia praeteritus perfectivus coniunctivus
19  18135  1.27% ( 89.87%) p