# Experiment: Most common onsets in the Ilias

In [27]:
def load(txt):
    with open(txt, "r",encoding = "UTF-8") as source:
        testo = source.read()
    return testo

In [28]:
ilias = load("IliasNew.txt")

In [29]:
tokens = ilias.split()

In [30]:
import string
import unicodedata

def delete_accents(s):
    return "".join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')
def clean(token_list):
    tokens = [w for w in token_list if w != " "]
    tokens = [w for w in token_list if w != ""]
    tokens = [w for w in tokens if w != '\ufeff']
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if len(w) > 2]
    tokens = [w.lower() for w in tokens]
    tokens = [delete_accents(w) for w in tokens]
    return tokens

In [31]:
tokens = clean(tokens)
tokens[:10]

['μηνιν',
 'αειδε',
 'θεα',
 'πηληιαδεω',
 'αχιληος',
 'μυριʼ',
 'αχαιοις',
 'αλγεʼ',
 'πολλας',
 'ιφθιμους']

In [32]:
# Greek vowels
vowels = ["α", "ε", "η", "ι", "ο", "ω", "υ"]

In [146]:
def onset_extractor(tok_list):
    '''We take onsets from 1 to 3 charachters for each words'''
    onsets = {}
    for w in tok_list:
        for i in range(1,4):
            if len(w) > i:
                onset = w[:i]
                if onset in onsets.keys():
                    onsets[onset] += 1
                else:
                    onsets[onset] = 1          
    return onsets

In [35]:
def find_onset(tok_list):
    onsets = []
    for w in tok_list:
        ons = ""
        for letter in w:
            if letter not in vowels:
                ons += letter
            else:
                ons = ""
        onsets.append(ons)
    return onsets

find_onsets(tokens)

['μ',
 'μην',
 '',
 'α',
 'αε',
 'αειδ',
 'θ',
 'θε',
 'π',
 'πηλ',
 'πηλη',
 'πηληι',
 'πηληιαδ',
 'πηληιαδε',
 '',
 'αχ',
 'αχιλ',
 'αχιλη',
 'μ',
 'μυρ',
 '',
 'αχ',
 'αχα',
 'αχαι',
 'αχαιο',
 '',
 'αλγ',
 'π',
 'πολλ',
 '',
 'ιφθ',
 'ιφθιμ',
 'ιφθιμο',
 'ψ',
 'ψυχ',
 '',
 'α',
 'αιδ',
 'πρ',
 'προ',
 'προι',
 'προιαψ',
 '',
 'α',
 'αυτ',
 'αυτο',
 '',
 'ελ',
 'ελωρ',
 'ελωρι',
 'τ',
 'τε',
 'τευχ',
 'κ',
 'κυν',
 'κυνεσσ',
 '',
 'ο',
 'οι',
 'οιων',
 'οιωνο',
 'οιωνοισ',
 'δ',
 'δι',
 '',
 'ετ',
 'ετελ',
 'ετελε',
 'ετελει',
 'ετελειετ',
 'πρ',
 'πρωτ',
 'δ',
 'δι',
 'διαστ',
 'διαστητ',
 '',
 'ερ',
 'ερισ',
 'ερισαντ',
 '',
 'ατρ',
 'ατρε',
 'ατρειδ',
 '',
 'αν',
 '',
 'ανδρ',
 'κ',
 'κα',
 'δ',
 'δι',
 'τ',
 'σφ',
 'σφω',
 'θ',
 'θε',
 '',
 'ερ',
 'εριδ',
 'ξ',
 'ξυν',
 'ξυνε',
 'ξυνεηκ',
 'λ',
 'λητ',
 'λητο',
 'κ',
 'κα',
 'δ',
 'δι',
 'γ',
 'β',
 'βασ',
 'βασιλ',
 'βασιλη',
 'χ',
 'χολ',
 'χολωθ',
 'χολωθε',
 'ν',
 'νο',
 'νουσ',
 '',
 'αν',
 'στρ',
 'στρατ',
 '',
 'ορσ',
 ''

1

In [147]:
ilias_onsets = onset_extractor(tokens)

In [148]:
onset_list = list(ilias_onsets.items())
len(onset_list)

2429

In [181]:
# sort descending
sorted_onsets = sorted(onset_list, reverse=True, key = lambda tup : tup[1])    

In [182]:
sorted_onsets[1:3]

[('ε', 21648), ('π', 15126)]

In [183]:
# split
one_char_ons = []
more_char_ons = []
for tup in sorted_onsets:
        if len(tup[0]) == 1:
            one_char_ons.append(tup)
        else:
            more_char_ons.append(tup)

In [186]:
def simple_representation(s):
    representation = ""
    for letter in s:
        if letter in vowels:
            representation += "V"
        elif letter in ["ζ","ξ","ψ"]:
            representation += "CC"
        else:
            representation += "C"
    return representation

In [187]:
repr_dict = {}
repr_catalog  = {}
for tup in sorted_onsets:
    s = simple_representation(tup[0])
    if s in repr_dict.keys():
        repr_dict[s] += 1
    else:
        repr_dict[s] = 1
    
    if s in repr_catalog.keys():
        repr_catalog[s].append(tup[0])
    else:
        repr_catalog[s] = []
        repr_catalog[s].append(tup[0])

In [188]:
repr_dict = sorted(list(repr_dict.items()), reverse=True, key=lambda tup : tup[1])
repr_dict

[('CVC', 569),
 ('VCV', 405),
 ('VCC', 277),
 ('CCV', 250),
 ('CVV', 230),
 ('VVC', 161),
 ('CV', 97),
 ('VC', 95),
 ('CC', 70),
 ('VVV', 49),
 ('VCCV', 46),
 ('VV', 40),
 ('CCVC', 40),
 ('CVCC', 39),
 ('C', 14),
 ('VVCC', 14),
 ('CCVV', 13),
 ('V', 7),
 ('CCC', 7),
 ('VCCC', 5),
 ('CCCV', 1)]

In [189]:
from pprint import pprint
pprint(repr_catalog, compact=True)

{'C': ['π', 'κ', 'μ', 'τ', 'δ', 'ν', 'σ', 'θ', 'φ', 'γ', 'χ', 'λ', 'β', 'ρ'],
 'CC': ['πρ', 'τρ', 'στ', 'κρ', 'ζ', 'κλ', 'φρ', 'χρ', 'σφ', 'ξ', 'πτ', 'πλ',
        'σκ', 'κτ', 'γλ', 'βρ', 'σχ', 'θρ', 'λλ', 'ψ', 'φθ', 'σπ', 'γν', 'χθ',
        'σθ', 'κν', 'μν', 'δρ', 'βλ', 'σμ', 'θν', 'φλ', 'τλ', 'πν', 'νθ', 'δμ',
        'ντ', 'χλ', 'ππ', 'μφ', 'νδ', 'ργ', 'σσ', 'λθ', 'γχ', 'γρ', 'σβ', 'λκ',
        'ρχ', 'γγ', 'τμ', 'ρσ', 'δν', 'ρμ', 'ρν', 'θλ', 'μβ', 'λπ', 'ρκ', 'φν',
        'ρπ', 'γκ', 'ρδ', 'μμ', 'λγ', 'ρθ', 'νν', 'ρρ', 'ττ', 'κμ'],
 'CCC': ['στρ', 'νδρ', 'σπλ', 'σθλ', 'νθρ', 'μβρ', 'ρξ'],
 'CCCV': ['ρξο'],
 'CCV': ['προ', 'τρω', 'πρι', 'κρα', 'πρω', 'ζε', 'στη', 'φρε', 'χρυ', 'κρο',
         'κλι', 'τρι', 'γλα', 'ζω', 'στε', 'σφι', 'κρε', 'πτε', 'σχε', 'κλυ',
         'πλη', 'χρο', 'τρο', 'βρο', 'πτο', 'στι', 'πλε', 'στα', 'τρε', 'κρη',
         'ξυ', 'φρο', 'κλε', 'κτε', 'σθε', 'ξα', 'στο', 'φρα', 'φθι', 'μνη',
         'πρυ', 'χθο', 'σκο', 'ψυ', 'ξι', 'σπε', 'στυ', 'θνη', 'σφω'

### Features