# Experiment: Most common onsets in the Ilias

In [1]:
from cltk.tokenize.word import WordTokenizer

In [2]:
def load(txt):
    """Return the entire contents of the UTF-8 text file at path ``txt``."""
    with open(txt, mode="r", encoding="UTF-8") as handle:
        return handle.read()

In [3]:
# Read the whole text file into a single string.
ilias = load("./data/raw_data/IliasNew.txt")

In [4]:
# Instantiate CLTK's WordTokenizer for Greek and tokenize the loaded text.
tokenizer = WordTokenizer('greek')
tokens = tokenizer.tokenize(ilias)

In [5]:
import string
import unicodedata

def delete_accents(s):
    """Return ``s`` with its diacritics removed.

    The string is decomposed to NFD so that each diacritic becomes a separate
    combining character, and every character in Unicode category ``Mn``
    (non-spacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return "".join(base_chars)
def clean(token_list):
    """Return the word tokens of ``token_list`` in normalised form.

    A token is kept only if it is not a lone space, an empty string or a
    byte-order mark, consists solely of alphabetic characters
    (``str.isalpha``), and is longer than two characters. Surviving tokens
    are lower-cased and then stripped of diacritics with ``delete_accents``.

    The length and ``isalpha`` checks run on the token as received, i.e.
    before lower-casing and accent removal.

    Args:
        token_list: iterable of string tokens.

    Returns:
        A new list of cleaned tokens, in their original order.
    """
    # The earlier version re-filtered ``token_list`` in its second step and so
    # discarded the result of the first filter; every check now applies to the
    # same token in a single pass.
    junk = {" ", "", "\ufeff"}
    kept = [
        w for w in token_list
        if w not in junk and w.isalpha() and len(w) > 2
    ]
    return [delete_accents(w.lower()) for w in kept]

In [6]:
# Replace the raw tokens with their cleaned form and preview the first ten.
tokens = clean(tokens)
tokens[:10]

['μηνιν',
 'αειδε',
 'θεα',
 'πηληιαδεω',
 'αχιληος',
 'ουλομενην',
 'μυριʼ',
 'αχαιοις',
 'αλγεʼ',
 'εθηκε']

In [12]:
# Greek vowels, lowercase and without diacritics. The functions that consult
# this list treat every character not found in it as a consonant.
vowels = ["α", "ε", "η", "ι", "ο", "ω", "υ"]

In [13]:
def onset_extractor(tok_list):
    '''Count the 1- to 3-character prefixes of every word.

    A prefix of a given size is counted only when the word is strictly
    longer than that size, so a word never contributes itself as a prefix.

    Returns a dict mapping each prefix to its number of occurrences.
    '''
    onsets = {}
    for word in tok_list:
        for size in (1, 2, 3):
            if len(word) <= size:
                continue
            prefix = word[:size]
            onsets[prefix] = onsets.get(prefix, 0) + 1
    return onsets

In [None]:
def find_onset(tok_list):
    """Return the word-initial consonant cluster of each word.

    For every word, the characters before its first vowel (as listed in the
    module-level ``vowels``) are collected. A word that starts with a vowel
    yields an empty string; a word without any vowel yields itself.

    Args:
        tok_list: iterable of words.

    Returns:
        A list with one onset string per input word, in input order.
    """
    onsets = []
    for w in tok_list:
        ons = ""
        for letter in w:
            # Stop at the first vowel. Resetting the accumulator here (as the
            # previous version did) leaves only the consonants that follow
            # the LAST vowel, i.e. the word-final cluster, not the onset.
            if letter in vowels:
                break
            ons += letter
        onsets.append(ons)
    return onsets



In [16]:
# Count the 1- to 3-character word-initial prefixes over the cleaned tokens.
ilias_onsets = onset_extractor(tokens)

In [17]:
# Turn the counts into a list of (onset, count) pairs; its length is the
# number of distinct onsets found.
onset_list = list(ilias_onsets.items())
len(onset_list)

2508

In [18]:
# sort descending by count; pairs with equal counts keep their relative order
sorted_onsets = sorted(onset_list, key=lambda pair: pair[1], reverse=True)

In [19]:
# Show the second and third entries of the ranking.
# NOTE(review): the slice starts at index 1, so the top-ranked onset is not
# displayed - confirm this is intended.
sorted_onsets[1:3]

[('ε', 25725), ('π', 16985)]

In [20]:
# split the ranked pairs into single-character onsets and longer ones
one_char_ons = []
more_char_ons = []
for tup in sorted_onsets:
    (one_char_ons if len(tup[0]) == 1 else more_char_ons).append(tup)

In [21]:
def simple_representation(s):
    """Map a string to its consonant/vowel shape.

    Each character found in the module-level ``vowels`` becomes "V"; the
    letters ζ, ξ and ψ each become "CC"; every other character becomes "C".
    The per-character symbols are concatenated in order.
    """
    double_consonants = ["ζ", "ξ", "ψ"]
    symbols = []
    for ch in s:
        if ch in vowels:
            symbols.append("V")
        else:
            symbols.append("CC" if ch in double_consonants else "C")
    return "".join(symbols)

In [22]:
# repr_dict: shape -> number of distinct onsets with that shape.
# repr_catalog: shape -> those onsets, in the order of sorted_onsets.
repr_dict = {}
repr_catalog = {}
for tup in sorted_onsets:
    s = simple_representation(tup[0])
    repr_dict[s] = repr_dict.get(s, 0) + 1
    repr_catalog.setdefault(s, []).append(tup[0])

In [23]:
# Rank the shapes by their counts, largest first.
# NOTE: this rebinds repr_dict from a dict to a list of (shape, count) pairs.
repr_dict = sorted(repr_dict.items(), key=lambda pair: pair[1], reverse=True)
repr_dict

[('CVC', 593),
 ('VCV', 414),
 ('VCC', 288),
 ('CCV', 260),
 ('CVV', 238),
 ('VVC', 166),
 ('CV', 97),
 ('VC', 95),
 ('CC', 73),
 ('VVV', 51),
 ('VCCV', 47),
 ('CVCC', 42),
 ('VV', 41),
 ('CCVC', 41),
 ('C', 14),
 ('VVCC', 14),
 ('CCVV', 13),
 ('CCC', 8),
 ('V', 7),
 ('VCCC', 5),
 ('CCCV', 1)]

In [24]:
from pprint import pprint
# Pretty-print the shape -> onsets catalog; compact=True packs as many list
# items as fit on each output line.
pprint(repr_catalog, compact=True)

{'C': ['π', 'κ', 'μ', 'τ', 'δ', 'θ', 'ν', 'φ', 'σ', 'γ', 'χ', 'λ', 'β', 'ρ'],
 'CC': ['πρ', 'τρ', 'στ', 'κρ', 'ζ', 'κλ', 'φρ', 'χρ', 'σφ', 'πτ', 'ξ', 'πλ',
        'σκ', 'κτ', 'γλ', 'βρ', 'σχ', 'θρ', 'ψ', 'λλ', 'φθ', 'σπ', 'χθ', 'γν',
        'σθ', 'μν', 'δρ', 'βλ', 'κν', 'σμ', 'θν', 'φλ', 'τλ', 'πν', 'νθ', 'δμ',
        'ντ', 'χλ', 'ππ', 'μφ', 'νδ', 'ργ', 'σσ', 'λθ', 'γχ', 'γρ', 'σβ', 'λκ',
        'ρχ', 'γγ', 'τμ', 'ρμ', 'ρσ', 'ρν', 'δν', 'θλ', 'μβ', 'λπ', 'ρκ', 'φν',
        'νν', 'ρρ', 'ρπ', 'γκ', 'νσ', 'ρδ', 'μμ', 'λφ', 'λγ', 'ρθ', 'ττ', 'κμ',
        'κπ'],
 'CCC': ['στρ', 'νδρ', 'σπλ', 'σθλ', 'νθρ', 'νστ', 'μβρ', 'ρξ'],
 'CCCV': ['ρξο'],
 'CCV': ['προ', 'τρω', 'πρι', 'κρα', 'πρω', 'ζε', 'φρε', 'στη', 'χρυ', 'κρο',
         'κλι', 'τρι', 'γλα', 'στε', 'ζω', 'σφι', 'πτο', 'κρε', 'σχε', 'πλη',
         'πτε', 'χρο', 'κλυ', 'τρο', 'βρο', 'στι', 'κτε', 'πλε', 'στα', 'τρε',
         'φρο', 'ξυ', 'κρη', 'κλε', 'σθε', 'χθο', 'κλα', 'ξα', 'μνη', 'στο',
         'φθι', 'ξι', 'σκο', 'φρα',

### Features