# Preliminaries to DeepLearning

In this notebook I'll try to extract informations about the context in which verbs are used in Homer. 
This will then build the database on which I can train a NN.

In [1]:
# imports
from nltk.tokenize import word_tokenize
from more_itertools import locate
from collections import Counter
from pprint import pprint
import re

## Functions

In [2]:
def load(txt,lines = False):
    '''
    loads text from txt files
    '''
    if lines:
        with open(txt, "r",encoding = "UTF-8") as source:
            data = source.readlines()
    else:
        with open(txt, "r",encoding = "UTF-8") as source:
            data = source.read()
    return data

In [3]:
def clean_text(txt):
    txt = re.sub(r"\n", " ",txt, re.MULTILINE) # delete row break
    txt = re.sub(r"\\n", " ",txt, re.MULTILINE) # delete row break
    txt = re.sub("n", "",txt, re.MULTILINE) # delete row break
    txt = re.sub(r";","?",txt,re.MULTILINE) # change question mark
    txt = re.sub(r"(\d+(\.\d+)?)","",txt,0,re.MULTILINE) # delete verse number
    txt = re.sub(r"\w(·)","",txt,0,re.MULTILINE)
    txt = re.sub(r'(\.|,|;|!|\")',"",txt,re.MULTILINE)
    return txt

In [4]:
def extract_verbs(parsed_tokens):
    '''
    Given the from xml extracted tokens and parsing, this function extracts only the verbs.

    return: 
        verbs : a list of lemmata
    '''
    verbs = []
    for d in data:
        infos = d.split("|")
        regex = r"v.*"
        match = re.match(regex, infos[1])
        if match is not None:
            verbs.append(infos[2][:-1])
    # clean duplicates and return
    return list(set(verbs))

In [36]:
def homer_dictionary(xml_data):
    '''
    Given the xml parsed data, returns a long dictionary: v_form : pos_tagging
    '''
    ilias_dict = {}
    for d in xml_data:
        el = d.split('|')
        if el[0] not in ilias_dict.keys():
            ilias_dict[el[0]] = el[1]
    return ilias_dict

In [6]:
def delete_stopwords(stopword_file,tokens):
    with open(stopword_file,"r",encoding="UTF-8") as stopw:
        stopwords = stopw.read()

    stopwords = stopwords.split('\n')
    stopwords.extend('.')
    stopwords.extend(',')
    stopwords.extend('“')
    stopwords.extend('·')
    print(f"Len of tokens before cleaning: {len(tokens)}")
    data_clean = [w.lower() for w in tokens if w not in stopwords]
    print(f"Done!\nLen of tokens after cleaning: {len(data_clean)}")
    return data_clean

In [7]:
def allIndices(sent, wanted):
    '''
    Finds all indices of a wanted word in a given text
    '''
    indices = list(locate(sent, lambda a : a == wanted))
    return indices

In [8]:
def find_neighbours(corpus,target_list,window=9):
    '''
    corpus = token list (as result of a tokenizer)
    target_list = verbal forms for a given verb
    window = window length

    '''
    context = []
    # iterate per verb_form
    for verb_form in list(target_list):
        # iterate per sentence
        indices = allIndices(corpus,verb_form)
        for occurrence in indices:
            for j in range(max(occurrence-window,0),min(occurrence+window, len(corpus))):
                if j not in indices:
                    context.append(corpus[j])
    return context

In [9]:
def extract_neighbours(verb,corpus,window=8,print_common = True,hmany=100):
    '''
    Given a verb (infinitive) or Perseus quotation form,it finds all the paradigmatic form in the Text and returns the neighbours.

    Return:

    a tuple:
    context : the neighbours (Counter dictionary)
    verb_forms : the set of paradigmatic forms for the given verb
    '''
    verb_forms = set()
    for d in data:
        el = d.split('|')
        if el[2] == verb+"\n":
            verb_forms.add(el[0])
    context = Counter(find_neighbours(corpus,verb_forms,window = window))
    if print_common:
        pprint(context.most_common(hmany), compact=True)
    return (context,verb_forms)

## Class Analyzer

In [107]:
class Analyzer():
    '''
    This class takes a verb in the quotation form on Perseus and returns a full analysis of the grammatical contexts in which it appears in Homer
    '''
    
    def __init__(self,verb):
        # main form
        self.verb = verb
        
        # counters for the neighbourhood along grammatical classes
        self.pos = {'n' : 0, 'v':0,'a' : 0,'d' : 0,'l' : 0,'g' : 0,'c' : 0,'r' : 0,'p' : 0,'m' : 0,'m' : 0,'i' : 0,'u':0}
        self.tense = {'p':0,'i':0,'l':0,'r':0,'t':0,'f':0,'a':0}
        self.voice = {'a':0, 'p':0, 'm':0, 'e':0}
        self.case = {'n':0,'g':0,'d':0,'a':0,'v':0,'l':0,}
        self.mood = {'i':0,'s':0,'o':0,'n':0,'m':0,'p':0}

        # Counters
        self.context = None
        
        # total forms in the neighbourhood
        self.context_words = 0

        # set of the paradigmatic forms
        self.v_forms = set()
        
    def __repr__(self):
        print(f"Analyzer für das Verb {self.verb}")
        
    def extract_context(self,corpus):
        neighbours = extract_neighbours(self.verb,corpus = corpus,print_common=False)
        self.context = neighbours[0]
        self.v_forms = neighbours[1]
    
    def analyze(self,lexicon,corpus):
        if self.context == None:
            self.extract_context(corpus=corpus)
        self.context_words = sum(self.context.values())
        pos = []
        tense = []
        mood = []
        voice = []
        case = []
        for w in list(self.context):
            if w in lexicon.keys() and len(lexicon[w]) > 2 : # if the word is present and if this is not empty
                morpho_info = lexicon[w]
                pos.append(morpho_info[0]) #w[0] = pos
                if morpho_info[3] != "-":
                    tense.append(morpho_info[3]) #w[3] = tense/aspect
                if morpho_info[4] != "-":
                    mood.append(morpho_info[4]) #w[4] = mood
                if morpho_info[5] != "-":
                    voice.append(morpho_info[5]) #w[5] = voice
                if morpho_info[7] != "-":
                    case.append(morpho_info[7]) #w[7] = case

        pos = Counter(pos)
        mood = Counter(mood)
        voice = Counter(voice)
        tense = Counter(tense)
        case = Counter(case)
        
        for x in pos.keys():
            self.pos[x] = pos[x]
        for x in mood.keys():
            self.mood[x] = mood[x]
        for x in tense.keys():
            self.tense[x] = tense[x]
        for x in voice.keys():
            self.voice[x] = voice[x]
        for x in case.keys():
            self.case[x] = case[x]
    
    def normalize(self):
        self.pos = self.get_pos()
        self.mood = self.get_mood()
        self.voice = self.get_voice()
        self.tense = self.get_tense()
        self.case = self.get_case()
        
        
    def get_info(self,category,pr = False):
        cat_c_normalized = {}
        for w,c in category.items():
            try:
                cat_c_normalized[w] = round(c/self.context_words,5)
            except:
                cat_c_normalized[w] = 0.0
            if pr:
                print(f"{w} : {cat_c_normalized[w]}")
        return cat_c_normalized
            
    def get_pos(self,pr=False):
        return self.get_info(self.pos,pr)
        
    def get_tense(self,pr=False):
        return self.get_info(self.tense,pr)
        
    def get_mood(self,pr=False):
        return self.get_info(self.mood,pr)
    
    def get_voice(self,pr=False):
        return self.get_info(self.voice,pr)
    
    def get_case(self,pr=False):
        return self.get_info(self.case,pr)
    
    def get_all(self,pr=True):
        print("POS:\n")
        self.get_pos(pr=pr)
        print("\nTENSE\n")
        self.get_tense(pr=pr)
        print("\nVOICE\n")
        self.get_voice(pr=pr)
        print("\nMOOD\n")
        self.get_mood(pr=pr)
        print("\nCASE\n")
        self.get_case(pr=pr)

In [45]:
# tokens annotiert
data = load("homerische_tokens.txt",lines = True)

In [46]:
# raw text (Ilias + Odyssee)
homer = load("HomerGesamt.txt")
# clean it
homer = clean_text(homer)

In [47]:
homer.replace(r'\\n'," ")
homer.replace(r'\n', " ")
homer[:1000]

'μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ἡρώων αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν  οἰωνοῖσί τε πᾶσι Διὸς δʼ ἐτελείετο βουλή ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς  τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι?\nΛητοῦς καὶ Διὸς υἱό ὃ γὰρ βασιλῆϊ χολωθεὶς\n  νοῦσον ἀνὰ στρατὸν ὄρσε κακήν ὀλέκοντο δὲ λαοί\nοὕνεκα τὸν Χρύσην ἠτίμασεν ἀρητῆρα\nἈτρεΐδη ὃ γὰρ ἦλθε θοὰς ἐπὶ νῆας Ἀχαιῶν\nλυσόμενός τε θύγατρα φέρων τʼ ἀπερείσιʼ ἄποινα,\nστέμματʼ ἔχων ἐν χερσὶν ἑκηβόλου Ἀπόλλωνος\n  χρυσέῳ ἀνὰ σκήπτρῳ, καὶ λίσσετο πάντας Ἀχαιούς,\nἈτρεΐδα δὲ μάλιστα δύω, κοσμήτορε λαῶ\nἈτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί,\nὑμῖν μὲν θεοὶ δοῖεν Ὀλύμπια δώματʼ ἔχοντες\nἐκπέρσαι Πριάμοιο πόλιν, εὖ δʼ οἴκαδʼ ἱκέσθα\n  παῖδα δʼ ἐμοὶ λύσαιτε φίλην, τὰ δʼ ἄποινα δέχεσθαι,\nἁζόμενοι Διὸς υἱὸν ἑκηβόλον Ἀπόλλωνα.\n\nἔνθʼ ἄλλοι μὲν πάντες ἐπευφήμησαν Ἀχαιοὶ\nαἰδεῖσθαί θʼ ἱερῆα καὶ ἀγλαὰ δέχθαι ἄποιν\nἀλλʼ οὐκ Ἀτρεΐδῃ

In [48]:
# tokenize homer gesamt
homer_tokens = word_tokenize(homer)

In [49]:
# stopwords delete
tokens = delete_stopwords("stopwords.txt",homer_tokens)

Len of tokens before cleaning: 553731
Done!
Len of tokens after cleaning: 421398


In [50]:
# Mapping token --> pos
homer_dict = homer_dictionary(data)

In [51]:
#test dictionary
homer_dict['ἔθηκε']

'v3saia---'

In [68]:
homer_dict['ἄποινα']

'n-p---na-'

In [52]:
# test extraction
euhomai_neighbours, euhomai_forms = extract_neighbours("εὔχομαι",corpus=tokens)
#

[('ὣς', 66), ('εἶναι', 57), ('ἔφατʼ', 43), ('ἀλλʼ', 28), ('αὐτὰρ', 28),
 ('ἐν', 26), ('γὰρ', 25), ('ἐπὶ', 25), ('νῦν', 24), ('ἔπος', 23), ('ἔκλυε', 23),
 ('”', 23), ('ῥʼ', 22), ('ἐκ', 22), ('ἐγὼ', 22), ('διὶ', 22), ('δὴ', 20),
 ('διὸς', 20), ('γένος', 20), ('ηὔδ', 19), ('οὔ', 18), ('γʼ', 18), ('γε', 18),
 ('ἀθήνη', 18), ('ποσειδάωνι', 18), ('ἔπειτα', 18), ('ἦ', 16), ('ἀχαιῶν', 16),
 ('ἀπόλλων', 16), ('ἐπεὶ', 16), ('ἐνὶ', 15), ('ἄρʼ', 15), ('ὃς', 15),
 ('παλλὰς', 15), ('κλῦθι', 14), ('ἐξ', 14), ('μάλʼ', 14), ('ἠδʼ', 13),
 ('ὡς', 13), ('εἴ', 13), (';', 13), ('υἱὸς', 13), ('μέγα', 12), ('φοῖβος', 12),
 ('τις', 12), ('ἐπεί', 12), ('αὐτίκα', 12), ('κούρῃ', 12), ('οὐ', 12),
 ('χεῖρας', 12), ('οὐλοχύτας', 11), ('ζεῦ', 11), ('ἀγορεύσω', 11),
 ('ἄνακτ', 11), ('πατρὸς', 11), ('πολλὰ', 11), ('καί', 11), ('αἶψα', 11),
 ('μάλα', 10), ('οὕτω', 10), ('μοι', 10), ('μέν', 10), ('πατρὶ', 10),
 ('αἴ', 10), ('ἀνασχώ', 10), ('οὐδʼ', 9), ('θεοῖσιν', 9), ('»', 9),
 ('γαιήοχε', 9), ('ἐμὸς', 9), ('κατὰ', 9), (

In [53]:
pprint(euhomai_forms, compact=True)

{'εὐξάμενοι', 'εὐξάμενος', 'εὐξάμενός', 'εὐξαίμην', 'εὐξαμένοιο', 'εὐξαμένου',
 'εὐξαμένῃ', 'εὐχομένη', 'εὐχομένης', 'εὐχομένοιο', 'εὐχομένοισι', 'εὐχομένου',
 'εὐχομένω', 'εὐχόμεθ̓', 'εὐχόμεναι', 'εὐχόμενοι', 'εὐχόμενος', 'εὐχόμενόν',
 'εὔξαντο', 'εὔξατο', 'εὔξεαι', 'εὔχἐ', 'εὔχεαι', 'εὔχεο', 'εὔχεσθαι',
 'εὔχεσθε', 'εὔχετ̓', 'εὔχεται', 'εὔχετο', 'εὔχομ̓', 'εὔχομαι', 'εὔχοντο'}


### Test class

In [82]:
euhomai = Analyzer('εὔχομαι')

In [83]:
euhomai.analyze(lexicon = homer_dict,corpus = tokens)

In [84]:
euhomai.extract_context(corpus=tokens)

In [85]:
# number of neighbours
euhomai.context_words

4541

In [86]:
euhomai.get_all()

POS:

n : 0.05616
v : 0.06342
a : 0.04096
d : 0.01409
l : 0.0
g : 0.00529
c : 0.0044
r : 0.00573
p : 0.00881
m : 0.00066
i : 0.00022
u : 0.00022
x : 0.00022

TENSE

p : 0.01586
i : 0.00969
l : 0.00088
r : 0.00308
t : 0.00022
f : 0.00242
a : 0.03127

VOICE

a : 0.04713
p : 0.00088
m : 0.00617
e : 0.00903

MOOD

i : 0.03479
s : 0.00308
o : 0.00132
n : 0.00749
m : 0.00484
p : 0.01189

CASE

n : 0.03656
g : 0.02092
d : 0.02158
a : 0.03523
v : 0.00352
l : 0.0


## Main extraction

In [87]:
homeric_verbs = extract_verbs(data)

In [91]:
# random test
'φαίνω' in homeric_verbs

True

In [108]:
with open("homerische_daten.txt","a",encoding="UTF-8") as daten:
        daten.write('verb|occurrences|noun|verb|adjective|adverb|article|particle|conjunction|preposition|pronoun|numeral|interjection|punctuation|present|imperfect|pluperfect|perfect|futPerfect|future|aorist|indicative|subjunctive|optative|infinitive|imperative|participle|active|passive|middle|mediopassive|nom|gen|dat|acc|voc|loc|class\n')
        for v in homeric_verbs:
            v_analyzer = Analyzer(v)
            v_analyzer.analyze(corpus=tokens, lexicon=homer_dict)            
            v_analyzer.normalize()
            # counts
            count = 0
            for f in v_analyzer.v_forms:
                count += len(allIndices(tokens,f))
            # first infos: verb_lemma + count of occurrences of the paradigmatic form in text
            daten.write(f"{v}|{count}|")
            
            # pos 
            daten.write(str(v_analyzer.pos['n']) + "|" + str(v_analyzer.pos['v'])+ "|" +str(v_analyzer.pos['a'])+ "|" +str(v_analyzer.pos['d'])+ "|" +str(v_analyzer.pos['l'])+ "|" +str(v_analyzer.pos['g'])+ "|" +str(v_analyzer.pos['c'])+ "|" +str(v_analyzer.pos['r'])+ "|" +str(v_analyzer.pos['p'])+ "|" +str(v_analyzer.pos['m'])+ "|" +str(v_analyzer.pos['i'])+ "|" +str(v_analyzer.pos['u']) + "|")
            
            # tenses and aspects
            daten.write(str(v_analyzer.tense['p']) +"|"+str(v_analyzer.tense['i']) +"|"+str(v_analyzer.tense['l']) +"|"+str(v_analyzer.tense['r']) +"|"+str(v_analyzer.tense['t']) +"|"+str(v_analyzer.tense['f']) +"|"+str(v_analyzer.tense['a']) +"|")
            
            # moods
            daten.write(str(v_analyzer.mood['i']) +"|"+str(v_analyzer.mood['s']) +"|"+str(v_analyzer.mood['o']) +"|"+str(v_analyzer.mood['n']) +"|"+str(v_analyzer.mood['m']) +"|"+str(v_analyzer.mood['p']) +"|")
            
            # voice
            daten.write(str(v_analyzer.voice['a']) +"|"+str(v_analyzer.voice['p']) +"|"+str(v_analyzer.voice['m']) +"|"+str(v_analyzer.voice['e']) +"|")

            # case
            daten.write(str(v_analyzer.case['n'])+"|"+str(v_analyzer.case['g'])+"|"+str(v_analyzer.case['d'])+"|"+str(v_analyzer.case['a'])+"|"+str(v_analyzer.case['v'])+"|"+str(v_analyzer.case['l']))
            # 
            daten.write("|\n")