# Exploration of CLTK

Cltk is a MIT-Package to deal with ancient languages. It supports classes for many old languages (Akkadian, Old Russian, Latin, Greek, Sanskrit ....). Some of them are trivial functions (unicode rendering etc...) but others are really nice.

In [12]:
def load(txt,lines = False):

    '''
    DOC: 
    Loads text from txt files
    params: 
    	- txt (str) : path of the file
    	- lines (bool) : if True readlines(), else read()

    return:
    	- data (str) : the text read off the file
    '''
    if lines:
        with open(txt, "r",encoding = "UTF-8") as source:
            data = source.readlines()
    else:
        with open(txt, "r",encoding = "UTF-8") as source:
            data = source.read()
    return data

In [13]:
homerText = load("HomerGesamt.txt")

In [17]:
import re
def clean_greek(txt):
    txt = re.sub(r"\n", " ",txt, re.MULTILINE) # delete row break
    txt = re.sub(r"\\n", " ",txt, re.MULTILINE) # delete row break
    txt = re.sub("n", "",txt, re.MULTILINE) # delete row break
    txt = re.sub(r";","?",txt,re.MULTILINE) # change question mark
    txt = re.sub(r"(\d+(\.\d+)?)","",txt,0,re.MULTILINE) # delete verse number
    txt = re.sub(r"\w(·)","",txt,0,re.MULTILINE)
    txt = re.sub(r'(\.|,|;|!|\")',"",txt,re.MULTILINE)
    return txt

In [18]:
homerText = clean_greek(homerText)

In [26]:
homerText[:150]

'μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ἡρώων αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν  '

In [27]:
# some tokens from homer:
some_tokens = homerText[:150].split()

## Greek Accentuation & Syllabification

This is a separate module, created by James Tauber,which  deals with greek Accentuation and Syllabification, allowing analyzing word accents, dividing word into syllables and printing possible acc patterns for unaccented words.

In [20]:
from greek_accentuation.syllabify import syllabify, display_word

In [30]:
for w in some_tokens:
    w_sill = syllabify(w)
    print(w_sill)

['μῆ', 'νιν']
['ἄ', 'ει', 'δε']
['θε', 'ὰ']
['Πη', 'λη', 'ϊ', 'ά', 'δε', 'ω']
['Ἀ', 'χι', 'λῆ', 'ος']
['οὐ', 'λο', 'μέ', 'νην']
['ἣ']
['μυ', 'ρίʼ']
['Ἀ', 'χαι', 'οῖς']
['ἄλ', 'γεʼ']
['ἔ', 'θη', 'κε']
['πολ', 'λὰς']
['δʼ']
['ἰ', 'φθί', 'μους']
['ψυ', 'χὰς']
['Ἄ', 'ϊ', 'δι']
['προ', 'ΐ', 'α', 'ψεν']
['ἡ', 'ρώ', 'ων']
['αὐ', 'τοὺς']
['δὲ']
['ἑ', 'λώ', 'ρι', 'α']
['τεῦ', 'χε']
['κύ', 'νεσ', 'σιν']


In [31]:
for w in some_tokens:
    w_sill = display_word(syllabify(w))
    print(w_sill)

μῆ.νιν
ἄ.ει.δε
θε.ὰ
Πη.λη.ϊ.ά.δε.ω
Ἀ.χι.λῆ.ος
οὐ.λο.μέ.νην
ἣ
μυ.ρίʼ
Ἀ.χαι.οῖς
ἄλ.γεʼ
ἔ.θη.κε
πολ.λὰς
δʼ
ἰ.φθί.μους
ψυ.χὰς
Ἄ.ϊ.δι
προ.ΐ.α.ψεν
ἡ.ρώ.ων
αὐ.τοὺς
δὲ
ἑ.λώ.ρι.α
τεῦ.χε
κύ.νεσ.σιν


In [32]:
from greek_accentuation.syllabify import onset_nucleus_coda

In [33]:
onset_nucleus_coda('οὐλομένην')

('̓', 'ου', 'λομένην')

In [34]:
from greek_accentuation.syllabify import syllable_length

In [47]:
for w in some_tokens:
    w_sill = syllabify(w)
    sill_parsing = []
    for sillaba in w_sill:
        if syllable_length(sillaba) == syllable_length('τεῦ'): # long
            sill_parsing.append("H")
        elif syllable_length(sillaba) == syllable_length('θε'): #kurz
            sill_parsing.append("L")
        else:
            sill_parsing.append('A') # ancipite
    print(sill_parsing)
            

['H', 'A']
['A', 'H', 'L']
['L', 'A']
['H', 'H', 'A', 'A', 'L', 'H']
['A', 'A', 'H', 'L']
['H', 'L', 'L', 'H']
['H']
['A', 'A']
['A', 'A', 'H']
['A', 'L']
['L', 'H', 'L']
['L', 'A']


TypeError: object of type 'NoneType' has no len()

### Accentuation

In [48]:
from greek_accentuation.accentuation import get_accent_type, display_accent_type

In [54]:
for w in some_tokens:
    try:
        print(f"{w} : AK: {display_accent_type(get_accent_type(w))}")
    except:
        print(f"{w} not classifiable")

μῆνιν : AK: properispomenon
ἄειδε : AK: proparoxytone
θεὰ not classifiable
Πηληϊάδεω : AK: proparoxytone
Ἀχιλῆος : AK: properispomenon
οὐλομένην : AK: paroxytone
ἣ not classifiable
μυρίʼ : AK: oxytone
Ἀχαιοῖς : AK: perispomenon
ἄλγεʼ : AK: paroxytone
ἔθηκε : AK: proparoxytone
πολλὰς not classifiable
δʼ not classifiable
ἰφθίμους : AK: paroxytone
ψυχὰς not classifiable
Ἄϊδι : AK: proparoxytone
προΐαψεν : AK: proparoxytone
ἡρώων : AK: paroxytone
αὐτοὺς not classifiable
δὲ not classifiable
ἑλώρια : AK: proparoxytone
τεῦχε : AK: properispomenon
κύνεσσιν : AK: proparoxytone


#### Possible accentuations

In [61]:
from greek_accentuation.accentuation import possible_accentuations, add_accent

In [66]:
for w in some_tokens:
    try:
        s = syllabify(w)
        possible_patterns = []
        for ak in possible_accentuations(s):
            possible_patterns.append(str(add_accent(s,ak)))
        print(possible_patterns)
    except:
        print(f"Don't know how to deal with {w}")
        
        

['μῆνίν', 'μῆνῖν', 'μῆ́νιν', 'μῆ͂νιν']
['ἄειδέ', 'ἄεῖδε', 'ά̓́ειδε']
['θεὰ́', 'θεὰ͂', 'θέὰ']
['Πηληϊάδεώ', 'Πηληϊάδεῶ', 'Πηληϊάδέω']
['Ἀχιλῆός', 'Ἀχιλῆ͂ος', 'Ἀχίλῆος']
['οὐλομένήν', 'οὐλομένῆν', 'οὐλομέ́νην']
['ὴ̔́', 'ὴ̔͂']
['μυρί́ʼ', 'μυρί͂ʼ', 'μύρίʼ', 'μῦρίʼ']
['Ἀχαιοῖ́ς', 'Ἀχαιοῖ͂ς', 'Ἀχαίοῖς']
['ἄλγέʼ', 'ά̓́λγεʼ', 'ά̓͂λγεʼ']
['ἔθηκέ', 'ἔθῆκε', 'έ̓́θηκε']
['πολλὰ́ς', 'πολλὰ͂ς', 'πόλλὰς']
Don't know how to deal with δʼ
['ἰφθίμούς', 'ἰφθίμοῦς', 'ἰφθί́μους']
['ψυχὰ́ς', 'ψυχὰ͂ς', 'ψύχὰς', 'ψῦχὰς']
['Ἄϊδί', 'Ἄϊδῖ', 'Ἄΐδι', 'Ἄῗδι', 'Ά̓́ϊδι']
['προΐαψέν', 'προΐάψεν', 'προΐᾶψεν', 'προΐ́αψεν']
['ἡρώών', 'ἡρώῶν', 'ἡρώ́ων']
['αὐτοὺ́ς', 'αὐτοὺ͂ς', 'αὔτοὺς']
['δὲ́']
['ἑλώριά', 'ἑλώριᾶ', 'ἑλώρία', 'ἑλώρῖα', 'ἑλώ́ρια']
['τεῦχέ', 'τεῦ͂χε']
['κύνεσσίν', 'κύνεσσῖν', 'κύνέσσιν', 'κύ́νεσσιν']


In [68]:
from greek_accentuation.accentuation import recessive
recessive('εἰσηλθον')

'εἴσηλθον'

## CLTK with TLG

In [76]:
import os
os.environ['GIT_PYTHON_REFRESH'] = "quiet" # since git is not used directly but only as dependence

In [77]:
import git

In [78]:
from cltk.corpus.utils.importer import CorpusImporter

In [79]:
corpus_importer = CorpusImporter('greek')

In [80]:
corpus_importer.list_corpora

['greek_software_tlgu',
 'greek_text_perseus',
 'phi7',
 'tlg',
 'greek_proper_names_cltk',
 'greek_models_cltk',
 'greek_treebank_perseus',
 'greek_treebank_gorman',
 'greek_lexica_perseus',
 'greek_training_set_sentence_cltk',
 'greek_word2vec_cltk',
 'greek_text_lacus_curtius',
 'greek_text_first1kgreek',
 'greek_text_tesserae']

In [81]:
latin_corpora = CorpusImporter('latin')
latin_corpora.list_corpora

['latin_text_perseus',
 'latin_treebank_perseus',
 'latin_text_latin_library',
 'phi5',
 'phi7',
 'latin_proper_names_cltk',
 'latin_models_cltk',
 'latin_pos_lemmata_cltk',
 'latin_treebank_index_thomisticus',
 'latin_lexica_perseus',
 'latin_training_set_sentence_cltk',
 'latin_word2vec_cltk',
 'latin_text_antique_digiliblt',
 'latin_text_corpus_grammaticorum_latinorum',
 'latin_text_poeti_ditalia',
 'latin_text_tesserae']

### Corpus Reader

In [96]:
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

In [95]:
sentence = homerText[:150]

In [97]:
sentence = cltk_normalize(sentence)

In [100]:
lemmatizer = LemmaReplacer("greek")

In [102]:
lemmatizer.lemmatize(sentence,return_raw = True)

['μῆνιν/μῆνις',
 'ἄειδε/ἀείδω',
 'θεὰ/θεὰ',
 'Πηληϊάδεω/Πηληϊάδεω',
 'Ἀχιλῆος/ἀχιλλεύς',
 'οὐλομένην/οὐλόμενος',
 'ἣ/ἣ',
 'μυρίʼ/μυρίʼ',
 'Ἀχαιοῖς/ἀχαιός',
 'ἄλγεʼ/ἄλγεʼ',
 'ἔθηκε/τίθημι',
 'πολλὰς/πολλὰς',
 'δʼ/δʼ',
 'ἰφθίμους/ἴφθιμος',
 'ψυχὰς/ψυχὰς',
 'Ἄϊδι/Ἄϊδι',
 'προΐαψεν/προΐαψεν',
 'ἡρώων/ἥρως',
 'αὐτοὺς/αὐτοὺς',
 'δὲ/δὲ',
 'ἑλώρια/ἑλώριον',
 'τεῦχε/τεύχω',
 'κύνεσσιν/κύων']

In [103]:
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer

In [108]:
lemmatizer = BackoffGreekLemmatizer()

In [110]:
lemmatizer.lemmatize(homerText[:150].split())

[('μῆνιν', 'μῆνις'),
 ('ἄειδε', 'ἀείδω'),
 ('θεὰ', 'θεά'),
 ('Πηληϊάδεω', 'Πηληϊάδεω'),
 ('Ἀχιλῆος', 'Ἀχιλλεύς'),
 ('οὐλομένην', 'οὐλόμενος'),
 ('ἣ', 'ὁ'),
 ('μυρίʼ', 'μυρίʼ'),
 ('Ἀχαιοῖς', 'Ἀχαιός'),
 ('ἄλγεʼ', 'ἄλγεʼ'),
 ('ἔθηκε', 'τίθημι'),
 ('πολλὰς', 'πολύς'),
 ('δʼ', 'δʼ'),
 ('ἰφθίμους', 'ἴφθιμος'),
 ('ψυχὰς', 'ψυχή'),
 ('Ἄϊδι', 'Ἀΐδης'),
 ('προΐαψεν', 'προΐαψεν'),
 ('ἡρώων', 'ἥρως'),
 ('αὐτοὺς', 'αὐτός'),
 ('δὲ', 'δέ'),
 ('ἑλώρια', 'ἑλώριον'),
 ('τεῦχε', 'τεύχω'),
 ('κύνεσσιν', 'κύων')]

## Pos Tagging for Greek

In [111]:
from cltk.tag.pos import POSTag

In [112]:
tagger = POSTag('greek')

In [113]:
tagger.tag_ngram_123_backoff(homerText[:150])

[('μῆνιν', 'N-S---FA-'),
 ('ἄειδε', 'V2SPMA---'),
 ('θεὰ', 'N-S---FV-'),
 ('Πηληϊάδεω', None),
 ('Ἀχιλῆος', None),
 ('οὐλομένην', 'A-S---FA-'),
 ('ἣ', 'P-S---FN-'),
 ('μυρίʼ', None),
 ('Ἀχαιοῖς', None),
 ('ἄλγεʼ', None),
 ('ἔθηκε', 'V3SAIA---'),
 ('πολλὰς', 'A-P---FA-'),
 ('δʼ', None),
 ('ἰφθίμους', 'A-P---MA-'),
 ('ψυχὰς', 'N-P---FA-'),
 ('Ἄϊδι', None),
 ('προΐαψεν', None),
 ('ἡρώων', 'N-P---MG-'),
 ('αὐτοὺς', 'A-P---MA-'),
 ('δὲ', 'G--------'),
 ('ἑλώρια', 'N-P---NA-'),
 ('τεῦχε', 'V3SIIA---'),
 ('κύνεσσιν', 'N-P---MD-')]

## Prosody scanner

In [115]:
from cltk.prosody.greek.scanner import Scansion

In [116]:
scanner = Scansion()

In [120]:
scanner.scan_text('νέος μὲν καὶ ἄπειρος, δικῶν ἔγωγε ἔτι. μὲν καὶ ἄπειρος')

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Slavist29\Anaconda3\envs\MachineLearning\lib\site-packages\cltk\prosody\greek\scanner.py", line 162, in _long_by_position
    next_syll = sentence[sentence.index(syllable) + 1]
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Slavist29\Anaconda3\envs\MachineLearning\lib\logging\__init__.py", line 1028, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\Slavist29\Anaconda3\envs\MachineLearning\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 96-97: character maps to <undefined>
Call stack:
  File "C:\Users\Slavist29\Anaconda3\envs\MachineLearning\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Slavist29\Anaconda3\en

['˘¯¯¯˘¯¯˘¯˘¯˘˘x']

## Tokenizer

In [121]:
from cltk.tokenize.greek.sentence import SentenceTokenizer
sent_tokenizer = SentenceTokenizer()

In [122]:
sent_tokenizer.tokenize(homerText[:150])

['μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ἡρώων αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν  ']

In [123]:
from cltk.tokenize.sentence import TokenizeSentence

In [124]:
tokenizer = TokenizeSentence('greek')

In [126]:
tokens  = tokenizer.tokenize(homerText[:150])

['μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος οὐλομένην ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν ἡρώων αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν  ']

## Word2Vec with Gensim

In [131]:
from cltk.ir.query import search_corpus

In [132]:
for x in search_corpus('πνεῦμα', 'tlg', context='sentence', case_insensitive=True, expand_keyword=True, threshold=0.5):
    print(x)

UnpicklingError: invalid load key, 'v'.