In [1]:
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.tag.pos import POSTag
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.tokenize.latin.sentence import SentenceTokenizer
from tqdm.auto import tqdm
from pandas.api.types import CategoricalDtype

In [49]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from itertools import zip_longest, islice

ModuleNotFoundError: No module named 'matplotlib'

In [3]:
# import cProfile
# import pstats
# from pstats import SortKey
# import tracemalloc

In [4]:
pd.set_option("display.max_rows", 100)

In [7]:
corpus_importer = CorpusImporter('latin')

corpus_importer.import_corpus('latin_models_cltk')
corpus_importer.import_corpus('latin_text_perseus')

reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')

In [None]:
# perseus_sents = list(reader.sents())

# len(perseus_sents)

In [None]:
# with open('out.json', 'w', encoding='utf-8') as f:
#     json.dump(docs, f, ensure_ascii=False, indent=4)

In [9]:
tagger = POSTag('latin')
lemmatizer = BackoffLatinLemmatizer()

sent_tokenizer = SentenceTokenizer(strict=True)

In [10]:
# def get_categories():
pos = {
    'n': 'noun',
    'v': 'verb',
    't': 'participle',
    'a': 'adjective',
    'd': 'adverb',
    'c': 'conjunction',
    'r': 'preposition',
    'p': 'pronoun',
    'm': 'numeral',
    'i': 'interjection',
    'e': 'exclamation',
    'u': 'punctuation'
}

person = {
    '1': 'first person',
    '2': 'second person',
    '3': 'third person'
}

number = {
    's': 'singular',
    'p': 'plural'
}

tense = {
    'p': 'present',
    'i': 'imperfect',
    'r': 'perfect',
    'l': 'pluperfect',
    't': 'future perfect',
    'f': 'future',
}

mood = {
    'i': 'indicative',
    's': 'subjunctive',
    'n': 'infinitive',
    'm': 'imperative',
    'p': 'participle',
    'd': 'gerund',
    'g': 'gerundive',
    'u': 'supine',
}

voice = {
    'a': 'active',
    'p': 'passive',
}

gender = {
    'm': 'masculine',
    'f': 'feminine',
    'n': 'neuter',
}

case = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'b': 'ablative',
    'v': 'vocative',
    'l': 'locative',
}

degree = {
    'c': 'comparative',
    's': 'superlative',
}

categories = {'pos': pos, 'person': person, 'number': number, 'tense': tense,
              'mood': mood, 'voice': voice, 'gender': gender, 'case': case,
              'degree': degree}
categories_names = {1: 'pos', 2: 'person', 3: 'number', 4: 'tense',
                    5: 'mood', 6: 'voice', 7: 'gender', 8: 'case',
                    9: 'degree'}
    
#     return categories, categories_names

In [11]:
def convert_analysis_to_dict(analysis, keep_empty=True):   
    dict_analysis = {}
    
    for i, cat_value_letter in enumerate(analysis, start=1):
        cat_name = categories_names[i]
        if cat_value_letter == '-':
            if keep_empty:
                dict_analysis[cat_name] = 'N/A'
            continue
        try:    
            cat_value_word = categories[cat_name][cat_value_letter.lower()]
            dict_analysis[cat_name] = cat_value_word
        except KeyError as k_e:
            print(k_e)
            print(cat_value_letter, analysis)
            
    return dict_analysis

In [12]:
def is_foreign_lang_in_sent(sent):
    # истинно для открывков вроде
    # h(\ to/sa fa/rmaka h)/dh, o(/sa tre/fei eu)rei=a xqw/n
    foreign_chars = set(['\\', '/', '|', '='])
#     foreign_chars_code = re.compile(r'\\\w+|\w\\|/\w|\w/|\||=')
#     return bool(re.search(foreign_chars_code, sent))
    return any(word in sent for word in foreign_chars)

Функция находит в предложении предлоги, употребляющиеся с аблативом, и стоящие после них слова. Если есть что-то необычное, например, слово не в аблативе, или не имеющее падежа, то оно возвращается.

Сперва сделаем разборы слов предложения, заодно они токенизируются. Все слова на время сохраним в `words_in_sent`. Если среди них нет ни одного нужного предлога, функция сразу вернёт `None`. Если предлоги есть, то идёт проход по всем словам, проверка каждого слова и если это нужный предлог, то проверяется следующее слово. Если предлог - последнее слово в предложении (вернее в клаузе), то результат обозначается специальным маркером `%END%`, в противном случае анализируется следующее слово.

In [13]:
def get_non_ablative_after_preposition(sent):   
    
    # pro вроде иногда с Acc
    abl_preps = set(('a', 'ab', 'de', 'cum', 'ex', 'e', 'sine', 'pro', 'prae'))
    results = {'sentence': None, 'strange_pairs': []}
    
    tagged_words =  tagger.tag_ngram_123_backoff(sent)
    words_number = len(tagged_words)
    
    words_in_sent, _ = zip(*tagged_words)
    if not any(prep in words_in_sent for prep in abl_preps):
        return None
    
    for i, (word, analysis) in enumerate(tagged_words):        
        if analysis is None:
            continue
        
        is_needed_prep = analysis[0] == 'R' and word in abl_preps
        if not is_needed_prep:
            continue
        
        verbose_analysis = convert_analysis_to_dict(analysis)
        verbose_analysis['word'] = word
        [(_, word_lemma)] = lemmatizer.lemmatize([word])
        verbose_analysis['lemma'] = word_lemma

        is_last = i == words_number - 1        
        if is_last:
            results['sentence'] = sent
            results['strange_pairs'].append(
                (verbose_analysis,
                 {cat_name: '%END%' for cat_name in list(categories.keys())+['word', 'lemma']}))
        else:
            next_word, next_word_analysis = tagged_words[i+1]
            if next_word_analysis is None:
                # может всё-таки брать такие слова?
                continue
                
            if next_word_analysis[8-1].lower() != 'b':
                next_word_verbose_analysis = convert_analysis_to_dict(
                    next_word_analysis)
                next_word_verbose_analysis['word'] = next_word
                [(_, next_word_lemma)] = lemmatizer.lemmatize([next_word])
                next_word_verbose_analysis['lemma'] = next_word_lemma
                
                results['sentence'] = sent
                results['strange_pairs'].append((verbose_analysis,
                                                next_word_verbose_analysis))
            
    if results['sentence'] is None:
        return None
    return results

In [14]:
def flatten_dict_into_str(iter_):
    if isinstance(iter_, str):
        yield iter_
    else:
        try:
            for i, obj in iter_.items():
                yield from flatten_dict_into_str(obj)
        except:
            try:
                for obj in iter_:
                    yield from flatten_dict_into_str(obj)
            except:
                yield iter_

Функция `grouper()` из [itertools recipes](https://docs.python.org/library/itertools.html#itertools-recipes)

> this is feeding the same iterator to `izip_longest` multiple times, causing it to consume successive values of the same sequence rather than striped values from separate sequences. [StackOverflow](https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks)

In [15]:
def grouper(iterable, chunk_size, fillvalue=None):
    args = [iter(iterable)] * chunk_size
    return zip_longest(*args, fillvalue=fillvalue)

In [16]:
def analyse_document(doc):
    analysis_for_doc = {'author': doc['author'], 'title': doc['originalTitle'],
                       'sentences': []}
    
    text = ' '.join([str(el) for el in flatten_dict_into_str(doc['text'])])
    sentences = sent_tokenizer.tokenize(text)
    
    ### chunking
    num_of_sent = len(sentences)
    chunk_size = 300
    if num_of_sent >= 2 * chunk_size:
        sentences = grouper(sentences, chunk_size)
    else:
        sentences = [sentences]
        
    tqdm.write(str(type(sentences)))
    
    for sent_chunk in tqdm(sentences):
        for sent in sent_chunk:
            # если в предложении вкрапления греческого, которые в корпусе 
            # записаны очень странно, то пропускаем это предложение
            # также пропускаем пустое предложение, которое могло появиться из-за
            # fillvalue в итераторе 
            if sent is None or is_foreign_lang_in_sent(sent):
                continue

            sent_data = get_non_ablative_after_preposition(sent)
            if sent_data is None:
                continue

            analysis_for_doc['sentences'].append(sent_data)
    
    if not analysis_for_doc['sentences']:
        return None
    
    return analysis_for_doc

    
#     for sent in tqdm(sentences):
#         # если в предложении вкрапления греческого, которые в корпусе 
#         # записаны очень странно, то пропускаем это предложение 
#         if is_foreign_lang_in_sent(sent):
#             continue
        
#         sent_data = get_non_ablative_after_preposition(sent)
#         if sent_data is None:
#             continue
   
#         analysis_for_doc['sentences'].append(sent_data)
    
#     return analysis_for_doc

Выше в качестве `docs` взят генератор. Это довольно быстро по времени, и экономнее по памяти. **NB**: характерные гребни на графике использования RAM когда выполняется цикл.

In [19]:
def get_data_from_x_docs(count_doc, start=0):
    # count = 3000
    # results = []
    # for sent in tqdm(perseus_sents[:count]):
    #     result = get_non_ablative_after_preposition(sent)
    #     if result is not None:
    #         results.append(result)

    results = []

#     for doc in tqdm(docs[:count_doc]):
    for doc in tqdm(islice(docs, start, count_doc)):
        result = analyse_document(doc)
        if result is not None:
            results.append(result)
            
    return results

In [30]:
docs = reader.docs()

In [31]:
# tracemalloc.start()
count_doc = 10

# time = datetime.now().strftime('%H.%M.%S-%d-%m')
# STATS_FILENAME = 'runstats.'.format(time)
# cProfile.run('get_data_from_x_docs(count_doc)', STATS_FILENAME) 
# results = get_data_from_x_docs(count_doc)

results = get_data_from_x_docs(count_doc)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

<class 'itertools.zip_longest'>


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


<class 'itertools.zip_longest'>


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'itertools.zip_longest'>


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


<class 'list'>


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))





In [None]:
# top_stats = snapshot.statistics('lineno')

In [None]:
# print("[ Top 10 ]")
# for stat in top_stats[:20]:
#     print(stat)

In [None]:
# p = pstats.Stats('runstats')
# p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats()

In [None]:
with open('res_2docs_perseus.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

In [32]:
type(results)

list

In [33]:
len(results)

7

In [42]:
len(results[6]['sentences'])

2

Сделать из списка словарей, каждый из которых описывает один документ, датафрейм. Каждой паре слов из словаря присвоить метаданные

In [43]:
def convert_dict_to_dataframe(results):
    author = results['author']
    title = results['title']
    entries_list = []
    
    for sentence_and_word_pairs_dict in results['sentences']:
        sentence = sentence_and_word_pairs_dict['sentence'].replace('\n', ' ')
        word_pairs = sentence_and_word_pairs_dict['strange_pairs']
        for prep_dict, second_word_dict in word_pairs:
            entry = {'prep': prep_dict['word'], **second_word_dict,
                     'sentence': sentence,
                     'author': author, 'title': title}
            entries_list.append(entry)
    
    
    df = pd.DataFrame(entries_list)
    cols = ['prep', 'word', 'lemma', 'pos', 'person', 'number', 'tense', 'mood',
            'voice', 'gender', 'case', 'degree', 'sentence', 'author', 'title']
    df = df[cols]  
    
    df.replace('N/A', np.nan, inplace=True)
    
    dtypes = {cat_name: CategoricalDtype(categories=cat_map.values())
              for cat_name, cat_map in categories.items()}
    dtypes['prep'] = 'category'
    
    df_new = df.astype(dtypes)
    
    return df_new

In [44]:
df_full = pd.concat(list(map(convert_dict_to_dataframe, results)))

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
by_prep = df.groupby('prep')

In [None]:
df.dtypes

In [None]:
by_pos = df[['pos', 'word', 'lemma', 'sentence']].groupby(['pos'])['word'].count()
by_pos

In [None]:
df[['pos', 'word', 'lemma', 'sentence']].groupby(['pos', 'word'])['word'].count()

In [45]:
df_full.describe()

Unnamed: 0,prep,word,lemma,pos,person,number,tense,mood,voice,gender,case,degree,sentence,author,title
count,409,409,409,403,6,339,20,20,20,333,332,0.0,409,409,409
unique,9,190,164,11,2,2,2,5,2,3,5,0.0,394,3,7
top,a,me,ego,noun,third person,singular,perfect,participle,passive,masculine,accusative,,Ergo postquam factus est imperator Zeno a fili...,apuleius,Rerum Gestarum
freq,86,40,40,136,4,264,12,11,10,149,99,,3,213,189


In [46]:
df_full[['pos', 'word', 'lemma', 'sentence']].groupby(['pos', 'word'])['word'].count()

pos          word         
noun         adulescentiae    1
             aetatis          2
             amico            1
             amnis            1
             anni             1
                             ..
numeral      septem           2
             tribus           3
exclamation  ma               1
punctuation  ,                2
             -                1
Name: word, Length: 186, dtype: int64

In [47]:
df_full[['prep', 'pos', 'word']].groupby(['prep', 'pos'])['word'].count()

prep  pos        
a     noun           24
      participle      8
      adjective       7
      adverb          5
      conjunction     3
      pronoun        34
      numeral         1
ab    noun           11
      participle      1
      adjective       1
      adverb          2
      pronoun        10
cum   noun           12
      verb            4
      adjective       6
      adverb          9
      preposition     8
      pronoun        11
      numeral         1
      punctuation     1
de    noun           27
      verb            3
      participle      1
      adjective      24
      adverb          5
      conjunction     1
      pronoun        24
      punctuation     1
e     noun            3
      adverb          1
      preposition     1
      pronoun         1
      numeral         3
ex    noun           18
      participle      2
      adjective      18
      adverb          2
      preposition     1
      pronoun        13
      numeral         1
prae  noun            

In [48]:
df_full[df_full['word'] == '%END%']

Unnamed: 0,prep,word,lemma,pos,person,number,tense,mood,voice,gender,case,degree,sentence,author,title
