In [1]:
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.tag.pos import POSTag
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.tokenize.latin.sentence import SentenceTokenizer
from tqdm.auto import tqdm

In [27]:
import json
from pprint import pprint
import pandas as pd

In [4]:
corpus_importer = CorpusImporter('latin')

corpus_importer.import_corpus('latin_models_cltk')

corpus_importer.import_corpus('latin_text_perseus')

reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')

In [5]:
docs = list(reader.docs())

In [None]:
perseus_sents = list(reader.sents())

len(perseus_sents)

In [None]:
with open('out.json', 'w', encoding='utf-8') as f:
    json.dump(docs, f, ensure_ascii=False, indent=4)

In [6]:
tagger = POSTag('latin')
lemmatizer = BackoffLatinLemmatizer()

In [7]:
sent_tokenizer = SentenceTokenizer(strict=True)

In [8]:
tagger.tag_ngram_123_backoff('Gallia est omnis divisa in partes tres')

[('Gallia', None),
 ('est', 'V3SPIA---'),
 ('omnis', 'A-S---MN-'),
 ('divisa', 'T-PRPPNN-'),
 ('in', 'R--------'),
 ('partes', 'N-P---FA-'),
 ('tres', 'M--------')]

Функция находит в предложении предлоги, употребляющиеся с аблативом, и стоящие после них слова. Если есть что-то необычное, например, слово не в аблативе, или не имеющее падежа, то оно возвращается.

1: 	part of speech
 
 	n	noun
 	v	verb
 	t	participle
 	a	adjective
 	d	adverb
 	c	conjunction
 	r	preposition
 	p	pronoun
 	m	numeral
 	i	interjection
 	e	exclamation
 	u	punctuation

In [24]:
def convert_analysis_to_dict(analysis, keep_empty=True):

    pos = {
        'n': 'noun',
        'v': 'verb',
        't': 'participle',
        'a': 'adjective',
        'd': 'adverb',
        'c': 'conjunction',
        'r': 'preposition',
        'p': 'pronoun',
        'm': 'numeral',
        'i': 'interjection',
        'e': 'exclamation',
        'u': 'punctuation'
    }
    
    person = {
        '1': 'first person',
        '2': 'second person',
        '3': 'third person'
    }
    
    number = {
        's': 'singular',
        'p': 'plural'
    }
    
    tense = {
        'p': 'present',
        'i': 'imperfect',
        'r': 'perfect',
        'l': 'pluperfect',
        't': 'future perfect',
        'f': 'future',
    }
    
    mood = {
        'i': 'indicative',
        's': 'subjunctive',
        'n': 'infinitive',
        'm': 'imperative',
        'p': 'participle',
        'd': 'gerund',
        'g': 'gerundive',
        'u': 'supine',
    }
    
    voice = {
        'a': 'active',
        'p': 'passive',
    }
    
    gender = {
        'm': 'masculine',
        'f': 'feminine',
        'n': 'neuter',
    }
    
    case = {
        'n': 'nominative',
        'g': 'genitive',
        'd': 'dative',
        'a': 'accusative',
        'b': 'ablative',
        'v': 'vocative',
        'l': 'locative',
    }
    
    degree = {
        'c': 'comparative',
        's': 'superlative',
    }
    
    categories = {1: pos, 2: person, 3: number, 4: tense, 5:mood,
                  6: voice, 7: gender, 8: case, 9: degree}
    categories_names = {1: 'pos', 2: 'person', 3: 'number', 4: 'tense',
                        5: 'mood', 6: 'voice', 7: 'gender', 8: 'case',
                        9: 'degree'}
    
    
    dict_analysis = {}
    
    for i, cat_value_letter in enumerate(analysis, start=1):
        cat_name = categories_names[i]
        if cat_value_letter == '-':
            if keep_empty:
                dict_analysis[cat_name] = 'N/A'
            continue
        try:    
            cat_value_word = categories[i][cat_value_letter.lower()]
            dict_analysis[cat_name] = cat_value_word
        except KeyError as k_e:
            print(k_e)
            print(cat_value_letter, analysis)
            
    return dict_analysis

In [10]:
def get_non_ablative_after_preposition(sent):
    # pro вроде иногда с Acc
    abl_preps = ('a', 'ab', 'de', 'cum', 'ex', 'e', 'sine', 'pro', 'prae')
    results = {'sentence': None, 'strange_pairs': []}
    
    tagged_words =  tagger.tag_ngram_123_backoff(sent)
    words_number = len(tagged_words)
    
    for i, (word, analysis) in enumerate(tagged_words):        
        if analysis is None:# or analysis == 'None':
            continue
        
        is_needed_prep = analysis[0] == 'R' and word in abl_preps
        if not is_needed_prep:
            continue
        
#         word_with_verbose_analysis = convert_analysis_to_dict(word, analysis)
        verbose_analysis = convert_analysis_to_dict(analysis)
        verbose_analysis['word'] = word
        word_lemma = lemmatizer.lemmatize([word])[0][1]
        verbose_analysis['lemma'] = word_lemma

        is_last = i == words_number - 1        
        if is_last:
            results['sentence'] = sent
            results['strange_pairs'].append(
                (verbose_analysis, ('%END%', '%END')))
        else:
            next_word, next_word_analysis = tagged_words[i+1]
            if next_word_analysis is None:
                continue
                
            if next_word_analysis[8-1].lower() != 'b':
                next_word_verbose_analysis = convert_analysis_to_dict(
                    next_word_analysis)
                next_word_verbose_analysis['word'] = next_word
                next_word_lemma = lemmatizer.lemmatize([next_word])[0][1]
                next_word_verbose_analysis['lemma'] = next_word_lemma
                
                results['sentence'] = sent
                results['strange_pairs'].append((verbose_analysis,
                                                next_word_verbose_analysis))
            
    if results['sentence'] is None:
        return None
    return results

In [13]:
def flatten_dict_into_str(iter_):
    if isinstance(iter_, str):
        yield iter_
    else:
        try:
            for i, obj in iter_.items():
                yield from flatten_dict_into_str(obj)
        except:
            try:
                for obj in iter_:
                    yield from flatten_dict_into_str(obj)
            except:
                yield iter_

In [11]:
def analyse_document(doc):
    analysis_for_doc = {'author': doc['author'], 'title': doc['originalTitle'],
                       'sentences': []}
    
    text = ' '.join(map(str, flatten_dict_into_str(doc['text'])))
    
    sentences = sent_tokenizer.tokenize(text)
    for sent in tqdm(sentences):
        sent_data = get_non_ablative_after_preposition(sent)
        if sent_data is None:
            continue
            
        analysis_for_doc['sentences'].append(sent_data)
    
    return analysis_for_doc

In [23]:
# count = 3000

# results = []


# for sent in tqdm(perseus_sents[:count]):
#     result = get_non_ablative_after_preposition(sent)
#     if result is not None:
#         results.append(result)

count_doc = 2

results = []

for doc in tqdm(docs[:count_doc]):
    result = analyse_document(doc)
    if result is not None:
        results.append(result)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5006.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1315.0), HTML(value='')))

'1'
1 V1SPIA---
'3'
3 V3SPSA---




In [25]:
with open('res_2docs_perseus.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

In [21]:
len(results[0]['sentences'])

184

Сделать из списка словарей, каждый из которых описывает один документ, датафрейм. Каждой паре слов из словаря присвоить метаданные

In [35]:
def convert_dict_to_dataframe(results):
    author = results['author']
    title = results['title']
    entries_list = []
    
    for sentence_and_word_pairs_dict in results['sentences']:
        sentence = sentence_and_word_pairs_dict['sentence'].replace('\n', ' ')
        word_pairs = sentence_and_word_pairs_dict['strange_pairs']
        for prep_dict, second_word_dict in word_pairs:
            entry = {'prep': prep_dict['word'], **second_word_dict,
                     'sentence': sentence,
                     'author': author, 'title': title}
            entries_list.append(entry)
    
    return pd.DataFrame(entries_list)

In [37]:
df = convert_dict_to_dataframe(results[0])

In [40]:
df.describe()

Unnamed: 0,prep,pos,person,number,tense,mood,voice,gender,case,degree,word,lemma,sentence,author,title
count,189,189,189.0,189,189.0,189.0,189.0,189,189,189.0,189,189,189,189,189
unique,9,9,1.0,3,3.0,4.0,3.0,4,6,1.0,101,89,184,1,1
top,ex,noun,,singular,,,,feminine,genitive,,se,sui,Ergo postquam factus est imperator Zeno a fili...,ammianus marcellinus.,Rerum Gestarum
freq,36,73,189.0,124,176.0,176.0,176.0,73,54,189.0,13,15,3,189,189
