In [3]:
from dataclasses import dataclass
from doc_data.processor import read_data
from stanza.models.common.doc import Sentence, Word, Document

In [4]:
import pandas as pd

mvi = pd.read_csv("../data/mvi.csv")
mvi = mvi["lemma"]

In [5]:
doc_path = "../data/proc/Herodotus (0016) - Histories (001).pickle"

In [41]:
doc = read_data(doc_path)

In [38]:
def author_text_from_path(path: str):
    import os
    path = os.path.basename(path)
    author, text = path.replace(".pickle", "").split(" - ")
    return author, text

In [40]:
from typing import Union, List

class LocalSentence(Sentence):    
    def __init__(self, author, text, stanza_sentence=None):
        if stanza_sentence:
            self.__dict__ = stanza_sentence.__dict__
        self.author = author
        self.text = text

class LocalToken(Word):
    def __init__(self, stanza_word=None):
        if stanza_word:
            self.__dict__ = stanza_word.__dict__
            
    def get_head(self) -> Union["LocalToken", None]:
        if self.head == 0:
            return None
        else:
            return LocalToken(self.sent.words[self.head - 1])
        
    def get_children(self) -> Union[List["LocalToken"], None]:
        children = []
        for token in self.sent.words:
            if token.head == self.id:
                children.append(LocalToken(token))
        if len(children) == 0:
            return None
        return children
    
@dataclass
class QueryHit:
    sentence: Sentence
    hit: LocalToken
    head: Union[None, LocalToken]
    children: List[LocalToken]
    
    
def lemma_main_query(doc: Document, constraint: List[str]) -> List[QueryHit]:
    query_hits = []

    for sent in doc.sentences:
        for token in sent.words:
            token = LocalToken(token)
            if token.lemma in list(mvi) and token.get_children() is not None:
                query_hits.append(QueryHit(sentence= token.sent, hit= token, head= token.get_head(), children= token.get_children()))
    return query_hits


def feature_sub_query(main_hits: List[QueryHit], constraint: List[str]) -> List[QueryHit]:
    query_hits = []
    
    for hit in main_hits:
        for child in hit.children:
            if child.feats is None:
                continue
            if any([feat in child.feats.split("|") for feat in constraint]):
                query_hits.append(QueryHit(sentence= child.sent, hit=child, head=child.get_head(), children = child.get_children()))
    return query_hits
    

In [42]:
author, text = author_text_from_path(doc_path)
for sent in doc.sentences:
    sent = LocalSentence(author=author, text=text)

In [44]:
doc.sentences[0]

[
  {
    "id": 1,
    "text": "Ἡροδότου",
    "lemma": "Ἡροδότης",
    "upos": "NOUN",
    "xpos": "n-s---mg-",
    "feats": "Case=Gen|Gender=Masc|Number=Sing",
    "head": 4,
    "deprel": "nmod",
    "start_char": 0,
    "end_char": 8
  },
  {
    "id": 2,
    "text": "Ἁλικαρνησσέος",
    "lemma": "Ἁλικαρνησσής",
    "upos": "NOUN",
    "xpos": "n-s---mg-",
    "feats": "Case=Gen|Gender=Masc|Number=Sing",
    "head": 3,
    "deprel": "nmod",
    "start_char": 9,
    "end_char": 22
  },
  {
    "id": 3,
    "text": "ἱστορίης",
    "lemma": "ἱστορία",
    "upos": "NOUN",
    "xpos": "n-s---fg-",
    "feats": "Case=Gen|Gender=Fem|Number=Sing",
    "head": 4,
    "deprel": "nmod",
    "start_char": 23,
    "end_char": 31
  },
  {
    "id": 4,
    "text": "ἀπόδεξις",
    "lemma": "ἀπόδεξις",
    "upos": "NOUN",
    "xpos": "n-s---fn-",
    "feats": "Case=Nom|Gender=Fem|Number=Sing",
    "head": 0,
    "deprel": "root",
    "start_char": 32,
    "end_char": 40
  },
  {
    "id": 5,
    "t

In [10]:
q1_results = lemma_main_query(doc, list(mvi))

In [11]:
len(q1_results)

1378

In [12]:
q2_results = feature_sub_query(q1_results, ["Case=Gen", "Case=Dat"])

In [13]:
len(q2_results)

793

In [15]:
q3_results = feature_sub_query(q1_results, ["VerbForm=Inf"])

In [16]:
len(q3_results)

517

In [18]:
q2_sents = [x.sentence for x in q2_results]
q3_sents = [x.sentence for x in q3_results]
q4_sents = list(set(q2_sents).intersection(set(q3_sents)))
len(q4_sents)

229

In [20]:
print(q4_sents[0].text)

ἔνθα δὴ Μεγακρέοντος ἀνδρὸς Ἀβδηρίτεω ἔπος εὖ εἰρημένον ἐγένετο , ὃς συνεβούλευσε Ἀβδηρίτῃσι πανδημεί , αὐτοὺς καὶ γυναῖκας , ἐλθόντας ἐς τὰ σφέτερα ἱρὰ ἵζεσθαι ἱκέτας τῶν θεῶν παραιτεομένους καὶ τὸ λοιπόν σφι ἀπαμύνειν τῶν ἐπιόντων κακῶν τὰ ἡμίσεα , τῶν τε παροιχομένων ἔχειν σφι μεγάλην χάριν , ὅτι βασιλεὺς Ξέρξης οὐ δὶς ἑκάστης ἡμέρης ἐνόμισε σῖτον αἱρέεσθαι ·
