In [56]:
from dataclasses import dataclass
from doc_data.processor import read_data
from stanza.models.common.doc import Sentence, Word, Document

In [188]:
import pandas as pd

mvi = pd.read_csv("../data/mvi.csv")
mvi = mvi["lemma"]

In [189]:
doc_path = "../data/proc/Herodotus (0016) - Histories (001).pickle"

In [245]:
doc = read_data(doc_path)

In [235]:
from typing import Union, List

class LocalToken(Word):
    def __init__(self, stanza_word=None):
        if stanza_word:
            self.__dict__ = stanza_word.__dict__
            
    def get_head(self) -> Union[LocalToken, None]:
        if self.head == 0:
            return None
        else:
            return LocalToken(self.sent.words[self.head - 1])
        
    def get_children(self) -> Union[List[LocalToken], None]:
        children = []
        for token in self.sent.words:
            if token.head == self.id:
                children.append(LocalToken(token))
        if len(children) == 0:
            return None
        return children
    
@dataclass
class QueryHit:
    sentence: Sentence
    hit: LocalToken
    head: Union[None, LocalToken]
    children: List[LocalToken]
    
    
def lemma_main_query(doc: Document, constraint: List[str]) -> List[QueryHit]:
    query_hits = []

    for sent in doc.sentences:
        for token in sent.words:
            token = LocalToken(token)
            if token.lemma in list(mvi) and token.get_children() is not None:
                query_hits.append(QueryHit(sentence= token.sent, hit= token, head= token.get_head(), children= token.get_children()))
    return query_hits


def feature_sub_query(main_hits: List[QueryHit], constraint: List[str]) -> List[QueryHit]:
    query_hits = []
    
    for hit in main_hits:
        for child in hit.children:
            if child.feats is None:
                continue
            if any([feat in child.feats.split("|") for feat in constraint]):
                query_hits.append(QueryHit(sentence= child.sent, hit=child, head=child.get_head(), children = child.get_children()))
    return query_hits
    

In [236]:
q1_results = lemma_main_query(doc, list(mvi))

In [237]:
len(q1_results)

1378

In [238]:
q2_results = feature_sub_query(q1_results, ["Case=Gen", "Case=Dat"])

In [212]:
len(q2_results)

793

In [209]:
q3_results = feature_sub_query(q1_results, ["VerbForm=Inf"])

In [211]:
len(q3_results)

517

In [222]:
q2_sents = [x.sentence for x in q2_results]
q3_sents = [x.sentence for x in q3_results]
q4_sents = list(set(q2_sents).intersection(set(q3_sents)))

In [247]:
a = doc.sentences[0].words[1].__class__

In [249]:
a.__class__ = LocalToken

TypeError: __class__ assignment only supported for heap types or ModuleType subclasses