# Сбор связей

In [None]:
!pip install pymorphy2-dicts-ru

In [None]:
!pip install levenshtein

In [None]:
from collections import Counter, defaultdict
from nlp import nlp
from dawg import CompletionDAWG
import torch
import stanza
import glob


def filter_deps(word):
    """Tell whether a dependent word should be collected.

    Args:
        word: a parsed word exposing a ``deprel`` attribute.

    Returns:
        True only for adjectival modifiers (``deprel == "amod"``).
    """
    # "appos" relations are currently not collected.
    return word.deprel == "amod"


def find_deps(words):
    """Pair every animate noun of a sentence with its collected dependents.

    Args:
        words: the words of one parsed sentence. Each word is read through
            its ``id``, ``head``, ``upos``, ``feats`` and ``deprel``
            attributes.

    Returns:
        A list of ``(noun, dependents)`` tuples in sentence order, where
        ``dependents`` are the words whose ``head`` is the noun's ``id`` and
        that pass ``filter_deps``. The dependents list may be empty.
    """
    deps = []

    for w in words:
        if w.upos != "NOUN":
            continue

        # `feats` is None for a word that carries no morphological features;
        # such a noun cannot be marked animate, so skip it instead of raising.
        if not w.feats or "Animacy=Anim" not in w.feats:
            continue

        dependents = [rw for rw in words if rw.head == w.id and filter_deps(rw)]
        deps.append((w, dependents))

    return deps


def format_dep(word):
    """Serialize a dependent word as ``lemma:deprel:upos``.

    Hyphens are stripped from the lemma before it is written into the key.
    """
    lemma = word.lemma.replace('-', '')
    return "{}:{}:{}".format(lemma, word.deprel, word.upos)


def collect_deps(all_deps, deps):
    """Accumulate one sentence's root/dependent pairs into ``all_deps``.

    Args:
        all_deps: mapping from a root lemma (hyphens removed) to a Counter of
            formatted dependents; updated in place.
        deps: iterable of ``(root, dependent_words)`` pairs.

    Roots without dependents are ignored. The dependents of a root are
    de-duplicated first, so each distinct dependent adds exactly one to the
    root's counter per call entry.
    """
    for root, dep_words in deps:
        if not dep_words:
            continue

        root_key = root.lemma.replace('-', '')
        all_deps[root_key] += Counter(set(map(format_dep, dep_words)))

    
def get_stanza_filenames():
    """Return a lazy iterator over every ``*.txt.stanza`` file under the corpus root.

    The search is recursive, starting at ``/mnt/data/proza_ru``.
    """
    pattern = '/mnt/data/proza_ru/**/*.txt.stanza'
    return glob.iglob(pattern, recursive=True)


def main():
    """Collect dependent counts for animate nouns across the serialized corpus.

    Every file from ``get_stanza_filenames`` is deserialized into a stanza
    document and each sentence is fed through ``find_deps`` and
    ``collect_deps``. A progress counter is printed every 1000 files.

    Any exception raised while loading or processing a file is swallowed;
    an ``rm <filename> &&`` line is printed for that file and the run
    continues with the next one.

    Returns:
        A ``defaultdict(Counter)`` filled by ``collect_deps``.
    """
    deps_counter = defaultdict(Counter)

    with torch.no_grad():
        for i, filename in enumerate(get_stanza_filenames()):
            if i % 1000 == 0:
                print(i)

            with open(filename, 'rb') as file:
                try:
                    serialized = file.read()
                    doc = stanza.Document.from_serialized(serialized)

                    for sent in doc.sentences:
                        sentence_deps = find_deps(sent.words)
                        collect_deps(deps_counter, sentence_deps)
                except Exception:
                    # Best effort: emit a shell snippet for the offending file.
                    print(f"rm {filename} &&")

    return deps_counter


# Disabled single-document experiment, kept for reference.
# NOTE(review): it calls `add_deps`, which is not defined in the visible code
# (the collector defined above is `collect_deps`) — confirm before reviving.
# deps = defaultdict(Counter)

# for sent in doc.sentences:
#     add_deps(deps, find_deps(sent.words))
    


# completion_dawg = CompletionDAWG([f"{root}:{dep}" for root in deps for dep in deps[root]])
# Run the full corpus pass; `rels` maps each root lemma to a Counter of its
# formatted dependents.
rels = main()

In [456]:
from morph import morph
from rapidfuzz.distance import JaroWinkler
from itertools import takewhile
import dawg


def is_known(word):
    """Check that ``word`` is a dictionary lemma of an accepted kind.

    The word qualifies when at least one parse satisfies all of:
    its normal form equals the word (with 'ё' folded to 'е' on both sides),
    the tag contains 'nomn', the parse is flagged as known, and the tag
    contains 'anim', 'ADJF' or 'PRTF'.
    """
    parses = morph.parse(word)
    folded_word = word.replace('ё', 'е')

    def qualifies(parse):
        if parse.normal_form.replace('ё', 'е') != folded_word:
            return False
        if 'nomn' not in parse.tag:
            return False
        if not parse.is_known:
            return False
        return 'anim' in parse.tag or 'ADJF' in parse.tag or "PRTF" in parse.tag

    return any(qualifies(parse) for parse in parses)


def normalize(word):
    """Return the normal form of ``word`` with 'ё' folded to 'е'.

    The normal form is taken from the first parse that is flagged as known
    and whose tag contains 'anim', 'ADJF' or 'PRTF'. When no parse matches,
    the word itself is returned (still with 'ё' folded to 'е').
    """
    matching_lemmas = (
        parse.normal_form
        for parse in morph.parse(word)
        if parse.is_known
        and ('anim' in parse.tag or 'ADJF' in parse.tag or "PRTF" in parse.tag)
    )
    lemma = next(matching_lemmas, word)
    return lemma.replace('ё', 'е')


# Удалить руты с очень маленьким количеством связей
# Удалить редкие депсы
def merge_unknown_words_to_known(rels):
    """Fold roots that fail ``is_known`` into dictionary roots, in place.

    Args:
        rels: mapping from a root lemma to a Counter of its dependents.
            Entries are removed from and merged within this mapping.

    Each non-dictionary root gets exactly one outcome, checked in this order:

    1. dropped when its highest dependent count is below 3;
    2. dropped when the root is shorter than 3 characters;
    3. merged into its ``normalize``d form when that form is a dictionary root;
    4. merged into ``root + 'а'`` when that word passes ``is_known``;
    5. otherwise merged into the most similar dictionary root when the
       Jaro-Winkler similarity is at least 0.96;
    6. otherwise left untouched.

    The sizes of the dictionary / non-dictionary partitions are printed at
    the start, and again after the first pass (steps 1-4).

    NOTE(review): merges use ``Counter |=``, which keeps the larger count per
    dependent rather than summing the two counts — confirm this is intended.
    """
    dict_roots = [root for root in rels if is_known(root)]
    # Set copy for O(1) membership tests; the list keeps the scan order.
    dict_root_set = set(dict_roots)
    non_dict_roots = [root for root in rels if root not in dict_root_set]

    print(len(dict_roots), len(non_dict_roots))

    # Drop rare, very short and normalizable unknown words. Survivors are
    # gathered in a new list instead of removing from the list being iterated,
    # and every branch ends the iteration so a root is never handled twice.
    remaining = []
    for ndr in non_dict_roots:
        vals = rels[ndr].values()

        if max(vals, default=0) < 3:
            print('Удаляем по весу ', ndr, vals)
            rels.pop(ndr)
            continue

        if len(ndr) < 3:
            print('Удаляем по длине ', ndr, vals)
            rels.pop(ndr)
            continue

        normalized_ndr = normalize(ndr)
        if normalized_ndr != ndr and normalized_ndr in dict_root_set:
            print('Сливаем normalized', ndr, normalized_ndr)
            rels[normalized_ndr] |= rels[ndr]
            rels.pop(ndr)
            continue

        ndr_a = f'{ndr}а'
        if is_known(ndr_a):
            print('Сливаем +a', ndr, ndr_a)
            rels[ndr_a] = rels.get(ndr_a, Counter())
            rels[ndr_a] |= rels[ndr]
            rels.pop(ndr)
            continue

        remaining.append(ndr)

    non_dict_roots = remaining

    # Merge near-duplicates into the closest dictionary root.
    for ndr in non_dict_roots:
        nearest_dr = None
        nearest_dr_sim = 0

        for dr in dict_roots:
            sim = JaroWinkler.similarity(ndr, dr)

            if sim < 0.96:
                continue

            # Strict comparison: the first root reaching the best score wins.
            if sim > nearest_dr_sim:
                nearest_dr_sim = sim
                nearest_dr = dr

        if nearest_dr:
            print('Сливаем близкие ', ndr, nearest_dr)
            rels[nearest_dr] |= rels[ndr]
            rels.pop(ndr)

    print(len(dict_roots), len(non_dict_roots))
        

def drop_rare_roots(rels, min_count=3):
    """Remove, in place, every root whose most frequent dependent is rare.

    Args:
        rels: mapping from a root to a Counter of its dependents.
        min_count: a root is kept only when at least one of its dependents
            has a count of ``min_count`` or more. Defaults to 3.

    A root with an empty Counter has no dependents at all; its peak is
    treated as 0, so it is dropped as well instead of raising ``ValueError``.
    One ``drop_rare_roots`` line is printed per removed root.
    """
    # Snapshot the peaks first so the mapping is not mutated while iterated.
    peaks = {
        root: max(counts.values(), default=0)
        for root, counts in rels.items()
    }

    for root, peak in peaks.items():
        if peak < min_count:
            print('drop_rare_roots ', root, peak)
            rels.pop(root)
        
        


# merge_unknown_words_to_known(rels)


def rels_to_seq(rels):
    """Yield ``root:lemma:deprel:upos`` keys for the dependents worth keeping.

    Args:
        rels: mapping from a root lemma to a Counter whose keys have the
            form ``lemma:deprel:upos``.

    Only roots accepted by ``is_known`` are emitted. For a root with more
    than two distinct dependents, a dependent must reach one tenth of the
    second most common count; otherwise every dependent passes the count
    check. Dependents whose first parse is tagged 'Apro' and dependents
    containing one of the excluded substrings are skipped. Keys are
    lower-cased with 'ё' folded to 'е'.
    """
    excluded_substrings = ('блуд', 'мертв', 'твой', 'сбит', 'убит')

    for root, deps in rels.items():
        if not is_known(root):
            continue

        if len(deps) > 2:
            runner_up_count = deps.most_common(2)[1][1]
            bound = runner_up_count / 10
        else:
            bound = 0

        root_key = root.lower().replace('ё', 'е')

        for dep, count in deps.items():
            dep_word, dep_rel, dep_pos = dep.split(":")

            # Skip pronominal adjectives.
            if 'Apro' in morph.parse(dep_word)[0].tag:
                continue

            if any(fragment in dep_word for fragment in excluded_substrings):
                continue

            if count >= bound:
                dep_key = dep.lower().replace('ё', 'е')
                yield f"{root_key}:{dep_key}"
                



# Build a completion DAWG over the "root:lemma:deprel:upos" keys produced by
# rels_to_seq, so that all dependents of a root can be listed by prefix.
completion_dawg = dawg.CompletionDAWG(rels_to_seq(rels))

# Persist the DAWG to disk.
completion_dawg.save('/mnt/data/adj.dawg')

In [442]:
# Scratch check: inspect the analyzer's parses for a participle form.
morph.parse('спятившая')

[Parse(word='спятившая', tag=OpencorporaTag('PRTF,perf,intr,past,actv femn,sing,nomn'), normal_form='спятить', score=1.0, methods_stack=((DictionaryAnalyzer(), 'спятившая', 2345, 22),))]

In [466]:
import random
from typing import List, Optional, Set, Tuple

import dawg

from morph import morph


def inflect(word: str, grs_variants: List[Set[str]]) -> Optional[str]:
    """Inflect ``word`` into the first grammeme set that some parse supports.

    Args:
        word: the word to inflect.
        grs_variants: grammeme sets to try, in priority order. For each set
            every parse of ``word`` is tried before moving to the next set.

    Returns:
        The inflected word form, or ``None`` when no parse can be inflected
        into any of the requested grammeme sets.
    """
    parsed = morph.parse(word)

    for grs in grs_variants:
        # A None entry (e.g. a missing `tag.gender` forwarded by the caller)
        # is not a grammeme and can never be satisfied; the analyzer rejects
        # unknown grammemes, so skip the variant instead of letting it raise.
        if None in grs:
            continue

        for p in parsed:
            inflected = p.inflect(grs)

            if inflected:
                return inflected.word

    return None


def get_dep(completion_dawg, root, seed=None):
    """Pick a random dependent of ``root`` and return it as an agreed phrase.

    Args:
        completion_dawg: object whose ``keys(prefix)`` returns the stored
            ``root:lemma:deprel:upos`` keys starting with ``prefix``.
        root: the root lemma to look up.
        seed: optional seed that makes the pick reproducible. Any value other
            than ``None`` is honoured, including ``0``.

    Returns:
        ``"<inflected dependent> <root>"`` for an ``amod`` dependent tagged
        ``adj`` or ``verb``, or ``None`` when the root has no keys, the
        picked key is of another kind, the root's gender or case is
        unavailable, or the dependent cannot be inflected to agree.
    """
    keys = completion_dawg.keys(f"{root}:")

    if not keys:
        return None

    # A private generator yields the same pick as seeding the module-level
    # RNG would, without disturbing global random state for other callers.
    rng = random if seed is None else random.Random(seed)
    root, dep, rel, pos = rng.choice(keys).split(":")

    if rel != 'amod' or pos not in ('adj', 'verb'):
        return None

    root_parsed = morph.parse(root)[0]
    gender = root_parsed.tag.gender
    case = root_parsed.tag.case

    # Without both gender and case there is nothing to agree with; passing
    # None on as a grammeme would make the inflection call fail.
    if gender is None or case is None:
        return None

    if pos == 'adj':
        grs_variants = [{"ADJF", gender, case}]
    else:
        # Participle forms, tried in order: present active, past passive,
        # past active.
        base_verb_grs = {'PRTF', 'sing', gender, case}
        grs_variants = [
            {'actv', 'pres'} | base_verb_grs,
            {'pssv', 'past'} | base_verb_grs,
            {'actv', 'past'} | base_verb_grs,
        ]

    inflected_adj = inflect(dep, grs_variants)

    if inflected_adj is None:
        return None

    return f"{inflected_adj} {root_parsed.word}"
    
# Manual sampling loop, left disabled:
# for i in range(0, 300):
#     print(get_dep(completion_dawg, 'внучка', i))

# NOTE(review): this asks for keys starting with ':', while the listing
# recorded below this cell starts with 'зомби:' — the output was presumably
# produced with a different prefix; confirm.
completion_dawg.keys(':')


['зомби:мерзкий:amod:adj',
 'зомби:настоящий:amod:adj',
 'зомби:новый:amod:adj',
 'зомби:первый:amod:adj',
 'зомби:послушный:amod:adj',
 'зомби:приближаться:amod:verb',
 'зомби:рыжий:amod:adj']

# Распознание сущностей