# Сбор связей

In [4]:
!pip install pymorphy2 pymorphy2-dicts-ru

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.5 MB/s eta 0:00:011
[?25hCollecting pymorphy2-dicts-ru
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 3.2 MB/s eta 0:00:01
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=d510b76f0c13d73b88b18bfcd462f5c157173082f4e8b686e7c1b423e9a51ecf
  Stored in directory: /home/kukuruku/.cache/pip/wheels/56/ea/58/ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c
Successfully built docopt
Installing collected packages: pymorphy2-dicts-ru, dawg-python, docopt, pymorphy2
Successfully inst

In [None]:
!pip install levenshtein

# Сбор зависимостей

In [35]:
from collections import Counter
from nlp import nlp
import torch
import stanza
import glob
from morph import morph
from functools import lru_cache


def get_stanza_filenames():
    """Lazily iterate over the paths of ``*.stanza`` files under ``/mnt/data``.

    The search is recursive (``**`` with ``recursive=True``).  The value
    returned is the iterator produced by ``glob.iglob``.
    """
    pattern = '/mnt/data/**/*.stanza'
    matches = glob.iglob(pattern, recursive=True)
    return matches

@lru_cache(2048)
def normalize(word, pos):
    """Return the normal form of ``word`` for the requested grammeme ``pos``.

    Hyphens are removed from ``word`` before it is handed to ``morph.parse``.
    The first parse that is flagged ``is_known`` and whose tag contains
    ``pos`` wins, and its ``normal_form`` is returned.

    If no parse qualifies, the (hyphen-free) word and ``pos`` are printed and
    the sentinel string ``'unknown'`` is returned.

    Results are memoised with ``lru_cache`` (up to 2048 entries).

    NOTE(review): a later cell of this notebook defines another ``normalize``
    that takes a single argument; once that cell runs, this definition is
    shadowed.
    """
    dehyphenated = word.replace('-', '')

    qualifying = (
        candidate
        for candidate in morph.parse(dehyphenated)
        if candidate.is_known and pos in candidate.tag
    )
    first_hit = next(qualifying, None)

    if first_hit is None:
        # Report the word that could not be normalised.
        print(dehyphenated, pos)
        return 'unknown'

    return first_hit.normal_form


def collect_rels(rels, words):
    """Add the noun -> adjectival-modifier relations of one sentence to ``rels``.

    Args:
        rels: dict mapping a normalised noun to a set of normalised
            adjectives; it is updated in place.
        words: re-iterable collection of word objects exposing ``id``,
            ``head``, ``text``, ``upos`` and ``deprel``.
            (assumes ``head``/``id`` are hashable, e.g. ints - TODO confirm)

    A relation is recorded for every word with ``upos == "NOUN"`` that has at
    least one dependent with ``deprel == "amod"`` and ``upos == "ADJ"``.
    Words that ``normalize`` cannot resolve (it returns ``'unknown'``) are
    dropped on both sides, so no ``'unknown'`` key is ever created.
    """
    # One pass over the sentence: bucket adjectival modifiers by their head id
    # instead of rescanning every word for every noun.
    modifiers_by_head = {}
    for word in words:
        if word.deprel == "amod" and word.upos == "ADJ":
            modifiers_by_head.setdefault(word.head, []).append(word)

    for head in words:
        if head.upos != "NOUN":
            continue

        modifiers = modifiers_by_head.get(head.id)
        if not modifiers:
            continue

        deps = {normalize(word.text, "ADJF") for word in modifiers} - {'unknown'}
        if not deps:
            continue

        rel_key = normalize(head.text, "NOUN")
        if rel_key == 'unknown':
            # Unresolved nouns must not be merged under one shared key.
            continue

        rels.setdefault(rel_key, set()).update(deps)


def process_all():
    """Aggregate noun -> adjective counts over all serialized stanza documents.

    Every file yielded by ``get_stanza_filenames()`` is read, deserialized
    with ``stanza.Document.from_serialized`` and scanned sentence by sentence
    with ``collect_rels``.  Within one document each (noun, adjective) pair
    counts once, because ``collect_rels`` gathers the adjectives into sets;
    the per-document sets are then summed into one ``Counter`` per noun.

    A file that cannot be opened, deserialized or scanned is reported
    together with its name and skipped; it contributes nothing to the result.

    Progress (the index of the current file) is printed for every tenth file.

    Returns:
        dict mapping a normalised noun to a ``Counter`` of normalised
        adjectives.
    """
    counters = {}

    # Kept from the original code; no torch computation is visible in this
    # function, so the context manager is presumably not required.
    with torch.no_grad():
        for i, filename in enumerate(get_stanza_filenames()):
            if i % 10 == 0:
                print(i)

            try:
                # NOTE(review): from_serialized presumably unpickles its
                # input - only feed it files from a trusted source.
                with open(filename, 'rb') as file:
                    doc = stanza.Document.from_serialized(file.read())

                rels = {}
                for sent in doc.sentences:
                    collect_rels(rels, sent.words)
            except Exception as e:
                # Best effort: name the offending file and carry on.
                print(f'{filename}: {e}')
                continue

            for rel_key, deps in rels.items():
                counters.setdefault(rel_key, Counter()).update(deps)

    return counters


# Run the full collection pass and keep its result (one counter per key).
rels = process_all()

# Display the number of distinct keys that were collected.
len(rels)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
Could not create new Document from serialised string.
1030
Could not create new Document from serialised string.
1040
1050


28686

In [66]:
# Spot-check: display the counter stored for one example key.
rels['секс']

Counter({'необязательный': 1,
         'добрачный': 1,
         'случайный': 1,
         'хороший': 1,
         'интимный': 1,
         'сам': 1})

In [41]:
from morph import morph
from rapidfuzz.distance import JaroWinkler
from itertools import takewhile
import dawg


def is_known(word):
    """Tell whether ``word`` is itself an accepted dictionary base form.

    Returns ``True`` when at least one parse from ``morph.parse(word)``
    satisfies all of the following, and ``False`` otherwise:

    * its ``normal_form`` equals ``word`` (with 'ё' treated as 'е' on both
      sides),
    * its tag contains ``nomn``,
    * it is flagged ``is_known``,
    * its tag contains ``anim``, ``ADJF`` or ``PRTF``.
    """
    word_without_yo = word.replace('ё', 'е')
    accepted_markers = ('anim', 'ADJF', "PRTF")

    return any(
        candidate.normal_form.replace('ё', 'е') == word_without_yo
        and 'nomn' in candidate.tag
        and candidate.is_known
        and any(marker in candidate.tag for marker in accepted_markers)
        for candidate in morph.parse(word)
    )


def normalize(word):
    """Return a base form of ``word`` with every 'ё' replaced by 'е'.

    The base form is the ``normal_form`` of the first parse that is flagged
    ``is_known`` and whose tag contains ``anim``, ``ADJF`` or ``PRTF``.
    When no parse qualifies, ``word`` itself is used.

    NOTE(review): an earlier cell of this notebook defines ``normalize`` with
    a ``(word, pos)`` signature; running this cell rebinds the name.
    """
    accepted_markers = ('anim', 'ADJF', "PRTF")

    eligible = (
        candidate
        for candidate in morph.parse(word)
        if candidate.is_known
        and any(marker in candidate.tag for marker in accepted_markers)
    )
    first_hit = next(eligible, None)

    base_form = word if first_hit is None else first_hit.normal_form
    return base_form.replace('ё', 'е')


# Remove roots with a very small number of relations
# Remove rare deps
def merge_unknown_words_to_known(rels, min_weight=3, min_length=3, min_similarity=0.96):
    """Fold non-dictionary roots of ``rels`` into dictionary roots, in place.

    Args:
        rels: dict mapping a root to a ``Counter`` of its deps; mutated.
        min_weight: a non-dictionary root whose largest dep count is below
            this value is deleted.
        min_length: a non-dictionary root shorter than this is deleted.
        min_similarity: minimum ``JaroWinkler.similarity`` for a
            non-dictionary root to be merged into its closest dictionary root.

    Roots are split into dictionary roots (``is_known(root)``) and the rest.
    Each non-dictionary root is handled by the FIRST rule that applies:

    1. too rare (see ``min_weight``)                      -> deleted;
    2. too short (see ``min_length``)                     -> deleted;
    3. ``normalize(root)`` is a different dictionary root -> merged into it;
    4. ``root + 'а'`` passes ``is_known``                 -> merged into that
       word (the key is created if missing);
    5. otherwise the closest dictionary root with similarity of at least
       ``min_similarity`` absorbs it, if there is one.

    Merging uses ``Counter``'s ``|=`` (per-dep maximum), as the original did.
    NOTE(review): if counts are meant to be added up, this should be ``+=``.

    The number of dictionary / non-dictionary roots is printed before and
    after processing, and every deletion or merge is printed as well.
    """
    dict_roots = [root for root in rels if is_known(root)]
    dict_root_set = set(dict_roots)  # O(1) membership tests
    non_dict_roots = [root for root in rels if root not in dict_root_set]

    print(len(dict_roots), len(non_dict_roots))

    # Stage 1: delete rare / very short unknown words and merge the ones that
    # can be normalised.  Survivors are collected in a new list, so the list
    # being iterated is never modified, and each root is handled by exactly
    # one rule.
    survivors = []

    for ndr in non_dict_roots:
        vals = rels[ndr].values()

        if max(vals, default=0) < min_weight:
            print('Удаляем по весу ', ndr, vals)
            rels.pop(ndr)
            continue

        if len(ndr) < min_length:
            print('Удаляем по длине ', ndr, vals)
            rels.pop(ndr)
            continue

        normalized_ndr = normalize(ndr)
        if normalized_ndr != ndr and normalized_ndr in dict_root_set:
            print('Сливаем normalized', ndr, normalized_ndr)
            merged = rels.pop(ndr)
            rels[normalized_ndr] |= merged
            continue

        ndr_a = f'{ndr}а'
        if is_known(ndr_a):
            print('Сливаем +a', ndr, ndr_a)
            merged = rels.pop(ndr)
            rels[ndr_a] = rels.get(ndr_a, Counter()) | merged
            continue

        survivors.append(ndr)

    # Stage 2: merge each remaining unknown word into its most similar
    # dictionary root, provided the similarity is high enough.
    unmerged = []

    for ndr in survivors:
        nearest_dr = None
        nearest_dr_sim = 0

        for dr in dict_roots:
            sim = JaroWinkler.similarity(ndr, dr)

            if sim >= min_similarity and sim > nearest_dr_sim:
                nearest_dr_sim = sim
                nearest_dr = dr

        if nearest_dr is None:
            unmerged.append(ndr)
            continue

        print('Сливаем близкие ', ndr, nearest_dr)
        merged = rels.pop(ndr)
        rels[nearest_dr] |= merged

    non_dict_roots = unmerged

    print(len(dict_roots), len(non_dict_roots))
        

def drop_rare_roots(rels, min_count=3):
    """Delete, in place, every root whose most frequent dep is too rare.

    Args:
        rels: dict mapping a root to a ``Counter`` of its deps; mutated.
        min_count: roots whose largest dep count is below this value are
            removed.  A root with no deps at all counts as 0 and is removed
            too (instead of making ``max`` fail).

    Each removed root is printed together with its largest count.
    """
    # Snapshot first: the dict must not change size while it is iterated.
    top_counts = [(root, max(deps.values(), default=0)) for root, deps in rels.items()]

    for root, top in top_counts:
        if top < min_count:
            print('drop_rare_roots ', root, top)
            rels.pop(root)
        
        


# merge_unknown_words_to_known(rels)


def rels_to_seq(rels):
    """Yield ``"root:dep"`` strings for the relations worth keeping.

    Only roots accepted by ``is_known`` are considered.  Every dep is
    expected to have the form ``"word:rel:pos"`` (exactly three
    colon-separated parts; anything else raises ``ValueError``).

    For a root with more than two deps, a dep must occur at least one tenth
    as often as the second most common dep; otherwise the bound is 0.

    A dep is skipped when the first parse of its word carries ``Apro`` in its
    tag, or when the word contains one of a fixed list of substrings.

    Both root and dep are lower-cased and 'ё' is replaced with 'е' in the
    emitted string.
    """
    blocked_fragments = ('блуд', 'мертв', 'твой', 'сбит', 'убит')

    for root, deps in rels.items():
        if not is_known(root):
            continue

        if len(deps) > 2:
            runner_up_count = deps.most_common(2)[1][1]
            bound = runner_up_count / 10
        else:
            bound = 0

        for dep, weight in deps.items():
            dep_word, _dep_rel, _dep_pos = dep.split(":")

            # pronoun
            if 'Apro' in morph.parse(dep_word)[0].tag:
                continue

            if any(fragment in dep_word for fragment in blocked_fragments):
                continue

            if weight < bound:
                continue

            yield f"{root.lower().replace('ё', 'е')}:{dep.lower().replace('ё', 'е')}"
                



# Build a completion DAWG from the strings generated by rels_to_seq(rels).
completion_dawg = dawg.CompletionDAWG(rels_to_seq(rels))

# Save the DAWG to disk (hard-coded path).
completion_dawg.save('/mnt/data/adj.dawg')

ModuleNotFoundError: No module named 'rapidfuzz'

In [442]:
# Spot-check: display the parses returned for one example word.
morph.parse('спятившая')

[Parse(word='спятившая', tag=OpencorporaTag('PRTF,perf,intr,past,actv femn,sing,nomn'), normal_form='спятить', score=1.0, methods_stack=((DictionaryAnalyzer(), 'спятившая', 2345, 22),))]

In [466]:
import random
from typing import List, Optional, Set, Tuple

import dawg

from morph import morph


def inflect(word: str, grs_variants: List[Set[str]]) -> Optional[str]:
    """Inflect ``word`` into the first grammeme set that can be realised.

    Args:
        word: the word to inflect; it is analysed with ``morph.parse``.
        grs_variants: grammeme sets in order of preference.

    Returns:
        The ``word`` attribute of the first truthy result of
        ``parse.inflect(grs)``.  Variants are tried in the given order and,
        within one variant, parses in the order ``morph.parse`` returned them.
        ``None`` when no combination yields a result.
    """
    parses = morph.parse(word)

    for grammemes in grs_variants:
        for parse in parses:
            candidate = parse.inflect(grammemes)

            if candidate:
                return candidate.word

    return None


def get_dep(completion_dawg, root, seed=None):
    """Pick a random stored relation for ``root`` and render it as a phrase.

    Args:
        completion_dawg: object whose ``keys(prefix)`` returns the stored
            ``"root:dep:rel:pos"`` strings for that prefix.
        root: the root word; keys are looked up with the prefix ``"<root>:"``.
        seed: when not ``None`` (0 included), the global ``random`` generator
            is re-seeded with it before the choice, which makes the pick
            reproducible.

    Returns:
        ``"<inflected dep> <root>"`` for an ``amod`` relation whose part of
        speech is ``adj`` or ``verb``, where the dep is inflected to agree
        with the gender and case of the root's first parse (verbs are turned
        into a singular participle: active present, then passive past, then
        active past).  ``None`` if there are no keys for the root, the chosen
        relation is of another kind, or the dep cannot be inflected.
    """
    keys = completion_dawg.keys(f"{root}:")

    if not keys:
        return None

    # `is not None` rather than truthiness, so that seed=0 is honoured.
    if seed is not None:
        random.seed(seed)

    root, dep, rel, pos = random.choice(keys).split(":")

    if rel != 'amod' or pos not in ('adj', 'verb'):
        return None

    root_parsed = morph.parse(root)[0]
    gender = root_parsed.tag.gender
    case = root_parsed.tag.case

    if pos == 'adj':
        grs_variants = [{"ADJF", gender, case}]
    else:
        base_verb_grs = {'PRTF', 'sing', gender, case}
        grs_variants = [
            {'actv', 'pres'} | base_verb_grs,
            {'pssv', 'past'} | base_verb_grs,
            {'actv', 'past'} | base_verb_grs,
        ]

    inflected_adj = inflect(dep, grs_variants)

    if inflected_adj is None:
        return None

    return f"{inflected_adj} {root_parsed.word}"
    
# for i in range(0, 300):
#     print(get_dep(completion_dawg, 'внучка', i))

# Query the DAWG with the prefix ':'.
# NOTE(review): the recorded output below lists 'зомби:...' keys, which do not
# start with ':' - it presumably comes from a different query; confirm.
completion_dawg.keys(':')


['зомби:мерзкий:amod:adj',
 'зомби:настоящий:amod:adj',
 'зомби:новый:amod:adj',
 'зомби:первый:amod:adj',
 'зомби:послушный:amod:adj',
 'зомби:приближаться:amod:verb',
 'зомби:рыжий:amod:adj']

# Распознавание сущностей