In [3]:
import csv
import re
from functools import partial
from bs4 import BeautifulSoup
from xml.dom import minidom

In [4]:
def read_csv(filename: str, encoding='cp1251'):
    """Read a comma-separated file and return its rows as lists of strings.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding used to decode the file (defaults to ``cp1251``).

    Returns:
        A list containing one list of ``str`` cells per CSV record.
    """
    # newline='' is what the csv module requires for file objects: without it,
    # universal-newline translation rewrites line breaks that are embedded in
    # quoted fields before the reader ever sees them.
    with open(filename, "r", encoding=encoding, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        return [[str(cell) for cell in row] for row in reader]

In [62]:
def normalize_str(s):
    """Return ``s`` lower-cased, with every "ё" replaced by "е"."""
    lowered = s.lower()
    return lowered.replace("ё", "е")

class Morphem:
    """A dictionary entry: a normal form, a class label and a set of word forms.

    Attributes:
        normal_form: The normal form, passed through ``normalize_str``.
        clazz: The class label, stored exactly as given.
        variations: Set of word forms, each passed through ``normalize_str``;
            it always contains the normal form the entry was created with.
    """

    def __init__(self, normal, clazz, variations):
        self.normal_form = normalize_str(normal)
        self.clazz = clazz
        self.variations = set(map(normalize_str, variations))
        # Store the *normalized* normal form.  Adding the raw ``normal`` left a
        # key such as "ёж" in the set, which normalized look-ups ("еж") can
        # never match, so the entry was unreachable through its own lemma.
        self.variations.add(self.normal_form)

    def upd_norm(self, new_norm):
        """Replace the normal form with normalized ``new_norm``.

        ``variations`` is left unchanged.
        """
        self.normal_form = normalize_str(new_norm)
        
        
def odict_to_morph(t: str):
    """Translate a tag from the odict data into one of the coarse class labels.

    Args:
        t: The tag to translate.

    Returns:
        One of "S", "V", "A", "CONJ", "PR", "ADV" or "NI".

    Raises:
        ValueError: If ``t`` is not one of the recognised tags.
    """
    # Checked top to bottom; the first group containing the tag wins.
    groups = (
        ("S", ('мо', 'жо', 'мо-жо', 'м', 'ж', 'мн.', 'с', 'со')),
        ("V", ('св', 'св-нсв', 'нсв')),
        ("A", ('п', 'числ.-п', 'мс-п')),
        ("CONJ", ('союз',)),
        ("PR", ('предл.',)),
        ("ADV", ('н', 'межд.', 'част.', 'предик.', 'вводн.', 'сравн.')),
        ("NI", ('числ.', 'мс')),
    )
    for label, tags in groups:
        if t in tags:
            return label
    raise ValueError("Unknown type {}".format(t))
    
# Tags accepted when reading the annotated corpus; tokens whose first tag is
# not listed here are skipped.  Every entry is also handled by opco_to_morph.
opco_known = [
    'NOUN',
    'ADJF', 'ADJS', 'COMP',
    'GRND', 'INFN', 'VERB', 'PRTF', 'PRTS',
    'PREP',
    'CONJ',
    'ADVB', 'INTJ', 'PRCL',
    'NPRO', 'PRED', 'NUMR',
]
    
    
def opco_to_morph(t: str):
    """Translate a tag from the opcorpora data into one of the coarse class labels.

    Args:
        t: The tag to translate.

    Returns:
        One of "S", "A", "V", "PR", "CONJ", "ADV" or "NI".

    Raises:
        ValueError: If ``t`` is not one of the recognised tags.
    """
    table = [
        (('NOUN',), "S"),
        (('ADJF', 'ADJS', 'COMP'), "A"),
        (('GRND', 'INFN', 'VERB', 'PRTF', 'PRTS'), "V"),
        (('PREP',), "PR"),
        (('CONJ',), "CONJ"),
        (('ADVB', 'INTJ', 'PRCL'), "ADV"),
        (('NPRO', 'PRED', 'NUMR'), "NI"),
    ]
    matches = [label for tags, label in table if t in tags]
    if not matches:
        raise ValueError("Unknown type {}".format(t))
    return matches[0]

In [48]:
# Global list of dictionary entries; later cells append to it and build the
# word-form lookup from it.
morphems = []

# Each odict.csv row is read as: normal form, tag, then the remaining word forms.
# odict_to_morph raises ValueError on an unknown tag, so this loop still
# validates every row even though its result is currently discarded.
for line in read_csv('odict.csv'):
    new_morphem = Morphem(line[0], odict_to_morph(line[1]), line[2:])
    # NOTE(review): the append below is disabled, so no odict entry ends up in
    # `morphems` -- confirm whether that is intentional.
    # morphems.append(new_morphem)

In [49]:
class OpCorpElement(Morphem):
    """A ``Morphem`` that additionally remembers the id it was created with."""

    def __init__(self, eid, normal_form, clazz, variations):
        # Let the base class set normal_form / clazz / variations first.
        Morphem.__init__(self, normal_form, clazz, variations)
        self.eid = eid


def find_word_after(aft, pos, s):
    """Extract the double-quoted value that follows ``aft`` in ``s``.

    The search for ``aft`` immediately followed by a double quote starts at
    index ``pos``; the value runs up to the next double quote.

    Returns:
        ``(end, value)`` where ``end`` is the index of the closing quote, or
        ``(-1, "")`` when either the marker or the closing quote is missing.
    """
    marker = aft + '"'
    try:
        start = s.index(marker, pos) + len(marker)
        stop = s.index('"', start)
    except ValueError:
        return -1, ""
    return stop, s[start:stop]


# Parse the dictionary dump line by line.
#   cnt    - number of lemma lines seen
#   links  - (from id, to id) -> link type, taken from <link .../> lines
#   opcorp - lemma id -> OpCorpElement for every lemma that was kept
# Kept lemmas are also appended to the global `morphems`, so re-running this
# cell without resetting `morphems` appends the same entries again.
cnt = 0
links = {}

opcorp = {}
# The encoding is spelled out so the Cyrillic word forms do not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the dump is UTF-8 -- confirm against its XML declaration.
with open('dict.opcorpora.xml', 'r', encoding='utf-8') as file:
    for line in file:
        is_lemma = "<lemma " in line
        # Same marker (with trailing space) for filtering and for handling, so
        # only real <link .../> elements are treated as links.
        is_link = "<link " in line

        if is_lemma:
            pos, eid = find_word_after('<lemma id=', 0, line)
            # The first <g v="..."> after the lemma id is used as the class tag.
            pos, clazz = find_word_after('<g v=', pos, line)

            # Collect distinct, non-empty <f t="..."> forms in order of
            # appearance; `seen` keeps the duplicate check constant-time.
            variants = []
            seen = set()
            while True:
                pos, word = find_word_after("<f t=", pos, line)
                if pos == -1:
                    break
                if word and word not in seen:
                    seen.add(word)
                    variants.append(word)

            cnt += 1
            if cnt < 50:
                # Preview of the first lemmas as a sanity check.
                print(eid, clazz, variants[:3])

            # The first form serves as the normal form, the rest as variations.
            new_element = OpCorpElement(eid, variants[0], opco_to_morph(clazz), variants[1:])

            # Entries of class "S" whose normal form is a single character are dropped.
            if len(new_element.normal_form) > 1 or new_element.clazz != "S":
                opcorp[eid] = new_element
                morphems.append(new_element)

        if is_link:
            _, fr = find_word_after("from=", 0, line)
            _, to = find_word_after("to=", 0, line)
            _, tp = find_word_after("type=", 0, line)
            links[(fr, to)] = tp

print(cnt)

1 NOUN ['ёж', 'ежа', 'ежу']
2 NOUN ['ёж', 'ежа', 'ежу']
3 NOUN ['ёжик', 'ёжика', 'ёжику']
4 ADVB ['ёжиком']
5 ADJF ['ёжистый', 'ёжистого', 'ёжистому']
6 ADJS ['ёжист', 'ёжиста', 'ёжисто']
7 COMP ['ёжистее', 'ёжистей', 'поёжистее']
8 VERB ['ёжу', 'ёжим', 'ёжишь']
9 INFN ['ёжить']
10 PRTF ['ёжимый', 'ёжимого', 'ёжимому']
11 PRTS ['ёжим', 'ёжима', 'ёжимо']
12 GRND ['ёжа', 'ёжив', 'ёживши']
13 VERB ['ёжусь', 'ёжимся', 'ёжишься']
14 INFN ['ёжиться']
15 PRTF ['ёжащийся', 'ёжащегося', 'ёжащемуся']
16 PRTF ['ёжившийся', 'ёжившегося', 'ёжившемуся']
17 GRND ['ёжась', 'ёжившись']
18 VERB ['ёкнул', 'ёкнула', 'ёкнуло']
19 INFN ['ёкнуть']
20 PRTF ['ёкнувший', 'ёкнувшего', 'ёкнувшему']
21 GRND ['ёкнув', 'ёкнувши']
22 NOUN ['ёлка', 'ёлки', 'ёлке']
23 NOUN ['ёлочка', 'ёлочки', 'ёлочке']
24 ADJF ['ёлочный', 'ёлочного', 'ёлочному']
27 ADJF ['ёмкий', 'ёмкого', 'ёмкому']
28 ADJS ['ёмок', 'ёмка', 'ёмко']
29 COMP ['ёмче', 'поёмче']
30 ADJF ['ёмкостный', 'ёмкостного', 'ёмкостному']
31 ADJS ['ёмкостен', 'ёмкос

In [50]:
# Walk the links ordered by their type string and, whenever both ends were kept
# in `opcorp`, give the target lemma the source lemma's normal form.
for (src, dst), _link_type in sorted(links.items(), key=lambda entry: entry[1]):
    if src in opcorp and dst in opcorp:
        opcorp[dst].upd_norm(opcorp[src].normal_form)

In [79]:
# Corpus statistics:
#   normalized token text -> {class label: (occurrence count, [lemma, ...])}
# The list holds one normalized lemma per occurrence.
stats = {}
# The encoding is spelled out so the Cyrillic text does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the corpus dump is UTF-8 -- confirm against its XML declaration.
with open('annot.opcorpora.xml', 'r', encoding='utf-8') as file:
    for line in file:
        if "<token " not in line:
            continue

        _, orig_word = find_word_after("text=", 0, line)
        orig_word = normalize_str(orig_word)

        # The lemma and its first tag are read from the first <l ...> element.
        pos = line.find("<l ")
        assert pos != -1
        _, lemma = find_word_after("t=", pos, line)
        lemma = normalize_str(lemma)
        _, grammema = find_word_after("<g v=", pos, line)
        if grammema not in opco_known:
            continue
        clazz = opco_to_morph(grammema)

        # Keep the (count, lemmas) tuple shape: lemmatize_word indexes it with
        # [0] / [1] and compares whole tuples with max().
        per_class = stats.setdefault(orig_word, {})
        count, lemmas = per_class.get(clazz, (0, []))
        lemmas.append(lemma)
        per_class[clazz] = (count + 1, lemmas)
            

In [80]:
# Sanity check: number of distinct word forms, then the first three entries.
print(len(stats), list(stats.items())[:3], sep='\n')

141304
[('школа', {'S': (89, ['школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа', 'школа'])}), ('злословия', {'S': (12, ['злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злословие', 'злослови

In [51]:
def create_dict(morphems):
    """Index entries by each of their word forms.

    Args:
        morphems: Iterable of objects exposing an iterable ``variations``.

    Returns:
        A dict mapping every word form to the list of entries that contain it,
        in order of first appearance and without repeating an entry.
    """
    index = {}
    for entry in morphems:
        for form in entry.variations:
            bucket = index.setdefault(form, [])
            if entry in bucket:
                continue
            bucket.append(entry)
    return index

In [None]:
# Word form -> list of entries; lemmatize_word reads this as a module-level global.
dictionary = create_dict(morphems)

In [88]:
# Class labels in the order lemmatize_word prefers them when a word has
# dictionary entries but no corpus statistics.  "NI" comes last so that words
# whose only entries map to "NI" (the NPRO / PRED / NUMR tags) still resolve
# instead of leaving the lookup without any matching class.
priority = ['CONJ', 'PR', "S", "V", "A", "ADV", "NI"]

def most_frequent(List):
    """Return the most common element of ``List``.

    When several elements share the highest count, the one that appears first
    in ``List`` is returned.  An empty ``List`` yields ``''``.
    """
    tally = {}
    for element in List:
        tally[element] = tally.get(element, 0) + 1
    if not tally:
        return ''
    # Dict keys keep first-appearance order and max() returns the first
    # maximal key, which gives the earliest-appearing element on ties.
    return max(tally, key=tally.get)
  
def lemmatize_word(word: str):
    """Pick a lemma and a class label for a single word.

    The word is normalized with ``normalize_str`` and resolved against the
    module-level ``dictionary`` (word form -> entries) and ``stats``
    (word form -> {class: (count, [lemmas])}):

    * If the word has corpus statistics, the class with the largest
      ``(count, lemmas)`` tuple wins.  The lemma is the most frequent
      dictionary normal form of that class or, when the dictionary has no
      entry of that class, the most frequent lemma recorded in the statistics.
    * Otherwise, if the word is in the dictionary, the first class listed in
      ``priority`` that has an entry wins; if none is listed, the class of the
      first dictionary entry is used.
    * Otherwise the normalized word itself is returned with class ``"S"``.

    A warning is printed when the winning class accounts for no more than half
    of the recorded occurrences.

    Args:
        word: The word to lemmatize.

    Returns:
        A ``(lemma, class)`` tuple.
    """
    word = normalize_str(word)

    # (normal form, class) for every dictionary entry of this word form; empty
    # for unknown words, which also keeps the warning below printable (the
    # original referenced an unassigned local there).
    candidates = [(entry.normal_form, entry.clazz) for entry in dictionary.get(word, [])]

    lemmas_by_class = {}
    for normal_form, clazz in candidates:
        lemmas_by_class.setdefault(clazz, []).append(normal_form)

    if word in stats:
        word_stats = stats[word]
        best = max(word_stats.items(), key=lambda x: x[1])[0]
        total = sum(count for count, _ in word_stats.values())
        if word_stats[best][0] * 2 <= total:
            print("Warning! Unsure about word: {}, options: {}, stats: {}".format(word, candidates, word_stats))
        # Prefer dictionary normal forms; fall back to the lemmas seen in the corpus.
        if best in lemmas_by_class:
            return most_frequent(lemmas_by_class[best]), best
        return most_frequent(word_stats[best][1]), best

    if lemmas_by_class:
        for preferred in priority:
            if preferred in lemmas_by_class:
                chosen = preferred
                break
        else:
            # No class from `priority` matched: use the first dictionary class
            # instead of failing with KeyError on a None key.
            chosen = next(iter(lemmas_by_class))
        return most_frequent(lemmas_by_class[chosen]), chosen

    return word, "S"

In [90]:
# Lemmatize dataset.txt line by line and write one output line per input line,
# each token rendered as "word{lemma=class} ".
# NOTE(review): both files are opened as UTF-8 so the result does not depend on
# the platform locale -- confirm dataset.txt really is UTF-8.
with open('dataset.txt', 'r', encoding='utf-8') as file, \
        open('answer.txt', 'w', encoding='utf-8') as out:
    for line in file:
        # Raw-string character class: splits on space , . ? ! and newline,
        # without the invalid "\." / "\?" escapes of a non-raw literal.
        tokens = [token for token in re.split(r'[ ,.?!\n]', line) if token]
        for word in tokens:
            lemmatized, clazz = lemmatize_word(word)
            out.write("{}{{{}={}}} ".format(word, lemmatized, clazz))
        out.write('\n')

