In [1]:
import csv, codecs
import xml.etree.ElementTree as ET

form2lemma = {}
map_tags = {"мн." : "S", "со" : "S", "м" : "S", "ж" : "S", "жо" : "S", "мо" : "S", "мо-жо" : "S", "с" : "S", "п" : "A",
            "числ.-п" : "A", "мс-п" : "A", "нсв" : "V", "св" : "V", "св-нсв" : "V", "предл." : "PR", "союз" : "CONJ", "сравн." : "ADV",
            "н" : "ADV", "вводн." : "ADV", "част." : "ADV", "межд." : "ADV", "предик." : "NI", "числ." : "NI", "мест." : "NI",
            "VERB" : "V", "UNKN" : "NI", "PREP" : "PR", "ADJS" : "A", "ADJF" : "A", "NOUN" : "S", "NPRO" : "NI",
            "PRCL" : "ADV", "ADVB" : "ADV", "CONJ" : "CONJ", "INFN" : "V", "GRND" : "V", "PRTS" : "V", "PRTF" : "V",
            "COMP" : "A", "INTJ" : "ADV", "NUMR" : "NI", "PRED" : "NI", "Prnt" : "ADV"}
freq = {}
freq_lemmas = {}
tags_in_freq = {}

odict_path = 'odict.csv'
opcorpora_annot_path = "annot.opcorpora.no_ambig.nonmod.xml"
opcorpora_dict_path = "dict.opcorpora.xml"

In [2]:
opcorpora_dict_tree = ET.parse(opcorpora_dict_path)
opcorpora_dict_root = opcorpora_dict_tree.getroot()

In [3]:
def build_form2lemma_from_odict(odict_path):
    with open(odict_path, encoding="cp1251") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            n = len(row)
            for i in range(n):
                if i == 1:
                    continue
                if row[i] == "":
                    continue
                if row[i] not in form2lemma:
                    form2lemma[row[i]] = []
                if (row[0], row[1]) not in form2lemma[row[i]]:
                    form2lemma[row[i]].append((row[0], row[1]))
            line_count += 1

        print(f'Processed {line_count} lines.')
        print(form2lemma["было"])
        #print(form2lemma["о"])
        #print(form2lemma["обезболивание"])
        #print(form2lemma["времени"])

In [4]:
def build_for2lemma_from_opcorpora(opcorpora_dict_path):
    #f = codecs.open(opcorpora_dict_path, 'r')
    #u = f.read()
    #print(u[0:80000])

    lemmata = opcorpora_dict_root[2]
    for lemma in lemmata:
        lem = lemma[0].attrib['t']
        tag = lemma[0][0].attrib['v']

        if tag == "PNCT" or tag == "LATN" or tag == "SYMB" or tag == "NUMB" or tag == "ROMN":
            continue

        mapped_tag = map_tags[tag]

        for child in lemma:
            if child.tag == "l":
                continue
            word = child.attrib['t']
            if word not in form2lemma:
                form2lemma[word] = []
            # search for mapped_tag in form2lemma:
            exist = False
            for elem in form2lemma[word]:
                if elem[1] == mapped_tag:
                    exist = True
                    break

            if not exist:
                form2lemma[word].append((lem, mapped_tag))

    #print(form2lemma["ежа"])


In [5]:
def form2lemma_map_tags():
    for elem in form2lemma:
        num = len(form2lemma[elem])
        for i in range(num):
            if form2lemma[elem][i][1] not in map_tags:
                print(elem)
                print(form2lemma[elem][i][0])
                print(form2lemma[elem][i][1])
                print("________")
                continue
            form2lemma[elem][i] = (form2lemma[elem][i][0], map_tags[form2lemma[elem][i][1]])

    print(form2lemma["конец"])

In [6]:
def build_freq(opcorpora_annot_path):
    #f = codecs.open(opcorpora_annot_path, 'r')
    #u = f.read()
    #print(u[0:5000])

    tree = ET.parse(opcorpora_annot_path)
    root = tree.getroot()

    #print(root.tag)

    for text in root:
        paragraphs = text[1]
        for paragraph in paragraphs:
            # print(paragraph.tag, paragraph.attrib)
            for sentence in paragraph:
                # print(sentence.tag, sentence.attrib)
                tokens = sentence[1]
                for token in tokens:
                    word = token.attrib['text']
                    lemma = token[0][0][0].attrib['t']
                    tag = token[0][0][0][0].attrib['v']

                    if word == "почти" or word == "Почти":
                        print("EXIST")

                    if tag == "PNCT" or tag == "LATN" or tag == "SYMB" or tag == "NUMB" or tag == "ROMN":
                        continue

                    if tag not in map_tags:
                        print(token.attrib['text'])
                        print(token[0][0][0].attrib['t'])
                        print(token[0][0][0][0].attrib['v'])
                        print("_________________________________")
                        continue

                    mapped_tag = map_tags[tag]

                    if (word, mapped_tag) not in freq:
                        freq[(word, mapped_tag)] = (0, lemma)
                    freq[(word, mapped_tag)] = (freq[(word, mapped_tag)][0] + 1, lemma)

                    if (lemma, mapped_tag) not in freq_lemmas:
                        freq_lemmas[(lemma, mapped_tag)] = 0
                    freq_lemmas[(lemma, mapped_tag)] = freq_lemmas[(lemma, mapped_tag)] + 1

                    if word not in tags_in_freq:
                        tags_in_freq[word] = []
                    if mapped_tag not in tags_in_freq[word]:
                        tags_in_freq[word].append(mapped_tag)

    print(freq[("решения", "S")])
    print(tags_in_freq["сбор"])

In [7]:
def get_sentences_tokens(dataset_path):
    dataset_file = open(dataset_path, "r")
    raw_text = dataset_file.read()
    sentences = raw_text.split("\n")
    sentences_num = len(sentences)

    print(sentences)

    sentences_tokens = []

    for i in range(sentences_num):
        if sentences[i] == "":
            continue
        sentences[i] = sentences[i].replace(".", "").replace(",", "").replace("?", "").replace("!", "")
        sentences_tokens.append(sentences[i].split(" "))

    print(sentences_tokens)
    return sentences_tokens

In [8]:
def print_ans_odict_only(sentences_tokens):
    lemmatized_sentences = []

    for s in sentences_tokens:
        res = []
        num = -1
        for token in s:
            num += 1
            if token not in form2lemma:
                right_case_token = token.lower()
            else:
                right_case_token = token

            if right_case_token not in form2lemma:
                if token.isupper():
                    res.append(token + "{" + token + "=" + "S" + "}")
                elif token[0].isupper() and num != 0:
                    res.append(token + "{" + token + "=" + "S" + "}")
                elif num == 0:
                    res.append(token + "{" + token.lower() + "=" + "S" + "}")
                else:
                    res.append(token + "{" + token + "=" + "S" + "}")
                continue

            if len(form2lemma[right_case_token]) == 1:
                if form2lemma[right_case_token][0][1] in map_tags:
                    res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags[
                        form2lemma[right_case_token][0][1]] + "}")
                else:
                    res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + "NI" + "}")
            else:
                added = False

                for pair in form2lemma[right_case_token]:
                    if pair[1] == "союз":
                        res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags["союз"] + "}")
                        added = True
                        break
                    if pair[1] == "предл.":
                        res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags["предл."] + "}")
                        added = True
                        break
                    if pair[1] == "нсв" or pair[1] == "св":
                        res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags["нсв"] + "}")
                        added = True
                        break
                    if pair[1] == "п":
                        res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags["п"] + "}")
                        added = True
                        break

                if added:
                    continue

                if form2lemma[right_case_token][0][1] not in map_tags:
                    print(token)
                    res.append(token)
                else:
                    res.append(token + "{" + form2lemma[right_case_token][0][0] + "=" + map_tags[
                        form2lemma[right_case_token][0][1]] + "}")

        lemmatized_sentences.append(res)

    for lst in lemmatized_sentences:
        ans = " "
        ans = ans.join(lst)
        print(ans)

In [9]:
def get_most_freq_pair_by_infn(word):
    max_freq = 0
    best_ans = None
    for pair in form2lemma[word]:
        lemma = pair[0]
        tag = pair[1]

        if (lemma, tag) not in freq_lemmas:
            continue
        if freq_lemmas[(lemma, tag)] > max_freq:
            max_freq = freq_lemmas[(lemma, tag)]
            best_ans = (lemma, tag)
    return best_ans

def get_most_freq_pair(word):
    if word not in tags_in_freq:
        return get_most_freq_pair_by_infn(word)
    max_freq = 0
    best_ans = None
    for tag in tags_in_freq[word]:
        cur_freq, lemma = freq[(word, tag)]
        if cur_freq > max_freq:
            max_freq = cur_freq
            best_ans = (lemma, tag)
    return best_ans

In [29]:
def right_verb_lemma(verb, lemma):
    if verb.endswith("ся") or verb.endswith("сь"):
        if not lemma.endswith("ся"):
            lemma += "ся"
    return lemma

In [88]:
endings = {"ый" : "ый", "ого" : "ый", "ому" : "ый", "ым" : "ый", "ом" : "ый", 
           "ая" : "ый", "ой" : "ый", "ую" : "ый", "ое" : "ый", "ые" : "ый", "ых" : "ый", "ым" : "ый", 
           "ыми" : "ый", "ых" : "ый", "ий" : "ий", "его" : "ий", "ему" : "ий", "им" : "ий", "ем" : "ий", 
           "яя" : "ий", "ей" : "ий", "юю" : "ий", "ее" : "ий", "ие" : "ий", "их" : "ий", "им" : "ий", "ими" : "ий"}

def if_adj(word):
    if len(word) >= 2 and word[-2:] in endings:
        return True
    if len(word) >= 3 and word[-3:] in endings:
        return True
    return False

def get_adj_lemma(word):
    if len(word) >= 2 and word[-2:] in endings:
        word = word[:-2] + endings[word[-2:]]
    elif len(word) >= 3 and word[-3:] in endings:
        word = word[:-3] + endings[word[-3:]]
    return word

In [92]:
def find_lemma(word, tag, default):
    lst = form2lemma[word]
    for elem in lst:
        if elem[1] == tag:
            return elem[0]
    return default

def print_ans_dict_and_freq(sentences_tokens):
    lemmatized_sentences = []

    for s in sentences_tokens:
        res = []
        num = -1
        for token in s:
            num += 1
            if token not in form2lemma:
                right_case_token = token.lower()
            else:
                right_case_token = token

            if right_case_token not in form2lemma:
                print(f"Not in dict: {token}")
                if token[0].isupper() and num != 0:
                    res.append(token + "{" + token + "=" + "S" + "}")
                else:    
                    if token in tags_in_freq:
                        best_pair = get_most_freq_pair(token)
                        lemma = best_pair[0]
                        tag = best_pair[1]
                        if tag == "V":
                            lemma = right_verb_lemma(token, lemma)
                            
                        res.append(token + "{" + lemma + "=" + tag + "}")
                        continue
                    
                    lower_case_token = token.lower()
                    if lower_case_token in tags_in_freq:
                        best_pair = get_most_freq_pair(lower_case_token)
                        lemma = best_pair[0]
                        tag = best_pair[1]
                        if tag == "V":
                            lemma = right_verb_lemma(lower_case_token, lemma)
                            
                        res.append(token + "{" + lemma + "=" + tag + "}")
                        continue
                        
                    if if_adj(lower_case_token):
                        res.append(token + "{" + get_adj_lemma(lower_case_token) + "=" + "A" + "}")
                        print(get_adj_lemma(lower_case_token))
                        continue

                    res.append(token + "{" + lower_case_token + "=" + "S" + "}")
                continue

            if len(form2lemma[right_case_token]) == 1:
                lemma = form2lemma[right_case_token][0][0]
                tag = form2lemma[right_case_token][0][1]
                if tag == "V":
                    lemma = right_verb_lemma(right_case_token, lemma)
                
                res.append(token + "{" + lemma + "=" + tag + "}")
            else:
                best_pair = get_most_freq_pair(right_case_token)
                if best_pair != None:
                    right_lemma = find_lemma(right_case_token, best_pair[1], best_pair[0])
                    
                    lemma = right_lemma
                    tag = best_pair[1]
                    if tag == "V":
                        lemma = right_verb_lemma(right_case_token, lemma)
                    
                    res.append(token + "{" + lemma + "=" + tag + "}")
                    continue

                added = False
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "CONJ":
                        res.append(token + "{" + pair[0] + "=" + "CONJ" + "}")
                        added = True
                        break
                if added:
                    continue
                        
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "PR":
                        res.append(token + "{" + pair[0] + "=" + "PR" + "}")
                        added = True
                        break
                if added:
                    continue        
                
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "V":
                        lemma = pair[0]
                        lemma = right_verb_lemma(right_case_token, lemma)
                        
                        res.append(token + "{" + lemma + "=" + "V" + "}")
                        added = True
                        break
                if added:
                    continue
                
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "A":
                        res.append(token + "{" + pair[0] + "=" + "A" + "}")
                        added = True
                        break
                if added:
                    continue
                
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "ADV":
                        res.append(token + "{" + pair[0] + "=" + "ADV" + "}")
                        added = True
                        break
                if added:
                    continue
                    
                for pair in form2lemma[right_case_token]:
                    if pair[1] == "S":
                        res.append(token + "{" + pair[0] + "=" + "S" + "}")
                        added = True
                        break
                if added:
                    continue

                # choose random lemma (first lemma)
                lemma = form2lemma[right_case_token][0][0]
                tag = form2lemma[right_case_token][0][1] 
                if tag == "V":
                    lemma = right_verb_lemma(right_case_token, lemma)
                res.append(token + "{" + lemma + "=" + tag + "}")

        lemmatized_sentences.append(res)

    for lst in lemmatized_sentences:
        ans = " "
        ans = ans.join(lst)
        print(ans)

In [11]:
#f = codecs.open(odict_path, 'r', 'cp1251')
#u = f.read()

#ind = u.find(",н,")
#print(u[0:1000])

build_form2lemma_from_odict(odict_path)
form2lemma_map_tags()

build_for2lemma_from_opcorpora(opcorpora_dict_path)

build_freq(opcorpora_annot_path)

#print(form2lemma["почти"])
#print(tags_in_freq["почти"])

Processed 102498 lines.
[('было', 'част.')]
[('конец', 'S')]
(56, 'решение')
['S']


In [64]:
form2lemma["быть"] = [("быть", "V")]
form2lemma["был"] = [("быть", "V")]
form2lemma["была"] = [("быть", "V")]
form2lemma["были"] = [("быть", "V")]
form2lemma["было"] = [("быть", "V")]
form2lemma["буду"] = [("быть", "V")]
form2lemma["будет"] = [("быть", "V")]
form2lemma["будут"] = [("быть", "V")]

In [104]:
dataset_path = "dataset_37845_55.txt"
#dataset_path = "test.txt"

sentences_tokens = get_sentences_tokens(dataset_path)

print_ans_dict_and_freq(sentences_tokens)

['И это не удивительно, поскольку на самом деле это не настоящие лепестки, а разросшиеся чашелистики.', 'Ряд предложений Н. Я. Виленкина требует дополнительных разъяснений и уточнений.', 'После сырого, холодящего ноги, северного склона приятно снова ступать по сухим, мягким листьям.', 'Возможности мультимедийных технологий так и остаются до сих пор потенциальными, их освоение и использование для дидактических целей идёт медленно и едва ли будет идти быстрее, чем освоение возможностей преждевременно скончавшегося обычного печатного слова.', 'Изучая его указания, сопровождавшиеся иногда мелкими набросками, нельзя не удивляться самостоятельности его градостроительной концепции.', 'Они вошли в лес, и сразу ещё сильнее запахло смолой и хвоей.', 'Я ответил, что тут всё стихи, только переведены они прозой.', 'Разумеется, ему пока трудно это делать, но главное, чтобы у малыша вырабатывалось стремление к чистоте.', 'Кваша сразу понял, что это про него, и не раздумывая сделал шаг вперёд.', 'Таки

Not in dict: партнерство
Not in dict: ОНЭКСИМу
Not in dict: Кайоль
Not in dict: фруской
фрускый
Not in dict: оглядываний
оглядываний
Not in dict: безусловнорефлекторные
безусловнорефлекторный
Not in dict: ОПВ
Not in dict: флораты
Not in dict: Терехово
Not in dict: Гисена
Not in dict: Вийона
Not in dict: Шкап
Not in dict: мамалыжной
мамалыжный
Not in dict: радиоастроном
радиоастроный
Not in dict: Дрэйк
Not in dict: неоинституциональной
неоинституциональный
Not in dict: ребенка
Not in dict: Матрену
Not in dict: ОЮЛ
Not in dict: сопом
сопый
Not in dict: безотчетно
Not in dict: полуразвалившегося
Not in dict: Шамординского
Not in dict: старлей
старлий
Not in dict: Калигула
Not in dict: лоцируемого
лоцируемый
И{и=CONJ} это{это=NI} не{не=ADV} удивительно{удивительный=A} поскольку{поскольку=CONJ} на{на=PR} самом{самый=A} деле{дело=S} это{это=NI} не{не=ADV} настоящие{настоящий=A} лепестки{лепесток=S} а{а=CONJ} разросшиеся{разрастисься=V} чашелистики{чашелистик=S}
Ряд{ряд=S} предложений{предлож