In [11]:
import codecs
import spacy
from spacy import displacy
import pandas as pd
from pymorphy3 import MorphAnalyzer
import itertools
import unittest
import re

In [2]:
# Read the source table and derive a working copy that carries two extra,
# initially empty string columns for the masked sentences.
original_dataset = pd.read_csv("test_dataset.csv")
new_dataset = original_dataset.assign(
    without_object_base='',
    without_object_polypers='',
)

In [None]:
# Claim the GPU before the pipeline is loaded.
# NOTE(review): per the spaCy docs require_gpu() raises when no GPU is usable — confirm this is intended.
spacy.require_gpu()

# Pipeline components that are switched off when the model is loaded.
disabled_comps = [
    'lemmatizer',
    'ner',
    'entity_linker',
    'trf_data',
    'textcat',
]
model = spacy.load("ru_core_news_lg", disable=disabled_comps)

# `morph(word)` is the bound `parse` method of a single shared MorphAnalyzer.
morph = MorphAnalyzer().parse

class Sentence:
    """A base sentence and its `poly` variant, parsed so that direct objects can be masked.

    Both strings are parsed with the module-level spaCy pipeline `model`.
    Tokens of the base sentence whose dependency label is 'obj' (plus
    'conj'/'parataxis' tokens below such an object and adjective/participle
    modifiers of them) are replaced by the literal word 'MASK' in the output
    of `form_sentence`.

    Attributes:
        base: the base sentence with intra-word hyphens replaced by a placeholder.
        poly: the poly sentence, pre-processed the same way.
        parsed_sent: result of `model(self.base)`.
        parsed_poly: result of `model(self.poly)`.
    """

    # Letter that temporarily stands in for an intra-word hyphen
    # (presumably so the tokenizer keeps hyphenated words in one piece).
    # NOTE(review): form_sentence turns every 'Ы' back into '-', so a genuine
    # capital 'Ы' in the input is altered as well.
    _HYPHEN_PLACEHOLDER = 'Ы'
    # A hyphen with a letter on both sides. 'ё'/'Ё' are listed explicitly
    # because they lie outside the а-я / А-Я code point ranges.
    _INTRAWORD_HYPHEN = re.compile(r'(?<=[a-zA-Zа-яА-ЯёЁ])-(?=[a-zA-Zа-яА-ЯёЁ])')
    # Dependency labels that are handled as objects of their own rather than as modifiers.
    _DETACHED_DEPS = frozenset(('conj', 'parataxis'))
    # pymorphy tags that make a modifier maskable (see `if_coordinated`).
    _AGREEING_TAGS = frozenset(('ADJF', 'PRTF'))
    # Word forms that are never masked as modifiers (the comparison is case-sensitive).
    _NEVER_MASKED = frozenset(('ее', 'её', 'его', 'их'))

    def __init__(self, base, poly):
        """Pre-process and parse both sentences.

        Args:
            base: text of the base sentence.
            poly: text of the poly variant of the same sentence.
        """
        self.base = self._protect_hyphens(base)
        self.poly = self._protect_hyphens(poly)
        self.parsed_sent = model(self.base)
        self.parsed_poly = model(self.poly)

    @classmethod
    def _protect_hyphens(cls, text):
        """Return `text` with every hyphen that sits between two letters replaced by the placeholder."""
        return cls._INTRAWORD_HYPHEN.sub(cls._HYPHEN_PLACEHOLDER, text)

    def find_object_and_children(self):
        """Collect object tokens of the base sentence and their direct modifiers.

        A token counts as an object when its dependency label is 'obj', or when
        it is labelled 'conj'/'parataxis' and has an 'obj' token among its
        ancestors.

        Returns:
            A tuple `(objects_list, objects_children)` of two lists: the object
            tokens in sentence order (each listed once), and the direct children
            of those tokens that are not themselves 'conj'/'parataxis'.
        """
        objects_list = []
        objects_children = []
        for token in self.parsed_sent:
            if token.dep_ == 'obj':
                is_object = True
            else:
                # any() stops at the first 'obj' ancestor, so the token is added only once.
                is_object = token.dep_ in self._DETACHED_DEPS and any(
                    ancestor.dep_ == 'obj' for ancestor in token.ancestors
                )
            if not is_object:
                continue
            objects_list.append(token)
            # 'conj'/'parataxis' children hang below an object and therefore
            # enter `objects_list` on their own turn; they are not modifiers.
            objects_children.extend(
                child for child in token.children
                if child.dep_ not in self._DETACHED_DEPS
            )
        return objects_list, objects_children

    def child_recursion(self, token):
        """Return the set of all descendants of `token`, skipping punctuation subtrees.

        Children whose `pos_` is 'PUNCT' are neither included nor descended
        into. The result is always a set (empty when there are no such
        descendants).
        """
        descendants = set()
        pending = [child for child in token.children if child.pos_ != 'PUNCT']
        # Explicit stack instead of recursion: every token is visited once.
        while pending:
            current = pending.pop()
            if current in descendants:
                continue
            descendants.add(current)
            pending.extend(
                child for child in current.children if child.pos_ != 'PUNCT'
            )
        return descendants

    def if_coordinated(self, token):
        """Tell whether any `morph` analysis of `token.text` carries the tag 'ADJF' or 'PRTF'.

        The tag of every analysis is converted to a string and split on commas
        and spaces; the token qualifies as soon as one piece matches.

        Returns:
            True or False.
        """
        for analysis in morph(token.text):
            grammemes = re.split(r',| ', str(analysis.tag))
            if not self._AGREEING_TAGS.isdisjoint(grammemes):
                return True
        return False

    def construct_sentence(self):
        """Label every token of the base sentence as 'PASS' (keep) or 'STOP' (mask).

        Rules, in order:
          * object tokens are 'STOP';
          * a direct modifier of an object is 'STOP' when `if_coordinated` is
            true for it and its text is not one of the never-masked forms;
            its descendants that satisfy the same test become 'STOP' too;
          * everything else is 'PASS'.

        Returns:
            A tuple `(res_sent, res_sent_poly)`: a dict mapping each base token
            to 'PASS'/'STOP' in sentence order, and a dict mapping each poly
            token to the constant 'poly' (only its keys are used downstream).
        """
        objects_list, object_children = self.find_object_and_children()
        # Sets make the per-token membership tests constant-time.
        object_tokens = set(objects_list)
        modifier_tokens = set(object_children)

        def maskable(token):
            # The cheap string test runs first; the outcome is the same in either order.
            return token.text not in self._NEVER_MASKED and self.if_coordinated(token)

        res_sent = {}
        add_stops = set()
        for token in self.parsed_sent:
            if token in object_tokens:
                res_sent[token] = 'STOP'
            elif token in modifier_tokens and maskable(token):
                res_sent[token] = 'STOP'
                add_stops.update(self.child_recursion(token))
            else:
                res_sent[token] = 'PASS'

        # Second pass: descendants of a masked modifier may precede it in the
        # sentence, so they can only be relabelled once the first pass is done.
        if add_stops:
            for token in self.parsed_sent:
                if token in add_stops and maskable(token):
                    res_sent[token] = 'STOP'

        res_sent_poly = {token: 'poly' for token in self.parsed_poly}
        return res_sent, res_sent_poly

    @staticmethod
    def _emit(words, text, status, previously_masked):
        """Append `text` or a collapsed 'MASK' to `words`; return the new "last item was MASK" flag."""
        if status == 'PASS':
            words.append(text)
            return False
        if status == 'STOP' and not previously_masked:
            words.append('MASK')
            return True
        return previously_masked

    def form_sentence(self):
        """Build the masked base and poly sentences.

        Consecutive 'STOP' tokens collapse into a single 'MASK'. The poly
        sentence is aligned with the base sentence position by position: where
        the token texts differ the poly token is emitted unchanged, otherwise
        the base token's label applies. A differing poly token ends the current
        masked run, so a masked span that follows it gets its own 'MASK'.

        When the poly parse is shorter than the base parse, the remaining
        positions fall back to the base token. NOTE(review): poly tokens beyond
        the length of the base parse are not emitted.

        Returns:
            `{'BASE': str, 'POLY': str}` with the hyphen placeholder turned back into '-'.
        """
        parsed_sent, poly_parsed_sent = self.construct_sentence()
        # Materialise the key views once instead of on every loop iteration.
        poly_tokens = list(poly_parsed_sent)

        res = []
        res_poly = []
        base_masked = False
        poly_masked = False
        for num, (token, status) in enumerate(parsed_sent.items()):
            base_masked = self._emit(res, token.text, status, base_masked)

            poly_token = poly_tokens[num] if num < len(poly_tokens) else None
            if poly_token is not None and poly_token.text != token.text:
                res_poly.append(poly_token.text)
                poly_masked = False
            else:
                poly_masked = self._emit(res_poly, token.text, status, poly_masked)

        placeholder = self._HYPHEN_PLACEHOLDER
        return {
            'BASE': ' '.join(res).replace(placeholder, '-'),
            'POLY': ' '.join(res_poly).replace(placeholder, '-'),
        }

    def show_scheme(self):
        """Render the dependency parse of the base sentence inline in the notebook."""
        displacy.render(self.parsed_sent, style="dep", jupyter=True)

In [None]:
from tqdm import tqdm

# Fill the two `without_object_*` columns row by row.
# Rows flagged `was_changed` are parsed and masked; unflagged rows keep their
# original text. Rows where `was_changed` is neither True nor False (e.g.
# missing) are left with the empty-string defaults.
for i in tqdm(range(len(new_dataset))):
    # `==` rather than `is`: the cell value may be a NumPy boolean.
    was_changed = new_dataset['was_changed'][i]
    if was_changed == True:
        # Parse and mask once; both output columns come from the same result.
        masked = Sentence(new_dataset['base'][i], new_dataset['polypers'][i]).form_sentence()
        new_dataset.loc[i, 'without_object_base'] = masked['BASE']
        new_dataset.loc[i, 'without_object_polypers'] = masked['POLY']
    elif was_changed == False:
        new_dataset.loc[i, 'without_object_base'] = new_dataset['base'][i]
        new_dataset.loc[i, 'without_object_polypers'] = new_dataset['polypers'][i]

In [14]:
result_dataset = pd.read_csv("result_dataset.csv")

# Dump every changed row (original and masked text of both variants) to a
# plain-text file for manual inspection.
# The loop is bounded by the frame that is actually indexed, so the cell does
# not depend on `original_dataset` having the same number of rows.
# newline='\n' keeps line endings untranslated on every platform.
with open('check.txt', 'w', encoding='utf-8', newline='\n') as f:
    for i in range(len(result_dataset)):
        # `==` rather than `is`: the cell value may be a NumPy boolean.
        if result_dataset['was_changed'][i] == True:
            f.writelines([
                f'{i}_Polypers: {result_dataset["polypers"][i]}\n',
                f'{i}_Polypers_mod: {result_dataset["without_object_polypers"][i]}\n',
                f'{i}_Base: {result_dataset["base"][i]}\n',
                f'{i}_Base_mod: {result_dataset["without_object_base"][i]}\n',
                '________________\n',
                '\n',
            ])