In [1]:
import codecs
import spacy
from spacy import displacy
import pandas as pd
from pymorphy3 import MorphAnalyzer
from tqdm import tqdm
import itertools
import unittest
import re
import numpy as np

In [2]:
# Load the evaluation data and add two (initially empty) string columns that
# will receive the object-masked variants of the `base` / `polypers` sentences.
original_dataset = pd.read_csv("test_dataset.csv")
new_dataset = original_dataset.assign(
    without_object_base='',
    without_object_polypers='',
)

# Split on the `was_changed` flag; both halves get a fresh 0..n-1 index so
# that rows can later be addressed by position.
changed_subset = new_dataset.loc[new_dataset["was_changed"]].reset_index(drop=True)
unchanged_subset = new_dataset.loc[~new_dataset["was_changed"]].reset_index(drop=True)

In [3]:
# Must run before the model is loaded so that the pipeline is allocated on the GPU.
spacy.require_gpu()

# Pipeline components that are switched off when loading the model.
# NOTE(review): Sentence.child_recursion reads `token.pos_`; confirm that the
# attribute is still populated with 'morphologizer' disabled.
disabled_comps = [
    'ner',
    'senter',
    'attribute_ruler',
    'lemmatizer',
    'morphologizer',
]
model = spacy.load("ru_core_news_lg", disable=disabled_comps)

# `morph(text)` returns the list of pymorphy3 analyses for a word form.
morph = MorphAnalyzer().parse

# Word forms that Sentence.construct_sentence never masks as object modifiers.
list_to_avoid = [
    'ее',
    'её',
    'его',
    'их',
]

class Sentence:
    """Masks direct objects in a parsed base sentence and its polypersonal twin.

    Both constructor arguments are iterated token by token. Every token is
    expected to expose ``text``, ``dep_``, ``pos_``, ``children`` and
    ``ancestors`` -- the attributes read by the methods below (in this notebook
    the arguments are the documents produced by ``model.pipe``).
    """

    def __init__(self, base, poly):
        """Store the parsed base sentence (``base``) and the parsed polypersonal sentence (``poly``)."""
        self.parsed_sent = base
        self.parsed_poly = poly

    def find_object_and_children(self):
        """Collect the object tokens of the base sentence and their direct children.

        A token counts as an object token when its dependency label is ``obj``,
        or when it is a ``conj`` / ``parataxis`` token with an ``obj`` ancestor.

        Returns:
            tuple: ``(objects_list, objects_children)``. ``objects_list`` holds
            each object token once, in sentence order. ``objects_children``
            holds the direct children of those tokens, excluding ``conj`` and
            ``parataxis`` children (these are reported in ``objects_list``
            themselves).
        """
        skipped_deps = ('conj', 'parataxis')
        objects_list = list()
        objects_children = list()
        for token in self.parsed_sent:
            if token.dep_ == 'obj':
                belongs_to_object = True
            elif token.dep_ in skipped_deps:
                # any() registers the token once even when several ancestors are objects.
                belongs_to_object = any(ancestor.dep_ == 'obj' for ancestor in token.ancestors)
            else:
                belongs_to_object = False

            if belongs_to_object:
                objects_list.append(token)
                # `not in` expresses the intended "neither conj nor parataxis";
                # `!= 'conj' or != 'parataxis'` is true for every label.
                objects_children += [child for child in token.children if child.dep_ not in skipped_deps]

        return objects_list, objects_children

    def child_recursion(self, token):
        """Return the set of all descendants of ``token`` reachable through non-punctuation children.

        Children whose ``pos_`` equals ``'PUNCT'`` are skipped together with
        everything below them. A token without such children yields an empty set.

        NOTE(review): ``pos_`` has to be filled in by the pipeline; the loader
        cell disables 'morphologizer', so confirm this filter is effective.
        """
        descendants = set()
        # Explicit stack instead of extending a list while iterating over it.
        pending = [child for child in token.children if child.pos_ != 'PUNCT']
        while pending:
            current = pending.pop()
            if current in descendants:
                continue
            descendants.add(current)
            pending.extend(child for child in current.children if child.pos_ != 'PUNCT')
        return descendants

    def if_coordinated(self, token):
        """Return True when any pymorphy analysis of ``token.text`` carries the tag ADJF or PRTF.

        Uses the module-level ``morph`` callable; each analysis tag is split on
        commas and spaces before the lookup.
        """
        for analysis in morph(token.text):
            tags = re.split(r',| ', str(analysis.tag))
            if 'ADJF' in tags or 'PRTF' in tags:
                return True
        return False

    def _is_maskable_modifier(self, token):
        """True when ``token`` is not listed in ``list_to_avoid`` and ``if_coordinated`` accepts it."""
        return token.text not in list_to_avoid and self.if_coordinated(token)

    def construct_sentence(self):
        """Label every base token as ``'PASS'`` (keep) or ``'STOP'`` (mask).

        * object tokens are always ``'STOP'``;
        * a direct child of an object token is ``'STOP'`` when it passes
          ``if_coordinated`` and its text is not in ``list_to_avoid``; the
          descendants of such a child are re-checked with the same rule;
        * everything else is ``'PASS'``.

        Returns:
            tuple: ``(res_sent, res_sent_poly)`` -- two dicts keyed by token in
            sentence order. ``res_sent`` maps base tokens to their label,
            ``res_sent_poly`` maps every polypersonal token to ``'poly'``.
        """
        objects_list, object_children = self.find_object_and_children()
        # Sets give constant-time membership tests in the loops below.
        object_tokens = set(objects_list)
        child_tokens = set(object_children)

        res_sent = dict()
        add_stops = set()
        for token in self.parsed_sent:
            if token in object_tokens:
                res_sent[token] = 'STOP'
            elif token in child_tokens and self._is_maskable_modifier(token):
                res_sent[token] = 'STOP'
                add_stops.update(self.child_recursion(token))
            else:
                res_sent[token] = 'PASS'

        # Second pass: descendants of masked modifiers are masked under the same rule.
        for token in self.parsed_sent:
            if token in add_stops and self._is_maskable_modifier(token):
                res_sent[token] = 'STOP'

        res_sent_poly = dict()
        for token in self.parsed_poly:
            res_sent_poly[token] = 'poly'

        return res_sent, res_sent_poly

    def form_sentence(self):
        """Build the masked base and polypersonal sentences.

        Consecutive ``'STOP'`` tokens collapse into a single ``'MASK'``. The
        polypersonal sentence is produced position by position: where its token
        text differs from the base token the polypersonal text is emitted,
        otherwise the base label decides between the word and ``'MASK'``.
        Positions are paired with ``zip``, so when the two sentences have a
        different number of tokens only the common prefix is processed.

        Every ``'Ы'`` in the joined result is turned into ``'-'`` (the
        processing cell substitutes intra-word hyphens with ``'Ы'`` before
        parsing).
        NOTE(review): a genuine uppercase 'Ы' in the text is converted as well.

        Returns:
            dict: ``{'BASE': <masked base sentence>, 'POLY': <masked polypersonal sentence>}``.
        """
        parsed_sent, poly_parsed_sent = self.construct_sentence()

        res = list()
        prev_mask = False
        for word, label in parsed_sent.items():
            if label == 'PASS':
                res.append(word.text)
                prev_mask = False
            elif label == 'STOP' and not prev_mask:
                res.append('MASK')
                prev_mask = True

        res_poly = list()
        prev_mask = False
        for (word, label), poly_word in zip(parsed_sent.items(), poly_parsed_sent):
            if word.text != poly_word.text:
                res_poly.append(poly_word.text)
                # A real word was emitted, so a following masked run needs its own MASK.
                prev_mask = False
            elif label == 'PASS':
                res_poly.append(word.text)
                prev_mask = False
            elif label == 'STOP' and not prev_mask:
                res_poly.append('MASK')
                prev_mask = True

        return {'BASE': ' '.join(res).replace('Ы', '-'), 'POLY': ' '.join(res_poly).replace('Ы', '-')}

    def show_scheme(self):
        """Render the dependency tree of the base sentence inline with displacy."""
        displacy.render(self.parsed_sent, style="dep", jupyter=True)

In [4]:
# Hyphens between two letters are temporarily replaced with 'Ы' before parsing;
# Sentence.form_sentence turns every 'Ы' back into '-' in its output.
# 'ё'/'Ё' are listed explicitly because they lie outside the а-я / А-Я ranges,
# so words such as "всё-таки" would otherwise keep their hyphen.
# NOTE(review): a genuine uppercase 'Ы' in the data is also restored as '-'.
hyphen_inside_word = r'(?<=[a-zA-Zа-яА-ЯёЁ])-(?=[a-zA-Zа-яА-ЯёЁ])'
sentences_base = changed_subset['base'].str.replace(hyphen_inside_word, 'Ы', regex=True)
sentences_poly = changed_subset['polypers'].str.replace(hyphen_inside_word, 'Ы', regex=True)

print("Processing base sentences")
parsed_sentences_base = list(tqdm(model.pipe(sentences_base), total=len(changed_subset)))

print("Processing polypersonal sentences")
parsed_sentences_poly = list(tqdm(model.pipe(sentences_poly), total=len(changed_subset)))

print("Deleting objects")
masked_base = []
masked_poly = []
for parsed_base, parsed_poly in tqdm(zip(parsed_sentences_base, parsed_sentences_poly), total=len(changed_subset)):
    # form_sentence() returns both variants, so it is evaluated once per row.
    formed = Sentence(parsed_base, parsed_poly).form_sentence()
    masked_base.append(formed['BASE'])
    masked_poly.append(formed['POLY'])

# One column assignment each instead of a `.loc` write per cell.
changed_subset['without_object_base'] = masked_base
changed_subset['without_object_polypers'] = masked_poly

# Rows flagged as unchanged keep their original sentences.
unchanged_subset['without_object_base'] = unchanged_subset['base']
unchanged_subset['without_object_polypers'] = unchanged_subset['polypers']

# Restore the ordering given by the 'Unnamed: 0' column of the input file.
result_dataset = pd.concat([changed_subset, unchanged_subset]).sort_values(by='Unnamed: 0')
result_dataset.to_csv("result_dataset.csv", index=False)

Processing base sentences


100%|██████████| 59801/59801 [00:21<00:00, 2719.77it/s]


Processing polypersonal sentences


100%|██████████| 59801/59801 [00:18<00:00, 3302.39it/s]


Deleting objects


100%|██████████| 59801/59801 [02:09<00:00, 462.14it/s]


In [5]:
# Reload the saved result and keep only the rows flagged as changed.
result_dataset = pd.read_csv("result_dataset.csv")
result_dataset = result_dataset.loc[result_dataset["was_changed"]].reset_index(drop=True)

# Dump every changed row as a small text record (original and masked
# sentences side by side) for manual inspection.
with codecs.open('check.txt', 'w', 'utf-8') as f:
    for i, row in result_dataset.iterrows():
        if not row['was_changed'] == True:
            continue
        record = [
            f'{i}_Polypers: {row["polypers"]}\n',
            f'{i}_Polypers_mod: {row["without_object_polypers"]}\n',
            f'{i}_Base: {row["base"]}\n',
            f'{i}_Base_mod: {row["without_object_base"]}\n',
            '________________\n',
            '\n',
        ]
        f.write(''.join(record))

In [6]:
# Load the previous and the current result files with the same ordering and a
# fresh index, then show the cells in which they differ.
df_old, df_new = (
    pd.read_csv(csv_path).sort_values(by='Unnamed: 0').reset_index(drop=True)
    for csv_path in ("result_dataset_old.csv", "result_dataset.csv")
)
df_old.compare(df_new)