# Imports

In [1]:
import pandas as pd
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
import glob
import decimal
import torch
import tensorflow_hub as hub
import tensorflow_text as text
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# Reading docs

In [2]:
def read_file(path: str, encoding: str, is_lower=False) -> list:
    """Read a text document and split it into sentence-like fragments.

    The text is split in two passes:

    1. A "paragraph" pass cuts the text after a run of two or more
       Cyrillic letters/digits followed by ``.``, ``;`` or ``:`` and then
       whitespace or the end of the text. Text that follows the last such
       terminator is not emitted by this pass.
    2. A "sentence" pass cuts every paragraph after ``.``, ``;`` or ``:``
       unless the character before it is a digit or one of ``г``, ``с``,
       ``т`` (case-insensitive).

    Args:
        path: Path of the file to read.
        encoding: Encoding used to decode the file.
        is_lower: When true, the text is lower-cased before splitting.

    Returns:
        List of stripped text fragments in document order.
    """
    with open(path, encoding=encoding) as file:
        text = file.read()

    if is_lower:
        text = text.lower()

    regex = re.compile(
        r'(((?<!Статья )(?<!^)(?<!(\.|\s))(?<!     )[а-яА-Я\d]{2,})([\.]{0,1}[\]\)\"]{0,2}[\.\;\:]{1}(\s|$)))',
        flags=re.IGNORECASE)

    # re.split() interleaves the text between matches with every capture
    # group, so consecutive "text" items are `groups + 1` positions apart and
    # the item right after each text is group 1 (the whole matched terminator).
    parts = regex.split(text)
    stride = regex.groups + 1
    list_of_paragraph = [
        (parts[start] + parts[start + 1]).strip()
        for start in range(0, len(parts) - 1, stride)
    ]

    second_regex = re.compile(r'([^\dгст][\.\;\:](?=\s|$))', flags=re.IGNORECASE)

    buffer = []
    for paragraph in list_of_paragraph:
        buf = ''
        for value in second_regex.split(paragraph):
            if value == '':
                continue

            if len(value) > 2:
                # A new chunk of text starts: emit whatever was pending.
                if buf != '':
                    buffer.append(buf.strip())
                buf = value
            else:
                # Short items are the captured terminators; glue them back
                # onto the pending text and emit the finished sentence.
                buffer.append((buf + value).strip())
                buf = ''

        # Flush the remainder. A paragraph whose last character before the
        # terminator is a digit or г/с/т never produces a final separator
        # match, so without this step the whole trailing sentence was lost.
        if buf.strip():
            buffer.append(buf.strip())
    return buffer

### Getting file paths

In [3]:
import os


def paths_in_folder(paths, folder_prefix):
    """Return the paths that live in a ``docs`` sub-folder starting with *folder_prefix*.

    The comparison prefix is built with ``os.path.join`` and each path is
    normalised first, so the check works with both ``\\`` and ``/``
    separators. The previous hard-coded ``'docs\\...'`` prefix only matched
    Windows-style paths and produced empty lists elsewhere.

    Args:
        paths: Iterable of file paths relative to the working directory.
        folder_prefix: Beginning of the sub-folder name inside ``docs``.

    Returns:
        List of the matching paths, unchanged and in their original order.
    """
    prefix = os.path.join('docs', folder_prefix)
    return [p for p in paths if os.path.normpath(str(p)).startswith(prefix)]


# All text documents one level below docs/, except those whose path contains '1ДИ'.
list_of_path = [p for p in glob.glob("docs/*/*.txt") if '1ДИ' not in str(p)]

list_of_instruction_paths = paths_in_folder(list_of_path, 'Список ДИ')
list_of_external_doc_paths = paths_in_folder(list_of_path, 'Список внешних')

### Preparation of sentences

### Cleaning

In [4]:
def remove_all_specific_symbols(text: str):
    """Replace ``. , ; :`` with spaces, squeeze whitespace runs and trim a leading ``)`` / ``*``.

    The leading-character checks run once each and in a fixed order:
    first ``)``, then ``*``. At most one character is removed per check.
    """
    spaced = re.sub(r'[\.\,\;\:]', ' ', text)
    squeezed = re.sub(r'\s{2,}', ' ', spaced)

    if squeezed.startswith(')'):
        squeezed = squeezed[1:]
    if squeezed.startswith('*'):
        squeezed = squeezed[1:]
    return squeezed

In [5]:
def clean(text: str) -> str:
    """Drop every run of digits (plus one directly following dot) and squeeze whitespace runs."""
    without_numbers = re.sub(r'(\d+)(?:\.)?', '', text)
    return re.sub(r'\s{2,}', ' ', without_numbers)

In [6]:
def valid(text: str) -> bool:
    """Tell whether *text* has enough countable tokens to be kept as a sentence.

    The text is split on single whitespace characters. A token counts when it
    is non-empty and either contains none of ``. , : ;`` or, when split on the
    first of those symbols found in it, yields more than two pieces longer
    than one character.

    Args:
        text: Candidate sentence.

    Returns:
        True when the text is longer than three characters and has more than
        two countable tokens.
    """
    def mini_valid(x: str) -> bool:
        for sym in ('.', ',', ':', ';'):
            if sym in x:
                # str.split(sym, maxsplit=0) performs no split at all (unlike
                # re.split, where 0 means "unlimited"), which made this check
                # always False. Split on every occurrence instead.
                pieces = re.split(r'\s', ' '.join(x.split(sym)))
                return sum(1 for piece in pieces if len(piece) > 1) > 2
        return True

    countable = sum(
        1 for token in re.split(r'\s', text) if token and mini_valid(token)
    )
    return len(text) > 3 and countable > 2

In [7]:
%%time

in_lower_case = True


def load_sentences(path_to_doc):
    """Read one document and return its cleaned sentences.

    Each fragment from read_file() goes through clean(); fragments rejected
    by valid() are dropped and the rest are passed through
    remove_all_specific_symbols().
    """
    cleaned = (clean(x) for x in read_file(path_to_doc, 'windows-1251', in_lower_case))
    return [remove_all_specific_symbols(x) for x in cleaned if valid(x)]


# One list of sentences per document, in the same order as the path lists.
list_of_instructions = [load_sentences(p) for p in list_of_instruction_paths]
list_of_external_docs = [load_sentences(p) for p in list_of_external_doc_paths]

CPU times: total: 6.83 s
Wall time: 7.36 s


## Preparing the embeddings

### Download model

In [8]:
# Encoder switch read by the following cells: True loads the TF-Hub
# universal-sentence-encoder model, False loads the SentenceTransformer model.
download_first_model = True

In [10]:
# Load the sentence encoder selected by `download_first_model` and record in
# `size` the second dimension used later for the (0, size) embedding buffers.
embed = None
if download_first_model:
    size = 512
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
else:
    size = 768
    # Use the GPU when one is available, otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embed = SentenceTransformer('symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli', device=device)

In [11]:
# bad_words  - embedding of every stop phrase from stop.txt
# bad_words2 - the cleaned stop phrases themselves, index-aligned with bad_words
bad_words = []
bad_words2 = []

with open('./stop.txt', encoding='windows-1251') as file:
    # `line` (not `text`) so the `tensorflow_text as text` import is not rebound.
    for line in file:
        # Strip only the line terminator; this keeps the last phrase even when
        # the file does not end with a newline (it used to be skipped).
        phrase = line.rstrip('\n')
        if not phrase:
            # A blank line carries no stop phrase, so it is not embedded.
            continue

        phrase = remove_all_specific_symbols(phrase)
        if download_first_model:
            bad_words.append(embed(phrase))
        else:
            bad_words.append(normalize(embed.encode([phrase])))

        bad_words2.append(phrase)

### Getting embeddings from instructions

In [12]:
%%time
BATCH_SIZE = 64
list_of_instruction_embeddings = []

# Embed every instruction document batch by batch; the result is one
# (number_of_sentences, size) matrix per document.
for doc in list_of_instructions:
    # Seed with an empty (0, size) float32 array so that a document without
    # sentences still yields a correctly shaped matrix.
    chunks = [np.empty((0, size), dtype="float32")]
    for start in range(0, len(doc), BATCH_SIZE):
        batch = doc[start:start + BATCH_SIZE]
        if download_first_model:
            chunks.append(np.asarray(embed(batch)))
        else:
            # Rows are normalised with sklearn's normalize() in this branch only.
            chunks.append(normalize(embed.encode(batch)))

    # Concatenate once per document instead of re-copying the growing buffer
    # after every batch.
    list_of_instruction_embeddings.append(np.concatenate(chunks, axis=0))

CPU times: total: 13 s
Wall time: 10.2 s


### Getting embeddings from external docs

In [13]:
%%time
BATCH_SIZE = 64
list_of_embeddings_of_external_docs = []

# Embed every external document batch by batch; the result is one
# (number_of_sentences, size) matrix per document.
for doc in list_of_external_docs:
    # Seed with an empty (0, size) float32 array so that a document without
    # sentences still yields a correctly shaped matrix.
    chunks = [np.empty((0, size), dtype="float32")]
    for start in range(0, len(doc), BATCH_SIZE):
        batch = doc[start:start + BATCH_SIZE]
        if download_first_model:
            chunks.append(np.asarray(embed(batch)))
        else:
            # Rows are normalised with sklearn's normalize() in this branch only.
            chunks.append(normalize(embed.encode(batch)))

    # Concatenate once per document instead of re-copying the growing buffer
    # after every batch.
    list_of_embeddings_of_external_docs.append(np.concatenate(chunks, axis=0))

CPU times: total: 47.6 s
Wall time: 35.6 s


### Computing similarity matrix

In [14]:
import warnings

# Suppress DeprecationWarning and FutureWarning for the rest of the session.
# NOTE(review): presumably added to hide the pandas DataFrame.append
# deprecation notice raised by the next cell - confirm.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
%%time
# Inner-product score at or above which a sentence is treated as a stop phrase.
STOP_WORD_THRESHOLD = 0.95

EXCLUDE_COLUMNS = [
    'sentence_number_from_instruction',
    'sentence_number_from_external_doc',
    'value',
    'stop_word',
    'text_of_instruction',
    'text_of_external_doc',
    'instruction_filename',
    'external_doc_filename'
]

# list_for_df  - best external match for every instruction sentence that survived the stop-phrase filter
# exclude_rows - one record per (instruction sentence, external doc) pair rejected by the filter
list_for_df = []
exclude_rows = []

for index, doc1 in enumerate(list_of_instruction_embeddings):
    instruction_path = list_of_instruction_paths[index]
    instruction_sentences = list_of_instructions[index]

    for ind, doc2 in enumerate(list_of_embeddings_of_external_docs):
        if len(doc2) == 0:
            # Nothing to match against; argmax over an empty row would raise.
            continue

        external_path = list_of_external_doc_paths[ind]
        external_sentences = list_of_external_docs[ind]
        similarity_matrix = np.inner(doc1, doc2)

        for i, row in enumerate(similarity_matrix):
            # Index of the most similar external sentence for instruction sentence i.
            best = int(np.argmax(row))

            # Stop phrases are tried in file order; for each phrase the
            # instruction sentence is checked before the matched external one,
            # and the first hit decides which record is written.
            for word_index, word in enumerate(bad_words):
                score = np.max(np.inner(word, doc1[i]))
                if score >= STOP_WORD_THRESHOLD:
                    exclude_rows.append({
                        'sentence_number_from_instruction': i,
                        'sentence_number_from_external_doc': None,
                        'value': score,
                        'stop_word': bad_words2[word_index],
                        'text_of_instruction': instruction_sentences[i],
                        'text_of_external_doc': None,
                        'instruction_filename': instruction_path,
                        'external_doc_filename': None
                    })
                    break

                score = np.max(np.inner(word, doc2[best]))
                if score >= STOP_WORD_THRESHOLD:
                    exclude_rows.append({
                        'sentence_number_from_instruction': None,
                        'sentence_number_from_external_doc': best,
                        'value': score,
                        'stop_word': bad_words2[word_index],
                        'text_of_instruction': None,
                        'text_of_external_doc': external_sentences[best],
                        'instruction_filename': None,
                        'external_doc_filename': external_path
                    })
                    break
            else:
                # No stop phrase matched: keep the pair. Key order is the same
                # as before in case a consumer relies on it.
                list_for_df.append({
                    'instruction_filename': instruction_path,
                    'external_doc_filename': external_path,
                    'sentence_number_from_instruction': i,
                    'sentence_number_from_external_doc': best,
                    'value': row[best],
                    'text_of_instruction': instruction_sentences[i],
                    'text_of_external_doc': external_sentences[best],
                })

# Build the frame once from the collected records: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
exclude_df = pd.DataFrame(exclude_rows, columns=EXCLUDE_COLUMNS)

## Creating dataframe

In [None]:
# Notebook display: show the index of the excluded-sentences frame.
exclude_df.index

In [None]:
# Empty frame that fixes the column layout of the matched-sentences table.
result_columns = [
    'sentence_number_from_instruction',
    'sentence_number_from_external_doc',
    'value',
    'text_of_instruction',
    'instruction_filename',
    'text_of_external_doc',
    'external_doc_filename',
]
new_df = pd.DataFrame(columns=result_columns)

In [None]:
%%time
import multiprocessing
import importlib
import thread

# Reload the local `thread` module before use.
# NOTE(review): presumably so edits to thread.py are picked up without
# restarting the kernel - confirm.
importlib.reload(thread)

# Number of match records handed to one worker call.
CHUNK_SIZE = 256
chunks = [list_for_df[start:start + CHUNK_SIZE] for start in range(0, len(list_for_df), CHUNK_SIZE)]

if chunks:
    # The context manager releases the worker processes even when
    # thread.add_rows raises; map() has returned every result before exit.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.map(thread.add_rows, chunks)
    new_df = pd.concat(results)
else:
    # pd.concat([]) raises ValueError, so build the empty table explicitly.
    results = []
    new_df = pd.DataFrame(columns=[
        'sentence_number_from_instruction',
        'sentence_number_from_external_doc',
        'value',
        'text_of_instruction',
        'instruction_filename',
        'text_of_external_doc',
        'external_doc_filename'
    ])

In [None]:
# Notebook display: show the index of the matched-sentences frame.
new_df.index

In [None]:
# %%time
# for index, row in enumerate(list_for_df):
#     new_df = new_df.append({
#         'sentence_number_from_instruction': row['sentence_number_from_instruction'],
#         'sentence_number_from_external_doc': row['sentence_number_from_external_doc'],
#         'value': row['value'],
#         'text_of_instruction': row['text_of_instruction'],
#         'instruction_filename': row['instruction_filename'],
#         'text_of_external_doc': row['text_of_external_doc'],
#         'external_doc_filename': row['external_doc_filename'],
#     }, ignore_index=True)

In [None]:
# Export the matched-sentences table to embeddings.xlsx with an autofilter.
with pd.ExcelWriter("embeddings.xlsx", engine="xlsxwriter") as writer:
    # sheet_name is passed by keyword: positional use is deprecated in recent
    # pandas. The engine is already fixed by the ExcelWriter.
    new_df.to_excel(writer, sheet_name='good')
    sheets_good = writer.sheets['good']
    # to_excel writes a header row and the index column by default, so the
    # last zero-based row/column equal the frame's row/column counts.
    sheets_good.autofilter(0, 0, new_df.shape[0], new_df.shape[1])

# Report success only after the context manager has saved the workbook.
print("Файл создан")

In [None]:
# Export the excluded-sentences table to stop_words.xlsx with an autofilter.
with pd.ExcelWriter("stop_words.xlsx", engine="xlsxwriter") as writer2:
    # sheet_name is passed by keyword: positional use is deprecated in recent
    # pandas. The engine is already fixed by the ExcelWriter.
    exclude_df.to_excel(writer2, sheet_name='good')
    sheets_good = writer2.sheets['good']
    # to_excel writes a header row and the index column by default, so the
    # last zero-based row/column equal the frame's row/column counts.
    sheets_good.autofilter(0, 0, exclude_df.shape[0], exclude_df.shape[1])

# Report success only after the context manager has saved the workbook.
print("Файл создан")