# Imports

In [103]:
import pandas as pd
import re
import numpy as np
import glob
import decimal
import tensorflow_hub as hub
import tensorflow_text as text
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Reading docs

In [104]:
def read_file(path: str, encoding: str, is_lower=False):
    list_of_paragraph = []
    buffer = []
    with open(path, encoding=encoding) as file:
        regex = re.compile(
            r'(((?<!Статья )(?<!^)(?<!(\.|\s))(?<!     )[а-яА-Я\d]{2,})([\.]{0,1}[\]\)\"]{0,2}[\.\;\:]{1}(\s|$)))',
            flags=re.IGNORECASE)
        text = file.read()

        if is_lower:
            text = text.lower()
        offset = 0
        buf = ''
        for ind, value in enumerate(regex.split(text, maxsplit=0)):
            if ind == 0 + offset:
                buf = value
            if ind == 1 + offset:
                list_of_paragraph.append((buf + value).strip())
                offset += 6

        second_regex = re.compile(r'([^\dгст][\.\;\:](?=\s|$))', flags=re.IGNORECASE)

        for value1 in list_of_paragraph:
            buf = ''
            for value in second_regex.split(value1, maxsplit=0):
                if value == '':
                    continue

                if len(value) > 2 and buf != '':
                    buffer.append(buf.strip())
                    buf = value
                elif len(value) > 2:
                    buf = value
                elif len(value) <= 2:
                    buffer.append((buf + value).strip())
                    buf = ''
    return buffer

### Getting file paths

In [105]:
list_of_path = list(filter(lambda x: not '1ДИ' in str(x), glob.glob("docs/*/*.txt")))

list_of_instruction_paths = list(filter(lambda x: str(x).startswith('docs\\Список ДИ'), list_of_path))
list_of_external_doc_paths = list(filter(lambda x: str(x).startswith('docs\\Список внешних'), list_of_path))

### Preparation of sentences

In [106]:
bad_words = []
with open('./stop.txt', encoding='windows-1251') as file:
    for text in file.readlines():
        if text.endswith('\n'):
            bad_words.append(text[:-1])
bad_words

['примечания:',
 'общие требования.',
 '* в соответствии с требованиями инструкций;',
 'методические указания.',
 'приложения:',
 'приложение',
 '- руководящими документами',
 'и доп., вступ.',
 'в ред.',
 'начальник',
 'начальника',
 'начальник смены',
 '* заместитель начальника',
 'примечание:',
 'примечание.',
 'примечания:',
 'замечание устранено, подпись.',
 '(подпись).',
 'по мере необходимости:',
 '(в ред.',
 'федеральный закон от № -фз (ред.',
 '- федеральный закон от n -фз.']

In [107]:
def clean(text: str) -> str:
    text = re.sub(r'(\d+)(?:\.)?', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text

In [108]:
def valid(text: str) -> bool:
    def mini_valid(x: str):
        list_of_specific_symbols = ['.', ',', ':', ';']
        for sym in list_of_specific_symbols:
            if sym in x:
                return len([y for y in re.split(r'\s', ' '.join(x.split(sym, maxsplit=0))) if len(y) > 1]) > 2
        return True

    flag: bool = len(list(filter(lambda x: len(str(x)) > 0 and mini_valid(x), re.split(r'\s', text, maxsplit=0)))) > 2
    return len(text) > 3 and flag and text.strip() not in bad_words

In [109]:
clean('maki2.9ng i.t ?asd213ver 2000.123.34 years old. Richa6.1rd McCl8intock')

'making i.t ?asdver years old. Richard McClintock'

In [110]:
clean('- федеральный закон от 22.12.2014 n 443-фз.')
clean('федеральный закон от № -фз (ред.')

'федеральный закон от № -фз (ред.'

In [111]:
clean('- федеральный закон от 22.12.2014 n 443-фз.')

'- федеральный закон от n -фз.'

In [112]:
valid(clean('- федеральный закон от 22.12.2014 n 443-фз.\n     '))
list_of_external_docs = []
list_of_instructions = []

in_lower_case = True

for path_to_doc in list_of_instruction_paths:
    buf_list = read_file(path_to_doc, 'windows-1251', in_lower_case)
    buf_list = [clean(x) for x in buf_list]
    buf_list = [x for x in buf_list if valid(x)]
    list_of_instructions.append(buf_list)

for path_to_doc in list_of_external_doc_paths:
    buf_list = read_file(path_to_doc, 'windows-1251', in_lower_case)
    buf_list = [clean(x) for x in buf_list]
    buf_list = [x for x in buf_list if valid(x)]
    list_of_external_docs.append(buf_list)

## Preparing the embeddings

### Download model

In [113]:
download_first_model = True

In [114]:
embed = None
size = 512
if download_first_model:
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
else:
    size = 768
    embed = SentenceTransformer('nq-distilbert-base-v1', device='cuda')

### Getting embeddings from instructions

In [115]:
%%time
BATCH_SIZE = 64
list_of_instruction_embeddings = []

for doc in list_of_instructions:
    buffer = np.empty((0, size), float)
    for i in range(0, len(doc), BATCH_SIZE):
        batch = doc[i:i + BATCH_SIZE]
        if download_first_model:
            buffer = np.concatenate((buffer, embed(batch)), axis=0)
        else:
            buffer = np.concatenate((buffer, embed.encode(batch)), axis=0)

    list_of_instruction_embeddings.append(buffer)

CPU times: total: 15.7 s
Wall time: 12.2 s


### Getting embeddings from external docs

In [116]:
%%time
BATCH_SIZE = 64
list_of_embeddings_of_external_docs = []

for doc in list_of_external_docs:
    buffer = np.empty((0, size), float)
    for i in range(0, len(doc), BATCH_SIZE):
        batch = doc[i:i + BATCH_SIZE]
        if download_first_model:
            buffer = np.concatenate((buffer, embed(batch)), axis=0)
        else:
            buffer = np.concatenate((buffer, embed.encode(batch)), axis=0)

    list_of_embeddings_of_external_docs.append(buffer)

CPU times: total: 1min 3s
Wall time: 52.2 s


### Computing similarity matrix

In [117]:
list_for_df = []

for index, doc1 in enumerate(list_of_instruction_embeddings):
    buffer = {
        'instruction_filename': list_of_instruction_paths[index]
    }
    for ind, doc2 in enumerate(list_of_embeddings_of_external_docs):
        buffer['external_doc_filename'] = list_of_external_doc_paths[ind]
        if download_first_model:
            similarity_matrix = np.inner(doc1, doc2)
        else:
            similarity_matrix = cosine_similarity(doc1, doc2)

        for i, row in enumerate(similarity_matrix):
            buffer['sentence_number_from_instruction'] = i
            buffer['sentence_number_from_external_doc'] = int(np.argmax(row))
            buffer['value'] = np.max(row)
            buffer['text_of_instruction'] = list_of_instructions[index][i]
            buffer['text_of_external_doc'] = list_of_external_docs[ind][int(np.argmax(row))]

            list_for_df.append(buffer.copy())

## Creating dataframe

In [126]:
new_df = pd.DataFrame(
    columns=[
        'sentence_number_from_instruction',
        'sentence_number_from_external_doc',
        'value',
        'text_of_instruction',
        'instruction_filename',
        'text_of_external_doc',
        'external_doc_filename'
    ])

In [127]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


In [128]:
%%time
for index, row in enumerate(list_for_df):
    # new_df.loc[index] = [
    #     row['sentence_number_from_instruction'],
    #     row['sentence_number_from_external_doc'],
    #     row['value'],
    #     row['text_of_instruction'],
    #     row['instruction_filename'],
    #     row['text_of_external_doc'],
    #     row['external_doc_filename'],
    # ]
    new_df = new_df.append({
        'sentence_number_from_instruction': row['sentence_number_from_instruction'],
        'sentence_number_from_external_doc': row['sentence_number_from_external_doc'],
        'value': row['value'],
        'text_of_instruction': row['text_of_instruction'],
        'instruction_filename': row['instruction_filename'],
        'text_of_external_doc': row['text_of_external_doc'],
        'external_doc_filename': row['external_doc_filename'],
    }, ignore_index=True)

CPU times: total: 6min 24s
Wall time: 6min 24s


In [122]:
with pd.ExcelWriter("embeddings.xlsx", engine="xlsxwriter") as writer:
    new_df.to_excel(writer, 'good', engine='xlsxwriter')
    sheets_good = writer.sheets['good']
    sheets_good.autofilter(0, 0, new_df.shape[0], new_df.shape[1])

    print("Файл создан")

Файл создан
