# Imports

In [85]:
import pandas as pd
import re
import numpy as np
import glob
import decimal
import tensorflow_hub as hub
import tensorflow_text as text
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Reading docs

In [86]:
def read_file(path, encoding, is_lower=False):
    """Read a text file and split it into sentence-like fragments.

    The text is cut in two passes:

    1. A "paragraph" pass splits after a run of at least two Cyrillic
       letters/digits followed by ``.``, ``;`` or ``:`` and whitespace or
       end of text.
    2. A "sentence" pass splits every paragraph again after ``.``, ``;`` or
       ``:`` unless the preceding character is a digit or one of the
       letters ``г``, ``с``, ``т``.

    Text left over after the last terminator of either pass is kept as a
    final fragment instead of being discarded.

    Args:
        path: Path of the file to read.
        encoding: Text encoding passed to ``open``.
        is_lower: When true, the text is lower-cased before splitting.

    Returns:
        list[str]: Whitespace-stripped fragments in document order.
    """
    paragraph_regex = re.compile(
        r'(((?<!Статья )(?<!^)(?<!(\.|\s))(?<!     )[а-яА-Я\d]{2,})([\.]{0,1}[\]\)\"]{0,2}[\.\;\:]{1}(\s|$)))',
        flags=re.IGNORECASE)
    # The excluded characters presumably protect abbreviations such as
    # "г." / "ст." and numbered items from being split -- TODO confirm.
    sentence_regex = re.compile(r'([^\dгст][\.\;\:](?=\s|$))', flags=re.IGNORECASE)

    with open(path, encoding=encoding) as file:
        content = file.read()
    if is_lower:
        content = content.lower()

    # re.split interleaves the text between matches with every capturing
    # group, so one "record" is: leading text, whole match (group 1), then
    # the remaining groups. Only the first two entries are needed.
    parts = paragraph_regex.split(content)
    stride = paragraph_regex.groups + 1
    paragraphs = [
        (parts[start] + parts[start + 1]).strip()
        for start in range(0, len(parts) - 1, stride)
    ]
    # Text after the last paragraph terminator would otherwise be lost.
    tail = parts[-1].strip()
    if tail:
        paragraphs.append(tail)

    sentences = []
    for paragraph in paragraphs:
        pending = ''
        for piece in sentence_regex.split(paragraph):
            if piece == '':
                continue
            if len(piece) > 2:
                # A long piece is sentence text; emit whatever was pending.
                if pending != '':
                    sentences.append(pending.strip())
                pending = piece
            else:
                # A piece of at most two characters is treated as the
                # terminator that closes the pending text.
                sentences.append((pending + piece).strip())
                pending = ''
        # Flush text that was never closed by a matching terminator
        # (e.g. a sentence ending in "т." or in a digit followed by ".").
        leftover = pending.strip()
        if leftover:
            sentences.append(leftover)
    return sentences

### Getting file paths

In [87]:
def has_path_prefix(path, prefix):
    """Return True when ``path`` starts with ``prefix``.

    Backslashes and forward slashes are treated as the same separator so the
    check gives the same answer for Windows-style and POSIX-style paths.
    """
    return str(path).replace('\\', '/').startswith(prefix.replace('\\', '/'))


# All .txt files one directory below docs/, except those whose path contains
# '1ДИ'. glob returns entries in arbitrary order, so sort for reproducible runs.
list_of_path = sorted(p for p in glob.glob("docs/*/*.txt") if '1ДИ' not in str(p))

# Split the files by the folder they live in.
list_of_instruction_paths = [p for p in list_of_path if has_path_prefix(p, 'docs/Список ДИ')]
list_of_external_doc_paths = [p for p in list_of_path if has_path_prefix(p, 'docs/Список внешних')]

### Preparation of sentences

In [88]:
# Lower-case every document before it is split into sentences.
in_lower_case = True

# One list of sentences per instruction file; fragments of three characters
# or fewer are discarded.
list_of_instructions = [
    [sentence
     for sentence in read_file(doc_path, 'windows-1251', in_lower_case)
     if len(sentence) > 3]
    for doc_path in list_of_instruction_paths
]

# Same preparation for the external documents.
list_of_external_docs = [
    [sentence
     for sentence in read_file(doc_path, 'windows-1251', in_lower_case)
     if len(sentence) > 3]
    for doc_path in list_of_external_doc_paths
]

## Preparing the embeddings

### Download model

In [89]:
# Backend switch read by the following cells: True loads the TF-Hub encoder
# through hub.load, False loads the SentenceTransformer model.
download_first_model = False

In [90]:
# `embed` is the sentence encoder and `size` is the embedding width used to
# allocate the (0, size) buffers in the embedding cells.
embed = None
size = 512
if download_first_model:
    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
else:
    size = 768
    # device=None lets sentence-transformers choose the device itself (GPU
    # when one is usable, otherwise CPU) instead of failing on machines
    # without CUDA.
    # NOTE(review): the documents are read as windows-1251 Cyrillic text;
    # confirm this checkpoint is suitable for that language.
    embed = SentenceTransformer('nq-distilbert-base-v1', device=None)

### Getting embeddings from instructions

In [91]:
%%time
# Number of sentences sent to the encoder per call.
BATCH_SIZE = 64
list_of_instruction_embeddings = []

for doc in list_of_instructions:
    # Seed with an empty (0, size) float block: a document without sentences
    # still yields a well-shaped matrix, and the concatenated result keeps
    # the float dtype of this seed.
    chunks = [np.empty((0, size), float)]
    for start in range(0, len(doc), BATCH_SIZE):
        batch = doc[start:start + BATCH_SIZE]
        if download_first_model:
            chunks.append(embed(batch))
        else:
            chunks.append(embed.encode(batch))

    # Concatenate once per document instead of re-copying the growing buffer
    # after every batch.
    list_of_instruction_embeddings.append(np.concatenate(chunks, axis=0))

CPU times: total: 52.4 s
Wall time: 48.5 s


### Getting embeddings from external docs

In [92]:
%%time
# Number of sentences sent to the encoder per call.
BATCH_SIZE = 64
list_of_embeddings_of_external_docs = []

for doc in list_of_external_docs:
    # Seed with an empty (0, size) float block: a document without sentences
    # still yields a well-shaped matrix, and the concatenated result keeps
    # the float dtype of this seed.
    chunks = [np.empty((0, size), float)]
    for start in range(0, len(doc), BATCH_SIZE):
        batch = doc[start:start + BATCH_SIZE]
        if download_first_model:
            chunks.append(embed(batch))
        else:
            chunks.append(embed.encode(batch))

    # Concatenate once per document instead of re-copying the growing buffer
    # after every batch.
    list_of_embeddings_of_external_docs.append(np.concatenate(chunks, axis=0))

CPU times: total: 1min 7s
Wall time: 1min 2s


### Computing similarity matrix

In [93]:
# One record per (instruction sentence, external document) pair, holding the
# best-matching sentence of that external document and its similarity score.
list_for_df = []

for index, doc1 in enumerate(list_of_instruction_embeddings):
    for ind, doc2 in enumerate(list_of_embeddings_of_external_docs):
        # A document without sentences has nothing to match; computing the
        # similarity or the arg-max on an empty matrix would raise.
        if len(doc1) == 0 or len(doc2) == 0:
            continue

        if download_first_model:
            similarity_matrix = np.inner(doc1, doc2)
        else:
            similarity_matrix = cosine_similarity(doc1, doc2)

        # Row i holds the scores of instruction sentence i against every
        # sentence of the external document; take the best column per row.
        best_columns = similarity_matrix.argmax(axis=1)
        best_scores = similarity_matrix.max(axis=1)

        for i, (best_column, best_score) in enumerate(zip(best_columns, best_scores)):
            best_column = int(best_column)
            list_for_df.append({
                'instruction_filename': list_of_instruction_paths[index],
                'external_doc_filename': list_of_external_doc_paths[ind],
                'sentence_number_from_instruction': i,
                'sentence_number_from_external_doc': best_column,
                'value': best_score,
                'text_of_instruction': list_of_instructions[index][i],
                'text_of_external_doc': list_of_external_docs[ind][best_column],
            })

## Creating dataframe

In [94]:
# Column layout of the result table, in the order it is written out.
result_columns = [
    'sentence_number_from_instruction',
    'sentence_number_from_external_doc',
    'value',
    'text_of_instruction',
    'instruction_filename',
    'text_of_external_doc',
    'external_doc_filename',
]

# Empty frame that only fixes the column order.
new_df = pd.DataFrame(columns=result_columns)

In [95]:
# Build the table in one step from the collected records. Assigning rows one
# at a time through .loc enlarges the frame on every insertion, which gets
# very slow as the number of records grows. The column order of the existing
# (empty) frame is reused so the layout stays the same.
new_df = pd.DataFrame(list_for_df, columns=list(new_df.columns))

In [96]:
# Export the result table to the 'good' sheet of embeddings.xlsx.
with pd.ExcelWriter("embeddings.xlsx", engine="xlsxwriter") as writer:
    # sheet_name is passed by keyword (positional use is deprecated); the
    # engine is already fixed by the ExcelWriter above.
    new_df.to_excel(writer, sheet_name='good')
    sheets_good = writer.sheets['good']
    # to_excel writes a header row and the index column in addition to the
    # data, so the last row/column indices equal the frame's shape.
    sheets_good.autofilter(0, 0, new_df.shape[0], new_df.shape[1])

# Report success only after the writer has been closed and the file saved.
print("Файл создан")


Файл создан
