### Data

In [None]:
# Sample Russian sentences used as the corpus for the similarity search.
texts = [
    'Завтра будет хорошая погода.',
    'Мальчик упал и сломал ногу.',
    'Иван Лопатин - лучший тренер в мире.',
]

### Embeddings

In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the multilingual E5 base encoder and cast it to half precision in a
# single expression (`.half()` returns the same module object).
model = SentenceTransformer('intfloat/multilingual-e5-base').half()

In [None]:
# Cache file for the text embeddings, relative to the working directory.
EMBEDDINGS_PATH = 'embeddings.npy'

# Load the cached matrix if it exists. A cache whose row count differs from
# len(texts) is treated as stale, because row i has to describe texts[i].
# NOTE: this check cannot detect edits that keep the number of texts unchanged;
# delete the cache file manually after changing the content of `texts`.
embeddings = np.load(EMBEDDINGS_PATH) if os.path.exists(EMBEDDINGS_PATH) else None

if embeddings is None or embeddings.shape[0] != len(texts):
    pool = model.start_multi_process_pool()
    try:
        embeddings = model.encode(texts, show_progress_bar=True, batch_size=16, pool=pool)
    finally:
        # Always shut the worker processes down, even when encoding raises,
        # so a failed run does not leave orphaned workers behind.
        model.stop_multi_process_pool(pool)
    np.save(EMBEDDINGS_PATH, embeddings)

### Faiss

In [None]:
# !pip install faiss-cpu
import faiss

# Exact (brute-force) inner-product index sized to the embedding dimension.
index = faiss.IndexFlatIP(embeddings.shape[1])
# Faiss operates on float32 vectors, while the model was cast with `.half()`,
# so the embeddings may arrive as float16. Convert explicitly to a
# C-contiguous float32 array instead of relying on the wrapper to coerce it.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

In [None]:
def similarity_texts(text, k=3, threshold=0.75):
    """Return the indexed texts whose similarity to ``text`` reaches ``threshold``.

    Args:
        text: Query string; it is encoded with the module-level ``model``.
        k: Maximum number of neighbours requested from the module-level ``index``.
        threshold: Minimum score (as returned by ``index.search``) a neighbour
            needs in order to be included in the result.

    Returns:
        A list of entries from the module-level ``texts``, in the order the
        index returned them (at most ``k`` items, possibly empty).
    """
    # NOTE(review): the E5 model card recommends "query: " / "passage: "
    # prefixes. The corpus is indexed without a prefix, so the query is left
    # unprefixed as well to stay consistent with the stored vectors.
    query = model.encode([text])
    # Faiss expects contiguous float32 input; the encoder may emit float16.
    query = np.ascontiguousarray(query, dtype=np.float32)

    scores, ids = index.search(query, k)

    # Exactly one query row was submitted, so take row 0 explicitly. Using
    # squeeze() here would collapse the arrays to 0-d when k == 1 and break
    # the iteration below.
    return [
        texts[int(idx)]
        for score, idx in zip(scores[0], ids[0])
        # Faiss pads missing neighbours with label -1; without this guard a
        # permissive threshold would make texts[-1] return the last text.
        if idx >= 0 and score >= threshold
    ]