In [1]:
import sys, os
import numpy as np
import faiss
import torch
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

Set folder paths and the device, and read the embedding model name

In [2]:
# Locations are resolved against the current working directory.
DATABASE_PATH = os.path.abspath("../../var/database")
DOCS_PATH = os.path.abspath("../../docs")
# Device handed to the embedding model through its model_kwargs.
DEVICE = "cpu"

# Read the embedding model name stored next to the database.
# Surrounding whitespace (typically a trailing newline at the end of the
# file) is stripped so that it does not end up inside the model name.
with open(os.path.join(DATABASE_PATH, "model.txt"), "r", encoding="utf-8") as f:
    EMBEDDINGS_MODEL_NAME = f.read().strip()

In [13]:
def get_embedding():

    """
    Create the embeddings model and load the saved FAISS vector store.

    Uses the module-level settings EMBEDDINGS_MODEL_NAME, DEVICE and
    DATABASE_PATH.

    Returns a tuple: (embeddings model, vector store loaded with that model).
    """

    model = HuggingFaceEmbeddings(
        model_kwargs={"device": DEVICE},
        model_name=EMBEDDINGS_MODEL_NAME,
    )

    # NOTE(review): allow_dangerous_deserialization is an explicit opt-in.
    # LangChain asks for it because loading a saved store unpickles data,
    # so DATABASE_PATH should only ever contain files you trust.
    load_options = {
        "embeddings": model,
        "allow_dangerous_deserialization": True,
    }
    store = FAISS.load_local(DATABASE_PATH, **load_options)

    return model, store


# Build the embeddings model and load the vector store once; find_docs below
# reads both of these module-level names.
embeddings_model, vector_store = get_embedding()

In [None]:
def find_docs(questions, k=3):

    """
    Find documents close to the averaged embedding of the given questions.

    Every question is embedded with the module-level ``embeddings_model``,
    the resulting vectors are averaged over axis 0, and the averaged vector
    is used to query the module-level ``vector_store``.

    Parameters
    ----------
    questions : str or iterable of str
        One or more questions. A single string is treated as one question.
    k : int, default 3
        Number of documents requested from the vector store.

    Returns
    -------
    tuple
        ``(embeddings, docs)``: the NumPy array built from
        ``embeddings_model.embed_documents(questions)`` and the result of
        ``vector_store.similarity_search_by_vector``.

    Raises
    ------
    ValueError
        If no question is given.
    """

    # A bare string is itself a sequence of characters; wrap it so that it
    # is embedded as one question rather than as many one-letter "questions".
    if isinstance(questions, str):
        questions = [questions]
    else:
        questions = list(questions)

    # The mean of zero vectors is NaN, which would only fail later and
    # obscurely inside the vector store search, so reject it up front.
    if not questions:
        raise ValueError("questions must contain at least one question")

    embeddings = np.array(embeddings_model.embed_documents(questions))
    mean_embedding = embeddings.mean(axis=0).tolist()
    docs = vector_store.similarity_search_by_vector(mean_embedding, k=k)
    return embeddings, docs

def print_docs(docs):
    """
    Print each document as a numbered block.

    For every item in ``docs`` this prints its 1-based position, the value
    of ``metadata['id']`` and its ``page_content``, followed by a line of
    50 dashes. Returns nothing.
    """
    separator = "-" * 50
    for position, item in enumerate(docs, start=1):
        print(f"Result {position}:")
        print(f"ID: {item.metadata['id']}")
        print(f"Content: {item.page_content}")
        # No distance is printed: this function receives only the documents.
        print(separator)


# Questions whose embeddings are averaged into a single search vector.
# Commented-out entries are alternative phrasings kept for experiments.
questions = [
    "Что такое BayLang?",  # "What is BayLang?"
    #"Какие преимущества у BayLang?",  # "What advantages does BayLang have?"
    "Как его установить?",  # "How do I install it?"
    #"Установка BayLang",  # "Installing BayLang"
]
#questions = ["Расскажи о преимуществах BayLang"]  # "Tell me about the advantages of BayLang"

# Find the relevant documents. The embeddings returned alongside them are
# not used here; they get a descriptive throwaway name rather than `_`,
# because assigning to `_` in a notebook stops IPython from updating its
# automatic last-output variables.
_question_embeddings, docs = find_docs(questions)

# Join the document texts into one context string, separated by blank lines.
context = "\n\n".join(doc.page_content for doc in docs)

print(context)