# Build HF embeddings

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model candidates, taken from the MTEB leaderboard
# (https://huggingface.co/spaces/mteb/leaderboard), as noted by the author:
#   "lier007/xiaobu-embedding-v2"        -> rank 1 in Chinese (selected below)
#   "Alibaba-NLP/gte-Qwen2-7B-instruct"  -> rank 2 in Chinese (alternative)
sentence_transformer_model = "lier007/xiaobu-embedding-v2"

# Wrap the selected model as a LangChain embeddings object; cache_folder is
# set to a directory one level above the notebook.
hf_embeddings_model = HuggingFaceEmbeddings(
    cache_folder="../sentence_transformer_model",
    model_name=sentence_transformer_model,
)

  from tqdm.autonotebook import tqdm, trange


# Build HF vector database

In [2]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Configuration -----------------------------------------------------------
# Source documents live under document_root_path; document_idx picks which one
# is indexed when the collection has to be built.
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md"]
document_idx = 0
# Directory handed to Qdrant as its local storage `path`.
embedding_cache_path = "../embedding_cache/"
db_collection_names = ["CNS16190_md_op_emb_1000_200", "CNS16190_md_hf_emb_1000_200"]
db_collection_idx = 1

# Resolve the selected collection once so the existence check and both Qdrant
# calls are guaranteed to refer to the same collection.
collection_name = db_collection_names[db_collection_idx]
collection_dir = os.path.join(embedding_cache_path, "collection", collection_name)

# --- Load the collection if it is already on disk, otherwise build it --------
if os.path.isdir(collection_dir):
    # Database already exists: reuse it instead of re-embedding the document.
    # The embeddings object is passed positionally on purpose: the legacy
    # `Qdrant.from_existing_collection` names this parameter `embeddings`
    # (plural), unlike `from_documents` which takes `embedding`, so the keyword
    # `embedding=` does not bind to it.
    hf_vectorstore = Qdrant.from_existing_collection(
        hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # Database does not exist: load the markdown file element by element,
    # split it into chunks, embed the chunks and persist them.
    loader = UnstructuredMarkdownLoader(
        os.path.join(document_root_path, documents[document_idx]), mode="elements"
    )
    doc = loader.load()

    # 1000 / 200 match the "_1000_200" suffix of the collection names.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents=doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )

In [3]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# Chat model handed to the multi-query retriever.
llm = ChatOpenAI(model="gpt-4o-mini")

# Expose the vector store through the retriever interface, then wrap it in a
# MultiQueryRetriever together with the chat model.
base_retriever = hf_vectorstore.as_retriever()
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=base_retriever,
)


In [7]:
# Enable INFO-level logging for the multi-query retriever module
import logging

logging.basicConfig()

# Raise only this module's logger to INFO; other loggers keep their own level.
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [8]:
# Query text: a quoted statement followed by the actual question about it.
question = (
    "「於所定義支援期間內，對其銷售及生產持續監視及矯正安全脆弱性。」"
    "符合哪一項控制措施？"
)

In [9]:
# Run the multi-query retriever on `question` and keep the returned documents.
# With INFO logging enabled for "langchain.retrievers.multi_query", the
# generated query variants are written to the log output of this cell.
result = retriever_from_llm.invoke(question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. 在支援期間內，哪些控制措施適用於監視和矯正銷售和生產中的安全脆弱性？', '2. 如何確保在所定義的支援期間內對銷售和生產持續進行監視和矯正以解決安全脆弱性問題？', '3. 哪些方法可以用來監視和矯正在支援期間內出現的銷售和生產中的安全脆弱性？']


In [6]:
# Bare expression: shows the value of `result` as this cell's output.
# NOTE(review): this cell's execution count (6) is lower than that of the cell
# assigning `result` (9) — re-run the notebook in order to confirm the
# displayed output is current.
result

[Document(page_content='控制措施5.2-3：製造者宜於所定義支援期間內，對其銷售及生產，以及所生產之產品及運作的服務，持續監視、識別及矯正安全脆弱性。', metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 2, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'parent_id': 'd5365764ada7a35c75a0d0918088ba52', 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': '5a0c949ef2e147fbb43223fc5c9ed917', '_collection_name': 'CNS16190_md_hf_emb_1000_200'}),
 Document(page_content='預期於其所定義支援期間內，對裝置執行此運作。然而製造者可於該期間外繼續執行此運作並發布安全更新，以矯正脆弱性。', metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 0, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': '16cc2a3b38f2426ba342de5e3282d4ae', '_collection_name': 'CNS16190_md_hf_emb_1000_200'}),
 Document(page_content='控制措施5.2-2：對已揭露之脆弱性宜以及時方式採取行動。', metadata={'source': '../docs\\CNS16190-zh