# Build the Hugging Face (HF) embedding model

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model selection, based on the MTEB leaderboard:
#   https://huggingface.co/spaces/mteb/leaderboard
# Candidates noted by the author:
#   - "lier007/xiaobu-embedding-v2"        -> rank 1 in Chinese (currently used)
#   - "Alibaba-NLP/gte-Qwen2-7B-instruct"  -> rank 2 in Chinese (swap in below to try it)
sentence_transformer_model = "lier007/xiaobu-embedding-v2"

# LangChain embedding wrapper shared by the HF vector-database cells below.
# `cache_folder` is a path relative to the notebook's working directory.
hf_embeddings_model = HuggingFaceEmbeddings(
    cache_folder="../sentence_transformer_model",
    model_name=sentence_transformer_model,
)


  from tqdm.autonotebook import tqdm, trange


: 

# Build HF vector database

In [2]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---- Settings for the HF-embedded collection ------------------------------
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md"]
document_idx = 0
embedding_cache_path = "../embedding_cache/"
db_collection_names = ["CNS16190_md_op_emb_1000_200", "CNS16190_md_hf_emb_1000_200"]
db_collection_idx = 1

# Resolve the selected collection once instead of re-indexing the list at each use.
hf_collection_name = db_collection_names[db_collection_idx]
# Directory whose presence is treated as "this collection has already been built".
# NOTE: only the collection name is checked, so changing `document_idx` after the
# collection exists on disk has no effect until that directory is removed.
hf_collection_dir = os.path.join(embedding_cache_path, "collection", hf_collection_name)

if not os.path.isdir(hf_collection_dir):
    # Nothing cached yet: load the markdown file, chunk it, embed it and persist it.
    hf_source_path = os.path.join(document_root_path, documents[document_idx])
    loader = UnstructuredMarkdownLoader(hf_source_path, mode="elements")
    doc = loader.load()

    # 1000-character chunks with 200 characters of overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=hf_collection_name,
    )
else:
    # Collection directory is present: attach to it instead of re-embedding.
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=hf_collection_name,
    )

  attn_output = torch.nn.functional.scaled_dot_product_attention(


# Test search results of HF vector database 

In [8]:
question = "「於所定義支援期間內，對其銷售及生產持續監視及矯正安全脆弱性。」符合哪一項控制措施？"

# Each entry is (heading to print, keyword arguments for `hf_vectorstore.search`).
# The same question is run through all three retrieval strategies so their
# results can be compared side by side in the cell output.
search_runs = [
    ("similarity\n", {"search_type": "similarity", "k": 5}),
    (
        "\n\nsimilarity_score_threshold\n",
        {"search_type": "similarity_score_threshold", "k": 5, "score_threshold": 0.7},
    ),
    ("\n\nmmr\n", {"search_type": "mmr", "k": 5}),
]

for heading, search_kwargs in search_runs:
    print(heading)
    # After the loop, `relevant_docs` holds the results of the last ("mmr") run.
    relevant_docs = hf_vectorstore.search(question, **search_kwargs)

    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='控制措施5.2-3：製造者宜於所定義支援期間內，對其銷售及生產，以及所生產之產品及運作的服務，持續監視、識別及矯正安全脆弱性。' metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 2, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'parent_id': 'd5365764ada7a35c75a0d0918088ba52', 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': '5a0c949ef2e147fbb43223fc5c9ed917', '_collection_name': 'CNS16190_md_hf_emb_1000_200'}

page_content='預期於其所定義支援期間內，對裝置執行此運作。然而製造者可於該期間外繼續執行此運作並發布安全更新，以矯正脆弱性。' metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 0, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': '16cc2a3b38f2426ba342de5e3282d4ae', '_collection_name': 'CNS16190_md_hf_emb_1000_200'}

page_content='控制措施5.3-13：製造者應以對使用者清晰透通之可存取方式，公布所定義的支援期間。' metadata={'source': '../docs\\CNS16190-zh_TW.md', '

# Build OpenAI vector database

In [2]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

# ---- Settings for the OpenAI-embedded collection --------------------------
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md", "ts_103701_split.pdf"]
document_idx = 2
embedding_cache_path = "../embedding_cache/"
db_collection_names = ["CNS16190_md_op_emb_1000_200", "CNS16190_md_hf_emb_1000_200", "TS103701_pdf_op_emb_1000_200"]
db_collection_idx = 2
# Chooses the loader used when the collection has to be built: "md" or "pdf".
# It is not derived from the file name, so keep it in sync with documents[document_idx].
mode = "pdf"

# Fail fast on an unsupported mode. Without this check neither branch would run,
# `openai_vectorstore` would stay undefined (or keep a stale value from an earlier
# run), and the error would only surface in a later cell.
if mode not in ("md", "pdf"):
    raise ValueError(f"Unsupported mode {mode!r}; expected 'md' or 'pdf'.")

openai_collection_name = db_collection_names[db_collection_idx]
# One embeddings client, reused for both loading and building the collection.
openai_embeddings_model = OpenAIEmbeddings()

# NOTE(review): this cell and the HF vector-database cell both open local Qdrant
# storage at `embedding_cache_path`; local mode may refuse a second client on the
# same folder within one kernel session -- confirm before relying on "Run All".
if os.path.isdir(
    os.path.join(embedding_cache_path, "collection", openai_collection_name)
):
    # Collection directory is present: attach to it instead of re-embedding.
    # Only the collection name is checked, so `mode` / `document_idx` are ignored here.
    openai_vectorstore = Qdrant.from_existing_collection(
        embedding=openai_embeddings_model,
        path=embedding_cache_path,
        collection_name=openai_collection_name,
    )
else:
    # Nothing cached yet: load the source document with the loader matching `mode`,
    # split it into 1000-character chunks (200 overlap), embed and persist it.
    openai_document_path = os.path.join(document_root_path, documents[document_idx])

    if mode == "md":
        loader = UnstructuredMarkdownLoader(openai_document_path, mode="elements")
        doc = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(doc)
        openai_splits = splits
    else:  # mode == "pdf" (validated above)
        pdf_loader = PyPDFLoader(openai_document_path)
        pdf_doc = pdf_loader.load()

        pdf_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        pdf_splits = pdf_text_splitter.split_documents(documents=pdf_doc)
        openai_splits = pdf_splits

    openai_vectorstore = Qdrant.from_documents(
        openai_splits,
        embedding=openai_embeddings_model,
        path=embedding_cache_path,
        collection_name=openai_collection_name,
    )

# Test search results of OpenAI vector database 

In [9]:
question = """「The purpose of this test case is the conceptual assessment of the manner in which vulnerabilities are acted on a) and the 
confirmation that the preconditions for the implementation are ensured b). 」符合哪一項 Test group？
告訴我他的 Test group objective、Test units 和 Assignment of verdict """

# Each entry is (heading to print, keyword arguments for `openai_vectorstore.search`).
# The same question is run through all three retrieval strategies so their
# results can be compared side by side in the cell output.
search_runs = [
    ("similarity\n", {"search_type": "similarity", "k": 5}),
    (
        "\n\nsimilarity_score_threshold\n",
        {"search_type": "similarity_score_threshold", "k": 5, "score_threshold": 0.7},
    ),
    ("\n\nmmr\n", {"search_type": "mmr", "k": 5}),
]

for heading, search_kwargs in search_runs:
    print(heading)
    # After the loop, `relevant_docs` holds the results of the last ("mmr") run.
    relevant_docs = openai_vectorstore.search(question, **search_kwargs)

    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='acknowledgement of receipt and status updates. \nThe verdict FAIL is assigned otherwise. \n5.2.2 Test group 5.2-2 \n5.2.2.0 Test group objective \nThe test group addresses the provision 5.2-2.  \n5.2.2.1 Test case 5.2-2-1 (conceptual) \nTest purpose \nThe purpose of this test case is the conceptual assessment of the manner in which vulnerabilities are acted on a) and the \nconfirmation that the preconditions fo r the implementation are ensured b). \nTest units \na) The TL shall  assess whether the "Action" and the "Time Frame" of each disclosed vulnerability in \nIXIT 3-VulnTypes facilitate that vulnerabilities  are acted on in a timely manner under consideration of the \nvulnerability disclosure policy according to "Publication of Vulnerability Disclosure Policy" in IXIT 2-UserInfo. \nNOTE 1: The consideration of severity and criticality of the addressed vulnerabilities and the kind of vulnerability \n(e.g. firmware, hardware or software) is helpful.' metadat