# Build vector database

In [1]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# --- Configuration -----------------------------------------------------------
# Directory holding the markdown sources, the candidate files, and which one
# to index.
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md"]
document_idx = 0

# Path handed to Qdrant as its local storage location, the candidate
# collection names, and which one to use.
embedding_cache_path = "../embedding_cache/"
db_collection_names = ["CNS16190_md_op_emb_1000_200", "CNS16190_md_hf_emb_1000_200"]
db_collection_idx = 0

# --- Load or build the vector store ------------------------------------------
_collection_name = db_collection_names[db_collection_idx]
# The collection is treated as already built when this directory exists
# (presumably the on-disk layout of local Qdrant storage -- verify if the
# qdrant-client version changes).
_collection_dir = os.path.join(embedding_cache_path, "collection", _collection_name)

if not os.path.isdir(_collection_dir):
    # Nothing stored yet: read the selected markdown file, split it into
    # chunks, then embed the chunks into a new collection.
    _source_file = os.path.join(document_root_path, documents[document_idx])
    loader = UnstructuredMarkdownLoader(_source_file, mode="elements")
    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(doc)

    vectorstore = Qdrant.from_documents(
        splits,
        embedding=OpenAIEmbeddings(),
        path=embedding_cache_path,
        collection_name=_collection_name,
    )
else:
    # The directory is present: open the stored collection instead of
    # re-embedding the document.
    vectorstore = Qdrant.from_existing_collection(
        embedding=OpenAIEmbeddings(),
        path=embedding_cache_path,
        collection_name=_collection_name,
    )


# Test the search results of the vector database

In [2]:
# Query sent to the vector store in each of the search modes below.
question = "「對本標準中視為不適用或消費者IoT裝置未滿足之各項建議，應記錄衡量理由」符合哪一項控制措施？"

# One entry per search mode: the header printed before the results, and the
# keyword arguments passed to `vectorstore.search` for that mode.
_search_runs = [
    ("similarity\n", {"search_type": "similarity", "k": 5}),
    (
        "\n\nsimilarity_score_threshold\n",
        {"search_type": "similarity_score_threshold", "k": 5, "score_threshold": 0.7},
    ),
    ("\n\nmmr\n", {"search_type": "mmr", "k": 5}),
]

for _header, _search_kwargs in _search_runs:
    print(_header)
    relevant_docs = vectorstore.search(question, **_search_kwargs)

    # Print each returned document followed by a blank line.
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='控制措施4-1：對本標準中視為不適用或消費者IoT裝置未滿足之各項建議，應記錄衡量理由。' metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 1, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'parent_id': 'a48e6c6b789a8e5351cb8315efdf38ad', 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': 'edf8c58121fa45dda6031125cb2bda3e', '_collection_name': 'CNS16190_md_op_emb_1000_200'}

page_content='本標準提供適用於所有消費者IoT裝置之1組基準控制措施。其旨在藉由其他標準加以補充，該等標準係對特定裝置，定義更特定控制措施且完全可測試及/或可查證之要求事項，其將與本標準一起有助於保證方案的制定。' metadata={'source': '../docs\\CNS16190-zh_TW.md', 'category_depth': 0, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title', '_id': '5d7412c75df6495a9ef744a3fc2331c9', '_collection_name': 'CNS16190_md_op_emb_1000_200'}

page_content='本標準設定安全基準；然而，由於消費者IoT之廣闊前景，控制措施的適用性取決於各裝置。本標準透過使用非必備宜使用控制措施(建議)，提供一定程度