In [4]:
import os

# Project root: the "Auray" directory directly under the current user's home.
project_dir_name = 'Auray'
root_path = os.path.expanduser(f'~/{project_dir_name}')
# Bare expression so the notebook displays the resolved path.
root_path

'/home/yuva7508/Auray'

# Build the Hugging Face embedding model

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Candidate sentence-transformer checkpoints. The rank comments refer to the
# MTEB leaderboard: https://huggingface.co/spaces/mteb/leaderboard
sentence_transformer_models = [
    "lier007/xiaobu-embedding-v2",             # rank 1 in chinese
    "iampanda/zpoint_large_embedding_zh",      # rank 4 in chinese
    "Alibaba-NLP/gte-large-en-v1.5",           # rank 21 in english
    "sentence-transformers/all-MiniLM-L6-v2",
    # "dunzhang/stella_en_400M_v5",            # rank 6 in english  (deprecated)
]

# Active checkpoint: index 2 selects "Alibaba-NLP/gte-large-en-v1.5".
sentence_transformer_model = sentence_transformer_models[2]

# Downloaded model weights are cached under the project root.
model_cache_dir = os.path.join(root_path, "sentence_transformer_model")

# NOTE(review): trust_remote_code=True is forwarded to the model loader for
# every checkpoint in the list; presumably only some of them need it -- confirm.
hf_embeddings_model = HuggingFaceEmbeddings(
    model_name=sentence_transformer_model,
    cache_folder=model_cache_dir,
    model_kwargs={"trust_remote_code": True},
)


[1723142671.550527] [t53n4xctr1723134817088-d5qrv:15577:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device


  from tqdm.autonotebook import tqdm, trange


# Build the Qdrant vector database from Hugging Face embeddings

In [10]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import re

# Build (or reload) the local Qdrant collection for the selected document.
#
# Inputs from earlier cells: root_path, sentence_transformer_model,
# hf_embeddings_model.
# Output: hf_vectorstore, the Qdrant vector store queried by the search cells.

# --- Source document -------------------------------------------------------
document_root_path = os.path.join(root_path, "docs")
documents = [
    "CNS16190-zh_TW.md",
    "CNS16190-zh_TW_only_provision.md",
    "ts_103701_split.pdf",
]
document_idx = 2

# --- Chunking and naming ---------------------------------------------------
chunk_size = 1000
chunk_overlap = 200
# Short model tag used in collection names: the part of the repo name before
# the first '-' or '_' (e.g. "Alibaba-NLP/gte-large-en-v1.5" -> "gte").
model_alias = re.split('[-_]', re.split('/', sentence_transformer_model)[-1])[0]
embedding_cache_path = os.path.join(root_path, "embedding_cache")

# The file extension decides which loader is used.
mode = documents[document_idx].split(".")[-1]
if mode not in ("md", "pdf"):
    # Fail loudly instead of leaving hf_vectorstore undefined, or stale from a
    # previous run of this cell.
    raise ValueError(
        f"Unsupported document type '{mode}' for '{documents[document_idx]}'; "
        "expected 'md' or 'pdf'."
    )

db_collection_names = [
    f"CNS16190_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
    f"TS103701_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
]
# Collection index for each entry of `documents` (same order). Deriving the
# index from the selected document keeps the two from drifting apart, so a
# document can no longer be stored in / loaded from the other standard's
# collection.
# NOTE(review): both CNS16190 markdown files share one collection name, so
# whichever was embedded first is the one that gets reloaded -- confirm intent.
document_collection_idxs = [0, 0, 1]
db_collection_idx = document_collection_idxs[document_idx]

collection_name = db_collection_names[db_collection_idx]
document_path = os.path.join(document_root_path, documents[document_idx])

if os.path.isdir(os.path.join(embedding_cache_path, "collection", collection_name)):
    # database already exists, load it
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # database does not exist, create it
    if mode == "md":
        loader = UnstructuredMarkdownLoader(document_path, mode="elements")
    else:  # mode == "pdf" (validated above)
        loader = PyPDFLoader(document_path)
    doc = loader.load()

    # Use the configured sizes so the stored chunks always match the
    # chunk_size / chunk_overlap encoded in the collection name.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )

# CNS16190 search (xiaobu embedding model)

In [6]:
# Query: a quoted provision about vulnerability disclosure policy, followed by
# the question "Which control measure does it correspond to?".
question = """「脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。」
符合哪一項控制措施？"""

# One entry per retrieval strategy: (printed header, search_type, extra kwargs).
search_runs = [
    ("similarity\n", "similarity", {}),
    (
        "\n\nsimilarity_score_threshold\n",
        "similarity_score_threshold",
        {"score_threshold": 0.7},
    ),
    ("\n\nmmr\n", "mmr", {}),
]

# Run each strategy against the same question and print the top-5 hits.
for header, search_type, extra_kwargs in search_runs:
    print(header)
    relevant_docs = hf_vectorstore.search(
        question, search_type=search_type, k=5, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。' metadata={'source': '/home/yuva7508/Auray/docs/CNS16190-zh_TW_only_provision.md', 'category_depth': 0, 'last_modified': '2024-08-08T01:51:56', 'languages': ['kor'], 'filetype': 'text/markdown', 'file_directory': '/home/yuva7508/Auray/docs', 'filename': 'CNS16190-zh_TW_only_provision.md', 'category': 'Title', 'element_id': 'eee6593a76cfd779067189d23698a4a8', '_id': '4c8fac908f8d46559f9e2e76b47a2618', '_collection_name': 'CNS16190_md_hf_xiaobu_emb_1000_200'}

page_content='控制措施5.2-1：製造者應使脆弱性揭露政策公開可用。此政策至少應包括：' metadata={'source': '/home/yuva7508/Auray/docs/CNS16190-zh_TW_only_provision.md', 'category_depth': 2, 'last_modified': '2024-08-08T01:51:56', 'languages': ['kor'], 'parent_id': '64f958134b1d92241fc9322cf1505867', 'filetype': 'text/markdown', 'file_directory': '/home/yuva7508/Auray/docs', 'filename': 'CNS16190-zh_TW_only_provision.md', 'category': 'Title', 'element_id': 'd6d154857ba37

# TS103701 search (xiaobu embedding model)

In [7]:
# Query: a quoted TS 103 701 test purpose, followed by four questions asking
# for the matching test group, its test purpose, its test units and its
# assignment of verdict.
question = """「The purpose of this test case is the functional assessment of the publication of the rationale for absence of updates and 
hardware replacement support.」
1. 符合哪一項 Test group？
2. 這個 Test group 的 Test purpose 是什麼？
3. 這個 Test group 的 Test units 是什麼？
4. 這個 Test group 的  Assignment of verdict 是什麼？

"""

# One entry per retrieval strategy: (printed header, search_type, extra kwargs).
search_runs = [
    ("similarity\n", "similarity", {}),
    (
        "\n\nsimilarity_score_threshold\n",
        "similarity_score_threshold",
        {"score_threshold": 0.7},
    ),
    ("\n\nmmr\n", "mmr", {}),
]

# Run each strategy against the same question and print the top-5 hits.
for header, search_type, extra_kwargs in search_runs:
    print(header)
    relevant_docs = hf_vectorstore.search(
        question, search_type=search_type, k=5, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='"evaluated" hints at a formal comparison against a se t of objectives, e.g. a recognized certification scheme. 
The objective of this test group is to assess, firstly, whether the network and security functionalities are reviewed or evaluated on the base of the corresponding scope and secondly, whether the report matches the identification (version and name) of the DUT implementation.' metadata={'source': '/home/yuva7508/Auray/docs/ts_103701_split.pdf', 'page': 31, '_id': '1c8e455f06f34d98a9670e726853d7e7', '_collection_name': 'TS103701_pdf_hf_xiaobu_emb_1000_200'}

page_content='b) The TL shall  check whether "Confirmation of Update Procedures" in IXIT 4-Conf states a confirmation. 
Assignment of verdict 
The verdict PASS is assigned if: 
• there is an indication that the described management procedure allows a timely deployment of security 
updates; and 
• a confirmation for the implementation is given. 
The verdict FAIL is assigned otherwise. 
5.3.9 Test gr

# TS103701 search (gte embedding model)

In [12]:
# Query: a quoted TS 103 701 test purpose, followed by four questions asking
# for the matching test group, its test purpose, its test units and its
# assignment of verdict.
question = """「The purpose of this test case is the conceptual assessment of the isolation capabilities a) and hardware replacement 
support b) of the DUT.」
1. 符合哪一項 Test group？
2. 這個 Test group 的 Test purpose 是什麼？
3. 這個 Test group 的 Test units 是什麼？
4. 這個 Test group 的  Assignment of verdict 是什麼？

"""

# One entry per retrieval strategy: (printed header, search_type, extra kwargs).
search_runs = [
    ("similarity\n", "similarity", {}),
    (
        "\n\nsimilarity_score_threshold\n",
        "similarity_score_threshold",
        {"score_threshold": 0.7},
    ),
    ("\n\nmmr\n", "mmr", {}),
]

# Run each strategy against the same question and print the top-5 hits.
for header, search_type, extra_kwargs in search_runs:
    print(header)
    relevant_docs = hf_vectorstore.search(
        question, search_type=search_type, k=5, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='support b) of the DUT. 
Test units 
a) The TL shall  assess whether the described method in "Isolation" in IXIT 9-ReplSup is suitable to isolate the 
IoT product, i.e. to remove the IoT product from the netw ork it is connected to, or to place the IoT product in a 
self-contained environment. 
b) The TL shall  assess whether the described method in "Hardware Replacement" in IXIT 9-ReplSup is suitable 
to be able to replace the hardware. 
Assignment of verdict 
The verdict PASS is assigned if: 
• the described method is suited for the isolation of the IoT product; and 
• the described method is suited for the replacement of the hardware. 
The verdict FAIL is assigned otherwise. 
5.3.15.2 Test case 5.3-15-2 (functional) 
Test purpose 
The purpose of this test case is the functional assessment of the isolation capabilities (a-c) and hardware replacement 
support (d-e) of the DUT. 
Test units 
a) The TL shall  set up the IoT product in the intended environment.' m