In [1]:
import os

import dotenv

# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# ROOT_PATH anchors every path built in the later cells. os.getenv returns
# None when the variable is missing, which would make os.path.expanduser fail
# with an opaque TypeError, so check for it explicitly and fail with a clear
# message instead.
_root_path_env = os.getenv("ROOT_PATH")
if _root_path_env is None:
    raise RuntimeError(
        "ROOT_PATH is not set; define it in the environment or in a .env file."
    )
ROOT_PATH = os.path.expanduser(_root_path_env)
ROOT_PATH

True

# Build HF embedding

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Candidate embedding models, chosen from the MTEB leaderboard:
# https://huggingface.co/spaces/mteb/leaderboard
sentence_transformer_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "lier007/xiaobu-embedding-v2",  # rank 1 in chinese
    "Alibaba-NLP/gte-large-en-v1.5",  # rank 21 in english
    # "iampanda/zpoint_large_embedding_zh", # rank 4 in chinese
    # "dunzhang/stella_en_400M_v5", # rank 6 in english  (deprecated)
]

# Index 2 picks "Alibaba-NLP/gte-large-en-v1.5" from the list above.
sentence_transformer_model = sentence_transformer_models[2]

# Model files are cached under ROOT_PATH/sentence_transformer_model.
# NOTE(review): trust_remote_code=True is passed for whichever model is
# selected above; confirm that is intended for every candidate in the list.
hf_embeddings_model = HuggingFaceEmbeddings(
    model_kwargs=dict(trust_remote_code=True),
    cache_folder=os.path.join(ROOT_PATH, "sentence_transformer_model"),
    model_name=sentence_transformer_model,
)


  from tqdm.autonotebook import tqdm, trange


# Build HF vector database

In [4]:
import re

from langchain_community.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source documents live under ROOT_PATH/docs; document_idx selects which one
# this run indexes or loads.
document_root_path = os.path.join(ROOT_PATH, "docs")
documents = [
    "CNS16190-zh_TW.md",  # 0
    "CNS16190-zh_TW_only_provision.md",  # 1
    "CNS16190-zh_TW_only_provision.pdf",  # 2
    "ts_103701_only_test_scenario.pdf",  # 3
    "en_303645_only_provision.pdf",  # 4
]
document_idx = 4

# Chunking parameters. They are part of the collection name below AND are
# passed to the text splitter, so the name always describes the stored chunks.
chunk_size = 1000
chunk_overlap = 200
# Short model tag: the part of the model id after the last "/" up to the first
# "-" or "_" (e.g. "Alibaba-NLP/gte-large-en-v1.5" -> "gte").
model_alias = re.split("[-_]", sentence_transformer_model.split("/")[-1])[0]
embedding_cache_path = os.path.join(ROOT_PATH, "embedding_cache")

# File extension of the selected document ("md" or "pdf").
mode = documents[document_idx].split(".")[-1]
if mode not in ("md", "pdf"):
    # Previously an unsupported extension skipped both branches and left
    # hf_vectorstore undefined (or stale from an earlier run).
    raise ValueError(
        f"Unsupported document type {mode!r} for {documents[document_idx]!r}; "
        "expected 'md' or 'pdf'."
    )

db_collection_names = [
    f"CNS16190_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
    f"TS103701_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
    f"EN303645_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
]
# Match the collection to the document by their first two characters,
# case-insensitively ("CN", "TS", "EN").
db_collection_idx = next(
    (
        idx
        for idx, item in enumerate(db_collection_names)
        if item[:2].casefold() == documents[document_idx][:2].casefold()
    ),
    -1,
)
if db_collection_idx == -1:
    # Indexing with -1 would silently select the last collection, so a
    # document with an unknown prefix must be rejected explicitly.
    raise ValueError(
        f"No collection name matches document {documents[document_idx]!r}."
    )

collection_name = db_collection_names[db_collection_idx]
document_path = os.path.join(document_root_path, documents[document_idx])

# The existence check looks for <embedding_cache_path>/collection/<name>,
# the same directory the original cell tested for.
if os.path.isdir(os.path.join(embedding_cache_path, "collection", collection_name)):
    # database already exists, load it
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # database does not exist, create it; the loader is only constructed here
    # so a cached collection can be loaded without the source document.
    if mode == "md":
        loader = UnstructuredMarkdownLoader(document_path, mode="elements")
    else:
        loader = PyPDFLoader(document_path)
    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )

# CNS16190 search

In [5]:
from langsmith import traceable

# Query text for the CNS16190 search: a quoted passage about vulnerability
# disclosure policy, followed by the question "which control measure does
# this correspond to?".
question = """「脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。」
符合哪一項控制措施？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query):
    """Run a similarity search on ``hf_vectorstore`` and return the top 5 hits.

    Args:
        query: Text to search the vector store with.

    Returns:
        A list of dicts with the keys ``"page_content"``, ``"type"`` and
        ``"metadata"``, one per retrieved document.
    """
    # Search with the caller's argument. The earlier version ignored `query`
    # and always searched with the module-level `question`, so calling the
    # function with any other text returned results for the wrong query.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=5)
    return _convert_docs(relevant_docs)

retrieve_docs(question)


[{'page_content': '脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進\n一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。\n協調脆弱性揭露(CVD)係1組用以處理有關潛在安全脆弱性之揭露，並支援修復此等脆弱性的過程\n。CVD係由國際標準化組織(ISO)於ISO/IEC29147[4]中關於脆弱性揭露之標準化，且已於全球某些大\n型軟體公司中證明成功。\n於IoT產業中，CVD目前尚未成熟[16]，因某些公司不願與安全研究者往來。於此，CVD為公司提供\n框架以管理此過程。此係安全研究者對公司通報安全問題之途徑，使公司領先於惡意利用的威脅\n，並給予公司於公開揭露前回應，並解決脆弱性之機會。\n控制措施5.2-2：對已揭露之脆弱性宜以及時方式採取行動。\n對脆弱性採取行動之〝及時方式〞差異甚大，且為事故特定；然而通常情況下，對軟體解決方案\n，脆弱性處理過程係於90天內完成，包括修補程式之可用性及問題通知。以解決硬體修復可能需\n比軟體修復較長時間。此外，與伺服器軟體修復相比，須部署至裝置之修復可能需較長時間實施\n。\n控制措施5.2-\n3：製造者宜於所定義支援期間內，對其銷售及生產，以及所生產之產品及運作的\n服務，持續監視、識別及矯正安全脆弱性。\n備考1. \n預期製造者將實施適當維護產品中使用之所有軟體及硬體，此包括適當維護及提供相關聯服務所\n選定的第三方，以支援產品功能。\n軟體解決方案通常包含開放原始碼及第三方軟體組件。建立並維護所有軟體組件及其子組件之列\n表，係能監視產品脆弱性的先決條件。存在各種工具用以掃描原始碼及二進碼，並構建所謂軟體\n組成清單(SBOM)，其可識別第三方組件及產品中所使用的版本。然後，此資訊用以監視各已識別\n軟體組件之相關聯安全及使用授權的風險。\n預期脆弱性將於第一時間直接報告受影響之利害相關者。若不可能，可將脆弱性報告主管機關。\n亦鼓勵製造者與諸如GSMA \n及IoT安全基金會等權責產業機構共享資訊。協調脆弱性揭露之指引，可查詢參引ISO/IEC29147[4]\n之IoT安全基金會[22]。\n預期於其所定義支援期間內，對裝置執行此運作。然而製造者可於該期間外繼續執行此運作並發\n布安全更新，以矯正脆弱性。',
  'type': 'Docume

# TS103701 search

In [5]:
from langsmith import traceable

# Query text for the TS103701 search. The quoted passage is a test-case
# purpose statement (the original note here reads "5.3.14.2", presumably the
# clause it was taken from; confirm against the standard). The four numbered
# questions ask, in order: which Test group the passage matches, and that
# group's Test purpose, Test units and Assignment of verdict.
question = """「The purpose of this test case is the functional assessment of the publication of the rationale for absence of updates and 
hardware replacement support.」
1. 符合哪一項 Test group？
2. 這個 Test group 的 Test purpose 是什麼？
3. 這個 Test group 的 Test units 是什麼？
4. 這個 Test group 的  Assignment of verdict 是什麼？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query):
    """Run a similarity search on ``hf_vectorstore`` and return the top 5 hits.

    Args:
        query: Text to search the vector store with.

    Returns:
        A list of dicts with the keys ``"page_content"``, ``"type"`` and
        ``"metadata"``, one per retrieved document.
    """
    # Search with the caller's argument. The earlier version ignored `query`
    # and always searched with the module-level `question`, so calling the
    # function with any other text returned results for the wrong query.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=5)
    return _convert_docs(relevant_docs)

retrieve_docs(question)

[{'page_content': 'The purpose of this test case is the functional assessment of the publication of the rationale for absence of updates and hardware replacement support.',
  'type': 'Document',
  'metadata': {'source': '/home/yuva/dev/Auray/docs/ts_103701_only_test_scenario.md',
   'last_modified': '2024-08-25T21:33:02',
   'languages': ['eng'],
   'parent_id': 'b90f799704398e12c25fa25c8bcda41b',
   'filetype': 'text/markdown',
   'file_directory': '/home/yuva/dev/Auray/docs',
   'filename': 'ts_103701_only_test_scenario.md',
   'category': 'NarrativeText',
   'element_id': '9d818380be1a7fe44ad7413a7233e334',
   '_id': '1cdca2f441a940ac8b82c7474f7bec9c',
   '_collection_name': 'TS103701_md_hf_gte_emb_1000_200'}},
 {'page_content': '5.3.4 Test Group 5.3-4 5.3.4.0 Test Group Objective 5.3.4.1 Test Case 5.3-4-1 (Conceptual) Test Purpose',
  'type': 'Document',
  'metadata': {'source': '/home/yuva/dev/Auray/docs/ts_103701_only_test_scenario.md',
   'category_depth': 1,
   'last_modified':

# EN303645 search

In [5]:
from langsmith import traceable

# Query text for the EN303645 search: a quoted passage about monitoring,
# identifying and rectifying security vulnerabilities, followed by the
# question "which provision does this correspond to?".
question = """「Manufacturers should continually monitor for, identify and rectify security vulnerabilities within 
products and services they sell, produce, have produced and services they operate during the defined support period.」
符合哪一項 provision？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query):
    """Run a similarity search on ``hf_vectorstore`` and return the top 5 hits.

    Args:
        query: Text to search the vector store with.

    Returns:
        A list of dicts with the keys ``"page_content"``, ``"type"`` and
        ``"metadata"``, one per retrieved document.
    """
    # Search with the caller's argument. The earlier version ignored `query`
    # and always searched with the module-level `question`, so calling the
    # function with any other text returned results for the wrong query.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=5)
    return _convert_docs(relevant_docs)

retrieve_docs(question)

[{'page_content': 'ETSI ETSI EN 303 645 V2.1.1 (2020 -06) 15  \nProvision 5.2-3 Manufacturers should continually monitor for, identify and rectify security vulnerabilities within \nproducts and services they sell, produce, have produced and services they operate during the defined support period. \nNOTE 1: Manufacturers are expected to exercise due care for all software and hardware components used in the \nproduct, this includes due care related to the selected third parties that provide associated services to \nsupport the functions of the product.',
  'type': 'Document',
  'metadata': {'source': '/home/yuva7508/Auray/docs/en_303645_only_provision.pdf',
   'page': 2,
   '_id': '5757047ce1714f398a7f359a7b198fb4',
   '_collection_name': 'EN303645_pdf_hf_gte_emb_1000_200'}},
 {'page_content': 'Vulnerabilities are expected to be reported directly to the affected stakeholders in the first instance. If that is not \npossible, vulnerabilities can be reported to national authorities. Manufac