In [1]:
import dotenv

# Read a .env file and export its entries as environment variables.
# NOTE(review): presumably this supplies the LangSmith settings used by
# `traceable` in the search cells — confirm against the .env file.
dotenv.load_dotenv()

True

In [2]:
import os

# Project root: the "Auray" folder inside the current user's home directory
# ("~" is expanded by os.path.expanduser).
root_path = os.path.expanduser('~/Auray')
# Bare expression so the notebook displays the resolved path.
root_path

'/home/yuva7508/Auray'

# Build HF embedding

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# https://huggingface.co/spaces/mteb/leaderboard
# Candidate sentence-transformer checkpoints. The trailing notes record the
# leaderboard rank noted by the author; commented-out entries are kept only
# as a record of alternatives.
sentence_transformer_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "lier007/xiaobu-embedding-v2",  # rank 1 in chinese
    "Alibaba-NLP/gte-large-en-v1.5",  # rank 21 in english
    # "iampanda/zpoint_large_embedding_zh", # rank 4 in chinese
    # "dunzhang/stella_en_400M_v5", # rank 6 in english  (deprecated)
]

# Index 2 selects "Alibaba-NLP/gte-large-en-v1.5".
sentence_transformer_model = sentence_transformer_models[2]

# Model files are cached inside the project folder.
model_cache_dir = os.path.join(root_path, "sentence_transformer_model")

# trust_remote_code=True allows Python code shipped with the checkpoint to
# run locally, so only select checkpoints from sources you trust.
hf_embeddings_model = HuggingFaceEmbeddings(
    model_name=sentence_transformer_model,
    cache_folder=model_cache_dir,
    model_kwargs={"trust_remote_code": True},
)


  from tqdm.autonotebook import tqdm, trange


# Build HF vector database

In [4]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import re

# Folder that holds the source standards documents.
document_root_path = os.path.join(root_path, "docs")
documents = [
    "CNS16190-zh_TW.md",
    "CNS16190-zh_TW_only_provision.md",
    "ts_103701_only_test_scenario.pdf",
    "en_303645_only_provision.pdf",
]
# Index into `documents` of the file to index / search.
document_idx = 3

# Splitter settings; both values are also embedded in the collection names
# below so that differently chunked indexes do not collide.
chunk_size = 1000
chunk_overlap = 200
# Short model tag: the repository name after the last "/", cut at the first
# "-" or "_" (e.g. "Alibaba-NLP/gte-large-en-v1.5" -> "gte").
model_alias = re.split('[-_]', re.split('/', sentence_transformer_model)[-1])[0]
embedding_cache_path = os.path.join(root_path, "embedding_cache")

# File extension of the selected document ("md" or "pdf" for the list above).
mode = documents[document_idx].split(".")[-1]
db_collection_names = [
    f"CNS16190_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
    f"TS103701_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
    f"EN303645_{mode}_hf_{model_alias}_emb_{chunk_size}_{chunk_overlap}",
]
# Position in `db_collection_names` that belongs to each entry of `documents`.
# Deriving the collection from the document (instead of setting both indexes by
# hand) prevents a document from being indexed, or a cached collection from
# being reopened, under another standard's name.
# NOTE(review): both CNS16190 markdown files share one collection name, so a
# collection cached from one of them is reused for the other.
document_collection_idx = [0, 0, 1, 2]
db_collection_idx = document_collection_idx[document_idx]


# Build the vector store for the selected document, or reopen it when a
# collection with the same name is already cached on disk.
collection_name = db_collection_names[db_collection_idx]
document_path = os.path.join(document_root_path, documents[document_idx])
# A directory at <embedding_cache_path>/collection/<collection_name> is taken
# as proof that the collection was built by an earlier run.
collection_dir = os.path.join(embedding_cache_path, "collection", collection_name)

if mode not in ("md", "pdf"):
    # Fail fast rather than leave `hf_vectorstore` undefined (or stale from an
    # earlier run) for the search cells.
    raise ValueError(
        f"Unsupported document type {mode!r} for {documents[document_idx]!r}; "
        "expected 'md' or 'pdf'."
    )

if os.path.isdir(collection_dir):
    # Collection already exists: load it instead of re-embedding the document.
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # Collection does not exist yet: load, split, embed and store the document.
    if mode == "md":
        loader = UnstructuredMarkdownLoader(document_path, mode="elements")
    else:
        loader = PyPDFLoader(document_path)
    doc = loader.load()

    # Use the configured chunking values so the stored chunks always match the
    # sizes advertised in the collection name.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents(doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )

# CNS16190 search

In [5]:
from langsmith import traceable

# Query (Chinese). In English, roughly: "The vulnerability disclosure policy
# clearly specifies the process by which security researchers and others can
# report issues. The policy can be updated as necessary to further ensure
# transparency and clarity in the dealings between manufacturers and security
# researchers, and vice versa." -- "Which control measure does this match?"
question = """「脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。」
符合哪一項控制措施？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query, k=5):
    """Return the chunks of ``hf_vectorstore`` most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of hits to request; defaults to 5, the value that used to be
            hard-coded.

    Returns:
        The hits converted by ``_convert_docs`` into plain dictionaries.
    """
    # Search with the argument rather than the notebook-level `question`;
    # otherwise any other query silently returns the results for `question`.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=k)
    return _convert_docs(relevant_docs)

# Run the retriever for the question defined in this cell; as the last
# expression of the cell, its return value is displayed as the cell output.
retrieve_docs(question)


[{'page_content': '脆弱性揭露政策明確規定安全研究者及其他人能報告問題之過程。必要時可更新此種政策，以進一步確保製造者與安全研究者往來之透明度及清晰度，反之亦然。',
  'type': 'Document',
  'metadata': {'source': '/home/yuva7508/Auray/docs/CNS16190-zh_TW_only_provision.md',
   'category_depth': 0,
   'last_modified': '2024-08-08T01:51:56',
   'languages': ['kor'],
   'filetype': 'text/markdown',
   'file_directory': '/home/yuva7508/Auray/docs',
   'filename': 'CNS16190-zh_TW_only_provision.md',
   'category': 'Title',
   'element_id': 'eee6593a76cfd779067189d23698a4a8',
   '_id': '251f5e4bcad74e92b8bb426169e91fb1',
   '_collection_name': 'CNS16190_md_hf_gte_emb_1000_200'}},
 {'page_content': '控制措施5.11-3：宜對使用者明確說明如何刪除其個人資料。',
  'type': 'Document',
  'metadata': {'source': '/home/yuva7508/Auray/docs/CNS16190-zh_TW_only_provision.md',
   'category_depth': 2,
   'last_modified': '2024-08-08T01:51:56',
   'languages': ['kor'],
   'parent_id': '24ca796268a594c8baa94404739edf5b',
   'filetype': 'text/markdown',
   'file_directory': '/home/yuva7508/Auray/docs',
   

# TS103701 search

In [5]:
from langsmith import traceable

# Query: an English test-purpose sentence followed by four questions in
# Chinese. In English, roughly:
#   1. Which Test group does it match?
#   2. What is the Test purpose of this Test group?
#   3. What are the Test units of this Test group?
#   4. What is the Assignment of verdict of this Test group?
question = """「The purpose of this test case is the functional assessment of the publication of the rationale for absence of updates and 
hardware replacement support.」
1. 符合哪一項 Test group？
2. 這個 Test group 的 Test purpose 是什麼？
3. 這個 Test group 的 Test units 是什麼？
4. 這個 Test group 的  Assignment of verdict 是什麼？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query, k=5):
    """Return the chunks of ``hf_vectorstore`` most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of hits to request; defaults to 5, the value that used to be
            hard-coded.

    Returns:
        The hits converted by ``_convert_docs`` into plain dictionaries.
    """
    # Search with the argument rather than the notebook-level `question`;
    # otherwise any other query silently returns the results for `question`.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=k)
    return _convert_docs(relevant_docs)

# Run the retriever for the question defined in this cell; as the last
# expression of the cell, its return value is displayed as the cell output.
retrieve_docs(question)

[{'page_content': 'Assignment of verdict \nThe verdict PASS is assigned if: \n• the access to the resource for publishing the defined support period to the user is provided as described in the \nIXIT; and \n• the access to the resource for publishing the defined support period is unrestricted; and \n• the defined support period is published. \nThe verdict FAIL is assigned otherwise. \n5.3.14 Test group 5.3-14 \n5.3.14.0 Test group objective \nThe test group addresses the provision 5.3-14. \n5.3.14.1 Test case 5.3-14-1 (conceptual) \nTest purpose \nThe purpose of this test case is the conceptual assessment of the publication of the rationale for absence of updates and \nhardware replacement support.',
  'type': 'Document',
  'metadata': {'source': '/home/yuva7508/Auray/docs/ts_103701_split.pdf',
   'page': 21,
   '_id': '11bbf9412e474e4ba4f8c2eb02978f56',
   '_collection_name': 'TS103701_pdf_hf_gte_emb_1000_200'}},
 {'page_content': 'practicability reasons or security reasons; and \n• n

# EN303645 search

In [5]:
from langsmith import traceable

# Query: an English sentence about monitoring and fixing vulnerabilities,
# followed by a question in Chinese. In English, roughly: "Which provision
# does it match?"
question = """「Manufacturers should continually monitor for, identify and rectify security vulnerabilities within 
products and services they sell, produce, have produced and services they operate during the defined support period.」
符合哪一項 provision？"""

def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]

@traceable(run_type="retriever")
def retrieve_docs(query, k=5):
    """Return the chunks of ``hf_vectorstore`` most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of hits to request; defaults to 5, the value that used to be
            hard-coded.

    Returns:
        The hits converted by ``_convert_docs`` into plain dictionaries.
    """
    # Search with the argument rather than the notebook-level `question`;
    # otherwise any other query silently returns the results for `question`.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=k)
    return _convert_docs(relevant_docs)

# Run the retriever for the question defined in this cell; as the last
# expression of the cell, its return value is displayed as the cell output.
retrieve_docs(question)

[{'page_content': 'ETSI ETSI EN 303 645 V2.1.1 (2020 -06) 15  \nProvision 5.2-3 Manufacturers should continually monitor for, identify and rectify security vulnerabilities within \nproducts and services they sell, produce, have produced and services they operate during the defined support period. \nNOTE 1: Manufacturers are expected to exercise due care for all software and hardware components used in the \nproduct, this includes due care related to the selected third parties that provide associated services to \nsupport the functions of the product.',
  'type': 'Document',
  'metadata': {'source': '/home/yuva7508/Auray/docs/en_303645_only_provision.pdf',
   'page': 2,
   '_id': '5757047ce1714f398a7f359a7b198fb4',
   '_collection_name': 'EN303645_pdf_hf_gte_emb_1000_200'}},
 {'page_content': 'Vulnerabilities are expected to be reported directly to the affected stakeholders in the first instance. If that is not \npossible, vulnerabilities can be reported to national authorities. Manufac