In [3]:
import os
import dotenv

# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Read from the environment; stays None when the variable is not set.
HF_ACCESS_TOKEN = os.getenv('HF_ACCESS_TOKEN')

# ROOT_PATH is required by every later cell. Fail with an actionable message
# when it is missing instead of letting os.path.expanduser(None) raise an
# opaque TypeError.
_root_path_env = os.getenv("ROOT_PATH")
if _root_path_env is None:
    raise RuntimeError(
        "ROOT_PATH is not set; define it in the environment or in the .env file."
    )

# expanduser resolves a leading "~" to the current user's home directory.
ROOT_PATH = os.path.expanduser(_root_path_env)
ROOT_PATH

'/home/yuva/dev/LLM_Examples'

# Build HF embedding

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Candidate embedding checkpoints; the rank notes are the author's, taken from
# the MTEB leaderboard: https://huggingface.co/spaces/mteb/leaderboard
sentence_transformer_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "lier007/xiaobu-embedding-v2",  # rank 1 in chinese
    "Alibaba-NLP/gte-large-en-v1.5",  # rank 21 in english
    # Previously considered, currently disabled:
    #   "iampanda/zpoint_large_embedding_zh"  (rank 4 in chinese)
    #   "dunzhang/stella_en_400M_v5"          (rank 6 in english, deprecated)
]

# Index into the list above; later cells derive collection names from this value.
sentence_transformer_model = sentence_transformer_models[1]

# Downloaded model files are cached under ROOT_PATH so they are reused across runs.
_model_cache_dir = os.path.join(ROOT_PATH, "sentence_transformer_model")

# NOTE(review): trust_remote_code=True allows the model repository to execute
# its own Python code while loading -- only use repositories you trust.
_model_kwargs = {"trust_remote_code": True}

hf_embeddings_model = HuggingFaceEmbeddings(
    model_name=sentence_transformer_model,
    cache_folder=_model_cache_dir,
    model_kwargs=_model_kwargs,
)


  from tqdm.autonotebook import tqdm, trange
  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


# Build HF vector database

In [3]:
import re

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

# --- Configuration -----------------------------------------------------------
document_root_path = os.path.join(ROOT_PATH, "docs")
documents = [
    "CNS16190-zh_TW.md",  # 0
    "CNS16190-zh_TW_only_provision.md",  # 1
    "CNS16190-zh_TW_only_provision.pdf",  # 2
    "ts_103701_only_test_scenario.pdf",  # 3
    "ts_103701_only_test_scenario.md",  # 4
    "en_303645_only_provision.pdf",  # 5
]
document_idx = 4  # index into `documents`; change it to index another file

# Only used when splitting PDFs with RecursiveCharacterTextSplitter.
chunk_size = 1000
chunk_overlap = 200

# Short model tag used in collection names: the part of the repository name
# before the first "-" or "_", e.g. "lier007/xiaobu-embedding-v2" -> "xiaobu".
model_alias = re.split("[-_]", sentence_transformer_model.split("/")[-1])[0]
embedding_cache_path = os.path.join(ROOT_PATH, "embedding_cache")

document_name = documents[document_idx]
document_path = os.path.join(document_root_path, document_name)

# The file extension selects the loader/splitter and is part of the collection name.
mode = document_name.split(".")[-1]
if mode not in ("md", "pdf"):
    raise ValueError(
        f"Unsupported document type {mode!r} for {document_name!r}; expected 'md' or 'pdf'."
    )

db_collection_names = [
    f"CNS16190_{mode}_hf_{model_alias}_emb",
    f"TS103701_{mode}_hf_{model_alias}_emb",
    f"EN303645_{mode}_hf_{model_alias}_emb",
]

# A collection belongs to a document when their first two characters match
# case-insensitively ("ts_..." -> "TS103701_..."). No match is an error: a
# fallback index of -1 would silently select the last collection instead.
db_collection_idx = next(
    (
        idx
        for idx, item in enumerate(db_collection_names)
        if item[:2].casefold() == document_name[:2].casefold()
    ),
    None,
)
if db_collection_idx is None:
    raise ValueError(f"No collection name matches document {document_name!r}.")
db_collection_name = db_collection_names[db_collection_idx]


def _split_markdown(path):
    """Read the Markdown file at `path` and split it on its "#".."####" headers.

    Returns whatever MarkdownHeaderTextSplitter.split_text returns for the
    file's text.
    """
    # Explicit encoding: the document list includes zh_TW files, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        markdown_document = f.read()

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
    return markdown_splitter.split_text(markdown_document)


def _split_pdf(path):
    """Load the PDF at `path` and split it using `chunk_size` / `chunk_overlap`."""
    pdf_doc = PyPDFLoader(path).load()
    pdf_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return pdf_text_splitter.split_documents(documents=pdf_doc)


# --- Load the cached collection, or build it ---------------------------------
# The presence of <embedding_cache_path>/collection/<name> is taken as the
# signal that this collection was already embedded and persisted.
if os.path.isdir(
    os.path.join(embedding_cache_path, "collection", db_collection_name)
):
    # database already exists, load it
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=db_collection_name,
    )
else:
    # database does not exist yet: split the document and embed the chunks
    splits = (
        _split_markdown(document_path) if mode == "md" else _split_pdf(document_path)
    )
    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=db_collection_name,
    )

### MarkdownHeaderTextSplitter (for testing)

In [1]:
import os

import dotenv
from langchain_text_splitters import MarkdownHeaderTextSplitter

dotenv.load_dotenv()
ROOT_PATH = os.path.expanduser(os.getenv("ROOT_PATH"))
document_root_path = os.path.join(ROOT_PATH, "docs")

# Each tuple is (Markdown header marker, metadata key attached to the split).
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

md_header_splits = None
markdown_path = os.path.join(document_root_path, "ts_103701_only_test_scenario.md")
with open(markdown_path, "r") as f:
    markdown_document = f.read()

# Split the whole file by header so each Document carries its header trail.
md_header_splits = markdown_splitter.split_text(markdown_document)

md_header_splits

[Document(metadata={'Header 1': '5.0 TSO 4: Reporting implementation', 'Header 2': '5.0.1 Test group 4-1', 'Header 3': '5.0.1.0 Test group objective'}, page_content='The test group addresses the provision 4-1.'),
 Document(metadata={'Header 1': '5.0 TSO 4: Reporting implementation', 'Header 2': '5.0.1 Test group 4-1', 'Header 3': '5.0.1.1 Test Case 4-1-1 (Conceptual)', 'Header 4': 'Test purpose'}, page_content='The purpose of this test case is the conceptual assessment of the justifications for recommendations that are considered to be not applicable for or not fulfilled by the DUT.'),
 Document(metadata={'Header 1': '5.0 TSO 4: Reporting implementation', 'Header 2': '5.0.1 Test group 4-1', 'Header 3': '5.0.1.1 Test Case 4-1-1 (Conceptual)', 'Header 4': 'Test units'}, page_content='a) The TL **shall** check whether a justification is given in the ICS for each recommendation that is considered to be not applicable for or not fulfilled by the DUT.'),
 Document(metadata={'Header 1': '5.0 

### UnstructuredMarkdownLoader in `elements` mode (for testing)

In [2]:
import os

import dotenv
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

dotenv.load_dotenv()
ROOT_PATH = os.path.expanduser(os.getenv("ROOT_PATH"))
document_root_path = os.path.join(ROOT_PATH, "docs")

# Load the Markdown file with mode="elements" (the displayed result shows
# headings coming back as separate Documents with category/parent metadata).
markdown_path = os.path.join(document_root_path, "ts_103701_only_test_scenario.md")
loader = UnstructuredMarkdownLoader(markdown_path, mode="elements")
doc = loader.load()

# Re-split the loaded Documents into chunks of at most 1000 characters with a
# 200-character overlap, then display them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(doc)
splits

[Document(metadata={'source': '/home/yuva/dev/LLM_Examples/docs/ts_103701_only_test_scenario.md', 'category_depth': 0, 'last_modified': '2024-09-20T15:38:33', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': '/home/yuva/dev/LLM_Examples/docs', 'filename': 'ts_103701_only_test_scenario.md', 'category': 'Title', 'element_id': '3eb24cc2acf37c1b0a9f4fbda123aca1'}, page_content='5.0 TSO 4: Reporting implementation'),
 Document(metadata={'source': '/home/yuva/dev/LLM_Examples/docs/ts_103701_only_test_scenario.md', 'category_depth': 1, 'last_modified': '2024-09-20T15:38:33', 'languages': ['eng'], 'parent_id': '3eb24cc2acf37c1b0a9f4fbda123aca1', 'filetype': 'text/markdown', 'file_directory': '/home/yuva/dev/LLM_Examples/docs', 'filename': 'ts_103701_only_test_scenario.md', 'category': 'Title', 'element_id': 'c36d324d896d5ddb4a4c1b32429f4450'}, page_content='5.0.1 Test group 4-1'),
 Document(metadata={'source': '/home/yuva/dev/LLM_Examples/docs/ts_103701_only_test_scenario.m

In [9]:
from langsmith import traceable
import pandas as pd

# Load the test cases from a CSV file.
# NOTE(review): this path is relative to the notebook's working directory,
# unlike the ROOT_PATH-based paths used elsewhere -- confirm this is intended.
df = pd.read_csv("../docs/test_data_from_GoogleSheet.csv")

# Select one test case by position; its 'provision' and 'detail' columns are
# used to build the prompt below.
row = df.iloc[6] # <--- change this index to select a different row
# "控制措施" presumably means "control measure"; the literal is sent as-is.
query = f"控制措施{row['provision']}，detail:{row['detail']}"
# The trailing backslashes join the lines into a single string, so the leading
# indentation of each continued line becomes part of the question text.
question = f"{query}\
    How to determine PASS or FAIL from the Assignment of verdict of Test group {row['provision']}.\
    Output example: The verdict PASS is assigned if:\
    • the publication of software update support period is understandable and comprehensible for a user with limited technical knowledge. \
    The verdict FAIL is assigned otherwise. "

# Show the exact text that will be sent to the retriever.
print(f"question:\n{question}")


def _convert_docs(results):
    return [
        {
            "page_content": r.page_content,
            "type": "Document",
            "metadata": r.metadata,
        }
        for r in results
    ]


@traceable(run_type="retriever")
def retrieve_docs(query):
    """Return the 10 chunks most similar to `query` from `hf_vectorstore`.

    Args:
        query: text to search the vector store with.

    Returns:
        The matches as a list of dicts, in the shape produced by
        `_convert_docs`.

    Relies on the notebook-global `hf_vectorstore` built in an earlier cell.
    """
    # Search with the argument rather than the notebook-global `question`, so
    # the input recorded by the tracer is the text that was actually searched.
    relevant_docs = hf_vectorstore.search(query, search_type="similarity", k=10)
    return _convert_docs(relevant_docs)


# Run the retriever on the question built above; as the last expression of the
# cell, the returned list of dicts is displayed as the cell output.
retrieve_docs(question)

question:
控制措施5.4-3，detail:關鍵安全參數不得硬編碼於使用設備軟體原始碼中。    How to determine PASS or FAIL from the Assignment of verdict of Test group 5.4-3.        Output example: The verdict PASS is assigned if:        • the publication of software update support period is understandable and comprehensible for a user with limited technical knowledge.         The verdict FAIL is assigned otherwise. 


[{'page_content': 'The verdict PASS is assigned if:  \n- for all critical security parameter hard-coded in device software source code there is no indication that the application of the provisioning mechanism differs from its IXIT documentation.  \nThe verdict FAIL is assigned otherwise.',
  'type': 'Document',
  'metadata': {'Header 1': '5.4 Tso 5.4: Securely Store Sensitive Security Parameters',
   'Header 2': '5.4.3.2 Test Case 5.4-3-2 (Functional)',
   'Header 3': 'Assignment of verdict',
   '_id': '969688dc1cc64f54bde6b3325c729c9c',
   '_collection_name': 'TS103701_md_hf_xiaobu_emb'}},
 {'page_content': 'The verdict PASS is assigned if:  \n- there is no indication that any critical security parameter hard-coded in device software source code is not documented as such; and\n- for all critical security parameter hard-coded in device software source code, the "Provisioning Mechanism" ensures that it is not used during the operation of the DUT.  \nThe verdict FAIL is assigned otherwis