# Build HF embedding

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Candidate embedding models, chosen from the MTEB leaderboard:
# https://huggingface.co/spaces/mteb/leaderboard
# The rank noted next to each model is the author's annotation; exactly one
# assignment should be left uncommented.
# sentence_transformer_model = "lier007/xiaobu-embedding-v2" # rank 1 in chinese
# sentence_transformer_model = "iampanda/zpoint_large_embedding_zh" # rank 4 in chinese
sentence_transformer_model = "dunzhang/stella_en_400M_v5" # rank 6 in english
# sentence_transformer_model = "Alibaba-NLP/gte-large-en-v1.5" # rank 21 in english

# LangChain embedding wrapper around the selected model. The resulting
# `hf_embeddings_model` is what the vector-store cell passes to Qdrant.
# Model files are cached in a folder relative to the notebook's working directory.
# NOTE(review): trust_remote_code=True presumably allows code shipped in the
# model repository to run locally - confirm the selected repository is trusted.
hf_embeddings_model = HuggingFaceEmbeddings(
    model_name=sentence_transformer_model,
    cache_folder="../sentence_transformer_model",
    model_kwargs={"trust_remote_code": True}
)

# Alternative kept for reference: load the same model directly through
# sentence-transformers instead of the LangChain wrapper.
# hf_embeddings_model = SentenceTransformer(
#     sentence_transformer_model,
#     cache_folder="../sentence_transformer_model",
#     trust_remote_code=True
# )


  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are ini

# Build HF vector database

In [2]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

# --- Configuration ---------------------------------------------------------
# `document_idx` picks the source file, `db_collection_idx` picks the Qdrant
# collection, and `mode` picks the loader. The three must be kept consistent
# by hand: `mode` has to match the file type of documents[document_idx], and
# the collection should be one that was built with the HF embedding model.
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md", "ts_103701_split.pdf"]
document_idx = 2
embedding_cache_path = "../embedding_cache/"
db_collection_names = [
    "CNS16190_md_op_emb_1000_200",
    "CNS16190_md_hf_emb_1000_200",
    "TS103701_pdf_op_emb_1000_200",
    "TS103701_pdf_hf_stella_emb_1000_200",
]
db_collection_idx = 3
mode = "pdf"

# Fail loudly on an unsupported mode. Previously an unknown value skipped both
# branches, leaving `hf_vectorstore` undefined (or stale from an earlier run)
# for the cells that follow.
if mode not in ("md", "pdf"):
    raise ValueError(f"Unsupported mode {mode!r}; expected 'md' or 'pdf'.")

collection_name = db_collection_names[db_collection_idx]
# A directory named after the collection under "<cache>/collection" is treated
# as evidence that the collection has already been built.
collection_dir = os.path.join(embedding_cache_path, "collection", collection_name)

if os.path.isdir(collection_dir):
    # Database already exists: load it instead of re-embedding the document.
    hf_vectorstore = Qdrant.from_existing_collection(
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # Database does not exist: load the document, split it, embed and store it.
    document_path = os.path.join(document_root_path, documents[document_idx])
    if mode == "md":
        loader = UnstructuredMarkdownLoader(document_path, mode="elements")
    else:  # mode == "pdf" (validated above)
        loader = PyPDFLoader(document_path)
    doc = loader.load()

    # 1000-character chunks with 200 characters of overlap; the "_1000_200"
    # suffix of the collection names records these settings.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(doc)

    hf_vectorstore = Qdrant.from_documents(
        splits,
        embedding=hf_embeddings_model,
        path=embedding_cache_path,
        collection_name=collection_name,
    )

# Test search results of HF vector database 

In [3]:
# Query used to compare the retrieval strategies against the HF-embedded collection.
question = """「The purpose of this test case is the conceptual assessment of the mechanisms to make brute force attacks via network 
interfaces impracticable.  」符合哪一項 Test group？
告訴我他的 Test group objective、Test units 和 Assignment of verdict """

# One entry per strategy: (header to print, search_type, extra search kwargs).
search_runs = [
    ("similarity\n", "similarity", {}),
    ("\n\nsimilarity_score_threshold\n", "similarity_score_threshold", {"score_threshold": 0.7}),
    ("\n\nmmr\n", "mmr", {}),
]

# Run every strategy with k=5 and print each returned document, one per paragraph.
for header, search_type, extra_kwargs in search_runs:
    print(header)
    relevant_docs = hf_vectorstore.search(
        question, search_type=search_type, k=5, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='Assignment of verdict \nThe verdict PASS is assigned if: \n• the documented mechanisms make brute force attacks via network interfaces impracticable. \nThe verdict FAIL is assigned otherwise. \n5.1.5.2 Test case 5.1-5-2 (functional) \nTest purpose \nThe purpose of this test case is the functional assessment of the mechanisms to make brute force attacks via network \ninterfaces impracticable concerning the completeness of the IXIT documentation a) and the corresponding mechanisms \nb).' metadata={'source': '../docs\\ts_103701_split.pdf', 'page': 6, '_id': '04a89fba0ea44bff8f3377dbbcc4e17c', '_collection_name': 'TS103701_pdf_hf_stella_emb_1000_200'}

page_content='The objective of this test group is to assess, firstly, whether the device functionalities are accessible only after \nauthentication, secondly, whether the authentication method can discriminate between different subjects, thirdly, \nwhether it is effective and resistant to adversaries and, finally, w

# Build OpenAI vector database

In [1]:
import os
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

# --- Configuration ---------------------------------------------------------
# `document_idx` picks the source file, `db_collection_idx` picks the Qdrant
# collection, and `mode` picks the loader. The three must be kept consistent
# by hand: `mode` has to match the file type of documents[document_idx], and
# the collection should be one that was built with OpenAI embeddings.
document_root_path = "../docs"
documents = ["CNS16190-zh_TW.md", "CNS16190-zh_TW_only_provision.md", "ts_103701_split.pdf"]
document_idx = 2
embedding_cache_path = "../embedding_cache/"
db_collection_names = ["CNS16190_md_op_emb_1000_200", "CNS16190_md_hf_emb_1000_200", "TS103701_pdf_op_emb_1000_200"]
db_collection_idx = 2
mode = "pdf"

# Fail loudly on an unsupported mode. Previously an unknown value skipped both
# branches, leaving `openai_vectorstore` undefined (or stale from an earlier
# run) for the cells that follow.
if mode not in ("md", "pdf"):
    raise ValueError(f"Unsupported mode {mode!r}; expected 'md' or 'pdf'.")

collection_name = db_collection_names[db_collection_idx]
# A directory named after the collection under "<cache>/collection" is treated
# as evidence that the collection has already been built.
collection_dir = os.path.join(embedding_cache_path, "collection", collection_name)

if os.path.isdir(collection_dir):
    # Database already exists: load it instead of re-embedding the document.
    openai_vectorstore = Qdrant.from_existing_collection(
        embedding=OpenAIEmbeddings(),
        path=embedding_cache_path,
        collection_name=collection_name,
    )
else:
    # Database does not exist: load the document, split it, embed and store it.
    document_path = os.path.join(document_root_path, documents[document_idx])
    if mode == "md":
        loader = UnstructuredMarkdownLoader(document_path, mode="elements")
    else:  # mode == "pdf" (validated above)
        loader = PyPDFLoader(document_path)
    doc = loader.load()

    # 1000-character chunks with 200 characters of overlap; the "_1000_200"
    # suffix of the collection names records these settings.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(doc)

    openai_vectorstore = Qdrant.from_documents(
        splits,
        embedding=OpenAIEmbeddings(),
        path=embedding_cache_path,
        collection_name=collection_name,
    )

# Test search results of OpenAI vector database 

In [2]:
# Query used to compare the retrieval strategies against the OpenAI-embedded collection.
question = """「The purpose of this test case is the conceptual assessment of the mechanisms to make brute force attacks via network 
interfaces impracticable.  」符合哪一項 Test group？
告訴我他的 Test group objective、Test units 和 Assignment of verdict """

# One entry per strategy: (header to print, search_type, extra search kwargs).
search_runs = [
    ("similarity\n", "similarity", {}),
    ("\n\nsimilarity_score_threshold\n", "similarity_score_threshold", {"score_threshold": 0.7}),
    ("\n\nmmr\n", "mmr", {}),
]

# Run every strategy with k=5 and print each returned document, one per paragraph.
for header, search_type, extra_kwargs in search_runs:
    print(header)
    relevant_docs = openai_vectorstore.search(
        question, search_type=search_type, k=5, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='Assignment of verdict \nThe verdict PASS is assigned if: \n• the documented mechanisms make brute force attacks via network interfaces impracticable. \nThe verdict FAIL is assigned otherwise. \n5.1.5.2 Test case 5.1-5-2 (functional) \nTest purpose \nThe purpose of this test case is the functional assessment of the mechanisms to make brute force attacks via network \ninterfaces impracticable concerning the completeness of the IXIT documentation a) and the corresponding mechanisms \nb).' metadata={'source': '../docs\\ts_103701_split.pdf', 'page': 6, '_id': '67e06c29e8004b5ca4335a61af2c5657', '_collection_name': 'TS103701_pdf_op_emb_1000_200'}

page_content='ETSI ETSI TS 103 701 V1.1.1 (2021 -08) 29  \nEXAMPLE: The old authentication value is no longer valid and the new authentication value is valid after a \nchange. \nAssignment of verdict \nThe verdict PASS is assigned if: \n• all mechanisms for the user to change authentication values for user authentication m