# Build HF embedding

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Candidate models were taken from the MTEB leaderboard:
# https://huggingface.co/spaces/mteb/leaderboard
# Alternatives noted by the author (swap the assignment below to try one):
#   "Alibaba-NLP/gte-Qwen2-7B-instruct"  -> rank 2 in Chinese
#   "dunzhang/stella_en_1.5B_v5"         -> rank 1 in English
sentence_transformer_model = "lier007/xiaobu-embedding-v2"  # rank 1 in Chinese

# Collect the constructor arguments first so the configuration reads as data.
# cache_folder is relative to the notebook's working directory.
embedding_model_options = {
    "model_name": sentence_transformer_model,
    "cache_folder": "../sentence_transformer_model",
}
hf_embeddings_model = HuggingFaceEmbeddings(**embedding_model_options)


  from tqdm.autonotebook import tqdm, trange
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Prepare docs

In [2]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Qdrant
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Markdown source: the zh_TW CNS 16190 document. mode="elements" asks the
# loader for one Document per parsed Markdown element rather than one per file.
md_loader = UnstructuredMarkdownLoader("../docs/CNS16190-zh_TW.md", mode="elements")
md_doc = md_loader.load()
md_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
md_splits = md_text_splitter.split_documents(documents=md_doc)

# PDF source: the TS 103 701 excerpt, chunked with the same size/overlap settings.
pdf_loader = PyPDFLoader("../docs/ts_103701_split.pdf")
pdf_doc = pdf_loader.load()
pdf_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_splits = pdf_text_splitter.split_documents(documents=pdf_doc)

print(f"md_splits: {len(md_splits)}")
print(f"pdf_splits: {len(pdf_splits)}")
# Preview a few chunks only: echoing every chunk buries the counts printed
# above and bloats the saved notebook.
pdf_splits[:3]

md_splits: 581
pdf_splits: 232


[Document(page_content='ETSI ETSI TS 103 701 V1.1.1 (2021 -08) 23  \n \nFigure 5: Role of an assessment scheme \nOn the base of the generic provisions from to ETSI TS 103 645 [1]/ETSI EN 303 645 [2] it is not possible to derive \nspecific criteria for every kind of implementation for each test  case. Therefore the experience of the TL is needed to \nadapt the given criteria in the test cases if necessary. The requirements on the experience and equipment of the TL are \ntypically part of an assessment scheme. \nThe present document contains informative content concerni ng best practice security. In particular cryptographic \nrequirements are typically defined by the assessment scheme  considering the corresponding  information in the present \ndocument and the properties of the technology, risk and usage. This allows comparability of the assessment results \nunder a specific scheme. \nNOTE: In the cases of a certification scheme this type of specification is typically done by the party 

In [3]:
# Merge both corpora into the single list that gets indexed.
chunked_documents = md_splits + pdf_splits
# Report the total and a short preview; echoing every chunk floods the output
# and bloats the saved notebook.
len(chunked_documents), chunked_documents[:3]

(813,
 [Document(page_content='中華民國國家標準 CNS', metadata={'source': '../docs/CNS16190-zh_TW.md', 'category_depth': 0, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title'}),
  Document(page_content='消費者物聯網之網宇安全：基準要求事項 Cybersecurity for consumer internet of things: Baseline requirements', metadata={'source': '../docs/CNS16190-zh_TW.md', 'category_depth': 1, 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'parent_id': '504351d598c6cd9a1c4f66c098f38200', 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'category': 'Title'}),
  Document(page_content='CNS16190:2023', metadata={'source': '../docs/CNS16190-zh_TW.md', 'last_modified': '2024-07-06T20:12:04', 'languages': ['kor'], 'parent_id': '685b4ae38847f492e0d17ef0f2b0e669', 'filetype': 'text/markdown', 'file_directory': '../docs', 'filename': 'CNS16190-zh_TW.md', 'categ

# Build vector database

In [None]:
# Embed every chunk and index it in a Qdrant collection that lives in memory,
# so the index has to be rebuilt whenever the kernel restarts.
vector_db = Qdrant.from_documents(
    documents=chunked_documents,
    embedding=hf_embeddings_model,
    location=":memory:",
)

In [None]:
# Retriever view of the vector store; search_kwargs asks for k=10 results per query.
retriever = vector_db.as_retriever(search_kwargs=dict(k=10))

In [7]:
question = """tell me Test group 5.4-3 in ts_103701"""
k = 10

# Each entry: (heading printed before the hits, search_type, extra search kwargs).
# The same question is run through every strategy so the hits can be compared.
search_runs = [
    ("similarity\n", "similarity", {}),
    (
        "\n\nsimilarity_score_threshold\n",
        "similarity_score_threshold",
        {"score_threshold": 0.7},
    ),
    ("\n\nmmr\n", "mmr", {}),
]

for heading, search_type, extra_kwargs in search_runs:
    print(heading)
    relevant_docs = vector_db.search(
        question, search_type=search_type, k=k, **extra_kwargs
    )
    for item in relevant_docs:
        print(f"{item}\n")


similarity

page_content='5.8.3.0 Test group objective \nThe test group addresses the provision 5.8-3.  \nThis test group aims at revealing any capabilities of a DUT to sense information about its surroundings, such as optic, \nacoustic, biometric or location sensors. It is to be documented in a way that the user is knowledgeable about information that is obtained by the DUT.' metadata={'source': '../docs/ts_103701_split.pdf', 'page': 51, '_id': '87e697ac825444308ed4cd993911a250', '_collection_name': 'd4f087d584744b2aaea87999a9cfd6a0'}

page_content='The verdict FAIL is assigned otherwise. \n5.8 TSO 5.8: Ensure that personal data is secure \n5.8.1 Test group 5.8-1 \n5.8.1.0 Test group objective \nThe test group addresses the provision 5.8-1.  \nThe difference compared to Test group 5.5-1 is, that the use case in the underlying provision is concretised on the communication of personal data, which requires at least confidentiality. \nThe objective of this test group is to assess, firstly

# Build chain

In [20]:
from langchain import hub

# Start from the hub's "rlm/rag-prompt" and overwrite the template of its first
# message with a Traditional Chinese one. The {context} and {question}
# placeholders are kept, so the prompt's input variables stay the same.
prompt = hub.pull("rlm/rag-prompt")

first_message = prompt.messages[0]
first_message.prompt.template = "你是一個資安專家，使用以下檢索到的背景資料回答問題，如果不知道答案就說不知道。\n背景資料：{context} \n問題：{question} \n答案："
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='你是一個資安專家，使用以下檢索到的背景資料回答問題，如果不知道答案就說不知道。\n背景資料：{context} \n問題：{question} \n答案：'))])

In [21]:
import dotenv
from langchain_openai import ChatOpenAI

# Read the local .env file before the chat client is created
# (the author's note on this step: "invoke langsmith").
dotenv.load_dotenv()

chat_model_name = "gpt-4o-mini"
llm = ChatOpenAI(model=chat_model_name)

In [22]:
def format_docs(docs):
    """Concatenate the text of all documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


In [23]:
from langchain_core.runnables import RunnablePassthrough

# Stage 1: the incoming question is sent to the retriever, whose hits are
# flattened to one string by format_docs; the question itself is passed through.
retrieve_context = retriever | format_docs
chain_inputs = {"context": retrieve_context, "question": RunnablePassthrough()}

# Stage 2: fill the prompt and call the model.
# NOTE(review): "cenario" in the run name looks like a typo for "scenario";
# left unchanged because existing traces may be filtered by this exact name.
rag_chain = (chain_inputs | prompt | llm).with_config(
    {"run_name": "test_cenario_generating"}
)

# Invoke LLM

In [24]:
# Ask, in Traditional Chinese, for Test group 5.4-4 of TS 103 701.
# The string has no placeholders, so it is a plain literal rather than an f-string.
rag_chain.invoke("使用繁體中文，給我ts103701的Test group 5.4-4")

AIMessage(content='控制措施 5.4-4 的測試情境可以包括以下幾個方面：\n\n1. **完整性檢查**：測試系統應能夠驗證軟體更新的完整性，確保更新前後的檔案未被篡改。可以透過比較哈希值來進行驗證。\n\n2. **真確性核對**：在每次更新前，系統應能夠檢查更新檔案的來源是否可信，並確保更新內容是正確的。這可以通過數位簽章或其他身份驗證機制來實現。\n\n3. **唯一性確保**：測試每個裝置所使用的安全參數是否是唯一的，並確保這些參數不會被其他裝置重複使用，以降低自動化攻擊的風險。\n\n4. **通訊保護**：測試在裝置軟體中與相關服務之間的通訊是否受到保護，確保關鍵安全參數不會在傳輸過程中被攔截或篡改。\n\n5. **自動化攻擊防護**：測試系統是否具備防範各類別裝置自動化攻擊的機制，如監控異常行為、實施速率控制等措施。\n\n這些測試情境可以幫助確保控制措施 5.4-4 在實際環境中的有效性，從而加強系統的安全性。', response_metadata={'token_usage': {'completion_tokens': 341, 'prompt_tokens': 1050, 'total_tokens': 1391}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_661538dc1f', 'finish_reason': 'stop', 'logprobs': None}, id='run-d762042e-8363-407d-9edb-3b868c87ed3d-0', usage_metadata={'input_tokens': 1050, 'output_tokens': 341, 'total_tokens': 1391})