In [18]:
# One-time environment setup: the install commands below are intentionally
# commented out. Uncomment and run them once if the packages are missing
# from the kernel's environment.
# !pip install google-cloud-aiplatform gradio pandas
# !pip install pymupdf pdfplumber python-docx xmltodict lxml beautifulsoup4
# !pip install langchain_google_vertexai langchain


# Parse Documents

In [5]:
import importlib
import batch_parser
importlib.reload(batch_parser)

from batch_parser import BatchParser


# Build a parser rooted at the "data/" folder, then parse every file found
# there (subfolders included). The result is consumed below as a mapping of
# file name -> sequence of text chunks.
parser = BatchParser(data_folder="data")
parsed_results = parser.parse_batch()

# Preview: for each parsed file, print a header followed by its first five
# chunks, numbered from 1.
for fname, chunks in parsed_results.items():
    print(f"\n--- {fname} ---")
    for number, chunk in enumerate(chunks[:5], start=1):
        print(f"{number}. {chunk}")



--- data/EHR-reqs.docx ---
1. Problem-oriented EHR requirements SL-07 version 9
2. IT developers and IT consultants often ask for an exemplary requirements specification as a starting point in their own project. This document is such a specification. It is a template filled out with a complex example: Requirements for an Electronic Health Record system (EHR). Only a few points had to be illustrated with examples from other areas. Large parts of the specification may be reused in other projects. Parts that are too special for reuse are shown in blue.
3. All the requirements are written in tables. Column 1 is the customer's demands. Column 2 is his example solution and later the supplier's proposed solution. There is a contract that matches the requirements. Requirements template, contract, requirements and supplier’s proposal for real projects are available here: http://www.itu.dk/people/slauesen/SorenReqs.html#SL-07
5. Soren Lauesen: Problem-oriented requirements SL-07 – Guide and con

## BigQuery Vector Search 

- Embeds the parsed requirements and stores them in BigQuery
- Runs a semantic query and retrieves relevant chunks

In [9]:

import retriever_bq
importlib.reload(retriever_bq)
from retriever_bq import RequirementRetrieverBQ

# Connect to the BigQuery-backed requirement store (dataset/table names and
# the embedding model are fixed for this demo).
retriever = RequirementRetrieverBQ(
    dataset_id="healthcare_requirements",
    table_id="req_chunks",
    embedding_model="text-embedding-005"
)

# NOTE: ingestion is NOT idempotent. Every execution of add_requirements()
# embeds and appends all chunks again to the existing table, so re-running
# this cell leaves duplicate/stale rows behind that then show up in the
# search results (e.g. far more rows retrieved than were just inserted, or
# chunk ids that no longer exist in the current parse).
# Leave this True for the first run against an empty table; set it to False
# on later runs (or clear the table first) to query without re-inserting.
INGEST_REQUIREMENTS = True

if INGEST_REQUIREMENTS:
    retriever.add_requirements(parsed_results, batch_size=200)


query = "How does the system store patient vitals?"

# use_bq_vectors=False: rows are fetched from BigQuery and the similarity is
# computed on the Python side.
results = retriever.search(query, top_k=3, use_bq_vectors=False)

# Each hit exposes filename, chunk_id, similarity and requirement_text.
for row in results:
    print(f"[{row.filename} | Chunk {row.chunk_id} | Score: {row.similarity:.4f}]")
    print(row.requirement_text)
    print("---")



🚀 Initializing RequirementRetrieverBQ with embedding model: text-embedding-005




✅ Dataset healthcare_requirements already exists.
✅ Table second-sandbox-470608-m2.healthcare_requirements.req_chunks already exists.
📦 Preparing to embed and insert 231 requirement chunks...
📄 Processing file: data/EHR-reqs.docx with 130 chunks
📄 Processing file: data/Common_InsuranceReqs_FINAL.pdf with 101 chunks
📝 Inserting batch of 200 rows into second-sandbox-470608-m2.healthcare_requirements.req_chunks...
✅ Successfully inserted 200 rows.
📝 Inserting batch of 31 rows into second-sandbox-470608-m2.healthcare_requirements.req_chunks...
✅ Successfully inserted 31 rows.
✅ Finished inserting 231 requirement chunks into BigQuery.
🔍 Running semantic search for query: How does the system store patient vitals?
📡 Retrieved 1599 rows from BigQuery, computing similarity in Python...
[data/EHR-reqs.docx | Chunk 25 | Score: 0.6824]
(below called the EHR system)
---
[data/EHR-reqs.docx | Chunk 300 | Score: 0.6635]
Data source:	Some are recorded during a clinical session; others are imported fro

## Requirement Understanding Layer (LangChain Chains)

#### Chained multi-step LLM calls:
1. Classifier → functional / non-functional / regulatory.
2. Compliance Mapper → tags (FDA, IEC, ISO).
3. Risk Analyzer → identifies High/Medium/Low impact.


In [24]:
import requirement_understanding
importlib.reload(requirement_understanding)
from requirement_understanding import RequirementUnderstanding

# Analyzer pinned to an explicit model name. Alternative: construct it as
# RequirementUnderstanding() with no arguments (presumably falls back to the
# class's default model -- confirm in requirement_understanding.py).
understander = RequirementUnderstanding(model="gemini-2.5-pro")

# Sample requirement used to exercise the analysis chain.
req = (
    "The EHR system shall securely store patient vitals for 10 years "
    "to comply with FDA regulations."
)

# Run the analysis and show the returned metadata.
result = understander.analyze(req)
print(result)



{'type': 'Regulatory', 'priority': 'High', 'compliance_tags': ['FDA 21 CFR Part 11', 'HIPAA'], 'traceability_id': 'a1b2c3d4-e5f6-7890-1234-567890abcdef'}


## Test Case Generation Layer (LangChain LLMChain)

What's expected:

✅ Clear description of the test case.

✅ Step-by-step procedural steps.

✅ Expected results spelled out in compliance terms.

✅ Proper traceability link back to the requirement.

In [33]:
import test_case_generator
importlib.reload(test_case_generator)
from test_case_generator import TestCaseGenerator

# Test-case generator pinned to an explicit model name.
generator = TestCaseGenerator(model="gemini-2.5-pro")

# Requirement under test, plus hard-coded analysis metadata for it so this
# cell does not depend on a live analysis call.
req = (
    "The EHR system shall securely store patient vitals for 10 years "
    "to comply with FDA regulations."
)
metadata = {
    "type": "Regulatory",
    "priority": "High",
    "compliance_tags": ["FDA 21 CFR Part 11", "HIPAA"],
    "traceability_id": "a1b2c3d4-e5f6-7890-1234-567890abcdef",
}

test_cases = generator.generate_test_cases(req, metadata)

# Print every generated case: description first, then the labelled fields in
# a fixed order, then a separator line.
labelled_fields = (("Steps:", "steps"), ("Expected:", "expected_result"))
for tc in test_cases:
    print(tc["description"])
    for label, key in labelled_fields:
        print(label, tc[key])
    print("----")



Positive Scenario: Verify that patient vitals are securely stored and retrievable within the 10-year retention period.
Steps: ['Log in as a clinician.', 'Select a test patient record.', 'Enter and save a new set of patient vitals (e.g., Blood Pressure: 120/80, Heart Rate: 70 bpm).', 'Simulate the passage of 5 years using a system time manipulation tool.', 'Log in as an authorized user (e.g., auditor).', "Navigate to the same patient's record and attempt to retrieve the vitals entered 5 years ago."]
Expected: ['The system successfully saves the vitals with an accurate timestamp.', 'The vitals are retrieved successfully and the data matches the originally entered data.', 'The system logs the access event in the audit trail, complying with HIPAA and FDA 21 CFR Part 11.']
----
Negative Scenario: Verify that an authorized user cannot permanently delete patient vitals before the 10-year retention period has passed.
Steps: ['Log in as a system administrator with high-level privileges.', 'Sele