In [None]:
# import stuffs
 
import os
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.embeddings import HuggingFaceEmbeddings
#from langchain_community.llms import Ollama
from langchain_core.runnables import RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from chromadb.config import Settings
import fitz  #PyMuPDF for extracting images
from PIL import Image
import io

In [4]:
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Extract images from a PDF file and save them to the specified folder.

    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Folder to save extracted images.

    Returns:
        list: Paths to the extracted images.
    """
    doc = fitz.open(pdf_path)
    image_paths = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image = Image.open(io.BytesIO(image_bytes))

            # Save image
            image_name = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_page{page_num + 1}_img{img_index + 1}.{image_ext}"
            image_path = os.path.join(output_folder, image_name)
            image.save(image_path)
            image_paths.append(image_path)

    return image_paths

In [5]:
def create_chunks_from_pdf_with_images(docs_folder, image_output_folder):
    """
    Process PDF files to extract text and images, then create chunks.

    Args:
        docs_folder (str): Folder containing PDF files.
        image_output_folder (str): Folder to save extracted images.

    Returns:
        list: Combined text and image chunks.
    """
    if not os.path.exists(image_output_folder):
        os.makedirs(image_output_folder)

    Docs = []

    for filename in os.listdir(docs_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(docs_folder, filename)
            print(f"Processing file: {file_path}")

            # Load the document
            loader = PyPDFLoader(file_path)
            data = loader.load()

            # Split the text into chunks
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            text_chunks = text_splitter.split_documents(data)
            Docs.extend(text_chunks)

            # Extract images and add their paths as chunks
            image_paths = extract_images_from_pdf(file_path, image_output_folder)
            for image_path in image_paths:
                Docs.append({"type": "image", "content": image_path})

    return Docs

In [2]:
def create_chunks_from_pdf(docs_folder):
    Docs = []
    # Iterate through all PDF files in the 'docs' folder
    for filename in os.listdir(docs_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(docs_folder, filename)
            print(f"Processing file: {file_path}")

            # Load the document
            loader = PyPDFLoader(file_path)
            data = loader.load()

            # Split the document into chunks
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            splits = text_splitter.split_documents(data)

            # Add the splits to the main list
            Docs.extend(splits)
    return Docs

In [7]:
def create_codedocs_from_folder(project_folder):
    CodeDocs = []

    # Traverse the folder structure
    for root, dirs, files in os.walk(project_folder):
        for file in files:
            if file.endswith(".h") or file.endswith(".c"):
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")

                # Read the file content
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        file_content = f.read()

                    # Split the content into chunks
                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=0)
                    splits = text_splitter.split_text(file_content)

                    # Add metadata and content to CodeDocs
                    for i, chunk in enumerate(splits):
                        CodeDocs.append(Document(
                            page_content=chunk,
                            metadata={
                                "file_name": file,
                                "file_path": file_path,
                                "chunk_index": i
                            }
                        ))

                except Exception as e:
                    print(f"Error reading file {file_path}: {e}")

    return CodeDocs

In [3]:
# Define the folder containing the documents
Requirement_docs_folder = "ReqDocs/"
 
Reference_docs_folder = "ReferenceDocs/"
 
#Code_folder = "ReferenceCode/"
 
RefUMLDesignFolder = "ReferenceUMLDesignDocs/"
ImageOutputFolder = "ExtractedImages/"
 
reqDocs = create_chunks_from_pdf(Requirement_docs_folder)
refDocs = create_chunks_from_pdf(Reference_docs_folder)
#refCode = create_codedocs_from_folder(Code_folder)
refDesgn = create_chunks_from_pdf(RefUMLDesignFolder)
feature = "thermal monitoring unit"

Processing file: ReqDocs/1340874- CCRack_mpci_Safety_NXP_SwRS-20241115_113100793.pdf
Processing file: ReqDocs/461624- CCRack_mpci_EnvelopeB_SysRS-20241115_114428860.pdf
Processing file: ReqDocs/Component_Specification_Modular_Power_Compute_Interlayer_MPCI_V1.1.pdf
Processing file: ReferenceDocs/S32G_SAF_EMCEM_UM.pdf
Processing file: ReferenceDocs/SAF.pdf
Processing file: ReferenceDocs/S32G_SAF_IA_UM.pdf
Processing file: ReferenceDocs/S32SAFPB.pdf
Processing file: ReferenceDocs/RTD_THERMAL_UM.pdf
Processing file: ReferenceDocs/TP-S32-SAFETY-SOFTWARE-TECHNICAL-OVERVIEW.pdf
Processing file: ReferenceDocs/RTD_THERMAL_IM.pdf
Processing file: ReferenceUMLDesignDocs/PlantUML_Language_Reference_Guide_en.pdf


In [4]:
# Configure Chroma Settings
bge_client_settings_reqs = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_reqs",  # Path for storage; ensure it exists
)

bge_client_settings_refs = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_refs",  # Path for storage; ensure it exists
)

bge_client_settings_UML = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_UML",  # Path for storage; ensure it exists
)

bge_model_local_embeddings = OllamaEmbeddings(model="bge-m3")

bge_model_vectorstore_reqs = Chroma.from_documents(documents=reqDocs, embedding=bge_model_local_embeddings, client_settings=bge_client_settings_reqs)
bge_model_vectorstore_refs = Chroma.from_documents(documents=refDocs, embedding=bge_model_local_embeddings, client_settings=bge_client_settings_refs)
bge_model_vectorstore_UML = Chroma.from_documents(documents=refDesgn, embedding=bge_model_local_embeddings, client_settings=bge_client_settings_UML)

In [5]:
# Configure Chroma Settings
snowflakee_client_settings_reqs = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_reqs",  # Path for storage; ensure it exists
)

snowflakee_client_settings_refs = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_refs",  # Path for storage; ensure it exists
)

snowflakee_client_settings_UML = Settings(
    chroma_api_impl="chromadb.api.segment.SegmentAPI",  # Default API implementation
    is_persistent=True,  # Enable persistence to save embeddings
    persist_directory="./persistent_dir_embeddings_UML",  # Path for storage; ensure it exists
)

snowflakee_model_local_embeddings = OllamaEmbeddings(model="snowflake-arctic-embed2")

snowflakee_model_vectorstore_reqs = Chroma.from_documents(documents=reqDocs, embedding=snowflakee_model_local_embeddings, client_settings=snowflakee_client_settings_reqs)
snowflakee_model_vectorstore_refs = Chroma.from_documents(documents=refDocs, embedding=snowflakee_model_local_embeddings, client_settings=snowflakee_client_settings_refs)
snowflakee_model_vectorstore_UML = Chroma.from_documents(documents=refDesgn, embedding=snowflakee_model_local_embeddings, client_settings=snowflakee_client_settings_UML)

In [6]:
mistralmodel = ChatOllama(
    model="mistral",
    temperature=0.2,
    top_k=20,
    top_p=0.7,
    repeat_last_n=0
)

In [7]:
llama2model = ChatOllama(
    model="llama3.2",
    temperature=0.2,
    top_k=20,
    top_p=0.7,
    repeat_last_n=0
)

In [8]:
DeepseekModel = ChatOllama(
    model="deepseek-coder:6.7b",
    temperature=0.2,
    top_k=20,
    top_p=0.7,
    repeat_last_n=0
)

In [14]:
# Prompt Template
RAG_TEMPLATE = """
You are an expert systems analyst specializing in requirements engineering. Your task is to summarize the requirements from the provided **Requirement_Documents**.

### **Instructions:**
- Identify and categorize all **Functional** and **Non-Functional** requirements.
- Highlight any **constraints, dependencies, or assumptions** that may impact system design.
- Ignore unrelated or ambiguous information.
- Ensure consistency by following the below structured output format:

### **Output Format:**
1. **Functional Requirements**  
   - FR1: [Requirement description]  
   - FR2: [Requirement description]  

2. **Non-Functional Requirements**  
   - NFR1: [Requirement description]  
   - NFR2: [Requirement description]  

3. **Constraints & Dependencies**  
   - C1: [Constraint description]  
   - D1: [Dependency description] 

### **Requirement_Documents:**
<Requirement_Documents>
{Requirement_Documents}
</Requirement_Documents>

"""

# Define the ChatPromptTemplate
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Define RAG pipeline function
def extract_requirements(feature: str, model):
    """
    Processes requirement documents and extracts relevant requirements for the specified feature.
    
    Parameters:
        feature (str): The feature for which requirements need to be extracted.
        model: The LLM model used for processing.

    Returns:
        str: Formatted response with extracted requirements.
    """
    try:
        # Define chain for processing requirement documents
        chain = (
            RunnablePassthrough.assign(
                Requirement_Documents=lambda input: format_docs(input["Requirement_Documents"])
            )
            | rag_prompt
            | model
            | StrOutputParser()
        )

        # Query for extracting requirements
        question = (
            f"Extract and summarize all requirements related to the {feature}. "
            "Ensure all critical details, dependencies, and constraints are captured."
        )

        # Perform similarity search on vector store
        requirement_chunks = bge_model_vectorstore_reqs.similarity_search(question)

        # Run the chain
        response = chain.invoke({
            "Requirement_Documents": requirement_chunks
        })

        return format_response(response)
    
    except Exception as e:
        return f"Error: {str(e)}"

# Function to clean up and format response
def format_response(response: str) -> str:
    """
    Formats the LLM-generated response by ensuring consistent paragraph spacing and structure.

    Parameters:
        response (str): Raw text response from the model.

    Returns:
        str: Cleaned and structured response.
    """
    # Insert line breaks before code blocks and normalize spacing
    formatted_response = re.sub(r"(\n`.*?`)", r"\n\n\1", response)  # Ensure spacing for code blocks
    formatted_response = re.sub(r"(\n\n)", r"\n", formatted_response)  # Normalize paragraph spacing

    return formatted_response

# Run the extraction process
requirements_output = extract_requirements(feature="Thermal Monitoring Unit", model=llama2model)

# Print formatted output
print(requirements_output)

Based on the provided **Requirement_Documents**, I have identified the following requirements:
### **Functional Requirements**
- FR1: The system shall monitor the SOC temperature to ensure it remains within a safe operating range.
- FR2: The system shall trigger an alert when the SOC temperature exceeds the safe operating range.
- FR3: The system shall monitor the SOC temperature continuously and update the temperature reading in real-time.
- FR4: The system shall monitor the SOC temperature using a reliable and accurate sensor.
### **Non-Functional Requirements**
- NFR1: The system shall respond to user input within 2 seconds.
- NFR2: The system shall maintain a minimum uptime of 99.9% without any downtime for maintenance.
- NFR3: The system shall ensure data security and confidentiality by implementing encryption and access controls.
### **Constraints & Dependencies**
- C1: The system shall be designed to operate within a temperature range of -20°C to 80°C.
- D1: The system shall be 

In [15]:
# Updated Prompt Template using Extracted Requirements
RAG_TEMPLATE = """
You are an expert in Software / System Design. You excel in understanding the Reference Documents and the provided Requirements.
Your task is to extract the **API-specific details** from the provided **Reference_Documents**, understand them and map them to the given **Extracted_Requirements**.

### **Instructions:**
- Identify relevant **API functions**, parameters, protocols, and constraints that match the **Extracted_Requirements**.
- Clearly link each API function to the corresponding requirements.
- If no matching API is found for a requirement, state "No direct match found" and suggest alternatives if applicable.
- Use the following structured output format:

### **Output Format:**
1. **Extracted Requirement:** [Requirement Detail]
   - **Matching API Function(s):** [Function Name]
   - **Parameters:** [Parameter List]
   - **Protocol Used:** [Protocol Name (if applicable)]
   - **Constraints/Performance Notes:** [Relevant Details]
   - **Confidence Level:** [High / Medium / Low]
   - **Missing Details:** [Any missing or unclear information]

### **Input Documents:**
- **Extracted Requirements:**  
  <Extracted_Requirements>
  {Extracted_Requirements}
  </Extracted_Requirements>

- **Reference Documents:**  
  <Reference_Documents>
  {Reference_Documents}
  </Reference_Documents>
"""

# Define the ChatPromptTemplate
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Define function to extract API details using extracted requirements
def extract_api_details(extracted_requirements: str, reference_docs_vectorstore, model):
    """
    Extracts API-specific details relevant to given extracted requirements from vendor reference documents.
    
    Parameters:
        extracted_requirements (str): The requirements extracted from the previous step.
        reference_docs_vectorstore: Vector store used for document retrieval.
        model: The LLM model used for processing.

    Returns:
        str: Formatted response containing extracted API details.
    """
    try:
        # Define a dynamic query based on the extracted requirements
        question = (
            f"Extract and summarize all API-related details that map to the following requirements: {extracted_requirements}. "
            "Ensure all relevant API functions, parameters, protocols, and constraints are captured."
        )

        # Retrieve reference document chunks from the vector store
        reference_chunks = reference_docs_vectorstore.similarity_search(question,10)
        if not reference_chunks:
            return f"No relevant reference documents found for the provided requirements."

        # Build the processing chain with dynamic assignment
        chain = (
            RunnablePassthrough.assign(
                Reference_Documents=lambda input: format_docs(input["Reference_Documents"]),
                Extracted_Requirements=lambda input: input["Extracted_Requirements"]
            )
            | rag_prompt
            | model
            | StrOutputParser()
        )

        # Invoke the chain with both contexts
        response = chain.invoke({
            "Reference_Documents": reference_chunks,
            "Extracted_Requirements": extracted_requirements
        })

        return format_response(response)
    
    except Exception as e:
        return f"Error: {str(e)}"

# Function to clean up and format the response
def format_response(response: str) -> str:
    """
    Formats the LLM-generated response by ensuring consistent paragraph spacing and structure.
    """
    formatted_response = re.sub(r"(\n`.*?`)", r"\n\n\1", response)
    formatted_response = re.sub(r"(\n\n)", r"\n", formatted_response)
    return formatted_response

# Example usage
extracted_requirements = requirements_output
reference_output = extract_api_details(
    extracted_requirements=extracted_requirements,
    reference_docs_vectorstore=bge_model_vectorstore_refs,
    model=llama2model
)

# Print formatted output
print(reference_output)


Based on the provided Reference Documents, I have identified the following API-specific details that match the Extracted Requirements:
### **Extracted Requirement:** The system shall monitor the SOC temperature to ensure it remains within a safe operating range.
   - **Matching API Function(s):** `get_temperature` and `set_threshold`
   - **Parameters:** `site_id` (temperature sensor site), `threshold_value` (temperature threshold)
   - **Protocol Used:** None
   - **Constraints/Performance Notes:** The system shall ensure that the temperature reading is accurate and reliable.
   - **Confidence Level:** High
   - **Missing Details:** None
### **Extracted Requirement:** The system shall monitor the SOC temperature to detect any anomalies or changes.
   - **Matching API Function(s):** `get_temperature` and `set_threshold`
   - **Parameters:** `site_id` (temperature sensor site), `threshold_value` (temperature threshold)
   - **Protocol Used:** None
   - **Constraints/Performance Notes:**

In [19]:
# Updated Prompt Template using PlantUML and dynamic feature injection
RAG_TEMPLATE = """
You are a seasoned software architect specializing in UML-based software design. You have been provided with:
1. Chip vendor API details that align with the following **Reference_docs**.
<Reference_docs>
{Reference_docs}
</Reference_docs>
2. Guidelines for generating UML Designs that allign with **Design Guidelines:**  
<Design_Guidelines>
{Design_Guidelines}
</Design_Guidelines>

Bring out your understanding of **Reference_docs** and use the UML Design Guidelines from **Design_Guidelines**, generate a comprehensive UML Diagram that meets all technical constraints.

**Instructions:**
- Identify the main components and interactions based on the provided requirements and design guidelines.
- Output the diagram in PlantUML format.
- Clearly annotate the diagram with inline comments explaining key design decisions.
- Output only the PlantUML code without any extra text.
"""

# Create the ChatPromptTemplate from the above template
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

def generate_plantuml_diagram(
    extracted_api_details: str,
    design_guidelines_vectorstore,
    model, 
    UML_Diagram: str
):
    """
    Generate a UML sequence diagram in PlantUML format based on extracted API details and design guidelines.
    
    Args:
        extracted_api_details (str): Extracted API details (as text) from previous steps.
        design_guidelines_vectorstore: Vector store instance to retrieve design guidelines.
        feature (str): The feature for which the UML diagram is to be generated.
        model: The LLM model used in the pipeline.
    
    Returns:
        str: The formatted PlantUML code for the UML sequence diagram.
    """
    # Query to retrieve design guidelines from the vector store.
    query = f"Extract the guidelines for a {UML_Diagram} that meet technical constraints."
    design_guidelines_docs = design_guidelines_vectorstore.similarity_search(query)
    
    if not design_guidelines_docs:
        return f"No relevant design guidelines found for: {UML_Diagram}"
    
    # Build the processing chain with dynamic assignment.
    chain = (
        RunnablePassthrough.assign(
            Design_Guidelines=lambda input: format_docs(input["Design_Guidelines"]),
            Reference_docs=lambda input: extracted_api_details
        )
        | rag_prompt
        | model
        | StrOutputParser()
    )
    
    # Invoke the chain with the required contexts.
    response = chain.invoke({
        "Design_Guidelines": design_guidelines_docs,
        "Reference_docs": extracted_api_details
    })
    
    # Clean up and format the PlantUML output.
    formatted_response = re.sub(r"(\n`.*?`)", r"\n\n\1", response)
    formatted_response = re.sub(r"(\n\n)", r"\n", formatted_response)
    return formatted_response

# Example usage in a Jupyter Notebook:
# (Ensure that 'extracted_api_details', 'bge_model_vectorstore_UML', and 'mistralmodel' are defined in your notebook.)
#
extracted_api_details = reference_output
diagram_of_interest = "Sequence Diagram"
vector_store = bge_model_vectorstore_UML
#
plantuml_output = generate_plantuml_diagram(
    extracted_api_details,
    vector_store,
    mistralmodel,
    diagram_of_interest
)
print(plantuml_output)


 ```plantuml
@startuml
skinparam lifelineStrategy solid
skinparam style strictuml
actor User
actor Chip
Chip --|> User: Interface
Chip --|> API_Server: Communicate API Requests
API_Server --|> Chip: API Response
API_Server --|> Database: Store/Retrieve Data
Database --|> API_Server: Data
User ->> Chip: Enable Features
User ->> Chip: Disable Features
User ->> Chip: Get Status
@enduml

```
In this diagram:
- `User` represents the end-user interacting with the chip.
- `Chip` represents the hardware device that communicates with the API server.
- `API_Server` represents the server that handles API requests and responses.
- `Database` represents the database where data is stored and retrieved.
The arrows represent the interactions between these components. The `--|>` notation indicates that the component on the left sends a message to the component on the right, while the component on the right does not respond. The `->>` notation indicates that the component on the left sends a message to 