In [None]:
import os

# Set the API key as an environment variable
os.environ['OPENAI_API_KEY'] = "<Your Azure OpenAI API Key>"

print(os.getenv('OPENAI_API_KEY'))

In [2]:
feature_input = "Additional Voltage Monitoring"
UML_Diagram_1 = "Activity Diagram"
UML_Diagram_2 = "Class Diagram"

In [3]:
import requests
import time
import json
import fitz  # PyMuPDF
from chromadb import Client, Embeddings, EmbeddingFunction, PersistentClient
from chromadb.config import Settings
import numpy as np
import hashlib
import os
from PIL import Image
import io
import base64

TEXT_EMBEDDING_MODEL = "text-embedding-3-small"
IMAGE_EMBEDDING_MODEL = "image-embedding-1"
CHAT_MODEL = "gpt-4.1"
API_URL = 'https://openaichatgpt-ms-epb1-xc.openai.azure.com/openai/deployments'
API_KEY = os.getenv('OPENAI_API_KEY')  # Read API key from environment variable
CHAT_API_VERSION = "2025-01-01-preview"
EMBEDDING_API_VERSION = "2024-02-01"

if not API_KEY:
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")

HEADERS = {
    'Authorization': f"Bearer {API_KEY}",
    'Content-Type': 'application/json'
}
REQUIREMENT_METADATA_FILE = "Requirement_pdf_metadata.json"
REFERENCE_DATA_METADATA_FILE = "Reference_pdf_metadata.json"
GUIDELINE_DATA_METADATA_FILE = "Guideline_pdf_metadata.json"
REFERENCE_CODE_METADATA_FILE = "Reference_Code_metadata.json"
CODE_GUIDELINE_METADATA_FILE = "Code_Guideline_pdf_metadata.json"

# Function to calculate MD5 hash of a PDF file
def calculate_pdf_hash(pdf_path):
    """
    Calculate the MD5 hash of a PDF file.

    Args:
        pdf_path (str): The file path to the PDF file.

    Returns:
        str: The MD5 hash of the PDF file as a hexadecimal string.

    Example:
        >>> calculate_pdf_hash('example.pdf')
        'd41d8cd98f00b204e9800998ecf8427e'
    """
    hasher = hashlib.md5()
    with open(pdf_path, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.

    Args:
        pdf_path (str): The file path to the PDF file.

    Returns:
        str: The extracted text from the PDF file.

    Example:
        >>> extract_text_from_pdf('example.pdf')
        'This is the extracted text from the PDF file.'
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def read_source_code(file_path):
    """
    Reads the content of a source code file and returns it as a string.
    Args:
        file_path (str): The path to the file to be read.
    Returns:
        str: The content of the file as a string.
    Raises:
        FileNotFoundError: If the file does not exist.
        IOError: If there is an error reading the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

# Function to extract images from PDF
def extract_images_from_pdf(pdf_path):
    """
    Extract images from a PDF file.

    Args:
        pdf_path (str): The file path to the PDF file.

    Returns:
        list: A list of PIL Image objects extracted from the PDF file.
    """
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        for img in page.get_images(full=True):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            images.append(image)
    doc.close()
    return images

# Split text into chunks
def split_text_into_chunks(text, chunk_size=1000):
    """
    Split text into chunks of a specified size.

    Args:
        text (str): The text to be split into chunks.
        chunk_size (int, optional): The size of each chunk. Default is 1000.

    Returns:
        list: A list of text chunks.

    Example:
        >>> split_text_into_chunks('This is a long text that needs to be split into chunks.', chunk_size=10)
        ['This is a ', 'long text ', 'that needs', ' to be spl', 'it into ch', 'unks.']
    """
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Embedding Function Class with Rate Limiting
class MyEmbeddingFunction(EmbeddingFunction):
    """
    A class to handle embedding functions with rate limiting.

    Attributes:
        batch_size (int): The size of each batch for processing.

    Methods:
        get_embedding(batch_data):
            Sends a batch of document chunk request to the embedding API and returns the embeddings for the chunks.
        
        __call__(chunks):
            Processes chunks in batches to reduce the number of API calls.
        
        __get_user_querry_embedding__(user_querry):
            Retrieves the embedding for a single user query.
    """
    def __init__(self, batch_size=5, max_retries=5, backoff_factor=2):
        """
        Initializes the MyEmbeddingFunction with a specified batch size.

        Args:
            batch_size (int): The size of each batch for processing. Default is 5.
        """
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor

    def get_embedding(self, batch_data):
        """
        Helper function to send a batch request to the embedding API.

        Args:
            batch_data (list): A list of inputs for batch processing.

        Returns:
            list: A list of embeddings if successful, otherwise None.
        """
        data = {
            'input': batch_data,  # Sending a list of inputs for batch processing
            'dimensions': 1024
        }
        
        for attempt in range(self.max_retries):
            try:
                response = requests.post(f'{API_URL}/{TEXT_EMBEDDING_MODEL}/embeddings?api-version={EMBEDDING_API_VERSION}', headers=HEADERS, json=data)
                response.raise_for_status()  # Raises an HTTPError if the response was unsuccessful
                data_ = response.json()
                
                # Debugging: Print the response for troubleshooting
                # print("API Response:", data_)

                # Extract embeddings if present
                if 'data' in data_:
                    # Flatten and validate
                    embeddings = [item.get('embedding') for item in data_['data']]
                    return embeddings
                
                print("Embedding not found in response")
                return None
            except requests.exceptions.RequestException as e:
                if 'response' in locals() and response.status_code == 429:
                    # Handle rate limiting
                    print(f"Rate limit exceeded. Retrying in {self.backoff_factor * (2 ** attempt)} seconds...")
                    time.sleep(self.backoff_factor * (2 ** attempt))
                else:
                    print(f"Request failed: {e}")
                    return None
            except KeyError as e:
                print(f"KeyError in response: {e}")
                return None
        
    def get_image_embedding(self, images):
        """
        Helper function to send a batch of image data to the embedding API.

        Args:
            images (list): A list of PIL Image objects.

        Returns:
            list: A list of embeddings if successful, otherwise None.
        """
        descriptions = []

        for image in images:
            # Convert the image to base64
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

            # Compose the payload
            data = {
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Describe this image in detail. Make sure to include all the important details like text. If the image contains UML diagrams, understand and describe its components."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_base64}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": 500,
                "temperature": 0.5,
                "top_p": 0.9
            }

            try:
                response = requests.post(
                    f'{API_URL}/{CHAT_MODEL}/chat/completions?api-version={CHAT_API_VERSION}',
                    headers=HEADERS,
                    json=data
                )
                response.raise_for_status()
                data_ = response.json()
                if 'choices' in data_ and len(data_['choices']) > 0:
                    descriptions.append(data_['choices'][0]['message']['content'])
                else:
                    descriptions.append(None)
            except requests.exceptions.RequestException as e:
                print(f"Failed to describe image: {e}")
                descriptions.append(None)

        # Generate embeddings for the descriptions
        text_embeddings = []
        for description in descriptions:
            if description:
                text_embedding = self.__get_user_querry_embedding__(description)
                text_embeddings.append(text_embedding)
            else:
                text_embeddings.append(None)

        return text_embeddings

    def __call__(self, chunks):
        """
        Process chunks in batches to reduce the number of API calls.

        Args:
            chunks (list): A list of text chunks to be processed.

        Returns:
            list: A list of embeddings corresponding to the chunks.
        """
        embeddings = []
        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i:i + self.batch_size]
            result = self.get_embedding(batch)
            if result:
                embeddings.extend(result)
            else:
                # If batch processing fails, fill with None to maintain alignment
                embeddings.extend([None] * len(batch))
        return embeddings

    def __get_user_querry_embedding__(self, user_querry: str) -> Embeddings:
        """
        Retrieves the embedding for a single user query.

        Args:
            user_querry (str): The user query string.

        Returns:
            list: The embedding for the user query as a list of floats, or None if unsuccessful.
        """
        data = {
            'input': user_querry,
            'dimensions': 1024
        }
        
        for attempt in range(self.max_retries):
            try:
                response = requests.post(f'{API_URL}/{TEXT_EMBEDDING_MODEL}/embeddings?api-version={EMBEDDING_API_VERSION}', headers=HEADERS, json=data)
                response.raise_for_status()
                data_ = response.json()
                if 'data' in data_ and len(data_['data']) > 0 and 'embedding' in data_['data'][0]:
                    return data_['data'][0]['embedding']
                return None
            except requests.exceptions.RequestException as e:
                if response.status_code == 429:
                    # Handle rate limiting
                    print(f"Rate limit exceeded. Retrying in {self.backoff_factor * (2 ** attempt)} seconds...")
                    time.sleep(self.backoff_factor * (2 ** attempt))
                else:
                    print(f"Request failed: {e}")
                    return None
            except KeyError as e:
                print(f"KeyError in response: {e}")
                return None

# Initialize Chroma Client (new method)
def init_chroma_client(collection_name):
    """
    Initializes the Chroma client with default settings and ensures the collection is created or retrieved.

    Returns:
        client: The initialized Chroma client.
        collection: The Chroma collection for storing embeddings.

    Example:
        client, collection = init_chroma_client()
    """
    # Initialize with the new default settings
    client = PersistentClient(path=f"./persistent_dir_embeddings_{collection_name}")
    
    # Ensure the collection is created or retrieved
    collection = client.get_or_create_collection(name=collection_name, embedding_function=MyEmbeddingFunction())
    
    return client, collection

# Load or initialize metadata
def load_metadata(metadata_file):
    """
    Loads metadata from a predefined file if it exists.

    Returns:
        dict: The loaded metadata as a dictionary. Returns an empty dictionary if the file does not exist.

    Example:
        metadata = load_metadata()
    """
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            return json.load(f)
    return {}

def save_metadata(metadata, metadata_file):
    """
    Saves the provided metadata to a predefined file.

    Parameters:
        metadata (dict): The metadata to be saved.

    Example:
        save_metadata(metadata)
    """
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=4)

# Check if document embeddings already exist in Chroma
def check_document_in_chroma(pdf_hash, collection):
    """
    Checks if document embeddings already exist in Chroma for a given PDF hash.

    Parameters:
        pdf_hash (str): The hash of the PDF document.
        collection: The Chroma collection.

    Returns:
        bool: True if the document embeddings exist, False otherwise.

    Example:
        exists = check_document_in_chroma(pdf_hash, collection)
    """
    results = collection.get(
        where={"doc_hash": pdf_hash}, 
        include=["documents"]
    )
    
    # If documents matching the hash are found, return True
    return len(results['documents']) > 0

# Check if document embeddings already exist in Chroma
def check_document_in_chroma_metadata(pdf_hash, metadata):
    """
    Checks if document embeddings already exist in the metadata for a given PDF hash.

    Parameters:
        pdf_hash (str): The hash of the PDF document.
        metadata (dict): The metadata dictionary.

    Returns:
        bool: True if the document embeddings exist in the metadata, False otherwise.

    Example:
        exists = check_document_in_chroma_metadata(pdf_hash, metadata)
    """
    return pdf_hash in metadata

# Function to create embeddings for PDF chunks and store in Chroma
def create_embeddings_for_pdf(pdf_path, collection, metadata, metadata_file):
    """
    Creates embeddings for PDF chunks and stores them in Chroma.

    Parameters:
        pdf_path (str): The path to the PDF document.
        collection: The Chroma collection.
        metadata (dict): The metadata dictionary.

    Example:
        create_embeddings_for_pdf("example.pdf", collection, metadata)
    """
    pdf_hash = calculate_pdf_hash(pdf_path)
    
    # Check if document embeddings already exist
    # if check_document_in_chroma(pdf_hash, collection):
    #     print(f"Document '{pdf_path}' unchanged. Using existing embeddings.")
    #     return

    # Check if document is unchanged
    if check_document_in_chroma_metadata(pdf_hash, metadata):
        print(f"Document '{pdf_path}' unchanged. Skipping re-embedding.")
        return

    print(f"Processing new or updated document: {pdf_path}")
    
    # Load the PDF document
    # Extract text from the entire PDF
    full_text = extract_text_from_pdf(pdf_path)

    # Split the extracted text into chunks
    chunks = split_text_into_chunks(full_text, chunk_size=1000)  # You can adjust the chunk_size as needed

    #extract images
    images = extract_images_from_pdf(pdf_path)
    
    # Initialize embedding function (assuming you're using Ollama or any other embedding function)
    embF = MyEmbeddingFunction(batch_size=10)

    # Generate embeddings for all chunks in batches
    embeddings = embF.__call__(chunks)
    
    # Add documents to Chroma collection
    for i, embedding in enumerate(embeddings):
        if embedding is not None:
            collection.add(
                documents=[chunks[i]],
                metadatas=[{
                    'chunk_id': i,
                    'doc_hash': pdf_hash,
                    'source': pdf_path
                }],
                embeddings=[embedding],
                ids=[f"{pdf_path}_chunk{i}"]
            )
    
    if images:
        #Generate Embeddings for Images:
        image_embeddings = embF.get_image_embedding(images)
        for i, embedding in enumerate(image_embeddings):
            if embedding is not None:
                collection.add(
                    documents=[f"Image {i} from {pdf_path}"],
                    metadatas=[{
                        'image_id': i,
                        'doc_hash': pdf_hash,
                        'source': pdf_path
                    }],
                    embeddings=[embedding],
                    ids=[f"{pdf_path}_image{i}"]
                )
    
    # Update metadata
    metadata[pdf_hash] = {'path': pdf_path}
    save_metadata(metadata, metadata_file)
    print(f"Embeddings for '{pdf_path}' created successfully.")

# Function to process header and source files
def process_reference_code(directory, collection, metadata_file):
    """
    Processes reference code files in a given directory by generating embeddings for their content
    and storing the embeddings in a specified collection.
    Args:
        directory (str): The root directory containing the 'inc' and 'src' subdirectories 
                         with header (.h) and source (.c) files respectively.
        collection (object): The collection object where embeddings will be stored. 
                             It should support the `add` method for adding documents, metadata, and embeddings.
        metadata_file (str): The path to the metadata file used to track processed files and their hashes.
    Workflow:
        1. Load metadata from the specified metadata file.
        2. Identify all header (.h) and source (.c) files in the 'inc' and 'src' subdirectories.
        3. Calculate a hash for each file to determine if it has been processed before.
        4. Skip processing for files that are unchanged based on their hash.
        5. For each unprocessed file:
            - Read the file content and split it into chunks.
            - Generate embeddings for each chunk using a custom embedding function.
            - Add the embeddings, along with metadata, to the specified collection.
        6. Update the metadata file with the hash and path of the processed file.
    Notes:
        - The function assumes the existence of helper functions such as `load_metadata`, 
          `calculate_pdf_hash`, `check_document_in_chroma_metadata`, `split_text_into_chunks`, 
          `MyEmbeddingFunction`, and `save_metadata`.
        - The `chunk_size` for splitting text and `batch_size` for embedding generation are hardcoded.
    Raises:
        FileNotFoundError: If the specified metadata file or any required subdirectory/file is not found.
        Exception: For any other errors encountered during file processing or embedding generation.
    Example:
        process_reference_code(
            directory="/path/to/codebase",
            collection=my_collection,
            metadata_file="/path/to/metadata.json"
    """
    metadata = load_metadata(metadata_file)
    
    # List all header and source files in the directory
    header_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.h')]
    source_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.c')]
    
    # Combine all files to process
    all_files = header_files + source_files
    
    for file_path in all_files:
        # Calculate hash to check if the file has changed
        file_hash = calculate_pdf_hash(file_path)  # Reuse the hash function for consistency
        
        # Skip if already processed
        if check_document_in_chroma_metadata(file_hash, metadata):
            print(f"File '{file_path}' unchanged. Skipping re-embedding.")
            continue
        
        print(f"Processing file: {file_path}")
        
        # Read and split the file content into chunks
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read()
        chunks = split_text_into_chunks(file_content, chunk_size=1000)
        
        # Generate embeddings for the chunks
        embF = MyEmbeddingFunction(batch_size=10)
        embeddings = embF.__call__(chunks)
        
        # Add embeddings to the collection
        for i, embedding in enumerate(embeddings):
            if embedding is not None:
                collection.add(
                    documents=[chunks[i]],
                    metadatas=[{
                        'chunk_id': i,
                        'doc_hash': file_hash,
                        'source': file_path
                    }],
                    embeddings=[embedding],
                    ids=[f"{file_path}_chunk{i}"]
                )
        
        # Update metadata
        metadata[file_hash] = {'path': file_path}
        save_metadata(metadata, metadata_file)
        print(f"Embeddings for '{file_path}' created successfully.")

# Remove embeddings for deleted PDFs
def remove_deleted_pdfs_from_chroma(directory, collection, metadata, metadata_file):
    """
    Remove embeddings for deleted PDFs from the collection.

    This function performs the following tasks:
    1. Identifies PDF files that have been deleted from the "docs" directory.
    2. Removes the corresponding entries from the collection and metadata.

    Parameters:
    collection (object): The collection object from which the PDF embeddings and metadata will be removed.
    metadata (dict): A dictionary containing metadata about the PDFs, where the key is the PDF hash and the value is a dictionary with PDF information.

    Returns:
    None

    Detailed Steps:
    1. Identify Existing Files: The function creates a set of existing PDF files in the "docs" directory.
    2. Identify Deleted Files: It then identifies which PDFs have been deleted by comparing the existing files with the metadata.
    3. Remove Deleted Files: For each deleted PDF, the function removes the corresponding entry from the collection and metadata, and prints a message indicating the removal.
    4. Save Updated Metadata: Finally, the function saves the updated metadata.

    Notes:
    - Ensure that the `save_metadata` function is defined and imported in the script.
    - The "docs" directory should contain only the PDF files that are currently in use.
    """
    existing_files = {f for f in os.listdir(directory) if f.endswith(".pdf")}
    hashes_to_remove = [
        pdf_hash for pdf_hash, info in metadata.items()
        if os.path.basename(info['path']) not in existing_files
    ]

    for pdf_hash in hashes_to_remove:
        print(f"Removing deleted document with hash: {pdf_hash}")
        collection.delete(where={"doc_hash": pdf_hash})
        del metadata[pdf_hash]
    
    save_metadata(metadata, metadata_file)
    
# Process all PDFs in the docs/ directory
def process_all_pdfs(directory, collection, metadata_file):
    """
    Process all PDFs in the specified directory.

    This function performs the following tasks:
    1. Loads metadata.
    2. Removes entries for deleted PDFs from a collection.
    3. Creates embeddings for each PDF file in the directory and updates the collection with the metadata.

    Parameters:
    directory (str): The path to the directory containing the PDF files to be processed.
    collection (object): The collection object where the PDF embeddings and metadata will be stored.

    Returns:
    None

    Example Usage:
    process_all_pdfs('docs/', my_collection)

    Detailed Steps:
    1. Load Metadata: The function starts by loading metadata using the `load_metadata` function.
    2. Remove Deleted PDFs: It then removes entries for deleted PDFs from the collection using the `remove_deleted_pdfs_from_chroma` function.
    3. Process PDF Files: The function lists all PDF files in the specified directory and iterates over each file. For each PDF file, it creates embeddings using the `create_embeddings_for_pdf` function and updates the collection with the metadata.

    Notes:
    - Ensure that the `load_metadata`, `remove_deleted_pdfs_from_chroma`, and `create_embeddings_for_pdf` functions are defined and imported in the script.
    - The `directory` should contain only PDF files that need to be processed.
    """
    metadata = load_metadata(metadata_file)
    
    # Remove entries for deleted PDFs
    remove_deleted_pdfs_from_chroma(directory, collection, metadata, metadata_file)

    pdf_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdf')]
    for pdf_path in pdf_files:
        create_embeddings_for_pdf(pdf_path, collection, metadata, metadata_file)

# Embedding user query and finding the best matching chunk
def find_relevant_chunk(user_query, collection):
    """
    Embed the user query and find the best matching chunk in the collection.

    This function performs the following tasks:
    1. Generates an embedding for the user query.
    2. Queries the collection to find the best matching chunks based on the query embedding.

    Parameters:
    user_query (str): The user's query string that needs to be embedded and matched.
    collection (object): The collection object where the embeddings and documents are stored.

    Returns:
    list or None: A list of documents that best match the user query, or None if no matches are found or if the embedding generation fails.

    Detailed Steps:
    1. Generate Query Embedding: The function uses `MyEmbeddingFunction` to generate an embedding for the user query.
    2. Check Embedding: It checks if the embedding generation was successful. If not, it prints an error message and returns None.
    3. Query Collection: The function queries the collection with the generated embedding to find the top 100 matching results.
    4. Return Results: If matching documents are found, it returns the list of documents. Otherwise, it returns None.

    Notes:
    - Ensure that the `MyEmbeddingFunction` class and its `__get_user_querry_embedding__` method are defined and imported in the script.
    - The `collection` object should support the `query` method with the specified parameters.
    """
    embF = MyEmbeddingFunction()
    query_embedding = embF.__get_user_querry_embedding__(user_query)
    
    # Check if embedding is None
    if query_embedding is None:
        print("Failed to generate embedding for the query.")
        return None
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=10
    )
    
    if results and 'documents' in results and len(results['documents']) > 0:
        return results['documents']
    return None

# Prompting the model for text generation
def prompt_model(messages, model: str = CHAT_MODEL, max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.7, max_retries: int = 5, backoff_factor: int = 2) -> str:
    """
    Prompt the model for text generation.

    This function sends a prompt to a specified model and returns the generated text.

    Parameters:
    prompt_text (str): The text prompt to send to the model.
    model (str): The model to use for text generation. Default is CHAT_MODEL.
    max_tokens (int): The maximum number of tokens to generate. Default is 500.
    temperature (float): The sampling temperature. Default is 0.2.
    top_k (int): The number of highest probability vocabulary tokens to keep for top-k filtering. Default is 50.
    top_p (float): The cumulative probability for top-p (nucleus) sampling. Default is 0.7.

    Returns:
    str or None: The generated text if the request is successful and the response contains text, otherwise None.

    Detailed Steps:
    1. Prepare Request Data: The function prepares the data dictionary with the model parameters and prompt text.
    2. Send Request: It sends a POST request to the API endpoint for text generation.
    3. Check Response: The function checks if the response status is 200 (OK) and if the response contains generated text.
    4. Return Generated Text: If the response is valid and contains text, it returns the generated text. Otherwise, it returns None.

    Notes:
    - Ensure that the `requests` library is imported and the `API_URL` and `HEADERS` constants are defined in the script.
    - The model parameter should be a valid model identifier recognized by the API.
    """
    data = {
        'messages': messages,
        'max_tokens': max_tokens,
        'temperature': temperature,
        'top_p': top_p,
    }
    for attempt in range(max_retries):
        try:
            response = requests.post(f'{API_URL}/{CHAT_MODEL}/chat/completions?api-version={CHAT_API_VERSION}', headers=HEADERS, json=data)
            response.raise_for_status()  # Raises an HTTPError if the response was unsuccessful
            data_ = response.json()
            
            # Debugging: Print the response for troubleshooting
            print("API Response:", data_)

            if 'choices' in data_ and len(data_['choices']) > 0:
                return data_['choices'][0]['message']['content']
            return None
        except requests.exceptions.RequestException as e:
            if isinstance(e, requests.exceptions.HTTPError) and e.response is not None and e.response.status_code == 429:
                # Handle rate limiting
                print(f"Rate limit exceeded. Retrying in {backoff_factor * (2 ** attempt)} seconds...")
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                print(f"Request failed: {e}")
                return None
        except KeyError as e:
            print(f"KeyError in response: {e}")
            return None
    
def summarize_requirements(feature_query, collection):
    relevant_chunks = find_relevant_chunk(feature_query, collection)
    if not relevant_chunks:
        print("No relevant requirements found.")
        return None
    
    # Define the system and user messages
    messages = [
        {
            "role": "system",
            "content": """
            You are a highly skilled AI assistant specializing in summarizing technical requirements. You are provided with Requirement Chunks delimited by tripple backticks.
            Your task is to:
            1. Identify and categorize all functional and non-functional requirements.
            2. Highlight any constraints, dependencies, or assumptions that may impact system design.
            3. Ignore unrelated or ambiguous information and ensure consistency.
            """
        },
        {
            "role": "user",
            "content": f"""
            Requirements:
            ```{relevant_chunks}```
            """
        }
    ]
    
    summary = prompt_model(messages)
    return summary

def extract_design_information(requirements_summary, collection):
    relevant_chunks = find_relevant_chunk(requirements_summary, collection)
    if not relevant_chunks:
        print("No relevant design information found.")
        return None
    
    # Define the system and user messages
    messages = [
        {
            "role": "system",
            "content": """
            You are a highly skilled AI assistant specializing in understanding design information. You are provided with Requirements and Design Information delimited by tripple backticks.
            Your task is to:
            1. Identify relevant API functions, parameters, protocols, and constraints from the Design Information.
            2. Map the extracted design information with the requirements.
            """
        },
        {
            "role": "user",
            "content": f"""
            Requirements:
            ```{requirements_summary}```
            Design Information:
            ```{relevant_chunks}```
            """
        }
    ]
    
    design_info = prompt_model(messages)
    return design_info

def extract_code_information(requirements_summary, reference_code_collection):
    """
    Extract relevant code information from the reference code collection based on the requirements summary.

    Args:
        requirements_summary (str): The summarized requirements.
        reference_code_collection: The Chroma collection containing reference code embeddings.

    Returns:
        str: Relevant code information as a string, or None if no relevant code is found.
    """
    relevant_chunks = find_relevant_chunk(requirements_summary, reference_code_collection)
    if not relevant_chunks:
        print("No relevant code information found.")
        return None

    # Define the system and user messages
    messages = [
        {
            "role": "system",
            "content": """
            You are a highly skilled AI assistant specializing in understanding Code information. You are provided with Requirements and its associated Code Information delimited by tripple backticks.
            Your task is to:
            1. Identify relevant API functions, parameters, protocols, and constraints from the Code Information.
            2. Bring out an understanding of the Code information and map them with the requirements.
            """
        },
        {
            "role": "user",
            "content": f"""
            Requirements:
            ```{requirements_summary}```
            Code Information:
            ```{relevant_chunks}```
            """
        }
    ]
    
    code_design_info = prompt_model(messages)
    return code_design_info

def create_uml_design(UML_Diagram, design_info, code_design_info, design_querry, uml_guidelines_collection):
    uml_guidelines = find_relevant_chunk(design_querry, uml_guidelines_collection)
    if not uml_guidelines:
        print("No UML design guidelines found.")
        return None
    
    # Define the system and user messages
    messages = [
        {
            "role": "system",
            "content": f"""
            You are a highly skilled AI assistant specializing in creating Software UML designs.
            Your task is to:
            1. Create the requested UML designs ({UML_Diagram}) based on the Design information, Code Information and UML design guidelines delimited by tripple backticks.
            2. Make sure all the identified API Functions, from the Design Information and the Code Information are included in the UML Design.
            3. Provide PlantUML codes for each UML diagram.
            4. Provide a detailed explanation of the requested PlantUML diagrams.
            """
        },
        {
            "role": "user",
            "content": f"""
            Design Information:
            ```{design_info}```
            Code Information:
            ```{code_design_info}```
            UML Design Guidelines:
            ```{uml_guidelines}```
            """
        }
    ]
    
    uml_design = prompt_model(messages)
    return uml_design

def generate_code_from_design(uml_design_1, uml_design_2, user_querry, code_guideline_collection):
    code_guidelines = find_relevant_chunk(user_querry, code_guideline_collection)
    if not code_guidelines:
        print(f"No Code guidelines specific to the user querry: {user_querry} found.")
        return None
    
    # Define the system and user messages
    messages = [
        {
            "role": "system",
            "content": f"""
            You are a highly skilled AI assistant specializing in creating source code by understanding the input UML Designs.
            Your task is to:
            1. Generate the complete source Code (.c and .h files), and also the main file, and complete implementation of any applicable Stub APIs based on your understanding of the Design Information 1 and Design Information 2 delimited by tripple backticks.
            2. Make sure all the identified API Functions, from both the Design Informations are included in your generated Code.
            3. Make sure the generated source code adheres to the Coding Guidelines delimited by tripple backticks.
            """
        },
        {
            "role": "user",
            "content": f"""
            Design Information 1:
            ```{uml_design_1}```
            Design Information 2:
            ```{uml_design_2}```
            Coding Guidelines:
            ```{code_guidelines}```
            """
        }
    ]
    
    source_Code = prompt_model(messages)
    return source_Code

In [4]:
req_directory = "Requirement_Docs"
desgn_directory = "Reference_Docs"
reference_code_directory = "ReferenceCode_Docs"
Design_guideline_directory = "Guideline_Docs"
coding_guideline_directory = "Coding_Guideline_Docs"

# Initialize Chroma client and collection for Requirement Documents
Reqclient, req_collection = init_chroma_client("reqs")

# Initialize Chroma client and collection for Reference Documents
Desgnclient, desgn_collection = init_chroma_client("refs")

# Initialize Chroma client and collection for guideline Documents
guidelineclient, guideline_collection = init_chroma_client("UML")

# Initialize Chroma client and collection for Reference Code
ReferenceCodeClient, reference_code_collection = init_chroma_client("code")

# Initialize Chroma client and collection for Coding Guideline Documents
CodeGuidelineClient, code_guideline_collection = init_chroma_client("codeGuideline")

In [5]:
# Process all Requirement PDFs in the specified directory
process_all_pdfs(req_directory, req_collection, REQUIREMENT_METADATA_FILE)

# Process all Requirement PDFs in the specified directory
process_all_pdfs(desgn_directory, desgn_collection, REFERENCE_DATA_METADATA_FILE)
process_all_pdfs(Design_guideline_directory, guideline_collection, GUIDELINE_DATA_METADATA_FILE)

# Process all files in the ReferenceCode folder
process_reference_code(reference_code_directory, reference_code_collection, REFERENCE_CODE_METADATA_FILE)

process_all_pdfs(coding_guideline_directory, code_guideline_collection, CODE_GUIDELINE_METADATA_FILE)

Document 'Requirement_Docs\1340874- CCRack_mpci_Safety_NXP_SwRS.pdf' unchanged. Skipping re-embedding.
Document 'Requirement_Docs\461624- CCRack_mpci_EnvelopeB_SysRS.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\ADC_com.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\ADC_seq.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_CP_SWS_CANDriver.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_CP_SWS_COM.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_SWS_ADCDriver.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_SWS_BSWModeManager.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_SWS_DiagnosticCommunicationManager.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_SWS_DiagnosticEventManager.pdf' unchanged. Skipping re-embedding.
Document 'Reference_Docs\AUTOSAR_SWS_DiagnosticOverIP.pdf' unchanged. Skipping re-embedding.
Doc

In [6]:
# Step 1: Extract and Summarize Requirements
feature_query = f"Extract all requirements related to {feature_input}."
requirements_summary = summarize_requirements(feature_query, req_collection)
print("Requirements Summary:", requirements_summary)

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': '**Summary of Technical Requirements**\n\n---\n\n### 1. Functional Requirements\n\n**A. Voltage Monitoring**\n- The software shall monitor the following supply voltages periodically every 10ms:\n    - VDD_1V2_ETH\n    - VDD_0V8_ETH\n    - VDD_5V\n    - VDD_3V3_ETH\n    - VDD_1V2_SSD\n    - VDD_0V9_SSD\n    - US33_LDO3_S32G\n- For each voltage, the software shall:\n    - Check if the voltage is within under-voltage and over-voltage thresholds.\n    - If a deviation is detected, raise the BOSCH_SECPOWER

In [7]:
# Step 2: Extract Relevant Design Information
design_info = extract_design_information(requirements_summary, desgn_collection)
print("Design Information:", design_info)

Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk22
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk23
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk24
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk25
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk26
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk27
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk28
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk29
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk30
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk31
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk32
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk33
Add of existing embedding ID: Reference_Docs\S32G_SAF_EMCEM_UM.pdf_chunk34
Add of existing embedding

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': 'Here is a structured analysis based on your requirements and the provided design information.\n\n---\n\n## 1. Extracted Relevant API Functions, Parameters, Protocols, and Constraints\n\n### a. Diagnostic Event Management (DEM) & Error Handling\n\n- **API Functions (from AUTOSAR DEM and DCM):**\n  - `Dem_ReportErrorStatus(EventId, EventStatus)`: Reports DTCs such as BOSCH_SECPOWERSUPPLY_FAILURE.\n  - `Dem_SetEventStatus(EventId, EventStatus)`: Sets the status of a diagnostic event.\n  - `Dem_ClearEven

In [8]:
# Step 3: Extract Relevant Design information from Code.
code_design_info = extract_code_information(requirements_summary, reference_code_collection)
print("Code Design Information: ", code_design_info)

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': "Let's break down the provided Code Information and map it to the summarized Technical Requirements:\n\n---\n\n## 1. Relevant API Functions, Parameters, Protocols, and Constraints\n\n### a) API Functions\n\n- **Voltage Monitoring Functions:**\n  - `void rbsftyBldVolMon(void);`\n    - Main function for blade voltage monitoring.\n  - `uint16 rbsftyBldVoltMon_getBladeVoltage_u16(void);`\n    - Returns the current measured blade voltage.\n  - `boolean rbsftyBldVoltMon_getStatus_b(void);`\n    - Returns th

In [9]:
# Step 4: Create UML Design
design_querry = f"Extract the guidelines related to {UML_Diagram_1}"
uml_design_1 = create_uml_design(UML_Diagram_1, design_info, code_design_info, design_querry, guideline_collection)
print("UML Design:", uml_design_1)

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': 'Below is a comprehensive UML Activity Diagram (in PlantUML code) for the **Voltage Monitoring and Fault Handling** process, as derived from your Design and Code Information. This diagram includes all relevant API functions, the 10ms periodic monitoring, debounce logic, DTC/error reporting, fault injection, and safety monitoring, and follows the provided UML guidelines.\n\n---\n\n## 1. PlantUML Code: Voltage Monitoring & Fault Handling Activity Diagram\n\n```plantuml\n@startuml\n|VoltageMonitoringClas

In [10]:
# Step 5: Create UML Design
design_querry = f"Extract the guidelines related to {UML_Diagram_2}"
uml_design_2 = create_uml_design(UML_Diagram_2, design_info, code_design_info, design_querry, guideline_collection)
print("UML Design:", uml_design_2)

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': 'Below is a comprehensive UML Class Diagram (with PlantUML code and explanation) that models the software architecture for the described voltage monitoring, diagnostic event management, fault injection, and safety monitoring system. The design is based on the provided Design Information, Code Information, and the UML guidelines.\n\n---\n\n## 1. UML Class Diagram (PlantUML)\n\n```plantuml\n@startuml\n\' Package representing the SW Component for Blade Voltage Monitoring\npackage "BladeVoltageMonitoring_

In [11]:
#Step5: Generate Code from the Design
code_guideline_input_querry = "identify all the important guidelines related to C Coding"
generated_code = generate_code_from_design(uml_design_1, uml_design_2, code_guideline_input_querry, code_guideline_collection)
print("Generated Source Code:", generated_code)

API Response: {'choices': [{'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}, 'finish_reason': 'length', 'index': 0, 'logprobs': None, 'message': {'annotations': [], 'content': 'Below is a full AUTOSAR-style C implementation (with .h/.c separation and a main.c) of the described **Blade Voltage Monitoring, Fault Handling, Fault Injection, DEM, Error Handling, and Safety Monitoring** system. All APIs and logic from the UML diagrams are included, and the code is written to comply with the provided coding guidelines (MISRA C, no forbidden headers, types, etc.).\n\n---\n\n# File: rbsftybladevoltmon_types.h\n\n```c\n/**************************************************************