# Imports and Path to Files

In [None]:
%pip install "unstructured[all-docs]" chromadb pydantic lxml tiktoken langchain langchain-community langchain-openai langchain-chroma

In [16]:
from unstructured.partition.pdf import partition_pdf
import pytesseract
from tqdm import tqdm
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from langchain.prompts import PromptTemplate
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle

import os
load_dotenv()

True

In [18]:
pdf_path = './nRF52840_PS_v1.8.pdf'
uart_pdf_path='./uart.pdf'
image_path = "./images/"
db_path = "./chroma_langchain_db"
pickle_path = "./stored_data.pkl"

# Ingest PDF 

Split the pdf into text chunks, save any table and images as images and store them into a separate directory

In [7]:
def load_chroma_db(local_directory=db_path):
    embeddings = OpenAIEmbeddings()
    return Chroma(persist_directory=local_directory, embedding_function=embeddings)

In [8]:
def categorize_elements(raw_pdf_elements):
    text_elements = []
    table_elements = []
    for element in raw_pdf_elements:
        if 'CompositeElement' in str(type(element)):
            text_elements.append(str(element))
        elif 'Table' in str(type(element)):
            table_elements.append(str(element))
    return text_elements, table_elements

# Generate summaries for the text blocks and table/images. 

These descriptions help to match a query better, so that we don't have to deal with the spaces and formatting from the direct extraction

In [20]:
model = ChatOpenAI(model="gpt-4o", temperature=0, max_tokens=1024)

In [9]:
# Generate summaries of text elements
def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Summarize text elements
    texts: List of str
    tables: List of str
    summarize_texts: Bool to summarize texts
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well-optimized for retrieval. \
    Don't use Markdown, just plain text output. Table \
    or text: {element} """
    prompt = PromptTemplate.from_template(prompt_text)

    # Text summary chain
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Initialize empty summaries
    text_summaries = []
    table_summaries = []

    # Apply to text if texts are provided and summarization is requested
    if texts and summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 1})
    elif texts:
        text_summaries = texts

    # Apply to tables if tables are provided
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 1})

    return text_summaries, table_summaries


In [10]:
import os
import base64
# encode image
def encode_image(image_path):
    """Getting the base64 string"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

In [11]:
def image_summarize(img_base64, prompt):
    """Make image summary"""
    msg = model.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content

In [12]:
def generate_img_summaries(path):
    """
    Generate summaries and base64 encoded strings for images
    path: Path to list of .jpg files extracted by Unstructured
    """
    # Store base64 encoded images
    img_base64_list = []

    # Store image summaries
    image_summaries = []

    # Prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Include all the values in each image, including extracting all the text. \
    Give a concise summary of the image that is well optimized for retrieval."""

    # Apply to images
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))

    return img_base64_list, image_summaries

# Create a vector database to store summaries 

In [13]:
def create_multi_vector_retriever(vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images):
    """
    Create retriever that indexes summaries, but returns raw images or texts
    """
    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )
    
    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images
    # Check that text_summaries is not empty before adding
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    # Check that table_summaries is not empty before adding
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    # Check that image_summaries is not empty before adding
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever



In [22]:
if os.path.exists(db_path) and os.path.exists(pickle_path):
    print("Loading existing Chroma database...")
    vectorstore = load_chroma_db()
    
    with open(pickle_path, 'rb') as f:
        loaded_data = pickle.load(f)

    # Access the variables
    texts = loaded_data['texts']
    tables = loaded_data['tables']
    text_summaries = loaded_data['text_summaries']
    table_summaries = loaded_data['table_summaries']
    img_base64_list = loaded_data['img_base64_list']
    image_summaries = loaded_data['image_summaries']

else:
    print("Creating new Chroma database...")
    # Store embeddings in Chroma
    
    pdf_elements = partition_pdf(
        pdf_path,
        chunking_strategy="by_title",
        extract_images_in_pdf=True,
        infer_table_structure=True,
        extract_image_block_types=['Table', 'Image'],
        extract_image_block_output_dir='./images',
        max_characters=3000,
        new_after_n_chars=2800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=image_path
    )
    
    # extract tables and texts
    texts, tables = categorize_elements(pdf_elements)
    
    # Get text & table summaries
    text_summaries, table_summaries = generate_text_summaries(texts[0:19], tables, summarize_texts=True)
    
    # Image summaries
    img_base64_list, image_summaries = generate_img_summaries("./images")
    
    with open(pickle_path, 'wb') as f:
        pickle.dump({
            'texts': texts,
            'tables': tables,
            'text_summaries': text_summaries,
            'table_summaries': table_summaries,
            'img_base64_list': img_base64_list,
            'image_summaries': image_summaries
        }, f)
    
    vectorstore = Chroma(
        collection_name="mm_rag",
        embedding_function = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY")),
        persist_directory="./chroma_langchain_db"
    )

Creating new Chroma database...


In [23]:
# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    img_base64_list,
)

In [24]:
import io
import re

from IPython.display import HTML, display
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from PIL import Image


def plt_img_base64(img_base64):
    """Disply base64 encoded string as image"""
    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'
    # Display the image by rendering the HTML
    display(HTML(image_html))

def looks_like_base64(sb):
    """Check if the string looks like base64"""
    return re.match("^[A-Za-z0-9+/]+[=]{0,2}$", sb) is not None


def is_image_data(b64data):
    """
    Check if the base64 data is an image by looking at the start of the data
    """
    image_signatures = {
        b"\xFF\xD8\xFF": "jpg",
        b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A": "png",
        b"\x47\x49\x46\x38": "gif",
        b"\x52\x49\x46\x46": "webp",
    }
    try:
        header = base64.b64decode(b64data)[:8]  # Decode and get the first 8 bytes
        for sig, format in image_signatures.items():
            if header.startswith(sig):
                return True
        return False
    except Exception:
        return False

def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a Base64 string
    """
    # Decode the Base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))

    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)

    # Save the resized image to a bytes buffer
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)

    # Encode the resized image to Base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def split_image_text_types(docs):
    """
    Split base64-encoded images and texts
    """
    b64_images = []
    texts = []
    for doc in docs:
        # Check if the document is of type Document and extract page_content if so
        if isinstance(doc, Document):
            doc = doc.page_content
        if looks_like_base64(doc) and is_image_data(doc):
            doc = resize_base64_image(doc, size=(1300, 600))
            b64_images.append(doc)
        else:
            texts.append(doc)
    if len(b64_images) > 0:
        return {"images": b64_images[:1], "texts": []}
    return {"images": b64_images, "texts": texts}
  


# Prepare RAG pipeline  
user can ask question and the query will search for relevant documents

In [25]:
def img_prompt_func(data_dict):
    """
    Join the context into a single string
    """
    messages = []

    # Adding the text for analysis
    text_message = {
        "type": "text",
        "text": (
            "You are an AI scientist tasking with providing factual answers from a datasheet of a System-on-Chip (SoC) \n"
            "Use this information to provide answers related to the user question. \n"
            f"User-provided question: {data_dict['question']}\n\n"
        ),
    }
    messages.append(text_message)
    # Adding image(s) to the messages if present
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)
    return [HumanMessage(content=messages)]

def multi_modal_rag_chain(retriever):
    """
    Multi-modal RAG chain
    """

    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model  # MM_LLM
        | StrOutputParser()
    )

    return chain

In [26]:
chain_multimodal_rag = multi_modal_rag_chain(retriever_multi_vector_img)

In [27]:
query = """What is the starting address of UART?"""
docs = retriever_multi_vector_img.get_relevant_documents(query, limit=1)
split_image_text_types(docs)

  docs = retriever_multi_vector_img.get_relevant_documents(query, limit=1)


{'images': ['/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAJYBRQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDraKKK9o+NfvrT+tAooopitogooopdQt7wUUUUxy3CiiigHqFFFFO4rBRRRUsroFFFFMQUUUUAFHY/SijsfpQxokuP+Ph/r/So6kuP+Ph/r/So6FsUnZS9AooopGaVrhRRRTfQr7QUUUU0JbhRRRTuHVhRRRQ

In [28]:
def ask_bot(query):
    # docs = retriever_multi_vector_img.get_relevant_documents(query, limit=10)
    # print(split_image_text_types(docs))
    return chain_multimodal_rag.invoke(query)

In [29]:
response = ask_bot("What is the base address of UART")
print(response)

The base address of the UART is 0x40002000.


In [65]:
print(ask_bot("How many registers does UART have? List their name, size, and address offset"))

The UART has the following registers:

1. **TASKS_STARTRX**
   - Size: Not specified
   - Address Offset: 0x000

2. **TASKS_STOPRX**
   - Size: Not specified
   - Address Offset: 0x004

3. **TASKS_STARTTX**
   - Size: Not specified
   - Address Offset: 0x008

4. **TASKS_STOPTX**
   - Size: Not specified
   - Address Offset: 0x00C

5. **TASKS_SUSPEND**
   - Size: Not specified
   - Address Offset: 0x01C

6. **EVENTS_CTS**
   - Size: Not specified
   - Address Offset: 0x100

7. **EVENTS_NCTS**
   - Size: Not specified
   - Address Offset: 0x104

8. **EVENTS_RXDRDY**
   - Size: Not specified
   - Address Offset: 0x108

9. **EVENTS_TXDRDY**
   - Size: Not specified
   - Address Offset: 0x11C

10. **EVENTS_ERROR**
    - Size: Not specified
    - Address Offset: 0x124

11. **EVENTS_RXTO**
    - Size: Not specified
    - Address Offset: 0x144

12. **SHORTS**
    - Size: Not specified
    - Address Offset: 0x200

13. **INTENSET**
    - Size: Not specified
    - Address Offset: 0x304

14. **INT

In [96]:
query = "list all peripherals and their base address, and description"
docs = retriever_multi_vector_img.get_relevant_documents(query, limit=10)

In [97]:
print(ask_bot(query))

Here is a list of peripherals, their base addresses, and descriptions:

1. **SWI5**
   - Base Address: 0x40019000
   - Description: Software interrupt 5

2. **TIMER3**
   - Base Address: 0x4001A000
   - Description: Timer 3

3. **TIMER4**
   - Base Address: 0x4001B000
   - Description: Timer 4

4. **PWM0**
   - Base Address: 0x4001C000
   - Description: Pulse width modulation unit 0

5. **PDM**
   - Base Address: 0x4001D000
   - Description: Pulse Density modulation (digital microphone) interface

6. **ACL**
   - Base Address: 0x4001E000
   - Description: Access control lists

7. **NVMC**
   - Base Address: 0x4001E000
   - Description: Non-volatile memory controller

8. **PPI**
   - Base Address: 0x4001F000
   - Description: Programmable peripheral interconnect

9. **MWU**
   - Base Address: 0x40020000
   - Description: Memory watch unit

10. **PWM1**
    - Base Address: 0x40021000
    - Description: Pulse width modulation unit 1

11. **PWM2**
    - Base Address: 0x40022000
    - Descr

In [72]:
print(ask_bot("Explain the memory map, describe the system address map and the address map."))

The memory map of the System-on-Chip (SoC) is organized into distinct regions, each serving a specific purpose. Here's a breakdown of the system address map:

1. **Code (0x00000000 - 0x1FFFFFFF):**
   - This region is designated for executable code. It typically contains the firmware or software that runs on the SoC.

2. **XIP (Execute In Place) (0x20000000 - 0x3FFFFFFF):**
   - This area is used for executing code directly from external flash memory without copying it to RAM. The XIP region in the system address map corresponds to a section in the external flash memory.

3. **SRAM (0x40000000 - 0x5FFFFFFF):**
   - This section is allocated for static RAM, which is used for data storage and manipulation during program execution.

4. **Peripheral (0x40000000 - 0x5FFFFFFF):**
   - This region is used for memory-mapped peripheral devices. It allows the CPU to interact with hardware components like timers, UARTs, and other I/O devices.

5. **RAM (0x60000000 and above):**
   - This area is 

In [98]:
print(ask_bot("List all the peripherals, their base address, instance, and description."))

Here are the peripherals listed in the datasheet:

1. **Peripheral:** PWM
   - **Base Address:** 0x4001C000
   - **Instance:** PWM0
   - **Description:** Pulse width modulation unit 0

2. **Peripheral:** PWM
   - **Base Address:** 0x40021000
   - **Instance:** PWM1
   - **Description:** Pulse width modulation unit 1

3. **Peripheral:** PWM
   - **Base Address:** 0x40022000
   - **Instance:** PWM2
   - **Description:** Pulse width modulation unit 2

4. **Peripheral:** PWM
   - **Base Address:** 0x4002D000
   - **Instance:** PWM3
   - **Description:** Pulse width modulation unit 3
