In [4]:
# Optional: uncomment to install the dependencies into the kernel's environment.
# %pip install openai streamlit chromadb pymupdf

In [9]:
import fitz  # PyMuPDF
import chromadb
import base64
import io
import streamlit as st
from openai import OpenAI

In [None]:
# Create the shared OpenAI client. No key is passed here, so the API key has
# to be supplied through the environment before this cell runs
# (the official SDK reads OPENAI_API_KEY — never hardcode the key in the notebook).
OpenAI_client = OpenAI()

In [None]:
# Initialize ChromaDB with a fresh "pdf_content" collection on every run,
# persisted on disk under ./chroma_db.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
try:
    # Drop whatever a previous run left behind so re-running the notebook
    # does not collide with already-stored ids.
    chroma_client.delete_collection(name="pdf_content")
except Exception:
    # Best effort: on a first run there is nothing to delete. The exact
    # exception class depends on the installed Chroma version, so catch
    # Exception broadly — but not KeyboardInterrupt/SystemExit, which a
    # bare `except:` would also swallow.
    pass
collection = chroma_client.create_collection(name="pdf_content")

In [None]:
def encode_image(image):
    """Return the contents of an in-memory image buffer as base64 text.

    ``image`` is a binary buffer exposing ``getvalue()`` (for example an
    ``io.BytesIO``); the returned ``str`` is the form handed to the
    OpenAI Vision API.
    """
    raw_bytes = image.getvalue()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

In [None]:
def _sniff_image_mime(data):
    """Guess the MIME type of raw image bytes from their leading signature."""
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if data.startswith(b"GIF8"):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    # PNG, or anything unrecognised: label it as PNG.
    return "image/png"


def describe_image(image):
    """Send an image to OpenAI GPT-4o-mini and return its description.

    Args:
        image: In-memory binary buffer (e.g. ``io.BytesIO``) holding the
            encoded image file; it must provide ``getvalue()``.

    Returns:
        The text content of the model's first choice, or an empty string
        when that choice carries no text.
    """
    base64_image = encode_image(image)
    mime_type = _sniff_image_mime(image.getvalue())
    response = OpenAI_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Describe the attached image in JSON format."},
            {
                "role": "user",
                "content": [
                    {
                        # Chat Completions takes images as an "image_url"
                        # part; inline bytes are passed as a data URL.
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                    }
                ],
            },
        ],
    )
    # The v1 SDK returns a typed object rather than a dict, so the reply is
    # read through attributes; `content` may be None, hence the fallback.
    return response.choices[0].message.content or ""

In [None]:
def parse_pdf(pdf_path):
    """Extract a PDF's text with image descriptions spliced in by position.

    For every page, text blocks and images are collected together with the
    y-coordinate of their top edge, ordered top to bottom, and appended to
    the result. Each image is replaced by a line of the form
    ``[Page: P, Image I]: <description>`` obtained from ``describe_image``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of strings (text blocks and image descriptions) in page
        order, and top-to-bottom within each page.
    """
    parsed_content = []
    # xref -> description, so an image reused on several pages (a logo, a
    # header) costs a single description request.
    descriptions = {}

    # The context manager closes the document even if a step below fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            content_blocks = []  # (y-position, content) pairs for this page

            # Text blocks: element 1 is the block's top y, element 4 its text.
            for block in page.get_text("blocks"):
                block_text = block[4].strip()
                if block_text:
                    content_blocks.append((block[1], block_text))

            # Images: entries of get_images() carry no coordinates (their
            # second field is the mask xref), so the placement is looked up
            # on the page itself.
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                if xref not in descriptions:
                    image_bytes = doc.extract_image(xref)["image"]
                    descriptions[xref] = describe_image(io.BytesIO(image_bytes))

                rects = page.get_image_rects(xref)
                # Use the topmost placement; an image that is referenced but
                # not placed on the page goes after the page's other content.
                y_position = min(rect.y0 for rect in rects) if rects else float("inf")
                content_blocks.append(
                    (y_position, f"[Page: {page_num+1}, Image {img_index+1}]: {descriptions[xref]}")
                )

            # Stable sort on y only, so equal positions keep insertion order.
            content_blocks.sort(key=lambda item: item[0])
            parsed_content.extend(content for _, content in content_blocks)

    return parsed_content

In [None]:
def store_in_vector_db(doc_id, content):
    """Store parsed content in ChromaDB.

    Each item of ``content`` is added to the module-level ``collection``
    under the id ``"<doc_id>_<index>"``.

    Args:
        doc_id: Prefix used to build the id of every stored item.
        content: Iterable of text chunks to store.

    Returns:
        None. Empty ``content`` is a no-op.
    """
    documents = list(content)
    if not documents:
        # Nothing to index (e.g. a PDF with no extractable content); skip
        # the call rather than hand the collection an empty batch.
        return
    ids = [f"{doc_id}_{i}" for i in range(len(documents))]
    collection.add(documents=documents, ids=ids)

In [None]:
def query_vector_db(query_text, n_results=10):
    """Query ChromaDB for content relevant to ``query_text``.

    Args:
        query_text: The question or search phrase, sent as a single query.
        n_results: Maximum number of matches to request (defaults to 10,
            the value previously hard-coded).

    Returns:
        The ``"documents"`` entry of the query result from the module-level
        ``collection``.
    """
    results = collection.query(query_texts=[query_text], n_results=n_results)
    return results["documents"]

In [None]:
# Example Usage: parse one PDF and index its content.
# NOTE(review): "/content.pdf" is an absolute path at the filesystem root —
# confirm this is the intended location for the environment running the notebook.
pdf_path = "/content.pdf"
parsed_content = parse_pdf(pdf_path)
# The first argument is the id prefix for the stored chunks
# ("content.pdf_0", "content.pdf_1", ...).
store_in_vector_db("content.pdf", parsed_content)

In [None]:
# Query Example: look up the stored content most relevant to a question.
# query_vector_db returns the "documents" entry of the Chroma query result.
query_result = query_vector_db("What is ML?")
print(query_result)