In [None]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv() 
import os
import fitz  # PyMuPDF
import io
from PIL import Image
import base64
from typing import Dict, List, Tuple

In [None]:
# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def openai(prompt, system_prompt="", model="gpt-4o", temp=0):
    """
    Send one system message and one user message to the chat-completions API.

    Args:
        prompt: Text of the user message.
        system_prompt: Text of the system message (empty by default).
        model: Name of the chat model to call.
        temp: Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temp,
        messages=conversation,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def openai_image(prompt, system_prompt="", model="gpt-4o-mini", temp=0.8, base64="", mime_type="image/jpeg"):
    """
    Send a text prompt together with one inline image to the chat-completions API.

    Args:
        prompt: Text part of the user message.
        system_prompt: Text of the system message (empty by default).
        model: Name of the chat model to call.
        temp: Sampling temperature forwarded to the API.
        base64: Base64-encoded image payload that is embedded in a data URL.
            (Inside this function the name shadows the ``base64`` module.)
        mime_type: MIME type written into the data URL. Defaults to
            "image/jpeg" for backward compatibility; pass e.g. "image/png"
            when the encoded image is not a JPEG.

    Returns:
        The ``content`` of the first choice in the response.
    """
    data_url = f"data:{mime_type};base64,{base64}"
    user_content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=temp,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
    )
    return response.choices[0].message.content

## Replace images in the PDF with text descriptions using an OpenAI vision model

In [None]:
def extract_image_info(pdf_path: str, include_base64: bool = True) -> List[Dict]:
    """
    Extract information about the images embedded in a PDF file.

    Args:
        pdf_path: Path of the PDF to open.
        include_base64: When True, each entry also carries the raw image bytes
            encoded as base64 ('base64_data') and a 'mime_type' built from the
            image extension.

    Returns:
        A list with one dictionary per kept image, with the keys 'id'
        ("p<page>_i<index>"), 'page_number' (1-based), 'image_index',
        'location' (x0/y0/x1/y1), 'width', 'height', 'image_format' and 'xref'.
        Images whose placement rectangle covers the whole page (within one
        unit in both dimensions) are skipped.
    """
    image_list = []
    pdf_document = fitz.open(pdf_path)

    # try/finally so the document is closed even if a page fails to process
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]

            print(f"\nDebug - Processing page {page_num + 1}")

            # Get all images on the page
            images = page.get_images()
            print(f"Debug - Found {len(images)} images on page {page_num + 1}")

            # Placement rectangles for this page, looked up once and keyed by
            # xref so that each image gets its own bbox (the first placement
            # wins when an image is shown more than once).
            bbox_by_xref = {}
            for info in page.get_image_info(xrefs=True):
                if 'bbox' in info:
                    bbox_by_xref.setdefault(info.get('xref'), info['bbox'])

            # Process each image
            for img_index, img in enumerate(images):
                xref = img[0]  # xref is the first element in the image tuple
                print(f"Debug - Processing image {img_index + 1} with xref {xref}")

                # Convert the matching bbox tuple (if any) to a Rect
                bbox = bbox_by_xref.get(xref)
                image_rect = None
                if bbox is not None:
                    image_rect = fitz.Rect(bbox[0], bbox[1], bbox[2], bbox[3])

                base_image = pdf_document.extract_image(xref)
                print(f"Debug - Base image extracted: {bool(base_image)}")
                if not base_image:
                    continue

                if image_rect:
                    # Skip if it's a full-page image
                    if (abs(image_rect.width - page.rect.width) < 1 and
                            abs(image_rect.height - page.rect.height) < 1):
                        print(f"Debug - Skipping full-page image {img_index + 1}")
                        continue

                    location = {
                        'x0': image_rect.x0,
                        'y0': image_rect.y0,
                        'x1': image_rect.x1,
                        'y1': image_rect.y1
                    }
                    width = image_rect.width
                    height = image_rect.height
                else:
                    # No placement found: fall back to the size stored in the
                    # image tuple and anchor the box at the page origin.
                    width = img[2]   # Width is third element
                    height = img[3]  # Height is fourth element
                    location = {'x0': 0, 'y0': 0, 'x1': width, 'y1': height}

                image_data = {
                    'id': f"p{page_num + 1}_i{img_index}",
                    'page_number': page_num + 1,
                    'image_index': img_index,
                    'location': location,
                    'width': width,
                    'height': height,
                    'image_format': base_image['ext'],
                    'xref': xref
                }

                if include_base64:
                    # Convert image data to base64
                    image_bytes = base_image['image']
                    image_data['base64_data'] = base64.b64encode(image_bytes).decode('utf-8')
                    image_data['mime_type'] = f"image/{base_image['ext']}"

                image_list.append(image_data)
                print(f"Debug - Added image {img_index + 1} to list")
                print(f"Debug - Image location: x0={image_data['location']['x0']}, y0={image_data['location']['y0']}, "
                      f"x1={image_data['location']['x1']}, y1={image_data['location']['y1']}")
    finally:
        pdf_document.close()

    print(f"\nDebug - Final image count: {len(image_list)}")
    return image_list

In [None]:
def get_image_text_list(image_list: List[Dict]):
    """
    Ask the vision model for a text description of every extracted image.

    The module-level names ``prompt`` and ``system_prompt`` are passed to
    ``openai_image``; they must be defined before this function is called.

    Args:
        image_list: Image dictionaries carrying at least 'id' and
            'base64_data'.

    Returns:
        A dictionary mapping each image 'id' to the returned text. An image
        whose description could not be produced maps to an empty string;
        the failure is reported on stdout and processing continues.
    """
    replacement_text_dict = {}
    for image_info in image_list:
        text = ""
        try:
            text = openai_image(
                prompt,
                system_prompt=system_prompt,
                model="gpt-4o-mini",
                temp=0,
                base64=image_info['base64_data'],
            )
        except Exception as exc:
            # Best effort: keep going, but say which image failed and why.
            # (A bare `except:` here would also swallow KeyboardInterrupt.)
            print(f"An exception occurred while describing image {image_info.get('id')}: {exc!r}")
        replacement_text_dict[image_info['id']] = text
    return replacement_text_dict

In [None]:
def replace_images_with_text(pdf_path: str, image_info_list: List[Dict], replacement_texts: Dict[str, str], output_path: str):
    """
    Cover each listed image with a white box and write its replacement text on top.

    Args:
        pdf_path: Path of the PDF to read.
        image_info_list: Image dictionaries as produced by extract_image_info;
            'id', 'page_number' (1-based) and 'location' are used here.
        replacement_texts: Mapping from image 'id' to the text to insert.
            Images without an entry are left untouched.
        output_path: Path the modified PDF is saved to.

    The text starts at min(rect height / 1.5, 11) points and is shrunk one
    point at a time while it does not fit and the size is still above 6.
    If it still does not fit, a warning is printed and the box stays empty.
    """
    min_font_size = 6

    # Group images by 0-based page index
    images_by_page = {}
    for img_info in image_info_list:
        images_by_page.setdefault(img_info['page_number'] - 1, []).append(img_info)

    pdf_document = fitz.open(pdf_path)
    # try/finally so the document is closed even when processing or saving fails
    try:
        for page_num in range(len(pdf_document)):
            if page_num not in images_by_page:
                continue

            page = pdf_document[page_num]
            page.clean_contents()
            page.apply_redactions()

            for img_info in images_by_page[page_num]:
                img_id = img_info['id']
                if img_id not in replacement_texts:
                    continue

                # Get the original image location
                location = img_info['location']
                rect = fitz.Rect(
                    location['x0'],
                    location['y0'],
                    location['x1'],
                    location['y1']
                )
                text = replacement_texts[img_id]

                # Calculate font size based on rectangle height
                font_size = min(rect.height / 1.5, 11)
                print(f"Replacing image at: {rect}")
                try:
                    # Create white background just for the image area
                    page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

                    # A negative return value is treated as "text did not
                    # fit", so retry with a smaller font until it fits or the
                    # minimum size is reached.
                    while True:
                        rc = page.insert_textbox(
                            rect,
                            text,
                            fontsize=font_size,
                            fontname='helv',
                            align=1,  # Center alignment
                            color=(0, 0, 0),
                            fill_opacity=1
                        )
                        if rc >= 0 or font_size <= min_font_size:
                            break
                        font_size -= 1

                    if rc < 0:
                        print(f"Warning: text for image {img_id} does not fit in {rect} "
                              f"even at font size {font_size}")
                except Exception as e:
                    print(f"Error inserting text for image {img_id}: {str(e)}")

        # Save the modified PDF
        pdf_document.save(output_path, garbage=4, deflate=True, clean=True)
    finally:
        pdf_document.close()

In [None]:
def pdf_preparation(pdf_path:str, output_path:str):
    """
    Run the image-to-text pipeline on one PDF.

    Extracts the images from ``pdf_path``, obtains a replacement text for
    each of them, and writes a copy of the PDF with those texts placed over
    the images to ``output_path``.
    """
    extracted_images = extract_image_info(pdf_path)
    texts_by_image_id = get_image_text_list(extracted_images)
    replace_images_with_text(
        pdf_path,
        extracted_images,
        texts_by_image_id,
        output_path,
    )
    

## Load the input document into the vector database

In [None]:
import chromadb
import uuid
from langchain.document_loaders.pdf import PyPDFDirectoryLoader, PyMuPDFLoader# Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Impo
from langchain.embeddings import (
    HuggingFaceEmbeddings,
    SentenceTransformerEmbeddings,
    HuggingFaceBgeEmbeddings,
)

In [None]:
# Connection settings are read from the environment (CHROMA_HOST / CHROMA_PORT),
# falling back to localhost:8000. The previous `{host}` / `{port}` values were
# unfilled template placeholders: set literals over undefined names.
chroma_client = chromadb.HttpClient(
    host=os.getenv("CHROMA_HOST", "localhost"),
    port=int(os.getenv("CHROMA_PORT", "8000")),
)

In [None]:

def load_documents(pdf_path):
    """
    Load a single PDF file with PyMuPDFLoader.

    Args:
        pdf_path: Path of the PDF file handed to the loader.

    Returns:
        The result of the loader's ``load()`` call (presumably a list of
        Langchain Document objects; confirm against the loader's docs).
    """
    return PyMuPDFLoader(pdf_path).load()

documents = load_documents("generated_final_test.pdf")
# Inspect only the first document (content and metadata) instead of dumping
# the whole list into the cell output.
print(f"Loaded {len(documents)} document(s)")
if documents:
    print(documents[0])

In [None]:
def split_document(document, chunk_size=300, chunk_overlap=100):
    """
    Split loaded documents into overlapping text chunks.

    Args:
        document: The documents to split; passed as-is to the splitter's
            ``split_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (default 300).
        chunk_overlap: Overlap between consecutive chunks (default 100).

    Returns:
        The chunks returned by the splitter. A short summary, plus a preview
        of the first chunk when there is one, is printed along the way.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(document)
    print(f"Split PDF into {len(chunks)} chunks")

    # Print example of first chunk if available
    if chunks:
        print("\nFirst chunk example:")
        print("Content:", chunks[0].page_content[:150], "...")  # Show first 150 chars
        print("Metadata:", chunks[0].metadata)

    return chunks

In [None]:
# Chroma collection that receives the chunk embeddings
collection_name = "cred_final_test"
collection = chroma_client.get_or_create_collection(name=collection_name)

# Initialize embeddings (fast and lightweight sentence-transformers model)
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Build the chunks here so this cell works on a fresh kernel instead of relying
# on a `chunks` variable left over from an earlier session.
chunks = split_document(documents)

for i, chunk in enumerate(chunks):
    # Generate unique ID for each chunk
    doc_id = str(uuid.uuid4())

    # Get embeddings for the chunk
    embedding = embeddings.embed_query(chunk.page_content)

    # Store in ChromaDB
    collection.add(
        ids=[doc_id],
        embeddings=[embedding],
        documents=[chunk.page_content],
        metadatas=[{
            **chunk.metadata,
            'chunk_id': i,
            # NOTE(review): this fixed name replaces any 'source' already in
            # chunk.metadata, while the file loaded above is
            # "generated_final_test.pdf" - confirm this is intended.
            'source': "slik2.pdf",
            'embedding_model': embeddings.model_name
        }]
    )

    # Print progress every 10 chunks
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1}/{len(chunks)} chunks")

# Summary is printed once, after every chunk has been stored
print(f"\nSuccessfully processed and stored PDF in collection '{collection_name}'")
print(f"Total documents in collection: {collection.count()}")
            