In [50]:
from pdf2image import convert_from_path
from dotenv import load_dotenv
from pathlib import Path
import hashlib, json
from io import BytesIO
import os
import cohere
from openai import OpenAI
import base64
import numpy as np
import tqdm
from cohere import Client  
from PIL import Image


In [37]:
load_dotenv()

True

In [38]:
co = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

In [39]:
co

<cohere.client_v2.ClientV2 at 0x727d8066fdd0>

In [40]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [41]:
client

<openai.OpenAI at 0x727d80674080>

In [42]:
def compute_sha256(data: bytes) -> str:
    h = hashlib.sha256()
    h.update(data)
    return h.hexdigest()

In [43]:
def pdf_to_images_with_hashes(input_pdf: str, images_folder: str = 'images', hashes_folder: str = 'hashes', dpi: int = 300, 
    fmt: str = 'png', hash_filename: str = 'pages_hashes.json'):

    pdf = Path(input_pdf)
    images_dir = Path(images_folder)
    hashes_dir = Path(hashes_folder)
    images_dir.mkdir(parents=True, exist_ok=True)
    hashes_dir.mkdir(parents=True, exist_ok=True)

    hash_path = hashes_dir / hash_filename

    # Load existing manifest if exists
    if hash_path.exists():
        with open(hash_path, 'r') as f:
            hash = json.load(f)
    else:
        hash = {}

    pages = convert_from_path(str(pdf), dpi=dpi)
    updated = False

    for page_num, page in enumerate(pages, start=1):
        key = f"{pdf.stem}_page_{page_num}"
        if key in hash:
            print(f"✔ Skipping page {page_num}, already processed.")
            continue

        # Convert image to bytes for hashing
        buffer = BytesIO()
        page.save(buffer, format=fmt.upper())
        img_bytes = buffer.getvalue()

        # Compute hash
        short_hash = compute_sha256(img_bytes)

        # Save image without hash in name
        filename = f"{pdf.stem}_page_{page_num}.{fmt}"
        image_path = images_dir / filename
        image_path.write_bytes(img_bytes)
        print(f"🖼️ Saved: {filename}")

        # Update manifest
        hash[key] = {"filename": filename, "hash": short_hash}
        updated = True

    # Write updated manifest
    if updated:
        with open(hash_path, 'w') as f:
            json.dump(hash, f, indent=2)
        print(f"📘 Updated hashes at {hash_path}")
    else:
        print("✅ No new pages — hashes unchanged.")


In [44]:
pdf_to_images_with_hashes("source_docs/2024TrustFundAnnualReports.pdf")

✔ Skipping page 1, already processed.
✔ Skipping page 2, already processed.
✔ Skipping page 3, already processed.
✔ Skipping page 4, already processed.
✔ Skipping page 5, already processed.
✔ Skipping page 6, already processed.
✔ Skipping page 7, already processed.
✔ Skipping page 8, already processed.
✔ Skipping page 9, already processed.
✔ Skipping page 10, already processed.
✔ Skipping page 11, already processed.
✔ Skipping page 12, already processed.
✔ Skipping page 13, already processed.
✔ Skipping page 14, already processed.
✔ Skipping page 15, already processed.
✔ Skipping page 16, already processed.
✅ No new pages — hashes unchanged.


In [None]:
# # Some helper functions to resize images and to convert them to base64 format
# max_pixels = 1568*1568  #Max resolution for images

# # Resize too large images
# def resize_image(pil_image):
#     org_width, org_height = pil_image.size

#     # Resize image if too large
#     if org_width * org_height > max_pixels:
#         scale_factor = (max_pixels / (org_width * org_height)) ** 0.5
#         new_width = int(org_width * scale_factor)
#         new_height = int(org_height * scale_factor)
#         pil_image.thumbnail((new_width, new_height))


In [51]:
# Convert images to a base64 string before sending it to the API
def base64_from_image(img_path):
    pil_image = PIL.Image.open(img_path)
    img_format = pil_image.format if pil_image.format else "PNG"

    # resize_image(pil_image)

    with io.BytesIO() as img_buffer:
        pil_image.save(img_buffer, format=img_format)
        img_buffer.seek(0)
        img_data = f"data:image/{img_format.lower()};base64,"+base64.b64encode(img_buffer.read()).decode("utf-8")

    return img_data

In [52]:

# Initialize Cohere
co = Client(api_key=os.getenv("COHERE_API_KEY"))  # Replace with your key

# Local images folder
img_folder = "images"  # or wherever your images are saved
img_paths = sorted([
    os.path.join(img_folder, f)
    for f in os.listdir(img_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
])

doc_embeddings = []

for img_path in tqdm.tqdm(img_paths, desc="Embedding images"):
    api_input_document = {
        "content": [
            {"type": "image", "image": base64_from_image(img_path)},
        ]
    }

    # Call Cohere Embed v4.0
    api_response = co.embed(
        model="embed-v4.0",
        input_type="search_document",
        embedding_types=["float"],
        inputs=[api_input_document],
    )

    # Extract embedding
    emb = np.asarray(api_response.embeddings.float[0])
    doc_embeddings.append(emb)

doc_embeddings = np.vstack(doc_embeddings)
print("\n✅ Embeddings shape:", doc_embeddings.shape)

Embedding images:   0%|          | 0/16 [00:00<?, ?it/s]


NameError: name 'PIL' is not defined