In [37]:
import os

import cohere
import requests
from dotenv import load_dotenv



# Load variables from a local .env file into the process environment so the
# credentials below never have to be hard-coded in the notebook.
load_dotenv()

# Credentials and endpoints; each one is None when its variable is not set.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_APPLICATION_ENDPOINT = os.getenv("ASTRA_DB_APPLICATION_ENDPOINT")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Hand the key to the client explicitly. Without an argument the Cohere SDK
# looks up its own default variable (CO_API_KEY per the SDK docs), which is
# not the COHERE_API_KEY name read above. When COHERE_API_KEY is unset this
# passes None and the SDK's default lookup still applies.
co = cohere.ClientV2(api_key=COHERE_API_KEY)



In [8]:
import base64
from io import BytesIO

from pdf2image import convert_from_bytes

pdf_url = "https://arxiv.org/pdf/2407.01449"

# Bound the request and fail fast on an HTTP error; otherwise an error page
# would be handed to the PDF renderer and fail with a confusing parse error.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
pdf_data = response.content

# Render the PDF bytes into a sequence of images (one per page, judging by
# the "pages" count printed below).
images = convert_from_bytes(pdf_data)

# Encode every rendered page as a PNG data URI.
base64_images = []
for img in images:
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    base64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    base64_images.append(f"data:image/png;base64,{base64_str}")

print(f"Converted {len(base64_images)} pages to base64")
# Guard the sample print so a PDF that yields no pages does not raise IndexError.
if base64_images:
    print(f"First image base64 length: {len(base64_images[0])}")

Converted 26 pages to base64
First image base64 length: 792382


In [16]:
# Embed all page images in one request. Float embeddings are requested
# because later cells read them from `response.embeddings.float_`.
# NOTE: this rebinds `response`, which previously held the HTTP response.
response = co.embed(
    images=base64_images,
    input_type="image",
    embedding_types=["float"],
    model="embed-v4.0",
)

In [20]:
from astrapy import DataAPIClient
from astrapy.constants import VectorMetric
from astrapy.info import CollectionDefinition

# Database handle, resolved from the application token and API endpoint.
db = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN).get_database_by_api_endpoint(
    api_endpoint=ASTRA_DB_APPLICATION_ENDPOINT,
)

# Collection settings: 1536-dimensional vectors compared by dot product,
# with the "annotations" and "logs" fields on the indexing deny list.
# NOTE(review): 1536 presumably matches the embed-v4.0 output size - confirm.
collection_definition = (
    CollectionDefinition.builder()
    .set_vector_metric(VectorMetric.DOT_PRODUCT)
    .set_vector_dimension(1536)
    .set_indexing("deny", ["annotations", "logs"])
    .build()
)

# Kept as the cell's last expression so the resulting Collection is displayed.
db.create_collection(definition=collection_definition, name="cohere_embed_4")



Collection(name="cohere_embed_4", keyspace="default_keyspace", database.api_endpoint="https://d085f1e6-07c1-4039-9c2d-e9a7902fccc1-us-east-2.apps.astra.datastax.com", api_options=FullAPIOptions(token=StaticTokenProvider(AstraCS:axmt...), ...))

In [22]:
collection = db.get_collection(
    name="cohere_embed_4",
)

# One document per page. `page_number` is the 0-based position of the page,
# which is also its index in `base64_images`, so a search hit can be mapped
# back to the page image later.
page_documents = [
    {
        "page_number": page_number,
        "url": pdf_url,
        "$vector": embedding,
    }
    for page_number, embedding in enumerate(response.embeddings.float_)
]

# Insert everything with one bulk call instead of one request per page.
# NOTE: no explicit `_id` is set, so re-running this cell presumably inserts
# the same pages again as new documents - clear the collection first if needed.
if page_documents:
    collection.insert_many(page_documents)


In [45]:
# Embed the question as a search query with the same model used for the pages.
# Despite its name, `query_vector` holds the whole embed response; the vector
# itself is read from `query_vector.embeddings.float_[0]`.
query_vector = co.embed(
    model="embed-v4.0",
    input_type="search_query",
    embedding_types=["float"],
    texts=["What is ColPali?"],
)


In [43]:
# Vector search: no filter, sorted by similarity to the query embedding,
# capped at five documents, with the similarity score included in each hit.
# NOTE(review): the result is a cursor that the next cell iterates; it is
# presumably single-pass, so re-run this cell before re-running that one.
cursor = collection.find(
    {},
    sort={"$vector": query_vector.embeddings.float_[0]},
    include_similarity=True,
    limit=5,
)

In [44]:
from openai import OpenAI


# Map each search hit back to its page image through the stored page_number,
# which indexes into `base64_images`.
pages = [
    base64_images[hit["page_number"]]
    for hit in cursor
]

# OpenAI client used by the answering helper defined below.
client = OpenAI(api_key=OPENAI_API_KEY)



def summarize_pdf_pages(
    base64_images,
    query: str,
    model: str = "gpt-4o",
    max_tokens: int = 4000,
):
    """Answer ``query`` using the given page images as the only context.

    Builds a single chat-completion request with a system instruction and
    one user message holding the query text followed by every image, then
    sends it through the module-level ``client``.

    Args:
        base64_images: Iterable of image URLs, each sent as the ``url`` of an
            ``image_url`` content part (the notebook passes PNG data URIs).
            May be empty, in which case only the query text is sent.
        query: The question to put to the model.
        model: Chat model name; defaults to the previously hard-coded value.
        max_tokens: Upper bound passed as ``max_tokens``; defaults to the
            previously hard-coded value.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # The user message starts with the question, then one part per image,
    # in the order the images were supplied.
    user_content = [{"type": "text", "text": query}]
    user_content.extend(
        {"type": "image_url", "image_url": {"url": img_b64}}
        for img_b64 in base64_images
    )

    messages = [
        {
            "role": "system",
            "content": "Answer the user's question based on only the context provided."
        },
        {
            "role": "user",
            "content": user_content
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens
    )

    return response.choices[0].message.content


# Ask the question against the retrieved page images; as the last expression
# of the cell, the returned answer is displayed as the cell output.
summarize_pdf_pages(pages, "What is ColPali?")

'ColPali is a concept and model architecture based on Vision Language Models (VLMs) designed to simplify document retrieval. It efficiently indexes documents using visual features, aiming to improve query matching with late interaction mechanisms. ColPali offers quick and easy-to-train indexing that outperforms other retrieval systems on benchmarks like ViDoRe. It focuses on combining visual and textual understanding to provide better retrieval performance, fast queries, and high throughput corpus indexation.'