In [5]:
import os

import cohere
import requests
from dotenv import load_dotenv



# Pull credentials from a local .env file into the process environment so
# that no secret is hardcoded in the notebook.
load_dotenv()

ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_APPLICATION_ENDPOINT = os.getenv("ASTRA_DB_APPLICATION_ENDPOINT")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Hand the key to the client explicitly. Without an api_key argument the SDK
# falls back to its own environment lookup (CO_API_KEY), which would ignore
# the COHERE_API_KEY value loaded above. When COHERE_API_KEY is unset this
# passes None, so the SDK's fallback lookup still applies.
co = cohere.ClientV2(api_key=COHERE_API_KEY)



In [8]:
import base64
from io import BytesIO

from pdf2image import convert_from_bytes

# Source document whose pages are embedded as images.
pdf_url = "https://arxiv.org/pdf/2407.01449"

# Bound the wait and fail loudly on HTTP errors; otherwise an error page
# would be passed to the PDF rasterizer and fail with an unrelated message.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
pdf_data = response.content

# Rasterize the PDF bytes into a list of page images.
images = convert_from_bytes(pdf_data)

# Encode each page as a PNG data URI, the form passed to the embed call.
base64_images = []

for img in images:
    # The context manager releases the in-memory buffer after each page.
    with BytesIO() as buffer:
        img.save(buffer, format='PNG')
        img_data = buffer.getvalue()

    base64_str = base64.b64encode(img_data).decode('utf-8')

    image_base64 = f"data:image/png;base64,{base64_str}"
    base64_images.append(image_base64)

print(f"Converted {len(base64_images)} pages to base64")
# Guard the sample print so a PDF that yields no pages does not raise IndexError.
if base64_images:
    print(f"First image base64 length: {len(base64_images[0])}")

Converted 26 pages to base64
First image base64 length: 792382


In [16]:
# Request parameters for embedding all page images in a single call.
image_embed_params = {
    "images": base64_images,
    "model": "embed-v4.0",
    "input_type": "image",
    "embedding_types": ["float"],
}

# NOTE: this rebinds `response`, which earlier held the HTTP download result;
# from here on it is the Cohere embed response.
response = co.embed(**image_embed_params)

In [20]:
from astrapy import DataAPIClient
from astrapy.constants import VectorMetric
from astrapy.info import CollectionDefinition

# Connect to the Astra database identified by the endpoint, authenticating
# with the application token loaded from the environment.
astra_client = DataAPIClient(token=ASTRA_DB_APPLICATION_TOKEN)
db = astra_client.get_database_by_api_endpoint(
    api_endpoint=ASTRA_DB_APPLICATION_ENDPOINT,
)

# Describe a vector-enabled collection.
# NOTE(review): 1536 is presumably the embedding length returned by
# embed-v4.0 — confirm against the vectors actually stored.
# NOTE(review): the documents inserted later carry no "annotations"/"logs"
# fields, so the indexing deny-list may be unnecessary — confirm.
definition_builder = CollectionDefinition.builder()
collection_definition = (
    definition_builder
    .set_vector_dimension(1536)
    .set_vector_metric(VectorMetric.DOT_PRODUCT)
    .set_indexing("deny", ["annotations", "logs"])
    .build()
)

# Create the collection; as the cell's last expression, the returned
# collection object is displayed.
db.create_collection(name="cohere_embed_4", definition=collection_definition)



Collection(name="cohere_embed_4", keyspace="default_keyspace", database.api_endpoint="https://d085f1e6-07c1-4039-9c2d-e9a7902fccc1-us-east-2.apps.astra.datastax.com", api_options=FullAPIOptions(token=StaticTokenProvider(AstraCS:axmt...), ...))

In [22]:
collection = db.get_collection(
    name="cohere_embed_4",
)

# Build one document per embedded page. `page_number` is the zero-based
# position of the page in the embed response, and `$vector` carries the
# page's float embedding.
page_documents = [
    {
        "page_number": i,
        "url": pdf_url,
        "$vector": embedding,
    }
    for i, embedding in enumerate(response.embeddings.float_)
]

# Write all pages in one batched call instead of one network round-trip per
# page. The guard skips the call when there is nothing to insert.
if page_documents:
    collection.insert_many(page_documents)


In [23]:
# Natural-language question to search the stored page embeddings with.
query_text = "What is ColPali?"

# Embed the question with the same model as the pages, using the
# "search_query" input type. Note that `query_vector` holds the full embed
# response; the float vector itself is under `.embeddings.float_[0]`.
query_vector = co.embed(
    model="embed-v4.0",
    input_type="search_query",
    texts=[query_text],
    embedding_types=["float"],
)


[[-0.0001077652,
  0.027832031,
  -0.00045776367,
  0.004547119,
  -0.018554688,
  0.008911133,
  -0.016601562,
  0.021606445,
  0.04272461,
  -0.0065307617,
  -0.022827148,
  -0.023071289,
  -0.029174805,
  0.020019531,
  -0.044677734,
  0.04272461,
  0.04321289,
  0.03491211,
  -0.024536133,
  0.01940918,
  0.059814453,
  0.009155273,
  -0.015380859,
  -0.0066223145,
  0.029052734,
  -0.0134887695,
  -0.0025482178,
  0.025512695,
  0.0087890625,
  0.017944336,
  -0.05859375,
  0.049804688,
  -0.012451172,
  0.024536133,
  -0.041992188,
  -0.029418945,
  -0.06225586,
  -0.019897461,
  -0.018554688,
  0.0030975342,
  0.024169922,
  -0.012573242,
  -0.026245117,
  0.026855469,
  0.013183594,
  0.004272461,
  -0.013061523,
  -0.015991211,
  0.022705078,
  0.0079956055,
  -0.029052734,
  -0.010864258,
  0.0035705566,
  0.022338867,
  -0.029296875,
  0.02734375,
  -0.0005493164,
  -0.0095825195,
  0.00579834,
  0.010192871,
  -0.0047302246,
  0.021118164,
  0.04321289,
  0.03149414,
  0.01

In [25]:
# The embed response holds one vector per input text; take the only one.
query_embedding = query_vector.embeddings.float_[0]

# Vector search over every document (empty filter): sort by the query
# embedding, keep the top 5, and attach a `$similarity` score to each hit.
cursor = collection.find(
    {},
    sort={"$vector": query_embedding},
    include_similarity=True,
    limit=5,
)

# Materialize the cursor so the matches are displayed as the cell output.
list(cursor)

[{'_id': 'd3a63d11-bba5-4e4a-a63d-11bba54e4a00',
  'page_number': 1,
  'url': 'https://arxiv.org/pdf/2407.01449',
  '$similarity': 0.6844393},
 {'_id': '6bf254f8-cbd6-4c44-b254-f8cbd6cc44e2',
  'page_number': 7,
  'url': 'https://arxiv.org/pdf/2407.01449',
  '$similarity': 0.6813785},
 {'_id': '2133898c-c41c-458f-b389-8cc41c758f42',
  'page_number': 17,
  'url': 'https://arxiv.org/pdf/2407.01449',
  '$similarity': 0.6709163},
 {'_id': '7703197e-a79f-4bbe-8319-7ea79f6bbecd',
  'page_number': 22,
  'url': 'https://arxiv.org/pdf/2407.01449',
  '$similarity': 0.6707145},
 {'_id': 'f2cb38dc-9b76-4c17-8b38-dc9b76dc17e4',
  'page_number': 5,
  'url': 'https://arxiv.org/pdf/2407.01449',
  '$similarity': 0.66878486}]