<a href="https://colab.research.google.com/github/ashivashankars/CMPE256_Assignments/blob/main/AI_Image_Search_with_Cohere_Embeddings_embed_v4_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## AI Image Search with Cohere Embeddings - embed-v4.0

Learning objective: Embeddings are a way to represent the meaning of texts, images, or information as a list of numbers. Using a simple comparison function, we can then calculate a similarity score for two embeddings to figure out whether two pieces of information are about similar things. Common use-cases for embeddings include semantic search, clustering, and classification.

In the following code, we use the embed-v4.0 model to generate embeddings for the attached images, embed a natural-language query with the same model, and rank the images against the query using a cosine-similarity function.

In [4]:
!pip install cohere



In [3]:
import base64
import getpass
import os
from io import BytesIO
from typing import List, Tuple

import cohere
import numpy as np
from PIL import Image

In [17]:
# ---------- Setup ----------
# Never store the API key in the notebook: it ends up in version control and in
# every shared copy. Read it from the CO_API_KEY environment variable and fall
# back to an interactive prompt so the secret never appears in a cell.
# Any key that was ever committed to a notebook should be revoked and reissued.
co = cohere.ClientV2(
  api_key=os.environ.get("CO_API_KEY") or getpass.getpass("Cohere API key: ")
)
def image_to_base64_data_url(image_path):
  """Encode an image file as a base64 PNG data URL.

  The image is opened with Pillow, re-encoded as PNG in memory, and wrapped in
  a ``data:image/png;base64,...`` string.

  Args:
    image_path: Anything accepted by ``PIL.Image.open`` (typically a file path).

  Returns:
    The data URL string containing the PNG-encoded image.
  """
  with Image.open(image_path) as img:
    # Pillow's PNG writer rejects these colour modes ("cannot write mode CMYK
    # as PNG"), which show up in print-oriented JPEGs; convert them to RGB so
    # such files can still be encoded.
    png_ready = img.convert("RGB") if img.mode in ("CMYK", "YCbCr") else img
    buffered = BytesIO()
    png_ready.save(buffered, format="PNG")
  img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
  return f"data:image/png;base64,{img_base64}"
def image_to_base64_data_url_old(image_path: str) -> str:
  """Legacy alias of ``image_to_base64_data_url``.

  Kept only so existing callers of the old name keep working; it previously
  carried a duplicate copy of the same encoding logic. New code should call
  ``image_to_base64_data_url`` directly.

  Args:
    image_path: Path of the image file to encode.

  Returns:
    A ``data:image/png;base64,...`` URL for the image.
  """
  return image_to_base64_data_url(image_path)
def l2_normalize(v: np.ndarray) -> np.ndarray:
  """Scale ``v`` to (approximately) unit L2 norm along its last axis.

  Works on a single vector or on a stack of vectors; each slice along the last
  axis is divided by its own norm.

  Args:
    v: Array whose last axis holds the vector components.

  Returns:
    An array of the same shape as ``v`` with each vector rescaled.
  """
  eps = 1e-12  # keeps the division finite when a vector is all zeros
  return v / (np.linalg.norm(v, axis=-1, keepdims=True) + eps)
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
  """Return the dot product of ``a`` and ``b`` as a Python float.

  The result equals the cosine similarity only when both inputs have already
  been L2-normalized; no normalization is performed here.
  """
  similarity = np.dot(a, b)
  return float(similarity)


In [18]:
# ---------- Build image embedding store ----------
def embed_images(image_paths: List[str]) -> List[Tuple[str, np.ndarray]]:
  """Embed images with embed-v4.0 and return an in-memory search index.

  Each image is converted to a base64 data URL, sent to the Cohere embed
  endpoint in a single request, and the returned float embeddings are
  L2-normalized so they can be compared with a plain dot product.

  Args:
    image_paths: Paths of the image files to embed.

  Returns:
    A list of ``(path, vector)`` pairs in the same order as ``image_paths``,
    where ``vector`` is a float32 array. Empty input yields an empty list.

  Raises:
    RuntimeError: If the number of returned embeddings differs from the number
      of images, since pairing them up would silently mislabel vectors.
  """
  image_paths = list(image_paths)
  if not image_paths:
    # Nothing to embed: skip the API call (np.stack would also fail on []).
    return []

  data_urls = [image_to_base64_data_url(p) for p in image_paths]
  res = co.embed(
    images=data_urls,
    model="embed-v4.0",
    embedding_types=["float"],
    input_type="image",
  )

  vecs = np.asarray(res.embeddings.float, dtype=np.float32)
  if len(vecs) != len(image_paths):
    raise RuntimeError(
      f"expected {len(image_paths)} embeddings, got {len(vecs)}"
    )
  vecs = l2_normalize(vecs)
  return list(zip(image_paths, vecs))

In [21]:
# ---------- Natural-language search over images ----------
def search_images(
  query: str,
  image_index: List[Tuple[str, np.ndarray]],
  top_k: int = 5,
  verbose: bool = False,
) -> List[Tuple[str, float]]:
  """Rank indexed images against a natural-language query.

  The query is embedded with embed-v4.0 using ``input_type="search_query"``,
  L2-normalized, and scored against every vector in ``image_index`` with
  ``cosine_sim``.

  Args:
    query: Free-text description of the image to look for.
    image_index: ``(path, vector)`` pairs; the vectors are expected to be
      L2-normalized already, as ``cosine_sim`` only takes a dot product.
    top_k: Maximum number of matches to return.
    verbose: When true, print the length of the query embedding (the
      diagnostic that used to be printed unconditionally).

  Returns:
    Up to ``top_k`` ``(path, score)`` pairs sorted by descending score.

  Raises:
    ValueError: If ``top_k`` is negative. A negative slice bound would
      otherwise silently drop the lowest-ranked results instead of limiting
      the count.
  """
  if top_k < 0:
    raise ValueError(f"top_k must be non-negative, got {top_k}")
  if not image_index or top_k == 0:
    # Nothing can be returned, so avoid a pointless API round-trip.
    return []

  qres = co.embed(
    texts=[query],
    model="embed-v4.0",
    embedding_types=["float"],
    input_type="search_query",  # key for cross-modal retrieval
  )
  qvec = l2_normalize(np.asarray(qres.embeddings.float[0], dtype=np.float32))
  if verbose:
    print("qres Embedding vector length:", len(qvec))

  scored = [(path, cosine_sim(qvec, ivec)) for path, ivec in image_index]
  scored.sort(key=lambda item: item[1], reverse=True)
  return scored[:top_k]

In [29]:
# 1) Build the search index from a small corpus of local image files.
image_paths = ["person_with_cap.jpg", "cart_with_single_tire.jpg"]
image_index = embed_images(image_paths)  # (path, vector) pairs, one per image
print(image_index)

[('person_with_cap.jpg', array([-0.02044545,  0.03461122,  0.04877699, ...,  0.03096024,
       -0.03037609, -0.02526473], dtype=float32)), ('cart_with_single_tire.jpg', array([-0.00240004,  0.02031567,  0.03473581, ...,  0.01975798,
       -0.01027734, -0.00213115], dtype=float32))]


In [30]:
# 2) Search the image index with a natural-language query.
# Alternative queries tried during exploration (swap one in to experiment):
#   "person handling a package on a residential porch; delivery truck on the street"
#   "Wells Fargo check"
#   "glasses, necklace, hill with sun and fence"
#   "Can we copy Strike ?"
#   "cart with single tire"
#   "glasses, sun, hills, red shirt"
query = "person with tape and cap"
results = search_images(query, image_index, top_k=3)  # best match first
print(results)

qres
qres Embedding vector length: 1536
[('person_with_cap.jpg', 0.1719561219215393), ('cart_with_single_tire.jpg', 0.043788447976112366)]


In [31]:
# 3) Report the ranked matches, one per line, best match first.
print("\nTop matches:")
for match in results:
  path, score = match
  print(f"{path} | cosine={score:.4f}")


Top matches:
person_with_cap.jpg | cosine=0.1720
cart_with_single_tire.jpg | cosine=0.0438
