### Install libraries

In [None]:
!pip install pymupdf sentence-transformers numpy chromadb faiss-cpu qdrant-client weaviate-client pinecone-client

### Loading + Chunking Docs

In [None]:
import fitz

In [None]:
def load_pdf(path):
  """Return the text of every page of a PDF, concatenated in page order.

  Args:
    path: Location of the PDF file, passed straight to ``fitz.open``.

  Returns:
    A single string holding the ``get_text()`` output of each page.
  """
  # The context manager closes the document even if a page fails to extract.
  with fitz.open(path) as doc:
    # join() builds the result once instead of re-copying it on every page.
    return "".join(page.get_text() for page in doc)

In [None]:
def chunk_text(text, chunk_size=500, overlap=100):
  """Split text into fixed-size character windows that overlap.

  Each chunk holds up to ``chunk_size`` characters and starts
  ``chunk_size - overlap`` characters after the previous one, so neighbouring
  chunks share ``overlap`` characters.

  Args:
    text: String to split.
    chunk_size: Maximum number of characters per chunk; must be positive.
    overlap: Characters shared by consecutive chunks; must satisfy
      ``0 <= overlap < chunk_size`` so the window always moves forward.

  Returns:
    List of chunks in document order; empty when ``text`` is empty.

  Raises:
    ValueError: If ``chunk_size`` or ``overlap`` is out of range.
  """
  if chunk_size <= 0:
    raise ValueError("chunk_size must be a positive integer")
  # A step of zero or less would never reach the end of the text.
  if not 0 <= overlap < chunk_size:
    raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
  step = chunk_size - overlap
  return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
# Extract the full text of the paper from the PDF at this path.
pdf_path = "/content/Attention_Is_All_You_Need.pdf"
pdf_text = load_pdf(pdf_path)

In [None]:
# Split the extracted text into overlapping windows using chunk_text's defaults.
chunks = chunk_text(text=pdf_text)

In [None]:
# Report how many chunks the splitter produced.
chunk_count = len(chunks)
print(f"Total chunks: {chunk_count}")

### Create Embeddings

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
# Load the sentence-embedding model shared by every vector store below.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

In [None]:
# Embed every chunk, then cast to float32 before handing the matrix to the stores.
raw_chunk_vectors = model.encode(chunks)
embeddings = raw_chunk_vectors.astype("float32")

In [None]:
# The question asked of every vector store.
query = "What is multi head attention"
# Encode it as a one-item batch; later cells read the single vector via [0].
query_batch = [query]
query_embedding = model.encode(query_batch).astype("float32")

## ChromaDB

In [None]:
import chromadb
from chromadb.config import Settings

In [None]:
# Chroma client with anonymized telemetry switched off.
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.Client(chroma_settings)
# Reuse the "pdf" collection when it already exists, otherwise create it.
collection = client.get_or_create_collection(name="pdf")

In [None]:
# Each chunk is stored under its position in `chunks`, rendered as a string id.
chunk_ids = [str(position) for position, _ in enumerate(chunks)]
collection.add(documents=chunks, embeddings=embeddings.tolist(), ids=chunk_ids)

In [None]:
# Search with the precomputed SentenceTransformer vector so the query is embedded
# by the same model as the stored chunks; `query_texts` would instead run the
# collection's own embedding function on the raw string.
res = collection.query(query_embeddings=query_embedding.tolist(), n_results=3)
# Results are grouped per query; [0] selects the documents for our only query.
print("ChromaDB:", res["documents"][0])

## FAISS

In [None]:
import faiss

In [None]:
# Width of each embedding vector, read from the second axis of the matrix.
dim = embeddings.shape[1]
# Flat index that ranks neighbours by L2 distance.
index = faiss.IndexFlatL2(dim)
# Add every chunk embedding; row positions line up with positions in `chunks`.
index.add(embeddings)

In [None]:
# Look up the 3 nearest chunks: D holds distances, I holds row indices.
D, I = index.search(query_embedding, k=3)
# FAISS pads the result with -1 when the index holds fewer than k vectors; skip
# those so Python's negative indexing does not silently return the last chunk.
print("FAISS: ", [chunks[i] for i in I[0] if i != -1])

## Qdrant

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

In [None]:
# In-process Qdrant instance; note this rebinds `client` from the Chroma section.
client = QdrantClient(location=":memory:")

In [None]:
# Name of the Qdrant collection used by the cells below.
collection_name = "pdf"
# Vector width the collection is created with (second axis of `embeddings`).
vector_size = embeddings.shape[1]

In [None]:
# Start from a clean slate so re-running the notebook does not reuse old points.
already_present = client.collection_exists(collection_name)
if already_present:
  client.delete_collection(collection_name)

In [None]:
# Vectors are compared by cosine distance and must match the embedding width.
cosine_vectors = VectorParams(size=vector_size, distance=Distance.COSINE)
client.create_collection(
    collection_name=collection_name,
    vectors_config=cosine_vectors,
)

In [None]:
# One point per chunk: id is the chunk position, payload carries the chunk text.
# Vectors are converted to plain Python lists, as in the other store sections,
# rather than handing NumPy rows to PointStruct.
qdrant_points = [
    PointStruct(id=i, vector=vector.tolist(), payload={"text": chunk})
    for i, (vector, chunk) in enumerate(zip(embeddings, chunks))
]
# Use the same collection_name variable the collection was created with.
client.upsert(collection_name=collection_name, points=qdrant_points)

In [None]:
# Search with the single query vector as a plain list; payloads come back too.
query_vector = query_embedding[0].tolist()
res = client.query_points(
    collection_name="pdf",
    query=query_vector,
    limit=3,
    with_payload=True,
)

In [None]:
# Keep just the matched points from the query response.
points = res.points

In [None]:
# Show the chunk text stored in each hit's payload.
hit_texts = [hit.payload["text"] for hit in points]
print("Qdrant:", hit_texts)

## Weaviate

In [None]:
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.init import Auth

In [None]:
import os

# Read the cluster URL and API key from the environment so real credentials are
# not saved inside the notebook; the placeholders are only fallbacks to replace.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "your-weaviate-url")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "your-weaviate-api-key")

In [None]:
# Connect to the cloud cluster with API-key auth; rebinds `client` once more.
weaviate_auth = Auth.api_key(WEAVIATE_API_KEY)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=weaviate_auth,
)

In [None]:
# Print whether the cluster reports itself ready.
cluster_ready = client.is_ready()
print(cluster_ready)

In [None]:
# Explicitly open the connection.
# NOTE(review): connect_to_weaviate_cloud presumably already returns a connected
# client, which would make this call redundant - confirm before removing.
client.connect()

In [None]:
# Name of the Weaviate collection that stores the PDF chunks.
COLLECTION = "PdfChunk"

In [None]:
# Drop any previous copy of the collection so each run starts empty.
collection_present = client.collections.exists(COLLECTION)
if collection_present:
  client.collections.delete(COLLECTION)

In [None]:
# Single text property; no server-side vectorizer because vectors are supplied
# by this notebook.
text_property = Property(name="text", data_type=DataType.TEXT)
client.collections.create(
    name=COLLECTION,
    properties=[text_property],
    vectorizer_config=Configure.Vectorizer.none(),
)

In [None]:
# Handle to the collection; rebinds `collection` from the Chroma section.
collection = client.collections.get(COLLECTION)

# Send the objects through the client's batcher instead of issuing one insert
# request per chunk; chunks and embeddings are paired by position.
with collection.batch.dynamic() as batch:
  for chunk, emb in zip(chunks, embeddings):
    batch.add_object(
        properties={"text": chunk},
        vector=emb.tolist(),
    )

# Batch imports collect per-object errors instead of raising, so surface them.
failed_objects = collection.batch.failed_objects
if failed_objects:
  raise RuntimeError(f"{len(failed_objects)} chunks failed to import into Weaviate")

In [None]:
# Nearest-neighbour search using the single query vector as a plain list.
weaviate_query_vector = query_embedding[0].tolist()
res = collection.query.near_vector(near_vector=weaviate_query_vector, limit=3)

In [None]:
# Show the stored chunk text of every returned object.
matched_texts = [match.properties["text"] for match in res.objects]
print("Weaviate:", matched_texts)

## Pinecone

In [None]:
from pinecone import Pinecone

In [None]:
import os

# Read the key from the environment so a real secret is not saved inside the
# notebook; the placeholder is only a fallback to replace.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key")

In [None]:
# Authenticate, then get a handle to the index named "pdf-index".
# NOTE(review): nothing here creates that index, so it presumably must already
# exist with a dimension matching the embeddings - confirm.
# This also rebinds `index`, which previously held the FAISS index.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(name="pdf-index")

In [None]:
# Upload (id, values) pairs where the id is the chunk position as a string.
# batch_size splits the upload into several requests so larger documents do not
# run into Pinecone's per-request limits.
index.upsert(
    vectors=[(str(i), vector.tolist()) for i, vector in enumerate(embeddings)],
    batch_size=100,
)

In [None]:
# Ask for the 3 closest vectors; no metadata is requested, so matches carry ids.
pinecone_query_vector = query_embedding[0].tolist()
res = index.query(
    vector=pinecone_query_vector,
    top_k=3,
    include_metadata=False,
)

In [None]:
# Only ids are printed; they are the string positions of the matching chunks.
matched_ids = [match["id"] for match in res["matches"]]
print("Pinecone:", matched_ids)