### Install libraries

In [1]:
!pip install pymupdf sentence-transformers numpy chromadb faiss-cpu qdrant-client weaviate-client pinecone-client

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting weaviate-client
  Downloading weaviate_client-4.19.2-py3-none-any.whl.metadata (3.7 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading p

### Loading + Chunking Docs

In [2]:
import fitz

In [3]:
def load_pdf(path):
  doc = fitz.open(path)
  text = ""
  for page in doc:
    text += page.get_text()
  return text

In [4]:
def chunk_text(text, chunk_size = 500, overlap = 100):
  chunks = []
  start = 0
  while start < len(text):
    chunks.append(text[start:start + chunk_size])
    start += chunk_size - overlap
  return chunks

In [5]:
pdf_text = load_pdf("/content/drive/MyDrive/Attention_Is_All_You_Need.pdf")

In [6]:
chunks = chunk_text(pdf_text)

In [7]:
print(f"Total chunks: {len(chunks)}")

Total chunks: 99


### Create Embeddings

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [9]:
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
embeddings = model.encode(chunks).astype("float32")

In [11]:
query = "What is multi head attention"
query_embedding = model.encode([query]).astype("float32")

## ChromaDB

In [12]:
import chromadb
from chromadb.config import Settings

In [15]:
client = chromadb.Client(Settings(anonymized_telemetry=False))
collection = client.get_or_create_collection(name="pdf")

In [16]:
collection.add(
    documents=chunks,
    embeddings=embeddings.tolist(),
    ids=[str(i) for i in range(len(chunks))]
)

In [17]:
res = collection.query(query_texts=[query], n_results=3)
print("ChromaDB:", res["documents"][0])

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 35.4MiB/s]


ChromaDB: [' model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64.', 'Multi-Head Attention\nInstead of performing a single attention function with dmodel-dimensional keys, values and queries,\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\n4To illustrate why the dot products 

## FAISS

In [18]:
import faiss

In [19]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [20]:
D, I = index.search(query_embedding, k = 3)
print("FAISS: ", [chunks[i] for i in I[0]])

FAISS:  [' model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64.', 'Multi-Head Attention\nInstead of performing a single attention function with dmodel-dimensional keys, values and queries,\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\n4To illustrate why the dot products ge

## Qdrant

In [21]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

In [22]:
client = QdrantClient(":memory:")

In [24]:
collection_name = "pdf"
vector_size = embeddings.shape[1]

In [25]:
if client.collection_exists(collection_name):
  client.delete_collection(collection_name)

In [27]:
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=vector_size,
        distance=Distance.COSINE
      )
)

True

In [28]:
client.upsert(
    collection_name = "pdf",
    points = [
        PointStruct(id = i, vector = embeddings[i], payload = {"text": chunks[i]}) for i in range(len(chunks))
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [42]:
res = client.query_points(
    collection_name = "pdf",
    query = query_embedding[0].tolist(),
    limit = 3,
    with_payload = True
)

In [47]:
points = res.points

In [48]:
print("Qdrant:", [p.payload["text"] for p in points])

Qdrant: [' model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64.', 'Multi-Head Attention\nInstead of performing a single attention function with dmodel-dimensional keys, values and queries,\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\n4To illustrate why the dot products ge

## Weaviate

In [74]:
import weaviate
from weaviate.connect import ConnectionParams
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.init import Auth

In [64]:
WEAVIATE_URL="h7ypzvzoqw8plo7n5iawq.c0.asia-southeast1.gcp.weaviate.cloud"
WEAVIATE_API_KEY="bUYyR3hQZk85QnNlazZnM180dzEzN0FxeHpBV1paRUFXeU1ZaVo5NFdQYkV4NkVWSHNnZ2dwdFRNUS9vPV92MjAw"

In [67]:
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

In [68]:
print(client.is_ready())

True


In [69]:
client.connect()

In [70]:
COLLECTION = "PdfChunk"

In [72]:
if client.collections.exists(COLLECTION):
  client.collections.delete(COLLECTION)

In [75]:
client.collections.create(
    name=COLLECTION,
    properties=[
        Property(name="text", data_type=DataType.TEXT),
    ],
    vectorizer_config = Configure.Vectorizer.none()
)

<weaviate.collections.collection.sync.Collection at 0x7985b8e84560>

In [76]:
collection = client.collections.get(COLLECTION)

for i, emb in enumerate(embeddings):
  collection.data.insert(
      properties={"text":chunks[i]},
      vector=emb.tolist(),
  )

In [77]:
res = collection.query.near_vector(
    near_vector=query_embedding[0].tolist(),
    limit=3
)

In [78]:
print("Weaviate:", [obj.properties["text"] for obj in res.objects])

Weaviate: [' model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V ) = Concat(head1, ..., headh)W O\nwhere headi = Attention(QW Q\ni , KW K\ni , V W V\ni )\nWhere the projections are parameter matrices W Q\ni\n∈Rdmodel×dk, W K\ni\n∈Rdmodel×dk, W V\ni\n∈Rdmodel×dv\nand W O ∈Rhdv×dmodel.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64.', 'Multi-Head Attention\nInstead of performing a single attention function with dmodel-dimensional keys, values and queries,\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\n4To illustrate why the dot products 

## Pinecone

In [82]:
from pinecone import Pinecone

In [83]:
PINECONE_API_KEY = "pcsk_LKvLu_Uu4q74dnJur4nkmC5zCwxJwrdAG7ThZ8YwgmxSAWmGBCGEDvSd6SgCZs58ndhbX"

In [87]:
pc = Pinecone(api_key = PINECONE_API_KEY)
index = pc.Index("pdf-index")

In [88]:
index.upsert(
    vectors = [(str(i), embeddings[i].tolist()) for i in range(len(chunks))]
)

UpsertResponse(upserted_count=99, _response_info={'raw_headers': {'date': 'Sun, 08 Feb 2026 14:08:03 GMT', 'content-type': 'application/json', 'content-length': '20', 'connection': 'keep-alive', 'x-pinecone-request-lsn': '1', 'x-pinecone-request-logical-size': '152252', 'x-pinecone-request-latency-ms': '483', 'x-envoy-upstream-service-time': '220', 'x-pinecone-response-duration-ms': '485', 'grpc-status': '0', 'server': 'envoy'}})

In [89]:
res = index.query(
    vector = query_embedding[0].tolist(),
    top_k = 3,
    include_metadata =False
)

In [90]:
print("Pinecone:", [m["id"] for m in res["matches"]])

Pinecone: ['29', '27', '98']
