In [None]:
# ✅ 0. Install FAISS if not available
!pip install faiss-cpu
from google.colab import drive
drive.mount('/content/drive')

# ✅ 1. Import modules
import numpy as np
import faiss
import os

# ✅ 2. Load embeddings from Google Drive
# Uses the same "MyDrive" spelling of the mount path as the IVFFlat section.
embedding_dir = "/content/drive/MyDrive/RAG Research/embeddings"

# FAISS operates on C-contiguous float32 matrices; convert once at load time so
# arrays saved with another dtype or memory layout are still accepted.
context_embeddings = np.ascontiguousarray(
    np.load(os.path.join(embedding_dir, "context_embeddings.npy")),
    dtype=np.float32,
)
question_embeddings = np.ascontiguousarray(
    np.load(os.path.join(embedding_dir, "question_embeddings.npy")),
    dtype=np.float32,
)

print("Context shape:", context_embeddings.shape)
print("Question shape:", question_embeddings.shape)

# Fail early with a readable message instead of an opaque FAISS error later on.
if context_embeddings.ndim != 2 or question_embeddings.ndim != 2:
    raise ValueError("Embeddings must be 2-D arrays of shape (num_vectors, dim)")
if context_embeddings.shape[1] != question_embeddings.shape[1]:
    raise ValueError(
        "Context and question embeddings have different dimensionality: "
        f"{context_embeddings.shape[1]} vs {question_embeddings.shape[1]}"
    )

# ✅ 3. Build FAISS Flat index
# IndexFlatL2 is an exhaustive (exact) L2 search; it serves as the baseline index.
embedding_dim = context_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

index.add(context_embeddings)
print(f" FAISS Index built with {index.ntotal} context vectors")

# ✅ 4. Example Retrieval: Top-5 documents for first 3 queries
k = 5
# D holds the distances (squared L2 for IndexFlatL2), I the matching context row ids;
# both have shape (num_queries, k).
D, I = index.search(question_embeddings[:3], k)

for i, (distances, indices) in enumerate(zip(D, I)):
    print(f"\n🔍 Query {i+1}:")
    for rank, (idx, score) in enumerate(zip(indices, distances), start=1):
        print(f"  Rank {rank}: Context ID {idx}, Distance {score:.4f}")


## Flat: Evaluating Performance (Recall@k, MRR) — k set to 20 for evaluation

In [None]:

from datasets import load_dataset

# Load original SQuAD dataset (train split, same used for embeddings)
dataset = load_dataset("squad_v2", split="train")

# Ground truth: for each question i, the correct context is dataset[i]["context"].
# Since embeddings were built in the same order, ground truth = i.
ground_truth = np.arange(len(dataset))

# NOTE(review): SQuAD reuses one paragraph for several questions. If
# context_embeddings holds one row per question, identical context vectors sit
# at several row ids and tie on distance, so exact-id matching can understate
# MRR (and Recall@k at the cut-off) — confirm how the embeddings were built.

# Search top-k for all questions
k = 20
D, I = index.search(question_embeddings, k)

# Every query needs a ground-truth id; raise a clear error rather than an
# IndexError deep inside the metric computation.
num_queries = len(question_embeddings)
if len(ground_truth) < num_queries:
    raise ValueError(
        f"Only {len(ground_truth)} ground-truth ids for {num_queries} question embeddings"
    )
targets = ground_truth[:num_queries]

# hits[i, j] is True when the j-th result for query i is its ground-truth context.
hits = I == targets[:, None]
found = hits.any(axis=1)

# Compute Recall@k: fraction of queries whose ground truth appears in the top-k.
recall_at_k = found.mean()

# Compute MRR: reciprocal of the 1-based rank of the first hit, 0 when not retrieved.
# argmax returns the first True per row; rows without a hit are masked out by `found`.
first_hit = hits.argmax(axis=1)
mrr = np.where(found, 1.0 / (first_hit + 1), 0.0).mean()

print(f"Recall@{k}: {recall_at_k:.4f}")
print(f"MRR: {mrr:.4f}")


## IVFFlat (Inverted File Flat Index): Evaluating Performance (Recall@k, MRR) — k set to 20 for evaluation

In [None]:


import faiss
import numpy as np
from datasets import load_dataset

# Load embeddings (reloaded here so this section can run on its own once Drive is mounted).
embedding_dir = "/content/drive/MyDrive/RAG Research/embeddings"
# FAISS operates on C-contiguous float32 matrices; convert once at load time.
context_embeddings = np.ascontiguousarray(
    np.load(f"{embedding_dir}/context_embeddings.npy"), dtype=np.float32
)
question_embeddings = np.ascontiguousarray(
    np.load(f"{embedding_dir}/question_embeddings.npy"), dtype=np.float32
)

# IVFFlat index parameters
d = context_embeddings.shape[1]
nlist = 100   # number of clusters (tuneable)
nprobe = 10   # clusters scanned per query (tuneable); the FAISS default is 1

# Build IVFFlat: the flat quantizer assigns each vector to one of `nlist` clusters.
quantizer = faiss.IndexFlatL2(d)
index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train (learn the cluster centroids) and add
index_ivf.train(context_embeddings)
index_ivf.add(context_embeddings)

# Without this the search would visit a single cluster out of `nlist`, which
# caps recall regardless of k and makes the comparison with other indexes unfair.
index_ivf.nprobe = nprobe

print("✅ IVFFlat index built and trained")

# Ground truth: question i is paired with the context stored at row i.
dataset = load_dataset("squad_v2", split="train")
ground_truth = np.arange(len(dataset))

# Search with full question set
k = 20
D, I = index_ivf.search(question_embeddings, k)

# Every query needs a ground-truth id; raise a clear error rather than an
# IndexError deep inside the metric computation.
num_queries = len(question_embeddings)
if len(ground_truth) < num_queries:
    raise ValueError(
        f"Only {len(ground_truth)} ground-truth ids for {num_queries} question embeddings"
    )
targets = ground_truth[:num_queries]

# hits[i, j] is True when the j-th result for query i is its ground-truth context.
# FAISS pads missing results with -1, which never equals a ground-truth id.
hits = I == targets[:, None]
found = hits.any(axis=1)

# Recall@k: fraction of queries whose ground truth appears in the top-k.
recall_at_k = found.mean()

# MRR: reciprocal of the 1-based rank of the first hit, 0 when not retrieved.
# argmax returns the first True per row; rows without a hit are masked out by `found`.
first_hit = hits.argmax(axis=1)
mrr = np.where(found, 1.0 / (first_hit + 1), 0.0).mean()

print(f"[IVFFlat] Recall@{k}: {recall_at_k:.4f}")
print(f"[IVFFlat] MRR: {mrr:.4f}")

## HNSW (Hierarchical Navigable Small World graph): Evaluating Performance (Recall@k, MRR) — k set to 20 for evaluation

In [None]:
import faiss

# This section reuses context_embeddings, question_embeddings and ground_truth
# from the cells above; run those first.

# Dimension of embeddings
d = context_embeddings.shape[1]

# Build HNSW index (Hierarchical Navigable Small World graph)
hnsw_index = faiss.IndexHNSWFlat(d, 32)  # 32 = number of neighbors (M)
hnsw_index.hnsw.efConstruction = 200     # construction parameter (higher = more accurate, slower build)

# Add context embeddings
hnsw_index.add(context_embeddings)
print(f"✅ HNSW index built with {hnsw_index.ntotal} context vectors")

# Set search parameter (efSearch = trade-off between accuracy/speed)
hnsw_index.hnsw.efSearch = 64

# Search top-k for all questions
k = 20
D, I = hnsw_index.search(question_embeddings, k)

# Every query needs a ground-truth id; raise a clear error rather than an
# IndexError deep inside the metric computation.
num_queries = len(question_embeddings)
if len(ground_truth) < num_queries:
    raise ValueError(
        f"Only {len(ground_truth)} ground-truth ids for {num_queries} question embeddings"
    )
targets = ground_truth[:num_queries]

# hits[i, j] is True when the j-th result for query i is its ground-truth context.
hits = I == targets[:, None]
found = hits.any(axis=1)

# Compute Recall@k: fraction of queries whose ground truth appears in the top-k.
recall_at_k = found.mean()

# Compute MRR: reciprocal of the 1-based rank of the first hit, 0 when not retrieved.
# argmax returns the first True per row; rows without a hit are masked out by `found`.
first_hit = hits.argmax(axis=1)
mrr = np.where(found, 1.0 / (first_hit + 1), 0.0).mean()

print(f"[HNSW] Recall@{k}: {recall_at_k:.4f}")
print(f"[HNSW] MRR: {mrr:.4f}")