In [None]:
!pip install -q datasets sentence-transformers

from datasets import load_dataset
from src.embeddings.embedder import Embedder
import numpy as np

# Step 1 - fetch the SQuAD v2 training split.
squad = load_dataset("squad_v2", split="train")

# Step 2 - draw a small, reproducible subset of 200 examples.
# The fixed shuffle seed keeps the chosen rows identical from run to run.
shuffled = squad.shuffle(seed=42)
sample = shuffled.select(range(200))
queries = [example["question"] for example in sample]
contexts = [example["context"] for example in sample]

# Step 3 - build the project Embedder on top of e5-large
# (the model the original notebook author picked as best).
# NOTE(review): the e5 model card recommends "query: " / "passage: " input
# prefixes; confirm whether Embedder adds them internally.
embedder = Embedder("intfloat/e5-large")

# Step 4 - embed the questions first, then their contexts.
query_embeddings = embedder.encode(queries)
context_embeddings = embedder.encode(contexts)

# Quick sanity check on the shapes of both results.
for label, vectors in (
    ("Query embeddings shape:", query_embeddings),
    ("Context embeddings shape:", context_embeddings),
):
    print(label, vectors.shape)

# Step 5 - write both arrays to .npy files in the working directory
# (kept around for the FAISS preparation planned for week 2).
outputs = {
    "query_embeddings_sample.npy": query_embeddings,
    "context_embeddings_sample.npy": context_embeddings,
}
for filename, vectors in outputs.items():
    np.save(filename, vectors)

print("✅ Sample embeddings saved successfully.")
