In [0]:
%sql
--SELECT * FROM `docai-dbx`.gold.doc_embeddings

In [0]:
#%pip install faiss-cpu

In [0]:
#%pip install sentence-transformers

In [0]:
#dbutils.library.restartPython()

In [0]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from pyspark.sql import SparkSession
import pandas as pd

In [0]:
# Expose the question as a notebook text widget so it can be changed from the
# UI (or passed in as a job parameter) without editing the code.
QUERY_WIDGET = "query"
dbutils.widgets.text(QUERY_WIDGET, "What is the refund policy?", "Ask a question")
query = dbutils.widgets.get(QUERY_WIDGET)

In [0]:
# Pull the stored chunk embeddings from the gold Delta table into a pandas
# DataFrame on the driver.
# NOTE(review): toPandas() collects the whole table to the driver; confirm the
# table stays small enough for that.
df_embed = spark.read.format("delta").table("`docai-dbx`.gold.doc_embeddings").toPandas()

# Fail fast with a readable message: np.vstack on an empty sequence raises an
# opaque "need at least one array to concatenate" error.
if df_embed.empty:
    raise ValueError(
        "Table `docai-dbx`.gold.doc_embeddings returned no rows; nothing to index."
    )

# Ensure correct format for FAISS: one float32 row per chunk, in the same order
# as df_embed so that row i of the matrix corresponds to df_embed.iloc[i].
embedding_matrix = np.vstack(df_embed["embedding"].values).astype("float32")


In [0]:
# Step 4: Load the sentence-embedding model and encode the user's question.
# The query is wrapped in a list so the encoder returns a 2-D batch with a
# single row, which is then cast to float32 for the FAISS search.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)
raw_query_vectors = model.encode([query])
query_embedding = raw_query_vectors.astype("float32")

In [0]:
# Step 5: FAISS Index Creation & Search
# Build an exact (brute-force) L2 index over all chunk embeddings.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

if index.ntotal == 0:
    raise ValueError("FAISS index is empty; there are no chunks to search.")

# The query vector must have the same dimensionality as the indexed vectors;
# a mismatch usually means the table was embedded with a different model.
if query_embedding.shape[1] != index.d:
    raise ValueError(
        f"Query embedding has dimension {query_embedding.shape[1]} but the index "
        f"expects {index.d}."
    )

# Never request more neighbours than the index holds: FAISS pads missing
# results with label -1, which downstream positional lookups (iloc[-1]) would
# silently resolve to the last row.
top_k = min(5, index.ntotal)

# D holds the distances and I the row positions (into embedding_matrix /
# df_embed) of the nearest chunks, one row per query.
D, I = index.search(query_embedding, top_k)

In [0]:
# Step 6: Show Retrieved Chunks
# I[0] holds the positional row indices returned for the single query.
print("🔍 Top Retrieved Chunks:\n")
for idx in I[0]:
    # FAISS uses -1 as a placeholder when it returns fewer hits than requested.
    # iloc[-1] would silently print the LAST row as if it were a match, so
    # placeholder labels are skipped.
    if idx < 0:
        continue
    row = df_embed.iloc[idx]
    print(f"📄 File: {row['file_name']}")
    print(f"🧠 Chunk ID: {row['chunk_id']}")
    # Only the first 300 characters of the chunk are shown as a preview.
    print(f"📝 Text: {row['text_chunk'][:300]}...\n")
    print("—" * 60)