In [1]:
# --- Configuration ---
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder

# Filesystem locations. The relative parts are resolved against the
# notebook's current working directory, so adjust them if the notebook moves.
BASE_DIR = os.path.abspath("../../backend2")
DATASET_DIR = os.path.abspath(
    os.path.join(BASE_DIR, "./Dataset/Integrated/Latest")
)
NOMIC_MODEL_PATH = "../../Models/nomic-finetuned/nomic-finetuned-final"

# Relevance cut-off applied to the distance of the nearest FAISS hit
# (lower = closer match).
# NOTE(review): the index built below is faiss.IndexFlatL2, which FAISS
# documents as reporting *squared* L2 distances — confirm 1.4 was tuned on
# that scale.
SIMILARITY_THRESHOLD = 1.4

print(f"Dataset directory: {DATASET_DIR}")
print(f"Nomic model path: {NOMIC_MODEL_PATH}")

Dataset directory: c:\Users\tebats\Baste\Projects\AmangBot\backend2\Dataset\Integrated\Latest
Nomic model path: ../../Models/nomic-finetuned/nomic-finetuned-final


In [2]:
# --- Load Chunks from Dataset ---
# Two parallel lists are built, index-aligned with each other:
#   chunks               - the raw "content" text of every dataset entry
#   chunks_for_embedding - the same entry prefixed with "search_document:" and
#                          its metadata, which is the text that gets embedded
chunks = []
chunks_for_embedding = []

if not os.path.isdir(DATASET_DIR):
    raise FileNotFoundError(f"Dataset directory not found at {DATASET_DIR}")

# Keys that are never folded into the metadata prefix of the embedded text.
excluded_metadata_keys = ("id", "keywords", "content")

# os.listdir returns entries in arbitrary order; sorting keeps the chunk order
# (and therefore the positions later used as FAISS ids) reproducible.
for filename in sorted(os.listdir(DATASET_DIR)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(DATASET_DIR, filename)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            file_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Loading stays best-effort: report the bad file and keep going.
        print(f"Error loading {filename}: {e}")
        continue

    # Only files whose top-level value is a list are used.
    if not isinstance(file_data, list):
        continue

    for item in file_data:
        # Skip anything that is not a dict carrying a "content" key, so one
        # malformed entry cannot discard the remainder of its file.
        if not isinstance(item, dict) or "content" not in item:
            continue

        chunks.append(item["content"])

        # Prepare embedding text with metadata (excluding id and keywords)
        metadata_str = ", ".join(
            f"{key}: {value}"
            for key, value in item.items()
            if key not in excluded_metadata_keys
        )
        chunks_for_embedding.append(f"search_document: {metadata_str}. {item['content']}")

print(f"Loaded {len(chunks)} chunks from {DATASET_DIR}.")

Loaded 402 chunks from c:\Users\tebats\Baste\Projects\AmangBot\backend2\Dataset\Integrated\Latest.


In [3]:
# --- Initialize Embedder ---
# Load the sentence-embedding model stored at NOMIC_MODEL_PATH.
# NOTE(review): trust_remote_code=True presumably allows Python code shipped
# with the checkpoint to run on load — only use it with a checkpoint you trust.
print("Initializing Nomic Embedder...")
embedder = SentenceTransformer(
    NOMIC_MODEL_PATH,
    trust_remote_code=True,
)
print("Embedder initialized.")

You are trying to use a model that was created with Sentence Transformers version 5.1.2, but you're currently using version 5.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


Initializing Nomic Embedder...


<All keys matched successfully>


Embedder initialized.


In [4]:
# --- Build FAISS Index ---
# Embed every prepared chunk and store the vectors in an exact (brute-force)
# L2 index. Position i in the index corresponds to chunks[i].
print("Building FAISS index...")

# Fail with a clear message instead of an opaque shape error further down.
if not chunks_for_embedding:
    raise ValueError("No chunks were loaded; cannot build the FAISS index.")

embeddings = embedder.encode(chunks_for_embedding, convert_to_numpy=True, normalize_embeddings=True)
# FAISS only accepts C-contiguous float32 matrices; make that explicit rather
# than relying on the encoder's output dtype/layout.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")
print(f"Embedding dimension: {dimension}")

Building FAISS index...
FAISS index built with 402 vectors.
Embedding dimension: 768


In [5]:
# --- Initialize Reranker ---
# The cross-encoder is optional: if it cannot be loaded, `reranker` is left as
# None and the failure is only reported.
reranker = None
try:
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    print("Reranker initialized.")
except Exception as init_error:
    reranker = None
    print(f"Reranker failed to initialize: {init_error}")

Reranker initialized.


In [6]:
# --- Retrieval Function ---
def retrieve(query: str, k: int = 4):
    """
    Retrieve the chunks most relevant to a query.

    The query is embedded with the "search_query: " prefix and matched against
    the FAISS index. When a reranker is available, three times as many
    candidates are fetched and re-ordered by the cross-encoder before the top
    ``k`` are kept.

    Args:
        query: The search query.
        k: Maximum number of results to return.

    Returns:
        Tuple of (final_chunks, scores, is_relevant):
          * final_chunks - up to ``k`` chunk texts, best first.
          * scores - one float per returned chunk. With a reranker these are
            the reranker scores (higher is better); without one they are the
            FAISS distances (lower is better).
          * is_relevant - True when the nearest FAISS hit (before reranking)
            is within SIMILARITY_THRESHOLD.
        Returns ``([], [], False)`` when ``k <= 0`` or the index has no hits.
    """
    use_reranker = reranker is not None

    # Fetch extra candidates (3x) for reranking, but never more than the index
    # holds: FAISS pads missing neighbours with label -1, and chunks[-1] would
    # silently hand back the last chunk as if it were a real hit.
    fetch_k = k * 3 if use_reranker else k
    fetch_k = min(fetch_k, index.ntotal)
    if fetch_k <= 0:
        return [], [], False

    # Encode query
    query_text = f"search_query: {query}"
    query_embedding = embedder.encode([query_text], convert_to_numpy=True, normalize_embeddings=True)

    D, I = index.search(query_embedding, fetch_k)

    # Keep only real hits (defensive: drop any -1 padding that still appears).
    hits = [(int(label), float(dist)) for label, dist in zip(I[0], D[0]) if label >= 0]
    if not hits:
        return [], [], False

    # Relevance is judged on the nearest neighbour's distance.
    is_relevant = bool(hits[0][1] <= SIMILARITY_THRESHOLD)

    retrieved_chunks = [chunks[label] for label, _ in hits]
    distances = [dist for _, dist in hits]

    if not use_reranker:
        return retrieved_chunks[:k], distances[:k], is_relevant

    # Rerank: score each (query, chunk) pair and keep the k best.
    pairs = [[query, doc] for doc in retrieved_chunks]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(retrieved_chunks, scores), key=lambda pair: pair[1], reverse=True)[:k]
    final_chunks = [chunk for chunk, _ in ranked]
    final_scores = [float(score) for _, score in ranked]
    return final_chunks, final_scores, is_relevant

print("Retrieval function defined.")

Retrieval function defined.


In [None]:
# --- Test Retrieval ---
# Sample questions used to smoke-test the retriever
test_queries = [
    "What are the admission requirements?",
    "Who is the president of EARIST?",
    "What are the school fees?",
]

for query in test_queries:
    # Banner separating one query's output from the next
    print(f"\n{'='*60}")
    print(f"Query: {query}")
    print('='*60)

    results, scores, is_relevant = retrieve(query, k=3)

    print(f"Is relevant (within threshold): {is_relevant}")
    print(f"\nTop {len(results)} results:")

    for i, (chunk, score) in enumerate(zip(results, scores), start=1):
        print(f"\n--- Result {i} (Score: {score:.4f}) ---")
        # Long chunks are cut to a 500-character preview followed by "..."
        if len(chunk) > 500:
            print(chunk[:500] + "...")
        else:
            print(chunk)

In [None]:
# --- Interactive Testing ---
# Edit the query below and re-run this cell to try your own searches
custom_query = "What are the requirements for enrollment?"

results, scores, is_relevant = retrieve(custom_query, k=4)

print(f"Query: {custom_query}")
print(f"Is relevant: {is_relevant}")
print(f"\nRetrieved {len(results)} chunks:\n")

for i, (chunk, score) in enumerate(zip(results, scores), start=1):
    print(f"--- Chunk {i} (Score: {score:.4f}) ---")
    # Full chunk text followed by one blank separator line
    print(chunk, end="\n\n")
    print()