In [1]:
import json

# Read the FAQ data. JSON text is specified as UTF-8, so the encoding is given
# explicitly instead of relying on the platform's locale default.
with open('faq_data_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [2]:
# Inspect the parsed FAQ data: as the last expression of the cell, the whole
# object is rendered as the cell output.
documents

{'company_name': 'NomadFoods Inc.',
 'faq_data': [{'category': 'Products',
   'questions': [{'question': 'What types of food products do you sell?',
     'answer': "We offer an extensive selection of food products across multiple categories to meet diverse customer needs. Our offerings include fresh fruits and vegetables sourced from local and international farms, dairy products like milk, cheese, and yogurt from certified suppliers, a variety of meats and seafood from both wild-caught and farm-raised sources, freshly baked goods such as breads, cakes, and pastries, as well as an array of packaged and frozen foods including snacks, cereals, pastas, and beverages. Whether you're a retailer looking to stock up on everyday essentials or specialty products, we have a wide inventory to support your needs.",
     'id': 'e063fa64'},
    {'question': 'Are your products organic?',
     'answer': 'Yes, we proudly offer both organic and non-organic products to cater to a wide range of consumer pr

In [6]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pandas as pd
from rank_bm25 import BM25Okapi

# Load documents
# JSON text is specified as UTF-8; the encoding is passed explicitly so the
# read does not depend on the platform's locale default.
with open('faq_data_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

# Initialize sentence transformer model
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

# Prepare questions, vectors, and texts for BM25.
# The four lists are index-aligned: position i in each refers to the same FAQ entry.
questions = []
vectors = []
doc_ids = []
texts = []

for faq in tqdm(documents['faq_data']):
    for question_data in faq['questions']:
        questions.append(question_data['question'])
        # Only the question text is embedded for the vector search ...
        vectors.append(model.encode(question_data['question']))
        doc_ids.append(question_data['id'])
        # ... while BM25 scores against question and answer together.
        combined_text = f"{question_data['question']} {question_data['answer']}"
        texts.append(combined_text)

# Convert vectors list into a NumPy array
vectors_np = np.array(vectors)
# NOTE(review): faiss_knn_search below calls `index.search(...)`, but no `index`
# is created from `vectors_np` in this cell. It has to come from another cell;
# confirm it is rebuilt on Restart Kernel -> Run All.

# Initialize BM25
# Tokenization is lowercase + whitespace split; bm25_search tokenizes queries
# the same way, so the two must be kept in sync.
tokenized_corpus = [text.lower().split() for text in texts]
bm25 = BM25Okapi(tokenized_corpus)

def faiss_knn_search(query, k=5):
    """Return the ids of the documents nearest to ``query`` in the vector index.

    Uses three module-level names: ``model`` (its ``encode`` method embeds the
    query), ``index`` (its ``search(vectors, k)`` method returns distances and
    row positions) and ``doc_ids`` (maps a row position to a document id).
    ``index`` is not created in this cell and must already exist in the kernel.

    Args:
        query: Query text to embed and search for.
        k: Maximum number of neighbours to return. Defaults to 5.

    Returns:
        A list of at most ``k`` document ids, in the order the index returned them.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # One row of float32 values: the (n_queries, dim) layout FAISS expects.
    query_vector = np.asarray(model.encode(query), dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_vector, k)

    # FAISS pads the result row with -1 when fewer than k neighbours are found.
    # Indexing doc_ids with -1 would silently return the last document, so skip them.
    return [doc_ids[idx] for idx in indices[0] if idx >= 0]

def bm25_search(query, k=5):
    """Return the ids of the ``k`` documents with the highest BM25 score for ``query``.

    Uses the module-level ``bm25`` object (its ``get_scores`` method scores every
    document against the query tokens) and ``doc_ids`` (maps a position to an id).
    The query is lowercased and split on whitespace, the same way the corpus
    was tokenized.

    Args:
        query: Query text.
        k: Maximum number of results to return. Defaults to 5.

    Returns:
        A list of at most ``k`` document ids, highest score first.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative slice bound would silently drop items from the end instead.
        raise ValueError("k must be a positive integer")

    query_tokens = query.lower().split()
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending, so reverse it to rank by descending score.
    top_indices = np.argsort(scores)[::-1][:k]
    return [doc_ids[idx] for idx in top_indices]

def evaluate_search(ground_truth, search_function):
    """Score a search function against ground-truth queries.

    Args:
        ground_truth: Iterable of records, each with a ``'question'`` (the query
            text) and a ``'document'`` (the id expected in the results).
        search_function: Callable that takes the query text and returns a ranked
            sequence of document ids.

    Returns:
        A dict with ``'hit_rate'`` (share of queries whose expected id appears in
        the results) and ``'mrr'`` (mean of 1/rank of the expected id, counting 0
        for misses). Both are 0.0 when ``ground_truth`` is empty.
    """
    hit_flags = []
    rr_values = []

    for record in tqdm(ground_truth, desc="Evaluating"):
        expected_id = record['document']
        retrieved = search_function(record['question'])

        # 1-based position of the first matching result, or None on a miss.
        rank = next(
            (
                position
                for position, candidate in enumerate(retrieved, start=1)
                if candidate == expected_id
            ),
            None,
        )

        found = rank is not None
        hit_flags.append(found)
        rr_values.append(1.0 / rank if found else 0.0)

    n_queries = len(rr_values)
    if n_queries == 0:
        return {'hit_rate': 0.0, 'mrr': 0.0}

    return {
        'hit_rate': sum(hit_flags) / n_queries,
        'mrr': sum(rr_values) / n_queries,
    }

# Example usage and evaluation
if __name__ == "__main__":
    # Load ground truth data. Only the file read is guarded, so a
    # FileNotFoundError raised later (during search or encoding) is not
    # misreported as a missing ground-truth file.
    try:
        df_ground_truth = pd.read_csv('ground-truth-data.csv')
    except FileNotFoundError:
        print("\nNote: Ground truth data file not found. Skipping evaluation metrics.")
    else:
        # One dict per CSV row; evaluate_search reads the 'question' and
        # 'document' keys of each record.
        ground_truth = df_ground_truth.to_dict(orient='records')

        # Evaluate FAISS
        faiss_metrics = evaluate_search(ground_truth, faiss_knn_search)
        print("FAISS Results:")
        print(f"Hit Rate: {faiss_metrics['hit_rate']:.4f}")
        print(f"MRR: {faiss_metrics['mrr']:.4f}")

        # Evaluate BM25
        bm25_metrics = evaluate_search(ground_truth, bm25_search)
        print("\nBM25 Results:")
        print(f"Hit Rate: {bm25_metrics['hit_rate']:.4f}")
        print(f"MRR: {bm25_metrics['mrr']:.4f}")


  0%|          | 0/4 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

FAISS Results:
Hit Rate: 0.9700
MRR: 0.8357


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]


BM25 Results:
Hit Rate: 0.9700
MRR: 0.8505
