In [1]:
import json

# JSON is UTF-8 by specification; pin the encoding so loading does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open('faq_data_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [2]:
# Display the loaded FAQ structure to inspect its layout
# (note: this renders the entire object, not a preview).
documents

{'company_name': 'NomadFoods Inc.',
 'faq_data': [{'category': 'Products',
   'questions': [{'question': 'What types of food products do you sell?',
     'answer': "We offer an extensive selection of food products across multiple categories to meet diverse customer needs. Our offerings include fresh fruits and vegetables sourced from local and international farms, dairy products like milk, cheese, and yogurt from certified suppliers, a variety of meats and seafood from both wild-caught and farm-raised sources, freshly baked goods such as breads, cakes, and pastries, as well as an array of packaged and frozen foods including snacks, cereals, pastas, and beverages. Whether you're a retailer looking to stock up on everyday essentials or specialty products, we have a wide inventory to support your needs.",
     'id': 'e063fa64'},
    {'question': 'Are your products organic?',
     'answer': 'Yes, we proudly offer both organic and non-organic products to cater to a wide range of consumer pr

In [11]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pandas as pd

# Load documents
# JSON is UTF-8 by specification; pin the encoding so loading does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open('faq_data_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

# Initialize sentence transformer model
# This single `model` instance embeds both the indexed FAQ questions and the
# incoming search queries, so both sides live in the same vector space.
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

# Extract the FAQ questions with their document ids, then encode the questions.
# Only the question text is embedded; answers are not part of the index.
# `questions`, `vectors` and `doc_ids` are parallel lists: position i in each
# refers to the same FAQ entry.
questions = []
doc_ids = []

# Iterate through the FAQ data
for faq in documents['faq_data']:  # Access 'faq_data' key
    for question_data in faq['questions']:
        questions.append(question_data['question'])
        doc_ids.append(question_data.get('id', 'unknown'))  # Use a fallback if 'id' is missing

# Encode every question in one batched call instead of one model invocation
# per question; `encode` on a list returns one embedding row per input.
# `list(...)` keeps `vectors` a list of 1-D arrays, as before.
vectors = list(model.encode(questions, show_progress_bar=True)) if questions else []

# Convert vectors list into a C-contiguous float32 matrix, the layout FAISS expects
vectors_np = np.ascontiguousarray(vectors, dtype=np.float32)
if vectors_np.ndim != 2 or vectors_np.shape[0] == 0:
    # Fail with a clear message instead of an opaque IndexError on shape[1].
    raise ValueError('No question embeddings to index; check the FAQ data.')

# Build FAISS index
vector_dimension = vectors_np.shape[1]  # Read from the data (768 for all-mpnet-base-v2)
index = faiss.IndexFlatL2(vector_dimension)  # L2 distance metric for FAISS
index.add(vectors_np)  # Add vectors to the index; row i corresponds to doc_ids[i]

# Function to perform FAISS search
def faiss_search(query_vector, top_k=5):
    """Look up the nearest indexed vectors for a single query embedding.

    Args:
        query_vector: One embedding vector for the query.
        top_k: Number of neighbours to request from the index.

    Returns:
        A tuple ``(indices, distances)`` as produced by the module-level
        ``index``; each is 2-D with a single row because one query is searched.
        Note the order is swapped relative to ``index.search``.
    """
    # The index searches batches, so wrap the lone vector as a one-row batch.
    single_query_batch = np.array([query_vector])
    dists, neighbour_ids = index.search(single_query_batch, top_k)
    return neighbour_ids, dists

# Function to search for the most similar documents
def faiss_knn_search(query, top_k=5):
    """Return the FAQ entries whose questions are closest to ``query``.

    Args:
        query: Query text to embed with the module-level ``model``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``{'id': ..., 'question': ...}`` dicts ordered from nearest
        to farthest. It may hold fewer than ``top_k`` items when the index
        contains fewer vectors than requested.
    """
    query_vector = model.encode(query)
    indices, _distances = faiss_search(query_vector, top_k)

    # indices[0] since FAISS returns a 2D array (one row per query).
    # FAISS pads missing neighbours with label -1; without the filter,
    # doc_ids[-1] would silently report the last document as a hit.
    return [
        {'id': doc_ids[idx], 'question': questions[idx]}
        for idx in indices[0]
        if idx >= 0
    ]

# Load ground truth data for evaluation
# Read the 'document' column as text: without this, pandas infers numeric
# dtypes, so an id made only of digits (or resembling scientific notation,
# e.g. '1234e567') is turned into a number and can never equal the string
# ids stored in the index.
df_ground_truth = pd.read_csv('ground-truth-data.csv', dtype={'document': str})
ground_truth = df_ground_truth.to_dict(orient='records')

# Evaluation metrics (hit rate and MRR)
def hit_rate(relevance_total):
    """Fraction of queries with at least one relevant result.

    Args:
        relevance_total: One sequence of booleans per query, where ``True``
            marks a returned result that is the expected document.

    Returns:
        Number of queries containing a ``True`` divided by the number of
        queries, or ``0.0`` when there are no queries at all.
    """
    # Guard the division: an empty evaluation set has no hits by definition.
    if len(relevance_total) == 0:
        return 0.0
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of booleans per query, ordered by rank,
            where ``True`` marks a returned result that is the expected
            document.

    Returns:
        The average over all queries of ``1 / rank`` of the first ``True``
        (rank starts at 1); a query with no ``True`` contributes 0. Returns
        ``0.0`` when there are no queries at all.
    """
    # Guard the division: an empty evaluation set scores zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not identity) so numpy booleans are recognised too.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                break  # only the first relevant result counts
    return total_score / len(relevance_total)

# Evaluation function
def evaluate(ground_truth, search_function):
    """Score a search function against labelled queries.

    Args:
        ground_truth: Iterable of records, each with a ``'question'`` to
            search for and the expected ``'document'`` id.
        search_function: Callable taking the question text and returning a
            ranked list of dicts that each carry an ``'id'``.

    Returns:
        A dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    def relevance_of(record):
        # One flag per returned result: does it match the expected document?
        expected_id = record['document']
        hits = search_function(record['question'])
        return [hit['id'] == expected_id for hit in hits]

    per_query_relevance = [relevance_of(record) for record in tqdm(ground_truth)]

    scores = {}
    scores['hit_rate'] = hit_rate(per_query_relevance)
    scores['mrr'] = mrr(per_query_relevance)
    return scores

# Evaluate using FAISS
# `evaluate` calls the search function with the question only, so
# faiss_knn_search runs with its default top_k=5: the reported hit rate and
# MRR are measured over the top 5 results per query.
evaluation_results = evaluate(ground_truth, faiss_knn_search)
print(evaluation_results)


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{'hit_rate': 0.97, 'mrr': 0.8356666666666664}
