In [None]:
import pandas as pd
import numpy as np
import faiss
import os
from sentence_transformers import SentenceTransformer, util
import evaluate
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load dataset
df = pd.read_csv("/content/medquad.csv")  # Ensure the dataset has 'question' and 'answer' columns

# Fail early, with a readable message, if the columns used below are absent
missing_columns = {"question", "answer"} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {sorted(missing_columns)}")

# Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# File paths for saved embeddings and FAISS index
EMBEDDINGS_PATH = "question_embeddings.npy"
FAISS_INDEX_PATH = "faiss_index.bin"

# Load cached artifacts when both files exist, but only trust them if they
# describe exactly as many rows as the current dataset. Row positions in the
# index are later used to look up answers, so a cache built from a different
# CSV would silently return the wrong rows.
index = None
if os.path.exists(EMBEDDINGS_PATH) and os.path.exists(FAISS_INDEX_PATH):
    print("Loading saved FAISS index and embeddings...")
    question_embeddings = np.load(EMBEDDINGS_PATH)
    cached_index = faiss.read_index(FAISS_INDEX_PATH)
    if question_embeddings.shape[0] == len(df) and cached_index.ntotal == len(df):
        index = cached_index
    else:
        print("Cached embeddings/index do not match the dataset size; rebuilding...")

if index is None:
    print("Computing embeddings and creating FAISS index...")
    # Encode all questions in one batched call instead of one call per row.
    question_embeddings = embed_model.encode(
        df["question"].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    # FAISS expects a C-contiguous float32 matrix.
    question_embeddings = np.ascontiguousarray(question_embeddings, dtype=np.float32)
    np.save(EMBEDDINGS_PATH, question_embeddings)

    index = faiss.IndexFlatL2(question_embeddings.shape[1])
    index.add(question_embeddings)
    faiss.write_index(index, FAISS_INDEX_PATH)

print("FAISS index ready!")

In [None]:
# Prepare ground truth data
ground_truth_questions = df["question"].tolist()
ground_truth_answers = df["answer"].tolist()

# Embeddings of the ground-truth *questions*. These are the same vectors already
# held in `question_embeddings`, so reuse them rather than re-encoding every
# question. A copy keeps this array independent of the one built for the index.
if len(question_embeddings) == len(ground_truth_questions):
    ground_truth_embeddings = question_embeddings.copy()
else:
    # The existing embeddings do not line up with the dataset (e.g. a stale
    # cache file), so compute fresh ones in a single batched call.
    ground_truth_embeddings = embed_model.encode(ground_truth_questions, convert_to_numpy=True)

# Look up the stored answer whose question is nearest to the query
def retrieve_faiss_answer(user_question):
    """Return the answer paired with the indexed question closest to ``user_question``.

    The query is embedded with ``embed_model``, the single nearest neighbour is
    requested from ``index``, and the returned row position selects the entry
    from ``ground_truth_answers``.
    """
    query_vector = embed_model.encode(user_question)
    query_batch = np.array([query_vector])  # search() takes a 2-D batch of queries
    neighbour_rows = index.search(query_batch, 1)[1]  # [1] -> row positions; distances are unused
    best_row = neighbour_rows[0][0]
    return ground_truth_answers[best_row]

# Function to evaluate retrieval accuracy
def evaluate_retrieval():
    """Measure how often a question retrieves its own ground-truth answer.

    Each ground-truth question is passed to ``retrieve_faiss_answer`` and the
    result is compared with the answer at the same position in
    ``ground_truth_answers``.

    Returns:
        The fraction of matching retrievals as a float, or ``None`` when there
        are no questions to evaluate. The value is also printed.
    """
    total = len(ground_truth_questions)
    if total == 0:
        # Avoid ZeroDivisionError on an empty dataset.
        print("No questions available; retrieval accuracy is undefined.")
        return None

    correct_retrievals = 0
    for question, expected_answer in zip(ground_truth_questions, ground_truth_answers):
        retrieved_answer = retrieve_faiss_answer(question)
        # Exact match. A missing answer (NaN) never compares equal to itself,
        # so treat "both missing" as a match as well.
        if retrieved_answer == expected_answer or (
            pd.isna(retrieved_answer) and pd.isna(expected_answer)
        ):
            correct_retrievals += 1

    accuracy = correct_retrievals / total
    print(f"Retrieval Accuracy: {accuracy:.4f}")
    return accuracy

# Function to evaluate semantic similarity
def evaluate_similarity():
    """Average cosine similarity between retrieved and ground-truth answers.

    For every ground-truth question the retrieved answer and the expected
    answer are embedded with ``embed_model`` and compared with cosine
    similarity. Pairs in which either side is not a string (for example a
    missing value) are skipped with a message.

    Returns:
        The mean similarity as a float, or ``None`` when no valid pair was
        found. The value is also printed.
    """
    similarities = []

    for i, question in enumerate(ground_truth_questions):
        retrieved_answer = retrieve_faiss_answer(question)
        expected_answer = ground_truth_answers[i]

        # Handle missing or invalid answers on BOTH sides: a non-string
        # ground-truth answer cannot be encoded either.
        if not isinstance(retrieved_answer, str) or not isinstance(expected_answer, str):
            print(f"Skipping invalid answer at index {i}")
            continue

        # Compute similarity
        sim_score = util.pytorch_cos_sim(
            embed_model.encode(retrieved_answer, convert_to_tensor=True),
            embed_model.encode(expected_answer, convert_to_tensor=True)
        )

        similarities.append(sim_score.item())

    # Compute and display average similarity
    if not similarities:
        print("No valid similarities computed.")
        return None

    avg_similarity = float(np.mean(similarities))
    print(f"Average Semantic Similarity: {avg_similarity:.4f}")
    return avg_similarity

# Function to evaluate BLEU & ROUGE
def evaluate_nlg():
    """Score retrieved answers against ground-truth answers with BLEU and ROUGE.

    Every ground-truth question is answered via ``retrieve_faiss_answer``.
    Pairs in which either the retrieved or the expected answer is not a string
    (for example a missing value) are skipped with a message, since the
    metrics operate on text.

    Returns:
        A dict with keys ``"bleu"`` and ``"rouge"`` holding the raw results of
        the two metrics, or ``None`` when no valid pair was found. The scores
        are also printed.
    """
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")

    predictions = []
    references = []
    for i, question in enumerate(ground_truth_questions):
        retrieved_answer = retrieve_faiss_answer(question)
        expected_answer = ground_truth_answers[i]

        # Keep predictions and references aligned by dropping the whole pair
        # whenever either side is not usable text.
        if not isinstance(retrieved_answer, str) or not isinstance(expected_answer, str):
            print(f"Skipping invalid answer at index {i}")
            continue

        predictions.append(retrieved_answer)
        references.append(expected_answer)

    if not predictions:
        print("No valid answer pairs to score.")
        return None

    bleu_score = bleu.compute(predictions=predictions, references=references)
    rouge_score = rouge.compute(predictions=predictions, references=references)

    print(f"BLEU Score: {bleu_score['bleu']:.4f}")
    print(f"ROUGE Score: {rouge_score}")
    return {"bleu": bleu_score, "rouge": rouge_score}

In [None]:
# Execute each evaluation in turn: retrieval accuracy, semantic similarity, BLEU/ROUGE
for run_evaluation in (evaluate_retrieval, evaluate_similarity, evaluate_nlg):
    run_evaluation()