In [8]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import mistralai
import os
import pandas as pd
from tqdm import tqdm
import random

def load_ground_truth(file_path: str):
    """Build a question -> document lookup from the ground-truth workbook.

    The Excel file at ``file_path`` is read with pandas and must expose a
    ``question`` column and a ``document`` column. The two columns are paired
    row by row; if a question occurs more than once, the last row wins.

    Returns:
        A dict keyed by question whose values are the matching ``document``
        entries.
    """
    frame = pd.read_excel(file_path)

    # Walk both columns in lockstep and fill the lookup table.
    question_to_document = {}
    for question, document in zip(frame['question'], frame['document']):
        question_to_document[question] = document
    return question_to_document

def load_faq_data(file_path: str):
    """Load the FAQ JSON file and index every answer by its question ID.

    The file must contain a top-level ``faq_data`` list of categories; each
    category holds a ``questions`` list whose entries carry ``id`` and
    ``answer`` keys.

    Args:
        file_path: Path to the FAQ JSON file.

    Returns:
        A dict mapping each question ``id`` to its ``answer``. If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        KeyError: If ``faq_data``, ``questions``, ``id`` or ``answer`` is
            missing from the file's structure.
    """
    # Pin the encoding: without it, open() falls back to the platform locale
    # and non-ASCII FAQ text can be mis-decoded (or fail) on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Flatten categories -> questions into a single id -> answer mapping.
    return {
        qa['id']: qa['answer']
        for category in data['faq_data']
        for qa in category['questions']
    }

def generate_rag_answer(question: str, context: str, client) -> str:
    """Ask the Mistral chat model to answer ``question`` using ``context``.

    The context is embedded in the system message and the question is sent as
    the user message. ``client`` must expose ``chat.complete(...)``.

    Returns:
        The content of the first returned choice, with surrounding whitespace
        stripped.
    """
    # Spelled out piece by piece so the embedded indentation and trailing
    # whitespace of the prompt are explicit.
    system_template = (
        "You are a helpful customer service representative at NomadFoods company. \n"
        "    Use the following context to answer the question. Be natural and conversational while being informative.\n"
        "    \n"
        "    Context: {context}\n"
        "    "
    )

    chat_messages = [
        {"role": "system", "content": system_template.format(context=context)},
        {
            "role": "user",
            "content": f"Answer this question in a friendly, conversational way: {question}",
        },
    ]

    completion = client.chat.complete(
        model="mistral-large-latest",
        messages=chat_messages,
        max_tokens=500,
        temperature=0.7,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def compute_cosine_similarity(answer1: str, answer2: str, model) -> float:
    """Return the cosine similarity between the embeddings of two answers.

    Each answer is encoded with ``model.encode`` and reshaped into a single-row
    matrix, which is the 2-D layout sklearn's ``cosine_similarity`` expects.
    The only entry of the resulting 1x1 similarity matrix is returned.
    """
    row_vectors = [
        model.encode(text).reshape(1, -1)
        for text in (answer1, answer2)
    ]
    similarity_matrix = cosine_similarity(*row_vectors)
    return similarity_matrix[0][0]

def llm_as_judge(question: str,
                 generated_answer: str,
                 ground_truth_answer: str,
                 client) -> dict:
    """Score a generated answer against the ground truth with an LLM judge.

    The judge model is asked to rate relevance, completeness, accuracy and
    consistency on a 0-1 scale, and its free-text reply is parsed into a dict.

    Args:
        question: The question that was asked.
        generated_answer: The answer under evaluation.
        ground_truth_answer: The reference answer.
        client: Object exposing ``chat.complete(...)``.

    Returns:
        A dict with lowercase metric names (``relevance``, ``completeness``,
        ``accuracy``, ``consistency``) mapped to float scores. A metric the
        judge did not report is omitted.

    Raises:
        ValueError: If no score at all could be parsed from the reply.
    """
    prompt = f"""Rate the following generated answer compared to the ground truth answer for the given question.
    
    Question: {question}
    
    Generated Answer: {generated_answer}
    
    Ground Truth Answer: {ground_truth_answer}
    
    Please evaluate on the following criteria and provide numerical scores between 0 and 1:
    1. Relevance: How relevant is the generated answer to the question?
    2. Completeness: How complete is the generated answer compared to ground truth?
    3. Accuracy: How accurate is the information in the generated answer?
    4. Consistency: How consistent is the generated answer with the ground truth?
    
    Provide your evaluation in the following format exactly:
    Relevance: [score]
    Completeness: [score]
    Accuracy: [score]
    Consistency: [score]
    """

    # NOTE(review): the judge samples at temperature 0.7, so scores for the
    # same inputs can differ between runs.
    response = client.chat.complete(
        model="ministral-3b-latest",
        messages=[
            {"role": "system", "content": "You are an expert evaluator of question-answering systems."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0.7
    )

    return _parse_judge_scores(response.choices[0].message.content)


def _parse_judge_scores(reply_text: str) -> dict:
    """Extract the known metric scores from the judge's free-text reply."""
    import re  # local import: the module does not import ``re`` at top level

    # Models rarely follow the format exactly: tolerate list numbering,
    # markdown emphasis, brackets around the number, blank lines and trailing
    # commentary, and ignore any line that is not one of the four metrics.
    score_pattern = re.compile(
        r"\b(relevance|completeness|accuracy|consistency)\b(?:\s+score)?"
        r"\W*?[:=]\W*?(\d+(?:\.\d+)?|\.\d+)",
        re.IGNORECASE,
    )

    scores = {}
    for match in score_pattern.finditer(reply_text or ""):
        # A later mention overrides an earlier one, as a plain dict write does.
        scores[match.group(1).lower()] = float(match.group(2))

    if not scores:
        raise ValueError(
            f"Could not parse any scores from judge reply: {reply_text!r}"
        )
    return scores

def evaluate_rag_generation(ground_truth_path: str,
                            faq_data_path: str,
                            client,
                            num_samples: int = None) -> dict:
    """Evaluate answer generation against the ground-truth FAQ answers.

    For each evaluated question the ground-truth answer is looked up, handed
    to the generator as its context, and the generated answer is then scored
    by embedding cosine similarity and by the LLM judge.

    Args:
        ground_truth_path: Excel file mapping questions to document IDs.
        faq_data_path: JSON file mapping document IDs to answers.
        client: Chat client passed through to generation and judging.
        num_samples: If set (and smaller than the number of questions), only
            that many randomly chosen questions are evaluated; ``None`` or 0
            evaluates all of them.

    Returns:
        A dict with ``average_cosine_similarity``, ``llm_average_scores``
        (metric -> mean score) and ``detailed_results`` (a DataFrame with one
        row per question). Averages are NaN when nothing was evaluated.

    Raises:
        KeyError: If a ground-truth document ID is absent from the FAQ data.
    """
    print("Loading embedding model...")
    embedding_model = SentenceTransformer('all-mpnet-base-v2')

    print("Loading ground truth and FAQ data...")
    ground_truth_data = load_ground_truth(ground_truth_path)
    faq_data = load_faq_data(faq_data_path)

    # Get questions and sample if specified
    questions = list(ground_truth_data)
    if num_samples and num_samples < len(questions):
        questions = random.sample(questions, num_samples)

    cosine_similarities = []
    llm_scores_by_metric = {
        'relevance': [],
        'completeness': [],
        'accuracy': [],
        'consistency': [],
    }
    question_level_results = []

    # tqdm advances the bar as the loop consumes the iterable.
    for question in tqdm(questions, desc="Evaluating questions"):
        doc_id = ground_truth_data[question]
        try:
            ground_truth_answer = faq_data[doc_id]
        except KeyError:
            # Same exception type as a bare lookup, but says which row is bad.
            raise KeyError(
                f"Document ID {doc_id!r} for question {question!r} "
                "not found in FAQ data"
            ) from None

        # The ground-truth answer doubles as the retrieval context here.
        generated_answer = generate_rag_answer(question, ground_truth_answer, client)

        similarity = compute_cosine_similarity(
            generated_answer,
            ground_truth_answer,
            embedding_model,
        )
        cosine_similarities.append(similarity)

        llm_scores = llm_as_judge(
            question,
            generated_answer,
            ground_truth_answer,
            client,
        )

        question_level_results.append({
            'question': question,
            'document_id': doc_id,
            'generated_answer': generated_answer,
            'ground_truth_answer': ground_truth_answer,
            'cosine_similarity': similarity,
            **llm_scores,
        })

        # setdefault: an unexpected metric label from the judge must not
        # abort the run with a KeyError after the API calls were already made.
        for metric, score in llm_scores.items():
            llm_scores_by_metric.setdefault(metric, []).append(score)

    return {
        'average_cosine_similarity': _mean_or_nan(cosine_similarities),
        'llm_average_scores': {
            metric: _mean_or_nan(scores)
            for metric, scores in llm_scores_by_metric.items()
        },
        'detailed_results': pd.DataFrame(question_level_results),
    }


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN (without a numpy warning) if empty."""
    return np.mean(values) if values else float('nan')

def main():
    """Run the generation evaluation, print a summary and save the details.

    Reads the Mistral API key from the ``MISTRAL_API_KEY`` environment
    variable, evaluates the sampled questions, prints the average scores and
    writes the per-question results to an Excel file.

    Raises:
        RuntimeError: If ``MISTRAL_API_KEY`` is unset or empty.
    """
    # Fail fast: without a key the run would otherwise only break later, on
    # the first API request, after the embedding model has been loaded.
    api_key = os.getenv('MISTRAL_API_KEY')
    if not api_key:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set")

    client = mistralai.Mistral(api_key=api_key)

    results = evaluate_rag_generation(
        ground_truth_path='ground-truth-data.xlsx',
        faq_data_path='faq_data_with_ids.json',
        client=client,
        # Only one random question is evaluated to stay under the API rate
        # limit (SDKError: "Requests rate limit exceeded"); increase this in
        # production environments.
        num_samples=1,
    )

    # Print overall results
    print("\nEvaluation Results:")
    print(f"Average Cosine Similarity: {results['average_cosine_similarity']:.3f}")
    print("\nLLM-As-A-Judge Evaluation Scores:")
    for metric, score in results['llm_average_scores'].items():
        print(f"{metric.capitalize()}: {score:.3f}")

    # Save detailed results to Excel
    excel_path = 'rag_generation_evaluation_results.xlsx'
    results['detailed_results'].to_excel(excel_path, index=False)
    print(f"\nDetailed results saved to '{excel_path}'")

# Run the evaluation only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading embedding model...
Loading ground truth and FAQ data...


Evaluating questions: 100%|██████████| 1/1 [00:12<00:00, 12.12s/it]


Evaluation Results:
Average Cosine Similarity: 0.845

LLM-As-A-Judge Evaluation Scores:
Relevance: 0.900
Completeness: 0.800
Accuracy: 0.800
Consistency: 0.800

Detailed results saved to 'rag_generation_evaluation_results.xlsx'



