In [1]:
pip install -U deepeval


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
# DeepEval LLM Evaluation - No API Keys Required
# First install: pip install deepeval

import pandas as pd
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,
    ContextualRecallMetric,
    ContextualPrecisionMetric,
    HallucinationMetric,
    BiasMetric,
    ToxicityMetric
)
from typing import List
import json

class LLMEvaluator:
    def __init__(self):
        """Initialize the evaluator with free metrics (no API keys needed)"""
        # These metrics work without API keys
        self.metrics = [
            AnswerRelevancyMetric(threshold=0.7),
            FaithfulnessMetric(threshold=0.7),
            ContextualRelevancyMetric(threshold=0.7),
            ContextualRecallMetric(threshold=0.7),
            ContextualPrecisionMetric(threshold=0.7),
            HallucinationMetric(threshold=0.3),
            BiasMetric(threshold=0.5),
            ToxicityMetric(threshold=0.5)
        ]
    
    def create_test_case(self, 
                        input_text: str, 
                        actual_output: str, 
                        expected_output: str = None,
                        retrieval_context: List[str] = None) -> LLMTestCase:
        """Create a test case for evaluation"""
        return LLMTestCase(
            input=input_text,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context
        )
    
    def evaluate_single_response(self, test_case: LLMTestCase) -> dict:
        """Evaluate a single response"""
        results = {}
        
        for metric in self.metrics:
            try:
                score = metric.measure(test_case)
                results[metric.__class__.__name__] = {
                    'score': score,
                    'passed': metric.is_successful(),
                    'reason': getattr(metric, 'reason', 'N/A')
                }
            except Exception as e:
                results[metric.__class__.__name__] = {
                    'score': None,
                    'passed': False,
                    'reason': f'Error: {str(e)}'
                }
        
        return results
    
    def evaluate_batch(self, test_cases: List[LLMTestCase]) -> pd.DataFrame:
        """Evaluate multiple test cases"""
        all_results = []
        
        for i, test_case in enumerate(test_cases):
            print(f"Evaluating test case {i+1}/{len(test_cases)}...")
            
            result = self.evaluate_single_response(test_case)
            result['test_case_id'] = i
            result['input'] = test_case.input
            result['actual_output'] = test_case.actual_output
            
            all_results.append(result)
        
        return self.format_results(all_results)
    
    def format_results(self, results: List[dict]) -> pd.DataFrame:
        """Format results into a readable DataFrame"""
        formatted_data = []
        
        for result in results:
            row = {
                'test_case_id': result['test_case_id'],
                'input': result['input'][:100] + '...' if len(result['input']) > 100 else result['input'],
                'actual_output': result['actual_output'][:100] + '...' if len(result['actual_output']) > 100 else result['actual_output']
            }
            
            # Add metric scores
            for metric_name, metric_result in result.items():
                if metric_name not in ['test_case_id', 'input', 'actual_output']:
                    row[f'{metric_name}_score'] = metric_result.get('score')
                    row[f'{metric_name}_passed'] = metric_result.get('passed')
        
        return pd.DataFrame(formatted_data)
    
    def save_results(self, results_df: pd.DataFrame, filename: str = 'evaluation_results.csv'):
        """Save results to CSV"""
        results_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")
    
    def generate_report(self, results_df: pd.DataFrame) -> dict:
        """Generate evaluation report"""
        report = {
            'total_test_cases': len(results_df),
            'metrics_summary': {}
        }
        
        # Calculate average scores and pass rates for each metric
        metric_columns = [col for col in results_df.columns if col.endswith('_score')]
        
        for col in metric_columns:
            metric_name = col.replace('_score', '')
            pass_col = f'{metric_name}_passed'
            
            scores = results_df[col].dropna()
            passes = results_df[pass_col].sum() if pass_col in results_df.columns else 0
            
            report['metrics_summary'][metric_name] = {
                'avg_score': scores.mean() if len(scores) > 0 else 0,
                'min_score': scores.min() if len(scores) > 0 else 0,
                'max_score': scores.max() if len(scores) > 0 else 0,
                'pass_rate': (passes / len(results_df)) * 100 if len(results_df) > 0 else 0,
                'total_evaluated': len(scores)
            }
        
        return report

# Example usage and sample data
def main():
    # Initialize evaluator
    evaluator = LLMEvaluator()
    
    # Sample test cases (replace with your actual data)
    sample_data = [
        {
            'input': 'What is the capital of France?',
            'actual_output': 'The capital of France is Paris. Paris is located in the north-central part of France and is known for its rich history, culture, and landmarks like the Eiffel Tower.',
            'expected_output': 'Paris',
            'context': ['France is a country in Western Europe.', 'Paris is the capital and largest city of France.']
        },
        {
            'input': 'Explain photosynthesis',
            'actual_output': 'Photosynthesis is the process by which plants convert sunlight, carbon dioxide, and water into glucose and oxygen. This process occurs in the chloroplasts of plant cells.',
            'expected_output': 'Photosynthesis is a process used by plants to convert light energy into chemical energy.',
            'context': ['Plants use chlorophyll to capture light energy.', 'The photosynthesis equation is 6CO2 + 6H2O + light energy → C6H12O6 + 6O2.']
        },
        {
            'input': 'What is machine learning?',
            'actual_output': 'Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed for every task.',
            'expected_output': 'Machine learning is a method of data analysis that automates analytical model building.',
            'context': ['Machine learning is part of AI.', 'It uses algorithms to find patterns in data.']
        }
    ]
    
    # Create test cases
    test_cases = []
    for data in sample_data:
        test_case = evaluator.create_test_case(
            input_text=data['input'],
            actual_output=data['actual_output'],
            expected_output=data['expected_output'],
            retrieval_context=data['context']
        )
        test_cases.append(test_case)
    
    # Run evaluation
    print("Starting evaluation...")
    results_df = evaluator.evaluate_batch(test_cases)
    
    # Display results
    print("\n" + "="*50)
    print("EVALUATION RESULTS")
    print("="*50)
    print(results_df.to_string(index=False))
    
    # Generate and display report
    report = evaluator.generate_report(results_df)
    print("\n" + "="*50)
    print("EVALUATION REPORT")
    print("="*50)
    print(json.dumps(report, indent=2))
    
    # Save results
    evaluator.save_results(results_df)
    
    return results_df, report

# Custom evaluation function for your specific data
def evaluate_your_data(input_texts: List[str], 
                      actual_outputs: List[str], 
                      expected_outputs: List[str] = None,
                      contexts: List[List[str]] = None):
    """
    Evaluate your own LLM outputs
    
    Args:
        input_texts: List of input prompts
        actual_outputs: List of LLM responses to evaluate
        expected_outputs: List of expected/reference outputs (optional)
        contexts: List of retrieval contexts for each input (optional)
    """
    
    evaluator = LLMEvaluator()
    test_cases = []
    
    for i in range(len(input_texts)):
        expected = expected_outputs[i] if expected_outputs and i < len(expected_outputs) else None
        context = contexts[i] if contexts and i < len(contexts) else None
        
        test_case = evaluator.create_test_case(
            input_text=input_texts[i],
            actual_output=actual_outputs[i],
            expected_output=expected,
            retrieval_context=context
        )
        test_cases.append(test_case)
    
    # Evaluate
    results_df = evaluator.evaluate_batch(test_cases)
    report = evaluator.generate_report(results_df)
    
    return results_df, report

# Load data from CSV (if you have your data in a file)
def evaluate_from_csv(csv_file_path: str, 
                     input_column: str = 'input',
                     output_column: str = 'actual_output',
                     expected_column: str = 'expected_output',
                     context_column: str = 'context'):
    """
    Evaluate LLM outputs from a CSV file
    """
    df = pd.read_csv(csv_file_path)
    
    input_texts = df[input_column].tolist()
    actual_outputs = df[output_column].tolist()
    expected_outputs = df[expected_column].tolist() if expected_column in df.columns else None
    
    # Handle context column (assuming it's JSON string of list)
    contexts = None
    if context_column in df.columns:
        contexts = []
        for context_str in df[context_column]:
            try:
                context_list = json.loads(context_str) if pd.notna(context_str) else None
                contexts.append(context_list)
            except:
                contexts.append(None)
    
    return evaluate_your_data(input_texts, actual_outputs, expected_outputs, contexts)

if __name__ == "__main__":
    # Run the main example
    results, report = main()
    
    # Example of evaluating your own data
    # your_inputs = ["Your input 1", "Your input 2"]
    # your_outputs = ["Your LLM output 1", "Your LLM output 2"]
    # results, report = evaluate_your_data(your_inputs, your_outputs)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [3]:
from deepeval_evaluator import evaluate_your_data

inputs = ["What is AI?"]
outputs = ["AI stands for Artificial Intelligence..."]
results, report = evaluate_your_data(inputs, outputs)

ModuleNotFoundError: No module named 'deepeval_evaluator'