In [18]:
import faiss
import pickle
import torch
import json
import pandas as pd
import numpy as np
from collections import Counter
from datasets import load_dataset
import re
import string


from config import *
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

print(f"Evaluating {RUN}")
print(f"Sampling subset (n={SUBSET_SIZE})" if USE_SUBSET else "Using full dataset")

Evaluating run_2
Using full dataset


In [19]:
# Load FAISS index and metadata for retrieval
print("> Loading FAISS index and metadata")
index = faiss.read_index(FAISS_INDEX_PATH)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
with open(METADATA_PATH, "rb") as f:
    metadata = pickle.load(f)
print(f"FAISS index loaded with {index.ntotal} vectors")
print()

# Load QA model and tokenizer
print("> Loading QA model")
device = 0 if torch.cuda.is_available() else -1
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device
)
print(f"QA model loaded on device: {'cuda' if device == 0 else 'cpu'}\n")

# Load datasets for evaluation
print("> Loading datasets")
train_dataset = load_dataset("json", data_files=TRAIN_DATA_PATH)["train"]
val_dataset = load_dataset("json", data_files=VAL_DATA_PATH)["train"]  
test_dataset = load_dataset("json", data_files=TEST_DATA_PATH)["train"]

print(f"Dataset sizes:")
print(f"  Train: {len(train_dataset)} examples")
print(f"  Validation: {len(val_dataset)} examples") 
print(f"  Test: {len(test_dataset)} examples")

> Loading FAISS index and metadata


Device set to use cuda:0


FAISS index loaded with 20260 vectors

> Loading QA model
QA model loaded on device: cuda

> Loading datasets
Dataset sizes:
  Train: 10207 examples
  Validation: 2187 examples
  Test: 2188 examples


In [20]:
# Evaluation utility functions
def normalize_answer(s):
    """Lowercase, remove punctuation, articles, and extra whitespace."""
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    s = ' '.join(s.split())
    return s

def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def evaluate_dataset(dataset, use_retrieval=False, max_examples=None, dataset_name="Dataset"):
    """
    Evaluate the model on a dataset.
    
    Args:
        dataset: HuggingFace dataset with questions, contexts, and answers
        use_retrieval: If True, use FAISS retrieval to get context. If False, use original context
        max_examples: Limit evaluation to this many examples (for faster testing)
        dataset_name: Name for logging purposes
    """
    if max_examples:
        dataset = dataset.select(range(min(max_examples, len(dataset))))
    
    print(f"\n=== Evaluating {dataset_name} ({len(dataset)} examples) ===")
    print(f"Using {'retrieval + QA' if use_retrieval else 'direct QA'} pipeline")
    
    exact_matches = []
    f1_scores = []
    predictions = []
    
    for i, example in enumerate(dataset):
        if i % 50 == 0:
            print(f"Processing example {i+1}/{len(dataset)}")
        
        question = example['question']
        gold_answers = example['answers']['text']
        
        if use_retrieval:
            # Use retrieval like in the API
            question_embedding = embedding_model.encode([question], convert_to_numpy=True)
            distances, indices = index.search(question_embedding, k=3)
            context = " ".join([metadata[j]['answer_chunk'] for j in indices[0]])
        else:
            # Use original context from dataset
            context = example['context']
        
        try:
            # Get prediction from QA pipeline
            result = qa_pipeline(question=question, context=context)
            predicted_answer = result['answer']
            confidence = result['score']
        except Exception as e:
            print(f"Error processing example {i}: {e}")
            predicted_answer = ""
            confidence = 0.0
        
        # Compute metrics against all gold answers (take max)
        max_exact = 0
        max_f1 = 0
        for gold_answer in gold_answers:
            exact = compute_exact(gold_answer, predicted_answer)
            f1 = compute_f1(gold_answer, predicted_answer)
            max_exact = max(max_exact, exact)
            max_f1 = max(max_f1, f1)
        
        exact_matches.append(max_exact)
        f1_scores.append(max_f1)
        predictions.append({
            'question': question,
            'predicted_answer': predicted_answer,
            'gold_answers': gold_answers,
            'confidence': confidence,
            'exact_match': max_exact,
            'f1_score': max_f1,
            'context_used': context[:200] + "..." if len(context) > 200 else context
        })
    
    # Compute final metrics
    avg_exact = np.mean(exact_matches) * 100
    avg_f1 = np.mean(f1_scores) * 100
    
    print(f"\n> Results for {dataset_name}:")
    print(f"  Exact Match: {avg_exact:.2f}%")
    print(f"  F1 Score: {avg_f1:.2f}%")
    print(f"  Total examples: {len(exact_matches)}\n")
    
    return {
        'exact_match': avg_exact,
        'f1_score': avg_f1,
        'predictions': predictions,
        'num_examples': len(exact_matches)
    }


In [21]:
# Run comprehensive evaluation on all datasets
print("> Starting comprehensive model evaluation")

# For faster testing, limit to smaller subsets initially
# Set max_examples=None to evaluate on full datasets
MAX_EXAMPLES = 100  # Adjust as needed

results = {}

# Evaluate on test set (most important)
print("\n" + "="*60)
print("EVALUATING ON TEST SET")
print("="*60)

# Direct QA evaluation (using original contexts)
results['test_direct'] = evaluate_dataset(
    test_dataset, 
    use_retrieval=False, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Test Set (Direct QA)"
)

# Retrieval + QA evaluation (like the API)
results['test_retrieval'] = evaluate_dataset(
    test_dataset, 
    use_retrieval=True, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Test Set (Retrieval + QA)"
)

> Starting comprehensive model evaluation

EVALUATING ON TEST SET

=== Evaluating Test Set (Direct QA) (100 examples) ===
Using direct QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Test Set (Direct QA):
  Exact Match: 3.00%
  F1 Score: 8.90%
  Total examples: 100


=== Evaluating Test Set (Retrieval + QA) (100 examples) ===
Using retrieval + QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Test Set (Retrieval + QA):
  Exact Match: 0.00%
  F1 Score: 5.78%
  Total examples: 100



In [22]:
# Evaluate on validation set
print("\n" + "="*60)
print("   EVALUATING ON VALIDATION SET")
print("="*60)

results['val_direct'] = evaluate_dataset(
    val_dataset, 
    use_retrieval=False, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Validation Set (Direct QA)"
)

results['val_retrieval'] = evaluate_dataset(
    val_dataset, 
    use_retrieval=True, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Validation Set (Retrieval + QA)"
)


   EVALUATING ON VALIDATION SET

=== Evaluating Validation Set (Direct QA) (100 examples) ===
Using direct QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Validation Set (Direct QA):
  Exact Match: 0.00%
  F1 Score: 5.57%
  Total examples: 100


=== Evaluating Validation Set (Retrieval + QA) (100 examples) ===
Using retrieval + QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Validation Set (Retrieval + QA):
  Exact Match: 0.00%
  F1 Score: 4.58%
  Total examples: 100



In [23]:
# Evaluate on training set (to check for overfitting)
print("\n" + "="*60)
print("   EVALUATING ON TRAINING SET")
print("="*60)

results['train_direct'] = evaluate_dataset(
    train_dataset, 
    use_retrieval=False, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Training Set (Direct QA)"
)

results['train_retrieval'] = evaluate_dataset(
    train_dataset, 
    use_retrieval=True, 
    max_examples=MAX_EXAMPLES,
    dataset_name="Training Set (Retrieval + QA)"
)


   EVALUATING ON TRAINING SET

=== Evaluating Training Set (Direct QA) (100 examples) ===
Using direct QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Training Set (Direct QA):
  Exact Match: 1.00%
  F1 Score: 9.72%
  Total examples: 100


=== Evaluating Training Set (Retrieval + QA) (100 examples) ===
Using retrieval + QA pipeline
Processing example 1/100
Processing example 51/100

> Results for Training Set (Retrieval + QA):
  Exact Match: 0.00%
  F1 Score: 5.31%
  Total examples: 100



In [24]:
# Summary of all results
print("\n" + "="*80)
print("   COMPREHENSIVE EVALUATION SUMMARY")
print("="*80)

# Create a results table
print(f"{'Dataset':<15} {'Method':<15} {'Exact Match':<12} {'F1 Score':<10} {'Examples':<10}")
print("-" * 70)

for key, result in results.items():
    dataset_name, method = key.split('_')
    dataset_name = dataset_name.capitalize()
    method = "Direct QA" if method == "direct" else "Retrieval+QA"
    
    print(f"{dataset_name:<15} {method:<15} {result['exact_match']:<12.2f}% {result['f1_score']:<10.2f}% {result['num_examples']:<10}")
# Check for overfitting
if 'train_direct' in results and 'test_direct' in results:
    train_f1 = results['train_direct']['f1_score']
    test_f1 = results['test_direct']['f1_score']
    overfitting_gap = train_f1 - test_f1
    print(f"\n> Overfitting Analysis (Direct QA):")
    print(f"   - Training F1: {train_f1:.2f}%")
    print(f"   - Test F1: {test_f1:.2f}%")
    print(f"   - Gap: {overfitting_gap:.2f}%")
    
    if overfitting_gap > 10:
        print("   - Significant overfitting detected!")
    elif overfitting_gap > 5:
        print("   - Moderate overfitting detected")
    else:
        print("   - Good generalization!")



   COMPREHENSIVE EVALUATION SUMMARY
Dataset         Method          Exact Match  F1 Score   Examples  
----------------------------------------------------------------------
Test            Direct QA       3.00        % 8.90      % 100       
Test            Retrieval+QA    0.00        % 5.78      % 100       
Val             Direct QA       0.00        % 5.57      % 100       
Val             Retrieval+QA    0.00        % 4.58      % 100       
Train           Direct QA       1.00        % 9.72      % 100       
Train           Retrieval+QA    0.00        % 5.31      % 100       

> Overfitting Analysis (Direct QA):
   - Training F1: 9.72%
   - Test F1: 8.90%
   - Gap: 0.83%
   - Good generalization!


In [25]:
# Show example predictions
print("\n" + "="*80)
print("   EXAMPLE PREDICTIONS")
print("="*80)

def show_examples(results_key, num_examples=5):
    if results_key not in results:
        print(f"No results found for {results_key}")
        return
    
    predictions = results[results_key]['predictions']
    dataset_name = results_key.replace('_', ' ').title()
    
    print(f"\n--- {dataset_name} Examples ---")
    
    # Show a mix of correct and incorrect predictions
    correct_preds = [p for p in predictions if p['exact_match'] == 1]
    incorrect_preds = [p for p in predictions if p['exact_match'] == 0]
    
    examples_to_show = []
    examples_to_show.extend(correct_preds[:num_examples//2])
    examples_to_show.extend(incorrect_preds[:num_examples//2])
    
    for i, pred in enumerate(examples_to_show[:num_examples]):
        status = "O - CORRECT" if pred['exact_match'] == 1 else "X - INCORRECT"
        print(f"\nExample {i+1} - {status}")
        print(f"Question: {pred['question']}")
        print(f"Gold Answer(s): {pred['gold_answers']}")
        print(f"Predicted: '{pred['predicted_answer']}'")
        print(f"Confidence: {pred['confidence']:.4f}")
        print(f"F1 Score: {pred['f1_score']:.2f}")
        print(f"Context: {pred['context_used']}")
        print("-" * 60)

# Show examples from test set
show_examples('test_direct', num_examples=3)
show_examples('test_retrieval', num_examples=3)



   EXAMPLE PREDICTIONS

--- Test Direct Examples ---

Example 1 - O - CORRECT
Question: How many people are affected by cystinuria ?
Gold Answer(s): ['Cystinuria affects approximately 1 in 10,000 people.']
Predicted: 'Cystinuria affects approximately 1 in 10,000 people.'
Confidence: 0.9999
F1 Score: 1.00
Context: Cystinuria affects approximately 1 in 10,000 people.
------------------------------------------------------------

Example 2 - X - INCORRECT
Question: Is Galactosialidosis inherited ?
Gold Answer(s): ['How is galactosialidosis inherited? Galactosialidosis is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. The parents of an individual with an autosomal recessive condition each carry one copy of the mutated gene, but they typically do not show signs and symptoms of the condition.']
Predicted: 'How is galactosialidosis inherited?'
Confidence: 0.0011
F1 Score: 0.16
Context: How is galactosialidosis inherited? Galactosi

In [26]:
# Save evaluation results
import json
from datetime import datetime

# Prepare results for saving (remove predictions to reduce file size)
results_summary = {}
for key, result in results.items():
    results_summary[key] = {
        'exact_match': result['exact_match'],
        'f1_score': result['f1_score'],
        'num_examples': result['num_examples']
    }

# Add metadata
evaluation_metadata = {
    'timestamp': datetime.now().isoformat(),
    'model_path': MODEL_PATH,
    'embedding_model': EMBEDDING_MODEL_NAME,
    'max_examples_per_dataset': MAX_EXAMPLES,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'results': results_summary
}

# Save to file
results_path = '../models/evaluation_results.json'
with open(results_path, 'w') as f:
    json.dump(evaluation_metadata, f, indent=2)

print(f"> Evaluation results saved to {results_path}")

# Final recommendations
print("\n" + "="*80)
print("   RECOMMENDATIONS")
print("="*80)

test_retrieval_f1 = results.get('test_retrieval', {}).get('f1_score', 0)
test_direct_f1 = results.get('test_direct', {}).get('f1_score', 0)

print(f"> Test Set Performance:")
print(f"   Direct QA F1: {test_direct_f1:.2f}%")
print(f"   Retrieval+QA F1: {test_retrieval_f1:.2f}%")

if test_direct_f1 > test_retrieval_f1:
    gap = test_direct_f1 - test_retrieval_f1
    print(f"\n> Retrieval Performance Gap: {gap:.2f}%")
    if gap > 10:
        print("   - Large gap suggests retrieval system needs improvement")
        print("   - Consider: Better embedding model, larger k, improved chunking")
    else:
        print("   - Retrieval system performing reasonably well")

print(f"\n> Overall Assessment:")
if test_retrieval_f1 > 70:
    print("   - Excellent performance - ready for production")
elif test_retrieval_f1 > 50:
    print("   - Good performance - consider fine-tuning")
elif test_retrieval_f1 > 30:
    print("   - Moderate performance - needs improvement")
else:
    print("   - Poor performance - requires significant work")


> Evaluation results saved to ../models/evaluation_results.json

   RECOMMENDATIONS
> Test Set Performance:
   Direct QA F1: 8.90%
   Retrieval+QA F1: 5.78%

> Retrieval Performance Gap: 3.12%
   - Retrieval system performing reasonably well

> Overall Assessment:
   - Poor performance - requires significant work
