# Evaluating Retrieval Quality

## Creating synthetic ground truth

In [1]:
import sys
sys.path.append('..')

from utils import load_from_json, save_to_json, llm

In [None]:
# Read in all the parsed data from the civics guide. Each record is later
# consumed by the question-generation cells, which read its 'text', 'uuid'
# and 'page_no' entries.
input_file = '../documents/parsed_civics_guide.json'

dataset = load_from_json(input_file)

In [None]:
# Prompt template for synthetic query generation. The {text} placeholder is
# filled in per passage with str.format before the prompt is sent to the LLM.
# NOTE(review): the template asks for a bare JSON list, while the generation
# loop reads questions['queries'] -- presumably llm() wraps or re-shapes the
# response; confirm against utils.llm.
user_prompt = """
Based on the *text* provided below, write 5 realistic queries you might ask to find information that is clearly answerable by the text. 

Guidelines:
- Each query should be natural, specific, and complete.
- Avoid copying exact phrases or sentences from the text whenever possible.
- Do not make up information that is not supported by the text.
- Make the queries diverse (e.g., mix "what", "who", "when", "how", etc.).
- Keep queries concise but clear enough to retrieve the right passage.

text: {text}

Output valid JSON only (no code blocks, no extra text):

["query1", "query2", "query3", "query4", "query5"]
""".strip()

In [None]:
def generate_questions(data, model):
    """Ask the LLM for synthetic evaluation queries about one passage.

    Args:
        data: Mapping whose 'text' entry holds the passage to ask about.
        model: Model identifier forwarded unchanged to llm().

    Returns:
        Whatever llm() returns for the filled-in prompt.
    """
    # Substitute the passage into the shared prompt template
    filled_prompt = user_prompt.format(text=data['text'])

    return llm(
        system_prompt="You are a U.S. immigrant preparing for the civics test.",
        user_prompt=filled_prompt,
        model=model,
    )


In [None]:
# Model used to generate the synthetic questions
model = 'gpt-4o-mini'

# One record per generated question, tagged with the chunk it was generated from
golden_questions = []

for data in dataset:
    # Unique identifiers of the source chunk. Named doc_uuid rather than `id`
    # so the builtin id() is not shadowed for the rest of the notebook.
    doc_uuid = data['uuid']
    page = data['page_no']

    # Generate the set of questions for this chunk.
    # NOTE(review): the response is read through a 'queries' key although the
    # prompt requests a bare JSON list -- presumably llm() re-shapes it; confirm.
    questions = generate_questions(data, model)

    # Attach the chunk's identifiers to every generated question
    golden_questions.extend(
        {'uuid': doc_uuid, 'page': page, 'question': question}
        for question in questions['queries']
    )

In [None]:
# Save the golden questions as JSON so they can be reloaded later without
# regenerating them. The return value of save_to_json is bound to `_`
# (presumably to keep it out of the cell output).
output_file = '../documents/golden_data_questions.json'
_ = save_to_json(output_file,golden_questions)

## Evaluating retrieval

In [10]:
# Reload the golden questions from JSON if you don't want to rerun everything above
output_file = '../documents/golden_data_questions.json'
golden_questions = load_from_json(output_file)

Data loaded from ../documents/golden_data_questions.json


### Main Relevance matrix

In [11]:
import re
import json
from tqdm import tqdm
from utils import get_context

In [12]:
def get_relevance_matrix(
    golden_questions,
    limit=2,
    threshold=0.5,
    expansion=False,
    expansion_file="../documents/expansion_terms.json"
):
    """
    Build one relevance vector per golden question for hit-rate / MRR scoring.

    Every question is passed to get_context; the page numbers that appear in
    the returned context as "Page <n>:" markers are compared with the page the
    question was generated from.

    Args:
        golden_questions: List of golden question dictionaries, each with a
            'page' and a 'question' entry
        limit: Number of documents to retrieve
        threshold: Score threshold for retrieval
        expansion: Whether to use query expansion
        expansion_file: Path to expansion terms JSON file

    Returns:
        List with one list of booleans per question; each boolean tells
        whether the corresponding page marker found in the context equals the
        question's own page
    """
    # Read the expansion terms a single time up front instead of once per
    # query. A load failure is reported and leaves the terms as None.
    expansion_terms = None
    if expansion:
        try:
            expansion_terms = load_from_json(expansion_file)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Warning: Could not load expansion terms: {e}")

    # Pattern for the page markers embedded in the retrieved context
    page_marker = re.compile(r'Page (\d+):')

    def relevance_for(entry):
        """Return the relevance vector for a single golden question."""
        expected_page = entry['page']
        query = entry['question']

        # Query the retriever with the shared settings
        context = get_context(
            question=query,
            limit=limit,
            score_threshold=threshold,
            query_expansion=expansion,
            expansion_terms=expansion_terms
        )

        # Compare every page number found in the context with the expected page
        return [int(num) == expected_page for num in page_marker.findall(context)]

    return [relevance_for(entry) for entry in golden_questions]

### Hit rate and MRR functions

In [13]:
def hit_rate(relevance_total):
    """
    Compute the share of queries whose results include the expected document.

    Args:
        relevance_total: One list per query, holding a True/False flag for
                        each retrieved document (True = relevant)

    Returns:
        Fraction of queries with at least one True flag, as a float between
        0 and 1; 0.0 when relevance_total is empty

    Example:
        >>> hit_rate([[True, False], [False, False], [False, True]])
        0.6666666666666666
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    # A query counts as a hit as soon as any of its flags is True
    n_hits = sum(1 for flags in relevance_total if True in flags)

    return n_hits / n_queries


def mrr(relevance_total):
    """
    Calculate Mean Reciprocal Rank (MRR).

    For every query the score is 1 / rank of the first relevant document
    (ranks start at 1), or 0 when no retrieved document is relevant. The
    result is the mean of these scores over all queries.

    Args:
        relevance_total: List of relevance lists, where each inner list contains
                        True/False for whether each retrieved document is relevant

    Returns:
        MRR as a float between 0 and 1; 0.0 when relevance_total is empty

    Example:
        >>> mrr([[True, False], [False, True], [False, False]])
        0.5
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0

    for flags in relevance_total:
        for rank, is_relevant in enumerate(flags, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Reciprocal rank is defined by the first relevant hit only
                break

    return total_score / n_queries

### Evaluation strategy

In [14]:
def evaluate_strategy(golden_questions, strategy):
    """
    Score one retrieval configuration with hit rate and MRR.

    Args:
        golden_questions: Non-empty list of golden question dictionaries
        strategy: Mapping that may carry "limit", "threshold" and
            "expansion"; missing keys fall back to 2, 0.5 and False

    Returns:
        Dict with the keys "hit_rate" and "mrr"

    Raises:
        ValueError: If golden_questions is empty, limit is not positive, or
            threshold lies outside [0, 1]
    """
    # Guard clause: nothing to evaluate
    if not golden_questions:
        raise ValueError("golden_questions cannot be empty")

    # Pull the settings out of the strategy, applying the defaults
    settings = (("limit", 2), ("threshold", 0.5), ("expansion", False))
    limit, threshold, expansion = [strategy.get(key, default) for key, default in settings]

    # Reject out-of-range parameters before running any retrieval
    if limit <= 0:
        raise ValueError(f"limit must be positive, got {limit}")
    if not 0 <= threshold <= 1:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold}")

    # One relevance vector per question, then both metrics computed from it
    relevance_matrix = get_relevance_matrix(golden_questions, limit, threshold, expansion)

    return {
        "hit_rate": hit_rate(relevance_matrix),
        "mrr": mrr(relevance_matrix),
    }

In [15]:
# Build the grid of retrieval strategies to compare, as a dict keyed by name
import itertools

limits = [2, 3, 4]
thresholds = [0.3, 0.5, 0.7]
expansions = [False, True]

strategies = {}

# Cartesian product: every limit x threshold x expansion combination
for limit, threshold, expansion in itertools.product(limits, thresholds, expansions):
    # Readable key such as "L2_T0.5", with "_exp" appended when expansion is on
    exp_suffix = "" if not expansion else "_exp"
    strategy_name = f"L{limit}_T{threshold}{exp_suffix}"

    strategies[strategy_name] = {"limit": limit, "threshold": threshold, "expansion": expansion}

In [16]:
strategies

{'L2_T0.3': {'limit': 2, 'threshold': 0.3, 'expansion': False},
 'L2_T0.3_exp': {'limit': 2, 'threshold': 0.3, 'expansion': True},
 'L2_T0.5': {'limit': 2, 'threshold': 0.5, 'expansion': False},
 'L2_T0.5_exp': {'limit': 2, 'threshold': 0.5, 'expansion': True},
 'L2_T0.7': {'limit': 2, 'threshold': 0.7, 'expansion': False},
 'L2_T0.7_exp': {'limit': 2, 'threshold': 0.7, 'expansion': True},
 'L3_T0.3': {'limit': 3, 'threshold': 0.3, 'expansion': False},
 'L3_T0.3_exp': {'limit': 3, 'threshold': 0.3, 'expansion': True},
 'L3_T0.5': {'limit': 3, 'threshold': 0.5, 'expansion': False},
 'L3_T0.5_exp': {'limit': 3, 'threshold': 0.5, 'expansion': True},
 'L3_T0.7': {'limit': 3, 'threshold': 0.7, 'expansion': False},
 'L3_T0.7_exp': {'limit': 3, 'threshold': 0.7, 'expansion': True},
 'L4_T0.3': {'limit': 4, 'threshold': 0.3, 'expansion': False},
 'L4_T0.3_exp': {'limit': 4, 'threshold': 0.3, 'expansion': True},
 'L4_T0.5': {'limit': 4, 'threshold': 0.5, 'expansion': False},
 'L4_T0.5_exp': {'l

In [17]:
# Run every strategy over the golden questions and record its hit rate / MRR
results = []

for strategy_name, strategy_config in tqdm(strategies.items(), desc="Testing strategies"):
    print(f"Evaluating: {strategy_name}")

    # Score this configuration
    metrics = evaluate_strategy(golden_questions, strategy_config)

    # One flat record per strategy: its name, its settings, then its scores
    result = {
        "strategy_name": strategy_name,
        **{key: strategy_config[key] for key in ("limit", "threshold", "expansion")},
        **{key: metrics[key] for key in ("hit_rate", "mrr")},
    }

    results.append(result)

Testing strategies:   0%|          | 0/18 [00:00<?, ?it/s]

Evaluating: L2_T0.3


Testing strategies:   6%|▌         | 1/18 [00:44<12:34, 44.38s/it]

Evaluating: L2_T0.3_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  11%|█         | 2/18 [01:28<11:49, 44.32s/it]

Evaluating: L2_T0.5


Testing strategies:  17%|█▋        | 3/18 [02:13<11:08, 44.59s/it]

Evaluating: L2_T0.5_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  22%|██▏       | 4/18 [02:58<10:25, 44.70s/it]

Evaluating: L2_T0.7


Testing strategies:  28%|██▊       | 5/18 [03:43<09:43, 44.86s/it]

Evaluating: L2_T0.7_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  33%|███▎      | 6/18 [04:29<09:00, 45.06s/it]

Evaluating: L3_T0.3


Testing strategies:  39%|███▉      | 7/18 [05:13<08:13, 44.90s/it]

Evaluating: L3_T0.3_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  44%|████▍     | 8/18 [05:56<07:23, 44.34s/it]

Evaluating: L3_T0.5


Testing strategies:  50%|█████     | 9/18 [06:42<06:41, 44.64s/it]

Evaluating: L3_T0.5_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  56%|█████▌    | 10/18 [07:24<05:52, 44.02s/it]

Evaluating: L3_T0.7


Testing strategies:  61%|██████    | 11/18 [08:19<05:31, 47.40s/it]

Evaluating: L3_T0.7_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  67%|██████▋   | 12/18 [09:03<04:38, 46.45s/it]

Evaluating: L4_T0.3


Testing strategies:  72%|███████▏  | 13/18 [09:52<03:55, 47.04s/it]

Evaluating: L4_T0.3_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  78%|███████▊  | 14/18 [10:48<03:18, 49.67s/it]

Evaluating: L4_T0.5


Testing strategies:  83%|████████▎ | 15/18 [11:32<02:24, 48.21s/it]

Evaluating: L4_T0.5_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies:  89%|████████▉ | 16/18 [12:17<01:34, 47.18s/it]

Evaluating: L4_T0.7


Testing strategies:  94%|█████████▍| 17/18 [13:02<00:46, 46.49s/it]

Evaluating: L4_T0.7_exp
Data loaded from ../documents/expansion_terms.json


Testing strategies: 100%|██████████| 18/18 [13:47<00:00, 45.98s/it]


In [18]:
def _combined_score(entry):
    """Return the unweighted mean of a result entry's hit rate and MRR."""
    return (entry['hit_rate'] + entry['mrr']) / 2


def _print_best(title, entry, combined=None):
    """Print a banner followed by the settings and metrics of one result entry.

    Args:
        title: Heading printed between the two separator lines.
        entry: Result dict with 'strategy_name', 'limit', 'threshold',
            'expansion', 'hit_rate' and 'mrr'.
        combined: Optional combined score; printed as a last line when given.
    """
    print("\n" + "="*80)
    print(title)
    print("="*80)
    print(f"Strategy: {entry['strategy_name']}")
    print(f"  Limit: {entry['limit']}")
    print(f"  Threshold: {entry['threshold']}")
    print(f"  Expansion: {entry['expansion']}")
    print(f"  Hit Rate: {entry['hit_rate']:.4f}")
    print(f"  MRR: {entry['mrr']:.4f}")
    if combined is not None:
        print(f"  Combined Score: {combined:.4f}")


# Find best hit rate
best_hit_rate = max(results, key=lambda x: x['hit_rate'])

# Find best MRR
best_mrr = max(results, key=lambda x: x['mrr'])

# Find best combined (average of hit_rate and mrr); the formula lives in one
# place so the ranking key and the reported score cannot drift apart
best_combined = max(results, key=_combined_score)
combined_score = _combined_score(best_combined)

# Print one report section per ranking criterion
_print_best("BEST STRATEGY BY HIT RATE", best_hit_rate)
_print_best("BEST STRATEGY BY MRR", best_mrr)
_print_best("BEST STRATEGY BY COMBINED SCORE", best_combined, combined_score)


BEST STRATEGY BY HIT RATE
Strategy: L4_T0.3
  Limit: 4
  Threshold: 0.3
  Expansion: False
  Hit Rate: 0.9647
  MRR: 0.8172

BEST STRATEGY BY MRR
Strategy: L4_T0.3
  Limit: 4
  Threshold: 0.3
  Expansion: False
  Hit Rate: 0.9647
  MRR: 0.8172

BEST STRATEGY BY COMBINED SCORE
Strategy: L4_T0.3
  Limit: 4
  Threshold: 0.3
  Expansion: False
  Hit Rate: 0.9647
  MRR: 0.8172
  Combined Score: 0.8909
