# Official SQuAD Evaluation

In [None]:
"""Official evaluation script for SQuAD version 2.0.

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import sys

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
    """Compute Exact Match (EM) score."""
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
    """Compute F1 score."""
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def get_raw_scores(predictions, references):
    """
    Compute Exact Match (EM) and F1 scores for predictions and references.

    :param predictions: List of prediction dictionaries.
                        Each should have 'id' and 'prediction_text'.
    :param references: List of reference dictionaries.
                       Each should have 'id' and 'answers' (a dictionary with 'text' key containing a list of possible answers).
    :return: Two dictionaries: exact_scores and f1_scores.
    """
    exact_scores = {}
    f1_scores = {}

    # Create a reference lookup by ID
    reference_lookup = {ref['id']: ref['answers']['text'] for ref in references}

    for pred in predictions:
        qid = pred['id']
        a_pred = pred['prediction_text']

        # Get the gold answers for the current prediction
        gold_answers = reference_lookup.get(qid, [])
        if not gold_answers:
            # If no gold answers exist, the only correct answer is an empty string
            gold_answers = ['']

        # Compute the maximum scores for this prediction
        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)

    return exact_scores, f1_scores

def make_eval_dict(exact_scores, f1_scores):
    """
    Compute overall evaluation metrics (EM and F1).

    :param exact_scores: Dictionary of exact scores by question ID.
    :param f1_scores: Dictionary of F1 scores by question ID.
    :return: Dictionary containing exact, F1, and total metrics.
    """
    total = len(exact_scores)
    return collections.OrderedDict([
        ('exact', 100.0 * sum(exact_scores.values()) / total),
        ('f1', 100.0 * sum(f1_scores.values()) / total),
        ('total', total),
    ])

# Loading SQUAD

In [1]:
pip install farm-haystack transformers datasets

Collecting farm-haystack
  Downloading farm_haystack-1.26.4-py3-none-any.whl.metadata (31 kB)
Collecting boilerpy3 (from farm-haystack)
  Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting events (from farm-haystack)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack)
  Downloading lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting posthog (from farm-haystack)
  Downloading posthog-3.9.3-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting prompthub-py==4.0.0 (from farm-haystack)
  Downloading prompthub_py-4.0.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pydantic<2 (from farm-haystack)
  Downloading pydantic-1.10.21-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting quantulum3 (from farm-haystack)
  Downloading quantulum3-0.9.2-p

In [2]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from datasets import load_dataset

In [3]:
document_store = InMemoryDocumentStore(use_bm25=True)

In [4]:
def prepare_squad_documents(squad_data):
    documents = []
    for example in squad_data:
        documents.append({
            'content': example['context'],
            'meta': {'question': example['question'], 'answers': example['answers']['text']}
        })
    return documents

In [5]:
squad_train = load_dataset('squad', split='train')
squad_documents = prepare_squad_documents(squad_train)
document_store.write_documents(squad_documents)

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Updating BM25 representation...: 100%|██████████| 18891/18891 [00:00<00:00, 22157.90 docs/s]


BM25 refers to Okapi BM25, which is a ranking function used by search engines to score documents based on their relevance to a query.

In [6]:
retriever = BM25Retriever(document_store=document_store)

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [8]:
# Load a T5 model and tokenizer
model_name = "t5-large"  # Use "t5-base" or "t5-large" for better quality
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move the model to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
def t5_reader_predict(query, documents):
    """
    Use T5 to generate an answer from retrieved documents.
    :param query: The question string.
    :param documents: A list of documents (Haystack format).
    :return: A single answer generated by T5.
    """
    # Combine retrieved documents into a single context
    context = " ".join([doc.content for doc in documents])

    # Construct the T5 input
    t5_input = f"question: {query} context: {context}"

    # Tokenize the input
    inputs = tokenizer(t5_input, return_tensors="pt", truncation=True, max_length=512).to(device)

    # Generate the answer
    outputs = model.generate(inputs.input_ids, max_length=150, num_beams=3, temperature=1.0).to(device)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

In [10]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m128.6 kB/s[0m eta [36m0:00:00[0m:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


# Baseline RAG

In [11]:
squad_train.select(range(100))

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 100
})

In [12]:
# Run the RAG pipeline
predictions = []
references = []

for example in squad_train.select(range(100)):
    question = example['question']
    answers = example['answers']['text']

    # Retrieve relevant documents for the query
    retrieved_docs = retriever.retrieve(query=question, top_k=5)

    # Run the reader to generate an answer
    predicted_answer = t5_reader_predict(query=question, documents=retrieved_docs)

    # Store the prediction and reference for evaluation
    predictions.append({
        'prediction_text': predicted_answer,
        'id': example['id'],
        'no_answer_probability': 0.0  # Assuming no "no-answer" case for baseline
    })
    references.append({
        'answers': {'text': answers, 'answer_start': [0]},
        'id': example['id']
    })


In [None]:
# Compute raw scores
exact_scores, f1_scores = get_raw_scores(predictions, references)

# Compute overall metrics
evaluation_results = make_eval_dict(exact_scores, f1_scores)

# Print the results
print("Evaluation Results:")
print(f"Exact Match (EM): {evaluation_results['exact']:.2f}")
print(f"F1 Score: {evaluation_results['f1']:.2f}")
print(f"Total Questions Evaluated: {evaluation_results['total']}")

In [13]:
from evaluate import load
squad_v2_metric = load("squad_v2")
results = squad_v2_metric.compute(predictions=predictions, references=references)

print("Evaluation Results:", results)

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Evaluation Results: {'exact': 55.0, 'f1': 63.54920634920634, 'total': 100, 'HasAns_exact': 55.0, 'HasAns_f1': 63.54920634920634, 'HasAns_total': 100, 'best_exact': 55.0, 'best_exact_thresh': 0.0, 'best_f1': 63.54920634920634, 'best_f1_thresh': 0.0}


# RAG Fusion

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [16]:
def generate_query_variations_t5(original_query, num_variations=5):
    """
    Generates query variations using a pretrained T5 model.
    :param original_query: The original query string.
    :param num_variations: Number of query variations to generate.
    :return: List of query variations.
    """
    input_text = f"paraphrase: {original_query} </s>"
    inputs = tokenizer([input_text] * num_variations, return_tensors="pt", padding=True, truncation=True).to(device)

    outputs = model.generate(
        inputs.input_ids,
        max_length=50,
        num_beams=10,  # Use beam search for more diverse outputs
        num_return_sequences=num_variations,
        temperature=0.5
    ).to(device)

    variations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return variations

In [17]:
# Define reciprocal rank fusion (RRF) function
def reciprocal_rank_fusion(results, k=60):
    """
    Combines multiple ranked lists using Reciprocal Rank Fusion (RRF).
    :param results: List of ranked lists from different queries.
    :param k: Fusion parameter.
    :return: A single fused ranked list.
    """
    fused_scores = {}
    for ranked_list in results:
        for rank, (doc_id, score) in enumerate(ranked_list):
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1 / (k + rank + 1)
    # Sort by fused scores
    return sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)

In [18]:
predictions = []
references = []

for example in squad_train.select(range(100)):
    question = example['question']
    answers = example['answers']['text']

    # Generate query variations using T5
    query_variations = generate_query_variations_t5(question)

    # Retrieve documents for each variation
    all_retrieval_results = []
    for variation in query_variations:
        retrieved_docs = retriever.retrieve(query=variation, top_k=5)
        all_retrieval_results.append([(doc.id, doc.score) for doc in retrieved_docs])

    # Perform RRF
    fused_results = reciprocal_rank_fusion(all_retrieval_results)
    fused_doc_ids = [doc_id for doc_id, _ in fused_results[:5]]  # Top 5 fused documents

    # Fetch the fused documents
    fused_documents = [document_store.get_document_by_id(doc_id) for doc_id in fused_doc_ids]

    # Run T5 reader on the fused documents
    predicted_answer = t5_reader_predict(query=question, documents=fused_documents)

    # Store the prediction and reference for evaluation
    predictions.append({
        'prediction_text': predicted_answer,
        'id': example['id'],
        'no_answer_probability': 0.  # Assuming no "no answer" case for now
    })

    references.append({
        'answers': {'text': answers, 'answer_start': [0]},
        'id': example['id']
    })


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Compute raw scores
exact_scores, f1_scores = get_raw_scores(predictions, references)

# Compute overall metrics
evaluation_results = make_eval_dict(exact_scores, f1_scores)

# Print the results
print("Evaluation Results:")
print(f"Exact Match (EM): {evaluation_results['exact']:.2f}")
print(f"F1 Score: {evaluation_results['f1']:.2f}")
print(f"Total Questions Evaluated: {evaluation_results['total']}")

In [19]:
# Compute evaluation metrics
results = squad_v2_metric.compute(predictions=predictions, references=references)

print("Evaluation Results:", results)

Evaluation Results: {'exact': 52.0, 'f1': 59.232539682539674, 'total': 100, 'HasAns_exact': 52.0, 'HasAns_f1': 59.232539682539674, 'HasAns_total': 100, 'best_exact': 52.0, 'best_exact_thresh': 0.0, 'best_f1': 59.232539682539674, 'best_f1_thresh': 0.0}
