<a href="https://colab.research.google.com/github/aaronbergfeld/w266-final-project/blob/main/Evaluate_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from accelerate import Accelerator
from datasets import load_dataset, Dataset

# Load Model

In [None]:
# Mount Google Drive at /content/drive; the prediction and score paths
# configured later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run configuration: these values select which predictions file is evaluated.
# They are only used here, to build the run directory below.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_type = "finetuned"
document_type = "none"
sanitized = False
# NOTE(review): not referenced by any other cell visible in this notebook.
use_peft = model_type == "finetuned"

# Predictions and scores share one run directory:
#   <data root>/<model_name>/<model_type>/<document_type>[/sanitized | /not_sanitized]
# The sanitized/not_sanitized level exists only when a document type is set.
_run_dir_parts = [
    "/content/drive/MyDrive/w266 Final Project/data",
    model_name,
    model_type,
    document_type,
]
if document_type != "none":
    _run_dir_parts.append("sanitized" if sanitized else "not_sanitized")
_run_dir = "/".join(_run_dir_parts)

input_path = _run_dir + "/nq_predictions.jsonl"
output_path = _run_dir + "/nq_score.jsonl"

NQ_PREDICTIONS_PATH = input_path
NQ_SCORE_PATH = output_path

In [None]:
import torch
from transformers import AutoTokenizer, pipeline

# Configuration
# Checkpoint loaded here as the judge. It is separate from `model_name`, which
# is only used to build the predictions/score paths.
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer: pad on the left and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Judge weights in float16, with device placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# NOTE: this rebinds the name `pipeline` from the transformers factory function
# to the constructed text-generation pipeline object. Later cells call
# `pipeline(...)` and expect that object.
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False,  # return only the newly generated text
    max_new_tokens=16,
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


# Prompt Format

In [None]:
# System message sent to the judge model with every example: it describes the
# three inputs (question, gold answers, predicted answer) and asks for a
# one-word Yes/No verdict.
SYSTEM_PROMPT = """You are a strict, binary QA judge. Each turn you will receive three pieces of information from the user:
  1. Question: a natural-language query.
  2. Gold answers: a JSON array of one or more strings (e.g. ["Paris","City of Light"]).
  3. Predicted answer: a free-form string.

Your **only** output must be exactly one word, with no punctuation or extra text:
  • “Yes” – if the Predicted answer is an exact match or semantically equivalent to any entry in Gold answers.
  • “No”  – otherwise.

Do not emit any rationale, examples, or additional commentary—only “Yes” or “No.”"""

# Per-example user message. The three positional placeholders are filled, in
# order, with the question, the gold answers and the predicted answer.
USER_PROMPT = """Question: {}
Gold answers: {}
Predicted answer: {}"""

# Number of rows passed to each batched `dataset.map` call in this cell.
BATCH_SIZE = 32

# Load the predictions JSONL file, requesting the "train" split of the json loader.
dataset = load_dataset("json", data_files=NQ_PREDICTIONS_PATH, split="train")

def attach_messages(batch):
    """Add a ``messages`` column holding one judge conversation per row.

    Args:
        batch: Batched mapping with parallel lists under "question",
            "answer" and "prediction".

    Returns:
        The same ``batch`` object with a new "messages" entry. Each element
        is a two-message chat: the system prompt followed by the user prompt
        filled with that row's question, gold answers and prediction.
    """
    import json  # local import so this cell does not rely on a top-level import

    def _gold_as_json(gold):
        # SYSTEM_PROMPT tells the judge the gold answers arrive as a JSON
        # array. Formatting the Python list directly would emit its repr
        # (single quotes), so serialise it explicitly. If the value is not
        # JSON-serialisable, fall back to the previous str() formatting.
        try:
            return json.dumps(gold, ensure_ascii=False)
        except (TypeError, ValueError):
            return gold

    batch["messages"] = [
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": USER_PROMPT.format(q, _gold_as_json(a), r)}
        ]
        for q, a, r in zip(batch["question"], batch["answer"], batch["prediction"])
    ]
    return batch

# 4. Batched generation function
def generate_batch(batch):
    """Ask the judge for a verdict on every conversation in ``batch``.

    Args:
        batch: Batched mapping whose "messages" entry is a list of chat
            conversations (as produced by ``attach_messages``).

    Returns:
        The same ``batch`` object with a new "correct" entry: one bool per
        row, True only when the first line of the judge's stripped output is
        exactly "Yes".
    """
    conversations = batch['messages']
    outputs = pipeline(
        conversations,
        # Without an explicit batch_size the pipeline processes the list one
        # item at a time (its default is 1). Forward the whole map batch so
        # the GPU sees a real batch; the tokenizer is already set up for
        # left padding.
        batch_size=max(len(conversations), 1),
        max_new_tokens=32,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Keep only the first line of the first generated sequence as the verdict.
    batch["correct"] = [
        out[0]["generated_text"].strip().split("\n")[0] == "Yes"
        for out in outputs
    ]
    return batch

# Build the judge conversation for every row.
dataset = dataset.map(attach_messages, batched=True, batch_size=BATCH_SIZE)

# 5. Run the judge over the dataset in batches
#    - batched=True passes lists of rows to generate_batch
#    - batch_size sets how many rows are sent at once (tune to fit GPU memory)
#    Every existing column except the three below is dropped from the result;
#    the "correct" column added by generate_batch is kept.
kept_columns = ("question", "answer", "prediction")
dropped_columns = [name for name in dataset.column_names if name not in kept_columns]
dataset = dataset.map(
    generate_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=dropped_columns,
)

# 6. Save to JSONL
dataset.to_json(NQ_SCORE_PATH, orient="records", lines=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]



Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

579725

In [None]:
import pandas as pd

# Reload the judged rows and print the fraction the judge marked as correct.
nq_score = pd.read_json(NQ_SCORE_PATH, lines=True)

judged_correct = nq_score[nq_score["correct"] == True]
print(len(judged_correct) / len(nq_score))

0.35225


In [None]:
# Clone google-research/language and install it in editable mode without
# resolving its dependencies, so that the module
# `language.orqa.evaluation.evaluate_predictions` can be run in a later cell.
!git clone https://github.com/google-research/language.git
!pip install -e language --no-deps

Cloning into 'language'...
remote: Enumerating objects: 4043, done.[K
remote: Counting objects: 100% (444/444), done.[K
remote: Compressing objects: 100% (207/207), done.[K
remote: Total 4043 (delta 331), reused 237 (delta 237), pack-reused 3599 (from 3)[K
Receiving objects: 100% (4043/4043), 6.38 MiB | 18.62 MiB/s, done.
Resolving deltas: 100% (2285/2285), done.
Obtaining file:///content/language
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: language
  Running setup.py develop for language
Successfully installed language-0.0.1.dev0


In [None]:
# Split the predictions file into the two JSONL files written under /content:
# one pairing each question with its prediction, one pairing it with the gold answers.
nq_ds = pd.read_json(NQ_PREDICTIONS_PATH, lines=True)

nq_predictions = nq_ds[["question", "prediction"]]
nq_predictions.to_json("/content/nq_predictions.jsonl", orient='records', lines=True)

nq_references = nq_ds[["question", "answer"]]
nq_references.to_json("/content/nq_references.jsonl", orient='records', lines=True)

In [None]:
# Run the evaluate_predictions module from the cloned `language` package on the
# reference/prediction files, reading gold answers from the "answer" field and
# with regex matching disabled.
# NOTE(review): the two paths are relative, while the files were written to
# /content/ — this assumes the working directory is /content.
!python -m language.orqa.evaluation.evaluate_predictions \
  --references_path=nq_references.jsonl \
  --predictions_path=nq_predictions.jsonl \
  --is_regex=False \
  --answer_field=answer


2025-08-03 16:03:29.952414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754237009.972374    3865 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754237009.978396    3865 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Found 4000 references in nq_references.jsonl
Found 4000 predictions in nq_predictions.jsonl
Found 0 missing predictions.
Accuracy: 0.1845 (738/4000)


In [None]:
# Split the judged rows by verdict.
correct_predictions = nq_score[nq_score["correct"] == True]
incorrect_predictions = nq_score[nq_score["correct"] == False]

# Inspect a random handful of rows the judge rejected. The sample gets its own
# name so `correct_predictions` keeps holding the rows judged correct, and the
# sample size is capped so fewer than 10 incorrect rows does not raise.
sampled_predictions = incorrect_predictions.sample(min(10, len(incorrect_predictions)))

for q, a, r, c in zip(sampled_predictions["question"], sampled_predictions["answer"], sampled_predictions["prediction"], sampled_predictions["correct"]):
    print(f"Question: {q}\n")
    print(f"Gold answers: {a}\n")
    print(f"Predicted answer: {r}\n")
    print(f"Correct: {c}\n")
    print("-" * 80)


Question: when did the song you can ring my bell come out

Gold answers: ['1979']

Predicted answer: 1963

Correct: False

--------------------------------------------------------------------------------
Question: who plays earl on my name is earl

Gold answers: ['Jason Michael Lee']

Predicted answer: Jason Bateman

Correct: False

--------------------------------------------------------------------------------
Question: when do we vote for governor in wisconsin

Gold answers: ['November 6, 2018']

Predicted answer: April

Correct: False

--------------------------------------------------------------------------------
Question: jaya real name of sajan re phir jhoot mat bolo

Gold answers: ['Parvati Vaze']

Predicted answer: Shahid Kapoor

Correct: False

--------------------------------------------------------------------------------
Question: when was the first fast and furious movie released

Gold answers: ['2001']

Predicted answer: 1994

Correct: False

---------------------------

In [None]:
!pip install evaluate datasets nltk rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e15ad1f4adf78f968cd14b580e929bcbba6c1077116f423d727f2f8f341c9e2f
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.5 rouge_score-0.1.2


In [None]:
from datasets import load_dataset, Dataset
import evaluate
import nltk
from tqdm.auto import tqdm
import numpy as np

# Fetch the NLTK resources this cell downloads before tokenizing
# (calculate_metrics calls nltk.word_tokenize).
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load each metric a single time; both objects are passed to calculate_metrics.
bleu_metric, rouge_metric = evaluate.load("bleu"), evaluate.load("rouge")

def calculate_metrics(batch, bleu_metric, rouge_metric):
    """
    Compute metrics for a batch of predictions and gold answers.

    Args:
        batch: A dictionary with 'answer' (list of lists) and 'prediction' (list of strings).
        bleu_metric: Preloaded BLEU metric object.
        rouge_metric: Preloaded ROUGE metric object.

    Returns:
        Dictionary with lists of metric values, one entry per row of the
        batch: 'exact_match' (bool) and 'bleu', 'rouge1', 'rouge2', 'rougeL',
        'rougeLsum', 'f1' (float). Rows whose prediction or first reference is
        empty keep 0.0 for BLEU and ROUGE.

    Notes:
        * ROUGE is requested per example (``use_aggregator=False``), so each
          row carries its own score instead of a copy of the batch aggregate.
        * BLEU is still one score computed over the scoreable rows of this
          batch and copied to each of them; it is not a per-row value.
        * F1 counts token overlap as a multiset, so repeated tokens are
          handled and identical answers always score 1.0.
    """
    from collections import Counter  # local import keeps this cell self-contained

    predictions = batch['prediction']
    references = batch['answer']
    num_items = len(predictions)

    # Initialize output lists
    exact_matches = [False] * num_items
    bleu_scores = [0.0] * num_items
    rouge1_scores = [0.0] * num_items
    rouge2_scores = [0.0] * num_items
    rougeL_scores = [0.0] * num_items
    rougeLsum_scores = [0.0] * num_items
    f1_scores = [0.0] * num_items

    def _normalise_refs(refs):
        # A bare string is treated as a single reference rather than being
        # iterated character by character; None entries are dropped; an
        # empty/missing reference list becomes [""].
        if isinstance(refs, str):
            refs = [refs]
        cleaned = [gold for gold in (refs or []) if gold is not None]
        return cleaned or [""]

    valid_preds = [p if p else "" for p in predictions]  # Handle None/empty predictions
    valid_refs = [_normalise_refs(r) for r in references]
    # A row is scoreable by BLEU/ROUGE only if both the prediction and the
    # first reference are non-empty.
    scoreable_rows = [
        i for i, (p, r) in enumerate(zip(valid_preds, valid_refs)) if p and r[0]
    ]

    if scoreable_rows:
        masked_preds = [valid_preds[i] for i in scoreable_rows]
        masked_refs = [valid_refs[i] for i in scoreable_rows]

        batch_bleu = bleu_metric.compute(
            predictions=masked_preds,
            references=masked_refs
        )['bleu']
        per_row_rouge = rouge_metric.compute(
            predictions=masked_preds,
            references=masked_refs,
            use_aggregator=False,  # one score per example instead of a batch aggregate
        )

        rouge_columns = {
            'rouge1': rouge1_scores,
            'rouge2': rouge2_scores,
            'rougeL': rougeL_scores,
            'rougeLsum': rougeLsum_scores,
        }
        for pos, row in enumerate(scoreable_rows):
            bleu_scores[row] = batch_bleu
            for key, column in rouge_columns.items():
                value = per_row_rouge[key][pos]
                # Accept either a plain float or a score object exposing .fmeasure.
                column[row] = float(getattr(value, 'fmeasure', value))

    # Exact Match and F1 (token-based)
    def compute_f1(gold_tokens, pred_tokens):
        # Multiset intersection: each token counts as many times as it appears
        # in both sequences, consistent with the length-based denominators.
        overlap = Counter(gold_tokens) & Counter(pred_tokens)
        num_common = sum(overlap.values())
        if num_common == 0:
            return 0.0
        precision = num_common / len(pred_tokens)
        recall = num_common / len(gold_tokens)
        return (2 * precision * recall) / (precision + recall)

    for i in range(num_items):
        pred = valid_preds[i]
        golds = valid_refs[i]

        # Exact Match (case-insensitive, surrounding whitespace ignored)
        normalised_pred = pred.strip().lower()
        exact_matches[i] = any(normalised_pred == gold.strip().lower() for gold in golds)

        # F1 Score: best score over all gold answers
        pred_tokens = nltk.word_tokenize(pred.lower()) if pred else []
        if pred_tokens:
            f1_scores[i] = max(
                (compute_f1(nltk.word_tokenize(gold.lower()), pred_tokens) for gold in golds),
                default=0.0,
            )

    return {
        'exact_match': exact_matches,
        'bleu': bleu_scores,
        'rouge1': rouge1_scores,
        'rouge2': rouge2_scores,
        'rougeL': rougeL_scores,
        'rougeLsum': rougeLsum_scores,
        'f1': f1_scores
    }

# Read the judged rows back from the score file.
dataset = load_dataset('json', data_files=NQ_SCORE_PATH, split='train')

# Add the metric columns, 100 rows per call, across 4 worker processes.
dataset = dataset.map(
    function=calculate_metrics,
    batched=True,
    batch_size=100,  # Adjust based on memory and performance
    num_proc=4,  # Adjust based on CPU cores
    fn_kwargs=dict(bleu_metric=bleu_metric, rouge_metric=rouge_metric),
    desc="Calculating Metrics",
)

# Work with a DataFrame for the report and the final save.
nq_score = dataset.to_pandas()

# Mean of every metric column, printed in a fixed order.
metric_labels = (
    ("Exact Match", "exact_match"),
    ("BLEU", "bleu"),
    ("ROUGE-1", "rouge1"),
    ("ROUGE-2", "rouge2"),
    ("ROUGE-L", "rougeL"),
    ("ROUGE-Lsum", "rougeLsum"),
    ("F1", "f1"),
)
print("\nAverage Metrics:")
for label, column in metric_labels:
    print(f"Average {label}: {nq_score[column].mean():.4f}")

# Overwrite the score file with the metric-augmented rows, then preview them.
nq_score.to_json(NQ_SCORE_PATH, orient='records', lines=True)
print(f"\nUpdated scores saved to {NQ_SCORE_PATH}")
print("\nSample data with metrics:")
print(nq_score.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Calculating Metrics (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]


Average Metrics:
Average Exact Match: 0.1737
Average BLEU: 0.0400
Average ROUGE-1: 0.2877
Average ROUGE-2: 0.1291
Average ROUGE-L: 0.2860
Average ROUGE-Lsum: 0.2853
Average F1: 0.2809

Updated scores saved to /content/drive/MyDrive/w266 Final Project/data/meta-llama/Meta-Llama-3-8B-Instruct/finetuned/none/nq_score.jsonl

Sample data with metrics:
                                            question              answer  \
0        when did beavis and butthead first come out     [March 8, 1993]   
1  which is the asian tiger mosquito's species wi...  [Aedes albopictus]   
2  who won the first battle of bull run union or ...       [Confederate]   
3            who played the moon in the mighty boosh     [Noel Fielding]   
4      when does next game of thrones season 6 start    [April 24, 2016]   

                                          prediction  correct  exact_match  \
0                                      March 8, 1993     True         True   
1                                    

In [None]:
# Final step of the notebook: unassign the Colab runtime.
# NOTE(review): presumably done to free the GPU once evaluation has finished —
# nothing placed after this cell should be expected to run.
from google.colab import runtime
runtime.unassign()