# Model Evaluation Notebook
This notebook evaluates a question-answering model and extracts the samples for which the model's prediction is wrong (exact match of 0, or F1 of at most 50).

The notebook uses the publicly available SQuAD evaluation script: https://github.com/huggingface/evaluate/blob/main/metrics/squad/compute_score.py

Before running, please update the file paths so that they match your own directory structure.

In [None]:
# Import libraries
import json
from transformers import pipeline
from datasets import load_dataset, Dataset
from evaluate import evaluator, load
from tqdm import tqdm

In [None]:
# Specify model and dataset
# Checkpoint path; used below as both the model and the tokenizer of the pipeline
model_ckpt = '../../models/xlmr-cyrl/'
# Dataset loading script handed to `load_dataset`
loading_script = "../finetuning/loading_script.py"
# Development-set JSON file, registered under the "dev" key of `data_files`
dev_data = "../../data/squad-sr/squad-sr-v1.1-dev-cyrillic.json"

In [None]:
# Token limit forwarded to `filter_dataset` as its `max_length` argument
max_length = 512

In [None]:
# Instantiate the question-answering pipeline; the same checkpoint path
# supplies both the model and the tokenizer
pipe = pipeline('question-answering', model=model_ckpt, tokenizer=model_ckpt)

In [None]:
# Load the dataset through the custom loading script
data_files = {"dev": dev_data}
dataset = load_dataset(loading_script, data_files=data_files)

# Only the `validation` split is evaluated; the `train` split is ignored
validation_dataset = dataset['validation']

In [None]:
def filter_dataset(example, max_length, tokenizer):
    """Keep only examples whose tokenized length fits within `max_length`.

    This function is meant to be passed to `Dataset.filter`.

    Args:
        example: A single dataset record; its "question" and "context"
            fields are tokenized together as one pair.
        max_length: Maximum allowed number of tokens in the encoded pair,
            as counted by `tokenizer`.
        tokenizer: Callable tokenizer returning a mapping that contains an
            "input_ids" entry for the encoded pair.

    Returns:
        True if the encoded pair has at most `max_length` tokens (the example
        is kept), False otherwise (the example is dropped).
    """
    # Encode without truncation and without padding so that the number of
    # input ids is the true length of the pair. The length is checked against
    # the `max_length` argument rather than `tokenizer.model_max_length`, so
    # the filter stays correct when those two values differ.
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation="do_not_truncate",
        padding=False,
    )
    return len(inputs["input_ids"]) <= max_length

In [None]:
# Filter the validation dataset with `filter_dataset`, forwarding the token
# limit and the pipeline's tokenizer as keyword arguments
validation_dataset = validation_dataset.filter(filter_dataset, fn_kwargs={"max_length": max_length, "tokenizer": pipe.tokenizer})

In [None]:
# Initialize the evaluator for the question-answering task;
# it is used by `evaluate_squad` below
squad_evaluator = evaluator('question-answering')

In [None]:
# Compute evaluation metrics
def evaluate_squad():
    """Run the question-answering evaluator on the validation dataset.

    Relies on the module-level `squad_evaluator`, `pipe` and
    `validation_dataset`, and evaluates with `squad_v2_format=False`.

    Returns:
        The result of `squad_evaluator.compute` for these inputs.
    """
    return squad_evaluator.compute(
        pipe,
        data=validation_dataset,
        squad_v2_format=False,
    )

In [None]:
# Run the evaluation; as the last expression of the cell, the returned
# metrics are shown as the cell output
evaluate_squad()

| Model     | EM    | F1    |
|-----------|-------|-------|
| bert-cyrl | 51.46 | 67.28 |
| bert-lat  | 69.32 | 80.11 |
| xlmr-cyrl | 53.73 | 69.45 |
| xlmr-lat  | 71.04 | 81.62 |

In [None]:
# Initialize SQuAD metric (used below to score each prediction individually)
squad_metric = load('squad')
# Bare expression: shows the loaded metric as the cell output
squad_metric

In [None]:
# Extract wrong answers
em_errors = []  # samples whose exact-match score is 0
f1_errors = []  # samples whose F1 score is at most 50

# Run the pipeline once over all questions and contexts of the validation set
outputs = pipe(question=validation_dataset['question'], context=validation_dataset['context'])
# The question-answering pipeline may return a bare dict instead of a list
# when it receives a single input; wrap it so that `zip` pairs predictions
# with samples instead of iterating over the dict's keys.
if isinstance(outputs, dict):
    outputs = [outputs]

for o, m in zip(outputs, validation_dataset):
    # Score every sample on its own so that the per-sample EM/F1 is available
    pred = [{'prediction_text': o['answer'], 'id': m['id']}]
    ref = [{"answers": m['answers'], "id": m['id']}]
    res = squad_metric.compute(predictions=pred, references=ref)

    is_em_error = res['exact_match'] == 0.
    is_f1_error = res['f1'] <= 50.
    if not (is_em_error or is_f1_error):
        continue

    # Same record layout for both error lists
    record = {
        "id": m['id'],
        "question": m['question'],
        "answers": m['answers'],
        "prediction": pred,
        "context": m['context'],
        "score": res
    }
    if is_em_error:
        em_errors.append(record)
    if is_f1_error:
        # Separate top-level dict so the two lists do not share one object
        f1_errors.append(dict(record))

In [None]:
# Save errors to files
# `ensure_ascii=False` writes non-ASCII characters as-is, so the files are
# opened explicitly as UTF-8 instead of relying on the platform's default
# encoding.
with open('../data/error_analysis/em_errors.json', 'w', encoding='utf-8') as f:
    json.dump(em_errors, f, ensure_ascii=False)

with open('../data/error_analysis/f1_errors.json', 'w', encoding='utf-8') as f:
    json.dump(f1_errors, f, ensure_ascii=False)