In [1]:
import pyterrier as pt
import pandas as pd
import numpy as np
from pyterrier.measures import RR, nDCG, MAP

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

from common_helpers import *
from plotting_helpers import *

In [2]:
# Dataset identifier shared by the corpus handle and its "/test" split.
dataset_path = "irds:beir/fiqa"

# `dataset` supplies the corpus iterator for indexing; `testset` supplies the
# topics here and is passed to the experiment helper in later cells.
dataset = pt.get_dataset(dataset_path)
testset = pt.get_dataset(f"{dataset_path}/test")
test_queries = testset.get_topics()

Java started (triggered by _pt_tokeniser) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [3]:
# Indexing documents
from pathlib import Path

# A path argument is passed to the indexer, but per the original author's note
# it is ignored for MEMORY-type indexes; the current directory is a placeholder.
indexer = pt.IterDictIndexer(str(Path.cwd()), type=pt.index.IndexingType.MEMORY)

# Index the whole corpus; the returned reference is what the retrievers open.
index_ref = indexer.index(dataset.get_corpus_iter())

beir/fiqa documents: 100%|██████████| 57638/57638 [00:11<00:00, 5120.56it/s]


KeyboardInterrupt: 

In [None]:
# One Terrier retriever per weighting model, all reading the same index.
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
# Named `dirichlet_lm` rather than `dir` so the builtin `dir()` is not shadowed
# for the rest of the kernel session.
dirichlet_lm = pt.terrier.Retriever(index_ref, wmodel="DirichletLM")
pl2 = pt.terrier.Retriever(index_ref, wmodel="PL2")
dfree = pt.terrier.Retriever(index_ref, wmodel="DFReeKLIM")

# Systems compared in every experiment below (order preserved).
models = [bm25, dfree, pl2, dirichlet_lm]

# Evaluation measures; MAP @ 100 is plotted downstream under the column "AP@100".
metrics = [RR @ 10, nDCG @ 10, MAP @ 100]

# Noise probabilities 0.0, 0.1, ..., 1.0 (11 points). `linspace` fixes the
# number of points and the endpoint explicitly, whereas `arange` with a float
# step derives the length from a rounded division.
noise_levels = np.linspace(0.0, 1.0, 11)

In [None]:
# Run the noise experiment once per noise level and stack the per-level result
# frames, tagging each with the `noise_level` that produced it.
# `noise_level` is used as the augmenter's word-level probability (aug_word_p);
# the character-level probability is fixed at 0.25.
# NOTE(review): the variable is named `random_val_*` but the augmenter is
# `nac.OcrAug` — confirm which augmenter is intended for this experiment.
random_val_per_noise = pd.concat(
    (
        run_noise_experiment(
            test_queries,
            testset,
            nac.OcrAug(aug_char_p=0.25, aug_word_p=word_p, aug_word_min=0, aug_word_max=500000),
            models,
            metrics,
        ).assign(noise_level=word_p)
        for word_p in noise_levels
    ),
    ignore_index=True,
)

# Keep only the text inside "TerrierRetr(...)" as the system name.
random_val_per_noise = random_val_per_noise.assign(
    name=lambda frame: frame["name"].str.extract(r"TerrierRetr\((.*?)\)")
)

# Preview the first len(models) rows.
random_val_per_noise.head(len(models))

In [None]:
# One figure per evaluation measure, grouped by the "name" column.
# The last argument is presumably the legend title — confirm in plotting_helpers.
plot_metric(
    random_val_per_noise,
    "RR@10",
    "name",
    "Reciprocal Rank at 10 (RR@10)",
    "Retrieval Model",
)
plot_metric(
    random_val_per_noise,
    "nDCG@10",
    "name",
    "Normalized Discounted Cumulative Gain at 10 (nDCG@10)",
    "Retrieval Model",
)
plot_metric(
    random_val_per_noise,
    "AP@100",
    "name",
    "Average Precision at 100 (AP@100)",
    "Retrieval Model",
)

In [None]:
# Keyboard-typo noise sweep: run the noise experiment once per noise level and
# stack the per-level result frames, tagging each with its `noise_level`.
# The augmenter is `nac.KeyboardAug`; this cell previously built `nac.OcrAug`,
# which made it an exact duplicate of the preceding experiment despite the
# `keyboard_*` name. `noise_level` is the word-level probability (aug_word_p);
# the character-level probability is fixed at 0.25.
keyboard_per_noise = pd.concat(
    [
        run_noise_experiment(
            test_queries,
            testset,
            nac.KeyboardAug(
                aug_char_p=0.25,
                aug_word_p=noise_level,
                aug_word_min=0,
                aug_word_max=500000,
            ),
            models,
            metrics,
        ).assign(noise_level=noise_level)
        for noise_level in noise_levels
    ],
    ignore_index=True,
)

# Keep only the text inside "TerrierRetr(...)" as the system name.
keyboard_per_noise["name"] = keyboard_per_noise["name"].str.extract(r"TerrierRetr\((.*?)\)")

# Preview the first len(models) rows.
keyboard_per_noise.head(len(models))

In [None]:
# One figure per evaluation measure for the keyboard-noise results, grouped by
# the "name" column. The trailing "Retrieval Model" argument is passed here as
# well so these figures are labelled the same way as the other experiment's
# plots (presumably it is the legend title — confirm in plotting_helpers).
plot_metric(keyboard_per_noise, "RR@10", "name", "Reciprocal Rank at 10 (RR@10)", "Retrieval Model")
plot_metric(keyboard_per_noise, "nDCG@10", "name", "Normalized Discounted Cumulative Gain at 10 (nDCG@10)", "Retrieval Model")
plot_metric(keyboard_per_noise, "AP@100", "name", "Average Precision at 100 (AP@100)", "Retrieval Model")