# DO NOT CHANGE - THIS IS JUST FOR REFERENCE

In [None]:
import pyterrier as pt
import pandas as pd
from pyterrier.measures import RR, nDCG, MAP

In [20]:
# Load the BEIR FiQA dataset handle via PyTerrier; later cells read its
# documents through dataset.get_corpus_iter().
dataset = pt.get_dataset("irds:beir/fiqa")

In [21]:
# Install the nlpaug package
# %pip install nlpaug

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

In [22]:
# Small demonstration of character-level substitution noise on a single sentence.
text = 'The quick brown fox jumps over the lazy dog .'

# Substitution augmenter configured with aug_char_p=0.8.
aug = nac.RandomCharAug(action="substitute", aug_char_p=0.8)
augmented_texts = aug.augment(text)

# Emit the input and the augmenter's result, one labelled item per line.
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quick 9ry)B fox &Pv1s QEQ& the lazy dog.']


In [23]:
# Add noise to the documents.
#
# Builds `noisy_dataset`: a list of {'docno': ..., 'text': ...} dicts, one per
# corpus document, where 'text' is the character-substituted version of the
# original document text as a single string.
noisy_dataset = []

aug = nac.RandomCharAug(aug_char_p=0.3, aug_word_p=0.3, action="substitute", aug_word_min=1, aug_word_max=500000)

for doc_idx, doc in enumerate(dataset.get_corpus_iter()):
    noisy_text = aug.augment(doc['text'])

    # augment() can hand back a list of strings (see the demo output above);
    # flatten it so 'text' is always a plain string, matching how the query
    # cells treat the augmenter's result.
    if isinstance(noisy_text, list):
        noisy_text = " ".join(noisy_text)

    noisy_doc = {'docno': doc['docno'], 'text': noisy_text}
    noisy_dataset.append(noisy_doc)

    # Print a single before/after example (first document only). Using the
    # loop index avoids a flag variable named `bool`, which would shadow the
    # builtin for the rest of the kernel session.
    if doc_idx == 0:
        print(doc['text'])
        print("Noisy:")
        print(noisy_text)



I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.
Noisy:
["I ' m not saying I don ' t Jikm the iHCa of on - the - job training too, but you can ' t expect the cUmpaIs to do XhDt. 9Iainin8 wsr1e8s is not th+sr job - 4iey ' re bui_DNng software. Per$Cps _EucaAionkl NyDte9s in the U. S. (or theWL students) vhoulu wSrDy a little abdct 9Qrting marketable suiTls in exchange for vheur maybiWe invest7kst in education, rather t3a& getting out w8Rh PhoBsaqds in studeR& djby and then cTmOpai0ing that t!Ky ar&r ' t qualified to do Dnythz%g."]


beir/fiqa documents: 100%|██████████| 57638/57638 [01:40<00:00, 573.09it/s]


In [24]:
# Index the noisy documents (disabled: takes too long to run)
# from pathlib import Path

# indexer = pt.IterDictIndexer(
#     str(Path.cwd()),  # this will be ignored
#     type=pt.index.IndexingType.MEMORY,
# )
# index_ref = indexer.index(noisy_dataset, fields=["text"])

In [25]:
# Evaluate BM25 on the noisy documents (disabled along with the noisy-document indexing it depends on)
# bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
# testset = pt.get_dataset("irds:beir/fiqa/test")
# pt.Experiment(
#     [bm25],
#     testset.get_topics(),
#     testset.get_qrels(),
#     eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
# )

In [26]:
# Build an index over the original (clean) corpus.
from pathlib import Path

# IndexingType.MEMORY is requested, so the path argument (the current working
# directory) is only a placeholder -- per the original note it will be ignored.
indexer = pt.IterDictIndexer(str(Path.cwd()), type=pt.index.IndexingType.MEMORY)

# Index the "text" field of every corpus document; `index_ref` is what the
# retrievers in later cells are constructed from.
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

beir/fiqa documents: 100%|██████████| 57638/57638 [00:25<00:00, 2280.78it/s]


In [27]:
# Query-side noise experiment: delete characters from the test queries and
# evaluate BM25 (over the clean index) on the perturbed queries.
testset = pt.get_dataset("irds:beir/fiqa/test")
test_queries = testset.get_topics()

# Character-deletion augmenter (aug_char_p=0.1, empty spec_char).
aug = nac.RandomCharAug(action="delete", aug_char_p=0.1, spec_char="")

# Perturb each query in topic order, keeping its qid.
noisy_queries_list = []
for qid, query in zip(test_queries['qid'], test_queries['query']):
    noisy_query = aug.augment(query)
    # The augmenter may return a list of strings; collapse it to one string.
    noisy_queries_list.append({
        'qid': qid,
        'query': " ".join(noisy_query) if isinstance(noisy_query, list) else noisy_query,
    })

# Topics frame with qid forced to str.
noisy_queries_df = pd.DataFrame(noisy_queries_list).astype({"qid": str})

# Before/after sample for the first query.
print("Original Query:", test_queries['query'].iloc[0])
print("Noisy Query:", noisy_queries_df['query'].iloc[0])

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# Run the experiment; as the cell's last expression, the result table is displayed.
pt.Experiment(
    [bm25],
    noisy_queries_df,
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)

Original Query: where should i park my rainy day emergency fund
Noisy Query: where should i park my ainy day mergency fud


Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,TerrierRetr(BM25),0.145615,0.120467,0.099014


In [28]:
# Query-side noise experiment with a sequential pipeline: character
# substitution followed by character insertion, then BM25 evaluation
# (over the clean index) on the perturbed queries.
testset = pt.get_dataset("irds:beir/fiqa/test")
test_queries = testset.get_topics()

# Two-stage augmentation flow; both stages use aug_char_p=0.2 and an empty spec_char.
aug = naf.Sequential([
    nac.RandomCharAug(action="substitute", aug_char_p=0.2, spec_char=""),  # stage 1: substitute
    nac.RandomCharAug(action="insert", aug_char_p=0.2, spec_char=""),      # stage 2: insert
])

# Perturb each query in topic order, keeping its qid.
noisy_queries_list = []
for qid, query in zip(test_queries['qid'], test_queries['query']):
    noisy_query = aug.augment(query)
    # The flow may return a list of strings; collapse it to one string.
    noisy_queries_list.append({
        'qid': qid,
        'query': " ".join(noisy_query) if isinstance(noisy_query, list) else noisy_query,
    })

# Topics frame with qid forced to str.
noisy_queries_df = pd.DataFrame(noisy_queries_list).astype({"qid": str})

# Before/after sample for the first query.
print("Original Query:", test_queries['query'].iloc[0])
print("Noisy Query:", noisy_queries_df['query'].iloc[0])

bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# Run the experiment; as the cell's last expression, the result table is displayed.
pt.Experiment(
    [bm25],
    noisy_queries_df,
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
)


Original Query: where should i park my rainy day emergency fund
Noisy Query: yhere should i Opark my raSny day eFer8gxnc9y pfund


Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,TerrierRetr(BM25),0.076055,0.066641,0.054936
