In [1]:
from baseline_experiments import BaselineExperiments

# Initialize the BaselineExperiments runner with paths to the test queries and
# the relevance judgments (qrels). Both paths are relative to the working directory.
# Per the recorded output, construction also loads the indexes, queries and qrels
# and starts the Terrier JVM, so this cell is not instantaneous.
be = BaselineExperiments("test_queries.json", "test_qrels.json")

  from .autonotebook import tqdm as notebook_tqdm
Loading indexes, queries and qrels...:   0%|          | 0/4 [00:00<?, ?step/s]Java started (triggered by IndexFactory.of) and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
Loading indexes, queries and qrels...: 100%|██████████| 4/4 [00:01<00:00,  3.41step/s]


# Baseline Experiments

In [2]:
# Experiment 1: Baseline Retrieval Models
# In the recorded run this evaluated four systems: BM25 and TF_IDF, each with and
# without a QueryExpansion stage, reporting AP, P@k, R@k and nDCG@k.
# NOTE: the recorded run took roughly 3.5 hours (see the pt.Experiment progress bar).
be.run_experiment_1()

Experiment 1:


pt.Experiment: 100%|██████████| 4/4 [3:26:46<00:00, 3101.69s/system]  

                                                name        AP     P@1  \
0                                  TerrierRetr(BM25)  0.708274  0.6344   
1  (TerrierRetr(BM25) >> QueryExpansion(c:\Uni\In...  0.686370  0.6267   
2                                TerrierRetr(TF_IDF)  0.708384  0.6353   
3  (TerrierRetr(TF_IDF) >> QueryExpansion(c:\Uni\...  0.683053  0.6183   

       P@5     P@10     R@5    R@10    nDCG@5   nDCG@10  
0  0.15916  0.08404  0.7958  0.8404  0.722170  0.736715  
1  0.15028  0.07992  0.7514  0.7992  0.693830  0.709239  
2  0.15916  0.08404  0.7958  0.8404  0.722311  0.736826  
3  0.15108  0.08009  0.7554  0.8009  0.692545  0.707289  





In [3]:
# Experiment 2: Thesaurus-Based Query Expansion
# In the recorded run the queries were first expanded with the thesaurus
# (progress bar shows 10000 items), then the same four systems as listed in the
# Experiment 1 output (BM25 / TF_IDF, with and without QueryExpansion) were evaluated.
# NOTE: the recorded run took roughly 3.5 hours.
be.run_experiment_2()

Expanding queries with thesaurus: 100%|██████████| 10000/10000 [00:08<00:00, 1186.06it/s]


Experiment 2:


pt.Experiment: 100%|██████████| 4/4 [3:25:07<00:00, 3076.83s/system]  

                                                name        AP     P@1  \
0                                  TerrierRetr(BM25)  0.656527  0.5730   
1  (TerrierRetr(BM25) >> QueryExpansion(c:\Uni\In...  0.626290  0.5650   
2                                TerrierRetr(TF_IDF)  0.657576  0.5746   
3  (TerrierRetr(TF_IDF) >> QueryExpansion(c:\Uni\...  0.622850  0.5569   

       P@5     P@10     R@5    R@10    nDCG@5   nDCG@10  
0  0.15118  0.08080  0.7559  0.8080  0.672255  0.689130  
1  0.13822  0.07388  0.6911  0.7388  0.633217  0.648591  
2  0.15156  0.08075  0.7578  0.8075  0.673785  0.689867  
3  0.13928  0.07431  0.6964  0.7431  0.631956  0.647136  





In [4]:
# Experiment 3: Keyword-Based Indexing with Query Expansion
# In the recorded run this produced two result tables, one for the original
# queries and one for the expanded queries, each comparing BM25 with and
# without a QueryExpansion stage.
# NOTE: each of the two recorded evaluations took a bit over 1.5 hours.
be.run_experiment_3()

Experiment 3 with original queries:


pt.Experiment: 100%|██████████| 2/2 [1:36:24<00:00, 2892.21s/system]


                                                name        AP     P@1  \
0                                  TerrierRetr(BM25)  0.099426  0.0785   
1  (TerrierRetr(BM25) >> QueryExpansion(c:\Uni\In...  0.094325  0.0782   

       P@5     P@10     R@5    R@10    nDCG@5   nDCG@10  
0  0.02394  0.01384  0.1197  0.1384  0.100590  0.106646  
1  0.02200  0.01217  0.1100  0.1217  0.095184  0.098925  
Experiment 3 with expanded queries:


pt.Experiment: 100%|██████████| 2/2 [1:38:33<00:00, 2956.59s/system]

                                                name        AP     P@1  \
0                                  TerrierRetr(BM25)  0.088137  0.0691   
1  (TerrierRetr(BM25) >> QueryExpansion(c:\Uni\In...  0.081828  0.0685   

       P@5     P@10     R@5    R@10    nDCG@5   nDCG@10  
0  0.02142  0.01234  0.1071  0.1234  0.089346  0.094649  
1  0.01864  0.01058  0.0932  0.1058  0.081708  0.085755  





## Result Analysis

We will pick a random subset of 200 queries to analyze why the RM3 and thesaurus-based query expansion methods seem to worsen the test results.

In [9]:
import json
import pandas as pd
from tqdm import tqdm
from functions import thesaurus_based_expansion, keywords_extractor

In [2]:
def load_200_queries(path='test_queries.json', n=200, random_state=42):
    """Load the test queries and return a reproducible random sample of them.

    The defaults reproduce the original behavior: 200 queries drawn from
    ``test_queries.json`` with a fixed seed of 42.

    Parameters
    ----------
    path : str or path-like, default 'test_queries.json'
        UTF-8 encoded JSON file holding the queries. It must be loadable by
        ``pd.DataFrame.from_dict`` and is expected to provide the columns
        ``query_id`` and ``question``.
    n : int, default 200
        Number of queries to sample (without replacement).
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the subset is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with ``query_id`` renamed to ``qid`` and
        ``question`` renamed to ``query``. The original row index is kept.

    Raises
    ------
    ValueError
        Raised by pandas if the file contains fewer than ``n`` queries.
    """
    # Only the JSON parsing needs the open handle; close the file before
    # building and sampling the frame.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Chain instead of ``inplace=True`` so no intermediate frame is mutated.
    return (
        pd.DataFrame.from_dict(data)
        .sample(n=n, random_state=random_state)
        .rename(columns={"query_id": "qid", "question": "query"})
    )

In [3]:
# Draw the query sample that the analysis cells below iterate over.
queries_200 = load_200_queries()

In [16]:
# Print the (at most) three keywords extracted with the default method
# for the first rows of the sampled queries.
preview_queries = queries_200["query"].head().tolist()
for query_text in preview_queries:
    print(query_text, keywords_extractor(query_text, max_keywords=3))

How much did Sarah send from America to save her house from being sold ['sold', 'send', 'sarah']
In what city is Talmudic s office located ['office', 'talmudic', 'located']
When was the Knights of the Golden Circle first discovered ['golden', 'first', 'circle']
Who drew a revolver on a fellow resident of the Sound ['fellow', 'sound', 'resident']
Who presided over the regular Democratic convention ['democratic', 'regular', 'convention']


In [18]:
# Print the thesaurus expansion of the default-method keywords
# for the first rows of the sampled queries.
preview_queries = queries_200["query"].head().tolist()
for query_text in preview_queries:
    extracted = keywords_extractor(query_text, max_keywords=3)
    print(
        query_text,
        thesaurus_based_expansion(query_text, keywords=extracted, max_synonyms_per_keyword=2),
    )

How much did Sarah send from America to save her house from being sold ['sold', 'sell', 'send', 'direct', 'sarah']
In what city is Talmudic s office located ['office', 'power', 'talmudic', 'located', 'locate', 'turn up']
When was the Knights of the Golden Circle first discovered ['golden', 'aureate', 'gilded', 'first', 'first base', 'circle', 'dress circle']
Who drew a revolver on a fellow resident of the Sound ['fellow', 'comrade', 'companion', 'sound', 'audio', 'resident', 'resident physician', 'house physician']
Who presided over the regular Democratic convention ['democratic', 'regular', 'convention', 'convening']


In [20]:
# Print the (at most) three keywords extracted with method='bert'
# for the first rows of the sampled queries.
preview_queries = queries_200["query"].head().tolist()
for query_text in preview_queries:
    print(query_text, keywords_extractor(query_text, max_keywords=3, method='bert'))

How much did Sarah send from America to save her house from being sold ['sarah', 'house', 'save']
In what city is Talmudic s office located ['talmudic', 'office', 'city']
When was the Knights of the Golden Circle first discovered ['knights', 'golden', 'circle']
Who drew a revolver on a fellow resident of the Sound ['revolver', 'sound', 'resident']
Who presided over the regular Democratic convention ['convention', 'democratic', 'presided']


In [19]:
# Print the thesaurus expansion of the method='bert' keywords
# for the first rows of the sampled queries.
preview_queries = queries_200["query"].head().tolist()
for query_text in preview_queries:
    extracted = keywords_extractor(query_text, max_keywords=3, method='bert')
    print(
        query_text,
        thesaurus_based_expansion(query_text, keywords=extracted, max_synonyms_per_keyword=2),
    )

How much did Sarah send from America to save her house from being sold ['sarah', 'house', 'save', 'salvage', 'salve']
In what city is Talmudic s office located ['talmudic', 'office', 'power', 'city', 'urban center', 'metropolis']
When was the Knights of the Golden Circle first discovered ['knights', 'knight', 'golden', 'aureate', 'gilded', 'circle', 'dress circle']
Who drew a revolver on a fellow resident of the Sound ['revolver', 'revolving door', 'sound', 'audio', 'resident', 'resident physician', 'house physician']
Who presided over the regular Democratic convention ['convention', 'convening', 'democratic', 'presided', 'preside']


----