### Group 62 Search Engine<br>
Library imports

In [None]:
pip install python-terrier

In [3]:
import pyterrier as pt
if not pt.started():
  pt.init(version = 'snapshot')
  import pandas as pd
import numpy as np
import pickle
!rm -rf ./pd_index

Downloading terrier-assemblies 5.x-SNAPSHOT jar-with-dependencies to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by jitpack on 2022-04-12 09:28)



## Dataloaders

In [10]:
# Load document data as a dict using pickle
with open('data_dict.pkl', 'rb') as file:
    data_dict = pickle.load(file)

In [11]:
# Convert document data dict of dicts into list of dicts
list_data_dicts = [value for value in data_dict.values()]
list_data_dicts[0]

{'Definitions': 'No definitions extracted',
 'Document Issuer': 'EUR-LEX',
 'Jurisdiction': 'European Union',
 'Preamble': '           26.8.2020\xa0\xa0\xa0   EN   Official Journal of the European Union   C 282/68       COUNCIL RECOMMENDATION of 20 July 2020 on the 2020 National Reform Programme of Croatia and delivering a Council opinion on the 2020 Convergence Programme of Croatia (2020/C 282/11) THE COUNCIL OF THE EUROPEAN UNION, Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 121(2) and 148(4) thereof, Having regard to Council Regulation (EC) No 1466/97 of 7 July 1997 on the strengthening of the surveillance of budgetary positions and the surveillance and coordination of economic policies\xa0(1), and in particular Article 9(2) thereof, Having regard to Regulation (EU) No 1176/2011 of the European Parliament and of the Council of 16 November 2011 on the prevention and correction of macroeconomic imbalances\xa0(2), and in particular Ar

In [12]:
# Load topics .csv file (contains set of queries to be run) converted to DataFrame
topics_df = pd.read_csv('Topics_final.csv') # import from .csv
topics_df['qid'] = topics_df['qid'].apply(str) # IDs to strings
topics_df['query'] = topics_df['query'].apply(str) # quieries to strings
topics_df

Unnamed: 0,qid,query
0,1,Commodity derivative
1,2,Commodity pool operator
2,3,Derivatives clearing organizations
3,4,Escheatment
4,5,Liquidity risk
5,6,Major swap participant
6,7,National bank
7,8,Physical commodity swaps
8,9,Proprietary trading
9,10,Swap data repositories


In [13]:
# Load querey relevant document pairs from .csv file converted to DataFrame
qrel_df = pd.read_csv('qrel_final.csv') # import .csv
qrel_df['qid'] = qrel_df['qid'].apply(str) # IDs to strings
qrel_df['docno'] = qrel_df['docno'].str.upper() # docno to sting in all caps
qrel_df

Unnamed: 0,qid,docno,label
0,1,046B5777-9321-4301-BB9A-E74DA41057B7,1
1,1,04D3FB95-13F2-49C1-96E7-E0928D39436C,1
2,1,0A8B4136-6631-47E0-B5A8-4AD1C30340A2,1
3,1,13A11F3F-BF68-4054-B61F-0BFC609F262B,1
4,1,17A674BD-41F1-41C1-8261-AEE3F30C1576,1
...,...,...,...
570,12,152788B0-DAEF-4CF9-9AEE-BB47A43F865D,1
571,12,1188E1A6-9755-4B89-A6AC-344C5589A441,1
572,9,2F972831-F375-4B8F-B971-9FE8D011C088,1
573,9,59827B5A-A82D-4462-9C5F-FE59FDABDD92,1


### Indexing

In [15]:
# Indexing
iter_indexer = pt.IterDictIndexer("./index", meta=['docno']) # define indexer and metadata
indexref1 = iter_indexer.index(list_data_dicts, fields=('text', 'Preamble')) # index document data and define fields as 'text' and 'Preamble'

In [16]:
# Print details about the index
index = pt.IndexFactory.of(indexref1)
print(index.getCollectionStatistics().toString())

Number of documents: 2387
Number of terms: 44763
Number of postings: 1253827
Number of fields: 2
Number of tokens: 11780823
Field names: [text, Preamble]
Positions:   false



## Retrieval models
### Basic retrieval examples

In [17]:
# Sample BM25 search
result_df1 = pt.BatchRetrieve(indexref1, wmodel="BM25").search("escheatment")
result_df1

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,2206,F5834E40-E60F-45EA-AA60-ED4CE8CF82D4,0,12.780534,escheatment
1,1,1525,777E3B76-0418-44F1-8E49-9F266837C82E,1,5.339306,escheatment
2,1,1177,CF2FDD66-C395-4E95-92E9-7C667985C382,2,5.203308,escheatment
3,1,2082,BF260A67-A61F-450E-BEC5-DF6CD96B78C1,3,5.126723,escheatment
4,1,64,C16D9BAF-E273-442E-A68F-0693268BAE31,4,3.161393,escheatment
5,1,484,1188E1A6-9755-4B89-A6AC-344C5589A441,5,0.342523,escheatment


In [18]:
# Sample BM25F search with weights set  
df = pt.BatchRetrieve(indexref1, wmodel="BM25F", control = {'w.0': 0.5, 'w.1': 1}).search("Escheatment")
df

BR(BM25F):   0%|          | 0/1 [00:00<?, ?q/s]

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,2206,F5834E40-E60F-45EA-AA60-ED4CE8CF82D4,0,15.340035,Escheatment
1,1,1525,777E3B76-0418-44F1-8E49-9F266837C82E,1,4.7487,Escheatment
2,1,1177,CF2FDD66-C395-4E95-92E9-7C667985C382,2,4.605943,Escheatment
3,1,2082,BF260A67-A61F-450E-BEC5-DF6CD96B78C1,3,4.526142,Escheatment
4,1,64,C16D9BAF-E273-442E-A68F-0693268BAE31,4,2.613569,Escheatment
5,1,484,1188E1A6-9755-4B89-A6AC-344C5589A441,5,0.259501,Escheatment


## Comparison of BM25 and BM25F retrieval results

In [19]:
bm25f = pt.BatchRetrieve(indexref1, wmodel="BM25F", control = {'w.0': 0.5, 'w.1': 1})
bm25 = pt.BatchRetrieve(indexref1, wmodel="BM25")

In [20]:
pt.Experiment(
    [bm25f, bm25],
    topics_df,
    qrel_df,
    eval_metrics=["map", "recall"]
)

BR(BM25F):   0%|          | 0/12 [00:00<?, ?q/s]

Unnamed: 0,name,map,R@5,R@10,R@15,R@20,R@30,R@100,R@200,R@500,R@1000
0,BR(BM25F),0.561993,0.219147,0.350586,0.40912,0.468172,0.581915,0.751474,0.841024,0.89629,0.935142
1,BR(BM25),0.5769,0.215535,0.34967,0.426274,0.499629,0.595502,0.769733,0.843272,0.897862,0.931211


## Grid search to optimize model parameters

In [None]:
# BM25F weight opimitzation using Grid Scanning 
bm25f = pt.BatchRetrieve(index, wmodel='BM25F', controls={'w.0' : 1, 'w.1' : 1}, verbose=True) # set BM25f variable 'text' = w.0, 'Preamble' = w.1 
pt.GridSearch(
    bm25f, 
    # define values of weights to be tested
    {bm25f: {'w.0' : [0.5, 1],
     'w.1': [1, 1.5, 2, 2.5, 3, 4, 5, 6, 100]}},
    topics_df, # predetermined det of queries in a df
    qrel_df, # predetermined set of query-relevant document paird
      'map' # evaluation metrics to output
)