In [89]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from sentence_transformers import SentenceTransformer, util
import ir_measures
from ir_measures import *
import pandas as pd
import json
from tqdm import tqdm
import numpy as np

## Elastic Search Connection

In [45]:
# Elasticsearch client pointed at http://localhost:9200; the indexing and
# search cells below all go through this object.
elastic_search = Elasticsearch('http://localhost:9200')


## Index Configuration for Elastic Search

In [46]:
# Field mapping: a single full-text field analysed with the custom 'white' analyzer.
mapp = {'properties': {'text': {'type': 'text', 'analyzer': 'white'}}}

# Index settings: five shards, BM25 as the default similarity, and the 'white'
# analyzer, which is defined by the 'whitespace' tokenizer alone.
sett = {
    'number_of_shards': 5,
    'index': {'similarity': {'default': {'type': 'BM25'}}},
    'analysis': {'analyzer': {'white': {'tokenizer': 'whitespace'}}},
}

# Drop any existing 'wikiir' index first so that re-running this cell always
# starts from an empty index with the configuration above.
index_already_present = elastic_search.indices.exists(index='wikiir')
if index_already_present:
    elastic_search.indices.delete(index='wikiir')
elastic_search.indices.create(index='wikiir', settings=sett, mappings=mapp)


  if elastic_search.indices.exists(index='wikiir'):
  elastic_search.indices.delete(index='wikiir')
  elastic_search.indices.create(index='wikiir', settings=sett, mappings=mapp)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wikiir'})

## WikiIR Collection Document Reading

In [47]:
# Load the document collection. The columns used later are 'id_right'
# (document id) and 'text_right' (document text).
df = pd.read_csv('documents.csv')

# Quick sanity check: number of rows/columns and a preview of the first rows.
print(df.shape)
df.head()

(369721, 2)


Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...


## Indexing the documents with Elasticsearch

In [48]:
def create_es_action(index, doc_id, document):
    '''
    Description:
        Wraps one document into the action dict consumed by the bulk helpers.
    Args:
        index: name of the target index
        doc_id: value stored under '_id'
        document: value stored under '_source'
    Returns:
        dict with the keys '_index', '_id' and '_source'
    '''
    action = dict(_index=index, _id=doc_id, _source=document)
    return action


def es_action_generator(df, index='wikiir'):
    '''
    Description:
        Lazily yields one bulk-index action per row of the document collection,
        showing a progress bar while the rows are consumed.
    Args:
        df: DataFrame with the columns 'id_right' (document id) and 'text_right' (document text)
        index: name of the target index (defaults to 'wikiir', the index used in this notebook)
    Yields:
        action dict built by create_es_action; the document text is stored under the 'text' field
    '''
    # Walk the two needed columns directly: iterrows() builds a Series per row
    # and its index value (previously bound to a misleading `doc_id`) was never used.
    id_text_pairs = zip(df['id_right'], df['text_right'])
    for doc_id, text in tqdm(id_text_pairs, total=df.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        yield create_es_action(index, doc_id, {'text': text})
        

# Send the actions to Elasticsearch with 4 threads and 1000 actions per chunk.
# Iterating over parallel_bulk is what consumes the generator; any result that
# comes back flagged as not ok is printed for inspection.
for ok, result in parallel_bulk(elastic_search, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)

# Refresh the index, then display its document count as a sanity check
# (it should match the number of rows in `df`).
elastic_search.indices.refresh(index='wikiir')
elastic_search.cat.count(index='wikiir', format='json')


  result = (True, func(*args, **kwds))
100%|██████████████████████████████| 369721/369721 [00:40<00:00, 9106.94it/s]                                          
  elastic_search.indices.refresh(index='wikiir')
  elastic_search.cat.count(index='wikiir', format='json')


ListApiResponse([{'epoch': '1680446251', 'timestamp': '14:37:31', 'count': '369721'}])

## Testing query

In [49]:
# Test queries: 'id_left' holds the query id, 'text_left' the query text.
testing = pd.read_csv('wikIR1k/test/queries.csv')
testing

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


## BM25 scoring document creation

In [50]:
# NOTE: `res` is deliberately a notebook-level variable. pretty_print_result stores
# the hits section of the most recent search there so later cells can inspect it.
# (A `global` statement at module level has no effect, so it is only declared
# inside the function.)

def pretty_print_result(search_result, scoring=()):
    '''
    Description:
        Prints the id and score of every hit of a search response and stores the
        hits section of the response in the global `res`.
    Args:
        search_result: search response; must provide search_result['hits']['hits']
        scoring: names of `_source` fields to print under each hit (none by default;
                 an immutable tuple is used instead of a shared mutable list)
    Returns:
        None
    '''
    global res
    res = search_result['hits']
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in scoring:
            print(f'{field}: {hit["_source"][field]}')
    
    
def search(query, *args):
    '''
    Description:
        Runs `query` against the 'wikiir' index, asks for the 20 best hits and prints them.
    Args:
        query: Elasticsearch query body (e.g. the dict built by make_query)
        *args: names of `_source` fields to print under each hit
    Returns:
        the return value of pretty_print_result
    '''
    response = elastic_search.search(index='wikiir', query=query, size=20)
    return pretty_print_result(response, args)


def get_doc_by_id(doc_id):
    '''
    Args:
        doc_id: id of a document stored in the 'wikiir' index
    Returns:
        the '_source' part of the stored document
    '''
    stored = elastic_search.get(index='wikiir', id=doc_id)
    return stored['_source']


def make_query(text):
    '''
    Args:
        text: query text
    Returns:
        bool query dict whose single 'must' clause is a 'match' on the 'text' field
    '''
    match_clause = {'match': {'text': text}}
    return {'bool': {'must': match_clause}}

# Smoke test: run the first test query through BM25 and print its top hits.
print("Query ID:", testing['id_left'][0])
print("Text of Query:", testing['text_left'][0])
search(make_query(testing['text_left'][0]))

Query ID: 158491
Text of Query: southern methodist university
Doc 1880296, score is 17.230719
Doc 607552, score is 17.198406
Doc 2261272, score is 17.183655
Doc 1957435, score is 16.908918
Doc 625257, score is 16.856976
Doc 635537, score is 16.771313
Doc 1774491, score is 16.640131
Doc 663828, score is 16.487574
Doc 158491, score is 15.997955
Doc 1956922, score is 15.973572
Doc 1180246, score is 15.590252
Doc 1170039, score is 15.534702
Doc 945068, score is 15.526761
Doc 360918, score is 15.501228
Doc 589549, score is 15.501228
Doc 685181, score is 15.335788
Doc 2411344, score is 15.325968
Doc 1158969, score is 15.273922
Doc 1093529, score is 15.163386
Doc 742912, score is 15.109789


  return pretty_print_result(es.search(index='wikiir', query=query, size=20), args)


In [51]:
# `res` holds the hits section of the last search, so this is the hit count
# reported for that query, not the size of the whole collection.
print(f'Total documents: {res["total"]["value"]}')

Total documents: 10000


In [52]:
# Raw hits section of the last search response (set by pretty_print_result).
res

{'total': {'value': 10000, 'relation': 'gte'},
 'max_score': 17.230719,
 'hits': [{'_index': 'wikiir',
   '_type': '_doc',
   '_id': '1880296',
   '_score': 17.230719,
   '_source': {'text': 'lindsay embrey september 23 1925 november 11 2005 was an american real estate developer and philanthropist he was a primary benefactor of southern methodist university in dallas texas he served as a member of the board of trustees of southern methodist university from 1970 to 1987 in 1978 he established an endowment for students majoring in engineering at the university this endowment has provided scholarships for over 2 000 engineering students in 1991 he was named emeritus of the board of trustees james lindsay embrey jr was born in 1925 in gainesville texas he was the son of james lindsay and margaret n e marsh embrey his great grandfather was james menees lindsay 1835 1919 who migrated to cooke county texas from tennessee in 1857 and became a real estate developer judge and philanthropist he g

## Evaluation of BM25 scoring document

In [53]:
# BM25 run for the test queries: {query id (str): {doc id (str): BM25 score}},
# keeping the 20 best hits per query.
query_doc = {}

# Fix: the test-query frame is named `testing` in this notebook; the previously
# used `test_queries` is never defined and raised NameError on a fresh kernel.
for i, row in testing.iterrows():
    search_res = elastic_search.search(index='wikiir', query=make_query(row['text_left']), size=20)['hits']
    query_doc[str(row['id_left'])] = {hit['_id']: hit['_score'] for hit in search_res['hits']}
                 

  search_res = es.search(index='wikiir', query=make_query(row['text_left']), size=20)['hits']


In [54]:
# Relevance judgements for the test queries.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Evaluate the BM25 run with P@10, P@20 and MAP@20.
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, query_doc)

{P@20: 0.14700000000000005,
 P@10: 0.20599999999999988,
 AP@20: 0.14572550811737783}

In [55]:
# Sanity check: number of queries contained in the run.
len(query_doc)

100

## Initiating MSMARCO Model and creating Cosine similarity scoring document

In [105]:
def get_cosine_similarity_testing_document_from_model(model, query_doc, queries=None, documents=None):
    '''
    Description:
        Re-scores every (query, document) pair of `query_doc` with the cosine similarity
        between the model embedding of the query text and of the document text.
    Args:
        model: SentenceTransformer model('msmarco-distilbert-cos-v5')
        query_doc: queryid-document-score nested dict
        queries: DataFrame with the columns 'id_left'/'text_left' holding the query texts
                 (defaults to the notebook-level `testing` frame)
        documents: DataFrame with the columns 'id_right'/'text_right' holding the document texts
                   (defaults to the notebook-level `df` frame)
    Returns:
        query_doc_cosine: queryid-document-cosine_similarity_score nested dict
    Raises:
        IndexError: if a query id or a document id of `query_doc` has no matching row
    '''
    if queries is None:
        queries = testing
    if documents is None:
        documents = df

    # Build the doc id -> text lookup with ONE pass over the documents frame.
    # Filtering the whole frame once per document was the dominant cost of this function.
    needed_doc_ids = {int(doc_id) for docs in query_doc.values() for doc_id in docs}
    needed_docs = documents[documents['id_right'].isin(list(needed_doc_ids))]
    # keep the first row per id, i.e. the row a first-match lookup would return
    needed_docs = needed_docs.drop_duplicates(subset='id_right')
    doc_text_by_id = dict(zip(needed_docs['id_right'], needed_docs['text_right']))

    q_ids = list(query_doc.keys())  # q_ids -- list of strings
    query_doc_cosine = {}

    for q_id in tqdm(q_ids, total=len(q_ids), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):

        # encoding the query
        query_text = queries[queries['id_left']==int(q_id)].iloc[0]['text_left']
        query_embedding = model.encode(query_text)
        query_doc_cosine[q_id] = {}

        # nothing retrieved for this query: leave its dict empty and skip the encoder
        doc_ids = list(query_doc[q_id])
        if not doc_ids:
            continue

        # documents encoding
        docs_texts = []
        for doc_id in doc_ids:
            try:
                docs_texts.append(doc_text_by_id[int(doc_id)])
            except KeyError:
                # same exception type as the positional lookup this replaces
                raise IndexError(f'document id {doc_id} not found in the documents frame') from None
        docs_embedding = model.encode(docs_texts)

        # computing cosine similarity (row 0 = the single query against all documents)
        cos_sim_score = util.cos_sim(query_embedding, docs_embedding)[0]
        for i, doc_id in enumerate(doc_ids):
            query_doc_cosine[q_id][doc_id] = cos_sim_score[i].item()

    return query_doc_cosine

## Evaluation of Cosine Similarity scoring document

In [57]:
# Load the sentence-embedding model used for re-scoring.
model = SentenceTransformer('msmarco-distilbert-cos-v5')

# Re-score the BM25 hits of every test query with query/document cosine similarity.
query_doc_cosine = get_cosine_similarity_testing_document_from_model(model, query_doc)

100%|██████████████████████████████| 100/100 [04:46<00:00,  2.87s/it]                                                  


In [58]:
# Re-read the test relevance judgements before evaluating.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Same measures as for BM25, now on the cosine-similarity scores.
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, query_doc_cosine)

{P@20: 0.14800000000000005,
 P@10: 0.2339999999999999,
 AP@20: 0.17022283800557722}

## Printing the scores of the first testing query id

In [59]:
# Print the cosine scores of the first test query (at most 20 documents).
# Fix: the query id is taken from the run itself instead of being hard-coded, so the
# header and the listed documents cannot disagree. Slicing (rather than indexing
# positions 0..19) cannot raise IndexError when a query has fewer than 20 documents.
first_q_id = list(query_doc_cosine.keys())[0]
print("Query ID: ", first_q_id)
for doc_id, score in list(query_doc_cosine[first_q_id].items())[:20]:
    print("Doc",doc_id,", score",round(score,3))

Query ID:  158491
Doc 1880296 , score 0.267
Doc 607552 , score 0.382
Doc 2261272 , score 0.188
Doc 1957435 , score 0.368
Doc 625257 , score 0.247
Doc 635537 , score 0.506
Doc 1774491 , score 0.327
Doc 663828 , score 0.199
Doc 158491 , score 0.44
Doc 1956922 , score 0.164
Doc 1180246 , score 0.159
Doc 1170039 , score 0.264
Doc 945068 , score 0.163
Doc 589549 , score 0.22
Doc 360918 , score 0.384
Doc 685181 , score 0.22
Doc 2411344 , score 0.098
Doc 1158969 , score 0.275
Doc 1093529 , score 0.277
Doc 742912 , score 0.373


# Bonus Part

## Reading the train subset

In [62]:
# Training queries, with the same columns as the test queries ('id_left', 'text_left').
training = pd.read_csv('wikIR1k/train/queries.csv')
training

Unnamed: 0,id_left,text_left
0,123839,yanni
1,188629,k pop
2,13898,venice film festival
3,316959,downtown brooklyn
4,515031,pennsylvania house of representatives
...,...,...
1439,896124,british ceylon
1440,12319,scottish national party
1441,4421,cinema of china
1442,296526,gold mining


In [63]:
# Smoke test: BM25 hits for the first training query.
search(make_query(training['text_left'][0]))

Doc 123839, score is 18.237452
Doc 806300, score is 17.769026
Doc 806326, score is 17.390072
Doc 836567, score is 16.393133
Doc 1793430, score is 15.605509
Doc 806075, score is 15.605509
Doc 806263, score is 14.116629
Doc 799188, score is 10.251266
Doc 1901730, score is 9.908206
Doc 817579, score is 9.660697
Doc 1727316, score is 9.660697


  return pretty_print_result(es.search(index='wikiir', query=query, size=20), args)


In [64]:
# Hit count reported for the last search (read from the global `res`),
# not the size of the whole collection.
print(f'Total documents: {res["total"]["value"]}')

Total documents: 11


In [65]:
# Raw hits section of the last search response (set by pretty_print_result).
res

{'total': {'value': 11, 'relation': 'eq'},
 'max_score': 18.237452,
 'hits': [{'_index': 'wikiir',
   '_type': '_doc',
   '_id': '123839',
   '_score': 18.237452,
   '_source': {'text': 'although this genre of music was not well suited for commercial pop radio and music television yanni received international recognition by producing concerts at historic monuments and by producing videos that were broadcast on public television his breakthrough concert live at the acropolis yielded the second best selling music concert video of all time additional historic sites for yanni s concerts have included india s taj mahal china s forbidden city the united arab emirates burj khalifa russia s kremlin puerto rico s el morro castle lebanon s ancient city of byblos tunisia s roman theatre of carthage india s laxmi vilas palace the egyptian pyramids and great sphinx of giza and the amman citadel at least sixteen of yanni s albums have peaked at no 1 in billboard s top new age album category and two 

## Re-ranking

### Sampling 100 training queries

In [121]:
# Draw 100 training queries; the fixed random_state makes the sample repeatable.
training_sample = training.sample(100, random_state=123)
training_sample

Unnamed: 0,id_left,text_left
218,73285,smokey robinson
1341,1271109,gossip girl
1047,13944,vermont
1201,13000,tiberius
1195,73669,shivaji
...,...,...
1327,7535,latin literature
646,1264172,english studies
185,24349,bourbon restoration
1242,8998,nigeria


## Scoring the training sample with BM25

In [122]:
# Creating query-doc-score nested dict for the sampled training queries:
# {query id (str): {doc id (str): BM25 score}}, keeping the 50 best hits per query.
# NOTE: this rebinds `query_doc`, which earlier held the BM25 run of the test queries.

query_doc = {}

for i, row in training_sample.iterrows():
    # Fix: the client object is `elastic_search`. `elasticsearch` is only the package
    # name and is never bound in this notebook, so the original call raised NameError.
    search_res = elastic_search.search(index='wikiir', query=make_query(row['text_left']), size=50)['hits']
    query_doc[str(row['id_left'])] = {hit['_id']: hit['_score'] for hit in search_res['hits']}

  search_res = es.search(index='wikiir', query=make_query(row['text_left']), size=50)['hits']


## Normalization

In [123]:
def normalize_query_doc(query_doc):
    '''
    Description:
        Min-Max normalization of the scores, done separately for every query.
        The values are scaled linearly into the [0,1] range, with the minimum score of a
        query being mapped to 0 and its maximum score to 1.
        When all documents of a query share the same score (including the case of a
        single document) there is no range to scale by, and every document gets 0.0.
    Args:
        query_doc: queryid-document-score nested dict
    Returns:
        query_doc_norm: queryid-document-score nested dict with normalised scores
                        (queries without documents map to an empty dict)
    '''

    query_doc_norm = {}

    for query, doc_scores in query_doc.items():
        if not doc_scores:
            query_doc_norm[query] = {}
            continue

        scores_min = min(doc_scores.values())
        score_range = max(doc_scores.values()) - scores_min

        if score_range == 0:
            # min == max: assign 0.0 directly. Dividing the (zero) numerator by the
            # maximum gave the same value but raised ZeroDivisionError when all scores were 0.
            query_doc_norm[query] = {doc: 0.0 for doc in doc_scores}
        else:
            query_doc_norm[query] = {
                doc: (score - scores_min) / score_range
                for doc, score in doc_scores.items()
            }

    return query_doc_norm


# Normalise the BM25 scores of the training sample, then preview one query.
query_doc_norm = normalize_query_doc(query_doc)  # min-max normalizing BM25 scores
query_doc_norm[next(iter(query_doc_norm))] # Example: first query normalized docs scores


{'940359': 1.0,
 '891183': 0.9070208971351378,
 '313257': 0.9009368372057258,
 '1178786': 0.8785646663768064,
 '2095859': 0.871778916620803,
 '1447257': 0.8535369073670035,
 '2095725': 0.8404417278844232,
 '73285': 0.8250473141737416,
 '1586500': 0.7998564413654435,
 '1586232': 0.7785803767705302,
 '1002413': 0.7463635909044294,
 '1288012': 0.7135760223178509,
 '2303869': 0.7065682158933247,
 '1228192': 0.6943614171838423,
 '1289415': 0.6569291504390781,
 '152342': 0.6206207277495096,
 '1741112': 0.4517866970796467,
 '1253150': 0.41229235447795154,
 '1746762': 0.3466904811271479,
 '122936': 0.31675251137077304,
 '2450230': 0.29875428674542825,
 '1284918': 0.26118980305299405,
 '1608426': 0.2539514645145816,
 '1587519': 0.24306182706408483,
 '889237': 0.22662547292022003,
 '1177941': 0.2222894350722713,
 '2135393': 0.2222894350722713,
 '1527105': 0.21535166664624847,
 '1399732': 0.21535166664624847,
 '711399': 0.21535166664624847,
 '1615027': 0.21535166664624847,
 '1130382': 0.206730597

## Acquiring cosine similarities for the training set

In [124]:
def get_cosine_similarity_training_document_from_model(model, query_doc, queries=None, documents=None):
    '''
    Description:
        Re-scores every (query, document) pair of `query_doc` with the cosine similarity
        between the model embedding of the query text and of the document text.
    Args:
        model: SentenceTransformer model('msmarco-distilbert-cos-v5')
        query_doc: queryid-document-score nested dict
        queries: DataFrame with the columns 'id_left'/'text_left' holding the query texts
                 (defaults to the notebook-level `training_sample` frame)
        documents: DataFrame with the columns 'id_right'/'text_right' holding the document texts
                   (defaults to the notebook-level `df` frame)
    Returns:
        query_doc_cosine: queryid-document-cosine_similarity_score nested dict
    Raises:
        IndexError: if a query id or a document id of `query_doc` has no matching row
    '''
    if queries is None:
        queries = training_sample
    if documents is None:
        documents = df

    # Build the doc id -> text lookup with ONE pass over the documents frame.
    # Filtering the whole frame once per document was the dominant cost of this function.
    needed_doc_ids = {int(doc_id) for docs in query_doc.values() for doc_id in docs}
    needed_docs = documents[documents['id_right'].isin(list(needed_doc_ids))]
    # keep the first row per id, i.e. the row a first-match lookup would return
    needed_docs = needed_docs.drop_duplicates(subset='id_right')
    doc_text_by_id = dict(zip(needed_docs['id_right'], needed_docs['text_right']))

    q_ids = list(query_doc.keys())  # q_ids -- list of strings
    query_doc_cosine = {}

    for q_id in tqdm(q_ids, total=len(q_ids), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):

        # encoding the query
        query_text = queries[queries['id_left']==int(q_id)].iloc[0]['text_left']
        query_embedding = model.encode(query_text)
        query_doc_cosine[q_id] = {}

        # nothing retrieved for this query: leave its dict empty and skip the encoder
        doc_ids = list(query_doc[q_id])
        if not doc_ids:
            continue

        # documents encoding
        docs_texts = []
        for doc_id in doc_ids:
            try:
                docs_texts.append(doc_text_by_id[int(doc_id)])
            except KeyError:
                # same exception type as the positional lookup this replaces
                raise IndexError(f'document id {doc_id} not found in the documents frame') from None
        docs_embedding = model.encode(docs_texts)

        # computing cosine similarity (row 0 = the single query against all documents)
        cos_sim_score = util.cos_sim(query_embedding, docs_embedding)[0]
        for i, doc_id in enumerate(doc_ids):
            query_doc_cosine[q_id][doc_id] = cos_sim_score[i].item()

    return query_doc_cosine

In [125]:
# Embedding model used for re-scoring; the commented-out line is an alternative model.
model = SentenceTransformer('msmarco-distilbert-cos-v5')
#model = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')

# Cosine similarities for the BM25 hits of the sampled training queries.
# NOTE: this rebinds `query_doc_cosine`, which earlier held the scores of the test queries.
query_doc_cosine = get_cosine_similarity_training_document_from_model(model, query_doc)

100%|██████████████████████████████| 100/100 [12:15<00:00,  7.35s/it]                                                  


## Getting the optimal alpha

In [131]:
def combine_function(query_doc1, query_doc2, alpha):
    '''
    Description:
        Linear interpolation of two runs: for every query and document of query_doc1
        the mixed score is alpha*score1 + (1-alpha)*score2.
    Args:
        query_doc1: queryid-document-score nested dict, weighted by alpha
        query_doc2: queryid-document-score nested dict, weighted by (1-alpha);
                    it is looked up for every document of query_doc1
        alpha: weight of query_doc1
    Returns:
        query_doc_mix: queryid-document-score nested dict with the mixed scores
    '''
    return {
        q_id: {
            doc_id: alpha*first_score + (1-alpha)*query_doc2[q_id][doc_id]
            for doc_id, first_score in docs.items()
        }
        for q_id, docs in query_doc1.items()
    }


In [132]:
# Retrieving the training qrels and keeping only the judgements that belong to
# the sampled training queries.
samples_q_ids = training_sample['id_left'].tolist()

qrels = list(ir_measures.read_trec_qrels('wikIR1k/train/qrels'))

# qrel[0] is the first field of a judgement and is compared with the sampled query ids
qrels_sample = [qrel for qrel in qrels if int(qrel[0]) in samples_q_ids]

In [133]:
# Candidate weights for the BM25 side of the score mix, in steps of 0.1.
# With the upper weight of 1.0 the mixed score consists of the BM25 part only.
alpha_arr = np.arange(0.1, 1.1, 0.1)
alpha_arr

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [139]:
# MAP@20 on the training sample for every candidate alpha: {alpha: MAP@20}
results = {}

for alpha in alpha_arr:
    query_doc_mix = combine_function(query_doc_norm, query_doc_cosine, alpha)
    aggregated = ir_measures.calc_aggregate([MAP@20], qrels_sample, query_doc_mix)
    # only one measure was requested, so the aggregate holds exactly one value
    map20 = next(iter(aggregated.values()))
    results[alpha] = map20

# Best score first; the sort is stable, so ties keep the order of alpha_arr.
ranking = sorted(results.items(), key=lambda pair: pair[1], reverse=True)
results_sorted = dict(ranking)

print('alpha \t MAP@20')
print("---------------")
for alpha, score in ranking:
    print(f' {round(alpha,1)}\t{round(score,5)}')


# The first key of the sorted dict is the alpha with the highest MAP@20.
alpha_best_score = next(iter(results_sorted))

print("\nBest alpha:",alpha_best_score, ", Score:",results_sorted[alpha_best_score])

alpha 	 MAP@20
---------------
 0.2	0.20377
 0.3	0.19877
 0.1	0.19826
 0.4	0.19535
 0.5	0.18978
 0.6	0.18328
 0.7	0.18095
 0.8	0.17577
 0.9	0.17394
 1.0	0.16886

Best alpha: 0.2 , Score: 0.2037672374159032


##  Applying the combine function to test queries

### Normalised BM25 scores

In [98]:
# BM25 run for the test queries with the 50 best hits per query:
# {query id (str): {doc id (str): BM25 score}}
query_doc_bm25 = {}

for row_idx, row in testing.iterrows():
    response = elastic_search.search(index='wikiir', query=make_query(row['text_left']), size=50)
    hits = response['hits']['hits']
    query_doc_bm25[str(row['id_left'])] = {hit['_id']: hit['_score'] for hit in hits}

# Evaluate the raw BM25 scores against the test relevance judgements.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
bm25_metrics = ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, query_doc_bm25)
print(bm25_metrics)

  search_res = es.search(index='wikiir', query=make_query(row['text_left']), size=50)['hits']


{P@20: 0.14700000000000005, P@10: 0.20599999999999988, AP@20: 0.14533487119899494}


### Normalized cosine similarity scores

In [106]:
# Cosine similarities for the 50 BM25 hits of every test query.
query_doc_cosine_2 = get_cosine_similarity_testing_document_from_model(model, query_doc_bm25)

100%|██████████████████████████████| 100/100 [12:08<00:00,  7.29s/it]                                                  


In [111]:
# Evaluate the cosine-similarity scores of the test queries on their own.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
print(ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, query_doc_cosine_2))


{P@20: 0.15800000000000003, P@10: 0.23300000000000007, AP@20: 0.1763766531713263}


### Combination of BM25 and CosineSimilarity (alpha_best)

In [112]:
# Min-max normalise the BM25 scores of the test queries (separately per query)
# before they are mixed with the cosine similarities.
query_doc_bm25_norm = normalize_query_doc(query_doc_bm25)

In [113]:
# Mix the normalised BM25 scores and the cosine similarities of the test queries,
# using the alpha that scored best on the training sample.
query_doc_final = combine_function(query_doc_bm25_norm, query_doc_cosine_2, alpha_best_score)

In [114]:
# Evaluate the combined scores against the test relevance judgements.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
print(ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, query_doc_final))

{P@20: 0.16450000000000006, P@10: 0.24200000000000008, AP@20: 0.18128666321804765}


In [115]:
# The alpha that was used for the combination above.
alpha_best_score

0.2

## Printing the scores of the first testing query id

In [145]:
# Print the combined scores of the first test query (at most 50 documents), with
# a separator line after the first 25.
# Fix: the query id is taken from the run itself instead of being hard-coded, so the
# header and the listed documents cannot disagree. Slicing (rather than indexing
# positions 0..49) cannot raise IndexError when a query has fewer than 50 documents.
first_q_id = list(query_doc_final.keys())[0]
print("Query ID: ", first_q_id)
for i, (doc_id, score) in enumerate(list(query_doc_final[first_q_id].items())[:50]):
    if i == 25:
        print("---------------")
    print("Doc",doc_id,", score",round(score,3))

Query ID:  158491
Doc 1880296 , score 0.414
Doc 607552 , score 0.504
Doc 2261272 , score 0.348
Doc 1957435 , score 0.48
Doc 625257 , score 0.382
Doc 635537 , score 0.585
Doc 1774491 , score 0.437
Doc 663828 , score 0.327
Doc 158491 , score 0.499
Doc 1956922 , score 0.277
Doc 1180246 , score 0.257
Doc 1170039 , score 0.339
Doc 945068 , score 0.258
Doc 360918 , score 0.434
Doc 589549 , score 0.302
Doc 685181 , score 0.295
Doc 2411344 , score 0.197
Doc 1158969 , score 0.337
Doc 1093529 , score 0.334
Doc 742912 , score 0.408
Doc 967619 , score 0.302
Doc 2337647 , score 0.151
Doc 1059585 , score 0.342
Doc 637819 , score 0.064
Doc 1397771 , score 0.333
---------------
Doc 2225325 , score 0.197
Doc 1079407 , score 0.177
Doc 1485043 , score 0.167
Doc 2390322 , score 0.153
Doc 1422090 , score 0.306
Doc 1490799 , score 0.238
Doc 289756 , score 0.253
Doc 547150 , score 0.135
Doc 13801 , score 0.239
Doc 621578 , score 0.251
Doc 313493 , score 0.5
Doc 345165 , score 0.332
Doc 1454621 , score 0.316
