In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import pandas as pd
from tqdm import tqdm
from time import time
import numpy as np
import ir_measures
from ir_measures import *
from sentence_transformers import SentenceTransformer, util

## Collection indexing

In [2]:
df = pd.read_csv('wikIR/documents.csv')

In [3]:
es = Elasticsearch('http://localhost:9200')

In [4]:
#Without stemming

mappings = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

settings_w = {
    'analysis' : {
        'analyzer' : 'standard' 
        
    }
}


def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_document': document
    }


def es_action_generator():
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name, row['id_right'], doc)

In [None]:
index_name='wiki2'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

start = time()
for ok, result in parallel_bulk(es, es_action_generator(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        

In [42]:
def pretty_print_result(search_result):
    res = search_result['hits']
    results = {}
    for hit in res['hits']:
        results[hit["_id"]] = hit["_score"]
    return results
        
    
def search(query, i):
    s = pretty_print_result(es.search(index=i, query=query, size=50))
    return s


def query(l):
    pop = {
        'bool': {
            'must': {
                    'match': {
                        '_document': l
                    }
            },
                'should': {
                    'match_phrase': {
                        '_document': {
                            "query": l,
                            "boost": 2,
                            'slop': 10
                        }                           
                    }                    
                },
            "minimum_should_match": 1
            }
        }
    
    return pop


def querysearch(queries, indexname):
    results ={}
    for q in range(len(queries['text_left'])):
        row = queries['text_left'][q]
        qu = query(row)
        res = search(qu, indexname)
        results[str(queries['id_left'][q])] = res
    return results

In [6]:
dfq_tr = pd.read_csv('wikIR/training/queries.csv')
dfq_t = pd.read_csv('wikIR/test/queries.csv')


### Train data

#### Sampling and retrieving using Elasticsearch 

In [7]:
tr_s = dfq_tr.sample(n=200).sort_values(by=['id_left']).reset_index(drop=True)

In [8]:
runs = querysearch(tr_s, 'wiki2')

In [9]:
#Min-max normalise BM25
scores = []
for qdp in runs:
    if len(runs[str(qdp)]) != 0:
        for doc in runs[str(qdp)]:
            scores.append(runs[str(qdp)][str(doc)])
            
max_score = np.max(scores)
min_score = np.min(scores)
scores = (scores - min_score)/(max_score - min_score)

i = 0
for qdp in runs:
    if len(runs[str(qdp)]) != 0:
        for doc in runs[str(qdp)]:
            runs[str(qdp)][str(doc)] = scores[i]
            i += 1

#### Cosine similarities for query/document embeddings

In [10]:
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')


In [11]:
cosine = {}
for qdp in tqdm(runs):
    qp_cosine = {}
    if len(runs[str(qdp)]) != 0:
        query = tr_s['text_left'][np.where(tr_s['id_left'] == int(qdp))[0][0]]
        docs = [df['text_right'][np.where(df['id_right'] == int(doc))[0][0]] for doc in runs[str(qdp)]]
        query_emb = model.encode(query)
        doc_emb = model.encode(docs)
        dot_scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        i = 0
        for doc in runs[str(qdp)]:
            qp_cosine[str(doc)] = dot_scores[i]
            i += 1
    cosine[str(qdp)] = qp_cosine

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [14:44<00:00,  4.42s/it]


In [12]:
#Min-max normalise
scores_cos = []
for qdp in cosine:
    if len(cosine[str(qdp)]) != 0:
        for doc in cosine[str(qdp)]:
            scores_cos.append(cosine[str(qdp)][str(doc)])
max_min = [1.0, -1.0]            
max_score_cos = np.max(max_min)
min_score_cos = np.min(max_min)
scores_cos = (scores_cos - min_score_cos)/(max_score_cos - min_score_cos)

i = 0
for qdp in cosine:
    if len(cosine[str(qdp)]) != 0:
        for doc in cosine[str(qdp)]:
            cosine[str(qdp)][str(doc)] = scores_cos[i]
            i += 1

##### Just evaluating two rankings

In [13]:
dfbm_tr = pd.read_table('wikIR/training/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])

In [14]:
qrels_tr = {}
for q_id in dfbm_tr['id_left'].unique():
    did = {}
    for d_id in dfbm_tr['id_right'][np.where(dfbm_tr['id_left'] == q_id)[0]]:
        did[str(d_id)] = int(dfbm_tr['label'][np.where((dfbm_tr['id_left'] == q_id)&(dfbm_tr['id_right']==d_id))[0][0]]) 
    qrels_tr[str(q_id)] = did
#maybe I was just supposed to use their built-in function, but I have written that code, so I didn't change it 

In [30]:
results = pd.DataFrame(tr_s['id_left'])
collection = ['Retrieved', 'Cosine similarity']
run = [runs, cosine]
measures = [(AP(rel=1)@20)]

In [31]:
for i in range(len(run)):
    print("Results for collection", collection[i], ir_measures.calc_aggregate(measures, qrels_tr, run[i]))

Results for collection Retrieved {AP@20: 0.018747767843860168}
Results for collection Cosine similarity {AP@20: 0.021426151583429306}


#### Finding alpha that maximizes MAP@20

In [32]:
a = np.arange(0, 1.001, 0.001)
max_val = 0
best_alpha = 0
runs_copy = runs
cosine_copy = cosine
for alpha in tqdm(a): 
    candidate = {}
    for qdp in runs:
        candidate[str(qdp)] = {}
        if len(runs[str(qdp)]) != 0:
            for doc in runs[str(qdp)]:
                candidate[str(qdp)][str(doc)] = alpha * runs[str(qdp)][str(doc)] + (1 - alpha) * cosine[str(qdp)][str(doc)]
    score = ir_measures.calc_aggregate([(MAP(rel=1)@20)], qrels_tr, candidate)[AP@20]
    if score > max_val:
        max_val = score
        best_alpha = alpha

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [00:38<00:00, 26.17it/s]


In [38]:
print('Alpha that maximizes MAP@20 on train data: ', best_alpha)

Alpha that maximizes MAP@20 on train data:  0.261


### Test data

BM25 scores

In [43]:
test_BM25 = querysearch(dfq_t, 'wiki2')

In [44]:
#Min-max normalise BM25
test_scores = []
for qdp in test_BM25:
    if len(test_BM25[str(qdp)]) != 0:
        for doc in test_BM25[str(qdp)]:
            test_scores.append(test_BM25[str(qdp)][str(doc)])
            
test_max_score = np.max(test_scores)
test_min_score = np.min(test_scores)
test_scores = (test_scores - test_min_score)/(test_max_score - test_min_score)

i = 0
for qdp in test_BM25:
    if len(test_BM25[str(qdp)]) != 0:
        for doc in test_BM25[str(qdp)]:
            test_BM25[str(qdp)][str(doc)] = test_scores[i]
            i += 1

Cosine similarities

In [45]:
test_cosine = {}
for qdp in tqdm(test_BM25):
    qp_cosine = {}
    if len(test_BM25[str(qdp)]) != 0:
        query = dfq_t['text_left'][np.where(dfq_t['id_left'] == int(qdp))[0][0]]
        docs = [df['text_right'][np.where(df['id_right'] == int(doc))[0][0]] for doc in test_BM25[str(qdp)]]
        query_emb = model.encode(query)
        doc_emb = model.encode(docs)
        dot_scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        i = 0
        for doc in test_BM25[str(qdp)]:
            qp_cosine[str(doc)] = dot_scores[i]
            i += 1
    test_cosine[str(qdp)] = qp_cosine

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:20<00:00,  3.81s/it]


In [46]:
#Min-max normalise
test_scores_cos = []
for qdp in test_cosine:
    if len(test_cosine[str(qdp)]) != 0:
        for doc in test_cosine[str(qdp)]:
            test_scores_cos.append(test_cosine[str(qdp)][str(doc)])
max_min = [1.0, -1.0]            
max_score_cos = np.max(max_min)
min_score_cos = np.min(max_min)
test_scores_cos = (test_scores_cos - min_score_cos)/(max_score_cos - min_score_cos)

i = 0
for qdp in test_cosine:
    if len(test_cosine[str(qdp)]) != 0:
        for doc in test_cosine[str(qdp)]:
            test_cosine[str(qdp)][str(doc)] = test_scores_cos[i]
            i += 1

New ranking

In [49]:
new = {}
for qdp in test_BM25:
    new[str(qdp)] = {}
    if len(test_BM25[str(qdp)]) != 0:
        for doc in test_BM25[str(qdp)]:
            new[str(qdp)][str(doc)] = best_alpha * test_BM25[str(qdp)][str(doc)] + (1 - best_alpha) * test_cosine[str(qdp)][str(doc)]


Evaluating

In [47]:
dfbm_t = pd.read_table('wikIR/test/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])

In [48]:
qrels_t = {}
for q_id in dfbm_t['id_left'].unique():
    did = {}
    for d_id in dfbm_t['id_right'][np.where(dfbm_t['id_left'] == q_id)[0]]:
        did[str(d_id)] = int(dfbm_t['label'][np.where((dfbm_t['id_left'] == q_id)&(dfbm_t['id_right']==d_id))[0][0]]) 
    qrels_t[str(q_id)] = did
#maybe I was just supposed to use their built-in function, but I have written that code, so I didn't change it 

In [50]:
test_results = pd.DataFrame(dfq_t['id_left'])
test_collection = ['Retrieved', 'Cosine similarity', 'New ranking']
test_run = [test_BM25, test_cosine, new]
test_measures = [(P(rel=1)@10),(P(rel=1)@20),(AP(rel=1)@20)]

In [51]:
for i in range(len(test_run)):
    print("Results for collection", test_collection[i], ir_measures.calc_aggregate(test_measures, qrels_t, test_run[i]))

Results for collection Retrieved {AP@20: 0.11117170166130358, P@20: 0.11549999999999999, P@10: 0.17899999999999994}
Results for collection Cosine similarity {AP@20: 0.13082231663095173, P@20: 0.11999999999999998, P@10: 0.18299999999999997}
Results for collection New ranking {AP@20: 0.13357284517315268, P@20: 0.12349999999999998, P@10: 0.1839999999999999}
