In [16]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import pandas as pd
from tqdm import tqdm
from time import time
import numpy as np
import ir_measures
from ir_measures import *
from sentence_transformers import SentenceTransformer, util

Elasticsearch

In [3]:
# Load the document collection; later cells read its 'id_right' (document id)
# and 'text_right' (document text) columns.
df = pd.read_csv('wikIR/documents.csv')

In [4]:
# Elasticsearch client pointed at http://localhost:9200; used by the indexing
# and search cells below.
es = Elasticsearch('http://localhost:9200')

In [5]:
# Index mapping: each document's text lives in the single full-text field
# '_document', analysed with the built-in 'standard' analyzer.
mappings = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

# Index settings. 'analysis.analyzer' has to map an analyzer *name* to its
# definition; a bare string such as 'standard' is not an analyzer definition.
# Registering 'standard' as the index's default analyzer states the same
# intent in the documented form.
settings = {
    'analysis': {
        'analyzer': {
            'default': {
                'type': 'standard'
            }
        }
    }
}


def create_es_action(index, doc_id, document):
    """Build one bulk action dict for a single document.

    Args:
        index: Name of the target index (stored under '_index').
        doc_id: Identifier of the document (stored under '_id').
        document: Document body (stored under '_document').

    Returns:
        dict with the keys '_index', '_id' and '_document', in that order.
    """
    action = dict(_index=index, _id=doc_id, _document=document)
    return action


def es_action_generator():
    """Yield one bulk action per row of the global ``df``.

    Reads the 'id_right' and 'text_right' columns of ``df`` and the global
    ``index_name``. Because this is a generator, ``index_name`` is looked up
    when iteration starts, so it only has to exist by then. Progress is
    reported through tqdm.

    Yields:
        The dict produced by ``create_es_action`` for each row, in row order.
    """
    # Walk only the two columns that are needed. DataFrame.iterrows() builds a
    # full Series object for every row, which is needlessly slow on a large
    # collection.
    rows = zip(df['id_right'], df['text_right'])
    for doc_id, doc in tqdm(rows, total=len(df)):
        yield create_es_action(index_name, doc_id, doc)

In [None]:
# (Re)build the index from scratch: drop an existing index of this name, then
# create it with the `settings` and `mappings` dicts.
index_name='wiki2'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

# Bulk-index every action from es_action_generator() and time the whole run.
start = time()
for ok, result in parallel_bulk(es, es_action_generator(), queue_size=4, thread_count=4, chunk_size=1000):
    # Print the result of any action that was not reported as successful.
    if not ok:
        print(result)
stop = time()

# Elapsed time in seconds (difference of two time.time() readings).
print('Indexing time:', stop-start)

In [20]:
def pretty_print_result(search_result):
    """Reduce a search response to a ``{document id: score}`` mapping.

    Args:
        search_result: Mapping with a 'hits' entry whose own 'hits' entry is
            an iterable of hits, each providing '_id' and '_score'.

    Returns:
        dict mapping every hit's '_id' to its '_score', in hit order.
    """
    hit_list = search_result['hits']['hits']
    return {entry["_id"]: entry["_score"] for entry in hit_list}
        
    
def search(query, i):
    """Run ``query`` against index ``i`` and return the scored hits.

    Args:
        query: Query body, passed to ``es.search`` as ``query``.
        i: Name of the index to search.

    Returns:
        The ``{document id: score}`` dict that ``pretty_print_result`` builds
        from the response; the search is issued with ``size=20``.
    """
    response = es.search(index=i, query=query, size=20)
    return pretty_print_result(response)


def query(l):
    """Build the bool query issued for one query string.

    The text must match the '_document' field (``match`` clause under
    'must'); a ``match_phrase`` clause on the same field is added under
    'should'.

    Args:
        l: The query text.

    Returns:
        dict of the form ``{'bool': {'must': ..., 'should': ...}}``.
    """
    required = {'match': {'_document': l}}
    preferred = {'match_phrase': {'_document': {"query": l}}}
    return {'bool': {'must': required, 'should': preferred}}


def querysearch(queries, indexname):
    """Run every query in ``queries`` against ``indexname``.

    Args:
        queries: Table with parallel 'text_left' (query text) and 'id_left'
            (query id) columns, e.g. a pandas DataFrame.
        indexname: Name of the index to search.

    Returns:
        dict mapping ``str(query id)`` to the ``{document id: score}`` dict
        returned by ``search`` for that query.
    """
    results = {}
    # Walk the two columns in parallel rather than using 0..n-1 as index
    # labels: a positional counter used as a label raises (or silently picks
    # another row) as soon as the frame lacks a default RangeIndex, e.g. after
    # filtering or sampling.
    for query_id, text in zip(queries['id_left'], queries['text_left']):
        results[str(query_id)] = search(query(text), indexname)
    return results

Test queries

In [10]:
# Test-set queries; later cells read the 'id_left' (query id) and 'text_left'
# (query text) columns.
dfq_t = pd.read_csv('wikIR/test/queries.csv')

BM25

In [21]:
# First-stage retrieval (labelled "BM25" in this notebook): run every test
# query against the 'wiki2' index. runs[query id][document id] -> score.
runs = querysearch(dfq_t, 'wiki2')

Cosine similarity re-ranking of the retrieved documents

In [9]:
# Sentence-embedding model used below to encode both the queries and the
# retrieved documents.
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5')

In [22]:
# Re-score each query's retrieved documents with sentence embeddings.
# Result: cosine[query id][document id] -> similarity score.

# Build the id -> text lookups once instead of scanning the full frames with
# np.where for every query and every retrieved document. drop_duplicates keeps
# the first occurrence, matching the earlier "first match" ([0][0]) lookup.
query_text_by_id = dfq_t.drop_duplicates('id_left').set_index('id_left')['text_left'].to_dict()
doc_text_by_id = df.drop_duplicates('id_right').set_index('id_right')['text_right'].to_dict()

cosine = {}
for qdp in tqdm(runs):
    qp_cosine = {}
    retrieved = runs[str(qdp)]
    if len(retrieved) != 0:
        # Deliberately not named `query`: that would overwrite the query()
        # builder function and break any later call to querysearch().
        query_text = query_text_by_id[int(qdp)]
        docs = [doc_text_by_id[int(doc)] for doc in retrieved]
        query_emb = model.encode(query_text)
        doc_emb = model.encode(docs)
        # One score per retrieved document, in the same order as `retrieved`.
        # NOTE(review): a dot product equals cosine similarity only for
        # unit-length embeddings - confirm this model normalises its output.
        dot_scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        for doc, score in zip(retrieved, dot_scores):
            qp_cosine[str(doc)] = score
    cosine[str(qdp)] = qp_cosine

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [03:52<00:00,  2.32s/it]


Evaluating

In [6]:
# Relevance judgements for the test queries. The file has no header row, so
# the four columns are named here: query id ('id_left'), 'n_u' (not read by
# the cells shown in this notebook), document id ('id_right') and 'label'.
dfbm = pd.read_table('wikIR/test/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])

In [7]:
# Build the nested relevance mapping passed to ir_measures:
# qrels[str(query id)][str(document id)] -> int relevance label.
#
# A single pass over the rows replaces one np.where scan of the whole table
# per (query, document) pair. setdefault keeps the first label seen for a
# repeated pair, which is the row the earlier [0][0] lookup selected.
qrels = {}
for q_id, d_id, label in zip(dfbm['id_left'], dfbm['id_right'], dfbm['label']):
    qrels.setdefault(str(q_id), {}).setdefault(str(d_id), int(label))
# NOTE: ir_measures may offer a built-in reader for qrels files; this
# hand-written construction is kept deliberately.

In [23]:
# Evaluation setup.
# One-column frame holding the test query ids.
results = pd.DataFrame(dfq_t['id_left'])
# Display names and the matching runs, in the same order (paired by position
# in the reporting loop).
collection = ['Retrieved', 'Cosine similarity']
run = [runs, cosine]
# Precision at cutoffs 10 and 20 and average precision at cutoff 20, each with
# rel=1. P and AP presumably come from `from ir_measures import *`.
measures = [(P(rel=1)@10),(P(rel=1)@20),(AP(rel=1)@20)]

In [24]:
# Print the aggregated measures for every run, labelled with the name stored
# at the same position in `collection`.
for i, scored_run in enumerate(run):
    aggregated = ir_measures.calc_aggregate(measures, qrels, scored_run)
    print("Results for collection", collection[i], aggregated)

Results for collection Retrieved {AP@20: 0.1227333070822508, P@10: 0.19099999999999995, P@20: 0.13200000000000003}
Results for collection Cosine similarity {AP@20: 0.1460335516258388, P@10: 0.18899999999999992, P@20: 0.13200000000000003}
