In [27]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from sentence_transformers import SentenceTransformer, util
import ir_measures
from ir_measures import *
import pandas as pd
from tqdm import tqdm
import torch
from time import time

In [33]:
es = Elasticsearch('http://elastic:Jx2cwj2gYvMZAloGgEez@localhost:9200', timeout=100)

In [29]:
df = pd.read_csv('documents.csv')
df

Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...
...,...,...
369716,59396,the population was 416 at the 2010 census the ...
369717,1950034,the surface of the river is frozen from novemb...
369718,1984468,the first anti thrombin aptamer tba was genera...
369719,33966,state of oklahoma as of the 2010 census the po...


In [30]:
test_queries = pd.read_csv('queries.csv')
test_queries

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [31]:
index = 'wiki'

settings_fin = {
    'mappings': {
        'properties': {
        'text': {
            'type': 'text',
            'analyzer': 'white'
        }
        }
    },
    'settings': {
        'analysis' : {
            'analyzer' : {
            'white' : {
                'tokenizer' : 'whitespace'
            }
            }
        }
    }
}

if es.indices.exists(index=index):
    es.indices.delete(index=index)
es.indices.create(index=index, body=settings_fin)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'}

In [36]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

def es_action_generator(df):
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc = {
            'text': row['text_right'],
        }
        yield create_es_action(index, row['id_right'], doc)
        
start = time()
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
es.indices.refresh(index=index)

100%|████████████████████████████████████████████████████████████████████████| 369721/369721 [01:45<00:00, 3500.76it/s]


Indexing time: 112.53176641464233


{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [40]:
def pretty_print_result(search_result, fields=[]):
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    return pretty_print_result(es.search(index=index, body=query, size=20), args)

def generate_scores(test):
    result = {}
    for _, row in test.iterrows():
        query_id = str(row['id_left'])
        query = row['text_left']
        
        search_body = {
            "query": {
                "match": {
                    "text": query
                }
            },
            "size": 20
        }
        
        response = es.search(index=index, body=search_body)
        hits = response['hits']['hits']
        
        scores = {}
        for hit in hits:
            doc_id = hit['_id']
            score = hit['_score']
            scores[doc_id] = score
            
        result[query_id] = scores
        
    return result

In [41]:
# BM25 results
BM25 = ir_measures.read_trec_run('BM25.res')
qrels = ir_measures.read_trec_qrels('qrels')

ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, BM25)

{AP@20: 0.0974761709661997,
 P@20: 0.09499999999999999,
 P@10: 0.1319999999999999}

In [42]:
# My results: test
result = generate_scores(test_queries)
qrels = ir_measures.read_trec_qrels('qrels')

ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, result)

{AP@20: 0.14917394561952935,
 P@20: 0.15000000000000008,
 P@10: 0.20599999999999988}

In [45]:
def model_res(model, run):
    
    run_cosine = {}
    
    for q_id, doc_scores in tqdm(run.items(), total=len(run), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        
        # query encoding
        query_text = test_queries.loc[test_queries['id_left'] == int(q_id), 'text_left'].iloc[0]
        query_embedding = model.encode(query_text)
        run_cosine[q_id] = {}
        
        # document encoding and cosine similarity calculation
        for doc_id in doc_scores:
            doc_text = df.loc[df['id_right'] == int(doc_id), 'text_right'].iloc[0]
            doc_embedding = model.encode(doc_text)
            cos_sim = util.pytorch_cos_sim(query_embedding, doc_embedding)[0].item()
            run_cosine[q_id][doc_id] = cos_sim
            
    return run_cosine

In [46]:
model = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')
cos_sim = model_res(model, run)

100%|██████████████████████████████| 100/100 [8:43:41<00:00, 314.21s/it]                                               


In [47]:
qrels = ir_measures.read_trec_qrels('qrels')

ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, cos_sim)

{AP@20: 0.1666900689321254,
 P@20: 0.14800000000000008,
 P@10: 0.2139999999999999}