Define the objects and functions needed for indexing:

In [2]:
import os
import sys
import argparse
import ast
import json
import time
import json
import pandas

import elasticsearch
from elasticsearch import helpers

# Address of the Elasticsearch node used by the ElasticSearch wrapper class.
ELASTI_HOST = '127.0.0.1' 
# Placeholder value: replace with the real port of the Elasticsearch node before running.
ELASTIC_PORT = 'port number' 

class ElasticSearch:
    """Small convenience wrapper around an ``elasticsearch.Elasticsearch`` client.

    The client is configured from the module-level ``ELASTI_HOST`` and
    ``ELASTIC_PORT`` constants and uses a 300-second request timeout.
    """

    def __init__(self):
        # Describe the node with explicit "host"/"port" keys. The previous
        # ``{ELASTI_HOST: ELASTIC_PORT}`` used the host address itself as the
        # dict key, so the configured address was never passed as host/port.
        config = {
            'host': ELASTI_HOST,
            'port': ELASTIC_PORT,
        }
        self.es = elasticsearch.Elasticsearch([config], timeout=300)
        self.last_scroll_id = None

    def create_index(self, name, mapping, replace=False):
        """Create index *name* with *mapping* as the request body.

        When *replace* is true the index is deleted first (a missing index
        is ignored by ``delete_index``).
        """
        if replace:
            self.delete_index(name)
        print("creating index, name: ", name)
        self.es.indices.create(index=name, body=mapping)
        print("index created successfully, index name: " + name)

    def delete_index(self, name):
        """Delete index *name*; HTTP 400/404 responses are ignored."""
        print("deleting index, name: ", name)
        self.es.indices.delete(index=name, ignore=[400, 404])
        print("index deleted successfully, index name: " + name)

    def index(self, documents, index_name, is_bulk=False):
        """Send *documents* to Elasticsearch through ``helpers.bulk``.

        Only the bulk path is implemented: when *is_bulk* is false nothing is
        indexed. *index_name* is accepted for interface compatibility but is
        not used here; the documents are handed to ``helpers.bulk`` as-is.
        Errors are printed rather than raised (best-effort behaviour).
        """
        if not is_bulk:
            return
        try:
            response = helpers.bulk(self.es, documents)
            print("\nRESPONSE:", response)
        except Exception as e:
            print("\nERROR:", e)

    def search(self, index, body):
        """Run a search on *index* with request *body*.

        Returns the client's response, or ``None`` (after printing the
        error) if the request raises.
        """
        try:
            return self.es.search(index=index, body=body)
        except Exception as e:
            print("\nERROR:", e)
            return None

In [None]:
def get_tokenized_docs_scidocs(index_name, docs_csv_path, application_in_training, type_):
    """Read a tab-separated document file and build one index action per row.

    The file is read without a header row; its three columns are named
    ``doc_id``, ``doc_text`` and ``bertext``. The ``doc_text`` column is used
    as the indexed content.

    Args:
        index_name: Value stored under ``"_index"`` in every action.
        docs_csv_path: Path of the tab-separated input file.
        application_in_training: Value stored unchanged in every action.
        type_: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with the keys ``_index``, ``_id``,
        ``application_in_training``, ``content`` and ``original_id``.
    """
    # Imported locally so the function does not depend on a ``pd`` alias
    # having been created by some other cell before it is called.
    import pandas as pd

    docs_df = pd.read_csv(docs_csv_path, sep='\t', names=['doc_id', 'doc_text', 'bertext'])

    docs = []
    for row in docs_df.itertuples(index=False):
        # Progress message before every 10000th document (including the first).
        if len(docs) % 10000 == 0:
            print("{} docs parsed".format(len(docs)))
        id_ = str(row.doc_id)
        # Index the plain text; switch to ``row.bertext`` to index the
        # tokenized text instead.
        contents = str(row.doc_text)
        docs.append({
            "_index": index_name,
            "_id": id_,
            "application_in_training": application_in_training,
            "content": contents,
            "original_id": id_,
        })
    return docs

def get_mapping(type_):
    """Return the index mapping for *type_*.

    For ``"para"`` no mapping is defined yet, so ``None`` is returned. For
    every other value the mapping is loaded from ``./mapping.json``
    (relative to the current working directory).
    """
    # Compare the argument, not the builtin ``type`` (which never equals a
    # string and made this branch unreachable).
    if type_ == "para":
        return None
    # Context manager so the file handle is closed deterministically.
    with open("./mapping.json") as mapping_file:
        return json.load(mapping_file)

def delete_index(index_name):
    """Delete *index_name* through a freshly constructed ``ElasticSearch`` wrapper."""
    ElasticSearch().delete_index(index_name)


def indexing_w_passaging(
    type_,
    index_name,
    file_path,
    application_in_training,
    replace_index,
    mapping,
    passaging_params={
        'text_attr': "text",
        'length': 0,
        'stride': 0,
        'prepend_title': False,
    },
):
    """Placeholder for passage-level indexing.

    Not implemented: all arguments are accepted and ignored, and ``None``
    is returned.
    """
    return None
            

def indexing_wo_passaging(type_,
                          index_name,
                          file_path,
                          application_in_training,
                          replace_index,
                          mapping,
                          passaging_params=None):
    """Index whole documents (no passaging) into *index_name*.

    Steps: parse the documents from *file_path*, create the index with
    *mapping* (deleting an existing one first when *replace_index* is true),
    then bulk-index the parsed documents.

    *passaging_params* is accepted for signature parity with the passaging
    variant and is not used.
    """
    # Fill the placeholders via str.format; passing the values as extra
    # print() arguments left the "{}" markers in the output.
    print("indexer run.. type: {}, index_name: {}, file_path: {}".format(
        type_, index_name, file_path))
    docs = get_tokenized_docs_scidocs(index_name, file_path, application_in_training, type_)
    es = ElasticSearch()

    print("creating index mapping...")
    es.create_index(index_name, mapping, replace=replace_index)
    print("index mapping created !")

    print("indexing documents started...: ")
    es.index(index_name=index_name, documents=docs, is_bulk=True)
    print("all docs indexed :)")
    
def indexing(passaging_mode, **kwargs):
    """Dispatch an indexing run.

    When *passaging_mode* is true nothing happens (passage-level indexing is
    not wired up). Otherwise ``indexing_wo_passaging`` is called with the
    keyword arguments ``type_``, ``index_name``, ``file_path``,
    ``application_in_training``, ``replace_index`` and ``mapping`` (all
    required) plus the optional ``passaging_params``.

    Raises:
        KeyError: if a required keyword argument is missing in
            non-passaging mode.
    """
    if passaging_mode:
        return

    indexing_wo_passaging(kwargs['type_'],
                          kwargs['index_name'],
                          kwargs['file_path'],
                          kwargs['application_in_training'],
                          kwargs['replace_index'],
                          kwargs['mapping'],
                          # Optional: the callee defaults this to None, so a
                          # missing key should not raise here.
                          kwargs.get('passaging_params'))

Set the parameters for indexing:


In [6]:
# ### for bm25
# index_setting = {
# #         "analysis": {
# #       "analyzer": {
# #         "my_analyzer": {
# #           "tokenizer": "whitespace",
# #           "filter": [
# #             "lowercase",
# #             "porter_stem"
# #           ]
# #         }
# #       }
# #     },
#     "index": {
#         "similarity": {
#             "default": {
#                 "type": "BM25",
#                 "b": bm25_b,
#                 "k1": bm25_k1
#             }
#         },
#         'number_of_shards': 1,
#         'number_of_replicas': 1
#     }}

In [3]:
# for lm
# Index settings for a language-model run: the default similarity of the index
# is set to "LMJelinekMercer". No similarity parameters are given here, so the
# server-side defaults presumably apply -- confirm against the Elasticsearch docs.
# A later cell merges this dict into mapping['settings'] before the index is created.
index_setting = {
    "index": {
        "similarity": {
              "default" : {
                "type" : "LMJelinekMercer",
              }
        },
        'number_of_shards': 1,
        'number_of_replicas': 1
    }}

Indexing:

In [None]:
import pandas as pd

# Placeholder: name of the index to (re)create.
index_name = 'name/of/the/index'
type_ = "whole" # use it as default

# Placeholder: path of the tab-separated document file read by
# get_tokenized_docs_scidocs (columns: doc_id, doc_text, bertext).
file_path = 'path/to/tokenized/docs/with/bert/or/scibert'

# False -> indexing() runs the whole-document path (indexing_wo_passaging).
passaging_mode = False

# Stored unchanged in every indexed document.
application_in_training = "corpus"
# True -> an existing index with the same name is deleted before creation.
replace_index = True
mapping = get_mapping(type_)


# Merge the similarity/shard settings into the mapping's settings section.
# This requires the loaded mapping to already contain a 'settings' key.
mapping['settings'].update(index_setting)


indexing(passaging_mode=passaging_mode,
         type_=type_,
         index_name=index_name,
         file_path=file_path,
         application_in_training=application_in_training,
         replace_index=replace_index,
         mapping=mapping,
         # Not used by the whole-document path; passed for completeness.
         passaging_params={'text_attr':"text",
                               'length':0,
                               'stride':0,
                               'prepend_title':False}, )

Connect to Elasticsearch:

In [1]:
from datetime import datetime
from elasticsearch import Elasticsearch

class ES_conn:
    """Holder for one Elasticsearch client, connected on construction."""

    def __init__(self):
        self.client = None
        self.start_connection()

    def start_connection(self):
        """Create a new client and store it on ``self.client``.

        The port value is a placeholder and has to be replaced with the
        real port before use.
        """
        self.client = Elasticsearch(
            host='localhost',
            port='select/port/number',
            timeout=30,
        )


# Module-level connection object used by the search helpers in later cells.
elastic_server = ES_conn()
# Note: ES_conn.__init__ already calls start_connection(); this second call
# replaces the client with a freshly created one.
elastic_server.start_connection()

In [None]:
# Refresh via the indices API. No index name is passed, so this presumably
# targets all indices -- confirm against the client documentation.
elastic_server.client.indices.refresh()

Read the test qrels with query text:

In [15]:
import pandas as pd

# Task to evaluate; also used later to build the path of the qrel file.
task_name = 'coview' # coread # cocite # cite  # coview
# Placeholder path. The file is parsed as space-separated; the retrieval cell
# reads the columns qid, query_text and docno from the resulting frame.
feb_qrel_w_tkned_qtext = pd.read_csv('path/to/test_qrel with scibert tokenized query text for task_name',
                                     sep=' ')

Ranking:

In [None]:
import functools
import operator
import secrets
import random
import csv
import os
import re
from elasticsearch import Elasticsearch
from tqdm import tqdm
import json 


# Load the docid2index JSON file (presumably a document-id -> index lookup).
# The context manager closes the file even if parsing raises.
with open('../data/docid2index.json') as f:
    docid2index = json.load(f)



def res_wrapper(query_id: str, q: int, doc_id: str, rank: int, score: float, run_id: str):
    """Pack one retrieval result into a list.

    The order of the returned list is
    ``[query_id, q, doc_id, rank, score, run_id]``.
    """
    row = [query_id, q]
    row.extend((doc_id, rank, score, run_id))
    return row


def es_search(q_id, query_text, docno_, index, top_k):
    """Score one candidate document against a query text.

    Builds a bool query that requires ``original_id`` to equal *docno_*
    (``must`` / ``term``) and optionally matches *query_text* against the
    ``content`` field (``should`` / ``match``, ``minimum_should_match`` 0),
    then runs it on *index* through the module-level ``elastic_server``.

    Args:
        q_id: Query identifier; accepted for interface compatibility, not used.
        query_text: Text matched against the ``content`` field.
        docno_: Value the ``original_id`` field must equal.
        index: Name of the index to search.
        top_k: Maximum number of hits to return (the query's ``size``).

    Returns:
        The response returned by the Elasticsearch client.
    """
    bool_query = {
        # Use the ``top_k`` parameter; the body previously referenced the
        # undefined name ``topk`` and raised NameError on every call.
        "size": top_k,
        "query": {
            "bool": {
                "must": [
                    {"term": {"original_id": docno_}}
                ],
                "should": [
                    {"match": {"content": query_text}}
                ],
                "minimum_should_match": 0,
                "boost": 1.0
            }
        }
    }

    return elastic_server.client.search(index=index, body=bool_query)


def elastic_batchretrieve(topics_df, index_name, topk):
    """Run ``es_search`` for every row of *topics_df* and collect the hits.

    Each row must provide the attributes ``qid``, ``query_text`` and
    ``docno``. Retrieval is best-effort: a row whose search or result
    handling fails is reported on stdout and skipped.

    Args:
        topics_df: DataFrame with one (query, candidate document) pair per row.
        index_name: Name of the index to search.
        topk: Maximum number of hits requested per row.

    Returns:
        A list of rows built by ``res_wrapper``:
        ``[query_id, 0, doc_id, rank, score, 'scidocs']`` with 1-based ranks.
    """
    hits = []

    # ``total`` lets the progress bar show completion, since itertuples()
    # has no length of its own.
    for row in tqdm(topics_df.itertuples(), total=len(topics_df)):
        try:
            res = es_search(row.qid, row.query_text, row.docno, index_name, topk)
            for rank, hit in enumerate(res['hits']['hits'], 1):
                hits.append(res_wrapper(rank=rank, doc_id=hit['_id'], q=0, score=hit['_score'],
                                        run_id='scidocs', query_id=row.qid))
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C still stops the
            # loop, and show the error instead of hiding the cause.
            print(row.qid, row.query_text)
            print("\nERROR:", err)

    return hits


# Placeholder: name of the index to query.
index_name = 'name/of/the/index'
# One search per qrel row, requesting at most one hit (topk=1) each.
hits = elastic_batchretrieve(feb_qrel_w_tkned_qtext, index_name, 1)
# Column order follows the list layout produced by res_wrapper:
# [query_id, q, doc_id, rank, score, run_id].
res_df = pd.DataFrame(hits, columns=['qid', 'query', 'docno', 'rank', 'score', 'run_id'])

Save the run file:

In [10]:
# Placeholder path. Writes the run as tab-separated text with a header row
# and without the DataFrame index.
res_df.to_csv('path/to/run/file/', sep='\t', index=False)

Evaluation:

In [18]:
# Header-less, space-separated copy of the run; the evaluation cell reads it
# back as run_file ('./temp.run').
res_df.to_csv('./temp.run', header=False, index=False, sep=' ')

In [None]:
from eval_utils import qrel_metrics
import numpy as np

# Qrel file of the task selected via task_name.
qrel_file = '../data/'+task_name+'_data/test.qrel'

# Run file written from res_df by the preceding cell.
run_file = './temp.run'

print(task_name)
# qrel_metrics comes from the project-local eval_utils module. Its result is
# iterated and indexed by key below, so it is presumably a mapping from
# metric name to score -- confirm against eval_utils.
res_dict = qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map'))
# Print every returned entry rounded to two decimals.
for item in res_dict:
    print(item, np.around(res_dict[item], 2))