In [28]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200 (plain HTTP).
es = Elasticsearch("http://localhost:9200")
# Show the server's info response body as a sanity check that the connection works.
es.info().body

{'name': '1d81598a1420',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'bll6FNb4TT2czEtGbIlUVg',
 'version': {'number': '8.6.1',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '180c9830da956993e59e2cd70eb32b5e383ea42c',
  'build_date': '2023-01-24T21:35:11.506992272Z',
  'build_snapshot': False,
  'lucene_version': '9.4.2',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [29]:
import pandas as pd

# Document collection; later cells read its 'id_right' and 'text_right' columns.
df = pd.read_csv('documents.csv')

Without stemming

In [30]:
# Baseline index: the 'text' field is tokenized on whitespace only.
index_name = 'without_stem'

# The single 'text' field is analyzed with the custom 'white' analyzer.
mappings = {'properties': {'text': {'type': 'text', 'analyzer': 'white'}}}

# 'white' is a custom analyzer made of nothing but the whitespace tokenizer.
settings = {'analysis': {'analyzer': {'white': {'tokenizer': 'whitespace'}}}}

# Drop any previous copy of the index so the cell can be re-run, then create it.
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'without_stem'})

In [31]:
def check_analyzer(analyzer, text):
    """Run ``text`` through an analyzer of the current index and return its tokens.

    Args:
        analyzer: dict of Analyze API parameters, e.g. ``{'analyzer': 'white'}``.
            The dict is not modified.
        text: the string to analyze.

    Returns:
        list of token strings, in the order returned by Elasticsearch.
    """
    # Build a fresh parameter dict instead of writing 'text' into the caller's
    # dict, so the same analyzer spec can be reused for several texts.
    params = {**analyzer, 'text': text}

    # Parameters are passed as keyword arguments.
    # NOTE(review): the previous `body=` form emitted a warning when this
    # notebook was run -- it appears to be deprecated in the installed client.
    response = es.indices.analyze(index=index_name, **params)
    return [token_info['token'] for token_info in response['tokens']]

# Sample input containing runs of several spaces between words.
text = 'did it   fina  lly'
# Analyze API parameters: use the custom 'white' analyzer from the index settings.
analyzer = {
    'analyzer': 'white'
}

# Display the tokens the analyzer produces for the sample text.
check_analyzer(analyzer, text)


  tokens = es.indices.analyze(index=index_name, body=body)['tokens']


['did', 'it', 'fina', 'lly']

In [32]:
from tqdm import tqdm
from time import time
from elasticsearch.helpers import bulk, parallel_bulk
def create_es_action(index, doc_id, document):
    """Wrap one document as an action dict for the Elasticsearch bulk helpers.

    Args:
        index: name of the target index (stored under '_index').
        doc_id: identifier of the document (stored under '_id').
        document: the document body (stored under '_source').

    Returns:
        dict with the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id)
    action['_source'] = document
    return action


def es_action_generator(df):
    """Lazily yield one bulk action per row of ``df``, wrapped in a tqdm progress bar.

    Each action targets the module-level ``index_name``, uses the row's
    'id_right' value as document id and stores the row's 'text_right' value
    under the 'text' field.
    """
    row_count = len(df)
    for _, record in tqdm(df.iterrows(), total=row_count):
        yield create_es_action(index_name, record['id_right'], dict(text=record['text_right']))


# Stream every document of `df` into the current index with 4 worker threads
# and 1000-action chunks, printing any result that is not flagged as ok.
start = time()
bulk_outcomes = parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000)
for ok, result in bulk_outcomes:
    if ok:
        continue
    print(result)
stop = time()

# Wall-clock duration of the whole bulk load, in seconds.
print('Indexing time:', stop-start)


100%|██████████| 369721/369721 [01:13<00:00, 5049.03it/s] 


Indexing time: 76.13157105445862


In [33]:
# Refresh the current index, then display its document count from the cat API.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')

ListApiResponse([{'epoch': '1680247458', 'timestamp': '07:24:18', 'count': '369721'}])

In [34]:
def pretty_print_result(search_result, fields=()):
    """Print a readable summary of an Elasticsearch search response.

    Prints the total hit count, then one line per hit with its id and score,
    followed by one line for each requested ``_source`` field.

    Args:
        search_result: search response with a top-level 'hits' entry.
        fields: names of ``_source`` fields to print for every hit. The default
            is an empty tuple, so no mutable object is shared between calls.

    Returns:
        None.

    Raises:
        KeyError: if a requested field is missing from a hit's ``_source``.
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    """Run ``query`` against the current index (at most 20 hits) and print the result.

    Any extra positional arguments are the ``_source`` field names to print
    for each hit. Returns whatever ``pretty_print_result`` returns.
    """
    response = es.search(index=index_name, query=query, size=20)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    """Fetch the document with id ``doc_id`` from the current index and return its ``_source``."""
    document = es.get(index=index_name, id=doc_id)
    return document['_source']

# customized
def get_results(search_result):
    """Return ``(_id, _score)`` of the first hit in a search response.

    Returns ``None`` when the response contains no hits.

    NOTE(review): only the top hit is ever returned -- confirm that this is
    intended rather than collecting every hit.
    """
    first_hit = next(iter(search_result['hits']['hits']), None)
    if first_hit is None:
        return None
    return first_hit['_id'], first_hit['_score']


def search_results(query_id, query):
    """Search the current index and return up to 20 hits as run tuples.

    Args:
        query_id: identifier of the query; converted to ``str`` in the output.
        query: Elasticsearch query body passed to ``es.search``.

    Returns:
        list of ``(query_id, doc_id, score, rank)`` tuples in result order,
        where both ids are strings and ``rank`` is the 0-based position.
    """
    hits = es.search(index=index_name, query=query, size=20)['hits']['hits']
    qid = str(query_id)
    return [(qid, str(hit['_id']), hit['_score'], rank) for rank, hit in enumerate(hits)]


In [35]:
# Evaluation queries; later cells read the 'id_left' and 'text_left' columns.
test_queries = pd.read_csv('queries.csv')
# Preview the first rows.
test_queries.head()


Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley


In [10]:
import ir_measures
def make_query(text):
    """Build a bool query whose single ``must`` clause matches ``text`` on the 'text' field."""
    match_clause = {'match': {'text': text}}
    return {'bool': {'must': match_clause}}

def generate_run(test_queries, save_to_file=False, filename=None):
    """Run every test query against the current index and collect the scored hits.

    Args:
        test_queries: DataFrame with the columns 'id_left' (query id) and
            'text_left' (query text).
        save_to_file: when true, also write the run to ``filename``, one line
            per hit in the form ``<qid> Q0 <docid> <rank> <score> BM25``.
        filename: output path; only used when ``save_to_file`` is true.

    Returns:
        list of ``ir_measures.ScoredDoc`` in query order, then result order.
    """
    # Open the output only when requested. It is closed in `finally`, so a
    # search that fails part-way through no longer leaks the open handle.
    out_file = open(filename, 'w') if save_to_file else None

    run = []
    try:
        for _, row in test_queries.iterrows():
            for res in search_results(row['id_left'], make_query(row['text_left'])):
                query_id, doc_id, score, rank = res
                # Keep ranks below 21. With the 0-based ranks and 20-hit limit
                # of search_results as defined in this notebook, this keeps
                # every hit.
                if rank < 21:
                    run.append(ir_measures.ScoredDoc(query_id, doc_id, score))
                    if out_file is not None:
                        out_file.write(f'{query_id} Q0 {doc_id} {rank} {score} BM25\n')
    finally:
        if out_file is not None:
            out_file.close()

    return run

def print_scores(run, total=-1):
    """Print the first ``total`` entries of ``run``, one per line.

    With ``total=-1`` (the default), or a ``total`` larger than ``len(run)``,
    every entry is printed.
    """
    show_everything = total == -1 or total > len(run)
    limit = len(run) if show_everything else total
    for position in range(limit):
        print(run[position])

# Query the index currently named by `index_name` and save the run to 'search_without_stem.res'.
run = generate_run(test_queries, True, 'search_without_stem.res')
# Show the first 10 scored documents.
print_scores(run, 10)


ScoredDoc(query_id='158491', doc_id='1880296', score=17.35782)
ScoredDoc(query_id='158491', doc_id='2261272', score=17.199305)
ScoredDoc(query_id='158491', doc_id='607552', score=17.118353)
ScoredDoc(query_id='158491', doc_id='1957435', score=16.929768)
ScoredDoc(query_id='158491', doc_id='625257', score=16.877832)
ScoredDoc(query_id='158491', doc_id='635537', score=16.80612)
ScoredDoc(query_id='158491', doc_id='1774491', score=16.570059)
ScoredDoc(query_id='158491', doc_id='663828', score=16.55547)
ScoredDoc(query_id='158491', doc_id='158491', score=16.063732)
ScoredDoc(query_id='158491', doc_id='1956922', score=15.829921)


Stemming

In [None]:
# Stemmed index: whitespace tokenization followed by the 'porter_stem' filter.
# NOTE(review): the Elasticsearch reference says porter_stem expects lowercase
# tokens -- confirm whether a 'lowercase' filter should precede it here.
index_name = 'with_stem'

# The single 'text' field is analyzed with the custom 'porter_stemmer' analyzer.
mappings = {'properties': {'text': {'type': 'text', 'analyzer': 'porter_stemmer'}}}

settings = {
    'analysis': {
        # Analyzer: whitespace tokenizer, then the token filter defined below.
        'analyzer': {
            'porter_stemmer': {'tokenizer': 'whitespace', 'filter': ['porter_stem']},
        },
        # Token filter registered under the name 'porter_stem'.
        'filter': {
            'porter_stem': {'type': 'porter_stem', 'language': 'English'},
        },
    }
}

# Drop any previous copy of the index so the cell can be re-run, then create it.
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'with_stem'})

In [14]:
# Sample words used to inspect what the stemming analyzer produces.
text = 'using playing ate agoing'
# Analyze API parameters: use the custom 'porter_stemmer' analyzer from the index settings.
analyzer = {
    'analyzer': 'porter_stemmer'
}

# Display the tokens the analyzer produces for the sample text.
check_analyzer(analyzer, text)

  tokens = es.indices.analyze(index=index_name, body=body)['tokens']


['us', 'plai', 'at', 'ago']

In [15]:
# Bulk-index every document of `df` into the current (stemmed) index.
# The recorded run of this cell ended in ConnectionTimeout after the progress
# bar completed, so the bulk requests get a longer per-request timeout than
# the client default.
BULK_REQUEST_TIMEOUT_S = 120  # seconds allowed for each bulk request

start = time()
# `es.options(...)` returns a client carrying the per-request timeout; the
# shared `es` client itself is left unchanged.
bulk_client = es.options(request_timeout=BULK_REQUEST_TIMEOUT_S)
for ok, result in parallel_bulk(bulk_client, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

# Wall-clock duration of the whole bulk load, in seconds.
print('Indexing time:', stop-start)

100%|██████████| 369721/369721 [02:05<00:00, 2952.31it/s]


ConnectionTimeout: Connection timed out

In [None]:
# Refresh the current index, then display its document count from the cat API.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')

ListApiResponse([{'epoch': '1679477778', 'timestamp': '09:36:18', 'count': '369721'}])

In [None]:
# Query the index currently named by `index_name` and save the run to 'search_with_stem.res'.
run = generate_run(test_queries=test_queries, save_to_file=True, filename='search_with_stem.res')
# Show the first 20 scored documents.
print_scores(run, 20)

ScoredDoc(query_id='158491', doc_id='1880296', score=17.132378)
ScoredDoc(query_id='158491', doc_id='2261272', score=16.981)
ScoredDoc(query_id='158491', doc_id='607552', score=16.919212)
ScoredDoc(query_id='158491', doc_id='625257', score=16.701923)
ScoredDoc(query_id='158491', doc_id='1957435', score=16.695692)
ScoredDoc(query_id='158491', doc_id='635537', score=16.567877)
ScoredDoc(query_id='158491', doc_id='663828', score=16.392046)
ScoredDoc(query_id='158491', doc_id='1774491', score=16.352182)
ScoredDoc(query_id='158491', doc_id='158491', score=16.30711)
ScoredDoc(query_id='158491', doc_id='1956922', score=15.618095)
ScoredDoc(query_id='158491', doc_id='1180246', score=15.388653)
ScoredDoc(query_id='158491', doc_id='1170039', score=15.346355)
ScoredDoc(query_id='158491', doc_id='360918', score=15.305798)
ScoredDoc(query_id='158491', doc_id='589549', score=15.305798)
ScoredDoc(query_id='158491', doc_id='945068', score=15.255717)
ScoredDoc(query_id='158491', doc_id='685181', score=

In [None]:
from ir_measures import *
# Evaluate a saved run file against the relevance judgments in 'qrels'.
# NOTE(review): no cell visible in this notebook writes 'BM25.res' -- confirm where it comes from.
run = ir_measures.read_trec_run('BM25.res')
qrels = ir_measures.read_trec_qrels('qrels')

# Aggregate P@10, P@20 and AP over all queries.
ir_measures.calc_aggregate([P@10, P@20, AP], qrels, run)

{P@20: 0.09499999999999999, P@10: 0.1319999999999999, AP: 0.11196168401599797}

In [None]:
# Evaluate the run saved as 'search_without_stem.res' against the judgments in 'qrels'.
run = ir_measures.read_trec_run('search_without_stem.res')

qrels = ir_measures.read_trec_qrels('qrels')

# Aggregate P@10, P@20 and AP over all queries.
ir_measures.calc_aggregate([P@10, P@20, AP], qrels, run)

{P@20: 0.14800000000000005, P@10: 0.20699999999999988, AP: 0.1477009039803989}

In [36]:
from ir_measures import *
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model whose `encode` output is used below to re-score retrieved documents.
model = SentenceTransformer('msmarco-distilbert-dot-v5')  # replace with your model

In [37]:
# First-stage retrieval: for every test query, keep up to 20 hits from the
# current index as {query_id (str): {doc_id: score}}.
run = {}

for i, row in test_queries.iterrows():
    query_key = str(row['id_left'])
    first_stage_hits = es.search(index=index_name, query=make_query(row['text_left']), size=20)['hits']['hits']
    run[query_key] = {hit['_id']: hit['_score'] for hit in first_stage_hits}
                 

In [None]:
# q_ids = list(run.keys())

# run_rerank = {}

# for q_id in q_ids:
#     query_text = test_queries[test_queries['id_left']==int(q_id)]['text_left'].item()
#     query_embedding = model.encode(query_text)
#     run_rerank[q_id] = {}

#     docs_texts = []
#     for doc_id in run[str(q_id)]:
#         docs_texts.append(df[df['id_right']==int(doc_id)]['text_right'].item())
#     docs_embedding = model.encode(docs_texts)
        
#     if len(docs_embedding) == 0:
#         continue
#     cos_sim = util.dot_score(query_embedding, docs_embedding)[0]
#     for i, doc_id in enumerate(run[str(q_id)]):
#         run_rerank[q_id][doc_id] = cos_sim[i].item()


In [40]:
# Re-rank each query's first-stage candidates from `run`: the new score of a
# document is the dot product between the query embedding and its embedding.
newRun = {}
# Index the documents by 'id_right' so a candidate's text can be looked up by id.
otherDf = df.set_index('id_right')
for index, row in test_queries.iterrows():
    query_key = str(row['id_left'])
    candidate_ids = list(run[query_key])
    newRun[query_key] = {}

    # No candidates for this query: keep its (empty) entry and skip the
    # encoder calls, which previously ran even on an empty document list.
    if not candidate_ids:
        continue

    candidate_texts = [otherDf.loc[int(doc_id), 'text_right'] for doc_id in candidate_ids]
    query_embedding = model.encode(row['text_left'])
    doc_embeddings = model.encode(candidate_texts)

    # These are dot-product scores (not cosine similarities); [0] selects the
    # scores belonging to this single query.
    dot_scores = util.dot_score(query_embedding, doc_embeddings)[0]
    for position, doc_id in enumerate(candidate_ids):
        newRun[query_key][doc_id] = dot_scores[position].item()

KeyboardInterrupt: 

In [25]:
# Evaluate the re-ranked run against the relevance judgments in 'qrels'.
qrels = ir_measures.read_trec_qrels('qrels')
# Debug output: dumps both run dicts in full.
# NOTE(review): the recorded output shows integer query ids and a TypeError from
# calc_aggregate -- presumably from an earlier version of the re-ranking cell; re-run to confirm.
print(newRun)
print(run)
# Aggregate P@10, P@20 and MAP@20 over all queries for the re-ranked scores.
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, newRun)

{158491: {'1880296': 75.81673431396484, '2261272': 70.7028579711914, '607552': 75.90214538574219, '1957435': 77.61271667480469, '625257': 72.64411926269531, '635537': 80.26268005371094, '1774491': 74.48651885986328, '663828': 72.2364273071289, '158491': 79.92601776123047, '1956922': 70.5085678100586, '1180246': 68.72467803955078, '1170039': 73.1563949584961, '589549': 71.44168853759766, '360918': 77.82566833496094, '945068': 69.86650085449219, '685181': 71.4924087524414, '2411344': 70.28116607666016, '1093529': 75.78836059570312, '1158969': 72.81409454345703, '742912': 77.2335205078125}, 5728: {'5728': 80.53766632080078, '1890681': 71.7760009765625, '283099': 75.02107238769531, '98452': 70.2144546508789, '592647': 73.87507629394531, '970793': 70.50012969970703, '375146': 71.57810974121094, '1597007': 70.98116302490234, '911906': 69.88957214355469, '501880': 66.70128631591797, '645822': 72.81500244140625, '255972': 71.07779693603516, '849904': 67.53150939941406, '79940': 68.794616699218

TypeError: Unable to extract query/object scores.

In [21]:
# Loading the saved stemmed run is disabled, so `run` is whatever an earlier cell left in the kernel.
# run = ir_measures.read_trec_run('search_with_stem.res')
qrels = ir_measures.read_trec_qrels('qrels')

# Aggregate P@10, P@20 and AP over all queries for the current `run`.
ir_measures.calc_aggregate([P@10, P@20, AP], qrels, run)


{AP: 0.14701975024421418, P@10: 0.20699999999999988, P@20: 0.14750000000000008}