In [21]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import pandas as pd
from tqdm import tqdm
from time import time
import random
import numpy as np
import spacy
import ir_measures
from ir_measures import *
nlp = spacy.load("en_core_web_sm")

# Building search engine

In [349]:
df = pd.read_csv('wikIR/documents.csv')

## Connection

In [3]:
es = Elasticsearch('http://localhost:9200')

## Indexing documents

In [366]:
# With stemming

mappings = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'porter_stemmer'
        }
    }
}

settings = {
    'analysis' : {
        'analyzer' : {
            'porter_stemmer' : {
                'tokenizer' : 'whitespace',
                'filter' : 'porter_stem'
            }
        },
        'filter' : {
            'porter_stem' : {
                'type' : 'porter_stem',
                'language' : 'English'
            }
        }
    }
}

index_name='wiki1'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_document': document
    }


def es_action_generator():
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name, row['id_right'], doc)


start = time()
for ok, result in parallel_bulk(es, es_action_generator(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        

100%|█████████████████████████████████████████████████████| 369721/369721 [01:42<00:00, 3597.44it/s]


Indexing time: 103.22652745246887


In [367]:
#Without stemming

mappings_w = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

settings_w = {
    'analysis' : {
        'analyzer' : 'standard' 
        
    }
}

index_name2 = 'wiki2'
if es.indices.exists(index=index_name2):
    es.indices.delete(index=index_name2)
es.indices.create(index=index_name2,settings=settings_w, mappings=mappings_w)


def es_action_generator2():
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name2, row['id_right'], doc)
        
        
start = time()
for ok, result in parallel_bulk(es, es_action_generator2(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)

100%|█████████████████████████████████████████████████████| 369721/369721 [01:47<00:00, 3432.93it/s]


Indexing time: 108.60764980316162


### Optional task : lemmatized collection

In [260]:

dfl = pd.DataFrame(columns = df.columns)
dfl['id_right'] = df['id_right']
for doc_id in tqdm(range(len(df['text_right']))):
    tokens = nlp(df['text_right'][doc_id])
    lemmatized = " ".join(token.lemma_ for token in tokens)
    dfl['text_right'][doc_id] = lemmatized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfl['text_right'][doc_id] = lemmatized
100%|█████████████████████████████████████████████████████| 369721/369721 [4:11:11<00:00, 24.53it/s]


In [368]:
# Lemmatized collection

index_name3 = 'wiki3'
if es.indices.exists(index=index_name3):
    es.indices.delete(index=index_name3)
es.indices.create(index=index_name3,settings=settings_w, mappings=mappings_w)


def es_action_generator3():
    for doc_id, row in tqdm(dfl.iterrows(), total=dfl.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name3, row['id_right'], doc)
        
        
start = time()
for ok, result in parallel_bulk(es, es_action_generator3(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)

100%|█████████████████████████████████████████████████████| 369721/369721 [02:08<00:00, 2877.74it/s]


Indexing time: 129.7395622730255


## Search

In [95]:
def pretty_print_result(search_result):
    res = search_result['hits']
    results = {}
    for hit in res['hits']:
        results[hit["_id"]] = hit["_score"]
    return results
        
    
def search(query, i):
    s = pretty_print_result(es.search(index=i, query=query, size=20))
    return s


def query(l): #with boosting
    pop = {
        'bool': {
            'must': {
                    'match': {
                        '_document': l
                    }
            },
                'should': {
                    'match_phrase': {
                        '_document': {
                            "query": l,
                            "boost": 2 #boost phrase matches
                        }                           
                    }                    
                },
            "minimum_should_match": 1
            }
        }
    
    return pop

def query_no_boost(l):
    pop = {
        'bool': {
            'must': {
                    'match': {
                        '_document': l
                    }
            },
                'should': {
                    'match_phrase': {
                        '_document': {
                            "query": l
                        }                           
                    }                    
                }
            }
        }
    
    return pop


def querysearch(queries, indexname,opt = 'no_boost'):
    time_list = []
    results ={}
    for q in range(len(queries['text_left'])):
        row = queries['text_left'][q]
        if opt == 'no_boost':
            qu = query_no_boost(row)
        else: 
            qu = query(row)
        start = time()
        res = search(qu, indexname)
        stop = time()
        time_list.append(stop - start)
        results[str(dfq['id_left'][q])] = res
    return results, time_list

### Lemmatize queries

In [10]:
dfq = pd.read_csv('wikIR/test/queries.csv')
dfql = pd.DataFrame(columns = dfq.columns)
dfql['id_left'] = dfq['id_left']
for doc_id in tqdm(range(len(dfq['text_left']))):
    tokens = nlp(dfq['text_left'][doc_id])
    lemmatized = " ".join(token.lemma_ for token in tokens)
    dfql['text_left'][doc_id] = lemmatized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfql['text_left'][doc_id] = lemmatized
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 80.62it/s]


## Results

In [112]:
print('Without Stemming')
runs_without, time_without = querysearch(dfql, 'wiki2')
print("Average query execution time:", np.mean(time_without))

Without Stemming
Average query execution time: 0.007884113788604737


In [113]:
print('With Stemming')
runs_with, time_with = querysearch(dfql, 'wiki1')
print("Average query execution time:", np.mean(time_with))

With Stemming
Average query execution time: 0.008870112895965575


In [114]:
print('With Lemmatization')
runs_l, time_l = querysearch(dfql, 'wiki3')
print("Average query execution time:", np.mean(time_l))

With Lemmatization
Average query execution time: 0.008040292263031006


In [102]:
print('Without Stemming with Boost')
runs_without_boost, time_without_boost = querysearch(dfql, 'wiki2', 'boost')
print("Average query execution time:", np.mean(time_without))

Without Stemming with Boost
Average query execution time: 0.008446846008300781


In [103]:
print('With Stemming with Boost')
runs_with_boost, time_with_boost = querysearch(dfql, 'wiki1', 'boost')
print("Average query execution time:", np.mean(time_with))

With Stemming with Boost
Average query execution time: 0.00853665590286255


In [104]:
print('With Lemmatization with Boost')
runs_l_boost, time_l_boost = querysearch(dfql, 'wiki3', 'boost')
print("Average query execution time:", np.mean(time_l))

With Lemmatization with Boost
Average query execution time: 0.008120014667510986


# Evaluation

I have already formatted my runs in TREC format. So now we just need qrels and ready to calculate measures.

In [63]:
dfbm = pd.read_table('wikIR/test/qrels', header = None, names = ['id_left', 'n_u', 'id_right', 'label'])

In [66]:
qrels = {}
for q_id in dfbm['id_left'].unique():
    did = {}
    for d_id in dfbm['id_right'][np.where(dfbm['id_left'] == q_id)[0]]:
        did[str(d_id)] = int(dfbm['label'][np.where((dfbm['id_left'] == q_id)&(dfbm['id_right']==d_id))[0][0]]) 
    qrels[str(q_id)] = did
#maybe I was just supposed to use their built-in function, but I have written that code, so I didn't change it 

In [68]:
dfbmr = pd.read_csv('wikIR/test/BM25.res', header=None, names=['q'])
dfbmr[['q', 'n_u', 'd', 'r', 's', 'rn']] = dfbmr.q.str.split(expand=True)

In [69]:
runs = {}
for q_id in dfbmr['q'].unique():
    rid = {}
    for d_id in dfbmr['d'][np.where(dfbmr['q'] == q_id)[0]]:
        rid[str(d_id)] = float(dfbmr['s'][np.where((dfbmr['q'] == q_id)&(dfbmr['d']==d_id))[0][0]])
    runs[str(q_id)] = rid   

In [108]:
results = pd.DataFrame(dfq['id_left'])
collection = ['Without stemming', 'With stemming', 'Lemmatized', 'Without stemming with Boost', 'With stemming with Boost', 'Lemmatized with Boost', 'BM25']
run = [runs_without, runs_with, runs_l,runs_without_boost, runs_with_boost, runs_l_boost,  runs]
measures = [(P(rel=1)@10),(P(rel=1)@20),(MAP(rel=1))]

In [115]:
for measure in measures:
    for i in range(len(collection)):
        metric = []
        for m in measure.iter_calc(qrels, run[i]):
            metric.append(m[-1])
        colname = ' '.join([collection[i], str(measure)])
        results[colname] = pd.Series(metric) 

In [110]:
results

Unnamed: 0,id_left,Without stemming P@10,With stemming P@10,Lemmatized P@10,Without stemming with Boost P@10,With stemming with Boost P@10,Lemmatized with Boost P@10,BM25 P@10,Without stemming P@20,With stemming P@20,...,With stemming with Boost P@20,Lemmatized with Boost P@20,BM25 P@20,Without stemming AP,With stemming AP,Lemmatized AP,Without stemming with Boost AP,With stemming with Boost AP,Lemmatized with Boost AP,BM25 AP
0,158491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,...,0.05,0.05,0.05,0.006173,0.006173,0.006173,0.006944,0.006944,0.006944,0.006173
1,5728,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.10,0.15,...,0.15,0.10,0.10,0.076923,0.091209,0.076190,0.076923,0.091209,0.076190,0.094802
2,13554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,...,0.00,0.00,0.00,0.007519,0.007519,0.007519,0.000000,0.000000,0.000000,0.000000
3,32674,0.5,0.5,0.5,0.5,0.5,0.5,0.7,0.25,0.25,...,0.25,0.25,0.35,0.625850,0.625850,0.625850,0.625850,0.625850,0.625850,0.729592
4,406391,0.0,0.1,0.1,0.0,0.1,0.1,0.0,0.00,0.15,...,0.05,0.05,0.00,0.000000,0.108516,0.112179,0.000000,0.062500,0.062500,0.053588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,679227,0.1,0.2,0.1,0.0,0.2,0.1,0.1,0.15,0.15,...,0.15,0.15,0.15,0.020274,0.035034,0.027054,0.000000,0.039002,0.025467,0.067767
96,2136797,0.2,0.2,0.2,0.2,0.2,0.2,0.1,0.10,0.10,...,0.10,0.10,0.05,0.194444,0.194444,0.194444,0.194444,0.194444,0.194444,0.031250
97,5622,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.05,0.05,...,0.05,0.05,0.05,0.006579,0.006579,0.006579,0.006579,0.006579,0.006579,0.011759
98,1313598,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.15,0.15,...,0.15,0.15,0.10,0.163077,0.173077,0.173077,0.163077,0.163077,0.163077,0.127778


In [116]:
for i in range(len(run)):
    print("Results for collection", collection[i], ir_measures.calc_aggregate(measures, qrels, run[i]))

Results for collection Without stemming {P@20: 0.12400000000000005, P@10: 0.17699999999999996, AP: 0.11553823714486759}
Results for collection With stemming {P@20: 0.13450000000000004, P@10: 0.19499999999999992, AP: 0.12447524836381069}
Results for collection Lemmatized {P@20: 0.13100000000000003, P@10: 0.18799999999999997, AP: 0.1228401963484228}
Results for collection Without stemming with Boost {P@20: 0.10050000000000005, P@10: 0.15199999999999994, AP: 0.09928764651433107}
Results for collection With stemming with Boost {P@20: 0.11550000000000002, P@10: 0.17399999999999996, AP: 0.1090320259686725}
Results for collection Lemmatized with Boost {P@20: 0.113, P@10: 0.16699999999999993, AP: 0.10836973167982002}
Results for collection BM25 {P@20: 0.09499999999999999, P@10: 0.1319999999999999, AP: 0.11196168401599797}
