In [348]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import pandas as pd
from tqdm import tqdm
from time import time
import random
import numpy as np
import spacy
from ir_measures import *

# Building search engine

In [349]:
df = pd.read_csv('wikIR/documents.csv')

## Connection

In [362]:
es = Elasticsearch('http://localhost:9200')

## Indexing documents

In [366]:
# With stemming

mappings = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'porter_stemmer'
        }
    }
}

settings = {
    'analysis' : {
        'analyzer' : {
            'porter_stemmer' : {
                'tokenizer' : 'whitespace',
                'filter' : 'porter_stem'
            }
        },
        'filter' : {
            'porter_stem' : {
                'type' : 'porter_stem',
                'language' : 'English'
            }
        }
    }
}

index_name='wiki1'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)

def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_document': document
    }


def es_action_generator():
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name, row['id_right'], doc)


start = time()
for ok, result in parallel_bulk(es, es_action_generator(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        

100%|█████████████████████████████████████████████████████| 369721/369721 [01:42<00:00, 3597.44it/s]


Indexing time: 103.22652745246887


In [367]:
#Without stemming

mappings_w = {
    'properties': {
        '_document': {
            'type': 'text',
            'analyzer': 'standard'
        }
    }
}

settings_w = {
    'analysis' : {
        'analyzer' : 'standard' 
        
    }
}

index_name2 = 'wiki2'
if es.indices.exists(index=index_name2):
    es.indices.delete(index=index_name2)
es.indices.create(index=index_name2,settings=settings_w, mappings=mappings_w)


def es_action_generator2():
    for doc_id, row in tqdm(df.iterrows(), total=df.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name2, row['id_right'], doc)
        
        
start = time()
for ok, result in parallel_bulk(es, es_action_generator2(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)

100%|█████████████████████████████████████████████████████| 369721/369721 [01:47<00:00, 3432.93it/s]


Indexing time: 108.60764980316162


### Optional task : lemmatized collection

In [260]:
nlp = spacy.load("en_core_web_sm")
dfl = pd.DataFrame(columns = df.columns)
dfl['id_right'] = df['id_right']
for doc_id in tqdm(range(len(df['text_right']))):
    tokens = nlp(df['text_right'][doc_id])
    lemmatized = " ".join(token.lemma_ for token in tokens)
    dfl['text_right'][doc_id] = lemmatized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfl['text_right'][doc_id] = lemmatized
100%|█████████████████████████████████████████████████████| 369721/369721 [4:11:11<00:00, 24.53it/s]


In [368]:
# Lemmatized collection

index_name3 = 'wiki3'
if es.indices.exists(index=index_name3):
    es.indices.delete(index=index_name3)
es.indices.create(index=index_name3,settings=settings_w, mappings=mappings_w)


def es_action_generator3():
    for doc_id, row in tqdm(dfl.iterrows(), total=dfl.shape[0]):
        doc =  row['text_right']
        yield create_es_action(index_name3, row['id_right'], doc)
        
        
start = time()
for ok, result in parallel_bulk(es, es_action_generator3(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)

100%|█████████████████████████████████████████████████████| 369721/369721 [02:08<00:00, 2877.74it/s]


Indexing time: 129.7395622730255


## Search

In [388]:
def pretty_print_result(search_result):
    res = search_result['hits']
    results = {}
    for hit in res['hits']:
        results[hit["_id"]] = hit["_score"]
    return results
        
    
def search(query, i):
    s = pretty_print_result(es.search(index=i, query=query, size=20))
    return s


def query(l):
    pop = {
        'bool': {
            'must': {
                    'match': {
                        '_document': l
                    }
            },
                'should': {
                    'match_phrase': {
                        '_document': {
                            "query": l,
                            "boost": 2 #boost phrase matches
                        }                           
                    }                    
                },
            "minimum_should_match": 1
            }
        }
    
    return pop


def querysearch(queries, indexname):
    time_list = []
    results ={}
    for q in range(len(queries['text_left'])):
        row = queries['text_left'][q]
        qu = query(row)
        start = time()
        res = search(qu, indexname)
        stop = time()
        time_list.append(stop - start)
        results[str(dfq['id_left'][q])] = res
    return results, time_list

### Lemmatize queries

In [268]:
dfq = pd.read_csv('wikIR/test/queries.csv')
dfql = pd.DataFrame(columns = dfq.columns)
dfql['id_left'] = dfq['id_left']
for doc_id in tqdm(range(len(dfq['text_left']))):
    tokens = nlp(dfq['text_left'][doc_id])
    lemmatized = " ".join(token.lemma_ for token in tokens)
    dfql['text_left'][doc_id] = lemmatized

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfql['text_left'][doc_id] = lemmatized
100%|█████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 81.38it/s]


## Results

In [392]:
print('Without Stemming')
runs_without, time_without = querysearch(dfql, 'wiki2')
print("Average query execution time:", np.mean(time_without))

Without Stemming
Average query execution time: 0.009379141330718994


In [393]:
print('With Stemming')
runs_with, time_with = querysearch(dfql, 'wiki1')
print("Average query execution time:", np.mean(time_with))

With Stemming
Average query execution time: 0.00793302059173584


In [394]:
print('With Lemmatization')
runs_l, time_l = querysearch(dfql, 'wiki3')
print("Average query execution time:", np.mean(time_l))

With Lemmatization
Average query execution time: 0.007155168056488037


# Evaluation

I have already formatted my runs in TREC format. So now we just need qrels and ready to calculate measures.

In [186]:
dfbm = pd.read_csv('wikIR/test/BM25.qrels.csv')

In [216]:
qrels = {}
for q_id in dfbm['id_left'].unique():
    did = {}
    for d_id in dfbm['id_right'][np.where(dfbm['id_left'] == q_id)[0]]:
        did[str(d_id)] = int(dfbm['label'][np.where((dfbm['id_left'] == q_id)&(dfbm['id_right']==d_id))[0][0]])
    qrels[str(q_id)] = did
#maybe I was just supposed to use their built-in function, but I have written that code, so I didn't change it 

In [344]:
dfbmr = pd.read_csv('wikIR/test/BM25.res', header=None, names=['q'])
dfbmr[['q', 'n_u', 'd', 'r', 's', 'rn']] = dfbmr.q.str.split(expand=True)

In [345]:
runs = {}
for q_id in dfbmr['q'].unique():
    rid = {}
    for d_id in dfbmr['d'][np.where(dfbmr['q'] == q_id)[0]]:
        rid[str(d_id)] = float(dfbmr['s'][np.where((dfbmr['q'] == q_id)&(dfbmr['d']==d_id))[0][0]])
    runs[str(q_id)] = rid   

In [370]:
results = pd.DataFrame(dfq['id_left'])
collection = ['Without stemming', 'With stemming', 'Lemmatized', 'BM25']
run = [runs_without, runs_with, runs_l, runs]
measures = [(P@10),(P@20),(MAP)]

In [395]:
for measure in measures:
    for i in range(len(collection)):
        metric = []
        for m in measure.iter_calc(qrels, run[i]):
            metric.append(m[-1])
        colname = ' '.join([collection[i], str(measure)])
        results[colname] = pd.Series(metric) 
        print("Average measure for all queries for", str(measure), "for collection", collection[i], round(np.mean(metric),3))

Average measure for all queries for P@10 for collection Without stemming 0.522
Average measure for all queries for P@10 for collection With stemming 0.549
Average measure for all queries for P@10 for collection Lemmatized 0.546
Average measure for all queries for P@10 for collection BM25 1.0
Average measure for all queries for P@20 for collection Without stemming 0.452
Average measure for all queries for P@20 for collection With stemming 0.474
Average measure for all queries for P@20 for collection Lemmatized 0.467
Average measure for all queries for P@20 for collection BM25 1.0
Average measure for all queries for AP for collection Without stemming 0.09
Average measure for all queries for AP for collection With stemming 0.092
Average measure for all queries for AP for collection Lemmatized 0.092
Average measure for all queries for AP for collection BM25 1.0


In [382]:
results

Unnamed: 0,id_left,Without stemming P@10,Without stemming P@20,Without stemming AP,With stemming P@10,With stemming P@20,With stemming AP,Lemmatized P@10,Lemmatized P@20,Lemmatized AP,BM25 P@10,BM25 P@20,BM25 AP
0,158491,1.0,0.90,0.177856,1.0,0.90,0.177856,1.0,0.90,0.177856,1.0,1.0,1.0
1,5728,1.0,1.00,0.200000,1.0,0.95,0.186538,1.0,0.95,0.186538,1.0,1.0,1.0
2,13554,0.0,0.00,0.000000,0.0,0.00,0.000000,0.0,0.00,0.000000,1.0,1.0,1.0
3,32674,1.0,1.00,0.200000,1.0,1.00,0.200000,1.0,1.00,0.200000,1.0,1.0,1.0
4,406391,0.0,0.00,0.000000,0.2,0.10,0.020000,0.2,0.10,0.020000,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,679227,0.0,0.00,0.000000,1.0,1.00,0.200000,1.0,0.70,0.138750,1.0,1.0,1.0
96,2136797,0.7,0.35,0.070000,0.7,0.35,0.070000,0.7,0.35,0.070000,1.0,1.0,1.0
97,5622,1.0,1.00,0.200000,1.0,1.00,0.200000,1.0,1.00,0.200000,1.0,1.0,1.0
98,1313598,1.0,1.00,0.200000,1.0,0.95,0.188418,1.0,0.95,0.188418,1.0,1.0,1.0
