In [165]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import ir_measures
from ir_measures import *
import pandas as pd
import json
from tqdm import tqdm
from time import time

In [101]:
# Read the connection URL (with any credentials embedded in it) from the ES_URL
# environment variable so that no password is stored in the notebook; when the
# variable is not set, fall back to an unauthenticated local node.
import os

es = Elasticsearch(os.environ.get('ES_URL', 'http://localhost:9200'), timeout=30)

### Index Configuration

In [None]:
# Baseline configuration (no stemming): the 'text' field is analyzed by a custom
# analyzer named 'white', which consists of the whitespace tokenizer only.

index = 'wiki_without'

_white_analyzer = {'tokenizer': 'whitespace'}
_text_field = {'type': 'text', 'analyzer': 'white'}

settings_fin = {
    'mappings': {
        'properties': {'text': _text_field},
    },
    'settings': {
        'analysis': {
            'analyzer': {'white': _white_analyzer},
        },
    },
}

In [106]:
# Start from a clean slate: drop an existing index of the same name, then create
# it again with the mapping and analysis settings from settings_fin.
index_present = es.indices.exists(index=index)
if index_present:
    es.indices.delete(index=index)
es.indices.create(index=index, body=settings_fin)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki_without'}

### Checking analyzer

In [107]:
def check_analyzer(analyzer, text):
    """Run `text` through an analyzer of the current index and return its tokens.

    Args:
        analyzer: dict used as the base of the analyze request body,
            e.g. ``{'analyzer': 'white'}``. It is copied, never modified.
        text: the string to analyze.

    Returns:
        A list with the 'token' value of every entry in the response's 'tokens'.
    """
    # Build a new dict so the caller's `analyzer` does not pick up a 'text' key.
    body = {**analyzer, 'text': text}
    response = es.indices.analyze(index=index, body=body)
    return [token_info['token'] for token_info in response['tokens']]

# Sample sentence and analyzer selection used to eyeball the tokenization.
analyzer = dict(analyzer='white')
text = 'It matters not what someone is born, but what they grow to be'

check_analyzer(analyzer, text)

['It',
 'matters',
 'not',
 'what',
 'someone',
 'is',
 'born,',
 'but',
 'what',
 'they',
 'grow',
 'to',
 'be']

### WikiIR collection

In [108]:
# Document collection; the cells below read the 'id_right' and 'text_right' columns.
documents_path = 'documents.csv'
df = pd.read_csv(documents_path)
df

Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...
...,...,...
369716,59396,the population was 416 at the 2010 census the ...
369717,1950034,the surface of the river is frozen from novemb...
369718,1984468,the first anti thrombin aptamer tba was genera...
369719,33966,state of oklahoma as of the 2010 census the po...


### Indexing documents

In [None]:
def create_es_action(index, doc_id, document):
    """Wrap one document as an action dict for the bulk helpers.

    Args:
        index: value stored under '_index'.
        doc_id: value stored under '_id'.
        document: value stored under '_source'.

    Returns:
        A dict with exactly the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

def es_action_generator(df, index_name=None):
    """Yield one bulk action per row of `df`, showing a tqdm progress bar.

    Args:
        df: DataFrame with the columns 'id_right' (used as document id) and
            'text_right' (stored as the document's 'text' field).
        index_name: target index; when omitted, the notebook-level `index`
            variable is used, so the generator follows the currently selected
            index.

    Yields:
        Action dicts built by create_es_action.
    """
    # The global is resolved when iteration starts, not when the function is defined.
    target_index = index if index_name is None else index_name
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        yield create_es_action(target_index, row['id_right'], {'text': row['text_right']})

In [109]:
# Bulk-index the raw documents with 4 worker threads and report the wall-clock time.
start = time()
bulk_stream = parallel_bulk(es, es_action_generator(df), thread_count=4, queue_size=4, chunk_size=1000)
for ok, result in bulk_stream:
    if ok:
        continue
    print(result)
stop = time()

print('Indexing time:', stop - start)

# Refresh the index before the search cells query it.
es.indices.refresh(index=index)

100%|████████████████████████████████████████████████████████████████████████| 369721/369721 [01:01<00:00, 6026.96it/s]


Indexing time: 63.871764183044434


{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

### Search

In [166]:
def pretty_print_result(search_result, fields=()):
    """Print a short, human-readable summary of a search response.

    Args:
        search_result: response dict; its 'hits' entry must hold the total under
            ['total']['value'] and the list of hits under ['hits'].
        fields: names of '_source' fields to print under each hit. The default is
            an immutable empty tuple rather than a shared mutable list.

    Returns:
        None; everything is written with print().
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    """Run `query` against the current index (top 20 hits) and print the result.

    Any extra positional arguments are passed on as the '_source' field names
    that pretty_print_result should print for each hit.
    """
    response = es.search(index=index, body=query, size=20)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    """Fetch one document from the current index and return its '_source' part."""
    document = es.get(index=index, id=doc_id)
    return document['_source']

def search_results(query_id, query):
    """Return the top-20 hits for `query` as run rows.

    Returns:
        A list of ``(query_id, doc_id, score, rank)`` tuples in the order the hits
        were returned; both ids are converted to str and rank counts from 0.
    """
    hits = es.search(index=index, body=query, size=20)['hits']['hits']
    rows = []
    for rank, hit in enumerate(hits):
        rows.append((str(query_id), str(hit['_id']), hit['_score'], rank))
    return rows

### Queries

In [117]:
# Evaluation queries; the cells below read the 'id_left' and 'text_left' columns.
queries_path = 'queries.csv'
test_queries = pd.read_csv(queries_path)
test_queries

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [167]:
def make_query(text):
    """Build a search body: a bool query whose 'must' clause matches `text` on 'text'."""
    match_clause = {'match': {'text': text}}
    bool_query = {'must': match_clause}
    return {'query': {'bool': bool_query}}

def generate_scores(test, path='results.res'):
    """Run every query in `test` and record the hits in memory and in a run file.

    Args:
        test: DataFrame with the columns 'id_left' (query id) and 'text_left'
            (query text).
        path: file that receives one line per hit in the form
            ``<query_id> Q0 <doc_id> <rank> <score> BM25``. It is overwritten.

    Returns:
        A list of ir_measures.ScoredDoc(query_id, doc_id, score), one per hit.
    """
    result = []
    # The context manager closes the run file even when a search raises part-way.
    with open(path, 'w') as run_file:
        for _, row in test.iterrows():
            hits = search_results(row['id_left'], make_query(row['text_left']))
            for query_id, doc_id, score, rank in hits:
                result.append(ir_measures.ScoredDoc(query_id, doc_id, score))
                run_file.write(f'{query_id} Q0 {doc_id} {rank} {score} BM25\n')
    return result

In [124]:
# First pass keeps the scored documents in `run` (and writes results.res).
run = generate_scores(test_queries)

# Second, identical pass whose only purpose is the wall-clock measurement.
start = time()
generate_scores(test_queries)
stop = time()

print('Query execution time (total):', stop - start, 's')

Query execution time (total): 0.37811851501464844 s


In [None]:
# Stemming configuration: the 'text' field is analyzed by a custom analyzer named
# 'porter_stemmer' (whitespace tokenizer followed by the 'porter_stem' filter
# declared in the same analysis section).

index = 'wiki_with'

_porter_filter = {'type': 'porter_stem', 'language': 'English'}
_porter_analyzer = {'tokenizer': 'whitespace', 'filter': ['porter_stem']}

settings_fin2 = {
    'mappings': {
        'properties': {
            'text': {'type': 'text', 'analyzer': 'porter_stemmer'},
        },
    },
    'settings': {
        'analysis': {
            'analyzer': {'porter_stemmer': _porter_analyzer},
            'filter': {'porter_stem': _porter_filter},
        },
    },
}

In [127]:
# Recreate the index with the stemming settings, then check the analyzer on the
# sample sentence.
index_present = es.indices.exists(index=index)
if index_present:
    es.indices.delete(index=index)
es.indices.create(index=index, body=settings_fin2)

analyzer = dict(analyzer='porter_stemmer')

check_analyzer(analyzer, text)

['It',
 'matter',
 'not',
 'what',
 'someon',
 'is',
 'born,',
 'but',
 'what',
 'thei',
 'grow',
 'to',
 'be']

In [128]:
# Bulk-index the raw documents into the stemming index and report the wall-clock time.
start = time()
bulk_stream = parallel_bulk(es, es_action_generator(df), thread_count=4, queue_size=4, chunk_size=1000)
for ok, result in bulk_stream:
    if ok:
        continue
    print(result)
stop = time()

print('Indexing time:', stop - start)

# Refresh the index before the search cells query it.
es.indices.refresh(index=index)

100%|████████████████████████████████████████████████████████████████████████| 369721/369721 [00:46<00:00, 7963.72it/s]


Indexing time: 46.94376277923584


{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

### Lemmatization

In [154]:
import nltk

# NLTK resources fetched for the lemmatization step.
for resource in ('wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maksu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maksu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maksu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maksu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [156]:
import os

# This cell used to be entirely commented out, which left lemmatize_with_postag
# undefined for the query-lemmatization cell further down. The function is live
# again and is written with nltk, which this notebook imports; the earlier
# commented-out version relied on a TextBlob name that was never imported.
# NOTE(review): lemmatized.csv was produced by that earlier version; confirm that
# this nltk-based version yields the same lemmas before mixing old and new output.
_wordnet_lemmatizer = nltk.WordNetLemmatizer()


def lemmatize_with_postag(sentence):
    """Lemmatize every word of `sentence` according to its part-of-speech tag.

    Args:
        sentence: text to tokenize, POS-tag and lemmatize.

    Returns:
        The lemmas joined by single spaces.
    """
    # First letter of the POS tag -> WordNet part of speech; anything else is
    # treated as a noun.
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # Tokens whose tag does not start with a letter (punctuation) are skipped.
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in tagged if pos[:1].isalpha()]
    lemmatized_list = [_wordnet_lemmatizer.lemmatize(wd, tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)


# The pass over the whole collection took about 75 minutes in the recorded run, so
# it only runs when the cached CSV is not on disk yet.
if not os.path.exists('lemmatized.csv'):
    lemma = [lemmatize_with_postag(doc_text) for doc_text in tqdm(df['text_right'], total=df.shape[0])]
    lemmatized = pd.DataFrame({'id_right': df['id_right'].values, 'text_right': lemma})
    lemmatized.to_csv('lemmatized.csv', index=None)

100%|████████████████████████████████████████████████████████████████████████| 369721/369721 [1:15:00<00:00, 82.16it/s]


In [157]:
# Load the lemmatized documents from the cached CSV.
lemmatized_path = 'lemmatized.csv'
lemmatized = pd.read_csv(lemmatized_path)
lemmatized

Unnamed: 0,id_right,text_right
0,1781133,it be use in land craft during world war ii an...
1,2426736,after reject an offer from cambridge universit...
2,2224122,mat zan coach kuala lumpur fa in 1999 and win ...
3,219642,a barcode be a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...
...,...,...
369716,59396,the population be 416 at the 2010 census the v...
369717,1950034,the surface of the river be frozen from novemb...
369718,1984468,the first anti thrombin aptamer tba be generat...
369719,33966,state of oklahoma a of the 2010 census the pop...


### Queries (lemmatization)

In [158]:
# Lemmatize every query text with lemmatize_with_postag, keeping the query ids.
queries = [
    lemmatize_with_postag(query_text)
    for query_text in tqdm(test_queries['text_left'], total=test_queries.shape[0])
]

test_queries_lemmatized = pd.DataFrame({'id_left': test_queries['id_left'].values, 'text_left': queries})
test_queries_lemmatized

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.48it/s]


Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united state
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aid
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [159]:
# NOTE(review): `index` is still 'wiki_with' at this point (see the response below),
# so the index that held the stemmed documents is dropped and rebuilt here with the
# whitespace-only settings (settings_fin) to receive the lemmatized documents.
index_present = es.indices.exists(index=index)
if index_present:
    es.indices.delete(index=index)
es.indices.create(index=index, body=settings_fin)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki_with'}

In [160]:
start = time()
for ok, result in parallel_bulk(es, es_action_generator(lemmatized), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
es.indices.refresh(index=index)

100%|████████████████████████████████████████████████████████████████████████| 369721/369721 [01:10<00:00, 5262.23it/s]


Indexing time: 72.88427734375


{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [125]:
# BM25 results: score the run stored in BM25.res against the relevance judgments.
qrels = ir_measures.read_trec_qrels('qrels')
BM25 = ir_measures.read_trec_run('BM25.res')

ir_measures.calc_aggregate([P@5, P@10, P@20, AP], qrels, BM25)

{AP: 0.11196168401599797,
 P@20: 0.09499999999999999,
 P@5: 0.18399999999999994,
 P@10: 0.1319999999999999}

In [126]:
# My results: score the run written to results.res against the relevance judgments.
qrels = ir_measures.read_trec_qrels('qrels')
results = ir_measures.read_trec_run('results.res')

ir_measures.calc_aggregate([P@5, P@10, P@20, AP], qrels, results)

{AP: 0.14774554975616933,
 P@20: 0.14900000000000005,
 P@5: 0.3039999999999997,
 P@10: 0.20699999999999988}

In [130]:
# Regenerate results.res from the currently selected index, then score it.
run = generate_scores(test_queries)

qrels = ir_measures.read_trec_qrels('qrels')
results = ir_measures.read_trec_run('results.res')

ir_measures.calc_aggregate([P@5, P@10, P@20, AP], qrels, results)

{AP: 0.14839489899710584,
 P@20: 0.14850000000000008,
 P@5: 0.3059999999999997,
 P@10: 0.20699999999999988}

In [164]:
def make_query(text):
    """Build a search body that requires a match on 'text' and boosts exact phrases.

    The bool query has a 'must' match clause on `text` and a 'should'
    match_phrase clause on the same text with boost 5.
    """
    required = {'match': {'text': text}}
    phrase_bonus = {'match_phrase': {'text': {'query': text, 'boost': 5}}}
    return {'query': {'bool': {'must': required, 'should': phrase_bonus}}}

# Run the lemmatized queries through the current make_query and score the new run.
run = generate_scores(test_queries_lemmatized)

qrels = ir_measures.read_trec_qrels('qrels')
results = ir_measures.read_trec_run('results.res')

ir_measures.calc_aggregate([P@5, P@10, P@20, AP], qrels, results)

{AP: 0.12062277390283895,
 P@20: 0.136,
 P@5: 0.2439999999999998,
 P@10: 0.18599999999999997}