In [None]:
# Install txtai and elasticsearch python client
!pip install git+https://github.com/neuml/txtai elasticsearch

# Download and extract elasticsearch
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.1-linux-x86_64.tar.gz
!tar -xzf elasticsearch-7.8.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.8.1

In [None]:
import os
from subprocess import Popen, PIPE, STDOUT

# Start the Elasticsearch server in the background. The child switches to uid 1
# before exec (the install cell chowns the directory to `daemon`, presumably uid 1 here).
# NOTE(review): stdout/stderr go to a pipe that no visible cell reads; if the server
# logs heavily the pipe buffer could fill up - consider redirecting to a file.
server = Popen(['elasticsearch-7.8.1/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))

import time
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts=["http://localhost:9200"], timeout=60, retry_on_timeout=True)

# Actually wait for the server: the process needs a while to boot, and creating the
# index before it listens on port 9200 fails with a connection error.
startup_deadline = time.time() + 120
while not es.ping():
    if server.poll() is not None:
        raise RuntimeError("Elasticsearch exited during startup with code %s" % server.returncode)
    if time.time() > startup_deadline:
        raise RuntimeError("Elasticsearch did not respond on http://localhost:9200 within 120 seconds")
    time.sleep(2)

# ignore=400 keeps the call from raising when the index already exists (cell re-run).
es.indices.create(index="environment", ignore=400)

In [None]:
# Read the combined TV-news corpus into memory, one list entry per line.
# Bytes that cannot be decoded are substituted instead of raising (errors='replace').
lines = []
with open('../input/combined/tvnews_corpus.tsv', 'r', errors='replace') as corpus_file:
    lines = corpus_file.readlines()

In [None]:
from tqdm import tqdm
import csv
counter = 1
finalMapping = dict()  # running document number -> "<filename>#<row>" document id

# Walk every file of the dataset and index column 6 of each row (stored under the
# 'snippet' field) as one Elasticsearch document with id "<filename>#<row number>".
for dirname,_,filenames in os.walk('../input/environmental-news-nlp-dataset/TelevisionNews'):
    for filename in tqdm(filenames, "progress"):
        path = os.path.join(dirname, filename)
        with open(path, 'r', errors='replace') as file:
            reader = csv.reader(file)
            # enumerate yields 0 for the first row (skipped, presumably the header)
            # and 1, 2, 3, ... for the following rows. The previous hand-rolled counter
            # was incremented twice per row, producing ids #1, #3, #5, ... that cannot
            # line up with the consecutive "<file>#<row+1>" ids built by searchAPI.
            for rowNum, row in enumerate(reader):
                if rowNum == 0:
                    continue
                # A fresh body per document avoids sharing one mutable dict across rows.
                dictForRow = {'snippet': row[6]}
                strBuilder = str(filename) + "#" + str(rowNum)
                finalMapping[counter] = strBuilder
                es.index(index="environment", doc_type="env", id=strBuilder, body=dictForRow)
                counter += 1

In [None]:
# Sanity-check query: full-text match on the 'snippet' field, asking for up to
# 10000 hits starting at offset 0 with no minimum score cut-off.
res = es.search(
    index="environment",
    body={
        "from": 0,
        "size": 10000,
        "min_score": 0,
        "query": {"match": {"snippet": "Global warming is a hoax"}},
    },
)


In [None]:
# Display the raw response object returned by the es.search call above.
# NOTE(review): that query requests up to 10000 hits, so this output can be very large.
res

In [None]:
# pywsd provides lemmatize_sentence, which the custom search function below uses
# to normalise query tokens.
!pip install pywsd

In [None]:
import pickle
import operator
import time
from pywsd.utils import lemmatize_sentence
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        v1: first vector (any sequence accepted by numpy ``dot``/``norm``).
        v2: second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``; 0.0 when either vector has zero
        magnitude, instead of the NaN that a 0/0 division would produce
        (NaN scores would also make the later sort order unreliable).
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

# The inverted index file contents are loaded into memory
# All queries will be analyzed with this, so it is fine to preload
# The file holds three consecutive pickles: posting lists, the document id ->
# "file row" dictionary, and the vocabulary.
# NOTE(security): pickle.load can execute arbitrary code; only load index files
# that were produced by a trusted run of the indexing step.
with open("../input/indexing/inv_index_percsv_version","rb") as f:
    posting_list = pickle.load(f)
    file_dict = pickle.load(f)
    vocab = pickle.load(f)

# The actual data is also loaded to display the search results
# (a context manager closes the file even if reading fails)
with open('../input/combined/tvnews_corpus.tsv','r', errors='ignore') as fileobj:
    lines = fileobj.readlines()

def searchAPI(searchterm):
    """Rank corpus documents against a free-text query using the preloaded inverted index.

    Relies on the module-level ``vocab``, ``posting_list``, ``file_dict`` and
    ``lines`` objects loaded earlier in the notebook.

    Args:
        searchterm: the query string.

    Returns:
        A dict with:
          * ``'Details'``   - one dict per matching document (``'Name'``, ``'Score'``
            as a ``(cosine similarity, vector magnitude)`` tuple, ``'Results'`` as the
            third tab-separated field of the corpus line), best match first;
          * ``'Documents'`` - the matching ``"<file>#<row>"`` ids in the same order,
            with any backslash-separated directory prefix removed;
          * ``'Time'``      - seconds spent on candidate selection, scoring and sorting
            (lemmatisation and result formatting are not timed).
    """
    query_tokens = lemmatize_sentence(searchterm) # lemmatize tokens to use as in vocabulary
    query_vector = []
    query_tf = {}
    total_query_vocab = 0
    for tok in query_tokens:
        try:
            indexvalue = vocab.index(tok)
        except ValueError: # Token doesnt exist in vocab - ignored
            continue
        query_vector.append(indexvalue)
        query_tf[indexvalue] = 1 + query_tf.get(indexvalue,0)
        total_query_vocab += 1

    start_time = time.time() # Timer starts

    # First we obtain the list of all possible documents we actually need to search
    # This is a union of the docs in each query term's posting list
    # Not an intersection because we use cosine similarity and not boolean retrieval
    possible_docs = set()
    query_tf_vector = []

    for q in query_vector:
        possible_docs = possible_docs.union(posting_list[q].keys())
        # We also generate a TF vector for the query. Does not make sense to scale with IDF
        query_tf_vector.append(query_tf[q]/total_query_vocab)

    # Run through each doc and generate the vector corresponding to the query terms
    # Compute the cosine similarities of it vs the TF vector of the query
    # Ties are broken by the magnitude of the vector - note that this is obtained by only considering the query terms
    # Plus these query term weights were scaled with relative TF, so a higher magnitude means the terms were more important
    doc_scores = {}
    for doc in possible_docs:
        doc_vector = [posting_list[q].get(doc,0) for q in query_vector]
        doc_scores[doc] = (cosine_similarity(doc_vector,query_tf_vector), norm(doc_vector))

    # Results are sorted by the (similarity, magnitude) tuple, best first
    sorted_results = sorted(doc_scores.items(), key=operator.itemgetter(1), reverse=True)

    end_time = time.time() # Timer ends as search portion is complete
    search_time = end_time - start_time

    results={}
    results['Details']=[]
    results['Documents']=[]
    for doc_id, score in sorted_results:
        # file_dict values look like "<file name> <tag><number>"; split on the LAST
        # space so that a file name containing spaces does not break the unpacking.
        fname, rownum = file_dict[doc_id].rsplit(' ', 1)
        # Drop the first three characters of the row tag (presumably a "row" prefix - verify
        # against the indexing step) and keep the number.
        rownum = int(rownum[3:])
        search_res = lines[doc_id].split('\t')[2]
        answer = fname+'#'+str(rownum+1)
        results['Documents'].append(answer.split('\\')[-1])
        results['Details'].append({'Name': answer, 'Score': score, 'Results': search_res})
    results['Time']=search_time
    return results

In [None]:
def retrivetopkelastic(hits):
    """Collect the '_id' value of every hit, preserving the order of `hits`."""
    return [hit['_id'] for hit in hits]

def retrievaAllElastic(line):
    """Return the ids of all documents whose 'snippet' field matches `line`.

    Uses the scroll API on the module-level ``es`` client to page through the
    complete result set of a ``match`` query against the "environment" index.

    Args:
        line: the query text.

    Returns:
        List of document ``_id`` values in the order Elasticsearch returned them.
    """
    total=[]
    data = es.search(index="environment",scroll='2m', body={"min_score":0, "query":{"match":{"snippet":line}}})
    sid = data['_scroll_id']
    try:
        hits = data['hits']['hits']
        # Stop as soon as a page comes back empty; the previous loop issued one
        # more scroll request after already having received the empty page.
        while hits:
            total += [hit['_id'] for hit in hits]
            data = es.scroll(scroll_id=sid, scroll='2m')
            sid = data['_scroll_id']
            hits = data['hits']['hits']
    finally:
        # Release the server-side scroll context instead of letting it linger until
        # the 2 minute keep-alive expires; a 404 (already gone) is not an error.
        es.clear_scroll(scroll_id=sid, ignore=(404,))
    return total

In [None]:
import pandas as pd
import time

testcasefile="../input/samplequeries/sample_queries.txt"
timelist1=[]   # per-query search time reported by searchAPI
timelist2=[]   # per-query wall-clock time of the Elasticsearch retrieval

retrieved=[]   # per-query document ids from searchAPI
relevant=[]    # per-query document ids from Elasticsearch

# Read the queries with a context manager so the file handle is closed;
# only the lines at index 3 and 4 of the file are evaluated.
with open(testcasefile,"r") as inputfile:
    queries = inputfile.readlines()[3:5]

for line in tqdm(queries, "progress"):

    #SearchAPI
    results = searchAPI(line)
    timelist1.append(results['Time'])
    retrieved.append(results['Documents'])

    #Elastic Search
    tic=time.time()
    relevant.append(retrievaAllElastic(line))
    toc=time.time()
    timelist2.append(toc-tic)

In [None]:
# Persist the evaluation outputs as four consecutive pickles in one file,
# in the order: retrieved, relevant, timelist1, timelist2.
with open("resultsperindex9queryfinal","wb") as out_file:
    for payload in (retrieved, relevant, timelist1, timelist2):
        pickle.dump(payload, out_file)

In [None]:
# Precision@1 over the evaluated queries: the fraction of top-ranked searchAPI
# documents that also appear in the Elasticsearch result set for the same query.
precisionnum = 0
precisionden = 0
# Iterate over the queries that were actually evaluated; a hard-coded range(10)
# raises IndexError whenever fewer than ten queries were run.
for retrieved_docs, relevant_docs in zip(retrieved, relevant):
    top_docs = set(retrieved_docs[:1])
    precisionnum += len(top_docs.intersection(relevant_docs))
    precisionden += len(top_docs)
# Guard against division by zero when no query returned any document.
print(precisionnum/precisionden if precisionden else 0.0)

<a href="./resultsperindex9queryfinal"> Download File </a>