# Importing libraries

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from elasticsearch import Elasticsearch
import numpy as np
import csv
import re
import time

# Loading encoder model

In [None]:
def load_encorder_model():
    """Load the sentence-encoder model through TensorFlow Hub and return it.

    Prints "model loaded" once ``hub.load`` has returned.
    """
    model = hub.load('universal_sentence_encorder_large')
    print("model loaded")
    return model

# Connecting to ElasticSearch

In [None]:
def connect_to_elasticsearch():
    """Create an Elasticsearch client for localhost:9200 and ping it.

    Returns the client when the ping succeeds. When it fails, a message is
    printed and None is returned.
    """
    client = Elasticsearch([{'host':'localhost','port':9200}])
    # Guard clause: report the failure and hand back None to the caller.
    if not client.ping():
        print("**Can't Connect to ElasticSearch**")
        return None
    print("Connected to ElasticSearch")
    return client

# Inserting data into ElasticSearch

In [None]:
# Get the Elasticsearch client used by the cells below.
# NOTE: connect_to_elasticsearch() returns None when the ping fails, so check
# the printed message before running any cell that calls es.* methods.
es = connect_to_elasticsearch()

In [None]:
# Load the encoder model once; `encorder` is reused to embed every indexed
# title/body and every search query in the cells below.
encorder = load_encorder_model()

In [None]:
# Index definition passed to es.indices.create(): the two text fields plus one
# 512-dimensional dense_vector field per text field.
Es_database = {
    'mappings': {
        'properties': {
            'title': {'type': 'text'},
            'body': {'type': 'text'},
            'title_vector': {'type': 'dense_vector', 'dims': 512},
            'body_vector': {'type': 'dense_vector', 'dims': 512},
        },
    },
}

In [None]:
#es.indices.delete(index='question-index')

In [None]:
# (Re)create the index: drop any stale copy FIRST, then create it with the
# explicit mapping from Es_database. The previous order (create, then delete)
# removed the index that had just been created, so it was gone again before
# any document was inserted.
if es.indices.exists(index='question-index'):
    es.indices.delete(index='question-index')

# Create the index with the text + dense_vector mapping.
qs = es.indices.create(index='question-index',body=Es_database)

In [None]:
#inserting in elasticsearch

# Compiled once up front instead of being looked up for every row; the pattern
# itself is unchanged (non-greedy match of anything between '<' and '>').
tag_pattern = re.compile(r"<.*?>")

start = time.time()
cnt=0
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields reach the reader untouched.
with open('Quora_question_answers/Questions.csv',encoding='latin',newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader,None) #skip the header
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; there is nothing to index.
            continue
        doc_id = row[0]
        # Strip HTML tags, then collapse every run of whitespace
        # (space, tab, newline) into a single space.
        title = ' '.join(tag_pattern.sub("", row[5]).split())
        body = ' '.join(tag_pattern.sub("", row[6]).split())
        # The encoder is called with a one-element list; [0] takes the single
        # resulting vector as a plain Python list so it can be sent as JSON.
        title_vector = np.array(encorder([title])).tolist()[0]
        body_vector = np.array(encorder([body])).tolist()[0]

        bdy = {"title":title,
               "body":body,
               "title_vector":title_vector,
               "body_vector":body_vector
              }
        es.index(index='question-index',id=doc_id,body=bdy)

        cnt+=1
        # Progress report every 1000 inserted documents.
        if cnt%1000 ==0:
            print("Inserted '",cnt,"' Records in ElasticSearch")

end = time.time()
# Elapsed wall-clock time, converted from seconds to minutes.
total_time = (end - start)/60
print(f"Runtime of the program is {total_time}")

To check the inserted values:
# match_all query against the index, to inspect what was stored.
es.search(index = 'question-index',body={'query':{'match_all':{}}})

# Normal Search

In [None]:
def normal_search(query):
    """Run a `match` query for *query* on the `title` field and print the hits.

    For every hit the score and the stored title are printed, followed by a
    separator line. Nothing is returned.
    """
    request_body = {'query':{'match':{'title':query}}}
    response = es.search(index = 'question-index', body= request_body)
    hits = response['hits']['hits']
    for hit in hits:
        print('score:', hit['_score'])
        print(hit['_source']['title'])
        print('********************')

In [None]:
# Example: keyword (match) search on the title field.
que ='how to install python'
normal_search(que)

# Semantic Search

In [None]:
def sementic_search(query_question):
    """Print hits ranked by cosine similarity between the query and title vectors.

    *query_question* is embedded with the module-level ``encorder``. A
    ``script_score`` query over ``match_all`` then scores each document with
    ``cosineSimilarity(params.query_vector,'title_vector')+1.0``. For each hit
    the score minus 1 (the cosine term alone), the title and a separator line
    are printed. Nothing is returned.
    """
    embedding = np.array(encorder([query_question])).tolist()[0]
    # The +1.0 offset is presumably there to keep the score non-negative;
    # it is subtracted again before printing.
    scoring_script = {
        "source":"cosineSimilarity(params.query_vector,'title_vector')+1.0",
        "params":{"query_vector": embedding},
    }
    request_body = {
        "query": {
            "script_score": {
                "query": { "match_all":{} },
                "script": scoring_script,
            }
        }
    }
    response = es.search(index= 'question-index', body = request_body)
    hits = response['hits']['hits']
    for hit in hits:
        print('score:', hit['_score'] - 1)
        print(hit['_source']['title'])
        print('******************')

In [None]:
# Example: the same question as the keyword search above, this time ranked by
# vector similarity on the title.
que = 'how to install python'
sementic_search(que)

# Semantic search on Title & Answer

In [None]:
def sementic_search_test(query_question):
    """Print hits ranked by the summed cosine similarity to title and body vectors.

    *query_question* is embedded with the module-level ``encorder``. A
    ``script_score`` query over ``match_all`` scores each document with the
    cosine similarity to ``title_vector`` plus the cosine similarity to
    ``body_vector`` plus 2.0. For each hit the score minus 2 (the sum of the
    two cosine terms), the title, the body and a separator line are printed.
    Nothing is returned.
    """
    embedding = np.array(encorder([query_question])).tolist()[0]
    # The +2.0 offset (one per cosine term) is presumably there to keep the
    # score non-negative; it is subtracted again before printing.
    scoring_script = {
        "source":"cosineSimilarity(params.query_vector,'title_vector')+cosineSimilarity(params.query_vector,'body_vector')+2.0",
        "params":{"query_vector": embedding},
    }
    request_body = {
        "query": {
            "script_score": {
                "query": { "match_all":{} },
                "script": scoring_script,
            }
        }
    }
    response = es.search(index= 'question-index', body = request_body)
    hits = response['hits']['hits']
    for hit in hits:
        source = hit['_source']
        print('score:', hit['_score'] -2)
        print('Title:',source['title'])
        print('Body:',source['body'])
        print('******************')

In [None]:
# Example: title + body vector search. `que` is now the value actually passed
# to the search (previously `que` was assigned one string and a different
# literal was passed, leaving `que` unused). The query that was really being
# executed, 'how to get rich', is kept.
que = 'how to get rich'
sementic_search_test(que)