# Script for hardcoded questions

## Import of libraries and functions

In [None]:
import os
import sys
from typing import List, Optional, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Put the parent of the current working directory on the module search path.
# NOTE(review): presumably this is what makes `import config` below resolvable — confirm
# against the repository layout.
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Elasticsearch connection settings, exported before `config` is imported.
# NOTE(review): presumably `config` reads these at import time, which is why the order of
# these statements matters — confirm. The values look like local-development defaults.
os.environ['ES_USERNAME'    ] = 'elastic'
os.environ['ES_PASSWORD'    ] = 'changeme'
os.environ['ES_HOST'        ] = 'http://localhost:9200/'

import config

# Set the es_search_size parameter in config to 2000 (can be more, but it is going to be slower).
# It is set back to 100 further down, once the static results have been collected.
config.es_search_size = 2000

## Reading data

In [None]:
import pandas as pd

# Load only the three columns used below: the question group, the question text
# and the newline-separated list of expected answer links.
df = pd.read_csv(
    './hardcoded/source/Questions with Issues July 2022 - transformed.csv',
    usecols = ['Group', 'Question', 'Answers'],
)
# Show a random sample of rows as a quick sanity check of the loaded data.
df.sample(10)

In [None]:
# One representative row per question group.
# NOTE(review): per the pandas documentation `GroupBy.first()` takes the first non-null
# value of each column, which is not necessarily the first row of the group — confirm
# this is the intended choice of representative question.
groups = (
    df.groupby('Group')
      .first()
      .reset_index()
)
groups

## Building dictionary of static results

Make sure to increase `es_search_size` if not all correct links were found (for the time being it is set to 2000).

In [None]:
async def _cos_sim_query(query_vector: np.ndarray) -> List[dict]:
    '''Execute a vector search in ES based on cosine similarity.

    Each document is scored by its best matching nested vector (``score_mode`` is ``max``)
    and up to three of its nested vectors are returned with it as inner hits.

    Args:
        query_vector    (np.ndarray): Query vector.

    Returns:
        List[dict]: One dict per hit, in the order returned by ES. Each dict is the hit's
            ``_source`` extended with ``top_scores`` (score and source of every inner hit),
            ``_id`` and ``_score``. Scores are cosine similarities (the +1.0 shift applied
            in the script is removed again).
    '''
    vector_name     = 'vectors.vector'
    source_nested   = {'includes': ['vectors.name', 'vectors.start', 'vectors.end']}

    # Cosine similarity lies in [-1, 1]; shifting by +1.0 keeps the script score non-negative
    # (Elasticsearch does not accept negative scores). The shift is undone when reading the hits.
    cos     = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
    script  = {"source": cos, "params": {"query_vector": query_vector}}

    source_query = {'includes': ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']}

    # The nested path is the part of the field name before the first dot ('vectors').
    path = vector_name.split('.')[0]
    query = {
        "bool": {
            "must": {"nested": {
                        "score_mode": "max" ,
                        "path"      : path  ,
                        "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                        "query"     : {"function_score": {"script_score": {"script": script}}}}
            },
        }
    }

    response = await config.es_client.search(
        index   = config.es_combined_index  ,
        query   = query                     ,
        size    = config.es_search_size     ,
        _source = source_query
    )

    hits = []

    for h1 in response['hits']['hits']:
        top_scores = [
            {'score': h2['_score'] - 1, 'source': h2['_source']}
            for h2 in h1['inner_hits']['nested']['hits']['hits']
        ]

        # Build a new dict instead of writing the extra keys into the response object.
        hits.append({
            **h1['_source'],
            'top_scores': top_scores        ,
            '_id'       : h1['_id']         ,
            '_score'    : h1['_score'] - 1  ,
        })

    return hits

In [None]:
import logging

es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.WARNING)

group_results = {}

for i, r in groups.iterrows():
    question_result = {'hits': []}
    group = r['Group']
    question = r["Question"]
    links = r['Answers'].split('\n')
    
    print(f'Question in group {group}: {question}')
    query_vector = config.embed.encode([question], show_progress_bar = False)[0]
    hits = await _cos_sim_query(query_vector = query_vector)
    found = False
    for i1, h in enumerate(hits):
        if h['url'].split('?')[0] in links:
            print(f'Found correct link at {i1+1} result item - {h["url"].split("?")[0]}')
            question_result['hits'].append(h)
            found = True
    if not found:
        print('No corresponding result')
    print(f'Total number of correct links - {len(links)}\n')
    group_results[group] = question_result


In [None]:
# Overwrite the real ES scores of the stored hits with synthetic ones drawn uniformly
# from [0.8, 0.95), handed out in descending order so the stored hit order is kept.
for group_result in group_results.values():
    group_hits = group_result['hits']
    scores_fake = sorted(np.random.uniform(low=0.8, high=.95, size=(len(group_hits),)), reverse=True)
    for hit, fake_score in zip(group_hits, scores_fake):
        print(f'Before score: {hit["_score"]}, new score: {fake_score}')
        hit['_score'] = fake_score
        # Every nested score of the hit receives the same synthetic value.
        for nested_score in hit['top_scores']:
            nested_score['score'] = fake_score
    print('')

In [None]:
from copy import deepcopy

# One record per question in the CSV: the static hits of its group plus the question
# text, the question without stop words and the embedding of the latter.
results = []
for _, row in df.iterrows():
    group = row['Group']
    # Deep copy so that records of the same group do not share the hit dicts.
    question_item = deepcopy(group_results[group])
    question_item['group'] = group

    question = row['Question']
    question_item['question'] = question

    # Keep every non-stop-word token together with its trailing whitespace.
    question_modified = ''.join(
        token.text_with_ws for token in config.tokenizer(question) if not token.is_stop
    )
    question_item['question_stop_words'] = question_modified
    print(f'Original question: {question}')
    print(f'Removing stopwords: {question_modified}', end = '\n\n')

    # Store the embedding as a plain Python list.
    vector = config.embed.encode([question_modified], show_progress_bar = False)[0].tolist()
    question_item['vector'] = vector
    print(f'Result vector type and len: {type(vector), len(vector)}')
    results.append(question_item)

In [None]:
# Overview of the collected records: position, group and question text.
for index, item in enumerate(results):
    group_name = item["group"]
    question_text = item["question"]
    print(f'Index: {index}')
    print(f'Group: {group_name}')
    print(f'Question: {question_text}')

In [None]:
# Set the es_search_size parameter in config back to 100 now that the static results are collected.
# NOTE(review): 100 is hard-coded here; the value config held before it was raised to 2000
# was not saved — confirm 100 matches the project default.
config.es_search_size = 100

## Saving into pickle and loading

In [None]:
import pickle

# Location of the pickle file holding the list of hardcoded question records;
# it is written and read back by the cells that follow.
SAVE_PATH = './hardcoded/transformed/hardcoded.pickle'

In [None]:
# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists before writing (skipped when the path has no directory part).
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

with open(SAVE_PATH, 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the records back from disk as a check of the saved file.
# NOTE: pickle must only be loaded from trusted files; this one was written by this notebook.
with open(SAVE_PATH, mode='rb') as pickle_file:
    hardcoded_queries = pickle.load(pickle_file)
hardcoded_queries

## Implementation details in the main function

Make sure to update the code in `es_playground.ipynb` as well as in the `es.py` file accordingly.

In [None]:
async def _handle_es_query(
    query       : str                           ,
    slots       : Optional[List[str]] = None
    ) -> Tuple[list, str]:
    '''Perform search in ES base.

    The raw query is first compared with the stored hardcoded questions. The query (with
    synonyms replaced and slots appended) is then embedded and searched in ES. If a hardcoded
    question matched, its stored hits are placed in front of the ES hits.

    Args:
        query       (str)                   : Query statement.
        slots       (Optional[List[str]])   : Additional entity queries. Defaults to None.

    Returns:
        Tuple[list, str]: Results from ES query and final transformed query that was embedded.
                            If slots were provided, then results with slots refinement.
                            If a hardcoded question matched, its hits come first, followed by the
                            ES hits with a different URL and a score above
                            ``config.es_cut_off_hardcoded``.
    '''

    def _synonym_replace(text):
        '''Replace tokens found in ``config.synonym_dict`` (lower-cased lookup), keeping whitespace.'''
        pieces = []
        for token in config.tokenizer(text):
            t = token.text.lower()

            if t in config.synonym_dict:
                pieces.append(config.synonym_dict[t])
                pieces.append(token.whitespace_)
            else:
                pieces.append(token.text_with_ws)

        return ''.join(pieces)

    def _check_for_hardcoded_queries(text):
        '''Return the hardcoded question most similar to ``text``, or None if none is similar enough.'''
        # NOTE(review): the notebook-level `hardcoded_queries` is used here; the original code
        # carried a commented-out loop over `config.hardcoded_queries` — confirm which one the
        # production code should use.
        if not hardcoded_queries:
            # Nothing to compare against, so skip tokenizing and embedding altogether.
            return None

        # The stored vectors were computed on the questions without stop words,
        # so the incoming text is stripped of stop words as well.
        text_modified = ''.join(
            token.text_with_ws for token in config.tokenizer(text) if not token.is_stop
        )
        query_vector = config.embed.encode([text_modified], show_progress_bar = False)[0]

        # One batched similarity call instead of one sklearn call per hardcoded question.
        scores = cosine_similarity(
            [query_vector],
            [h_query['vector'] for h_query in hardcoded_queries]
        )[0]
        best_index  = int(np.argmax(scores))    # first maximum wins on ties
        best_score  = scores[best_index]

        # Only a strictly positive score that reaches the threshold counts as a match.
        if best_score <= 0 or best_score < config.es_hardcoded_threshold:
            return None

        return hardcoded_queries[best_index]

    # The hardcoded check runs on the raw query, before synonyms are replaced.
    check_hardcoded = _check_for_hardcoded_queries(query)
    query = _synonym_replace(query)

    if slots:
        query = '. '.join([query] + [_synonym_replace(s) for s in slots])

    # Sentence Encoder model
    query_vector = config.embed.encode([query], show_progress_bar = False)[0]

    hits = await _cos_sim_query(query_vector = query_vector)

    if check_hardcoded:
        # Drop ES hits that duplicate a hardcoded hit or score too low, then put the hardcoded hits first.
        urls = {h['url'] for h in check_hardcoded['hits']}
        hits = [h for h in hits if h['url'] not in urls and h['_score'] > config.es_cut_off_hardcoded]
        hits = check_hardcoded['hits'] + hits

    return hits, query

In [None]:
question = 'What could be causing the holes in my kale plant leaves?'
hits, query = await _handle_es_query(question)

for h in hits:
    print(f"{h['url']:<30s}, top score: {h['_score']}, and scores: {h['top_scores']}")