# Script for hardcoded questions

## Import of libraries and functions

In [None]:
import sys
import os

import numpy as np

from typing import List

# Put the resolved parent directory on the module search path (at index 1)
# before `config` is imported.
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Elasticsearch connection settings, exported before `config` is imported
# (presumably `config` reads them at import time - TODO confirm).
os.environ.update({
    'ES_USERNAME': 'elastic',
    'ES_PASSWORD': 'changeme',
    'ES_HOST': 'http://localhost:9200/',
})

import config

# Number of hits requested per search. It can be raised above 2000, but the
# searches are going to be slower.
config.es_search_size = 2000

In [None]:
async def _cos_sim_query(query_vector: np.ndarray) -> List[dict]:
    '''Execute a nested vector search in ES ranked by cosine similarity.

    Every document is scored by its best matching nested vector
    (``score_mode = max``) and at most ``config.es_search_size`` documents
    are requested from ``config.es_combined_index``.

    Args:
        query_vector    (np.ndarray): Query vector.

    Returns:
        List[dict]: One entry per hit, in the order returned by ES. Each entry
            is the hit's ``_source`` extended with:
                ``top_scores``  - up to 3 best nested vectors, each as
                                  ``{'score': ..., 'source': ...}``,
                ``_id``         - ES document id,
                ``_score``      - cosine similarity of the best nested vector.
    '''
    vector_name     = 'vectors.vector'
    # The nested path is the part of the field name before the first dot.
    path            = vector_name.split('.')[0]

    source_nested   = {'includes': ['vectors.name', 'vectors.start', 'vectors.end']}
    source_query    = {'includes': ['source', 'url', 'title', 'description', 'identification', 'development', 'damage', 'management', 'links']}

    # Elasticsearch does not accept negative scores, so the cosine similarity
    # is shifted by +1.0 in the script; the shift is removed again below.
    cos     = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
    script  = {"source": cos, "params": {"query_vector": query_vector}}

    query = {
        "bool": {
            "must": {"nested": {
                        "score_mode": "max" ,
                        "path"      : path  ,
                        "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                        "query"     : {"function_score": {"script_score": {"script": script}}}}
            },
        }
    }

    response = await config.es_client.search(
        index   = config.es_combined_index  ,
        query   = query                     ,
        size    = config.es_search_size     ,
        _source = source_query
    )

    hits = []

    for h1 in response['hits']['hits']:
        source = h1['_source']

        # Inner hits are registered under the name "nested" in the query above.
        source['top_scores' ] = [
            {'score': h2['_score'] - 1, 'source': h2['_source']}
            for h2 in h1['inner_hits']['nested']['hits']['hits']
        ]
        source['_id'        ] = h1['_id'    ]
        source['_score'     ] = h1['_score' ] - 1

        hits.append(source)

    return hits

## Reading data

In [None]:
import pandas as pd

# Load only the two columns used by the rest of the notebook.
df = pd.read_csv(
    './hardcoded/source/Questions with Issues July 2022 - transformed.csv',
    usecols=['Question', 'Answers'],
)
# Bare expression: displays the frame when run as a notebook cell.
df

## Building dictionary of static results

Make sure to increase `config.es_search_size` if not all correct links are found (for the time being it is set to 2000).

In [None]:
import logging

es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.WARNING)

results = []

for i, r in df.iterrows():
    question_result = {'hits': []}
    question = r["Question"]
    links = r['Answers'].split('\n')
    
    question_result['question'] = question
    print(f'Question at index {i}: {question}')
    query_vector = config.embed.encode([question], show_progress_bar = False)[0]
    hits = await _cos_sim_query(query_vector = query_vector)
    found = False
    for i1, h in enumerate(hits):
        if h['url'].split('?')[0] in links:
            print(f'Found correct link at {i1+1} result item - {h["url"].split("?")[0]}')
            question_result['hits'].append(h)
            found = True
    if not found:
        print('No corresponding result')
    print(f'Total number of correct links - {len(links)}\n')
    results.append(question_result)


In [None]:
# Extend every result with the stop-word-free question and its embedding.
for res in results:
    question = res['question']
    tokens = config.tokenizer(question)

    # Keep only non-stop-word tokens; each kept token contributes its
    # `text_with_ws`, so the original spacing after it is preserved.
    question_modified = "".join(
        token.text_with_ws for token in tokens if not token.is_stop
    )
    res['question_stop_words'] = question_modified

    print(f'Original question: {question}')
    print(f'Removing stopwords: {question_modified}', end = '\n\n')

    # Stored as a plain list (via .tolist()) rather than the encoder's output object.
    embedding = config.embed.encode([question_modified], show_progress_bar = False)[0]
    vector = embedding.tolist()
    res['vector'] = vector
    print(f'Result vector type and len: {type(vector), len(vector)}')

In [None]:
import pickle

# Destination file for the serialized `results` list.
SAVE_PATH = './hardcoded/source/hardcoded.pickle'

# Write `results` with the highest pickle protocol this interpreter supports.
with open(SAVE_PATH, mode='wb') as out_file:
    pickle.dump(results, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the pickle back from SAVE_PATH and display the loaded object.
# NOTE: only unpickle files from a trusted source.
with open(SAVE_PATH, mode='rb') as in_file:
    hardcoded_dict = pickle.load(in_file)
# Bare expression: displays the loaded object when run as a notebook cell.
hardcoded_dict