# ES playground

## Testing ES - playing with functions to handle requests from ChatBot

In [1]:
import sys
import os
import json

import numpy as np

from typing import List

# Put the parent directory on the module search path (presumably where the
# project's `config` module lives - confirm) so `import config` below resolves.
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Environment for this session. These must be assigned BEFORE `import config`:
# the log lines emitted on import echo several of these values (host, username,
# search size, cut-off), so `config` evidently reads them at import time.
# NOTE(review): plain assignment overwrites any value already present in the
# process environment, and the password is logged in clear text by `config`;
# keep only local/dev credentials here.
os.environ['STAGE']             = 'dev'
os.environ['ES_USERNAME']       = 'elastic'
os.environ['ES_PASSWORD']       = 'changeme'
os.environ['ES_HOST']           = 'http://localhost:9200/'
os.environ['ES_IMITATE']        = 'false'
os.environ['ES_SEARCH_SIZE']    = '100'
os.environ['ES_CUT_OFF']        = '0.6'

# Deliberately imported last, after the environment has been prepared.
import config

INFO:config:----------------------------------------------
INFO:config:Environment variables for DEV environment
INFO:config:- debug           = True
INFO:config:- es_search_size  = 100
INFO:config:- es_cut_off      = 0.6
INFO:config:- es_top_n        = 3
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = http://localhost:9200/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- combined index          = combined
INFO:config:----------------------------------------------
INFO:co

### Querying for nested fields filtering IDs

In [2]:
question    = (
    'We have small (5mm) reddish brown beetles (species unknown) eating our salvia and basil leaves at night. '
    'Is there a safe control such as a powder, spray or oil that is effective at discouraging this pest?')

slots       = (
    'beetles salvia basil leaves pest'
)

index               = 'combined'
vector_name         = 'vectors.vector'
source_query        = {'includes': ['source', 'url', 'name', 'description', 'identification', 'development', 'damage', 'management', 'links']}
source_nested       = ['vectors.name']
# for links use this instead
# vector_name   = 'vectors_links.vector'
# source_nested = ['vectors_links.order']


query_vector = config.embed([question]).numpy()[0]

cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
script =  {"source": cos, "params": {"query_vector": query_vector}}

path = vector_name.split('.')[0]
query_nested = {
    "bool": {"must": {"nested": {
                "score_mode": "max" ,
                "path"      : path  ,
                "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                "query"     : {"function_score": {"script_score": {"script": script}}}}}
}}

# for filtering the IDs
query_nested['bool']['filter'] = {'ids': {'values': ['j4BDsn8BMcaQk2WQFaGI']}}

response = await config.es_client.search(
    index   = index         ,
    query   = query_nested  ,
    size    = 10            ,
    _source = source_query
)

hits = []

for h1 in response['hits']['hits']:
    top_scores = []
    for h2 in h1['inner_hits']['nested']['hits']['hits']:
        top_scores.append({'score': h2['_score'], 'source': h2['_source']})
    h1['_source']['top_scores'] = top_scores
    h1['_source']['_id']        = h1['_id']
    h1['_source']['_score']     = h1['_score']
    hits.append(h1['_source'])
    
    
for item in hits[:1]:
    for k, v in item.items():
        print(f'{k:<20}: {v[:100] if isinstance(v, str) else v}...')        
    

INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.079s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.004s]
damage              : ...
development         : ...
identification      : ...
management          : ...
name                : grape vine question....
description         : grape vine question. I purchased two grapevines and two blueberry plants last fall and planted them ...
links               : [{'src': '', 'link': '', 'type': 'answer', 'title': "There are many reasons for a young grapevine to lose its leaves. Different varieties of grapes have different cold hardiness with some being able to withstand much colder temperatures than others. Adequate water in the ground is essential and must be maintained all year, even in the winter, in order for the roots to stay healthy and nourish the plant. That doesn't mean to drench the soil and drown the plant, just keep it somewhat moist and don't let it dry out completely. There a

## Debug cell

The flow of ES query is as follows:

Simple query against every possible field:
```python
(hits, hits_slots) = await _handle_es_query(question)
```

Some formatting fixes applied to the results:
```python
hits = _handle_es_result(hits)
```

Some weighting done on scores:
```python
hits = _weight_score(hits)
```

### Debug functions

In [3]:
async def _cos_sim_query(
    index           : str           ,
    vector_name     : str           ,
    query_vector    : np.ndarray    ,
    filter_ids      : List[str] = None
    ) -> List[dict]:
    '''Execute a nested vector search in ES scored by cosine similarity.

    Each document is scored by its best-matching nested vector
    (``score_mode: max``); at most ``config.es_search_size`` documents are
    requested.

    Args:
        index           (str)       : Name of the index.
        vector_name     (str)       : Nested vector field compared against the query
                                      vector, e.g. ``'vectors.vector'``. The part before
                                      the first dot is used as the nested path.
        query_vector    (np.ndarray): Query vector.
        filter_ids      (list[str]) : Restrict results to these document IDs.
                                      ``None`` or an empty list applies no filter.
                                      Defaults to None.

    Returns:
        list[dict]: One dict per hit, holding the requested ``_source`` fields plus
            ``top_scores`` (up to 3 inner hits as ``{'score': ..., 'source': ...}``),
            ``_id`` and ``_score``.
    '''
    # Cosine similarity lies in [-1, 1], so adding 1.0 keeps the script score non-negative.
    cos     = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
    script  = {"source": cos, "params": {"query_vector": query_vector}}

    # Top-level document fields returned with every hit.
    source_query = {'includes': [
        'source', 'url', 'name', 'description', 'identification',
        'development', 'damage', 'management', 'links'
    ]}

    # Nested field returned with every inner hit.
    source_nested = ['vectors.name']
    if vector_name == 'vectors_links.vector':
        source_nested = ['vectors_links.name']

    path = vector_name.split('.')[0]
    query = {"bool": {"must": {"nested": {
                    "score_mode": "max" ,
                    "path"      : path  ,
                    "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                    "query"     : {"function_score": {"script_score": {"script": script}}}}}
    }}

    if filter_ids:
        query['bool']['filter'] = {'ids': {'values': filter_ids}}

    response = await config.es_client.search(
        index   = index                 ,
        query   = query                 ,
        size    = config.es_search_size ,
        _source = source_query
    )

    # Flatten each hit into a fresh dict instead of mutating the ES response.
    hits = []
    for hit in response['hits']['hits']:
        top_scores = [
            {'score': inner['_score'], 'source': inner['_source']}
            for inner in hit['inner_hits']['nested']['hits']['hits']
        ]
        hits.append({
            **hit['_source'],
            'top_scores': top_scores,
            '_id'       : hit['_id'],
            '_score'    : hit['_score'],
        })

    return hits


In [4]:
async def _handle_es_query(
    question: str       ,
    slots   : str = None,
    ) -> tuple:
    '''Perform the question search and, optionally, the slot search in ES.

    The question is embedded and searched against ``vectors.vector`` in the
    combined index. When ``slots`` is given, a second search with the slot
    embedding is run, restricted to the documents whose first-pass score is
    above ``config.es_cut_off``.

    Args:
        question (str)  : Query statement.
        slots    (str)  : Extracted slots. Defaults to None.

    Returns:
        tuple: ``(hits, hits_slots)``. ``hits_slots`` is an empty list when no
            slots were given or when no first-pass hit passed the cut-off.
    '''
    query_vector = config.embed([question]).numpy()[0]

    hits = await _cos_sim_query(
        index           = config.es_combined_index,
        vector_name     = 'vectors.vector'          ,
        query_vector    = query_vector
    )

    hits_slots = []
    if not slots:
        return hits, hits_slots

    # NOTE(review): `_score` is produced by the `cosineSimilarity(...) + 1.0`
    # script in _cos_sim_query, i.e. it lies in [0, 2]; confirm that
    # config.es_cut_off is expressed on that shifted scale.
    filter_ids = [h['_id'] for h in hits if h['_score'] > config.es_cut_off]
    if not filter_ids:
        # Nothing passed the cut-off. _cos_sim_query treats an empty ID list as
        # "no filter", so running the slot search here would query the whole
        # index instead of returning nothing.
        return hits, hits_slots

    slots_vector = config.embed([slots]).numpy()[0]
    hits_slots = await _cos_sim_query(
        index           = config.es_combined_index,
        vector_name     = 'vectors.vector'          ,
        query_vector    = slots_vector            ,
        filter_ids      = filter_ids
    )

    return hits, hits_slots

# Run both searches for the sample question: `hits` holds the question-vector
# matches, `hits_slots` the slot-vector matches.
hits, hits_slots = await _handle_es_query(question, slots = slots)


INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.142s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.189s]


In [5]:
def _handle_es_result(hits: dict) -> dict:
    '''Processing of the results.

    Args:
        hits (dict): Results from query data.
        
    Returns:
        dict: Processed results.
    '''
    '''
    TO BE IMPLEMENTED
    '''
    return hits

# Pass the question hits through the post-processing step (currently a pass-through).
hits = _handle_es_result(hits)

In [6]:
def _weight_score(hits: dict) -> dict:
    '''Weight and merge scores.

    Args:
        hits (dict): Sorted results.
        
    Returns:
        dict: Sorted data with new scores.
    '''

    '''
    TO BE IMPLEMENTED
    '''
    return hits

# Apply the score weighting step (currently a pass-through).
hits = _weight_score(hits)

In [7]:
def _format_result(
    index           = None,
    source           = None,
    score           = None,
    url             = None,
    name            = None,
    description     = None,
    damage          = None,
    identification  = None,
    development     = None,
    management      = None,
    ) -> dict:

    res = {}
    res['title'] = (
        f'<p>{index+1})<em>{name}</a></em>'
        f'</br>(score: {score:.2f})</br>'
        f'(source: <a href="{url}" target="_blank">{source}</a></p>')
    res['description'] = ''
    if description:
        res['description'] += (f'<p><strong>Details</strong>: {description[:100]}</p></br>'             )
    if damage:
        res['description'] += (f'<p><strong>Damage</strong>: {damage[:100]}</p></br>'                   )
    if identification:
        res['description'] += (f'<p><strong>Identification</strong>: {identification[:100]}</p></br>'   )
    if development:
        res['description'] += (f'<p><strong>Development</strong>: {development[:100]}</p></br>'         )
    if management:
        res['description'] += (f'<p><strong>Management</strong>: {management[:100]}</p></br>'           )
    
    return res

def _get_text(hits: List[dict]) -> dict:
    '''Build the chatbot payload for the best results.

    Args:
        hits (list[dict]): Sorted, flattened results from the ES query. Each item
            is expected to carry ``_score`` plus the document fields ``source``,
            ``url``, ``name``, ``description``, ``identification``,
            ``development``, ``damage`` and ``management``; missing fields are
            treated as None (``_score`` as 0.0).

    Returns:
        dict: ``{'text': ..., 'payload': 'collapsible', 'data': [...]}`` where
            ``data`` holds one formatted entry for each of the first
            ``min(config.es_top_n, len(hits))`` hits.
    '''
    top_n = min(config.es_top_n, len(hits))

    res = {
        'text'      : f'Top {top_n} results from data sources:',
        'payload'   : 'collapsible',
        'data'      : []
    }

    for i, h in enumerate(hits[:top_n]):
        res['data'].append(
            _format_result(
                index           = i                                     ,
                # The source label lives under 'source' in the flattened hit;
                # '_source' is kept only as a fallback for older callers.
                source          = h.get('source', h.get('_source'))     ,
                score           = h.get('_score', 0.0)                  ,
                url             = h.get('url')                          ,
                name            = h.get('name')                         ,
                description     = h.get('description')                  ,
                identification  = h.get('identification')               ,
                development     = h.get('development')                  ,
                damage          = h.get('damage')                       ,
                management      = h.get('management')
            )
        )
    return res

# Render the hits as the chatbot payload; as the last expression of the cell,
# the returned dict is shown as the cell output.
_get_text(hits)

{'text': 'Top 3 results from data sources:',
 'payload': 'collapsible',
 'data': [{'title': '<p>1)<em>Pearleaf blister mite</a></em></br>(score: 1.63)</br>(source: <a href="http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/pearlfblmite.html?src=exchbt" target="_blank">None</a></p>',
   'description': '<p><strong>Details</strong>: This eriophyid mite (family Eriophyidae) feeds on pears, causing discolored fruit and leaves, distor</p></br><p><strong>Damage</strong>: Mites feeding during winter can cause infested buds to blacken, become dry, and drop before spring. </p></br><p><strong>Identification</strong>: Because eriophyids are microscopic, plant injury from their feeding as described below under "Damage</p></br><p><strong>Development</strong>: Pearleaf blister mite develops through four life stages: egg, protonymph, deutonymph, and adult. The</p></br><p><strong>Management</strong>: In residential fruit trees, natural enemies may keep pearleaf blister mites under adequate biologica</p></br>

In [9]:
# Re-run the search, then send the slot-filtered hits (`hits_slots`), rather than
# the plain question hits, through the post-processing, weighting and rendering steps.
hits, hits_slots = await _handle_es_query(question, slots)
hits = _handle_es_result(hits_slots)
hits = _weight_score(hits)
_get_text(hits)

INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.135s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.119s]


{'text': 'Top 3 results from data sources:',
 'payload': 'collapsible',
 'data': [{'title': '<p>1)<em>Elm Leaf Beetle</a></em></br>(score: 1.60)</br>(source: <a href="http://ipm.ucanr.edu/PMG/PESTNOTES/pn7403.html?src=exchbt" target="_blank">None</a></p>',
   'description': '<p><strong>Details</strong>: The elm leaf beetle, Xanthogaleruca (=Pyrrhalta) luteola, is a leaf-chewing pest of elm trees, espec</p></br><p><strong>Damage</strong>: Adults chew entirely through the leaf, often in a shothole pattern. Larvae skeletonize the leaf surf</p></br><p><strong>Development</strong>: Adults are 1/4 inch long, olive-green beetles with black, longitudinal stripes along the margin and </p></br><p><strong>Management</strong>: It is essential to correctly identify the cause of damaged elm leaves before taking management actio</p></br>'},
  {'title': '<p>2)<em>Leaf beetles</a></em></br>(score: 1.57)</br>(source: <a href="http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT/leafbeetle.html?src=exchbt" targ