# ES playground

## Testing ES - playing with functions to handle requests from ChatBot

In [1]:
import sys
import os

import numpy as np

from typing import List, Tuple

sys.path.insert(1, os.path.realpath(os.path.pardir))

os.environ['STAGE'          ] = 'dev'
os.environ['ES_USERNAME'    ] = 'elastic'
os.environ['ES_PASSWORD'    ] = 'changeme'
os.environ['ES_HOST'        ] = 'http://localhost:9200/'
os.environ['ES_IMITATE'     ] = 'false'
os.environ['ES_SEARCH_SIZE' ] = '100'
os.environ['ES_CUT_OFF'     ] = '0.4'
os.environ['ES_ASK_WEIGHT'  ] = '0.8'

import config

INFO:config:----------------------------------------------
INFO:config:Environment variables for DEV environment
INFO:config:- debug           = True
INFO:config:- es_search_size  = 100
INFO:config:- es_cut_off      = 0.4
INFO:config:- es_top_n        = 5
INFO:config:- es_ask_weight   = 0.8
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = http://localhost:9200/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- combined index          = combined
INFO:config:------------------

### Querying for nested fields filtering IDs

In [10]:
question    = (
    'We have small (5mm) reddish brown beetles (species unknown) eating our salvia and basil leaves at night. '
    'Is there a safe control such as a powder, spray or oil that is effective at discouraging this pest?')

slots       = (
    'beetles salvia basil leaves pest'
)

index               = 'combined'
vector_name         = 'vectors.vector'
source_query        = {'includes': ['source', 'url', 'name', 'description', 'identification', 'development', 'damage', 'management', 'links']}
source_nested       = ['vectors.name']
# for links use this instead
# vector_name   = 'vectors_links.vector'
# source_nested = ['vectors_links.order']


query_vector = config.embed([question]).numpy()[0]
slots_vector = config.embed([slots]).numpy()[0]

final_vector = np.average([query_vector, slots_vector])


cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
script =  {"source": cos, "params": {"query_vector": final_vector}}

path = vector_name.split('.')[0]
query_nested = {
    "bool": {
        "must": {"nested": {
                "score_mode": "max" ,
                "path"      : path  ,
                "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                "query"     : {"function_score": {"script_score": {"script": script}}}}
        },
        "filter"    : [],
        # "must_not"  : []
}}

# for filtering the IDs
query_nested['bool']['filter'   ].append({'ids'     : {'values': ['Bg4Wt38B_ISSR2mEO2MB']}})
# for querying only AskExtension source
# query_nested['bool']['filter'   ].append({'match'   : {'source': 'askExtension'}})
# for querying only IPM sources
# query_nested['bool']['must_not' ].append({'match'   : {'source': 'askExtension'}})

response = await config.es_client.search(
    index   = index         ,
    query   = query_nested  ,
    size    = 10            ,
    _source = source_query
)

hits = []

for h1 in response['hits']['hits']:
    top_scores = []
    for h2 in h1['inner_hits']['nested']['hits']['hits']:
        top_scores.append({'score': h2['_score'], 'source': h2['_source']})
    h1['_source']['top_scores'] = top_scores
    h1['_source']['_id']        = h1['_id']
    h1['_source']['_score']     = h1['_score']
    hits.append(h1['_source'])
    
    
for item in hits[:1]:
    for k, v in item.items():
        print(f'{k:<20}: {v[:100] if isinstance(v, str) else v}...')        
        
    

INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.103s]
damage              : ...
development         : ...
identification      : ...
management          : ...
name                : Oak Tree Growth....
description         : Oak Tree Growth. There is a bright yellow sponge-like growth on the base and side of one of my Oak t...
links               : [{'src': '', 'link': '', 'type': 'answer', 'title': 'Some wood decay pathogens are evident as growths on tree trunks. What you described sounds like it c'}]...
source              : askExtension...
url                 : https://ask2.extension.org/kb/faq.php?id=283702...
top_scores          : [{'score': 1.6670567, 'source': {'name': 'description_0'}}, {'score': 1.6299995, 'source': {'name': 'name_0'}}, {'score': 1.012282, 'source': {'name': 'identification_0'}}]...
_id                 : tuz-438BaSIrCT2xavoc...
_score              : 1.6670567...
damage              : ...
development         : ...
identificatio

## Debug cell

The flow of ES query is as follows:

Simple query against every possible field:
```python
(hits, hits_slots) = await _handle_es_query(question, slots)
```

Get response for chat from ES query:
```python
res = _get_text(hits)
```

### Debug functions

In [3]:
async def _cos_sim_query(
    index           : str               ,
    query_vector    : np.ndarray        ,
    query_links     : bool      = False ,           
    filter_ids      : List[str] = None  ,
    # filter_ae       : bool      = False
    ) -> dict:
    '''Exectute vector search in ES based on cosine similarity.

    Args:
        index           (str)       : Name of the index.
        query_vector    (np.ndarray): Query vector.
        query_link      (bool)      : True if querying against links. Defaults to False.
        filter_ids      (List[str]) : Filter results based on the IDs given. Defaults to None.
        filter_ae       (bool)      : Filter in results only for AskExtension. Defaults to False.

    Returns:
        dict: Return hits.
    '''
    vector_name     = 'vectors.vector'
    source_nested   = ['vectors.name']
    if query_links:
        vector_name = 'vectors_links.vector'
        source_nested   = ['vectors_links.order']
        
    cos     = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
    script  = {"source": cos, "params": {"query_vector": query_vector}}
    
    source_query = {'includes': [
        'source', 'url', 'name', 'description', 'identification', 
        'development', 'damage', 'management', 'links'
    ]}

    path = vector_name.split('.')[0]
    query = {"bool": {
        "must": {"nested": {
                    "score_mode": "max" ,
                    "path"      : path  ,
                    "inner_hits": {"size": 3, "name": "nested", "_source": source_nested},
                    "query"     : {"function_score": {"script_score": {"script": script}}}}
        },
        "filter"    : [],
        "must_not"  : []
    }}

    if filter_ids is not None:
        query['bool']['filter'  ].append({'ids'     : {'values': filter_ids     }})
    # if filter_ae:    
    #     query['bool']['filter'  ].append({'match'   : {'source': 'askExtension' }})
    # else:
    #     query['bool']['must_not'].append({'match'   : {'source': 'askExtension' }})


    response = await config.es_client.search(
        index   = index                 ,
        query   = query                 ,
        size    = config.es_search_size ,
        _source = source_query
    )

    hits = []

    for h1 in response['hits']['hits']:
        top_scores = []

        for h2 in h1['inner_hits']['nested']['hits']['hits']:
            top_scores.append({'score': h2['_score'] - 1, 'source': h2['_source']})
        
        h1['_source']['top_scores'  ] = top_scores
        h1['_source']['_id'         ] = h1['_id'    ]
        h1['_source']['_score'      ] = h1['_score' ] - 1
        
        hits.append(h1['_source'])

    return hits


In [4]:
async def _handle_es_query(
    query       : str               ,
    filter_ids  : List[str] = None  ,
    ) -> list:
    '''Perform search in ES base.

    Args:
        query       (str)       : Query statement.
        filter_ids  (List[str]) : IDs of docs that should be considered. Defaults to None.

    Returns:
        list: return list of hits. 
    '''    
    
    query_vector = config.embed([query]).numpy()[0]
    
    hits = await _cos_sim_query(
        index           = config.es_combined_index  ,
        query_vector    = query_vector              ,
        filter_ids      = filter_ids
    )

    return hits

hits = await _handle_es_query(question)


INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.154s]


In [5]:
def _handle_es_result(hits: list) -> Tuple[list, list]:
    '''Process the ES query results (like filtering, reweighting, etc).

    Args:
        hits (list): Results from ES query.

    Returns:
        Tuple[list, list]: filtered and processed ES query results
    '''

    for h in hits: 
        if h['source'] == 'askExtension': 
            h['_score'] *= config.es_ask_weight
    
    hits = [h for h in hits if h['_score'] > config.es_cut_off]
    filter_ids = [h['_id'] for h in hits]

    return hits, filter_ids

hits, filter_ids = _handle_es_result(hits)

In [7]:
def _format_result(
    index           = None,
    source           = None,
    score           = None,
    url             = None,
    name            = None,
    description     = None,
    damage          = None,
    identification  = None,
    development     = None,
    management      = None,
    ) -> dict:

    res = {}
    res['title'] = (
        f'<p>{index+1})<em>{name}</a></em>'
        f'</br>(score: {score:.2f})</br>'
        f'(source: <a href="{url}" target="_blank">{source}</a>)</p>')
    res['description'] = ''
    if description:
        res['description'] += (f'<p><strong>Details</strong>: {description[:100]}</p></br>'             )
    if damage:
        res['description'] += (f'<p><strong>Damage</strong>: {damage[:100]}</p></br>'                   )
    if identification:
        res['description'] += (f'<p><strong>Identification</strong>: {identification[:100]}</p></br>'   )
    if development:
        res['description'] += (f'<p><strong>Development</strong>: {development[:100]}</p></br>'         )
    if management:
        res['description'] += (f'<p><strong>Management</strong>: {management[:100]}</p></br>'           )
    
    return res

def _get_text(hits: dict) -> dict:
    '''Process results for output.

    Args:
        hits (dict): Sorted results from ES query.
        
    Returns:
        dict: Data for chatbot to return.
    '''

    top_n = config.es_top_n
    if len(hits) < config.es_top_n:
        top_n = len(hits)

    res = {
        'text'      : f'Top {top_n} results from data sources:',
        'payload'   : 'collapsible',
        'data'      : []
    }

    if len(hits):
        '''
        Fields:
        "source"
        "url"
        "name"
        "description"
        "identification"
        "development"
        "damage"
        "management"
        '''
            
        for i, h in enumerate(hits[:top_n]):
            score           = h.get('_score'        , 0.0   )
            source          = h.get('_source'       , None  )
            url             = h.get('url'           , None  )
            name            = h.get('name'          , None  )
            description     = h.get('description'   , None  )
            identification  = h.get('identification', None  )
            development     = h.get('development'   , None  )
            damage          = h.get('damage'        , None  )
            management      = h.get('management'    , None  )
        
            res['data'].append(
                _format_result(
                    index           = i             ,
                    source          = source        ,
                    score           = score         ,
                    url             = url           ,
                    name            = name          ,
                    description     = description   ,
                    identification  = identification,
                    development     = development   ,
                    damage          = damage        ,
                    management      = management
                )
            )   
    return res

_get_text(hits)

{'text': 'Top 3 results from data sources:',
 'payload': 'collapsible',
 'data': [{'title': '<p>1)<em>Pearleaf blister mite</a></em></br>(score: 0.63)</br>(source: <a href="http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/PESTS/pearlfblmite.html?src=exchbt" target="_blank">None</a>)</p>',
   'description': '<p><strong>Details</strong>: This eriophyid mite (family Eriophyidae) feeds on pears, causing discolored fruit and leaves, distor</p></br><p><strong>Damage</strong>: Mites feeding during winter can cause infested buds to blacken, become dry, and drop before spring. </p></br><p><strong>Identification</strong>: Because eriophyids are microscopic, plant injury from their feeding as described below under "Damage</p></br><p><strong>Development</strong>: Pearleaf blister mite develops through four life stages: egg, protonymph, deutonymph, and adult. The</p></br><p><strong>Management</strong>: In residential fruit trees, natural enemies may keep pearleaf blister mites under adequate biologica</p></br

In [8]:
question    = (
    'We have small (5mm) reddish brown beetles (species unknown) eating our salvia and basil leaves at night. '
    'Is there a safe control such as a powder, spray or oil that is effective at discouraging this pest?')

slots       = (
    'beetles salvia basil leaves pest'
)

filter_ids = None

hits = await _handle_es_query(question, filter_ids = filter_ids)
hits, filter_ids = _handle_es_result(hits)
print(f'Number of filtered IDs for main query: {len(hits)}')
if slots:
    hits = await _handle_es_query(slots, filter_ids = filter_ids)
    hits, _ = _handle_es_result(hits)
    print(f'Number of filtered IDs for slots: {len(hits)}')


res = _get_text(hits)

INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.159s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.125s]


In [9]:
res

{'text': 'Top 3 results from data sources:',
 'payload': 'collapsible',
 'data': [{'title': '<p>1)<em>Elm Leaf Beetle</a></em></br>(score: 0.60)</br>(source: <a href="http://ipm.ucanr.edu/PMG/PESTNOTES/pn7403.html?src=exchbt" target="_blank">None</a>)</p>',
   'description': '<p><strong>Details</strong>: The elm leaf beetle, Xanthogaleruca (=Pyrrhalta) luteola, is a leaf-chewing pest of elm trees, espec</p></br><p><strong>Damage</strong>: Adults chew entirely through the leaf, often in a shothole pattern. Larvae skeletonize the leaf surf</p></br><p><strong>Development</strong>: Adults are 1/4 inch long, olive-green beetles with black, longitudinal stripes along the margin and </p></br><p><strong>Management</strong>: It is essential to correctly identify the cause of damaged elm leaves before taking management actio</p></br>'},
  {'title': '<p>2)<em>Leaf beetles</a></em></br>(score: 0.57)</br>(source: <a href="http://ipm.ucanr.edu/PMG/GARDEN/PLANTS/INVERT/leafbeetle.html?src=exchbt" tar

In [10]:
hits

[{'damage': 'Adults chew entirely through the leaf, often in a shothole pattern. Larvae skeletonize the leaf surface, causing damaged foliage to turn brown to whitish. Elm leaf beetles, when abundant, can entirely defoliate large elm trees, which eliminates summer shade and reduces the aesthetic value of trees. Repeated, extensive defoliation weakens elms, causing trees to decline. However, the elm leaf beetle has not been a significant, widespread problem in California since the 1990s.',
  'development': 'Adults are 1/4 inch long, olive-green beetles with black, longitudinal stripes along the margin and center of the back. Females lay yellowish eggs in double rows of about 5 to 25 on the underside of leaves. Eggs become grayish before hatching. Larvae resemble caterpillars and are black when newly hatched and shortly after molting (shedding the old skin). After feeding, larvae become yellowish to green with rows of tiny dark tubercles (projections). Third-instar larvae grow up to 1/3 