# ES playground

## Testing ES - playing with functions to handle requests from ChatBot

In [1]:
import html
import json
import os
import sys
from typing import Tuple

import numpy as np

# Put the parent directory on the module search path (presumably where the
# project's `config` module lives - TODO confirm).
sys.path.insert(1, os.path.realpath(os.path.pardir))

# Environment for this playground session. These assignments overwrite any
# value already present in the process environment and must run before
# `import config` below (the config log output echoes these values).
# NOTE(review): credentials are hardcoded here; acceptable only for a local
# dev cluster - do not reuse this cell against a shared environment.
os.environ['STAGE']             = 'dev'
os.environ['ES_USERNAME']       = 'elastic'
os.environ['ES_PASSWORD']       = 'changeme'
os.environ['ES_HOST']           = 'http://localhost:9200/'
os.environ['ES_IMITATE']        = 'false'
os.environ['ES_SEARCH_SIZE']    = '10'
os.environ['ES_CUT_OFF']        = '0.4'

# Imported after the environment is prepared, see the note above.
import config

INFO:config:----------------------------------------------
INFO:config:Environment variables for DEV environment
INFO:config:- debug           = False
INFO:config:- es_search_size  = 10
INFO:config:- es_cut_off      = 0.4
INFO:config:- es_top_n        = 3
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                    = http://localhost:9200/
INFO:config:- username                = elastic
INFO:config:- password                = changeme
INFO:config:- tfhub_embedding_url     = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir         = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- askextension index      = askextension
INFO:config:- combined index          = combined
INFO:config:-

### Querying for single field

In [2]:
question    = (
    'Recently planted peach, cherry , apple, olive and hazelnut. '
    'Concerned about freezing temps in coming week. They have not leafed and '
    'buds are still tight. Is there a way to protect them during this upcoming freeze?')
index           = 'problem'
vector_name     = 'identification_vector'
source_query    = {"includes": ["source", "url", "name", "description", "identification", "development", "damage", "management"]}

query_vector = config.embed([question]).numpy()[0]
   
cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
script =  {"source": cos, "params": {"query_vector": query_vector}}


query = {"script_score": {"query" : {"match_all": {}}, "script" : script}}

response = await config.es_client.search(
    index   = index         ,
    query   = query         ,
    size    = 1000            ,
    _source = source_query
)

hits = response['hits']['hits']

for item in hits:
    print(f'ID: {item["_id"]}, score: {item["_score"]:.3f}')
    print(json.dumps(item['_source'], indent = 2))
    print('----------------------------------------------')

INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.004s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:3.871s]
ID: jniJjX8B6BZPpW0jXykf, score: 1.369
{
  "damage": "",
  "development": "",
  "identification": "Temperatures below about 60\u00b0F during flowering may prevent pollination of some simple, one-seeded fruits, resulting in distorted, \"catfaced\" berries. The damage appears identical to the catfacing caused by lygus bugs, but lygus bug damage occurs in late spring and early summer and low-temperature injury symptoms develop in early to mid-spring.",
  "management": "Sprinkler irrigation used when temperatures are expected to be below freezing can help minimize frost injury. ",
  "name": "Low temperature injury to strawberries",
  "description": "Low temperatures can kill flowers or cause the development of misshapen fruit, depending on the severity of chilling and the stage of flower development when it occurs. Cold, dry wea

### Querying for nested fields

In [3]:
question    = (
    'Recently planted peach, cherry , apple, olive and hazelnut. '
    'Concerned about freezing temps in coming week. They have not leafed and '
    'buds are still tight. Is there a way to protect them during this upcoming freeze?')

index           = 'problem'
vector_name     = 'links.title_vector'
source_query    = {"includes": ["source", "url", "name", "description", "identification", "development", "damage", "management"]}
source_nested   = ['links.type', 'links.title', 'links.src']

query_vector = config.embed([question]).numpy()[0]

cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
script =  {"source": cos, "params": {"query_vector": query_vector}}

path = vector_name.split('.')[0]
query_nested = {
    "nested": {
        "score_mode": "max" ,
        "path"      : path  ,
        "inner_hits": {"size": 1, "name": "nested", "_source": source_nested},
        "query"     : {"function_score": {"script_score": {"script": script}}}
    }
}

response = await config.es_client.search(
    index   = index         ,
    query   = query_nested  ,
    size    = 10            ,
    _source = source_query
)

hits = response['hits']['hits']

for item in hits:
    nested = item['inner_hits']['nested']['hits']['hits'][0]['_source']
    item['_source'][path] = item['inner_hits']['nested']['hits']['hits'][0]['_source']

for item in hits[:3]:
    print(f'ID: {item["_id"]}, score: {item["_score"]:.3f}')
    print(json.dumps(item['_source'], indent = 2))
    print('----------------------------------------------')

INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.155s]
ID: YniJjX8B6BZPpW0jXykf, score: 1.433
{
  "damage": "",
  "development": "",
  "identification": "",
  "management": "To reduce the likelihood of frost injury, conserve heat by keeping the vegetation under trees mowed short, so that the soil surface is firm and moistened. If overhead sprinklers are available, use them on stone fruits to prevent frost injury when low temperatures occur during bloom. Trees can be protected from frost injury by covering them with floating row cover or using protective shelters if freezing conditions occur during bloom or early fruit growth. These covers should be prevented from touching the tree. ",
  "name": "Frost injury",
  "description": "Mild frost may produce shoot dieback and browning of the seed coat. Young fruit may drop. ",
  "source": "damagesEnvironment",
  "url": "http://ipm.ucanr.edu/PMG/GARDEN/FRUIT/ENVIRON/frostinjury.html?src=exchbt",
  "links": {
 

### Debug cell

The flow of ES query is as follows:

Simple query against every possible field:
```python
(
    problem_hits    , 
    information_hits, 
    ask_hits
) = await _handle_es_query(
    question
)
```

The per-field results are then merged, de-duplicated and filtered by the score cut-off:
```python
hits = _handle_es_result(
    problem_hits    , 
    information_hits, 
    ask_hits
)
```

Some weighting done on scores:
```python
hits = _weight_score(hits)
```

Format the results for the chatbot:
```python
_get_text(hits)
```

In [4]:
async def _cos_sim_query(
    index           : str           ,
    vector_name     : str           ,
    query_vector    : np.ndarray    ,
    source_query    : dict          ,
    nested          : bool = False  ,
    source_nested   : dict = None
    ) -> dict:
    '''Exectute vector search in ES based on cosine similarity.

    Args:
        index           (str)       : Name of the index.
        vector_name     (str)       : Field vector to be compared against query vector.
        query_vector    (np.ndarray): Query vector.
        source_query    (dict)      : Fields to include in result hits.
        nested          (bool)      : Indication if the query involves nested fields.
        source_nested   (dict)      : Fields to include in 

    Returns:
        dict: Return hits.
    '''    
    cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'
    script =  {"source": cos, "params": {"query_vector": query_vector}}


    
    # Query script for single field (without nesting).
    query = {"script_score": {"query" : {"match_all": {}}, "script" : script}}
    
    # Query script for nested fields (path indicates the nested field).
    path = vector_name.split('.')[0]
    query_nested = {
        "nested": {
            "score_mode": "max" ,
            "path"      : path  ,
            "inner_hits": {"size": 1, "name": "nested", "_source": source_nested},
            "query"     : {"function_score": {"script_score": {"script": script}}}
        }
    }

    if not nested:
        response = await config.es_client.search(
            index   = index         ,
            query   = query         ,
            size    = 100            ,
            _source = source_query
        )
    else:
        response = await config.es_client.search(
            index   = index         ,
            query   = query_nested  ,
            size    = 100           ,
            _source = source_query
        )

    hits = response['hits']['hits']
    
    # Add max. scored nested field as a field to main item.
    if nested:
        for item in hits:
            nested = item['inner_hits']['nested']['hits']['hits'][0]['_source']
            item['_source'][path] = item['inner_hits']['nested']['hits']['hits'][0]['_source']  
    
    return hits


In [5]:
async def _handle_es_query(
    question    : str
    ) -> Tuple[dict, dict, dict]:
    '''Perform search in ES base.

    Args:
        question    (str) : Question.

    Returns:
        Tuple[dict, dict, dict]: return tuples for problems, information and askextension matches. 
    '''    
    
    query_vector = config.embed([question]).numpy()[0]
   
    problem_hits        = {}
    information_hits    = {}
    ask_hits            = {}
    
    # ----------------------------------------------- Problem index
    index_config = {
        "index" : "problem",
        "sq"    : {"includes": ["source", "url", "name", "description", "identification", "development", "damage", "management"]},
        "cols"  : ['name', 'description', 'identification', 'development', 'damage', 'management'],
        "nested": [{"name": "links.title", "sq_nested": ['links.type', 'links.title', 'links.src']}]
    }

    for c in index_config['cols']:
        problem_hits[c] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = c + '_vector'             ,
            query_vector    = query_vector              ,
            source_query    = index_config['sq']
        )
    
    for nested in index_config['nested']:
        problem_hits[nested['name']] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = nested['name'] + '_vector',   
            query_vector    = query_vector              , 
            source_query    = index_config['sq']        ,
            nested          = True                      ,
            source_nested   = nested['sq_nested']
        )

    # ----------------------------------------------- Information index
    index_config = {
        "index" : "information",
        "sq"    : {"includes": ["source", "url", "name", "description", "management"]},
        "cols"  : ['name', 'description', 'management'],
        "nested": [
            {"name": "links.title"      , "sq_nested": ['links.type', 'links.title', 'links.src']},
            {"name": "problems.title"   , "sq_nested": ['problems.title', 'problems.src'        ]},
        ]
    }

    for c in index_config['cols']:
        information_hits[c] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = c + '_vector'             ,
            query_vector    = query_vector              ,
            source_query    = index_config['sq']
        )
    
    for nested in index_config['nested']:
        information_hits[nested['name']] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = nested['name'] + '_vector',   
            query_vector    = query_vector              , 
            source_query    = index_config['sq']        ,
            nested          = True                      ,
            source_nested   = nested['sq_nested']
        )

    # ----------------------------------------------- AskExtension index
    index_config = {
        "index" : "askextension",
        "sq"    : {"includes": ["ticket_no", "url", "created", "title", "question"]},
        "cols"  : ['title_question'],
        "nested": [
            {"name": "tags.tag"         , "sq_nested": ['tags.tag'          ]},
            {"name": "answer.response"  , "sq_nested": ['answer.response'   ]},
        ]
    }
    for c in index_config['cols']:
        ask_hits[c] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = c + '_vector'             ,
            query_vector    = query_vector              ,
            source_query    = index_config['sq']
        )
    
    for nested in index_config['nested']:
        ask_hits[nested['name']] = await _cos_sim_query(
            index           = index_config['index']     ,
            vector_name     = nested['name'] + '_vector',   
            query_vector    = query_vector              , 
            source_query    = index_config['sq']        ,
            nested          = True                      ,
            source_nested   = nested['sq_nested']
        )
    
    return (problem_hits, information_hits, ask_hits)
# Run every vector search for the notebook-level `question` set in an earlier cell.
(problem_hits, information_hits, ask_hits) = await _handle_es_query(question)


INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.287s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.301s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.314s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.335s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.379s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.303s]
INFO:elasticsearch:POST http://localhost:9200/problem/_search [status:200 request:0.480s]
INFO:elasticsearch:POST http://localhost:9200/information/_search [status:200 request:1.055s]
INFO:elasticsearch:POST http://localhost:9200/information/_search [status:200 request:0.948s]
INFO:elasticsearch:POST http://localhost:9200/information/_search [status:200 request:0.839s]
INFO:elasticsearch:POST http://localhost:9200/information/_search [status:200 request:1.

In [6]:
def _handle_es_result(
    problem_hits    : dict,
    information_hits: dict,
    ask_hits        : dict
    ) -> Tuple[dict, dict]:
    '''Merge different sources into single source.

    Args:
        problem_hits        (dict): Results from problems data.
        information_hits    (dict): Results from information data.
        ask_hits            (dict): Results from ask extension data.

    Returns:
        dict: Merged and cut off results.
    '''

    # --------------------------------------- Problem results
    hits = []
    
    cols = ['name', 'description', 'identification', 'development', 'damage', 'management']
    for col in cols:
        for h1 in problem_hits[col]:

            h1['_score_max'] = h1.get('_score', 0.0)
            duplicate = False
            
            for h2 in hits:
                if h1['_id'] == h2['_id']:
                    h2['_score_max'] = max(h2.get('_score_max', 0.0), h1['_score'])

                    duplicate = True
            
            if not duplicate:
                hits.append(h1)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score_max'], reverse = True)

    hits_problem = hits

    # --------------------------------------- Information results
    hits = []
    
    cols = ['name', 'description', 'management']
    for col in cols:
        for h1 in information_hits[col]:

            h1['_score_max'] = h1.get('_score', 0.0)
            duplicate = False
            
            for h2 in hits:
                if h1['_id'] == h2['_id']:
                    h2['_score_max'] = max(h2.get('_score_max', 0.0), h1['_score'])

                    duplicate = True
            
            if not duplicate:
                hits.append(h1)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score_max'], reverse = True)
    
    hits_information = hits

    # --------------------------------------- Ask Extension results
    hits = []
    
    cols = ['title_question']
    for col in cols:
        for h1 in ask_hits[col]:

            h1['_score_max'] = h1.get('_score', 0.0)
            duplicate = False
            
            for h2 in hits:
                if h1['_id'] == h2['_id']:
                    h2['_score_max'] = max(h2.get('_score_max', 0.0), h1['_score'])

                    duplicate = True
            
            if not duplicate:
                hits.append(h1)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score_max'], reverse = True)
    
    hits_ask = hits

    # --------------------------------------- Score cut off filter and merge

    hits = []

    for h in hits_problem + hits_information + hits_ask:
        if h['_score_max'] > config.es_cut_off + 1:
            hits.append(h)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score_max'], reverse = True)
    
    return hits
# Merge the per-index results into one list filtered by the score cut off.
hits = _handle_es_result(problem_hits, information_hits, ask_hits)

In [7]:
def _weight_score(
    hits: dict
    ) -> dict:
    '''Weight and merge scores.

    Args:
        hits (dict): Sorted results.
        
    Returns:
        dict: Sorted data with new scores.
    '''

    '''
    TO BE IMPLEMENTED
    '''
    return hits
# Apply score weighting (the function currently returns its input unchanged).
hits = _weight_score(hits)

In [8]:
def _format_result(
    index           = None,
    group           = None,
    score           = None,
    url             = None,
    title           = None,
    description     = None,
    damage          = None,
    identification  = None,
    development     = None,
    management      = None,
    ) -> dict:

    res = {}
    res['title'] = (
        f'<p>{index+1})<em>{title}</a></em>'
        f'</br>(score: {score:.2f})</br>'
        f'(source: <a href="{url}" target="_blank">{group}</a></p>')
    res['description'] = ''
    if description:
        res['description'] += (f'<p><strong>Details</strong>: {description[:100]}</p></br>'     )
    if damage:
        res['description'] += (f'<p><strong>Damage</strong>: {damage[:100]}</p></br>'          )
    if identification:
        res['description'] += (f'<p><strong>Identification</strong>: {identification[:100]}</p></br>'  )
    if development:
        res['description'] += (f'<p><strong>Development</strong>: {development[:100]}</p></br>'      )
    if management:
        res['description'] += (f'<p><strong>Management</strong>: {management[:100]}</p></br>'      )
    
    return res

def _get_text(
    hits    : dict
    ) -> dict:
    '''Print results.

    Args:
        hits (dict): Sorted results from ES query.
        
    Returns:
        dict: Data for chatbot to return.
    '''

    top_n = config.es_top_n
    if len(hits) < config.es_top_n:
        top_n = len(hits)

    res = {
        'text'      : f'Top {top_n} results from data sources:',
        'payload'   : 'collapsible',
        'data'      : []
    }

    if len(hits):
        '''
        Fields:
        "source"
        "url"
        "name"
        "description"
        "identification"
        "development"
        "damage"
        "management"
        "ticket_no"
        "url""
        "created"
        "title"
        "question"
        
        '''
            
        for i, h in enumerate(hits[:top_n]):
            score   = h.get('_score_max', 0.0   )
            group   = h.get('_index'    , None  )
            source  = h.get('_source'           )

            url             = source.get('url'              , None)
            name            = source.get('name'             , None)
            description     = source.get('description'      , None)
            identification  = source.get('identification'   , None)
            development     = source.get('development'      , None)
            damage          = source.get('damage'           , None)
            management      = source.get('management'       , None)
        
            # ticket_no       = source.get("ticket_no")
            # created         = source.get("created")
            if source.get('title', None):
                name        = source.get('title'            , None)
                description = source.get('question'         , None)

            res['data'].append(
                _format_result(
                    index           = i             ,
                    group           = group         ,
                    score           = score         ,
                    url             = url           ,
                    title           = name          ,
                    description     = description   ,
                    identification  = identification,
                    development     = development   ,
                    damage          = damage        ,
                    management      = management
                )
            )   
    return res
# Display the chatbot payload built from the ranked hits.
_get_text(hits)

{'text': 'Top 3 results from data sources:',
 'payload': 'collapsible',
 'data': [{'title': '<p>1)<em>grape vine question.</a></em></br>(score: 1.53)</br>(source: <a href="https://ask2.extension.org/kb/faq.php?id=312965" target="_blank">askextension</a></p>',
   'description': "<p><strong>Details</strong>: I purchased two grapevines and two blueberry plants last fall and planted them in my yard. I wasn't </p></br>"},
  {'title': '<p>2)<em>How do we ripen pears picked off our tree?</a></em></br>(score: 1.52)</br>(source: <a href="https://ask2.extension.org/kb/faq.php?id=489260" target="_blank">askextension</a></p>',
   'description': '<p><strong>Details</strong>: My wife planted a pear tree 15 years ago. This year it suddenly has produced literally hundreds of p</p></br>'},
  {'title': '<p>3)<em>Rasberry Blooms bu no fruit.</a></em></br>(score: 1.51)</br>(source: <a href="https://ask2.extension.org/kb/faq.php?id=466890" target="_blank">askextension</a></p>',
   'description': '<p><stron

In [None]:
# End-to-end run: query ES, merge and filter the hits, weight the scores and
# build the chatbot payload for the notebook-level `question`.
(problem_hits, information_hits, ask_hits) = await _handle_es_query(question)
hits = _handle_es_result(problem_hits, information_hits, ask_hits)
hits = _weight_score(hits)
_get_text(hits)