# ES playground

In [1]:
import os
import sys

import pandas as pd
import numpy as np

from typing import Dict, List, Tuple

sys.path.insert(1, os.path.realpath(os.path.pardir))

os.environ['STAGE']         = 'dev'
os.environ['ES_USERNAME']   = 'elastic'
os.environ['ES_PASSWORD']   = 'changeme'
os.environ['ES_HOST']       = 'http://localhost:9200/'
os.environ['ES_IMITATE']    = 'false'


import config

INFO:config:----------------------------------------------
INFO:config:Environment variables are for DEV environment
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch configuration:
INFO:config:- host                     = http://localhost:9200/
INFO:config:- username                 = elastic
INFO:config:- password                 = changeme
INFO:config:- tfhub_embedding_url      = https://tfhub.dev/google/universal-sentence-encoder/4
INFO:config:- tfhub_cache_dir          = /var/tmp/tfhub_modules
INFO:config:----------------------------------------------
INFO:config:----------------------------------------------
INFO:config:Elasticsearch indexes:
INFO:config:- askextension index       = askextension
INFO:config:- combined index           = combined
INFO:config:----------------------------------------------
INFO:config:Initializing the Elasticsearch client
INFO:config:Done initiliazing Elasti

## Testing ES - inserting simple text data

In [2]:
# for some reason he uses these files for injecting

DATA_FILE_NAMES = [
    # "askextension_transformed.json",
    "pestDiseaseItems_new.json",
    "turfPests.json",
    "weedItems.json",
    "exoticPests.json",
    "ipmdata_new.json",
]
DATA_FILE_NAMES = [config.PATH_DATA_UCIPM + f for f in DATA_FILE_NAMES]
DATA_FILE_NAMES.append(config.ASKEXTENSION_FILE_RESULT)

In [3]:
rename_data = {
    'ipmdata_new.json': {
        "name"                  : "name",
        "urlPestNote"           : "urlPestNote",
        "descriptionPestNote"   : "descriptionPestNote",
        "life_cycle"            : "life_cyclePestNote",
        "damagePestNote"        : "damagePestNote",
        "managementPestNote"    : "managementPestNote",
        "imagePestNote"         : "imagePestNote",
        "urlQuickTip"           : "urlQuickTipPestNote",
        "contentQuickTips"      : "contentQuickTipsPestNote",
        "imageQuickTips"        : "imageQuickTipsPestNote",
        "video"                 : "videoPestNote"
    },
    'pestDiseaseItems_new.json': {
        "name"              : "name",
        "url"               : "urlPestDiseaseItems",
        "description"       : "descriptionPestDiseaseItems",
        "identification"    : "identificationPestDiseaseItems",
        "life_cycle"        : "life_cyclePestDiseaseItems",
        "damage"            : "damagePestDiseaseItems",
        "solutions"         : "solutionsPestDiseaseItems",
        "images"            : "imagesPestDiseaseItems",
    },
    'turfPests.json': {
        "name"  : "name",
        "url"   : "urlTurfPests",
        "text"  : "textTurfPests",
        "images": "imagesTurfPests",
    },
    'weedItems.json': {
        "name"          : "name",
        "url"           : "urlWeedItems",
        "description"   : "descriptionWeedItems",
        "images"        : "imagesWeedItems",
    },
    'exoticPests.json': {
        "name"          : "name",
        "url"           : "urlExoticPests",
        "description"   : "descriptionExoticPests",
        "damage"        : "damageExoticPests",
        "identification": "identificationExoticPests",
        "life_cycle"    : "life_cycleExoticPests",
        "monitoring"    : "monitoringExoticPests",
        "management"    : "managementExoticPests",
        "related_links" : "related_linksExoticPests",
        "images"        : "imagesExoticPests", 
    },
    'askextension_transformed.json': {
        "faq-id"        : "ask_faq_id",
        "ticket-no"     : "ask_ticket_no",
        "url"           : "ask_url",
        "title"         : "ask_title",
        "title-question": "ask_title_question",
        "created"       : "ask_created",
        "updated"       : "ask_updated",
        "state"         : "ask_state",
        "county"        : "ask_county",
        "question"      : "ask_question",
        "answer"        : "ask_answer",
    }
}

In [4]:
df_docs_json = {}

for f in DATA_FILE_NAMES:
    # print(f)
    df = pd.read_json(f)
    # print(df.columns)

    if 'name' in df.columns:
        # for some reason he drops the duplicates from these files
        b_s = df.shape[0]
        df = df.drop_duplicates('name')
        a_s = df.shape[0]
        dropped = b_s - a_s
        if dropped > 0:
            print(f"Dropped {dropped} with same 'name' from {f}")
    
    # we then rename the columns to signal where that columns is from
    f_name = f.split('/')[-1]
    
    if f_name in rename_data:
        df = df.rename(columns = rename_data[f_name])
        df = df[rename_data[f_name].values()]
    df_docs_json[f] = df

# we then concatenate all the data
df_docs = pd.concat([df_docs_json[k] for k in df_docs_json.keys()], ignore_index=True)

# we rename the index by 'doc_id'
df_docs.index = df_docs.index.set_names('doc_id')
df_docs.index = df_docs.index.map(str)
df_docs = df_docs.reset_index()
df_docs['ask_faq_id'] = df_docs['ask_faq_id'].map(str)
df_docs['ask_ticket_no'] = df_docs['ask_ticket_no'].map(str)
# we then replace nans and fill nested fields
df_docs = df_docs.fillna('')

columnsNested = ['imagePestNote', 'imageQuickTipsPestNote', 'videoPestNote', 
    'imagesPestDiseaseItems', 'imagesTurfPests', 'imagesWeedItems', 
    'related_linksExoticPests', 'imagesExoticPests', 'ask_answer']

for c in columnsNested:
    df_docs[c] = [[] if x == '' else x for x in df_docs[c]]



Dropped 39 with same 'name' from /home/biddy/Toptal/Eduworks/askchatbot/actions/es/data/uc-ipm/updated/pestDiseaseItems_new.json
Dropped 1 with same 'name' from /home/biddy/Toptal/Eduworks/askchatbot/actions/es/data/uc-ipm/updated/ipmdata_new.json


In [None]:
from elasticsearch.helpers import parallel_bulk
from collections import deque

df_json = df_docs.to_dict('records')
await config.es_client.indices.delete(index = config.es_combined_index, ignore = 404)
config.es_client.indices.create(index = config.es_combined_index, body=config.ES_COMBINED_MAPPING)        
deque(parallel_bulk(config.es_client, df_json, index = config.ES_COMBINED_MAPPING), maxlen = 0)

## Testing ES - inserting vectorized data

In [5]:
df_json = df_docs.to_dict('records')
# docs = docs[:10]

columnsVectorized = ['name', 'descriptionPestDiseaseItems', 'identificationPestDiseaseItems',
    'life_cyclePestDiseaseItems', 'damagePestDiseaseItems', 'solutionsPestDiseaseItems',
    'textTurfPests', 'descriptionWeedItems', 'descriptionExoticPests', 'damageExoticPests', 
    'identificationExoticPests', 'life_cycleExoticPests', 'monitoringExoticPests', 'managementExoticPests',
    'descriptionPestNote', 'life_cyclePestNote', 'damagePestNote', 'managementPestNote', 
    'contentQuickTipsPestNote', 'ask_title', 'ask_title_question', 'ask_question']

docs_vectors = {}

for c in columnsVectorized:
    c_list      = [d[c] for d in df_json]
    c_vectors   = config.embed(c_list).numpy()
    docs_vectors[c] = c_vectors

for i in range(len(df_json)):
    for c in columnsVectorized:
        df_json[i][c + '_vector'] = docs_vectors[c][i]


In [None]:
from elasticsearch.helpers import parallel_bulk
from collections import deque

config.es_client.indices.delete(index = config.ES_COMBINED_INDEX, ignore = 404)
config.es_client.indices.create(index = config.ES_COMBINED_INDEX, body=config.ES_COMBINED_VECTOR_MAPPING)        
deque(parallel_bulk(config.es_client, df_json, index = config.ES_COMBINED_INDEX), maxlen = 0)

INFO:elasticsearch:DELETE http://localhost:9200/combined [status:200 request:0.050s]
INFO:elasticsearch:PUT http://localhost:9200/combined [status:200 request:0.196s]


  config.es_client.indices.create(index = config.ES_COMBINED_INDEX, body=config.ES_COMBINED_VECTOR_MAPPING)


INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.989s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.692s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.641s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.263s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.405s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.512s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:0.508s]
INFO:elasticsearch:POST http://localhost:9200/combined/_bulk [status:200 request:3.621s]


deque([])

## Testing ES - playing with functions to handle requests from ChatBot

In [3]:
import es_query_vector as ev

In [4]:
QUESTION    = (
    'Recently planted peach, cherry , apple, olive and hazelnut. '
    'Concerned about freezing temps in coming week. They have not leafed and '
    'buds are still tight. Is there a way to protect them during this upcoming freeze?')
PEST_DAMAGE = 'it\'s looking sparse and leaves have brown and white spots on them'

In [8]:
'''
The flow of ES query is as follows:

Simple query against every possible field:
(
    es_ask_hits,
    es_name_hits,
    es_other_hits,
    es_damage_hits
) = await _handle_es_query(
    question,
    pest_damage
)

Some formatting fixed done on the results:
hits_ask, hits_ipm = await _handle_es_result(
    es_ask_hits,
    es_name_hits,
    es_other_hits,
    es_damage_hits
)

Some weighting done on scores:
hits_ask, hits_ipm = await _weight_score(
    hits_ask, hits_ipm
)

print the results:
_print_hits(hits_ask, 'Ask Extension'   )
_print_hits(hits_ipm, 'IPM Data'        )
'''

def _cos_sim_query(
    source_query: dict,
    query_vector: np.ndarray,
    vector_name : str     ,
    ) -> dict:
    '''Exectute vector search in ES based on cosine similarity.

    Args:
        source_query (dict)         : Fields to include in result hits. 
        query_vector (np.ndarray)   : Query vector.
        vector_name (str)           : Field vector to be compared against query vector.

    Returns:
        dict: Return hits.
    '''    
    cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'

    script_query = {
        "script_score": {
            "query" : {"match_all": {}},
            "script": {"source": cos, "params": {"query_vector": query_vector}}
        }
    }

    response = config.es_client.search(
        index   = config.es_combined_index,
        query   = script_query,
        size    = 10,
        _source = source_query
    )

    hits = response['hits']['hits']

    return hits

def _handle_es_query(
    question    : str,
    pest_damage : str
    ) -> Tuple[dict, dict, dict, dict]:
    '''Perform search in ES base.

    Args:
        question (str)      : Question.
        pest_damage (str)   : Pest damage description.

    Returns:
        Tuple[dict, dict, dict, dict]: return tuples for AE data matches, name matches, other sources matches, and damage matches. 
    '''    
    
    if pest_damage:
        question = '. '.join([question, pest_damage])    
    
    question_vector = config.embed([question]).numpy()[0]
   
    
    source_query = {
        "includes": [
            "doc_id"    ,
            "name"      ,

            "urlPestDiseaseItems"           ,
            "descriptionPestDiseaseItems"   ,
            "identificationPestDiseaseItems",
            "life_cyclePestDiseaseItems"    ,
            "damagePestDiseaseItems"        ,
            "solutionsPestDiseaseItems"     ,

            "urlTurfPests"                  ,
            "textTurfPests"                 ,

            "urlWeedItems"                  ,
            "descriptionWeedItems"          ,
            
            "urlExoticPests"                ,
            "descriptionExoticPests"        ,
            "damageExoticPests"             ,
            "identificationExoticPests"     ,
            "life_cycleExoticPests"         ,
            "monitoringExoticPests"         ,
            "managementExoticPests"         ,

            "urlPestNote"                   ,
            "urlQuickTipPestNote"           ,
            "descriptionPestNote"           ,
            "life_cyclePestNote"            ,
            "damagePestNote"                ,
            "managementPestNote"            ,
            "contentQuickTipsPestNote"      ,

            "ask_url"                       ,
            "ask_faq_id"                    ,
            "ask_title"                     ,
            "ask_title_question"            ,
            "ask_question"
        ]
    }

    es_ask_hits     = {}
    es_name_hits    = {}
    es_other_hits   = {}
    es_damage_hits  = {}
    # es_caption_hits = {}
    # es_video_hits   = {}

    es_name_hits['name'] = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'name_vector'
    )

    
    '''
    Pest Diseases Items

    "descriptionPestDiseaseItems_vector"
    "identificationPestDiseaseItems_vector"
    "life_cyclePestDiseaseItems_vector"
    "damagePestDiseaseItems_vector"
    "solutionsPestDiseaseItems_vector"

    "imagesPestDiseaseItems"    : "caption_vector"
    '''
    es_other_hits['pd_description']     = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'descriptionPestDiseaseItems_vector'
    )

    es_other_hits['pd_identification']  = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'identificationPestDiseaseItems_vector'
    )

    es_other_hits['pd_life_cycle']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'life_cyclePestDiseaseItems_vector'
    )

    es_damage_hits['pd_damage_hits'] = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'damagePestDiseaseItems_vector'
    )
    
    es_other_hits['pd_solutions']       = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'solutionsPestDiseaseItems_vector'
    )

    '''
    Turf Pests

    "textTurfPests_vector"
    "imagesTurfPests": "caption_vector"
    '''
    es_other_hits['tp_text']            = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'textTurfPests_vector'
    )

    '''
    Weed Items

    "descriptionWeedItems_vector"
    "imagesWeedItems": "caption_vector"
    '''
    es_other_hits['wi_text']            = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'descriptionWeedItems_vector'
    )

    '''
    Exotic Pests

    "descriptionExoticPests_vector"
    "damageExoticPests_vector"
    "identificationExoticPests_vector"
    "life_cycleExoticPests_vector"
    "monitoringExoticPests_vector"
    "managementExoticPests_vector"

    "related_linksExoticPests"  : "text_vector"
    "imagesExoticPests"         : "caption_vector"
    '''
    es_other_hits['ep_description']     = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'descriptionExoticPests_vector'
    )

    es_damage_hits['ep_damage']     = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'damageExoticPests_vector'
    )
    
    es_other_hits['ep_identification']  = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'identificationExoticPests_vector'
    )

    es_other_hits['ep_life_cycle']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'life_cycleExoticPests_vector'
    )

    es_other_hits['ep_monitoring']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'monitoringExoticPests_vector'
    )

    es_other_hits['ep_management']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'managementExoticPests_vector'
    )

    '''
    IPM Data

    "descriptionPestNote_vector"
    "life_cyclePestNote_vector"
    "damagePestNote_vector"
    "managementPestNote_vector"
    "contentQuickTipsPestNote_vector"

    "imageQuickTipsPestNote"    : "caption_vector"
    "imagePestNote"             : "caption_vector"
    "videoPestNote"             : "videoPestNote"
    '''
    es_other_hits['pn_description']     = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'descriptionPestNote_vector'
    )

    es_other_hits['pn_life_cycle']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'life_cyclePestNote_vector'
    )

    es_damage_hits['pn_damage'] = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'damagePestNote_vector'
    )

    es_other_hits['pn_management']      = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'managementPestNote_vector'
    )

    es_other_hits['pn_content_tips']    = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'contentQuickTipsPestNote_vector'
    )

    '''
    AskExtension data
    
    "ask_title_vector"
    "ask_title_question_vector"
    "ask_question_vector"

    "ask_answer" : "response_vector"
    '''

    es_ask_hits['ask_name_title']   = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'ask_title_vector'
    )

    es_ask_hits['ask_question'] = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'ask_title_question_vector'
    )

    es_ask_hits['ask_damage'] = _cos_sim_query(
        source_query    = source_query,
        query_vector    = question_vector,
        vector_name     = 'ask_title_question_vector'
    )

    return (es_ask_hits, es_name_hits, es_other_hits, es_damage_hits)

def _handle_es_result(
    es_ask_hits     : dict,
    es_name_hits    : dict,
    es_other_hits   : dict,
    es_damage_hits  : dict
    ) -> Tuple[dict, dict]:
    '''Merge different sources into single source.

    Args:
        es_ask_hits (dict)      : Results from Ask Extension data.
        es_name_hits (dict)     : Results from name vector comparison.
        es_other_hits (dict)    : Results from other fields.
        es_damage_hits (dict)   : Results from damage-related fields.

    Returns:
        Tuple[dict, dict]: Two dictionaries, for Ask Extension results and IPM data results.
    '''

    '''
    ask extension data
    
    es_ask_hits has following keys: ['ask_name_title', 'ask_question', 'ask_damage']
    '''
    hits = []

    for h1 in (es_ask_hits['ask_name_title'] + es_ask_hits['ask_question'] + es_ask_hits['ask_damage']):
        
        h1['_score_max'] = h1.get('_score', 0.0)
        duplicate = False
        
        for h2 in hits:
            if h1['_source']['doc_id'] == h2['_source']['doc_id']:
                h2['_score_max'] = max(h2.get('_score_max', 0.0), h1['_score'])
                duplicate = True
        
        if not duplicate:
            hits.append(h1)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score_max'], reverse = True)

    # new field - _score_max
    hits_ask = hits

    '''
    ipm data - es_name_hits
    
    es_name_hits has following keys: ['name']
    '''
    hits = es_name_hits['name']

    for h1 in hits:
        h1['_score_name']   = h1.get('_score', 0.0)

    
    '''
    ipm data - es_other_hits

    es_other_hits has following keys: 
        # pest diseases - ['pd_description', 'pd_identification', 'pd_life_cycle', 'pd_solutions'   ]
        # turf pests    - ['tp_text']
        # weed items    - ['wi_text']
        # exotic pests  - ['ep_description', 'ep_identification', 'ep_life_cycle', 'ep_monitoring'  , 'ep_management']
        # pest notes    - ['pn_description', 'pn_life_cycle'    , 'pn_management', 'pn_content_tips']
    
    '''
    for h1 in (
        es_other_hits['pd_description'      ] +
        es_other_hits['pd_identification'   ] +
        es_other_hits['pd_life_cycle'       ] +
        es_other_hits['pd_solutions'        ] + 
        es_other_hits['tp_text'             ] + 
        es_other_hits['wi_text'             ] +
        es_other_hits['ep_description'      ] +
        es_other_hits['ep_identification'   ] +
        es_other_hits['ep_life_cycle'       ] +
        es_other_hits['ep_monitoring'       ] +
        es_other_hits['ep_management'       ] +
        es_other_hits['pn_description'      ] +
        es_other_hits['pn_life_cycle'       ] +
        es_other_hits['pn_management'       ] +
        es_other_hits['pn_content_tips'     ]
        ):
        
        h1['_score_other'] = h1.get('_score', 0.0)
        duplicate = False

        for h2 in hits:
            if h1['_source']['doc_id'] == h2['_source']['doc_id']:
                h2['_score_other'] = max(h2.get('_score_other', 0.0), h1['_score'])
                duplicate = True
        
        if not duplicate:
            hits.append(h1)

    '''
    ipm data - es_damage_hits
    es_damage_hits has following keys: ['pd_damage_hits', 'ep_damage', 'pn_damage']
    '''
    
    for h1 in (
        es_damage_hits['pd_damage_hits' ] +
        es_damage_hits['ep_damage'      ] +
        es_damage_hits['pn_damage'      ]
        ):

        h1['_score_damage'] = h1.get('_score', 0.0)
        
        for h2 in hits:
            if h1['_source']['doc_id'] == h2['_source']['doc_id']:
                h2['_score_damage'] = max(h2.get('_score_damage', 0.0), h1['_score'])
                duplicate = True
        
        if not duplicate:
            hits.append(h1)
    
    for h in hits:
        h['_score_name'     ] = h.get('_score_name'   , 0.0)
        h['_score_other'    ] = h.get('_score_other'  , 0.0)
        h['_score_damage'   ] = h.get('_score_damage' , 0.0)

    if len(hits):
        hits = sorted(hits, key = lambda h: h['_score'], reverse = True)

    # new fields - _score_name, _score_other, _score_damage
    hits_ipm = hits

    return hits_ask, hits_ipm

def _weight_score(
    hits_ask: dict, 
    hits_ipm: dict
    ) -> Tuple[dict, dict]:
    '''Weight and merge scores.

    Args:
        hits_ask (dict): Results for Ask Extension data.
        hits_ipm (dict): Results for IPM data.

    Returns:
        Tuple[dict, dict]: Sorted data with new scores.
    '''    

    ########################################################################
    # For searches in the askextension data, we do not weigh. Already maxed.

    if len(hits_ask) > 0:
        hits_ask = sorted(hits_ask, key=lambda h: h['_score_max'], reverse=True)

    #######################################################################
    if len(hits_ipm) > 0:
        for hit in hits_ipm:
            score_name      = hit.get('_score_name'     , 0.0)
            score_other     = hit.get('_score_other'    , 0.0)
            score_damage    = hit.get('_score_damage'   , 0.0)

            w = [0.8, 0.05, 0.05]

            if score_damage < 1.0:
                w[0] += 0.5 * w[2]
                w[1] += 0.5 * w[2]
                w[2] = 0.0

            hit['_score_weighted'] = (
                w[0] * score_name + w[1] * score_other + w[2] * score_damage
            )

        # Sort to weighted score
        hits_ipm = sorted(hits_ipm, key=lambda h: h['_score_weighted'], reverse=True)

    # Do not filter on threshold. Leave this up to the caller
    return hits_ask, hits_ipm

def _print_hits(
    hits_ask: dict, 
    hits_ipm: dict, 
    ) -> None:
    '''Print results.

    Args:
        hits_ask (dict): Results from Ask Extension base.
        hits_ipm (dict): Results from IPM data.
    '''    

    if len(hits_ask):
        print(f'Found {len(hits_ask)} similar posts from Ask Extension Base.')

        '''
        Fields:
        "ask_url"
        "ask_faq_id"
        "ask_title"
        "ask_title_question"
        "ask_question"
        '''
        for i, h in enumerate(hits_ask):
            score   = h.get('_score_max', 0.0)
            source  = h.get('_source')

            url         = source.get('ask_url'      )
            title       = source.get('ask_title'    )
            question    = source.get('ask_question' )

            print('----------------------------------------------------------------')
            print(f'{i+1}) {title:>30} (score: {score:.2f})'      )
            print(f'Title   : {title}'             )
            print(f'Question: {question[:100]}'    )
            print(f'URL     : {url}'    )
            print('----------------------------------------------------------------', end = '\n\n')
    
    if len(hits_ipm):

        print(f'Found {len(hits_ipm)} articles from IPM sources')

        for i, h in enumerate(hits_ipm):
            score   = h.get('_score_weighted', 0.0)
            source  = h.get('_source')

            if source['urlPestDiseaseItems'] != '':
                
                '''
                Fields:
                "name"
                "urlPestDiseaseItems"   
                "descriptionPestDiseaseItems"
                "identificationPestDiseaseItems"
                "life_cyclePestDiseaseItems"
                "damagePestDiseaseItems"
                "solutionsPestDiseaseItems"
                '''
                url             = source.get('urlPestDiseaseItems'              )
                name            = source.get('name'                             )
                description     = source.get('descriptionPestDiseaseItems'      )
                identification  = source.get('identificationPestDiseaseItems'   )
                life_cycle      = source.get('life_cyclePestDiseaseItems'       )
                damage          = source.get('damagePestDiseaseItems'           )
                solutions       = source.get('solutionsPestDiseaseItems'        )
                print('----------------------------------------------------------------')
                print(f'{i+1}) {name:>30} (score: {score:.2f}, group: Pest Diseases)'      )
                if description:
                    print(f'Description     : {description[:100]}'      )
                if identification:
                    print(f'Identification  : {identification[:100]}'   )
                if life_cycle:
                    print(f'Life Cycle      : {life_cycle[:100]}'       )
                if damage:
                    print(f'Damage          : {damage[:100]}'           )
                if solutions:
                    print(f'Solutions       : {solutions[:100]}'        )
                print(f'URL             : {url}')    
                print('----------------------------------------------------------------', end = '\n\n')
        
            elif source['urlTurfPests'] != '': 
                '''
                Fields:
                "name"
                "urlTurfPests"
                "textTurfPests"
                '''
                url             = source.get('urlTurfPests' )
                description     = source.get('textTurfPests')
                print('----------------------------------------------------------------')
                print(f'{i+1}) {name:>30} (score: {score:.2f}, group: Turf Pests)'      )
                if description:
                    print(f'Description     : {description[:100]}'      )
                print(f'URL             : {url}')    
                print('----------------------------------------------------------------', end = '\n\n')

            elif source['urlWeedItems'] != '':
                '''
                Fields:
                "name"
                "urlWeedItems"
                "descriptionWeedItems"
                '''
                url             = source.get('urlWeedItems' )
                description     = source.get('descriptionWeedItems')
                print('----------------------------------------------------------------')
                print(f'{i+1}) {name:>30} (score: {score:.2f}, group: Weed Items)'      )
                if description:
                    print(f'Description     : {description[:100]}'      )
                print(f'URL             : {url}')    
                print('----------------------------------------------------------------', end = '\n\n')
            
            elif source['urlExoticPests'] != '':
                '''
                Fields:
                "name"
                "urlExoticPests"
                "descriptionExoticPests"
                "damageExoticPests"
                "identificationExoticPests"
                "life_cycleExoticPests"
                "monitoringExoticPests"
                "managementExoticPests"
                '''
                url             = source.get('urlExoticPests'               )
                name            = source.get('name'                         )
                description     = source.get('descriptionExoticPests'       )
                damage          = source.get('damageExoticPests'            )
                identification  = source.get('identificationExoticPests'    )
                life_cycle      = source.get('life_cycleExoticPests'        )
                monitoring      = source.get('monitoringExoticPests'        )
                management      = source.get('managementExoticPests'        )
                print('----------------------------------------------------------------')
                print(f'{i+1}) {name:>30} (score: {score:.2f}, group: Exotic Pests)'      )
                if description:
                    print(f'Description     : {description[:100]}'      )
                if damage:
                    print(f'Damage          : {damage[:100]}'           )
                if identification:
                    print(f'Identification  : {identification[:100]}'   )
                if life_cycle:
                    print(f'Life Cycle      : {life_cycle[:100]}'       )
                if monitoring:
                    print(f'Monitoring      : {monitoring[:100]}'       )
                if management:
                    print(f'Monitoring      : {management[:100]}'       )
                print(f'URL             : {url}')    
                print('----------------------------------------------------------------', end = '\n\n')
            elif source['urlPestNote'] != '':
                '''
                Fields:
                "name"
                "urlPestNote"
                "urlQuickTipPestNote"
                "descriptionPestNote"
                "life_cyclePestNote"
                "damagePestNote"
                "managementPestNote"
                "contentQuickTipsPestNote"
                '''
                url             = source.get('urlPestNote'               )
                name            = source.get('name'                      )
                description     = source.get('descriptionPestNote'       )
                life_cycle      = source.get('life_cyclePestNote'        )
                damage          = source.get('damagePestNote'            )
                management      = source.get('managementPestNote'        )
                quicktips       = source.get('contentQuickTipsPestNote'  )
                print('----------------------------------------------------------------')
                print(f'{i+1}) {name:>30} (score: {score:.2f}, group: Pest Notes)'      )
                if description:
                    print(f'Description     : {description[:100]}'      )
                if life_cycle:
                    print(f'Life Cycle      : {life_cycle[:100]}'       )
                if damage:
                    print(f'Damage          : {damage[:100]}'           )
                if management:
                    print(f'Monitoring      : {management[:100]}'       )
                if quicktips:
                    print(f'Quick tips      : {quicktips[:100]}'        )
                print(f'URL             : {url}')    
                print('----------------------------------------------------------------', end = '\n\n')


In [9]:
ask_hits, name_hits, other_hits, damage_hits    = _handle_es_query  (QUESTION, PEST_DAMAGE)
hits_ask, hits_ipm                              = _handle_es_result (ask_hits, name_hits, other_hits, damage_hits)
hits_ask, hits_ipm                              = _weight_score     (hits_ask, hits_ipm)
_print_hits(hits_ask, hits_ipm)

INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.061s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.058s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.045s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.045s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.041s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.048s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.047s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.046s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.047s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.059s]
INFO:elasticsearch:POST http://localhost:9200/combined/_search [status:200 request:0.055s]

In [58]:
vector_name = 'name_vector'
query_vector = 

cos = f'cosineSimilarity(params.query_vector, "{vector_name}") + 1.0'

script_query = {
    "script_score": {
        "query" : {"match_all": {}},
        "script": {"source": cos, "params": {"query_vector": query_vector}}
    }
}

response = config.es_client.search(
    index   = config.es_combined_index,
    query   = script_query,
    size    = 10,
    _source = source_query
)

hits = response['hits']['hits']

{'_index': 'combined',
 '_type': '_doc',
 '_id': 'aNH9RX8B5PMjGht9JJK1',
 '_score': 1.3317567,
 '_source': {'life_cyclePestDiseaseItems': '',
  'textTurfPests': 'Hosts\nAnnual bluegrass (a common weed in turf), creeping bentgrass, Kentucky bluegrass\nSymptoms\nFusarium patch develops as circular patches 1 to 6 inches (2.5 - 15 cm) wide. Leaves appear watersoaked and are reddish-brown on the outside progressing to a straw color in the center. Dead leaves may have gelatinous spore masses. White or pink fungal threads may be observed in the early morning, giving the turf a pink cast.\nConditions favoring disease\nCool temperatures (40° to 60°F) and moist conditions favor Fusarium patch. High nitrogen applications in the fall and neutral or alkaline soils also contribute to disease development. Fusarium patch survives in grass residues.\nPrevention and management\nReduce shade and provide good soil aeration and water drainage. Irrigate at the recommended frequency for your turf species and

In [47]:
''' 
fields that are available for hits_ask:
    "doc_id"    ,
    "name"      ,

    "ask_url"                       ,
    "ask_faq_id"                    ,
    "ask_title"                     ,
    "ask_title_question"            ,
    "ask_question"
'''
for i, h in enumerate(hits_ask):
    print(f'\n\n{i+1}')
    print(f'score: {h["_score"]:.2f}, score_max: {h["_score_max"]:.2f}')
    for k, v in h['_source'].items():
        if v != '':
            print(f'{k:<30}: {v[:120]}')


for i, h in enumerate(hits_ipm):
    print(f'\n\n{i+1}')
    print(f'score: {h["_score"]:.2f}, score_weighted: {h["_score_weighted"]:.2f}, score_other: {h["_score_other"]:.2f}, score_damage: {h["_score_damage"]:.2f}, score_name: {h["_score_name"]:.2f}')
    for k, v in h['_source'].items():
        if v != '':
            print(f'{k:<30}: {v[:120]}')



1
score: 1.55, score_max: 1.55
ask_question                  : I transferred my minin rose to a bigger pot outdoors late December early January. Ive tried to protect it from the freez
ask_title                     : Mini roses.
ask_faq_id                    : 95595.0
ask_title_question            : Mini roses. I transferred my minin rose to a bigger pot outdoors late December early January. Ive tried to protect it fr
ask_url                       : https://ask2.extension.org/kb/faq.php?id=442895
doc_id                        : 961


2
score: 1.54, score_max: 1.54
ask_question                  : I just got this Peach tree and haven't planted it yet. In the week I've had it, the leaves have started to brown off and
ask_title                     : Unhappy Peach.
ask_faq_id                    : 119609.0
ask_title_question            : Unhappy Peach. I just got this Peach tree and haven't planted it yet. In the week I've had it, the leaves have started t
ask_url                       : ht

In [None]:
# hits = hits_ipm
# title = 'hits_ipm'
hits = hits_ask
title = 'hits_ask'

"""print the hits & scores"""
print("----------------------------------------------------------")
print(title)
# print("{} total hits.".format(response["hits"]["total"]["value"]))
for hit in reversed(hits[:25]):

    if "_score_weighted" not in hit.keys():
        scores = f'score={hit.get("_score_max"  , 0.0):.3f}; '
    else:
        scores = (
            f'wght={hit.get("_score_weighted"   , 0.0):.3f}; '
            f'name={hit.get("_score_name"       , 0.0):.3f}; '
            f'othr={hit.get("_score_other"      , 0.0):.3f}; '
            f'damg={hit.get("_score_damage"     , 0.0):.3f}; '
        )
    text = (
        # f'{hit["_id"]}; '
        f"{scores}"
        f'({hit["_source"]["doc_id"]}, {hit["_source"]["name"]}); '
        f'({hit["_source"]["ask_faq_id"]}, {hit["_source"]["ask_title"]}); '
    )

    pdi_url = hit["_source"]["urlPestDiseaseItems"  ]
    tp_url  = hit["_source"]["urlTurfPests"          ]
    wi_url  = hit["_source"]["urlWeedItems"          ]
    ep_url  = hit["_source"]["urlExoticPests"        ]
    pn_url  = hit["_source"]["urlPestNote"           ]
    qt_url  = hit["_source"]["urlQuickTipPestNote"   ]
    ask_url = hit["_source"]["ask_url"              ]

    text = f"{text}URLS:"

    if qt_url:
        text = f"{text} [quick tip]({qt_url}),"
    if pn_url:
        text = f"{text} [pestnote]({pn_url}),"
    if pdi_url:
        text = f"{text} [pest disease item]({pdi_url}),"
    if tp_url:
        text = f"{text} [turf pest]({tp_url}),"
    if wi_url:
        text = f"{text} [weed item]({wi_url})"
    if ep_url:
        text = f"{text} [exotic pests]({ep_url})"
    if ask_url:
        text = f"{text} [askextension]({ask_url})"

    print(text)

----------------------------------------------------------
hits_ask
score=1.373; (2599, ); (64538.0, Pruning citrus.); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=344528)
score=1.401; (1030, ); (103422.0, Rasberry Blooms bu no fruit.); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=466890)
score=1.403; (1139, ); (113354.0, Should I prune my Artichoke plants in march?); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=545612)
score=1.419; (2158, ); (11512.0, Corn Plant with Yellow and Brown Spots.); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=162733)
score=1.421; (1310, ); (127979.0, Tiny black bugs in bathroom.); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=596586)
score=1.423; (1034, ); (103833.0, My beautiful fruitless plum tree looks like its dying have any helpful tips.); URLS: [askextension](https://ask2.extension.org/kb/faq.php?id=468052)
score=1.444; (1196, ); (118783.0, Should I remove frost-damage