In [24]:
import json
from tqdm import tqdm
from elasticsearch import Elasticsearch

In [31]:
# Connection settings for the local Elasticsearch node and the index that
# stores the labelled example texts.
elastic_host = "localhost"
elastic_port = 9200
index_name = "elastic_classifier"

In [32]:
# Client for the single node described by elastic_host / elastic_port.
es = Elasticsearch(hosts=[dict(host=elastic_host, port=elastic_port)])

In [34]:
# Create the index only when it is missing, so re-running this cell is safe.
# The index name is passed by keyword: elasticsearch-py 7.15+ deprecates
# positional API arguments and 8.x no longer accepts them.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

  """Entry point for launching an IPython kernel.


In [35]:
def add_record(es, index_name, record):
    """
    Index one document, reporting (not raising) any failure.

    Args:
        es: elasticsearch client object
        index_name: name of the index to write to
        record: the document to store
    """
    try:
        es.index(document=record, index=index_name)
    except Exception as error:
        # Best effort: print the problem and let the caller carry on.
        print(error)
        
        
def search(es, index_name, search_text):
    """
    Run a match query for search_text against the 'text' field.

    Args:
        es: elasticsearch client object
        index_name: name of the index to query
        search_text: query string to match
    Returns:
        whatever es.search returns for the query
    """
    match_clause = {'text': search_text}
    # The query is handed to the client already serialized as a JSON string.
    payload = json.dumps({'query': {'match': match_clause}})
    return es.search(index=index_name, body=payload)

In [36]:
def fill_index_with_data(texts, labels, es, index_name):
    """
    Index every (text, label) pair as a document with 'text' and 'label' fields.

    Pairs are formed with zip, so if the inputs differ in length the extra
    items of the longer one are ignored.

    Args:
        texts: iterable of texts; each item is converted with str()
        labels: iterable of labels aligned with texts; each item is converted with str()
        es: elasticsearch client object, passed through to add_record
        index_name: name of the index the documents are added to
    """
    try:
        # zip stops at the shorter input, so this is the real number of steps;
        # giving it to tqdm turns the bare counter into a proper progress bar.
        total = min(len(texts), len(labels))
    except TypeError:
        # Unsized iterables (e.g. generators): tqdm falls back to a counter.
        total = None

    for text, label in tqdm(zip(texts, labels), total=total):
        doc = json.dumps({'label': str(label), 'text': str(text)})
        add_record(es, index_name, doc)

In [41]:
def predict(es, text, normalize_by_score=True, index=None):
    """
    Calculate prediction for the provided text.
    Elasticsearch returns scores for multiple hits for the given query. Prediction can be brought to
    scale [0,1] by normalizing either by class counts or by class scores of all the hits.
    Args:
        es: elasticsearch object
        text: search query string
        normalize_by_score: whether to normalize by scores of the hits or by counts
        index: name of the index to query; when None the module-level
            index_name is used
    Returns:
        None when the search returns no hits, otherwise a list of
        (label, share) tuples, one per distinct label in the order the labels
        first appear in the hits; the shares sum to 1.
    """
    target_index = index_name if index is None else index
    res = search(es, target_index, text)

    hits = res['hits']['hits']
    if not hits:
        return None

    # Accumulate the hit scores and counts by predicted class.
    per_class = {}
    for hit in hits:
        entry = per_class.setdefault(hit['_source']['label'], {'score': 0, 'count': 0})
        entry['score'] += hit['_score']
        entry['count'] += 1

    weight_key = 'score' if normalize_by_score else 'count'
    weights = [entry[weight_key] for entry in per_class.values()]
    total = sum(weights)  # computed once, not once per class
    if total == 0:
        # Every hit scored 0, so the scores cannot rank the classes; use the
        # hit counts (always positive here) instead of dividing by zero.
        weights = [entry['count'] for entry in per_class.values()]
        total = sum(weights)

    return [(label, weight / total) for label, weight in zip(per_class, weights)]
        
            
            

In [38]:
# Toy example: each text is written next to its label, then the pairs are
# split into the two parallel lists the rest of the notebook uses.
texts, labels = map(list, zip(
    ('dell computer', 'electronics'),
    ('panasonic tv, black', 'electronics'),
    ('wooden furniture', 'furniture'),
    ('black table with chair set', 'furniture'),
    ('lcd monitor', 'electronics'),
    ('sofa', 'furniture'),
    ('ikea chest of drawers', 'furniture'),
    ('apple macbook pro', 'electronics'),
))

In [39]:
# Load the toy examples into the index.
# NOTE(review): documents are indexed without an explicit id, so re-running
# this cell presumably adds them a second time — confirm before re-running.
fill_index_with_data(texts=texts, labels=labels, es=es, index_name=index_name)

8it [00:00, 74.16it/s]


In [52]:
# Classify an unseen text; the result is displayed as the cell output.
predict(es, text='ikea bed', normalize_by_score=True)

  # Remove the CWD from sys.path while we load stuff.


[('furniture', 1.0)]