# Logging of chats to ES

## Creating index

In [1]:
import os
os.environ['TF_CACHE_DIR'   ] = '/var/tmp/models'

ES_USERNAME = 'elastic'
ES_PASSWORD = 'changeme'
ES_LOGGING_INDEX = 'logs'

## select the environment for ingestion
# ES_HOST = 'http://localhost:9200/'
ES_HOST = 'https://dev.es.chat.ask.eduworks.com/'
# ES_HOST = 'https://qa.es.chat.ask.eduworks.com/'

In [2]:
pipeline_id = "transform_id"
mapping  = {
    "pipeline": {
        "id"    : pipeline_id,
        "body"  : {
            "description"   : "Replace the _id with chat_id for the logs index",
            "processors"    : [{
                "set": {
                    "field": "_id",
                    "value": "{{chat_id}}"
                }
            }]
        }
    },
    "settings": {
        "number_of_shards"  : 2, 
        "number_of_replicas": 1,
        "default_pipeline"  : pipeline_id
    },
    "mappings": {
        "dynamic"   : "false",
        "_source"   : {"enabled": "true"},
        "properties": {
            "chat_id"       : {"type": "keyword", "index": "true", "doc_values": "false", "ignore_above": 256},
            "timestamp"     : {"type": "date"   , "index": "true", "doc_values": "true"},
            "chat_history"  : {
                "dynamic"       : "false",
                "type"          : "nested",
                "properties"    : {
                    "agent"     : {"type": "keyword"        , "index": "false", "doc_values": "false", "ignore_above": 256  },
                    "timestamp" : {"type": "date"           , "index": "false", "doc_values": "false"                       },
                    "text"      : {"type": "match_only_text"                                                                },
                    "intent"    : {"type": "keyword"        , "index": "false", "doc_values": "false", "ignore_above": 256  },
                    "results"   : {
                        "dynamic"   : "false",
                        "type"      : "nested",
                        "properties": {
                            "score"     : {"type": "keyword"        , "index": "false", "doc_values": "false", "ignore_above": 256  },
                            "url"       : {"type": "keyword"        , "index": "false", "doc_values": "false", "ignore_above": 256  }
                        }
                    }
                }
            }
        }
    }
}

In [3]:
from elasticsearch import Elasticsearch, RequestError
from elasticsearch.client import IngestClient


# increase the timeout if necessary
es_client = Elasticsearch([ES_HOST], http_auth=(ES_USERNAME, ES_PASSWORD), timeout = 20)
es_ingest = IngestClient(es_client)

# create pipeline for replacing _id with chat_id
es_ingest.put_pipeline(
    id   = mapping['pipeline']['id'     ],
    body = mapping['pipeline']['body'   ])

# create index
es_client.indices.delete(
    index   = ES_LOGGING_INDEX, 
    ignore  = 404)
es_client.indices.create(
    index       = ES_LOGGING_INDEX      , 
    settings    = mapping['settings']   , 
    mappings    = mapping['mappings']   )

es_client.indices.refresh()

{'_shards': {'total': 12, 'successful': 6, 'failed': 0}}

## Reading sample events json file and transforming logs

In [4]:
import json

doc_sample_raw = None
with open(os.path.join(os.path.dirname(__name__), 'events_sample/events_log.json'), 'r') as f:
    doc_sample_raw = json.load(f)
doc_sample_raw

[{'event': 'action',
  'timestamp': 1658832730.7627108,
  'metadata': {'model_id': '98f858017efc411d91bc3bcd370d8f8b'},
  'name': 'action_session_start',
  'policy': None,
  'confidence': 1.0,
  'action_text': None,
  'hide_rule_turn': False},
 {'event': 'session_started',
  'timestamp': 1658832730.76289,
  'metadata': {'model_id': '98f858017efc411d91bc3bcd370d8f8b'}},
 {'event': 'action',
  'timestamp': 1658832730.762939,
  'metadata': {'model_id': '98f858017efc411d91bc3bcd370d8f8b'},
  'name': 'action_listen',
  'policy': None,
  'confidence': None,
  'action_text': None,
  'hide_rule_turn': False},
 {'event': 'user',
  'timestamp': 1658832730.885766,
  'metadata': {'model_id': '98f858017efc411d91bc3bcd370d8f8b'},
  'text': '/restart',
  'parse_data': {'intent': {'name': 'restart', 'confidence': 1.0},
   'entities': [],
   'text': '/restart',
   'message_id': 'c4b4c16d86eb4c7aac9581d39474bdbc',
   'metadata': {},
   'intent_ranking': [{'name': 'restart', 'confidence': 1.0}]},
  'inpu

In [5]:
from datetime import datetime

def _parse_tracker_events(events):
    '''Parse chat history - filtering in only `bot` and `user` events.'''
    chat_history = []

    for event in events:
        if event['event'] == 'user':
            '''
            structure:
                agent       keyword
                timestamp   date
                text        match_only_text
                intent      keyword
            '''
            text        = event['text']
            intent      = event['parse_data']['intent']['name']
            timestamp   = datetime.fromtimestamp(event['timestamp']).isoformat()
            chat_history.append({
                'agent'     : 'user'    ,
                'timestamp' : timestamp ,
                'text'      : text      ,
                'intent'    : intent
            })
        elif event['event'] == 'bot':
            '''
            structure:
                agent       keyword
                timestamp   date
                text        match_only_text
                results     nested
                    url         keyword
                    score       keyword
            '''
            text        = event['text']
            timestamp   = datetime.fromtimestamp(event['timestamp']).isoformat()
            
            chat_history.append({
                'agent'     : 'bot'     ,
                'timestamp' : timestamp ,
                'text'      : text      ,
            })

            if event['data']['custom'] is not None and event['data']['custom']['payload'] == 'resultscollapsible':
                results = []
                custom = event['data']['custom']
                for result in custom['data']:
                    url     = result['url'  ]
                    score   = result['score']
                    results.append({
                        'url'   : url,
                        'score' : score
                    })
                chat_history[-1]['results'] = results
    
    return chat_history

chat_history = _parse_tracker_events(doc_sample_raw)
doc_sample = {
    'chat_id'       : 'chat_id'                     , # tracker.sender_id            
    'timestamp'     : chat_history[0]['timestamp']  ,
    'chat_history'  : chat_history
}
doc_sample

{'chat_id': 'chat_id',
 'timestamp': '2022-07-26T16:52:10.885766',
 'chat_history': [{'agent': 'user',
   'timestamp': '2022-07-26T16:52:10.885766',
   'text': '/restart',
   'intent': 'restart'},
  {'agent': 'user',
   'timestamp': '2022-07-26T16:52:10.977252',
   'text': 'Hello',
   'intent': 'intent_greet'},
  {'agent': 'bot',
   'timestamp': '2022-07-26T16:52:10.995590',
   'text': 'Bot Configuration:</br>Debug: True</br>Version: 25.06.22</br><strong>expert_url <i>https://ucanr.edu/About/Locations/</i></strong></br><strong>es_search_size <i>100</i></strong></br><strong>es_cut_off <i>0.4</i></strong></br><strong>es_top_n <i>10</i></strong></br><strong>es_ask_weight <i>0.6</i></strong></br><strong>es_slots_weight <i>0.1</i></strong></br></br>To change the configuration parameters, use following schema:</br>parameter <i>param_name value</i></br>(i.e. <strong>parameter es_cut_off <i>0.5</i></strong>)'},
  {'agent': 'bot',
   'timestamp': '2022-07-26T16:52:10.995598',
   'text': "Hi, I'

In [7]:
response = None
try:
    response = es_client.index(
        index       = ES_LOGGING_INDEX      ,
        document    = doc_sample            ,
        id          = doc_sample['chat_id'] ,
    )
except RequestError as e:
    print(f'Error at inserting logs with chat_id - {doc_sample["chat_id"]}')
    raise(e)

response

{'_index': 'logs',
 '_type': '_doc',
 '_id': 'chat_id',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

## Retrieving logs

In [8]:
# date format = `dd.mm.yyyy`
def _parse_date(aft_date = None, bfr_date = None):

    
    try:
        if aft_date is None:
            aft_date = datetime.min
        else:

            aft_date = datetime.strptime(aft_date, '%d.%m.%Y')
        
        if bfr_date is None:
            bfr_date = datetime.max
        else:
            bfr_date = datetime.strptime(bfr_date, '%d.%m.%Y')

        aft_date = aft_date.isoformat()
        bfr_date = bfr_date.isoformat()

    except (TypeError, ValueError) as e:
        print(f'Input(s) should be string in the format `dd.mm.yyyy`')
        raise(e)

    return aft_date, bfr_date


# aft_date = None
# bfr_date = None
aft_date = '05.05.2021'
bfr_date = None
aft_date, bfr_date = _parse_date(
    aft_date = aft_date, 
    bfr_date = bfr_date
)    
    

query = {
    "range": {"timestamp": {
        'gte': aft_date,
        'lte': bfr_date 
    }}
}

response = es_client.search(
    index   = ES_LOGGING_INDEX,
    query   = query                     
)

response

{'took': 174,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'logs',
    '_type': '_doc',
    '_id': 'chat_id',
    '_score': 1.0,
    '_source': {'chat_history': [{'agent': 'user',
       'text': '/restart',
       'intent': 'restart',
       'timestamp': '2022-07-26T16:52:10.885766'},
      {'agent': 'user',
       'text': 'Hello',
       'intent': 'intent_greet',
       'timestamp': '2022-07-26T16:52:10.977252'},
      {'agent': 'bot',
       'text': 'Bot Configuration:</br>Debug: True</br>Version: 25.06.22</br><strong>expert_url <i>https://ucanr.edu/About/Locations/</i></strong></br><strong>es_search_size <i>100</i></strong></br><strong>es_cut_off <i>0.4</i></strong></br><strong>es_top_n <i>10</i></strong></br><strong>es_ask_weight <i>0.6</i></strong></br><strong>es_slots_weight <i>0.1</i></strong></br></br>To change the configuration parameters, 