# This notebook simulates a user chatting with the chatbot

In [1]:
import json
import redis
import time
import os
from uuid import uuid4
from generative_retirver import settings

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import PreProcessor
from eda.src import config

### Creating the document store
First, start the services defined in docker-compose.yaml:

**> docker-compose up -d**

In [2]:
# Elasticsearch connection settings are read from the environment so the same
# cell works against the local docker-compose stack (the defaults) or a
# cluster that requires authentication.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(
    host=host,
    # Empty strings (the defaults) keep the original credential-less behaviour.
    username=os.environ.get("ELASTICSEARCH_USERNAME", ""),
    password=os.environ.get("ELASTICSEARCH_PASSWORD", ""),
    index="document",
)

In [3]:
# Path of json_dataset.json inside config.DATA_PROCESSED_PATH.
json_file_path = os.path.join(config.DATA_PROCESSED_PATH, "json_dataset.json")
# Load the documents from the JSON file. The encoding is pinned to UTF-8 (the
# encoding JSON interchange uses) instead of relying on the platform default,
# which on Windows is typically a legacy code page and can fail on, or
# mis-decode, non-ASCII text.
with open(json_file_path, "r", encoding="utf-8") as f:
    document_list = json.load(f)

In [4]:
# Create an instance of PreProcessor.
# Each document is split into chunks of 200 words (split_by="word",
# split_length=200) with a 20-word overlap between consecutive chunks
# (split_overlap=20). Sentence boundaries are not respected, so a chunk may
# start or end mid-sentence.

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=False,
)

# NOTE(review): the recorded output of this cell warns that some documents are
# still >160k characters long after preprocessing -- the splitting should be
# checked for those documents.
doc_processed = preprocessor.process(document_list)

# Clear the document store before loading documents (called without filters),
# presumably so that re-running this cell does not index the same chunks twice.
document_store.delete_documents()

# Upload documents to the document store
document_store.write_documents(doc_processed)

Preprocessing:   0%|          | 0/10 [00:00<?, ?docs/s]

Document ce803e014fb54e0713db190d4857f33 is 166040 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document f0a84d290964abe8583b799bfa63f0fa is 162796 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.


### Load the Redis message broker

In [9]:
# Open the Redis connection and keep the client in `db`.
# Host, port and database index all come from the settings.py module.
db = redis.Redis(
    host=settings.REDIS_IP,
    port=settings.REDIS_PORT,
    db=settings.REDIS_DB_ID,
)
# Connectivity check; left as the last expression so the cell displays the result.
db.ping()

True

In [24]:

def model_predict(job_id, messages, timeout=None):
    """
    Queue a chat job in Redis and wait for the answer written back under its id.

    The job is serialised as JSON (``{"id": ..., "messages": ...}``) and pushed
    onto ``settings.REDIS_QUEUE``. The function then polls Redis for a value
    stored under ``job_id``, sleeping ``settings.SERVER_SLEEP`` seconds between
    attempts, and returns the ``"answer"`` field of that JSON value.

    Parameters
    ----------
    job_id : str
        Id of the task. It is used both as the ``"id"`` field of the queued job
        and as the Redis key polled for the result. If empty or ``None`` a new
        UUID4 string is generated.
    messages : list[dict]
        Chat history to send to the service.
    timeout : float, optional
        Maximum number of seconds to wait for the answer. ``None`` (default)
        waits indefinitely. After a timeout, do not reuse the same ``job_id``:
        a late answer for the abandoned job could still be written under it.

    Returns
    -------
    answer : str
        The answer generated by the model.

    Raises
    ------
    TimeoutError
        If ``timeout`` is set and no answer arrives in time.
    KeyError
        If the stored result has no ``"answer"`` field.
    """
    # Honour the id supplied by the caller so the job can be tracked across
    # services; only fall back to a fresh unique id when none was given.
    if not job_id:
        job_id = str(uuid4())

    # Send the job to the model service using Redis.
    job_data = json.dumps({"id": job_id, "messages": messages})
    db.lpush(settings.REDIS_QUEUE, job_data)

    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = None if timeout is None else time.monotonic() + timeout

    # Poll until the service stores its response under job_id.
    while True:
        output = db.get(job_id)

        if output is not None:
            # Remove the key first so it is cleaned up even when the payload
            # turns out to be malformed.
            db.delete(job_id)
            return json.loads(output)['answer']

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"No answer received for job {job_id!r} within {timeout} seconds"
            )

        # Sleep some time waiting for model results.
        time.sleep(settings.SERVER_SLEEP)

### Simulating a user
First, start the retriever-generative model from a terminal:

**> python .\generative_retirver\retriver_gen.py**

In [31]:
def use_list_dict(list_dict, new_sentence=None, isquery=True):
    """
    Render the chat history as a prompt string, or append one message to it.

    Parameters
    ----------
    list_dict : list[dict]
        Chat history; each item has a 'role' and a 'conten' key.
    new_sentence : str, optional
        New sentence, either a user query or an assistant answer. When it is
        None nothing is appended and the rendered prompt is returned instead.
    isquery : bool
        True -> new_sentence is a user query / False -> it is an assistant answer.

    Returns
    -------
    query_prompt : str
        Only when new_sentence is None: the items formatted as
        "<role>: <conten>" and joined with ' \\n '.
    list_dict : list[dict]
        Otherwise: the same list object that was passed in, with the new
        message appended in place.
    """
    # Only None means "no sentence". An empty string is still a message, so the
    # return type never depends on the truthiness of the text.
    if new_sentence is None:
        return ' \n '.join(f"{item['role']}: {item['conten']}" for item in list_dict)

    # NOTE(review): the key is spelled 'conten' (not 'content'). It is kept
    # as-is because whatever consumes the history presumably reads the same
    # key -- confirm before renaming.
    role = 'user' if isquery else 'assistant'
    list_dict.append({'role': role, 'conten': new_sentence})
    return list_dict

In [30]:
def run_chatbox():
    """
    Run an interactive chat session on standard input.

    Each non-blank line typed at the 'User: ' prompt is appended to the chat
    history, the history is sent to the model through model_predict, and the
    answer is appended to the history and printed. Typing 'exit' ends the
    session.

    Returns
    -------
    chat_history : list[dict]
        All user and assistant messages exchanged during the session.
    """
    chat_history = []
    # A single id is created for the session and passed on every turn.
    job_id = str(uuid4())
    while True:
        query = input('User: ')
        if query == 'exit':
            return chat_history
        if not query.strip():
            # Ignore blank input: there is nothing to send to the model.
            continue
        chat_history = use_list_dict(chat_history, query, isquery=True)

        answer = model_predict(job_id, chat_history)
        chat_history = use_list_dict(chat_history, answer, isquery=False)
        print('\n', query, ' \n', answer)

# Start the interactive session (type 'exit' at the prompt to finish) and keep
# the returned chat history in `chat`.
chat = run_chatbox()


 What are stock symbols for Seagate Technology PLC and nicolet-bankshares-inc?  
 The stock symbol for Seagate Technology PLC is STX and the stock symbol for Nicolet Bankshares Inc is NCBS.

 What are their latest financial performance?   
 Seagate Technology PLC reported a revenue of $2.7 billion and a net income of $482 million in Q2 2021. Nicolet Bankshares Inc reported a net income of $10.5 million and a return on average assets of 1.28% in Q1 2021.

 Do they have a global presence?  
 Seagate Technology PLC has a global presence in over 40 countries, while information on Nicolet Bankshares Inc's global presence is not readily available.


In [29]:
# Print the chat history returned by run_chatbox().
print(chat)

[{'role': 'user', 'conten': 'What are stock symbols for Seagate Technology PLC and nicolet-bankshares-inc?'}, {'role': 'assistant', 'conten': 'The stock symbol for Seagate Technology PLC is STX and the stock symbol for Nicolet Bankshares Inc is NCBS.'}, {'role': 'user', 'conten': 'What are their latest financial performance? '}, {'role': 'assistant', 'conten': 'Seagate Technology PLC reported a revenue of $2.7 billion and a net income of $482 million in Q2 2021. Nicolet Bankshares Inc reported a net income of $10.5 million in Q2 2021.'}, {'role': 'user', 'conten': 'Do they have a global presence? '}, {'role': 'assistant', 'conten': 'Seagate Technology PLC has a global presence with operations in over 40 countries, while Nicolet Bankshares Inc operates solely in the United States.'}, {'role': 'user', 'conten': "Who are some of Seagate Technology PLC's major competitors? "}, {'role': 'assistant', 'conten': "Seagate Technology PLC's major competitors include Western Digital Corporation, T