In [1]:
from haystack.document_stores import ElasticsearchDocumentStore
import os
from haystack import Pipeline
from haystack.nodes import PreProcessor
from src import download_data, config, preprocess_data_raw, preprocess_data_raw_2
import json

In [2]:
import logging

# Configure root logging at WARNING with a "level - logger - message" layout,
# then set the "haystack" logger itself to INFO.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
logging.getLogger("haystack").setLevel(logging.INFO)

### Load data from the AWS bucket

In [2]:
# Source bucket/folder and local target directory, all taken from the project config.
bucket_name = config.BUCKET_NAME
s3_folder = config.S3_FOLDER_URL
local_dir = config.DATA_RAW_PATH

# Load data from the AWS bucket and save it in local_dir
# (sample_size=10 is forwarded to the download helper).
download_data.download_s3_folder(
    bucket_name,
    s3_folder,
    local_dir=local_dir,
    sample_size=10,
)

### Save the dataset as JSON in ./data/processed

In [3]:
# Run the project's PDF conversion helper: it reads from DATA_RAW_PATH and is
# given DATA_PROCESSED_PATH as its second argument (presumably the output
# location of the JSON dataset -- confirm in preprocess_data_raw_2).
preprocess_data_raw_2.convert_pdf_jason(
    config.DATA_RAW_PATH,
    config.DATA_PROCESSED_PATH,
)

INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programación\AnyoneAI ML\Final Project\financial-advisor-chatbot\data\raw\ameri-holdings-inc\NASDAQ_AMRH_2017.pdf
INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programación\AnyoneAI ML\Final Project\financial-advisor-chatbot\data\raw\ameri-holdings-inc\NASDAQ_AMRH_2018.pdf
INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programación\AnyoneAI ML\Final Project\financial-advisor-chatbot\data\raw\copart\NASDAQ_CPRT_2018.pdf
INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programación\AnyoneAI ML\Final Project\financial-advisor-chatbot\data\raw\core-mark-holding-company-inc\NASDAQ_CORE_2018.pdf
INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programación\AnyoneAI ML\Final Project\financial-advisor-chatbot\data\raw\gencor-industries-inc\NASDAQ_GENC_2021.pdf
INFO - haystack.utils.preprocessing -  Converting d:\Alexander\Cursos Programa

### Create the Elasticsearch document store
Before creating the document store, the Elasticsearch container must be running. Pull the image (this only needs to be done once):

**docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

Then run the image:

**docker run -d -p 9200:9200 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

You can do this manually, or by using Haystack's [launch_es()](https://docs.haystack.deepset.ai/reference/utils-api) utility function.


In [3]:
# The Elasticsearch host is read from ELASTICSEARCH_HOST and defaults to localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connect to the "document" index using empty username/password.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")

### Load dataset from json_dataset.json 

In [4]:
json_file_path = os.path.join(config.DATA_PROCESSED_PATH, "json_dataset.json")
# Load the documents from the JSON file.
# The encoding is given explicitly: without it open() falls back to the locale
# encoding (not UTF-8 on Windows), which can mis-decode non-ASCII text.
with open(json_file_path, "r", encoding="utf-8") as f:
    document_list = json.load(f)


In [None]:
# Display the first loaded document to check what the dataset entries look like.
document_list[0]

### Pre-processing and loading into the database

In [5]:
# Create the PreProcessor: each document is split by words into chunks of
# 200 words with a 20-word overlap, sentence boundaries are not respected,
# and empty-line / whitespace / header-footer cleaning is enabled.
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=False,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
)

doc_processed = preprocessor.process(document_list)

# Clear the document store before loading the new documents.
document_store.delete_documents()

# Upload the processed documents to the document store.
document_store.write_documents(doc_processed)
    

Preprocessing:   0%|          | 0/10 [00:00<?, ?docs/s]



### Generate retriever and PromptTemplate

In [4]:
from haystack.nodes import (
    AnswerParser,
    BM25Retriever,
    PromptNode,
    PromptTemplate,
    SentenceTransformersRanker,
)

# BM25 retriever over document_store, configured to select the top 10 documents.
retriever = BM25Retriever(document_store=document_store, top_k=10)

In [5]:
# Cross-Encoder ranker used to get a better-sorted list of relevant documents;
# it is configured with top_k=3.
ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
    top_k=3,
)

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
  return self.fget.__get__(instance, owner)()


### Using GPT

### Pipeline Retriever-Generator

In [6]:
# Prompt for the generative step: a short, cited answer built only from the
# documents handed over by the pipeline. Citations use Document[number], which
# is also the pattern the AnswerParser extracts as references.
tool_template = PromptTemplate(
    name="question-answering-with-references",
    prompt_text=(
        "Create a concise and informative answer (no more than 50 words) for a given question "
        "based solely on the given documents. You must only use information from the given documents. "
        "Use a financial assistant tone and be very specific with amounts. Do not repeat text. Cite the documents using Document[number] notation. "
        "If multiple documents contain the answer, cite those documents like ‘as stated in Document[number], Document[number], etc.’. "
        "If the documents do not contain the answer to the question, say that ‘answering is not possible given the available information.’\n"
        "{join(documents, delimiter=new_line, pattern=new_line+'Document[$idx]: $content', str_replace={new_line: ' ', '[': '(', ']': ')'})} \n Question: {query}; Answer: "
    ),
    output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
)

# GPT-backed PromptNode that uses the template above by default.
document_prompt_node = PromptNode(
    "gpt-3.5-turbo",
    api_key=config.API_KEY,
    max_length=256,
    default_prompt_template=tool_template,
)


In [7]:
from haystack.nodes import Shaper, DocumentMerger

# Custom query pipeline: Query -> Retriever -> Ranker -> Generative.
# Shaper and DocumentMerger are imported, but no such node is added here.
pipeline = Pipeline()

# Retriever node (BM25Retriever) is fed directly by the query.
pipeline.add_node(name="Retriever", component=retriever, inputs=["Query"])

# Ranker node sorts the retrieved documents by relevance.
pipeline.add_node(name="Ranker", component=ranker, inputs=["Retriever"])

# Generative node (PromptNode) receives the ranked documents.
pipeline.add_node(name="Generative", component=document_prompt_node, inputs=["Ranker"])


# only the "recall_single_hit" metric will be considered to evaluate the performance of the Retriever
#pipeline.metrics_filter = {"Retriever": ["recall_single_hit"]}



In [11]:
# Check the pipeline with a single question.
# Alternative query: "What is the current stock price of seagate-technology-plc?"
query = "What was Microstrategy's net income in 2019?"
result = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 10},
        "Ranker": {"top_k": 3},
    },
)
result['answers']

### Agent

In [8]:
from haystack.agents import Agent, Tool

# Prompt for the Agent's PromptNode.
# See the full example at https://github.com/deepset-ai/haystack/blob/main/examples/agent_multihop_qa.py
#
# The prompt is assembled from adjacent string literals, so every fragment has
# to carry its own trailing space/newline: a fragment that ends right after a
# period is glued to the next sentence ("user.At the end ...").
zero_shot_react_template = PromptTemplate(
    name="zero-shot-react",
    prompt_text="You are a chatbot working as a financial assistant and you are having a conversation with a user. "
    "At the end of the conversation, the user tells you something which is usually a new question. "
    "If you don't know the answer, to achieve your goal of answering complex questions correctly, you have access to the following tool:\n\n"
    "Search_in_documents: useful for when you need to get answers. \n\n"
    "To answer questions, you'll need to go through multiple steps involving step-by-step "
    "thinking and selecting when to use the Search_in_documents tool; tool will respond with Observation. "
    "If after searching you feel that the user should enter any additional data, return an answer requesting it. "
    "When you are ready for a final answer, respond with the `Final Answer:`\n\n"
    "Use the following format:\n\n"
    "Question: the question to be answered\n"
    "Thought: Reason if you have the final answer. If yes, answer the question. If not, make an appropriate query to the Search_in_documents tool.\n"
    "Tool: Search_in_documents \n"
    "Tool Input: query to search for the missing information needed to answer \n"
    "Observation: The tool will respond with the answer \n"
    "...\n"
    "Final Answer: the final answer to the question, in no more than 50 words\n\n"
    "Thought, Tool, Tool Input, and Observation steps can be repeated multiple times, but sometimes we can find an answer in the first pass\n"
    "---\n\n"
    "Question: {query}\n"
    "Thought: Let's think step-by-step, I first need to ",
)

# PromptNode using GPT. Generation stops at "Observation:" (stop word);
# streaming is disabled and the temperature is set to 0.
agent_prompt_node = PromptNode(
    "gpt-3.5-turbo",
    api_key=config.API_KEY,
    max_length=500,
    stop_words=["Observation:"],
    model_kwargs={"stream": False, 'temperature': 0},
)

# Expose the retrieval/generation pipeline as the Agent's only tool.
# The tool name must match the name used in the prompt text above, and the
# description helps the Agent decide when to use it.
search_in_ds = Tool(
    name="Search_in_documents",
    pipeline_or_node=pipeline,
    description="useful for when you need to get answers.",
    output_variable='answers',
)

# Initialize the Agent with its PromptNode and prompt template; the final
# answer is taken from the text following "Final Answer:".
agent = Agent(
    prompt_node=agent_prompt_node,
    prompt_template=zero_shot_react_template,
    final_answer_pattern=r"Final Answer\s*:\s*(.*)",
)

agent.add_tool(tool=search_in_ds)




### Processing the list of dictionaries

In [9]:
def use_list_dict(list_dict, new_sentence=None, isquery=True):
    """Render the chat history as one prompt string, or append a new turn.

    Parameters
    ----------
    list_dict : list[dict]
        Chat turns; every turn is a dict with the keys 'role' and 'conten'.
    new_sentence : str, optional
        Text of the turn to append (a user query or a system answer). When it
        is falsy (None or an empty string) nothing is appended and the
        history is rendered as a string instead.
    isquery : bool
        True  -> new_sentence is stored with the role 'user'.
        False -> new_sentence is stored with the role 'assistant'.

    Returns
    -------
    str
        When new_sentence is falsy: one "role: conten" entry per turn, the
        entries separated by a space, a newline and a space.
    list[dict]
        Otherwise: the very same list_dict object, with the new turn appended
        in place.
    """
    if new_sentence:
        speaker = 'user' if isquery else 'assistant'
        list_dict.append({'role': speaker, 'conten': new_sentence})
        return list_dict

    # No new sentence: flatten the history into a single prompt string.
    rendered_turns = []
    for turn in list_dict:
        rendered_turns.append(f"{turn['role']}: {turn['conten']}")
    return ' \n '.join(rendered_turns)
       

In [27]:
json_file_path = os.path.join(config.DATA_PROCESSED_PATH, "qa_dataset.json")
# Load the question/answer pairs from the JSON file.
# The encoding is given explicitly: without it open() falls back to the locale
# encoding (not UTF-8 on Windows), which can mis-decode non-ASCII text.
with open(json_file_path, "r", encoding="utf-8") as f:
    qa_list = json.load(f)

# Show one sample entry of the dataset.
qa_list[28]

{'question': 'Who founded Microstrategy?',
 'answer': 'Michael J. Saylor, Sanju Bansal, and Thomas Spahr'}

### Simulating the chatbot

In [28]:
def run_chatbox():
    """Run an interactive question/answer loop against the agent.

    Every line typed at the 'User: ' prompt is appended to the chat history.
    The whole history is rendered into one prompt with use_list_dict and sent
    to agent.run; the first returned answer is printed and stored as the
    assistant turn.

    The session ends when the user types 'exit', presses Ctrl-C
    (KeyboardInterrupt) or the input stream is closed (EOFError). Any other
    error raised while answering a question is reported and the loop goes on
    with the next question.

    Returns
    -------
    list[dict]
        The chat history accumulated during the session.
    """
    chat_history = []
    while True:
        try:
            query = input('User: ')
        except (KeyboardInterrupt, EOFError):
            # A bare `except` around input() would swallow these and loop
            # forever; treat them as a request to end the session instead.
            return chat_history

        if query == 'exit':
            return chat_history
        if not query.strip():
            # use_list_dict returns the rendered prompt *string* for an empty
            # sentence; assigning that to chat_history would corrupt it.
            continue

        try:
            chat_history = use_list_dict(chat_history, query, isquery=True)
            result = agent.run(query=use_list_dict(chat_history))
            answer = result["answers"][0].answer
            if answer:
                # Same guard as above: only a non-empty answer is appended, so
                # chat_history always stays a list.
                chat_history = use_list_dict(chat_history, answer, isquery=False)
            print('\n', query, ' \n', answer)
        except Exception as exc:
            # Best effort: keep the chat alive, but say what went wrong
            # instead of hiding it behind a bare 'error'.
            print(f'error: {exc!r}')

# Start the interactive session (blocks until the chat loop returns) and keep the returned history.
chat_history = run_chatbox()  


Agent zero-shot-react started with {'query': 'user: Who founded Microstrategy?', 'params': None}
[32mcheck[0m[32m if[0m[32m I[0m[32m have[0m[32m this[0m[32m information[0m[32m in[0m[32m my[0m[32m database[0m[32m.[0m[32m If[0m[32m not[0m[32m,[0m[32m I[0m[32m can[0m[32m use[0m[32m the[0m[32m Search[0m[32m_in[0m[32m_documents[0m[32m tool[0m[32m to[0m[32m find[0m[32m the[0m[32m answer[0m[32m.
[0m[32mTool[0m[32m:[0m[32m Search[0m[32m_in[0m[32m_documents[0m[32m
[0m[32mTool[0m[32m Input[0m[32m:[0m[32m "[0m[32mMicro[0m[32mstrategy[0m[32m founder[0m[32m"
[0mObservation: [33manswering is not possible given the available information.[0m
Thought: [32mI[0m[32m need[0m[32m more[0m[32m information[0m[32m to[0m[32m answer[0m[32m this[0m[32m question[0m[32m.[0m[32m Can[0m[32m you[0m[32m provide[0m[32m any[0m[32m additional[0m[32m details[0m[32m or[0m[32m context[0m[32m?
[0m[32mFinal[

In [23]:
# Show the chat history returned by run_chatbox().
print(chat_history)


[{'role': 'user', 'conten': 'Who are the founders of Paratek Pharmaceuticals?'}]
