In [1]:
from haystack.document_stores import ElasticsearchDocumentStore
import os
from haystack import Pipeline
from haystack.nodes import PreProcessor
from src import download_data, config, preprocess_data_raw, preprocess_data_raw_2
import os
import json

### Load data from the AWS bucket

In [2]:
# Source bucket, remote folder and local target directory all come from the project config.
bucket_name = config.BUCKET_NAME
s3_folder = config.S3_FOLDER_URL
local_dir = config.DATA_RAW_PATH

# Load the data from the AWS bucket and save it in local_dir.
# NOTE(review): sample_size=10 presumably limits how many files are fetched — confirm in src/download_data.
download_data.download_s3_folder(
    bucket_name,
    s3_folder,
    local_dir=local_dir,
    sample_size=10,
)

### Save the dataset as JSON in .data/processed

In [3]:
# Reads from the raw-data directory and writes to the processed-data directory.
# NOTE(review): presumably converts the downloaded PDFs into the JSON dataset loaded further down — confirm in src/preprocess_data_raw_2.
preprocess_data_raw_2.convert_pdf_jason(
    config.DATA_RAW_PATH,
    config.DATA_PROCESSED_PATH,
)

### Create the Elasticsearch document store
Before creating the document store, the Elasticsearch container must be running. Pull the image (this only needs to be done once):

**docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

Then run the image:

**docker run -d -p 9200:9200 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

You can do this manually, or use Haystack's [launch_es()](https://docs.haystack.deepset.ai/reference/utils-api) utility function.


In [2]:
# Elasticsearch host: taken from ELASTICSEARCH_HOST when set, otherwise localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connection settings: empty credentials, documents stored in the "document" index.
es_settings = {
    "host": host,
    "username": "",
    "password": "",
    "index": "document",
}
document_store = ElasticsearchDocumentStore(**es_settings)

### Load dataset from json_dataset.json 

In [3]:
# Location of the JSON dataset inside the processed-data directory.
json_file_path = os.path.join(config.DATA_PROCESSED_PATH, "json_dataset.json")

# Load the documents from the JSON file. The encoding is given explicitly so
# that decoding does not depend on the platform's locale default.
with open(json_file_path, "r", encoding="utf-8") as json_file:
    document_list = json.load(json_file)


In [None]:
# Display the first loaded entry to check what a raw document looks like.
document_list[0]

### Pre-processing and loading into the database

In [5]:
# PreProcessor settings: clean the text, then split by word into chunks of
# split_length=500 with split_overlap=20, respecting sentence boundaries.
preprocessor_options = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=500,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessor_options)

# Clean and split every loaded document.
doc_processed = preprocessor.process(document_list)

# Empty the document store first, then upload the processed documents.
document_store.delete_documents()
document_store.write_documents(doc_processed)
    

Preprocessing:   0%|          | 0/10 [00:00<?, ?docs/s]

We found one or more sentences whose word count is higher than the split length.
Document 58149ab146d0b3889605e662512296b9 is 27615 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document cf8c146f296576b1988b1cefaac9f500 is 320800 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.


In [None]:
# Display the first processed entry to check the result of the preprocessing step.
doc_processed[0]

### Create the retriever and PromptTemplate

In [7]:
from haystack.nodes import BM25Retriever

# BM25 retriever over document_store, configured with top_k=5.
retriever = BM25Retriever(
    top_k=5,
    document_store=document_store,
)

In [8]:
from haystack.nodes import PromptTemplate

# Question-answering prompt template.
# The text is written as explicit pieces so the embedded newlines, indentation
# and trailing spaces are visible; the resulting string is unchanged.
qa_prompt = PromptTemplate(
    name="question-answering",
    prompt_text=(
        "Given the context please answer the question. \n"
        "            Your answer should be in your own words and be no longer than 50 words. \n"
        "            Context: {join(documents)}; \n"
        "            Question:  {query}; Answer:"
    ),
)

# Alternative generator using a local model instead of the OpenAI API:
# prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=qa_prompt)

### Using GPT

In [9]:
from haystack.nodes import PromptNode

# Generator node backed by the "gpt-3.5-turbo" model. The API key comes from
# the project config and qa_prompt is used as the default template.
# NOTE(review): "stream": True presumably streams the answer to stdout as it is generated — confirm.
generator = PromptNode(
    "gpt-3.5-turbo",
    api_key=config.API_KEY,
    default_prompt_template=qa_prompt,
    model_kwargs={"stream": True},
)


### Retriever-Generator pipeline

In [10]:
# Wire the pipeline: the query goes to the retriever, whose output feeds the generator.
pipe = Pipeline()
pipe.add_node(
    name="retriever",
    component=retriever,
    inputs=["Query"],
)
pipe.add_node(
    name="generator",
    component=generator,
    inputs=["retriever"],
)

In [13]:
# Example query: ask about one of the companies in the dataset.
question = "What is the company AMERI Holdings dedicated to?"

# Optional per-run override: the retriever was created with top_k=5,
# here top_k=2 is passed for this run.
run_params = {"retriever": {"top_k": 2}}

output = pipe.run(query=question, params=run_params)
print(output["results"])

AMERI Holdings provides SAP cloud, digital and enterprise services to clients worldwide, with a primary business objective of enhancing their clients' business capabilities and technologies through consulting services and strategic acquisitions.["AMERI Holdings provides SAP cloud, digital and enterprise services to clients worldwide, with a primary business objective of enhancing their clients' business capabilities and technologies through consulting services and strategic acquisitions."]
