In [1]:
from haystack.document_stores import ElasticsearchDocumentStore
import os
from haystack import Pipeline
from haystack.nodes import PreProcessor
from src import download_data, config, preprocess_data_raw, preprocess_data_raw_2
import json

### Load data from the AWS bucket

In [2]:
# S3 source (bucket + folder) and local destination, all read from the project config.
bucket_name = config.BUCKET_NAME
s3_folder = config.S3_FOLDER_URL
local_dir = config.DATA_RAW_PATH

# Download the S3 folder into local_dir.
# NOTE(review): sample_size=10 presumably caps how many files are fetched — confirm in download_data.
download_data.download_s3_folder(
    bucket_name,
    s3_folder,
    local_dir=local_dir,
    sample_size=10,
)

### Save the dataset as JSON in .data/processed

In [2]:
# Process the raw files under DATA_RAW_PATH and write the result under DATA_PROCESSED_PATH.
# NOTE(review): judging by the helper's name this converts PDFs to JSON — confirm in preprocess_data_raw_2.
preprocess_data_raw_2.convert_pdf_jason(config.DATA_RAW_PATH, config.DATA_PROCESSED_PATH)

### Generate the Elasticsearch document store
First, pull the Elasticsearch image (this only needs to be done once):

**docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

Then run the image:

**docker run -d -p 9200:9200 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.9.2**

You can do this manually, or use Haystack's [launch_es()](https://docs.haystack.deepset.ai/reference/utils-api) utility function.


In [2]:
# Elasticsearch host: taken from ELASTICSEARCH_HOST when set, otherwise "localhost".
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Document store on the "document" index; empty username/password are passed.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")

### Load dataset from json_dataset.json 

In [3]:
# Path of the processed dataset inside the configured processed-data directory.
json_file_path = os.path.join(config.DATA_PROCESSED_PATH, "json_dataset.json")
# Load the documents from the JSON file.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default; the documents contain non-ASCII characters (e.g. ’ and –).
with open(json_file_path, "r", encoding="utf-8") as f:
    document_list = json.load(f)


In [9]:
# Display the first loaded document to check its structure.
document_list[0]

{'content': "\x0c\x0c \n \n \n1 \n \n \nUNITED STATES \nSECURITIES AND EXCHANGE COMMISSION \nWashington, D.C. 20549 \nFORM 10 – K \n[X] \nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 \n \nFor the Fiscal Year Ended September 30, 2021 \n[  ] \nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 \n \nCommission File No. 001-11703 \nGENCOR INDUSTRIES, INC. \n(Exact name of registrant as specified in its charter) \n \nDelaware \n59-0933147 \n(State or other jurisdiction of \n(I.R.S. Employer Identification No.) \nincorporation or organization) \n5201 North Orange Blossom Trail \nOrlando, Florida 32810 \n(Address of principal executive offices, including zip code) \nRegistrant’s telephone number, including area code:  (407) 290-6000 \nSECURITIES REGISTERED PURSUANT TO SECTION 12(b) OF THE ACT: \nTitle of Class \n \n \n \nTrading Symbol (s) \n \nName of Exchange on which Registered \nCommon Stock ($.10 Par Value) \nG

### Pre-processing and loading into the database

In [11]:
# Create an instance of PreProcessor.
# Documents are cleaned (empty lines, whitespace, headers/footers) and split by word
# with split_length=200 and split_overlap=10, without respecting sentence boundaries.

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_overlap=10,
    split_respect_sentence_boundary=False,
)

# Run the preprocessor over every loaded document.
doc_processed = preprocessor.process(document_list)

# Clear the document store before loading documents, so this cell is called
# against an empty index each time it is run.
document_store.delete_documents()

# Upload the processed documents to the document store.
document_store.write_documents(doc_processed)
    

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

### Create the retriever and PromptTemplate

In [12]:
from haystack.nodes import BM25Retriever

# BM25 retriever over document_store, configured with top_k=5.
retriever = BM25Retriever(document_store=document_store, top_k=5)

In [6]:
from haystack.nodes import PromptTemplate

# Create an instance of PromptTemplate named "question-answering".
# The prompt text carries two placeholders: {join(documents)} for the context
# and {query} for the question, and asks for an answer of at most 50 words.
qa_prompt = PromptTemplate(
            name="question-answering",
            prompt_text="""Given the context please answer the question. 
            Your answer should be in your own words and be no longer than 50 words. 
            Context: {join(documents)}; 
            Question:  {query}; Answer:""",
                    )

# Disabled alternative generator. NOTE(review): it refers to lfqa_prompt, which is not
# defined in the visible cells, and PromptNode is only imported in a later cell.
#prompt_node = PromptNode(model_name_or_path="google/flan-t5-large", default_prompt_template=lfqa_prompt)

### Using GPT

In [7]:
from haystack.nodes import PromptNode

# Generator node: "gpt-3.5-turbo" with qa_prompt as its default template.
# The API key is read from the project config and "stream" is switched on via model_kwargs.
generator = PromptNode(
    "gpt-3.5-turbo",
    api_key=config.API_KEY,
    default_prompt_template=qa_prompt,
    model_kwargs={"stream": True},
)


### Retriever-Generator pipeline

In [8]:
# Wire the pipeline: the retriever is fed from "Query" and the generator
# is fed from the retriever node.
pipe = Pipeline()
pipe.add_node(
    component=retriever,
    name="retriever",
    inputs=["Query"],
)
pipe.add_node(
    component=generator,
    name="generator",
    inputs=["retriever"],
)

In [13]:
# Example queries should mention a company name, e.g.
# "What is the company gencor-industries dedicated to?"
output = pipe.run(
    query="How much was the Cash and cash equivalents at Beginning of year 2021 and End of year of Gencor Industries?",
    # Passing params is optional: the retriever was already created with top_k=5;
    # this run passes top_k=2 for the retriever node instead.
    params={"retriever": {"top_k": 2}},
)
#print(output['results'])

The Cash and cash equivalents at Beginning of year 2021 was $35,584,000 and at End of year was $23,232,000 for Gencor Industries.

In [22]:
# Inspect the invocation context returned by the pipeline run.
# Only the last expression of a cell is rendered, so the length of the first
# prompt is printed explicitly; as a bare expression it would be computed and
# silently discarded.
print(len(output['invocation_context']['prompts'][0]))
output['invocation_context']

{'query': 'How much was the Cash and cash equivalents at Beginning of year 2021 and End of year of Gencor Industries?',
 'documents': [<Document: {'content': 'from financing activities:\n\nProceeds from stock option exercises\n264,000\n103,000\n\nCash flows provided by financing activities\n264,000\n103,000\n\nNet increase (decrease) in cash and cash equivalents\n(12,352,000)\n25,282,000\nCash and cash equivalents at:\n\nBeginning of year\n35,584,000\n10,302,000\nEnd of year\n$23,232,000\n$35,584,000\n\nNon-cash investing and financing activities:\n\nOperating lease right-of-use assets\n$248,000\n$942,000\nOperating lease liabilities\n$248,000\n$942,000\n\nSee accompanying Notes to Consolidated Financial Statements\n\n30\n\nGENCOR INDUSTRIES, INC.\nConsolidated Statements of Cash Flows\nFor the Years Ended September 30, 2021 and 2020\n\n2021\n2020\n\nCash flows from operating activities:\n\nNet income\n$5,805,000\n$5,531,000\nAdjustments to reconcile net income to cash provided by oper