In [None]:
!pip install flask
!pip install flask_ngrok


In [None]:
!apt-get install xpdf

In [None]:
mkdir -p data/amazon

In [None]:
!wget -P ./data/amazon/ https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf
!wget -P ./data/amazon/ https://s2.q4cdn.com/299287126/files/doc_financials/2020/q3/AMZN-Q3-2020-Earnings-Release.pdf
!wget -P ./data/amazon/ https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf
!wget -P ./data/amazon/ https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf

In [None]:
# Sanity check: list the downloaded files (four PDFs are expected).
ls data/amazon

Amazon-Q4-2020-Earnings-Release.pdf  AMZN-Q3-2020-Earnings-Release.pdf
AMZN-Q1-2020-Earnings-Release.pdf    Q2-2020-Amazon-Earnings-Release.pdf


In [None]:
# Show the GPU visible to this runtime; the reader model below is created with use_gpu=True.
!nvidia-smi

In [None]:
# Install the latest master of Haystack
# NOTE(review): this install is unpinned, so the installed API can differ from
# run to run. The cells below import from module paths such as
# `haystack.preprocessor`, `haystack.document_store` and `haystack.Finder`;
# consider pinning to the tag/commit this notebook was written against.
!pip install git+https://github.com/deepset-ai/haystack.git
# NOTE(review): urllib3 is pinned to 1.25.4, presumably to work around a
# dependency conflict — confirm it is still required.
!pip install urllib3==1.25.4

In [None]:
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [None]:
# In Colab / No Docker environments: Start Elasticsearch from source
# 1) download the Elasticsearch 7.9.2 Linux tarball quietly (-q),
# 2) unpack it into ./elasticsearch-7.9.2,
# 3) hand the directory to the `daemon` user, because the server is launched
#    under that user in the next cell.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
import os
import time
import urllib.request
from subprocess import Popen, PIPE, STDOUT

# Launch Elasticsearch in the background under uid 1 (the `daemon` user),
# presumably because Elasticsearch will not run as root — confirm.
# NOTE(review): stdout/stderr are piped but never read in this notebook; if the
# server logs enough to fill the pipe buffer it could stall. Consider
# redirecting the output to a log file instead.
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

# Wait until ES has started. Instead of sleeping for a fixed 30 seconds (too
# short on a slow machine, wasted time on a fast one) poll the cluster-health
# endpoint until it reports at least "yellow", and fail loudly if the server
# process dies or never becomes ready.
ES_HEALTH_URL = "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s"
ES_STARTUP_TIMEOUT_S = 120

es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
    if es_server.poll() is not None:
        raise RuntimeError(
            "Elasticsearch exited during startup with code %s" % es_server.returncode
        )
    try:
        with urllib.request.urlopen(ES_HEALTH_URL, timeout=5) as es_response:
            if es_response.status == 200:
                break
    except OSError:
        # Connection refused / reset, or an HTTP error status (e.g. the health
        # wait timed out): the server is not ready yet, so keep polling.
        pass
    if time.monotonic() >= es_deadline:
        raise TimeoutError(
            "Elasticsearch did not become ready within %d seconds" % ES_STARTUP_TIMEOUT_S
        )
    time.sleep(1)

In [None]:
# Open a connection to the Elasticsearch instance running on localhost;
# documents are stored in (and later retrieved from) the "document" index.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

In [None]:
directory = "data/amazon/"
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.preprocessor.preprocessor import PreProcessor

# PDF -> text converter, configured to drop numeric tables and to accept
# German or English documents.
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["de","en"])

# Cleans the extracted text and splits it into passages of up to 200 words
# without cutting through sentences.
processor = PreProcessor(clean_empty_lines=True,
                         clean_whitespace=True,
                         clean_header_footer=True,
                         split_by="word",
                         split_length=200,
                         split_respect_sentence_boundary=True)

# Only pick up PDF files, in sorted order: os.listdir() returns every entry in
# the directory in arbitrary order, so stray files (e.g. "*.pdf.1" copies left
# by a repeated download) would otherwise be converted and indexed as well.
pdf_filenames = sorted(
    name for name in os.listdir(directory) if name.lower().endswith(".pdf")
)

docs = []
for filename in pdf_filenames:

    # Run the conversion on each file (PDF -> 1x doc)
    d = converter.convert(os.path.join(directory, filename), meta=None)

    # clean and split each dict (1x doc -> multiple docs)
    d = processor.process(d)
    docs.extend(d)

# Let's have a look at the first 3 entries:
print(docs[:3])

In [None]:
# Now, let's write the dicts containing documents to our DB.
# The write is done exactly once: this cell used to call write_documents(docs)
# twice, which stores every passage a second time, so retrieval would
# presumably return duplicate hits.
document_store.write_documents(docs)

In [None]:
# Retriever that fetches candidate passages from the Elasticsearch document store.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Load the `deepset/roberta-base-squad2` reader model; use_gpu=True requests GPU inference.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [None]:
# Combine the reader and the retriever into a single extractive QA pipeline.
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# Ask a sample question. top_k_retriever / top_k_reader presumably set how many
# passages are retrieved and how many answers are returned — confirm against the Haystack docs.
prediction = pipe.run(query="Who is the new CEO?", top_k_retriever=20, top_k_reader=5)

In [None]:
# Print the answers of the sample query using the "minimal" detail level.
print_answers(prediction, details="minimal")

In [None]:
import json
import os
import logging
from flask_ngrok import run_with_ngrok
from flask_cors import CORS
from flask import Flask, request, jsonify
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts
from haystack.reader.farm import FARMReader
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.retriever.dense import DensePassageRetriever
from haystack.retriever.sparse import ElasticsearchRetriever

# Application settings: create the Flask app that the /search route below is
# registered on, and apply flask_cors to it (presumably so browser front-ends
# on other origins can call the API — confirm).
app = Flask(__name__)
CORS(app)


In [None]:
@app.route('/search',methods=['GET', 'POST'])
def search():
    """Return the top answers for a question.

    The question is read from the ``query`` field of the JSON request body or,
    if no usable JSON body is present (e.g. a plain GET request), from the
    ``query`` URL parameter.

    Relies on the notebook-level ``reader`` and ``retriever`` objects created
    in earlier cells.

    Returns:
        A JSON string with keys ``status``, ``message`` and ``result`` (list of
        answer strings). When no question is supplied the response carries
        ``status: "error"``, an empty ``result`` and HTTP status 400.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so such requests can be reported as a clean 400 below.
    payload = request.get_json(silent=True)
    question = payload.get('query') if isinstance(payload, dict) else None
    if question is None:
        question = request.args.get('query')

    if not isinstance(question, str) or not question.strip():
        error_body = {'status': 'error',
                      'message': "Missing or empty 'query' parameter",
                      'result': []}
        return json.dumps(error_body), 400

    # Finder sticks together reader and retriever
    # in a pipeline to answer our actual questions.
    # (The retriever already holds the document store connection, so no new
    # ElasticsearchDocumentStore is opened per request.)
    finder = Finder(reader, retriever)

    prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=5)
    answer = [res['answer'] for res in prediction['answers']]

    # NOTE(review): the JSON text is returned as a plain string, so Flask serves
    # it as text/html; switching to jsonify() would set application/json but
    # changes the Content-Type clients see — confirm with consumers first.
    return json.dumps({'status':'success','message': 'Process succesfully', 'result': answer})


In [None]:
# Hook flask_ngrok into the app (presumably so the server is reachable through
# a public ngrok URL when app.run() starts — confirm), then start the server.
# app.run() keeps this cell running until the server is stopped.
run_with_ngrok(app)
app.run()