In [20]:
import logging

# Message layout used by the root handler: "<LEVEL> - <logger name> -  <message>".
log_format = "%(levelname)s - %(name)s -  %(message)s"

# The root logger is configured at WARNING ...
logging.basicConfig(level=logging.WARNING, format=log_format)

# ... while the "haystack" logger is lowered to INFO so its progress
# messages show up in the cell outputs.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)


In [27]:
import os

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.document_stores import BaseDocumentStore
# Connect to the Elasticsearch instance that holds the indexed documents.
# The host is read from the ELASTICSEARCH_HOST environment variable and falls
# back to "localhost", so the cell works unchanged for a local instance and
# can be pointed at another host without editing the notebook.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# The port is passed as an int rather than the string "9200".
# Documents are written to / read from the "bom123" index.
document_store = ElasticsearchDocumentStore(host=host, port=9200, index="bom123")

In [22]:
from haystack.utils import fetch_archive_from_http

# Local directory the archive is unpacked into; later cells list this
# directory to find the files to index.
doc_dir = "data/build_a_scalable_question_answering_system"

# Location of the zipped Game of Thrones wiki text files.
archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"

# Kept as the last expression so the call's return value is shown as the
# cell output (the saved run shows `False` next to a "Found data stored in
# ..." log message).
fetch_archive_from_http(output_dir=doc_dir, url=archive_url)


INFO - haystack.utils.import_utils -  Found data stored in 'data/build_a_scalable_question_answering_system'. Delete this first if you really want to fetch new data.


False

In [28]:
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor

# Building blocks for indexing: a converter for the text files, the pipeline
# container they will be added to, and a preprocessor.
text_converter = TextConverter()
indexing_pipeline = Pipeline()

# PreProcessor options collected in one mapping: all three cleaning switches
# are on, and documents are split by word with a split length of 200, an
# overlap of 20, and sentence boundaries respected.
preprocessor_settings = {
    "clean_whitespace": True,
    "clean_header_footer": True,
    "clean_empty_lines": True,
    "split_by": "word",
    "split_length": 200,
    "split_overlap": 20,
    "split_respect_sentence_boundary": True,
}
preprocessor = PreProcessor(**preprocessor_settings)


In [24]:
import os

# Wire the indexing pipeline as a chain:
#   File -> TextConverter -> PreProcessor -> DocumentStore
# Each row is (component, node name, names of the nodes feeding it).
indexing_nodes = (
    (text_converter, "TextConverter", ["File"]),
    (preprocessor, "PreProcessor", ["TextConverter"]),
    (document_store, "DocumentStore", ["PreProcessor"]),
)
for node, node_name, upstream in indexing_nodes:
    indexing_pipeline.add_node(component=node, name=node_name, inputs=upstream)


In [None]:
# Collect the files to index from doc_dir.
# - os.path.join builds the paths instead of manual "/" concatenation.
# - Only regular files are kept: os.listdir also returns sub-directories,
#   which are not valid entries for the pipeline's `file_paths` input.
# - Sorting makes the indexing order reproducible, because os.listdir
#   returns entries in arbitrary order.
files_to_index = sorted(
    path
    for path in (os.path.join(doc_dir, entry) for entry in os.listdir(doc_dir))
    if os.path.isfile(path)
)

# Push every collected file through TextConverter -> PreProcessor -> DocumentStore.
indexing_pipeline.run_batch(file_paths=files_to_index)

In [26]:
from haystack.nodes import BM25Retriever

# Retriever bound to the document store created earlier; it becomes the first
# node ("Retriever") of the querying pipeline.
retriever = BM25Retriever(document_store=document_store)


In [27]:
from haystack.nodes import FARMReader

# Reader built from the "deepset/roberta-base-squad2" model; the log output of
# this cell shows it being loaded from the model hub.
# NOTE(review): use_gpu=True is requested, but the captured log reports
# "Using devices: CPU - Number of GPUs: 0", so this run executed on CPU.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 75.3kB/s]
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
Downloading model.safetensors: 100%|██████████| 496M/496M [00:30<00:00, 16.5MB/s] 
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
Downloading (…)okenizer_config.json: 100%|██████████| 79.0/79.0 [00:00<00:00, 8.34kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 7.91MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 23.1MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 226kB/s]
INFO - haystack.modeling

In [28]:
from haystack import Pipeline

# Querying pipeline wired as a chain: Query -> Retriever -> Reader.
# Each row is (component, node name, names of the nodes feeding it).
querying_pipeline = Pipeline()
for node, node_name, upstream in (
    (retriever, "Retriever", ["Query"]),
    (reader, "Reader", ["Retriever"]),
):
    querying_pipeline.add_node(component=node, name=node_name, inputs=upstream)


In [29]:
# Question to ask, and per-node parameters keyed by the node names used when
# the querying pipeline was assembled: top_k is 10 for "Retriever" and 5 for
# "Reader".
question = "Who is the father of Arya Stark?"
node_params = {
    "Retriever": {"top_k": 10},
    "Reader": {"top_k": 5},
}

prediction = querying_pipeline.run(query=question, params=node_params)


Inferencing Samples: 100%|██████████| 1/1 [00:05<00:00,  5.38s/ Batches]


In [30]:
from pprint import pprint

# Dump the raw prediction: a dict whose 'answers' entry holds the full Answer
# objects (answer text, score, context, offsets, document ids, meta).
pprint(prediction)

{'answers': [<Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.9933727979660034, 'context': "s Nymeria after a legendary warrior queen. She travels with her father, Eddard, to King's Landing when he is made Hand of the King. Before she leaves,", 'offsets_in_document': [{'start': 207, 'end': 213}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_ids': ['9e3c863097d66aeed9992e0b6bf1f2f4'], 'meta': {'_split_id': 4, '_split_overlap': [{'range': [0, 266], 'doc_id': '241c8775e39c6c937c67bbd10ccc471c'}, {'range': [960, 1200], 'doc_id': '87e8469dcf7354fd2a25fbd2ba07c543'}]}}>,
             <Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.9753610491752625, 'context': "k in the television series.\n\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's h", 'offsets_in_document': [{'start': 630, 'end': 633}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_ids': ['7d3360fa29130e69ea6b2

In [31]:
from haystack.utils import print_answers

# Condensed view of the same prediction: with details="minimum" the output
# lists the query and, for each answer, only its text and context.
print_answers(prediction, details="minimum")  # Choose from `minimum`, `medium` and `all`


'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Ned',
        'context': 'k in the television series.\n'
                   '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's h"},
    {   'answer': 'Lord Eddard Stark',
        'context': 'rk daughters.\n'
                   '\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Joffrey',
        'context': 'Mycah, sparring in the woods with broomsticks.  Arya '
    