In [None]:
%%bash
# Install the dependencies for this notebook (run once per fresh runtime).
# Upgrade pip first so the Git-based install below uses a current pip.
pip install --upgrade pip
# Pin Haystack to the commit this notebook was last executed against. The
# unpinned default branch keeps moving, and the cells below depend on the
# farm-haystack modules they import (haystack.nodes, haystack.document_stores,
# haystack.pipelines, haystack.utils). The requirement is quoted so the shell
# never treats "[colab]" as a glob pattern.
pip install "git+https://github.com/deepset-ai/haystack.git@057a8c0b4f22ed858f2ac0a41a4f0dd2efa528e0#egg=farm-haystack[colab]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-install-lpok8by0/farm-haystack_6d706714d41f4bfab8a71c4040198259
  Resolved https://github.com/deepset-ai/haystack.git to commit 057a8c0b4f22ed858f2ac0a41a4f0dd2efa528e0
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started

  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-install-lpok8by0/farm-haystack_6d706714d41f4bfab8a71c4040198259
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.48.2 requires grpcio>=1.48.2, but you have grpcio 1.47.0 which is incompatible.


In [None]:
import logging

# Root logger: emit warnings and above, one compact line per record.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# Loggers under the "haystack" namespace additionally report INFO messages.
logging.getLogger("haystack").setLevel(logging.INFO)

## Document Store


In [None]:
# In-Memory Document Store
# Create one store with default settings. `document_store` is the object the
# later cells write the converted documents to and hand to the retriever.
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


## Preprocessing of documents

Haystack provides a customizable pipeline for:
 - converting files into texts
 - cleaning texts
 - splitting texts
 - writing them to a Document Store

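In this tutorial, we read plain-text files from a local folder, apply a basic cleaning function, and index them in the in-memory Document Store created above. (The original tutorial downloads Wikipedia articles on Game of Thrones; that download step is left commented out in the cell below.)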

In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http


# Folder that holds the text files we want to query.
doc_dir = "sample_data/Data"
# Optional: uncomment the next two lines to download and unpack a sample archive into `doc_dir`.
# s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert the files under `doc_dir` into Document objects that can be indexed.
# `clean_func` is optional and is applied to each text (e.g. to remove footers);
# it must take a str as input and return a str.
docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# If your texts come from a different source (e.g. a DB), you can skip
# convert_files_to_docs() and build the documents yourself.
# The default dictionary format is: {"name": "<some-document-name>", "content": "<the-actual-text>"}

# Peek at the first 3 entries ...
print(docs[:3])

# ... then write everything to the document store.
document_store.write_documents(docs)

INFO:haystack.utils.preprocessing:Converting sample_data/Data/15.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/1.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/14.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/9.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/2.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/11.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/3.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/12.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/10.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/6.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/8.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/7.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/5.txt
INFO:haystack.utils.preprocessing:Converting sample_data/Data/13.txt
INFO:haystack.utils.preprocessing:Converti

[<Document: {'content': 'In 1804, Napoleon Bonaparte crowned himself Emperor of France.\nHe set out to conquer neighbouring European countries, dispossessing\ndynasties and creating kingdoms where he placed members of his family.\nNapoleon saw his role as a moderniser of Europe. He introduced many\nlaws such as the protection of private property and a uniform system of\nweights and measures provided by the decimal system. Initially, many\nsaw Napoleon as a liberator who would bring freedom for the people.\nBut soon the Napoleonic armies came to be viewed everywhere as an\ninvading force. He was finally defeated at Waterloo in 1815. Many of his\nmeasures that carried the revolutionary ideas of liberty and modern laws\nto other parts of Europe had an impact on people long after Napoleon\nThe ideas of liberty and democratic rights were the most important\nlegacy of the French Revolution. These spread from France to the\nrest of Europe during the nineteenth century, where feudal systems we

## Initialize Retriever, Reader & Pipeline

### Retriever

Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.

With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more retrievers, please refer to Tutorial 1.

In [None]:
# An in-memory TfidfRetriever based on Pandas dataframes
# Build the retriever on top of `document_store`.
# NOTE(review): the recorded output shows it scanning the store's documents at
# construction time, so run the indexing cell before this one — confirm.
from haystack.nodes import TfidfRetriever

retriever = TfidfRetriever(document_store=document_store)

INFO:haystack.nodes.retriever.sparse:Found 15 candidate paragraphs from 15 docs in DB


### Reader

A Reader scans the texts returned by the Retriever in detail and extracts the k best answers. Readers are based
on powerful, but slower, deep learning models.

Haystack currently supports Readers based on the frameworks FARM and Transformers.
With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

**Here:** a medium-sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)

**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)

**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)

**Hint:** You can adjust how readily the model returns "no answer possible" with the `no_ans_boost` parameter. Higher values mean the model prefers "no answer possible".

#### FARMReader

In [None]:
from haystack.nodes import FARMReader


# `model_name_or_path` accepts either a local model directory or the name of a
# QA model on Hugging Face's model hub (https://huggingface.co/models).
# NOTE(review): `use_gpu=True` is requested, yet the recorded run logged
# "Using devices: CPU"; presumably it falls back to CPU when no GPU exists.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)


Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.


Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


#### TransformersReader

Alternatively, we can use a Transformers reader:

In [None]:
# Optional alternative: uncomment both lines below to replace `reader` with a TransformersReader.
# NOTE(review): `use_gpu=-1` looks like an older integer device convention — confirm against the installed Haystack version.
# from haystack.nodes import FARMReader, TransformersReader
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

### Pipeline

With a Haystack `Pipeline` you can combine your building blocks into a search pipeline.
Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelines).

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Wire the reader and retriever created above into the predefined extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

 Voilà! Ask a question!

In [None]:
# `params` sets, per node, how many candidates are returned:
# the Retriever passes on its top 10 documents, the Reader keeps its 5 best answers.
# A higher retriever top_k tends to give better answers, but also makes the query slower.
prediction = pipe.run(
    query="Which battle was fought in 1815",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████| 1/1 [00:20<00:00, 20.05s/ Batches]


In [None]:
# You can try asking more questions:
# NOTE(review): the sample questions below are about Game of Thrones, while the
# documents indexed in this notebook appear to be about the French Revolution
# (see the recorded outputs) — adapt the queries to your own documents.

# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})

In [None]:
# Now you can either print the object directly...
from pprint import pprint
pprint(prediction)
# Sample output:
# NOTE(review): the sample below is for a different query ('Who is the father of Arya Stark?')
# than the one run in this notebook — presumably carried over from the original tutorial.
# Treat it only as an illustration of the structure of the result dictionary.
# {
#     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,
#                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,
#                  ...
#                ]
#     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
#                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
#                    ...
#                  ],
#     'no_ans_gap':  11.688868522644043,
#     'node_id': 'Reader',
#     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
#     'query': 'Who is the father of Arya Stark?',
#     'root_node': 'Query'
# }

{'answers': [<Answer {'answer': 'Waterloo', 'type': 'extractive', 'score': 0.9419114589691162, 'context': ' be viewed everywhere as an\ninvading force. He was finally defeated at Waterloo in 1815. Many of his\nmeasures that carried the revolutionary ideas of ', 'offsets_in_document': [{'start': 589, 'end': 597}], 'offsets_in_context': [{'start': 71, 'end': 79}], 'document_id': '60df330d3733f1888c5dfdd50ac6f46b', 'meta': {'name': '15.txt'}}>,
             <Answer {'answer': 'the Bastille', 'type': 'extractive', 'score': 0.07627208530902863, 'context': 'arded ammunition. In the armed fight that followed,\nthe commander of the Bastille was killed and the prisoners released –\nthough there were only seven', 'offsets_in_document': [{'start': 608, 'end': 620}], 'offsets_in_context': [{'start': 69, 'end': 81}], 'document_id': '616bdfbc1340850459e42d0d01e34eb5', 'meta': {'name': '1.txt'}}>,
             <Answer {'answer': 'revolts against\nincreasing taxes and food scarcity', 'type': 'extracti

In [None]:
# ...or use a util to simplify the output
# ...or use a util to simplify the output
from haystack.utils import print_answers


# Change `minimum` to `medium` or `all` to control the level of detail
# With "minimum", the recorded output lists only each answer and its context.
print_answers(prediction, details="minimum")


Query: French Society During the Late Eighteenth Century
Answers:
[   {   'answer': 'call a meeting of the Estates General',
        'context': 'pose taxes according to his will alone. Rather he had to\n'
                   'call a meeting of the Estates General which would then '
                   'pass his\n'
                   'proposals for new taxes. The '},
    {   'answer': 'French society in the eighteenth century was divided\n'
                  'into three estates',
        'context': 'n this measure would not\n'
                   'have sufficed. French society in the eighteenth century '
                   'was divided\n'
                   'into three estates, and only members of the third estate '},
    {   'answer': 'rest of Europe',
        'context': 'nt\n'
                   'legacy of the French Revolution. These spread from France '
                   'to the\n'
                   'rest of Europe during the nineteenth century, where feudal '
                   'sy