In [1]:
# One-time environment setup, left commented out (presumably so that re-running the
# notebook does not reinstall packages).
# The first command installs farm-haystack, using the PyTorch wheel index as an extra
# source of links; the second adds the OCR extras (the recorded output shows
# pytesseract and pdf2image being installed).
# NOTE(review): consider pinning the version (the recorded run resolved
# farm-haystack 1.9.1) and using `%pip install ...` so the install targets the
# kernel's environment — confirm before enabling.
#import sys

#!{sys.executable} -m pip install farm-haystack -f https://download.pytorch.org/whl/torch_stable.html
#!{sys.executable} -m pip install farm-haystack[ocr]

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting farm-haystack
  Using cached farm_haystack-1.9.1-py3-none-any.whl (733 kB)
Collecting mlflow
  Using cached mlflow-1.29.0-py3-none-any.whl (16.9 MB)
Collecting dill
  Using cached dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
Collecting more-itertools
  Using cached more_itertools-8.14.0-py3-none-any.whl (52 kB)
Collecting mmh3
  Using cached mmh3-3.0.0-cp39-cp39-win_amd64.whl (15 kB)
Collecting pydantic
  Using cached pydantic-1.10.2-cp39-cp39-win_amd64.whl (2.1 MB)
Collecting rapidfuzz<2.8.0,>=2.0.15
  Using cached rapidfuzz-2.7.0-cp39-cp39-win_amd64.whl (1.2 MB)
Collecting python-docx
  Using cached python_docx-0.8.11-py3-none-any.whl
Collecting elastic-apm
  Using cached elastic_apm-6.12.0-py3-none-any.whl
Collecting elasticsearch<7.11,>=7.7
  Using cached elasticsearch-7.10.1-py2.py3-none-any.whl (322 kB)
Collecting torch<1.13,>1.9
  Using cached https://download.pytorch.org/whl/cu116/torch-1.12.1%2Bcu116-

Collecting pdf2image>1.14
  Using cached pdf2image-1.16.0-py3-none-any.whl (10 kB)
Collecting pytesseract>0.3.7
  Using cached pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.16.0 pytesseract-0.3.10


In [1]:
import logging

# Root logger: only WARNING and above, printed as "<level> - <logger name> -  <message>".
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# Loggers under the "haystack" namespace are allowed down to INFO so that their
# progress messages show up in the cell outputs.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

### Test using InMemoryDocumentStore

In [2]:
from haystack.document_stores import InMemoryDocumentStore

# Document store for the first experiment: the preprocessed documents are written to it
# in the next cell and the TfidfRetriever further down is built on top of it.
document_store = InMemoryDocumentStore()

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [3]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from haystack.nodes import PreProcessor


# Folder holding the files we want to query (in the recorded run: six Watchman PDFs).
doc_dir = "Datasets/Watchman"

# Leftover from the Haystack tutorial this notebook is based on: download of the
# Game of Thrones sample corpus. Not used here, the documents are already on disk.
#s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
#fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert every file in doc_dir into Document objects that can be indexed.
# clean_func is applied to the text of each file; it must take a str and return a str.
# Here it turns every line break into a space.
# NOTE(review): with all newlines removed, split_paragraphs=True probably has nothing
# left to split on (the recorded run yields one document per file: 6 files -> 6 docs) — confirm.
all_docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=lambda text: text.replace("\n", " "),
    split_paragraphs=True,
)

# Cut the converted documents into chunks of 200 words with a 30-word overlap,
# respecting sentence boundaries, after cleaning empty lines and whitespace.
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=30,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)
docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

# `docs` is now a list of Document objects ready to be written to the document store.
# If your texts come from a different source (e.g. a DB), you can skip
# convert_files_to_docs() and build the documents yourself.

# Quick sanity check: show the first 3 chunks.
print(docs[:3])

# Index the chunks in the document store created earlier.
document_store.write_documents(docs)

INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\1-s2.0-S0002870317300881-main.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\Alternative to Warfarin – WATCHMAN Implant.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\How The WATCHMAN Device Works.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN Implant for Non-Valvular Afib Stroke Risk.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN Procedure vs. Blood Thinners.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN_DTP_Brochure.pdf


Preprocessing:   0%|          | 0/6 [00:00<?, ?docs/s]

n_files_input: 6
n_docs_output: 93
[<Document: {'content': 'The Assessment of the Watchman Device in Patients Unsuitable for Oral Anticoagulation (ASAP-TOO) trial David R. Holmes, MD, a Vivek Y. Reddy, MD, b Maurice Buchbinder, MD, c Kenneth Stein, MD, d Myriah Elletson d Martin W. Bergmann, MD, e Boris Schmidt, MD, f and Jacqueline Saw, MD, FRCPC g Rochester, Minneapolis, MN; New York, NY; Stanford, CA; Hamburg, Frankfurt, Germany; and British Columbia, Canada Background Oral anticoagulants (OACs) reduce stroke risks with nonvalvular atrial fibrillation (AF); however, they are underused because of absolute or relative contraindications due to real or perceived risk of bleeding. Although left atrial appendage closure is increasingly performed in OAC-ineligible patients, this has not been studied in a randomized controlled trial. Study objectives The ASAP-TOO study is designed to establish the safety and effectiveness of the Watchman left atrial appendage closure device in patients with

With the InMemoryDocumentStore we use the TfidfRetriever; when using Elasticsearch, the BM25Retriever can be used instead.

In [4]:
from haystack.nodes import TfidfRetriever

# TF-IDF retriever over the documents held in the in-memory document_store.
# NOTE(review): the recorded log line ("Found 93 candidate paragraphs from 93 docs in DB")
# suggests the documents are read when the retriever is constructed, so this cell should
# run after the documents have been written — confirm.
retriever = TfidfRetriever(document_store=document_store)

INFO - haystack.nodes.retriever.sparse -  Found 93 candidate paragraphs from 93 docs in DB


In [5]:
from haystack.nodes import FARMReader


# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models).
# Here: deepset/roberta-base-squad2, with GPU use requested (the recorded log shows CUDA:0).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 11 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0     0     0     0     0     0     0     0     0     0  
INFO - haystack.modeling.infer -  /w\   /w\   /w\   /w\   /w\   /w\   /w\   /|\   /w\   /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \   /'\   /'\   / \   / \   /'\   /'\   /'\   /'\   /'\ 


In [6]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever created above into a single question-answering
# pipeline; the following cells query it through pipe.run(...).
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# top_k sets how many candidates each node returns: 10 for the Retriever, 5 for the Reader.
# A higher Retriever top_k tends to give better answers, but the query gets slower.
query_params = {
    "Retriever": {"top_k": 10},
    "Reader": {"top_k": 5},
}
prediction = pipe.run(query="What is the size of the Watchman implant?", params=query_params)

In [None]:
# Now you can either print the object directly...
# (pretty-prints the raw result returned by pipe.run() in the previous cell)
from pprint import pprint

pprint(prediction)

In [None]:
# ...or use a util to simplify the output
from haystack.utils import print_answers


# `details` controls the level of detail; "medium" is used here.
# Change it to `minimum` or `all` for a different level of detail.
print_answers(prediction, details="medium")

### Test using ElasticsearchDocumentStore and BM25Retriever

In [2]:
# Recommended: Start Elasticsearch using Docker via the Haystack utility function.
# Left disabled in this notebook: the container was started from the command line
# instead (see the docker command in the note that follows this cell).
#from haystack.utils import launch_es

#launch_es()



The Elasticsearch Docker container was started manually from the command line:  
docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1

In [2]:
import http.client
import os
import time
import urllib.error
import urllib.request

# Give the Elasticsearch container time to start.
# Instead of always sleeping for a fixed 30 seconds, poll the cluster-health endpoint
# and continue as soon as the node answers. The wait is still bounded by roughly
# 30 seconds, and the notebook continues either way, so the worst case matches the
# previous fixed sleep.
ES_WAIT_TIMEOUT_S = 30   # upper bound for the whole wait
ES_POLL_INTERVAL_S = 1   # pause between two attempts

# Same host lookup as the document-store cell: ELASTICSEARCH_HOST, default localhost.
# Port 9200 is the HTTP port published by the docker command used to start the container.
es_host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
es_health_url = f"http://{es_host}:9200/_cluster/health?wait_for_status=yellow&timeout=1s"

wait_deadline = time.monotonic() + ES_WAIT_TIMEOUT_S
while time.monotonic() < wait_deadline:
    try:
        with urllib.request.urlopen(es_health_url, timeout=5) as health_response:
            if health_response.status == 200:
                break  # cluster is at least "yellow": ready to index and search
    except (OSError, http.client.HTTPException):
        # Connection refused/reset or an HTTP error status (URLError and HTTPError are
        # OSError subclasses): the node is still starting, try again.
        pass
    time.sleep(ES_POLL_INTERVAL_S)
else:
    # Deliberately not raising: keep the old "wait, then carry on" behaviour, but say so.
    print(f"Elasticsearch at {es_host}:9200 did not report ready within {ES_WAIT_TIMEOUT_S}s; continuing anyway.")

In [3]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Index that holds the documents of this experiment.
es_index = "document"

# Host where Elasticsearch is running: taken from ELASTICSEARCH_HOST, default localhost.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index=es_index,
)

# Remove whatever the index already contains (presumably so that documents from an
# earlier run are not mixed with the ones written below).
document_store.delete_documents(index=es_index)

                1. delete_all_documents() method is deprecated, please use delete_documents method
                For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/1045
                


In [4]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from haystack.nodes import PreProcessor


# Folder holding the files we want to query (in the recorded run: six Watchman PDFs).
doc_dir = "Datasets/Watchman"

# Leftover from the Haystack tutorial this notebook is based on: download of the
# Game of Thrones sample corpus. Not used here, the documents are already on disk.
#s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
#fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert every file in doc_dir into Document objects that can be indexed.
# clean_func is applied to the text of each file; it must take a str and return a str.
# Here it turns every line break into a space.
# NOTE(review): with all newlines removed, split_paragraphs=True probably has nothing
# left to split on (the recorded run yields one document per file: 6 files -> 6 docs) — confirm.
all_docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=lambda text: text.replace("\n", " "),
    split_paragraphs=True,
)

# Tag every document with the device it is about. The query cell further down filters
# on this meta field, and the recorded answers show it carried over to the chunks.
for doc in all_docs:
    doc.meta["device"] = "Watchman"

# Cut the converted documents into chunks of 200 words with a 30-word overlap,
# respecting sentence boundaries, after cleaning empty lines and whitespace.
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=30,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)
docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

# `docs` is now a list of Document objects ready to be written to the document store.
# If your texts come from a different source (e.g. a DB), you can skip
# convert_files_to_docs() and build the documents yourself.

# Quick sanity check: show the first 3 chunks.
print(docs[:3])

# Index the chunks in the Elasticsearch document store.
document_store.write_documents(docs)

INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\1-s2.0-S0002870317300881-main.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\Alternative to Warfarin – WATCHMAN Implant.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\How The WATCHMAN Device Works.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN Implant for Non-Valvular Afib Stroke Risk.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN Procedure vs. Blood Thinners.pdf
INFO - haystack.utils.preprocessing -  Converting Datasets\Watchman\WATCHMAN_DTP_Brochure.pdf


Preprocessing:   0%|          | 0/6 [00:00<?, ?docs/s]

n_files_input: 6
n_docs_output: 93
[<Document: {'content': 'The Assessment of the Watchman Device in Patients Unsuitable for Oral Anticoagulation (ASAP-TOO) trial David R. Holmes, MD, a Vivek Y. Reddy, MD, b Maurice Buchbinder, MD, c Kenneth Stein, MD, d Myriah Elletson d Martin W. Bergmann, MD, e Boris Schmidt, MD, f and Jacqueline Saw, MD, FRCPC g Rochester, Minneapolis, MN; New York, NY; Stanford, CA; Hamburg, Frankfurt, Germany; and British Columbia, Canada Background Oral anticoagulants (OACs) reduce stroke risks with nonvalvular atrial fibrillation (AF); however, they are underused because of absolute or relative contraindications due to real or perceived risk of bleeding. Although left atrial appendage closure is increasingly performed in OAC-ineligible patients, this has not been studied in a randomized controlled trial. Study objectives The ASAP-TOO study is designed to establish the safety and effectiveness of the Watchman left atrial appendage closure device in patients with

With the InMemoryDocumentStore we used the TfidfRetriever; with Elasticsearch we can use the BM25Retriever instead.

In [5]:
from haystack.nodes import BM25Retriever

# BM25 retriever on top of the Elasticsearch document store.
# Note: this rebinds `retriever`, which referred to the TfidfRetriever in the
# in-memory experiment earlier in the notebook.
retriever = BM25Retriever(document_store=document_store)

In [6]:
from haystack.nodes import FARMReader


# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models).
# Here: deepset/roberta-base-squad2, with GPU use requested (the recorded log shows CUDA:0).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 11 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0     0     0     0     0     0     0     0     0     0  
INFO - haystack.modeling.infer -  /w\   /w\   /w\   /w\   /w\   /w\   /w\   /|\   /w\   /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \   /'\   /'\   / \   / \   /'\   /'\   /'\   /'\   /'\ 


In [7]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader with the BM25 retriever into a question-answering pipeline.
# Note: this rebinds `pipe`, replacing the pipeline of the in-memory experiment.
pipe = ExtractiveQAPipeline(reader, retriever)

In [8]:
# top_k sets how many candidates each node returns: 15 for the Retriever, 5 for the Reader.
# A higher Retriever top_k tends to give better answers, but the query gets slower.
# "filters" restricts the query to documents whose meta field "device" is "Watchman",
# i.e. the tag added to every document before indexing.
run_params = {
    "Retriever": {"top_k": 15},
    "Reader": {"top_k": 5},
    "filters": {"device": ["Watchman"]},
}
prediction = pipe.run(
    query="What are the risks of having a Watchman implant procedure?",
    params=run_params,
)

Inferencing Samples: 100%|██████████| 1/1 [00:30<00:00, 30.55s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.25 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.90 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.88 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.87 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  8.09 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.54 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.85 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.14 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00

In [9]:
# Now you can either print the object directly...
# (pretty-prints the raw result of pipe.run(): each answer with its score, context,
# offsets, document id and meta, as seen in the recorded output)
from pprint import pprint

pprint(prediction)

{'answers': [<Answer {'answer': 'stroke risk and bleeding worry', 'type': 'extractive', 'score': 0.7876741886138916, 'context': 'dure is a trusted alternative that permanently reduces both stroke risk and bleeding worry. With almost 20 years of clinical and real-world experience', 'offsets_in_document': [{'start': 505, 'end': 535}], 'offsets_in_context': [{'start': 60, 'end': 90}], 'document_id': '9e35ababdb14c8de0420cc0e7d6e3ca', 'meta': {'device': 'Watchman', '_split_id': 0, 'name': 'WATCHMAN_DTP_Brochure.pdf'}}>,
             <Answer {'answer': 'accidental heart puncture, air embolism, allergic reaction, anemia, anesthesia risks', 'type': 'extractive', 'score': 0.6158087849617004, 'context': 's include but are not limited to accidental heart puncture, air embolism, allergic reaction, anemia, anesthesia risks, arrhythmias, AV (Arteriovenous)', 'offsets_in_document': [{'start': 546, 'end': 630}], 'offsets_in_context': [{'start': 33, 'end': 117}], 'document_id': '438ddeafa3fc513634e5e2

In [10]:
# ...or use a util to simplify the output
from haystack.utils import print_answers


# `details` controls the level of detail; "medium" is used here and, in the recorded
# output, prints answer, context and score for each answer.
# Change it to `minimum` or `all` for a different level of detail.
print_answers(prediction, details="medium")


Query: What are the risks of having a Watchman implant procedure?
Answers:
[   {   'answer': 'stroke risk and bleeding worry',
        'context': 'dure is a trusted alternative that permanently reduces '
                   'both stroke risk and bleeding worry. With almost 20 years '
                   'of clinical and real-world experience',
        'score': 0.7876741886138916},
    {   'answer': 'accidental heart puncture, air embolism, allergic reaction, '
                  'anemia, anesthesia risks',
        'context': 's include but are not limited to accidental heart '
                   'puncture, air embolism, allergic reaction, anemia, '
                   'anesthesia risks, arrhythmias, AV (Arteriovenous)',
        'score': 0.6158087849617004},
    {   'answer': 'patients should not be considered for the WATCHMAN Implant',
        'context': 'ue to the risk of having a medical procedure, patients '
                   'should not be considered for the WATCHMAN Implant if they 