In [1]:
import logging

# Default every logger to WARNING, then opt the "haystack" logger into INFO so
# that its own progress messages are still displayed.
logging.basicConfig(level=logging.WARNING, format="%(levelname)s - %(name)s -  %(message)s")
logging.getLogger("haystack").setLevel(level=logging.INFO)

# Install Elasticsearch

In [None]:
## ADD GPG and package source
# wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
# echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-7.x.list

## INSTALL
# sudo apt-get update && sudo apt-get install elasticsearch

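## INIT (register the service, then start it; run the `stop` line only to shut Elasticsearch down afterwards)
# sudo update-rc.d elasticsearch defaults 95 10
# sudo -i service elasticsearch start
# sudo -i service elasticsearch stop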

## CHECK
# curl http://localhost:9200/

In [8]:
import pandas as pd
import os, json

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers
from haystack import Document

from haystack.nodes.retriever import EmbeddingRetriever
from haystack.nodes import TableReader

In [3]:
# Elasticsearch location: use ELASTICSEARCH_HOST when it is set, otherwise fall back to localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Name of the index that the document store is bound to.
document_index = "document"
document_store = ElasticsearchDocumentStore(
    host=host,
    index=document_index,
)



In [4]:
# Remote archive holding the tutorial data, and the local directory it is fetched into.
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
doc_dir = "data/tutorial15"
# Left as the last expression so the call's return value is shown as the cell output.
fetch_archive_from_http(output_dir=doc_dir, url=s3_url)

INFO - haystack.utils.import_utils -  Found data stored in 'data/tutorial15'. Delete this first if you really want to fetch new data.


False

In [5]:
def read_json_to_tables(filename):
    """Load a JSON file of tables and wrap each table in a Haystack ``Document``.

    The file must contain a JSON object that maps a document id to an object
    with two entries: ``"header"`` (the column names) and ``"data"`` (the rows).

    Args:
        filename: Path to the JSON file.

    Returns:
        A list with one ``Document`` per table, in file order. Each document has
        ``content_type="table"``, the JSON key as its ``id`` and a
        ``pandas.DataFrame`` as its ``content``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a table entry lacks ``"header"`` or ``"data"``.
    """
    # Decode explicitly as UTF-8: the tables contain non-ASCII text, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as tables_file:
        raw_tables = json.load(tables_file)

    # The file is fully parsed at this point, so it no longer needs to stay open
    # while the DataFrames are built.
    processed_tables = []
    for table_id, raw_table in raw_tables.items():
        table_df = pd.DataFrame(columns=raw_table["header"], data=raw_table["data"])
        processed_tables.append(Document(content=table_df, content_type="table", id=table_id))

    return processed_tables

tables = read_json_to_tables(f"{doc_dir}/tables.json")
document_store.write_documents(tables, index=document_index)

# Inspect the first table Document: its content, a separator line, then its meta field.
first_table = tables[0]
print(first_table.content)
print("=" * 50)
print(first_table.meta)


                Opponent    M    W    L  T  NR   Win% First  Last
0            Afghanistan    2    2    0  0   0  100.0  2012  2014
1              Australia   98   32   62  1   3  34.21  1975  2017
2             Bangladesh   35   31    4  0   0  88.57  1986  2015
3                 Canada    2    2    0  0   0  100.0  1979  2011
4                England   82   31   49  0   2  38.75  1974  2017
5              Hong Kong    2    2    0  0   0  100.0  2004  2008
6                  India  129   73   52  0   4   58.4  1978  2017
7                Ireland    7    5    1  1   0  78.57  2007  2016
8                  Kenya    6    6    0  0   0  100.0  1996  2011
9                Namibia    1    1    0  0   0  100.0  2003  2003
10           Netherlands    3    3    0  0   0  100.0  1996  2003
11           New Zealand  103   53   47  1   2  52.97  1973  2018
12              Scotland    3    3    0  0   0  100.0  1999  2013
13          South Africa   73   25   47  0   1  34.72  1992  2017
14        

In [6]:
# Build the embedding retriever on top of the document store and compute the
# embeddings for the documents stored in it.
retriever = EmbeddingRetriever(
    embedding_model="deepset/all-mpnet-base-v2-table",
    document_store=document_store,
)
document_store.update_embeddings(retriever=retriever)

# Try a sample query and show the content of the first document that comes back.
retrieved_tables = retriever.retrieve(query="Who won the Super Bowl?", top_k=5)

print(retrieved_tables[0].content)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model deepset/all-mpnet-base-v2-table
  return self.fget.__get__(instance, owner)()
INFO - haystack.document_stores.search_engine -  Updating embeddings for all 1000 docs ...
Batches: 100%|██████████| 32/32 [00:34<00:00,  1.08s/it] Docs/s]
Updating embeddings: 10000 Docs [00:42, 237.35 Docs/s]          
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.07it/s]


                     Year                   Coach              Super Bowl  \
0                    1966          Vince Lombardi                       I   
1                    1967          Vince Lombardi                      II   
2                    1996           Mike Holmgren                    XXXI   
3                    2010           Mike McCarthy                     XLV   
4  Total Super Bowls won:  Total Super Bowls won:  Total Super Bowls won:   

                  Location                Opponent  Score Record  
0  Los Angeles, California      Kansas City Chiefs  35–10   12–2  
1           Miami, Florida         Oakland Raiders  33–14  9–4–1  
2   New Orleans, Louisiana    New England Patriots  35–21   13–3  
3         Arlington, Texas     Pittsburgh Steelers  31–25   10–6  
4   Total Super Bowls won:  Total Super Bowls won:      4      4  


In [10]:
reader = TableReader(
    model_name_or_path="google/tapas-base-finetuned-wtq",
    max_seq_len=512,
)

# Look up one specific table in the document store by its id and display it.
table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
print(table_doc.content)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


                  Name        Program           Role       Salary     Year  \
0         Simon Cowell   The X Factor          Judge  $75 million  2012–13   
1       Britney Spears  American Idol    $25 million      2017–18     [15]   
2       Jennifer Lopez    $20 million        2011–12         [16]      nan   
3         Mariah Carey    $18 million        2012–13         [17]      nan   
4          Hugh Laurie          House  Gregory House  $15 million     2013   
5        Ryan Seacrest  American Idol           Host      2013–16     [14]   
6           Katy Perry   The X Factor          Judge         2012     [17]   
7          Miley Cyrus      The Voice          Coach  $13 million  2016–17   
8          Adam Levine        2016–18           [18]          nan      nan   
9        Blake Shelton        2016–18           [18]          nan      nan   
10  Christina Aguilera  $12.5 million           2013         [19]      nan   
11      Kelly Clarkson    $12 million           2018         [20



In [11]:
# Run the reader directly on the single table document, without any retrieval step.
prediction = reader.predict(
    documents=[table_doc],
    query="Who played Gregory House in the series House?",
)
print_answers(prediction, details="all")

'Query: Who played Gregory House in the series House?'
'Answers:'
[   <Answer {'answer': 'Hugh Laurie', 'type': 'extractive', 'score': 1.0, 'context': [['Name', 'Program', 'Role', 'Salary', 'Year', 'Ref.'], ['Simon Cowell', 'The X Factor', 'Judge', '$75 million', '2012–13', '[14]'], ['Britney Spears', 'American Idol', '$25 million', '2017–18', '[15]', 'nan'], ['Jennifer Lopez', '$20 million', '2011–12', '[16]', 'nan', 'nan'], ['Mariah Carey', '$18 million', '2012–13', '[17]', 'nan', 'nan'], ['Hugh Laurie', 'House', 'Gregory House', '$15 million', '2013', '[14]'], ['Ryan Seacrest', 'American Idol', 'Host', '2013–16', '[14]', 'nan'], ['Katy Perry', 'The X Factor', 'Judge', '2012', '[17]', 'nan'], ['Miley Cyrus', 'The Voice', 'Coach', '$13 million', '2016–17', '[18]'], ['Adam Levine', '2016–18', '[18]', 'nan', 'nan', 'nan'], ['Blake Shelton', '2016–18', '[18]', 'nan', 'nan', 'nan'], ['Christina Aguilera', '$12.5 million', '2013', '[19]', 'nan', 'nan'], ['Kelly Clarkson', '$12 million', '2

In [12]:
# Pick the first answer once, then report its text and its meta field.
top_answer = prediction["answers"][0]
print(f"Predicted answer: {top_answer.answer}")
print(f"Meta field: {top_answer.meta}")

Predicted answer: Hugh Laurie
Meta field: {'aggregation_operator': 'NONE', 'answer_cells': ['Hugh Laurie']}


In [13]:
# Initialize pipeline
from haystack import Pipeline

# Two-node pipeline: the query goes to the retriever, whose output feeds the reader.
table_qa_pipeline = Pipeline()
table_qa_pipeline.add_node(
    component=retriever,
    name="EmbeddingRetriever",
    inputs=["Query"],
)
table_qa_pipeline.add_node(
    component=reader,
    name="TableReader",
    inputs=["EmbeddingRetriever"],
)

# Alternative example query: "When was Guilty Gear Xrd : Sign released?"
prediction = table_qa_pipeline.run(
    query="Who played Gregory House in the series House?",
    params={"top_k": 3},
)
print_answers(prediction, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  9.46it/s]


'Query: Who played Gregory House in the series House?'
'Answers:'
[   {   'answer': 'Christopher Columbus',
        'context':          Year                               Title  \
0   1978-1979                        The Riordans   
1   1980-1982                             Bracken   
2        1981                           Strangers   
3        1981  The Search for Alexander the Great   
4        1982                       Joyce in June   
5        1983                              Wagner   
6        1985                Christopher Columbus   
7        1985        Mussolini : The Untold Story   
8        1993                   Intimate Portrait   
9        1994                          Screen Two   
10       1995                       Buffalo Girls   
11       1995                 Saturday Night Live   
12       1996                            Draíocht   
13       1997                             Glenroe   
14       1997         Weapons of Mass Distraction   
15       2000            

In [14]:
# Pick the pipeline's first answer once, then report its text and its meta field.
top_answer = prediction["answers"][0]
print(f"Predicted answer: {top_answer.answer}")
print(f"Meta field: {top_answer.meta}")

Predicted answer: Christopher Columbus
Meta field: {'aggregation_operator': 'NONE', 'answer_cells': ['Christopher Columbus']}


In [21]:
# Add 500 text passages to our document store.

def read_texts(filename):
    """Load a JSON file of text passages and wrap each one in a Haystack ``Document``.

    The file must contain a JSON object that maps a document id to the passage
    content.

    Args:
        filename: Path to the JSON file.

    Returns:
        A list with one ``Document`` per entry, in file order. Each document has
        ``content_type="text"``, the JSON key as its ``id`` and the JSON value
        as its ``content``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as passages_file:
        raw_passages = json.load(passages_file)

    return [
        Document(content=content, content_type="text", id=passage_id)
        for passage_id, content in raw_passages.items()
    ]


passages_path = f"{doc_dir}/texts.json"
passages = read_texts(passages_path)
document_store.write_documents(passages, index=document_index)

# update_existing_embeddings=False asks the store not to recompute embeddings
# that already exist.
document_store.update_embeddings(
    retriever=retriever,
    update_existing_embeddings=False,
)


Batches: 100%|██████████| 18/18 [00:09<00:00,  1.87it/s]Docs/s]
Updating embeddings: 10000 Docs [00:14, 703.44 Docs/s]         


In [22]:
from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers

# Reader used for the text passages.
text_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# For meaningful TableReader scores, use "deepset/tapas-large-nq-hn-reader" or
# "deepset/tapas-large-nq-reader" as the model. The trade-off is that these models
# cannot do aggregations over multiple table cells.
table_reader = TableReader(model_name_or_path="deepset/tapas-large-nq-hn-reader")

# Routing and joining nodes, both built with their default settings.
route_documents = RouteDocuments()
join_answers = JoinAnswers()


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 441kB/s]
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
Downloading model.safetensors: 100%|██████████| 496M/496M [00:07<00:00, 70.6MB/s] 
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
Downloading (…)okenizer_config.json: 100%|██████████| 79.0/79.0 [00:00<00:00, 75.7kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.21MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.58MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 2.57MB/s]
INFO - haystack.mo

In [23]:
# Pipeline layout: retriever -> router -> (text reader | table reader) -> joined answers.
text_table_qa_pipeline = Pipeline()
text_table_qa_pipeline.add_node(
    component=retriever,
    name="EmbeddingRetriever",
    inputs=["Query"],
)
text_table_qa_pipeline.add_node(
    component=route_documents,
    name="RouteDocuments",
    inputs=["EmbeddingRetriever"],
)
# The router's first output feeds the text reader and its second output feeds the table reader.
# NOTE(review): this relies on RouteDocuments' default split (presumably by content type) - confirm.
text_table_qa_pipeline.add_node(
    component=text_reader,
    name="TextReader",
    inputs=["RouteDocuments.output_1"],
)
text_table_qa_pipeline.add_node(
    component=table_reader,
    name="TableReader",
    inputs=["RouteDocuments.output_2"],
)
# Both readers' answers are merged by the join node.
text_table_qa_pipeline.add_node(
    component=join_answers,
    name="JoinAnswers",
    inputs=["TextReader", "TableReader"],
)


In [25]:
# Example query whose answer resides in a text passage
text_query = "Who was Thomas Alva Edison?"
predictions = text_table_qa_pipeline.run(query=text_query)

# Both text passages and tables can show up as contexts of the predicted answers.
print_answers(predictions, details="minimum")

Batches: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s]
Traceback (most recent call last):
  File "/home/catsmile/miniconda3/envs/py310/lib/python3.10/site-packages/urllib3/connection.py", line 174, in _new_conn
    conn = connection.create_connection(
  File "/home/catsmile/miniconda3/envs/py310/lib/python3.10/site-packages/urllib3/util/connection.py", line 95, in create_connection
    raise err
  File "/home/catsmile/miniconda3/envs/py310/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/catsmile/miniconda3/envs/py310/lib/python3.10/site-packages/elasticsearch/connection/http_urllib3.py", line 255, in perform_request
    response = self.pool.urlopen(
  File "/home/catsmile/miniconda3/envs/py310/lib/python3.10/site-packages/urllib3/connectionpool.py", line 

Exception: Exception while running node 'EmbeddingRetriever': ConnectionError(<urllib3.connection.HTTPConnection object at 0x7fa37278b520>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7fa37278b520>: Failed to establish a new connection: [Errno 111] Connection refused)
Enable debug logging to see the data that was passed when the pipeline failed.

: 

In [None]:
# Example query whose answer resides in a table
table_query = "Which country does the film Macaroni come from?"
predictions = text_table_qa_pipeline.run(query=table_query)

# Both text passages and tables can show up as contexts of the predicted answers.
print_answers(predictions, details="minimum")


In [None]:
from haystack import Label, MultiLabel, Answer

def read_labels(filename, tables):
    """Build one gold ``MultiLabel`` for every given document that has a label entry.

    The file must contain a JSON object that maps a document id to an object
    with a ``"query"`` and an ``"answer"`` entry.

    Args:
        filename: Path to the JSON label file.
        tables: Documents to look up by their ``id``. Despite the name, the
            function only reads ``.id`` and passes the object on as the label's
            document, so the notebook also calls it with text passages.

    Returns:
        A list of ``MultiLabel`` objects, each wrapping a single ``Label``, in
        the order of ``tables``. Documents whose id is not a key of the label
        file are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a label entry lacks ``"query"`` or ``"answer"``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as labels_file:
        labels_by_doc_id = json.load(labels_file)

    processed_labels = []
    for document in tables:
        if document.id not in labels_by_doc_id:
            continue
        raw_label = labels_by_doc_id[document.id]
        gold_label = Label(
            query=raw_label["query"],
            document=document,
            is_correct_answer=True,
            is_correct_document=True,
            answer=Answer(answer=raw_label["answer"]),
            origin="gold-label",
        )
        processed_labels.append(MultiLabel(labels=[gold_label]))
    return processed_labels


# Tables and passages are both matched against the same label file.
labels_path = f"{doc_dir}/labels.json"
table_labels = read_labels(labels_path, tables)
passage_labels = read_labels(labels_path, passages)

In [None]:
# Evaluate the combined pipeline on the table labels followed by the passage labels.
all_labels = table_labels + passage_labels
eval_results = text_table_qa_pipeline.eval(labels=all_labels, params={"top_k": 10})

In [None]:
# Compute the evaluation metrics from the eval run, then print them.
metrics = eval_results.calculate_metrics()
print(metrics)