In [None]:
%%bash
# Environment setup: upgrade pip, then install Haystack (with its "colab" extra)
# directly from the GitHub repository.
# NOTE(review): the install is unpinned, so a later re-run may pull a different Haystack version.

pip install --upgrade pip
pip install "git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]"

In [64]:
import logging
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from haystack.nodes import TfidfRetriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from pprint import pprint
from haystack.utils import print_answers
import pandas as pd

In [65]:
# Root logger stays at WARNING; only the "haystack" logger is lowered to INFO
# so the library's progress messages are shown.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
# In-memory document store: created empty here and filled by a later
# `write_documents` call; it is the store the retriever is built on.
document_store = InMemoryDocumentStore()

In [None]:
# Fetch the sample text archive into `doc_dir`, convert the files into Haystack
# documents (wiki-text cleaning, split into paragraphs), and index them.
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
doc_dir = "/content/text"
fetch_archive_from_http(output_dir=doc_dir, url=s3_url)
docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# Show the first few converted documents, then write everything to the store.
print(docs[:3])
document_store.write_documents(docs)

In [68]:
# TF-IDF retriever over the contents of `document_store`.
# NOTE(review): the saved log output for this cell reports a single document in the
# store, which does not look like the result of loading the downloaded archive —
# re-run on a fresh kernel to confirm what is actually indexed.
retriever = TfidfRetriever(document_store=document_store)

INFO:haystack.nodes.retriever.sparse:Found 1 candidate paragraphs from 1 docs in DB


In [None]:
# Extractive reader loaded from the `deepset/roberta-base-squad2` checkpoint; GPU use is requested.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [70]:
# Combine the reader and retriever into one extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [78]:
# Read QA_by_T5.csv and pull the three columns used for the comparison out as plain lists.
df = pd.read_csv('/content/QA_by_T5.csv')
questions, t5answers, contexts = (
    df[column].tolist() for column in ('Question', 'T5 Answer', 'Context')
)
# Collects one pipeline answer per question.
answers = []

In [72]:
# Sanity check: number of questions loaded from the CSV.
len(questions)

124

In [None]:
# Run every question through the QA pipeline and keep the first returned answer.
# `answers` is rebuilt from scratch here so that re-running this cell never
# appends duplicates (which would make it longer than `questions` and break the
# DataFrame construction that follows).
answers = []
for question in questions:
  prediction = pipe.run(
      query=question, params={"Retriever": {"top_k": 3}, "Reader": {"top_k": 3}}
  )
  # Only the first candidate is kept (presumably the highest-scoring one — confirm
  # against the Haystack docs). An empty string is stored when the pipeline returns
  # no candidates, so `answers` stays aligned with `questions` instead of raising
  # IndexError.
  candidates = prediction['answers']
  answers.append(str(candidates[0].answer) if candidates else "")

In [82]:
import pandas as pd

In [85]:
# Column name -> values mapping for the comparison table (insertion order = column order).
# NOTE(review): 'Heystack' looks like a typo for 'Haystack'; left unchanged because the
# column name ends up in the exported CSV.
dataframe = dict(zip(
    ('Context', 'Question', 'T5 Answer', 'Heystack Answer'),
    (contexts, questions, t5answers, answers),
))

In [86]:
# Build the comparison table from the column mapping.
# NOTE: this rebinds `df`, replacing the frame previously read from QA_by_T5.csv.
df = pd.DataFrame(dataframe)

In [87]:
# Preview the first rows of the comparison table.
df.head()

Unnamed: 0,Context,Question,T5 Answer,Heystack Answer
0,"Egypt,officially the Arab Republic of Egypt, is a transcontinental country s...",What land bridge forms the border between Africa and Asia?,Sinai Peninsula,Sinai Peninsula
1,"Egypt,officially the Arab Republic of Egypt, is a transcontinental country s...",What is Egypt bordered by to the north?,the Mediterranean Sea,the Mediterranean Sea
2,"Egypt,officially the Arab Republic of Egypt, is a transcontinental country s...",What separates Egypt from Jordan and Saudi Arabia?,Gulf of Aqaba,The Gulf of Aqaba
3,"Egypt,officially the Arab Republic of Egypt, is a transcontinental country s...",What is the second-largest city in Egypt?,Alexandria,Alexandria
4,"Egypt,officially the Arab Republic of Egypt, is a transcontinental country s...",How many people live in Egypt?,100 million,100 million


In [89]:
# Save the comparison table as CSV.
# NOTE(review): without index=False the row index is written as an extra unnamed
# first column — confirm whether downstream readers expect it.
df.to_csv('/content/Answering_by_Heystack.csv')