### Setup

In [1]:
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
!pip install 'farm-haystack[faiss]'
!pip install Pillow==9.0.0
clear_output()

### Import Libraries

In [18]:
from haystack.nodes import PDFToTextConverter,PreProcessor
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline
import time

In [12]:
from google.colab import files

# Open the Colab upload widget; the result maps each uploaded file name to
# its raw bytes.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; upload the PDF before continuing.")

# Path the rest of the notebook reads the PDF from.
document = "/content/Machine_Vision_and_SNA.pdf"

# Colab saves a re-uploaded file under a new name (e.g. "... (1).pdf"), which
# would leave `document` pointing at an older copy. Writing the uploaded bytes
# to `document` explicitly guarantees the notebook processes the file that was
# just uploaded. If several files are selected, only the first one is used.
pdf_bytes = next(iter(uploaded.values()))
with open(document, "wb") as pdf_file:
    pdf_file.write(pdf_bytes)

Saving Machine_Vision_and_SNA.pdf to Machine_Vision_and_SNA (1).pdf


### Pre-processing

In [13]:
# Converter settings are gathered in one place and unpacked into the constructor.
converter_options = {
    "remove_numeric_tables": True,
    "valid_languages": ["en"],
}
pdf_converter = PDFToTextConverter(**converter_options)

# Metadata handed to the converter together with the uploaded PDF path.
document_meta = {"company": "Company_1", "processed": False}
converted = pdf_converter.convert(file_path=document, meta=document_meta)

In [14]:
# Split the converted text by word: split length 200, overlap 10 between
# neighbouring splits.
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=10,
)
preprocessed = preprocessor.process(converted)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [19]:
# A timestamp in the SQLite file name gives every run its own database file.
timestr = time.strftime("%Y%m%d-%H%M%S")
sqlite_url = f"sqlite:///{timestr}_document_store.db"

document_store = FAISSDocumentStore(
    sql_url=sqlite_url,
    faiss_index_factory_str="Flat",
    return_embedding=True,
)

# Clear whatever the store holds, then index the preprocessed chunks.
document_store.delete_documents()
document_store.write_documents(preprocessed)

Writing Documents:   0%|          | 0/63 [00:00<?, ?it/s]

### Retriever, Reader, and Embedding Update

In [26]:
# Model identifiers used by the two pipeline nodes.
embedding_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
reader_model_name = 'deepset/tinyroberta-squad2'

# Retriever bound to the document store built earlier in the notebook.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=embedding_model_name,
    model_format="sentence_transformers",
)

# Reader configured to run without a GPU.
reader = FARMReader(model_name_or_path=reader_model_name, use_gpu=False)

# Compute embeddings for the stored documents using the retriever.
document_store.update_embeddings(retriever)

Updating Embedding:   0%|          | 0/63 [00:00<?, ? docs/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
# Combine the reader and the retriever into one extractive question-answering pipeline.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

### Testing

In [28]:
%%time
questions = 'what is this research aim for?'

prediction = pipeline.run(query=questions, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 10}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:04<00:00,  5.00s/ Batches]

CPU times: user 5.18 s, sys: 70.8 ms, total: 5.25 s
Wall time: 5.24 s





In [29]:
# Print the first five answers of the prediction together with their context
# and score. Slicing (instead of indexing 0..4) keeps the cell from raising
# IndexError when the reader returns fewer than five answers.
print('Top 5 Answers: \n')

for rank, answer in enumerate(prediction['answers'][:5], start=1):
  print('Answer Number', rank)
  print('Q:', prediction['query'])
  print('A:', answer.answer)
  print('Context:', answer.context)
  print('score: ', answer.score)
  print('\n')

Top 5 Answers: 

Answer Number 1
Q: what is this research aim for?
A: capturing and analyzing tweets
Context: der, B. (2014). Programmed method: Developing
a toolset for capturing and analyzing tweets. Aslib Journal
of Information Management, 66(3), 262–278. h
score:  0.948314905166626


Answer Number 2
Q: what is this research aim for?
A: To arrive at specific visual themes that resonated
Context: htag space.
Automated Object Annotation of
Images
To arrive at specific visual themes that resonated, this study
includes object labels that are gener
score:  0.8876258134841919


Answer Number 3
Q: what is this research aim for?
A: computational interpretation of visual data
Context: dia poses specific methodological challenges, which in turn have directed scholarly attention toward
the computational interpretation of visual data. 
score:  0.7542051672935486


Answer Number 4
Q: what is this research aim for?
A: deep learning algorithms on social media
Context: g it real: From faces and fea