In [None]:
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]

In [None]:
!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz
!tar -xvf xpdf-tools-linux-4.03.tar.gz
!sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin

In [None]:
from haystack.modeling.utils import initialize_device_settings
devices, n_gpu = initialize_device_settings(use_cuda=True)

In [None]:
from typing import List
import requests
import pandas as pd

from haystack import Document
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.utils import convert_files_to_dicts, fetch_archive_from_http, clean_wiki_text
from haystack.nodes import RAGenerator, EmbeddingRetriever, DensePassageRetriever
# from haystack.document_stores.faiss import FAISSDocumentStore -> We will use ElasticSearch Document Store

In [None]:
# Alternative in Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
root_dir = "../data"
doc_dir = "/QA_data_1"

all_docs = convert_files_to_dicts(root_dir,clean_func=clean_wiki_text, split_paragraphs=True)
#all_docs = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

INFO - haystack.utils.preprocessing -  Converting ../data/QA_data_1.pdf


In [None]:
preprocessor = PreProcessor(
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    clean_empty_lines=False,
    clean_whitespace=False
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Checking if the data is processed.
docs = preprocessor.process(all_docs)
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

100%|██████████| 1/1 [00:00<00:00, 2790.62docs/s]

n_files_input: 1
n_docs_output: 4





In [None]:
from haystack.utils import launch_es
#These indexes are for evaluation data.
doc_index = "eval_docs"
label_index = "eval_labels"
from haystack.document_stores import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore()




In [None]:
# Adding documents to the document store
document_store.delete_documents()
document_store.write_documents(all_docs)

In [None]:
#initializing DPR Retriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
    max_seq_len_passage=256,
)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-ctx_encoder-single-nq-base


In [None]:
# Must update the embeddings
document_store.update_embeddings(retriever,update_existing_embeddings = False)

INFO - haystack.document_stores.elasticsearch -  Updating embeddings for 1 docs without embeddings ...


Updating embeddings:   0%|          | 0/1 [00:00<?, ? Docs/s]

In [None]:
# Describing documents
#document_store.describe_documents(index = doc_index)
# display(docs[0].keys())
# all_docs[0]["meta"]
#dicts = convert_files_to_dicts(dir_path=root_dir, clean_func=clean_wiki_text, split_paragraphs=True)
dicts[0].meta

In [None]:
newdocs = retriever.embed_documents(dicts)

In [None]:
test = document_store.get_all_documents()
display(test)

[<Document: {'content': "No student of a foreign language needs to be told that grammar is complex. By changing word sequences and by adding a range of auxiliary verbs and suffixes, we are able to communicate tiny variations in meaning. We can turn a statement into a question, state whether an action has taken place or is soon to take place, and perform many other word tricks to convey subtle differences in meaning. Nor is this complexity inherent to the English language. All languages, even those of socalled 'primitive' tribes have clever grammatical components. The Cherokee pronoun system, for example, can distinguish between 'you and I', 'several other people and I' and 'you, another person and I'. In English, all these meanings are summed up in the one, crude pronoun 'we'. Grammar is universal and plays a part in every language, no matter how widespread it is. So the question which has baffled many linguists is - who created grammar?\nAt first, it would appear that this question is

In [None]:
document_store.return_embedding = True

In [None]:
#initializing RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
  f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load

Downloading:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

Some weights of RagTokenForGeneration were not initialized from the model checkpoint at facebook/rag-token-nq and are newly initialized: ['rag.generator.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import print_answers

In [None]:
pipe = GenerativeQAPipeline(generator = generator, retriever = retriever)
res = pipe.run(query = "Languages are created by",  params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})

Exception: ignored

In [None]:
print_answers(res, details="minimum")

In [None]:
# document_store.write_documents(all_docs)
# document_store.update_embeddings(retriever)

# document_store.save("my_faiss_index.faiss")
# new_document_store = FAISSDocumentStore.load("my_faiss_index.faiss")

In [None]:
# Installing the evaluation data
from haystack.utils import fetch_archive_from_http

# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

INFO - haystack.utils.import_utils -  Found data stored in `../data/nq`. Delete this first if you really want to fetch new data.


False

In [None]:
# Trying to evaluate the pipeline


# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects. 
# Those objects are then indexed in their respective document and label index in the document store. 
# The method can be used with any dataset in SQuAD format.
document_store.add_eval_data(
    filename="../data/nq/nq_dev_subset_v2.json",
    doc_index=doc_index,
    label_index=label_index,
    preprocessor=preprocessor,
)
# document_store.update_embeddings(retriever=retriever)

In [None]:

from haystack.schema import EvaluationResult, MultiLabel
# We can load evaluation labels from the document store
eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=False)

In [None]:
test = document_store.get_all_documents()

In [None]:

eval_result = pipe.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}}
)

Exception: ignored

In [None]:
retriever_result = eval_result["Retriever"]
retriever_result.head()

Unnamed: 0,query,gold_document_contents,content,gold_id_match,answer_match,gold_id_or_answer_match,rank,document_id,gold_document_ids,type,node,eval_mode
0,who is written in the book of life,"[Book of Life - wikipedia Book of Life Jump to: navigation, search This arti...","Children appear to have innate grammatical machinery in their brains, which ...",0.0,0.0,0.0,1.0,9409f2a7f05da84c7aa066a1f4aff54f,"[1b090aec7dbd1af6739c4c80f8995877-0, 1b090aec7dbd1af6739c4c80f8995877-1]",document,Retriever,integrated
1,who is written in the book of life,"[Book of Life - wikipedia Book of Life Jump to: navigation, search This arti...",The Creators of Grammar\nNo student of a foreign language needs to be told t...,0.0,0.0,0.0,2.0,15f3162930e1ae263991cea0533257d5,"[1b090aec7dbd1af6739c4c80f8995877-0, 1b090aec7dbd1af6739c4c80f8995877-1]",document,Retriever,integrated
2,who is written in the book of life,"[Book of Life - wikipedia Book of Life Jump to: navigation, search This arti...","Although it was based on the signs of the older children, the younger childr...",0.0,0.0,0.0,3.0,d9b10a9dd09dc45bb92332b617aaa174,"[1b090aec7dbd1af6739c4c80f8995877-0, 1b090aec7dbd1af6739c4c80f8995877-1]",document,Retriever,integrated
3,who is written in the book of life,"[Book of Life - wikipedia Book of Life Jump to: navigation, search This arti...","The Cherokee pronoun system, for example, can distinguish between 'you and I...",0.0,0.0,0.0,4.0,e2aad86b68391c1682e06ea50f177976,"[1b090aec7dbd1af6739c4c80f8995877-0, 1b090aec7dbd1af6739c4c80f8995877-1]",document,Retriever,integrated
4,who is written in the book of life,"[Book of Life - wikipedia Book of Life Jump to: navigation, search This arti...",The Creators of Grammar\nNo student of a foreign language needs to be told t...,0.0,0.0,0.0,5.0,598355a3dded9fc7b5e6fb1d9a867a58,"[1b090aec7dbd1af6739c4c80f8995877-0, 1b090aec7dbd1af6739c4c80f8995877-1]",document,Retriever,integrated
