In [None]:
%%bash

pip install --upgrade pip
pip install farm-haystack[colab,preprocessing,faiss,inference,file-conversion,pdf]==1.17.2

In [3]:
import os
from getpass import getpass

# Do not hardcode the OpenAI key in the notebook: it would be saved (and
# shared) together with the file. Read it from the OPENAI_API_KEY environment
# variable, and fall back to a hidden interactive prompt when it is not set.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Index Papers

In [None]:
# Index papers: convert the PDFs to documents, split them into passages,
# embed the passages and persist the resulting FAISS index to disk.

from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, PreProcessor
from haystack.utils import convert_files_to_docs

# Folder holding the source papers -- upload papers in pdf format here.
doc_dir = "data/pdfdata"

# Document store backed by the SQLite database under data/.
# NOTE(review): embedding_dim presumably matches the output size of the
# embedding model configured below -- confirm if the model is changed.
document_store = FAISSDocumentStore(
    sql_url="sqlite:///data/faiss_document_store.db",
    embedding_dim=1536,
)

# OpenAI EmbeddingRetriever used to compute the passage embeddings.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="text-embedding-ada-002",
    api_key=api_key,
    batch_size=8,
    max_seq_len=1536,
)

# Clean the extracted text and cut it into passages of 7 sentences,
# with 1 sentence of overlap between neighbouring passages.
preprocessor = PreProcessor(
    split_by="sentence",
    split_length=7,
    split_overlap=1,
    split_respect_sentence_boundary=False,
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
)

# Convert -> split -> store -> embed.
all_docs = convert_files_to_docs(dir_path=doc_dir)
docs = preprocessor.process(all_docs)
document_store.write_documents(docs)
document_store.update_embeddings(retriever)

# Persist the index and its config so the search cells can reload them.
document_store.save(
    index_path="data/faiss_document_store_index.faiss",
    config_path="data/faiss_document_store_config.json",
)


# Search Papers

In [None]:
# Natural-language query used by the retrieval and answering cells.
question = (
    "According to the author how conspiracy theories benefit populists?"
)

In [49]:
# Load the document store created in the previous step.

from haystack.document_stores import FAISSDocumentStore

new_document_store = FAISSDocumentStore(
    faiss_index_path="data/faiss_document_store_index.faiss",
    faiss_config_path="data/faiss_document_store_config.json",
)

# Sanity check that the DocumentStore was loaded: the restored store is
# expected to report the "Flat" index factory string.
assert new_document_store.faiss_index_factory_str == "Flat"

In [None]:
# Initialise the EmbeddingRetriever on top of the reloaded document store,
# using the same embedding settings as the indexing step.
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=new_document_store,
    embedding_model="text-embedding-ada-002",
    api_key=api_key,
    batch_size=8,
    max_seq_len=1536,
)

In [None]:
# Find the documents most relevant to the question (top 7 hits).
from haystack.nodes import PromptNode  # used by the answering cell

candidate_documents = retriever.retrieve(query=question, top_k=7)

In [52]:
# Answer the question from the retrieved documents with a one-node pipeline.

from haystack.pipelines import Pipeline

# Prompt node that answers using the 'question-answering-with-references'
# template.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=api_key,
    default_prompt_template="question-answering-with-references",
)

pipe = Pipeline()
pipe.add_node(component=prompt_node, name="prompt_node", inputs=["Query"])

# The documents were retrieved beforehand, so they are passed in directly.
output = pipe.run(query=question, documents=candidate_documents)

In [None]:
# Collect the generated answer, the supporting passages and the set of
# papers they come from into a single dictionary.
result = {
    "answers": output["answers"][0].answer
}

papers = set()

for i, d in enumerate(output["documents"], start=1):
    title = d.meta["name"]
    # Drop non-ASCII characters, then flatten newlines and form feeds
    # ("\x0c") into spaces so each citation is a single line.
    cleaned_string = d.content.encode("ascii", "ignore").decode("ascii")
    cleaned_string = cleaned_string.replace("\n", " ").replace("\x0c", " ")
    papers.add(title)
    result[f"Document {i}"] = {
        "title": title,
        "citation": cleaned_string,
    }

# Assigned once, after the loop: previously this sat inside the loop body, so
# the "papers" key was missing whenever no documents were returned.
result["papers"] = papers

In [None]:
# Display the assembled result dictionary as the cell output.
result