# Document retrieval

**Can we query the ArXiv documents and retrieve the _k_ most common documents based on semantic search?**

In [1]:
from haystack.pipelines import MostSimilarDocumentsPipeline, DocumentSearchPipeline
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import PreProcessor, EmbeddingRetriever
from haystack.utils import launch_milvus
from haystack.schema import Document

  if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):


In [2]:
from tqdm.notebook import tqdm
import pandas as pd

from typing import List, Dict, Optional, Union, Tuple
import logging
import pickle
import time
import os

In [3]:
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logger = logging.getLogger("haystack")
logger.setLevel(logging.INFO)

<h2>Load in ArXiv meta-data</h2>

In [4]:
arxiv_json_data = pd.read_json('../data/arxiv/arxiv-metadata-oai-snapshot.json', lines=True)

In [5]:
arxiv_json_data.shape

(2162833, 14)

In [6]:
arxiv_json_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2162833 entries, 0 to 2162832
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   submitter       object
 2   authors         object
 3   title           object
 4   comments        object
 5   journal-ref     object
 6   doi             object
 7   report-no       object
 8   categories      object
 9   license         object
 10  abstract        object
 11  versions        object
 12  update_date     object
 13  authors_parsed  object
dtypes: object(14)
memory usage: 231.0+ MB


In [7]:
arxiv_json_data = arxiv_json_data.sample(250000)

<h2>Preprocess data and create Documents</h2>

In [8]:
arxiv_json_data["title"] = arxiv_json_data["title"].str.replace("\n", "")
arxiv_json_data["abstract"] = arxiv_json_data["abstract"].str.replace("\n", "")
arxiv_json_data["content"] = arxiv_json_data[["title", "abstract"]].agg(" ".join, axis=1)

In [9]:
def create_document(row: pd.Series) -> Document:
    return Document(
            content=row.content,
            content_type="text",
            id=row.id,
            meta={
                "id": row.id, 
                "authors": row.authors,
                "comments": row.comments,
                "journal-ref": row["journal-ref"],
                "doi": row.doi,
                "report-no": row["report-no"],
                "categories": row.categories
            }
    )

In [10]:
documents = [
    create_document(row) for _, row in tqdm(arxiv_json_data.iterrows(), total=arxiv_json_data.shape[0])
]

  0%|          | 0/250000 [00:00<?, ?it/s]

In [11]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=250,
    split_respect_sentence_boundary=True
)

In [12]:
cleaned_docs = preprocessor.process(documents)

Preprocessing:   0%|          | 0/250000 [00:00<?, ?docs/s]



In [13]:
with open("../data/arxiv/sample.pkl", "wb") as pkl:
    pickle.dump(cleaned_docs, pkl)

<h2>Create document store</h2>

In [14]:
with open("../data/arxiv/sample.pkl", "rb") as pkl:
    cleaned_docs = pickle.load(pkl)

In [15]:
document_store = FAISSDocumentStore(
    sql_url="sqlite:///sample.db",
    faiss_index_factory_str="Flat"
)

In [16]:
document_store.write_documents(cleaned_docs)

Writing Documents:   0%|          | 0/260108 [00:00<?, ?it/s]

<h2>Create embeddings</h2>

In [17]:
retriever = EmbeddingRetriever(
    document_store=document_store,
   embedding_model="sentence-transformers/msmarco-distilbert-base-tas-b",
   model_format="sentence_transformers"
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model sentence-transformers/msmarco-distilbert-base-tas-b


In [18]:
document_store.update_embeddings(retriever)

INFO:haystack.document_stores.faiss:Updating embeddings for 260102 docs...


Updating Embedding:   0%|          | 0/260102 [00:00<?, ? docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
document_store.save("../data/arxiv/sample.faiss")

<h2>Document semantic search</h2>

<h3>Search with single query</h3>

In [20]:
pipeline = DocumentSearchPipeline(retriever)

In [24]:
result = pipeline.run("monte carlo simulation in the study of pulsars", params={"Retriever": {"top_k": 5}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
print(result["documents"][0].content)

Population Synthesis of Normal Radio and Gamma-ray Pulsars Using Markov  Chain Monte Carlo Techniques   We present preliminary results of a pulsar population synthesis of normalpulsars from the Galactic disk using a Markov Chain Monte Carlo method tobetter understand the parameter space of the assumed model. We use the Kuipertest, similar to the Kolmogorov-Smirnov test, to compare the cumulativedistributions of chosen observables of detected radio pulsars with thosesimulated for various parameters. Our code simulates pulsars at birth usingMonte Carlo techniques and evolves them to the present assuming initialspatial, kick velocity, magnetic field, and period distributions. Pulsars arespun down to the present, given radio and gamma-ray emission characteristics,filtered through ten selected radio surveys, and a {\it Fermi} all-skythreshold map. Each chain begins with a different random seed and searches aten-dimensional parameter space for regions of high probability for a total ofone th

In [26]:
print(result["documents"][0].score)

0.7391471571930097


In [27]:
print(result["documents"][1].content)

Population study for gamma-ray pulsars with the outer gap model   Inspired by increase of population of $\gamma$-ray emitting pulsars by the$Fermi$ telescope, we perform a population study for $\gamma$-ray emittingcanonical pulsars. We use a Monte-Carlo technique to simulate the Galacticpopulation of neutron stars and the radio pulsars. For each simulated neutronstar, we consider the $\gamma$-ray emission from the outer gap accelerator inthe magnetosphere. In our outer gap model, we apply the gap closure mechanismproposed by Takata et al., in which both photon-photon pair-creation andmagnetic pair-creation processes are considered. Simulating the sensitivitiesof previous major radio surveys, our simulation predicts that there are $\sim18-23$ radio loud and $\sim 26-34$ $\gamma$-ray-selected $\gamma$-ray pulsars,which can be detected with a $\gamma$-ray flux $F_{\gamma}\ge10^{-10}~\mathrm{erg/cm^2 s}$. Applying the sensitivity of the six-monthobservation of the $Fermi$ telescope, 40-61 

In [28]:
print(result["documents"][1].score)

0.737303920741257


<h3>Search for most similar document</h3>

In [35]:
from haystack.pipelines import MostSimilarDocumentsPipeline

In [33]:
document_store.describe_documents(index=None)

{'count': 260102,
 'chars_mean': 949.4979969396621,
 'chars_max': 2123,
 'chars_min': 10,
 'chars_median': 920.0}

In [34]:
doc_generator = document_store.get_all_documents_generator()

In [37]:
example_doc = next(doc_generator)

In [39]:
example_doc

<Document: {'content': 'Enrichments of Boolean Algebras: a uniform treatment of some classical  and some novel examples   We give a unified treatment of the model theory of various enrichments ofinfinite atomic Boolean algebras, with special attention toquantifier-eliminations, complete axiomatizations and decidability. A classicalexample is the enrichment by a predicate for the ideal of finite sets, and anovel one involves predicates giving congruence conditions on the cardinalityof finite sets. We focus on three examples, and classify them by expressivepower.', 'content_type': 'text', 'score': None, 'meta': {'vector_id': '1', 'id': '1310.3527', 'authors': 'Jamshid Derakhshan, Angus Macintyre (accepted for publication in\n  Fundamenta Mathematicae)', 'comments': None, 'journal-ref': None, 'doi': None, 'report-no': None, 'categories': 'math.LO', '_split_id': 0}, 'embedding': None, 'id': '1001005e687fae0fe32ce32a387a788e'}>

In [38]:
msd_pipeline = MostSimilarDocumentsPipeline(document_store)

In [41]:
msd_pipeline.run(document_ids=["1001005e687fae0fe32ce32a387a788e"])

[[<Document: {'content': 'Enrichments of Boolean Algebras: a uniform treatment of some classical  and some novel examples   We give a unified treatment of the model theory of various enrichments ofinfinite atomic Boolean algebras, with special attention toquantifier-eliminations, complete axiomatizations and decidability. A classicalexample is the enrichment by a predicate for the ideal of finite sets, and anovel one involves predicates giving congruence conditions on the cardinalityof finite sets. We focus on three examples, and classify them by expressivepower.', 'content_type': 'text', 'score': 0.7724444979362711, 'meta': {'id': '1310.3527', 'authors': 'Jamshid Derakhshan, Angus Macintyre (accepted for publication in\n  Fundamenta Mathematicae)', 'comments': None, 'journal-ref': None, 'doi': None, 'report-no': None, 'categories': 'math.LO', '_split_id': 0, 'vector_id': '1'}, 'embedding': None, 'id': '1001005e687fae0fe32ce32a387a788e'}>,
  <Document: {'content': 'Complexity assessmen

<h3>Question answer system</h3>

In [42]:
from haystack.nodes import FARMReader

In [43]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


In [44]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [45]:
prediction = pipe.run(
    query="What is the average life span of a pulsar star?", 
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|███████| 1/1 [00:00<00:00,  6.32 Batches/s]


In [46]:
from haystack.utils import print_answers

print_answers(prediction, details="minimum")


Query: What is the average life span of a pulsar star?
Answers:
[   {   'answer': '4.0E4 years',
        'context': 'The average lifetime of stars in this phase isestimated to '
                   'be about 4.0E4 years, indicating they will undergo at most '
                   'onemore thermal pulse before lea'},
    {   'answer': '104 years',
        'context': ' a period derivative of 3.614 x10-13 s s-1 . Its '
                   'characteristic age of 104 years is comparable to '
                   'thatestimated for the SNR. It is conjectured that mo'},
    {   'answer': '350 kyr',
        'context': 'rellBank and Parkes show that it is young, with a '
                   'characteristic age of 350 kyr,and is in a 231-day, highly '
                   'eccentric orbit with a companion whose mas'},
    {   'answer': '723 yr',
        'context': 'ted with the supernovaremnant Kes 75. With a '
                   'characteristic age of only 723 yr, consistent with theage '
             