## Get stuff from arXiv

In [1]:
import os

# Read the OpenAI key from the environment; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [54]:
import arxiv
# Notebook-level arXiv API client; the search cells further down call client.results(...).
client = arxiv.Client()

def get_pdf(input: str, output_path):
    """Download the PDF of a single arXiv paper into ``output_path``.

    Args:
        input: arXiv identifier of the paper, e.g. ``'1706.03762'``.
            Only ids are supported for now (no free-text search).
        output_path: Directory the PDF is written to; created if missing.

    Returns:
        The path of the downloaded file, as returned by
        ``Result.download_pdf``.

    Raises:
        ValueError: If ``input`` is not a non-empty string, or if arXiv
            returns no paper for the given id.

    TODO:
        - Expand to search queries, see
          https://info.arxiv.org/help/api/user-manual.html#_details_of_atom_results_returned
        - If the user provides a URL to a PDF, call read_pdf with it directly.
        - Support fetching multiple papers.
        - Output to cloud storage.
        - Generate a predictable filename for easy storage and later retrieval.
    """
    import os  # local import so the function does not rely on another cell

    # Validate before touching the network or the filesystem.
    if not isinstance(input, str) or not input.strip():
        raise ValueError("input must be a non-empty arXiv id string")
    arxiv_id = input.strip()

    # next(..., None) avoids leaking a bare StopIteration to the caller
    # when the id does not resolve to any paper.
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(arxiv.Client().results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {arxiv_id!r}")

    # The download fails if the target directory does not exist yet.
    os.makedirs(output_path, exist_ok=True)
    return paper.download_pdf(dirpath=output_path)
    

    

In [55]:
# Fetch arXiv paper 1706.03762 into the local `retrieved_data` directory.
get_pdf(input = '1706.03762', output_path='retrieved_data')

In [None]:
# Adapted from the arXiv API docs: retrieve a page with the standard library.
# Note: this URL is the paper's HTML abstract page, so the printed text is HTML.
import urllib, urllib.request
url = 'https://arxiv.org/abs/1705.02315'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as data:
    print(data.read().decode('utf-8'))


In [3]:
# from https://pypi.org/project/arxiv/

import arxiv
client = arxiv.Client()
# Look up a single paper by its arXiv id and take the first result from the iterator.
search_by_id = arxiv.Search(id_list = ['2309.11838'])
sample = next(client.results(search_by_id))

In [4]:
# NOTE: only a cell's last expression is displayed, so this tuple is evaluated and discarded.
sample, sample.title
# Save the PDF as retrieved_data/main_doc.pdf; the call returns the written path.
sample.download_pdf(filename='main_doc.pdf', dirpath = 'retrieved_data')

'retrieved_data/main_doc.pdf'

In [21]:
# Free-text search that uses a full bibliographic citation as the query, capped at 3 results.
# NOTE(review): the backslash line continuations sit inside the string literal, so the
# leading indentation of each continued line (and the split word "Natu-" / "ral") becomes
# part of the query text — confirm whether that is intended.
search = arxiv.Search(
    query= 'Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and \
        Percy Liang. 2016. SQuAD: 100,000+ questions for\
        machine comprehension of text. In Proceedings of\
        the 2016 Conference on Empirical Methods in Natu-\
        ral Language Processing, pages 2383–2392, Austin,\
        Texas. Association for Computational Linguistics.', 
    max_results = 3,
    )

In [29]:
# Download the first hit of the citation search as retrieved_data/doc1.pdf.
next(client.results(search)).download_pdf(filename='doc1.pdf', dirpath='retrieved_data')

'retrieved_data/doc1.pdf'

## Reading PDF

In [33]:
from llama_index.readers.file import PyMuPDFReader

def read_pdf(filepath):
    """Parse the PDF at ``filepath`` with PyMuPDFReader.

    Returns whatever ``PyMuPDFReader.load`` produces for that file; the
    result is what the node-parsing / indexing cells consume.
    """
    return PyMuPDFReader().load(filepath)


In [34]:
# Path of the paper that later cells parse and mine for references.
file = 'retrieved_data/1706.03762v7.Attention_Is_All_You_Need.pdf'
# Parse it with the PyMuPDF-based reader.
doc = read_pdf(file)

In [3]:
from llmsherpa.readers import LayoutPDFReader

def read_pdf_old(
    filepath,
    api_url="https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all",
):
    """Parse a PDF through the LLMSherpa layout-parsing API.

    Args:
        filepath: Location of the PDF handed to ``LayoutPDFReader.read_pdf``.
            This notebook passes both local paths and remote URLs.
        api_url: LLMSherpa parse endpoint. Defaults to the hosted service;
            override it to target a self-hosted instance (e.g. their public
            docker image, see
            https://www.reddit.com/r/LocalLLaMA/comments/18lwa5c/alternative_tool_like_llmsherpa_that_can_run/).

    Returns:
        The ``llmsherpa.readers.layout_reader.Document`` produced by the
        reader. This is NOT a llama-index Document, so it does not work with
        the llama-index node parser.
    """
    pdf_reader = LayoutPDFReader(api_url)
    return pdf_reader.read_pdf(filepath)


In [4]:

# Parse the same paper with the LLMSherpa-based reader for the reference-extraction cells.
test_doc = read_pdf_old(file)

### trial and error

In [None]:
# Trials

# Same steps as read_pdf_old, run at cell level; `pdf_reader` is reused by the next cell.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# Despite the name, this is a local file path rather than a URL.
pdf_url = 'retrieved_data/main_doc.pdf'
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
# NOTE: this rebinds `doc`, which an earlier cell set to the read_pdf(...) result.
doc = pdf_reader.read_pdf(pdf_url)

In [61]:
# Does this work with a direct URL to arXiv?
# The reader is handed a remote PDF URL here instead of a local path.

new_pdf_url = 'https://arxiv.org/pdf/2110.04770.pdf'
doc2 = pdf_reader.read_pdf(new_pdf_url)

Reference: <https://ambikasukla.substack.com/p/efficient-rag-with-document-layout?r=ft8uc&utm_campaign=post&utm_medium=web&triedRedirect=true>

In [27]:
# Check which Document class the LLMSherpa reader returned for the URL input.
type(doc2)

llmsherpa.readers.layout_reader.Document

In [None]:
# Inspect the parsed document via its to_html() output.
doc2.to_html()

## Get references

In [57]:
def get_arxiv_references(filepath) -> set:
    """Return the set of arXiv ids mentioned in the paper at ``filepath``.

    The PDF is parsed with the LLMSherpa ``LayoutPDFReader`` and its plain
    text is scanned with two regexes:

    - ``word-word/NNNNNNN`` (old-style ids such as ``hep-th/9901001``)
    - ``NNNN.NNNN`` / ``NNNN.NNNNN`` (new-style ids such as ``1706.03762``)

    Necessarily limited to references that carry an arXiv id in the text.

    Args:
        filepath: Location of the PDF handed to ``LayoutPDFReader.read_pdf``.

    Returns:
        A set of the matched id strings (empty if nothing matches).
    """
    import re  # local import: `re` was not imported before this cell

    # Parsing can later be done locally with LLMSherpa's public docker image:
    # https://www.reddit.com/r/LocalLLaMA/comments/18lwa5c/alternative_tool_like_llmsherpa_that_can_run/
    llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
    parsed = LayoutPDFReader(llmsherpa_api_url).read_pdf(filepath)

    # Extract the text once instead of once per pattern.
    text = parsed.to_text()

    patterns = (r"\w+-\w+/\d{7}", r"\d{4}\.\d{4,5}")
    refs = set()
    for pattern in patterns:
        refs.update(re.findall(pattern, text))
    return refs

    

In [59]:
# Collect the arXiv ids referenced in the paper stored at `file`.
ids = get_arxiv_references(file)

In [48]:
import re

def get_arxiv_references(doc)-> set:
    """Return the set of arXiv ids mentioned in a parsed paper.

    Necessarily limited to references that carry an arXiv id in the text.

    Args:
        doc: Parsed document exposing ``to_text()`` (the notebook passes the
            object returned by ``LayoutPDFReader.read_pdf``).

    Returns:
        A set of id strings matching either ``word-word/NNNNNNN``
        (old-style, e.g. ``hep-th/9901001``) or ``NNNN.NNNN[N]``
        (new-style, e.g. ``1706.03762``). Empty if nothing matches.
    """
    # Extract the text once instead of once per pattern.
    text = doc.to_text()

    # Raw strings: '\w' / '\d' in a plain literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    patterns = (r'\w+-\w+/\d{7}', r'\d{4}\.\d{4,5}')

    refs = set()
    for pattern in patterns:
        refs.update(re.findall(pattern, text))
    return refs


In [119]:
# Extract ids from the LLMSherpa-parsed document and count the distinct ones.
test_set = get_arxiv_references(test_doc)
len(test_set)

22

### trial and error

In [100]:
# Trials

import re

# Each match is a tuple: the first group is the optional "arXiv:" prefix, the second
# is the bare new-style id — keep only the id and de-duplicate with a set.
refs = {ref[1] for ref in re.findall(r'(arXiv:)*(\d{4}\.\d{4,5})', test_doc.to_text())}
len(refs)

22

In [116]:
# Distinct new-style arXiv ids (NNNN.NNNN or NNNN.NNNNN) found in the parsed text.
refs = set(re.findall(r'\d{4}\.\d{4,5}', test_doc.to_text()))
len(refs)

22

### Vector Store

In [None]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

def build_vector_store(doc):
    """Build a VectorStoreIndex from parsed document(s).

    Steps:
        1. Split ``doc`` into nodes with a SentenceSplitter
           (chunk_size=512, chunk_overlap=50).
        2. Embed each node's content (metadata included) with
           OpenAIEmbedding and store the vector on the node.
        3. Build a VectorStoreIndex from the embedded nodes.

    Args:
        doc: Document sequence accepted by
            ``SentenceSplitter.get_nodes_from_documents`` (the notebook
            passes the result of ``PyMuPDFReader.load``).

    Returns:
        The VectorStoreIndex built from the nodes.
    """
    # Local import: this cell's import block does not bring VectorStoreIndex in.
    from llama_index.core import VectorStoreIndex

    node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)
    nodes = node_parser.get_nodes_from_documents(doc)

    # Uses OpenAI embeddings for now. The original bound the local name
    # `embed_model` to the result of calling itself before it was assigned
    # (UnboundLocalError), and defined the helper after `return`.
    # NOTE(review): OpenAIEmbedding() is constructed with defaults; presumably it
    # picks up OPENAI_API_KEY from the environment — confirm.
    embedder = OpenAIEmbedding()

    for node in nodes:
        node.embedding = embedder.get_text_embedding(
            node.get_content(metadata_mode='all')
        )

    return VectorStoreIndex(nodes)
        

In [20]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter

# Load the paper with the PyMuPDF-based reader.
# NOTE(review): absolute, machine-specific path — consider a path relative to the project.
loader = PyMuPDFReader()
doc = loader.load(file_path='/Users/sonal/Documents/MSDS/spring2/entrepreneurship/Podsicle/retrieved_data/1706.03762v7.Attention_Is_All_You_Need.pdf')

# Split the document(s) into sentence-based chunks.
node_parser = SentenceSplitter(chunk_size = 512, chunk_overlap = 50)
nodes = node_parser.get_nodes_from_documents(doc)

# Generate Embeddings
from llama_index.embeddings.openai import OpenAIEmbedding
embed_model = OpenAIEmbedding()

# Attach an embedding of each node's content (metadata included) to the node.
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode= 'all')
        )
    node.embedding = node_embedding

# VectorStoreIndex was only imported by a later cell, so this line raised NameError
# on a fresh kernel; import it here before use.
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex(nodes)

In [22]:
from llama_index.core import VectorStoreIndex
# Build the index directly from the loaded documents (the `nodes` list is not passed here).
index = VectorStoreIndex.from_documents(doc)

In [23]:
# Check the types of the index and of the node collection.
type(index), type(nodes)

(llama_index.core.indices.vector_store.base.VectorStoreIndex, list)

In [18]:
# List the top-level keys of the index's serialized storage context.
index.storage_context.to_dict().keys()

dict_keys(['vector_store', 'doc_store', 'index_store', 'graph_store'])

In [1]:
# Can't seem to get the key from the environment variables
import openai

In [29]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
# Load the contents of the retrieved_data directory via SimpleDirectoryReader.
# NOTE(review): `documents` is not referenced by any later cell visible in this notebook.
documents = SimpleDirectoryReader("retrieved_data").load_data()

In [20]:
# `query_engine` was never created in this notebook, so this cell raised NameError on a
# fresh kernel; build it from the vector index defined in the cells above.
# NOTE(review): this uses `index`; switch to an index built from `documents` if that
# was the intent.
query_engine = index.as_query_engine()
response = query_engine.query("explain the paper in 100 words")

Adapted from <https://docs.llamaindex.ai/en/stable/examples/low_level/vector_store/>

In [131]:
# ! pip install llama-index-readers-file pymupdf

# Load doc
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
# NOTE(review): absolute, machine-specific path — consider a path relative to the project.
doc = loader.load(file_path='/Users/sonal/Documents/MSDS/spring2/entrepreneurship/Podsicle/retrieved_data/1706.03762v7.Attention_Is_All_You_Need.pdf')

In [132]:
# Parse into Nodes
from llama_index.core.node_parser import SentenceSplitter

# Sentence-based chunks: chunk_size=512 with a chunk_overlap of 50.
node_parser = SentenceSplitter(chunk_size = 512, chunk_overlap = 50)
nodes = node_parser.get_nodes_from_documents(doc)

In [133]:
# Generate Embeddings
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()
# One embedding call per node; the text embedded is the node content with metadata
# included (metadata_mode='all'), and the vector is stored on the node itself.
for node in nodes: 
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode= 'all')
        )
    node.embedding = node_embedding

In [137]:
# NOTE(review): StorageContext is imported but not used in the cells visible here.
from llama_index.core import VectorStoreIndex, StorageContext

# The index is built from `doc`; the `nodes` embedded in the previous cell are not passed in.
index = VectorStoreIndex.from_documents(doc)

In [141]:
# Show which vector store backs the index.
index.vector_store

<llama_index.core.vector_stores.simple.SimpleVectorStore at 0x111c67250>