# arXiv RAG

In [1]:
# import libraries
from utils import Utils, Preprocess, Inspect, FileSelector, QueryInterface
from unstructured_client import UnstructuredClient
from unstructured.staging.base import dict_to_elements
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.llms import openai

import logging
# Disable OpenAI and httpx logging
# Configure logging level for specific loggers by name
logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)

# instantiate Utils class
utils = Utils()

# instantiate File Selector class
s = FileSelector()

INFO: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO: NumExpr defaulting to 8 threads.




## PDF Selection UI

In [5]:

# select PDF to parse
result = s.get_file()

# unpack filename and prefix from result
filename, file_prefix = result

# instantiate Preprocess class with file path
doc = Preprocess(filename)



Dropdown(description='Select PDF:', options=('rtx_paper.pdf', 'runway_ml.pdf'), value='rtx_paper.pdf')

Dropdown(description='Parse?', options=(('Yes', 1), ('No', 0)), value=1)

Button(description='Execute', style=ButtonStyle())

Output()

## Preprocessing Pipeline


In [None]:

# RUN THIS CELL TO MAKE FUNCTIONS AVAILABLE


def parse_file(doc: object) -> tuple[list[dict], object]:
    # read in the file
    files = doc.read_file()

    # build request (instruct the API on how to parse the file)
    req = doc.partition_file(files, strategy='hi_res', model_name='yolox')

    # store the parsed file as records (list of dictionaries) and Elements (object)
    records, elements = doc.get_structured_text(client, req)

    return records, elements
    
def filter_and_add_metadata(doc: object, records: list[dict], elements: object) -> list[dict]:
    # instantiate Inspect class with records and elements
    inspector = Inspect(records, elements)
    
    # get header and reference ids to filter from pdf_elements
    header_id, references_id = inspector.get_references_and_header_id(records)

    # remove child elements from the references and header section
    pdf_data = [el for el in elements if el.metadata.parent_id not in (references_id, header_id)]

    # get dictionary of all Title elements with unique IDs
    section_ids = inspector.get_section_id_dict()

    # 1. convert elements to records, 2. add section title to metadata
    pdf_data = doc.add_parent_to_metadata(pdf_data, section_ids)

    # remove references and header parent elements
    pdf_data = [record for record in pdf_data if record['element_id'] not in (references_id, header_id)]

    # save records as JSON
    json_name = file_prefix + '.json'
    utils.save_json_line_by_line(json_name, pdf_data)
    
    return pdf_data

def preprocess_pipeline(doc):
    records, elements = parse_file(doc)
    
    data = filter_and_add_metadata(doc, records, elements)
    
    return data

## Build Query Interface

In [4]:

# THIS CODE RUNS PREPROCESSING, INDEXES DOCUMENT, AND BUILDS QUERY INTERFACE 

# if prefix = None, skip preprocessing because json exists, else preprocess
if file_prefix is not None:
    # implement preprocessing pipeline
    UNSTRUCTURED_API_KEY = utils.get_api_key("UNSTRUCTURED")
    client = UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY)
    
    data = preprocess_pipeline(doc)
    
else:
    # load existing json 
    data = utils.load_json_line_by_line(filename)
    
# convert json to Document object (chunk and join text into one document) 
document = doc.json_to_doc(data)

# define a service context that contains both the llm and the embedding model
llm = openai.OpenAI(model='gpt-3.5-turbo', temperature=0.1)
service_context = ServiceContext.from_defaults(llm=llm, embed_model='local:BAAI/bge-small-en-v1.5')
index = VectorStoreIndex.from_documents([document], service_context=service_context)

# setup query engine
query_engine = index.as_query_engine()

# launch UI
query_interface = QueryInterface(query_engine)
query_interface.display()

Data loaded successfully from rtx_paper.json
INFO: Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


  service_context = ServiceContext.from_defaults(llm=llm, embed_model='local:BAAI/bge-small-en-v1.5')


INFO: 2 prompts are loaded, with the keys: ['query', 'text']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Text(value='', description='Query:', layout=Layout(width='100%'), placeholder='Enter your query')

Button(description='Submit', icon='check', style=ButtonStyle(), tooltip='Click to submit query')

Output()