In [10]:
from langchain_community.document_loaders import PyPDFLoader   
# Smoke test: load one PDF and look at the metadata attached to its first chunk.
loader = PyPDFLoader("arxiv_papers/0701061.pdf")
# load_and_split() yields the document pieces for this PDF; they are indexed below.
pages = loader.load_and_split()

# Print the metadata dict of the first piece to see which keys the loader provides.
print (pages[0].metadata)

{'source': 'arxiv_papers/0701061.pdf', 'page': 0}


In [31]:
import arxiv
# Sample seven-character (old-style, archive-less) identifier used to exercise get_paper.
name="0310058"
def get_paper(name):
    """Look up a single paper on arXiv by identifier.

    A 7-character identifier is treated as an old-style arXiv id without an
    archive and is qualified with the ``nucl-ex/`` prefix before searching.
    When a ``nucl-ex/`` lookup yields nothing, the same identifier is retried
    under ``hep-ex/``.

    Args:
        name: arXiv identifier, with or without an archive prefix.

    Returns:
        The first result produced by the arXiv client for the identifier, or
        ``None`` when no attempted lookup produced a result.
    """
    candidate = name
    while True:
        print(f"Searching for the paper with ID: {candidate}")
        # A bare old-style id carries no archive; start with nucl-ex.
        if len(candidate) == 7:
            candidate = "nucl-ex/" + candidate
        try:
            query = arxiv.Search(id_list=[candidate])
            return next(arxiv.Client().results(query))
        except StopIteration:
            print(f"No results found for the ID: {candidate}")

        # Only a failed nucl-ex lookup has a fallback archive left to try.
        if not candidate.startswith("nucl-ex/"):
            return None
        candidate = candidate.replace("nucl-ex/", "hep-ex/")
     


# Resolve the sample identifier; get_paper returns None when no lookup succeeds.
paper=get_paper(name)


Searching for the paper with ID: 0310058


Skipping partial result: id


No results found for the ID: nucl-ex/0310058
Searching for the paper with ID: hep-ex/0310058


In [33]:
# Display the short identifier of the paper found above (the recorded output shows
# an archive-qualified id with a version suffix).
paper.get_short_id()

'hep-ex/0310058v2'

In [1]:
import os
import arxiv
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader

from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings


class Document:
    """Minimal text container exposing ``page_content`` and ``metadata``.

    Args:
        content: Text stored on the instance as ``page_content``.
        metadata: Optional mapping stored as ``metadata``. When omitted or
            ``None``, a new empty dict is created for this instance.
    """

    def __init__(self, content, metadata=None):
        if metadata is None:
            # Fresh dict per instance so two documents never share one mapping.
            metadata = {}
        self.page_content = content
        self.metadata = metadata

# =============================================================================
# ===========================Read PDFs=========================================

# Directory scanned for the PDF files to ingest.
# An earlier revision also assigned "test_docs" here, but that value was
# overwritten on the very next statement, so only this assignment ever took effect.
folder_path = "arxiv_papers"


def _find_arxiv_paper(arxiv_id):
    """Query arXiv for *arxiv_id* and return ``(paper, resolved_id)``.

    An identifier without a dot is treated as an old-style id that needs an
    archive prefix: ``nucl-ex/`` is tried first, then ``hep-ex/``.  When no
    candidate matches, ``(None, first_candidate)`` is returned.
    """
    if "." in arxiv_id:
        candidates = [arxiv_id]
    else:
        candidates = [f"{archive}/{arxiv_id}" for archive in ("nucl-ex", "hep-ex")]

    client = arxiv.Client()
    for candidate in candidates:
        # next(..., None) turns "no such paper" into None instead of StopIteration.
        paper = next(client.results(arxiv.Search(id_list=[candidate])), None)
        if paper is not None:
            return paper, candidate
    return None, candidates[0]


def getDocument(filename):
    """Load one arXiv PDF and wrap its text and bibliographic metadata in a ``Document``.

    The arXiv identifier is taken from the file name (minus a trailing
    ``.pdf``).  Title, publication date and authors come from the arXiv API;
    if arXiv has no record, they fall back to the ``title`` / ``creationDate``
    / ``author`` keys of the first PDF page's metadata, or ``"Unknown"``.

    Args:
        filename: Path to the PDF file.

    Returns:
        A ``Document`` whose ``page_content`` is the text of every page, each
        preceded by a ``PAGE n`` marker, and whose ``metadata`` holds
        ``title``, ``published``, ``authors`` and ``arxiv_id``.
    """
    # load() gives one entry per PDF page.  load_and_split() would re-chunk the
    # text with overlap, which makes the PAGE markers wrong and duplicates text.
    pages = PyPDFLoader(filename).load()

    # os.path keeps this working with whatever separator os.path.join produced.
    base_name = os.path.basename(filename)
    arxiv_id = base_name[:-len(".pdf")] if base_name.endswith(".pdf") else base_name

    paper, arxiv_id = _find_arxiv_paper(arxiv_id)
    if paper is not None:
        text_metadata = {
            "title": paper.title,
            "published": paper.published.strftime('%Y-%m-%d'),
            "authors": '\n'.join(f'{i+1}. {auth.name}' for i, auth in enumerate(paper.authors)),
        }
    else:
        # Not on arXiv: use whatever the PDF itself carries (which keys are
        # present depends on the loader version, hence the defaults).
        pdf_metadata = pages[0].metadata if pages else {}
        text_metadata = {
            "title": pdf_metadata.get("title", "Unknown"),
            "published": pdf_metadata.get("creationDate", "Unknown"),
            "authors": pdf_metadata.get("author", "Unknown"),
        }
    text_metadata["arxiv_id"] = arxiv_id

    # Prefer the loader's zero-based "page" entry; fall back to list position.
    full_text = "".join(
        f"\n PAGE {page.metadata.get('page', index) + 1}\n{page.page_content}"
        for index, page in enumerate(pages)
    )

    document = Document(content=full_text, metadata=text_metadata)
    print (f"Document {arxiv_id} has been loaded")

    return document


# Build the list of parsed documents, one per PDF found directly in folder_path.
data = []

for filename in tqdm(os.listdir(folder_path)):
    # Anything that is not a PDF is ignored.
    if not filename.endswith(".pdf"):
        continue
    filename = os.path.join(folder_path, filename)
    print (f"Loading {filename}")
    document = getDocument(filename)
    data.append(document)


    



  from tqdm.autonotebook import tqdm, trange
 50%|█████     | 1/2 [00:00<00:00,  1.26it/s]

Document 0704.2915 has been loaded


100%|██████████| 2/2 [00:03<00:00,  1.88s/it]

Document 0704.0220 has been loaded



  warn_deprecated(


In [2]:


# Rebuild the document list from scratch so this cell can be re-run on its own.
data = []

for filename in tqdm(os.listdir(folder_path)):
    if not filename.endswith(".pdf"):
        continue
    filename = os.path.join(folder_path, filename)
    document = getDocument(filename)
    data.append(document)


# =============================================================================
# ===========================Split Document====================================


# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# =============================================================================
# ===========================Store Document====================================

import chromadb

# One name shared by the raw chromadb handle and the LangChain wrapper, so both
# refer to the same collection.
COLLECTION_NAME = "arxiv_papers"

persistent_client = chromadb.PersistentClient(path="database")
collection = persistent_client.get_or_create_collection(COLLECTION_NAME)

# Without collection_name the wrapper falls back to its own default collection,
# leaving the collection created above unused.
child_vectorstore = Chroma(
    client=persistent_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_function,
)

# The storage layer for the parent documents
parent_docstore = InMemoryStore()

# Small child chunks are embedded and searched; hits are mapped back to the
# full parent documents held in parent_docstore.
child_splitter  = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
retriever = ParentDocumentRetriever(
    vectorstore=child_vectorstore,
    docstore=parent_docstore,
    child_splitter=child_splitter)

retriever.add_documents(data)


In [3]:
# NOTE(review): `doc` is not assigned in any cell visible in this notebook, so this
# expression relies on leftover kernel state — TODO confirm the intended variable.
doc

[<__main__.Document at 0x7fef791db010>]

In [6]:
from langchain.llms import OpenAI
from openai import OpenAI
import streamlit as st
from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
import dotenv
# Load environment settings via python-dotenv (presumably from a local .env file — confirm).
dotenv.load_dotenv()

# NOTE(review): openai_api_key is read here but not passed to anything in the cells
# shown; presumably ChatOpenAI picks the key up from the environment — confirm.
openai_api_key=os.getenv("OPENAI_API_KEY")
# Set the LANGCHAIN_TRACING_V2 environment variable for this process.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Chat model used as the generation step of the chain.
llm = ChatOpenAI(model_name="gpt-4o-mini")
# NOTE(review): no custom retriever is defined after this point in the cell; the
# earlier remark here about defining one for the parent-child structure looked stale.

from langchain.prompts import PromptTemplate

# Prompt template for the RAG chain.  `{context}` and `{question}` are the only
# placeholders; no other curly braces may appear because the text is parsed as a
# format template.  A trailing backslash joins the next line onto the current
# one, so it must be preceded by a space whenever another sentence follows.
system_prompt = """
You are an expert on the STAR experiment, a high-energy nuclear physics experiment at the Relativistic Heavy Ion Collider (RHIC) at Brookhaven National Laboratory. \
Your task is to answer questions specifically related to the STAR experiment, its findings, technologies, and related topics.  \
Refrain from any other topics by saying you will not answer questions about them and Exit right away here. DO NOT PROCEED. \
You are not allowed to use any other sources other than the provided search results. \

Generate a comprehensive, and informative answer strictly within 200 words or less for the \
given question based solely on the provided search results (URL and content). You must \
only use information from the provided search results. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text. You should use bullet points in your answer for readability. Make sure to break down your answer into bullet points. \
You should not hallucinate nor make up any references. Use only the `context` html block below and do not use any text within <ARXIV_ID> and </ARXIV_ID> except when citing in the end.
Make sure not to repeat the same context. Be specific to the exact question asked for.\

Here is the response template:
---
# Response template 

- Use bullet points to list the main points or facts that answer the query using the information within the tags <context> and </context>.
- After answering, analyze the respective source links provided within <ARXIV_ID> and </ARXIV_ID> and keep only the unique links for the next step. Try to minimize the total number of unique links with no more than 10 unique links for the answer.
- You will strictly use no more than 10 most unique links for the answer.
- Use bulleted list of superscript numbers within square brackets to cite the sources for each point or fact. The numbers should correspond to the order of the sources which will be provided in the end of this response. Note that for every source, you must provide a URL and pages from where it is taken.
- End with a closing remark and a list of sources with their respective URLs and relevant pages as a bullet list explicitly with full links which are enclosed in the tag <ARXIV_ID> and </ARXIV_ID> respectively.
---
Here is how a response would look like. Reproduce the same format for your response:
---
# Example response

Hello, here are some key points:

- The STAR (Solenoidal Tracker at RHIC) experiment is a major high-energy nuclear physics experiment conducted at the Relativistic Heavy Ion Collider (RHIC) at Brookhaven National Laboratory[^1^]
- The primary research goal of STAR is to study the properties of the quark-gluon plasma (QGP), a state of matter thought to have existed just after the Big Bang, by colliding heavy ions at nearly the speed of light[^2^]
- STAR utilizes a variety of advanced detectors to measure the thousands of particles produced in these collisions, including the Time Projection Chamber (TPC), the Barrel Electromagnetic Calorimeter (BEMC), and the Muon Telescope Detector (MTD)[^3^]
- Key findings from STAR include evidence for the QGP's near-perfect fluidity, the discovery of the "chiral magnetic effect," and insights into the spin structure of protons[^4^]

Sources

    [^1^][1]: https://arxiv.org/abs/nucl-ex/0005004, p. 3
    [^2^][2]: https://arxiv.org/abs/nucl-ex/0106003, p. 5-7
    [^3^][3]: https://arxiv.org/abs/nucl-ex/0501009, p. 1
    [^4^][4]: https://arxiv.org/abs/nucl-ex/0603028, p. 4

---

Where each of the references are taken from the corresponding <ARXIV_ID> in the context. Strictly do not provide title for the references. \
Strictly do not repeat the same links. Use the numbers to cite the sources. \

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." or greet back. Don't try to make up an answer. Write the answer in the form of markdown bullet points. \
Make sure to highlight the most important key words in bold font. Do not repeat any context nor points in the answer.\

Anything between the following `context`  html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. The context are numbered based on its knowledge retrieval and increasing cosine similarity index. \
Make sure to consider the order in which the contexts appear. It is an increasing order of cosine similarity index. \
The contents are formatted in latex, you need to remove any special characters and latex formatting before combining the points to build your answer. \
Write your answer in the form of markdown bullet points. You can use latex commands if necessary.
You will strictly cite no more than 10 unique citations at maximum from the context below. \
Make sure these citations are relevant and strictly do not repeat the context in the answer.

<context>
    {context}
</context>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." or greet back. Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.

Question: {question}
"""





In [7]:


from langchain.schema.runnable import RunnableMap
from langchain.schema import StrOutputParser

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# NOTE(review): this parser instance is not used by the chain built later in this
# cell, which constructs its own StrOutputParser().
output_parser = StrOutputParser()


from operator import itemgetter


def format_docs(docs):
    """Render retrieved documents as the numbered context block for the prompt.

    Each document becomes ``"<n>. <text><ARXIV_ID> <id> </ARXIV_ID>"`` where
    ``<n>`` starts at 1, ``<text>`` is ``page_content`` with leading/trailing
    newlines removed and ``<id>`` is ``metadata['arxiv_id']``.  Entries are
    separated by a blank line.

    Args:
        docs: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (mapping containing ``'arxiv_id'``).

    Returns:
        The joined context string; empty when ``docs`` is empty.

    Raises:
        KeyError: If a document's metadata has no ``'arxiv_id'`` entry.
    """
    # The closing tag is </ARXIV_ID> because the system prompt tells the model
    # to look for ids between <ARXIV_ID> and </ARXIV_ID>.
    return "\n\n".join(
        f"{position}. " + doc.page_content.strip("\n")
        + f"<ARXIV_ID> {doc.metadata['arxiv_id']} </ARXIV_ID>"
        for position, doc in enumerate(docs, start=1)
    )


from langchain.schema.runnable import RunnableMap

# Compile the prompt text into a template with `context` and `question` inputs.
system_template = PromptTemplate.from_template(system_prompt)

# Answering stage: expects a mapping with "documents" and "question", formats the
# documents into the context block, fills the prompt, calls the model and
# returns the reply as a plain string.
rag_chain_from_docs = (
    RunnableMap(
        {
            "context": lambda payload: format_docs(payload["documents"]),
            "question": itemgetter("question"),
        }
    )
    | system_template
    | llm
    | StrOutputParser()
)

# Retrieval stage: the raw question string is sent to the retriever and also
# passed through unchanged, then handed to the answering stage.  Only the
# "answer" key is returned by the combined chain.
rag_chain_with_source = RunnableMap(
    {"documents": retriever, "question": RunnablePassthrough()}
) | RunnableMap({"answer": rag_chain_from_docs})

rag_chain_with_source.invoke("what was the biggest systematic in THREE PARTICLE CORRELATIONS?")

{'answer': 'Hello, here are some key points:\n\n- The major sources of **systematic error** in the three-particle correlation analysis from the STAR experiment include uncertainties from **elliptic flow measurements** and **background normalization**.\n- Systematic uncertainty due to the elliptic flow (v2) was assessed by varying the measurements between the **reaction plane** and **4-particle cumulant methods**.\n- The analysis found that while variations in the hard-soft background and trigger flow individually fluctuate significantly with changes in elliptic flow, these variations tend to **cancel out** to first order, making the overall signal robust against those fluctuations.\n- Other systematic uncertainties arise from factors like the impact of requiring a correlated particle on the trigger particle flow, uncertainties in the **v4 parameterization**, and **multiplicity bias effects** on the soft-soft background.\n\nThis systematic robustness indicates confidence in the findings