In [43]:
import os
import arxiv
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader

from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings



class Document:
    """Minimal text container exposing ``page_content`` and ``metadata``.

    Args:
        content: The text stored on the instance as ``page_content``.
        metadata: Optional mapping stored as ``metadata``. When ``None`` is
            given, a fresh empty dict is created for this instance.
    """

    def __init__(self, content, metadata=None):
        # Build the fallback dict here rather than in the signature so that
        # instances never share one mutable default.
        if metadata is None:
            metadata = {}
        self.page_content = content
        self.metadata = metadata


# def extract_text_and_metadata_from_pdfs(folder_path):
folder_path="test_docs"
data = []
# for filename in tqdm(os.listdir(folder_path)):
# if not filename.endswith(".pdf"):
#     continue
filename = "0704.0220.pdf"

filename = os.path.join(folder_path, filename)
# Take the last path component with os.path so this also works where
# os.path.join does not use "/" as the separator, then drop the ".pdf" suffix.
arxiv_id = os.path.basename(filename).split(".pdf")[0]

# Look the paper up on arXiv by id. next() gets a default so that an empty
# result set falls through to the fallback branch instead of raising
# StopIteration.
search = arxiv.Search(id_list=[arxiv_id])
paper  = next(arxiv.Client().results(search), None)
if paper is not None:
    text_metadata = {
        "title": paper.title,
        "published": paper.published.strftime('%Y-%m-%d'),
        "authors": '\n'.join([f'{i+1}. {auth.name}' for i, auth in enumerate(paper.authors)])
    }
else:
    # No arXiv record: there is no `paper` object to read from, so use the
    # same "Unknown" placeholders the original fallback defaulted to.
    text_metadata = {
        "title": "Unknown",
        "published": "Unknown",
        "authors": "Unknown"
    }

print (text_metadata)
# Added after the print so the displayed dict keeps its original three keys.
text_metadata["arxiv_id"] = arxiv_id


# Load the PDF as one document per page. load() is used instead of
# load_and_split(): the latter runs LangChain's default overlapping text
# splitter, so concatenating its chunks would duplicate the overlap regions.
loader = PyPDFLoader(filename)
pages = loader.load()


# Stitch the pages into a single text. The newline separator keeps the last
# word of one page from fusing with the first word of the next.
full_text = "\n".join(page.page_content for page in pages)
document= Document(content=full_text, metadata=text_metadata)

# Small child chunks (200 characters, 20 overlap) cut from the whole paper.
child_splitter  = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
child_chunks = child_splitter.split_documents([document])

print (f"Your document has been split into {len(child_chunks)} chunks")


import chromadb

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# On-disk Chroma client plus the collection that should hold the child chunks.
persistent_client = chromadb.PersistentClient(path="database")
collection = persistent_client.get_or_create_collection("parent_document_splits")

# The vectorstore to use to index the child chunks. collection_name is passed
# explicitly; without it the wrapper writes to its own default collection and
# the "parent_document_splits" collection created above is never used.
child_vectorstore = Chroma(
    client=persistent_client,
    collection_name=collection.name,
    embedding_function=embedding_function,
)

# The storage layer for the parent documents.
# NOTE(review): this store is in-memory while the vector store is persisted on
# disk, so parent documents presumably have to be re-added in every new
# session -- confirm this is intended.
parent_docstore = InMemoryStore()

retriever_parent = ParentDocumentRetriever(
    vectorstore=child_vectorstore,
    docstore=parent_docstore,
    child_splitter=child_splitter,
)

# Index the paper: child chunks go to the vector store, the parent document
# goes to the docstore.
retriever_parent.add_documents([document])

# Show the keys under which parent documents were stored.
list(parent_docstore.yield_keys())



    # for i, chunk in enumerate(chunks):            
    #     # add "page" to existing text_metadata from chunk.metadata
    #     page=chunk.metadata.get("page", "Unknown")
    #     text_metadata["page"]=page

    #     data.append({
    #             "id": f"{arxiv_id}_{i}",
    #             "text": chunk.page_content,
    #             "metadata": text_metadata
    #     })

#     return data


# data = extract_text_and_metadata_from_pdfs("test_docs")






{'title': 'Three Particle Correlations from STAR', 'published': '2007-04-02', 'authors': '1. Jason Glyndwr Ulery'}
Your document has been split into 120 chunks


  warn_deprecated(


['33480d63-90a4-4675-9c07-0f97c36032c4']

In [None]:


import chromadb

# Unpack the prepared records into the parallel lists passed to add() below.
ids = [record["id"] for record in data]
texts = [record["text"] for record in data]
metadata_list = [record["metadata"] for record in data]

# Encode every chunk text with the all-MiniLM-L6-v2 sentence-transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True)

# Open the on-disk database and fetch (or create) the target collection.
client = chromadb.PersistentClient(path="database")
collection = client.get_or_create_collection(name="arxiv_papers")

# Store ids, raw texts, precomputed vectors and metadata in one call.
collection.add(ids=ids, documents=texts, embeddings=embeddings, metadatas=metadata_list)

In [7]:
# Free-text query; it is passed as text, so Chroma embeds it itself.
query = "experimental results"

# Ask the collection for the two best matches.
results = collection.query(query_texts=query, n_results=2)
results

{'ids': [['0704.2915_22', '0704.0220_68']],
 'distances': [[1.3720604181289673, 1.4036426544189453]],
 'metadatas': [[{'authors': '1. Monika Sharma\n2. Sunil Dogra\n3. Neeraj Gupta',
    'page': 5,
    'published': '2007-04-23',
    'title': 'Energy and System Size Dependence of Photon Production at Forward Rapidities at RHIC'},
   {'authors': '1. Jason Glyndwr Ulery',
    'page': 7,
    'published': '2007-04-02',
    'title': 'Three Particle Correlations from STAR'}]],
 'embeddings': None,
 'documents': [['a behaviour has been studied by BRAHMS5, PHOBOS6and STAR7. The STAR\nexperiment reported measurements of the pseudorapidity distrib ution in the for-',
   '1.45). It increases with centrality and signiﬁcantly deviates from zer o in central\nAu+Au collisions. The right panel shows the average signal were con ical emissions']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [None]:
from langchain.retrievers import ParentDocumentRetriever





In [None]:

from langchain.llms import OpenAI
from openai import OpenAI
import streamlit as st
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

import dotenv
dotenv.load_dotenv()


# Read the OpenAI key from the environment (dotenv.load_dotenv() ran above).
openai_api_key = os.getenv("OPENAI_API_KEY")

# Chat model used to generate answers, configured with temperature 0.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)
# Define a custom retriever to handle parent-child structure

from langchain.prompts import PromptTemplate

# Prompt for the STAR-experiment assistant. It has exactly two placeholders:
# {context} (the retrieved passages) and {question} (the user's question).
# A trailing backslash joins a line with the next one, so every continued line
# must end with a space before the backslash; the "---" separators and the
# final "Question:" line must start on their own line.
template = """
You are an expert on the STAR experiment, a high-energy nuclear physics experiment at the Relativistic Heavy Ion Collider (RHIC) at Brookhaven National Laboratory. \
Your task is to answer questions specifically related to the STAR experiment, its findings, technologies, and related topics.  \
 Refrain any other topics by saying you will not answer questions about them and Exit right away here. DO NOT PROCEED. \
You are not allowed to use any other sources other than the provided search results. \

Generate a comprehensive, and informative answer strictly within 200 words or less for the \
given question based solely on the provided search results (URL and content). You must \
only use information from the provided search results. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text. You should use bullet points in your answer for readability. Make sure to break down your answer into bullet points. \
You should not hallucinate nor build up any references, Use only the `context` html block below and do not use any text within <ARXIV_ID> and </ARXIV_ID> except when citing in the end.
Make sure not to repeat the same context. Be specific to the exact question asked for.\


Here is the response template:
---
# Response template 

- Start with a greeting and a summary of the user's query
- Use bullet points to list the main points or facts that answer the query using the information within the tags <context> and </context>.
- After answering, analyze the respective source links provided within <ARXIV_ID> and </ARXIV_ID> and keep only the unique links for the next step. Try to minimize the total number of unique links with no more than 10 unique links for the answer.
- You will strictly use no more than 10 most unique links for the answer.
- Use bulleted list of superscript numbers within square brackets to cite the sources for each point or fact. The numbers should correspond to the order of the sources which will be provided in the end of this response. Note that for every source, you must provide a URL.
- End with a closing remark and a list of sources with their respective URLs as a bullet list explicitly with full links which are enclosed in the tag <ARXIV_ID> and </ARXIV_ID> respectively.
---
Here is how a response would look like. Reproduce the same format for your response:
---
# Example response

Hello, thank you for your question about the STAR experiment. Here are some key points about STAR:

- The STAR (Solenoidal Tracker at RHIC) experiment is a major high-energy nuclear physics experiment conducted at the Relativistic Heavy Ion Collider (RHIC) at Brookhaven National Laboratory[^1^]
- The primary research goal of STAR is to study the properties of the quark-gluon plasma (QGP), a state of matter thought to have existed just after the Big Bang, by colliding heavy ions at nearly the speed of light[^2^]
- STAR utilizes a variety of advanced detectors to measure the thousands of particles produced in these collisions, including the Time Projection Chamber (TPC), the Barrel Electromagnetic Calorimeter (BEMC), and the Muon Telescope Detector (MTD)[^3^]
- Key findings from STAR include evidence for the QGP's near-perfect fluidity, the discovery of the "chiral magnetic effect," and insights into the spin structure of protons[^4^]

I hope this helps you understand more about the STAR experiment.
Sources

    [^1^][1]: https://arxiv.org/abs/nucl-ex/0005004
    [^2^][2]: https://arxiv.org/abs/nucl-ex/0106003
    [^3^][3]: https://arxiv.org/abs/nucl-ex/0501009
    [^4^][4]: https://arxiv.org/abs/nucl-ex/0603028

---

Where each of the references are taken from the corresponding <ARXIV_ID> in the context. Strictly do not provide title for the references. \
Strictly do not repeat the same links. Use the numbers to cite the sources. \

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." or greet back. Don't try to make up an answer. Write the answer in the form of markdown bullet points. \
Make sure to highlight the most important key words in bold font. Do not repeat any context nor points in the answer.\

Anything between the following `context`  html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. The context are numbered based on its knowledge retrieval and increasing cosine similarity index. \
Make sure to consider the order in which the contexts appear. It is an increasing order of cosine similarity index. \
The contents are formatted in latex, you need to remove any special characters and latex formatting before coercing the points to build your answer. \
Write your answer in the form of markdown bullet points. You can use latex commands if necessary.
You will strictly cite no more than 10 unique citations at maximum from the context below. \
Make sure these citations have to be relevant and strictly do not repeat the context in the answer.

<context>
    {context}
</context>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." or greet back. Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.
Question: {question}
"""



rag_prompt_custom = PromptTemplate.from_template(template)
from operator import itemgetter
from langchain.schema.runnable import RunnableMap


def _content_and_metadata(doc):
    """Return ``(text, metadata)`` for one retrieved document.

    Accepts objects exposing ``page_content`` / ``metadata`` attributes as
    well as plain dict records keyed by ``page_content`` or ``text`` plus
    ``metadata``. Missing metadata is returned as an empty dict.
    """
    if isinstance(doc, dict):
        text = doc.get("page_content", doc.get("text", ""))
        return text, (doc.get("metadata") or {})
    return doc.page_content, (doc.metadata or {})


def format_docs(docs):
    """Render retrieved documents into the text substituted for ``{context}``.

    Each document becomes a numbered block holding its text followed by its
    arXiv link wrapped in ``<ARXIV_ID>`` tags, which is the tag the prompt
    tells the model to cite from.
    """
    blocks = []
    for rank, doc in enumerate(docs, start=1):
        text, metadata = _content_and_metadata(doc)
        arxiv_id = metadata.get("arxiv_id", "Unknown")
        blocks.append(
            f"[{rank}]\n{text}\n"
            f"<ARXIV_ID>https://arxiv.org/abs/{arxiv_id}</ARXIV_ID>"
        )
    return "\n\n".join(blocks)


def _format_citation(doc):
    """Build one bullet line describing the source of a retrieved document."""
    _, metadata = _content_and_metadata(doc)
    # .get() with a default: not every document carries every field (for
    # example, whole-paper parent documents have no "page").
    return (
        f"- {metadata.get('title', 'Unknown')} by {metadata.get('authors', 'Unknown')} "
        f"(Published on {metadata.get('published', 'Unknown')}, "
        f"Page {metadata.get('page', 'Unknown')})"
    )


# Prompt -> LLM -> plain string, fed from a {"documents", "question"} mapping.
rag_chain_from_docs = (
    {
        "context": lambda input: format_docs(input["documents"]),
        "question": itemgetter("question"),
    }
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)

# Retrieve documents for the question, then return both the documents and the
# generated answer so callers can cite their sources.
rag_chain_with_source = RunnableMap(
    {"documents": retriever_parent, "question": RunnablePassthrough()}
) | {
    "documents": itemgetter("documents"),
    "answer": rag_chain_from_docs,
}

# `chain` is kept as the entry-point name used by chatbot(); it is now the
# retrieval chain above.
chain = rag_chain_with_source


def chatbot(question):
    """Answer ``question`` and list the sources used.

    Args:
        question: The user's question as a string.

    Returns:
        A ``(answer_text, cited_sources)`` tuple, where ``cited_sources`` is a
        newline-separated bullet list with one line per retrieved document.
    """
    response = chain.invoke(question)
    cited_sources = "\n".join(
        _format_citation(doc) for doc in response["documents"]
    )
    return response["answer"], cited_sources

# Example usage
user_question = "What is the main purpose of STAR experiment?"
response_text, cited_sources = chatbot(user_question)
print(response_text)
print("Sources:")
print(cited_sources)