In [None]:
import getpass
import json
import os

import torch
import tqdm
from dotenv import load_dotenv
from langchain.agents import create_agent
from langchain.agents.middleware import dynamic_prompt, ModelRequest
from langchain.chat_models import init_chat_model
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Location of the dataset file read by the loading cell (one JSON object per line).
# Placeholder value: point it at your local copy before running the notebook.
PATH = "path/to/dataset"

In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment first. load_dotenv() was imported but never called, so a key kept
# in .env was ignored; by default it does not override variables already set.
load_dotenv()

# Fall back to an interactive prompt so the key is never hard-coded in the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

# Chat model handed to create_agent() further down.
llm = init_chat_model("google_genai:gemini-2.5-flash-lite")

In [None]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without a CUDA device (the device used to be
# hard-coded to 'cuda').
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used to vectorise the texts and the search queries.
model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": DEVICE},
)


In [None]:
# Parallel lists: texts[i] is the text to embed for the entry whose id is ids[i].
texts = []
ids = []

# The file at PATH is read as JSON Lines: every non-blank line is one JSON object.
# The encoding is given explicitly so the result does not depend on the platform default.
with open(PATH, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines instead of failing in json.loads
        entry = json.loads(line)
        # Missing keys fall back to an empty string rather than raising.
        title = entry.get("title", "")
        abstract = entry.get("abstract", "")
        texts.append(f"Title: {title}\nAbstract: {abstract}")
        ids.append(entry.get("id", ""))

In [None]:
# Embed the texts with `model` and index them for similarity search.
# Chroma is the vector store this notebook imports; FAISS is not imported
# anywhere, so referencing it raised NameError on a fresh kernel.
# Embedding the whole list is expensive: the author's run indexed only the
# first 500k papers (texts[:500000]) because of compute limits, although the
# full set of about 2.8M papers can be used.
vector_store = Chroma.from_texts(
    texts=texts,
    embedding=model,
)

In [None]:

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build the system prompt, appending passages retrieved for the latest message.

    The text of the last message in ``request.state["messages"]`` is used as the
    query for ``vector_store.similarity_search``. The page contents of the
    returned documents are joined with blank lines and placed after the fixed
    instructions.

    Args:
        request: Model request whose state holds the conversation messages.

    Returns:
        The instructions followed by a blank line and the retrieved context.
    """
    messages = request.state["messages"]
    latest_text = messages[-1].text
    hits = vector_store.similarity_search(latest_text)
    context_block = "\n\n".join([hit.page_content for hit in hits])

    # Adjacent literals are concatenated by Python into one instruction string.
    instructions = (
        "You are a Research Assistant with access to the user's personal notes and documents. "
        "Always search the knowledge base first before answering questions, and cite which documents your information comes from (e.g., 'According to [Document Name], ...'). "
        "If information isn't in the knowledge base, clearly state 'I don't find information about this in your knowledge base' and offer to use general knowledge instead. "
        "When answering, combine information from multiple sources when relevant, keep responses focused on what's most relevant to the question, and never fabricate citations or attribute information to documents it doesn't come from. "
        "Your goal is to help users extract maximum value from their stored knowledge by retrieving and synthesizing information clearly. "
        "Use the following context in your response:"
    )

    return f"{instructions}\n\n{context_block}"


# Agent with no tools: retrieval happens inside the prompt_with_context
# middleware, which supplies the system prompt for each model call.
agent = create_agent(
    model=llm,
    tools=[],
    middleware=[prompt_with_context],
)

In [None]:
# Example question sent to the agent as a single user message.
query = "What is prompt diphoton?"
inputs = {"messages": [{"role": "user", "content": query}]}

# Print the most recent message of every streamed step.
for step in agent.stream(inputs, stream_mode="values"):
    latest_message = step["messages"][-1]
    latest_message.pretty_print()