In [1]:
!pip install -q langchain langgraph langchain-google-genai langchain-community langchain-tavily tavily-python faiss-cpu python-dotenv sentence-transformers pypdf langchain-text-splitters pathlib

In [2]:
import os
from pathlib import Path

from typing import TypedDict, Annotated, List

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_tavily import TavilySearch
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages

import operator
from operator import itemgetter
from google.colab import userdata

from IPython.display import Image

In [3]:
os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
os.environ['TAVILY_API_KEY'] = userdata.get('TAVILY_API_KEY')

os.environ["LANGSMITH_TRACING"] = userdata.get('LANGSMITH_TRACING')
os.environ["LANGSMITH_ENDPOINT"] = userdata.get('LANGSMITH_ENDPOINT')
os.environ["LANGSMITH_API_KEY"] = userdata.get('LANGSMITH_API_KEY')
os.environ["LANGCHAIN_PROJECT"] = userdata.get('LANGCHAIN_PROJECT')

In [4]:
llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0.1, convert_system_message_to_human=True)

In [5]:
search_tool = TavilySearch(max_results=5)

In [6]:
data_dir = os.path.join(os.getcwd(), 'data')

all_documents = []
pdf_files = list(Path(data_dir).rglob('*.pdf'))

if not pdf_files:
    print(f"No PDF files found in '{data_dir}'.")
else:
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        for doc in docs:
            doc.metadata['source'] = pdf_file.name
            doc.metadata["page"] = doc.metadata.get("page", 0) + 1
        all_documents.extend(docs)

print(f"Loaded {len(all_documents)} pages from {len(pdf_files)} PDF files.")

Loaded 535 pages from 1 PDF files.


In [7]:
# Chunk all documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust for your needs (e.g., 500-2000 chars)
    chunk_overlap=200,  # Overlap for context retention
    length_function=len,
    add_start_index=True  # Track source positions
)
chunks = text_splitter.split_documents(all_documents)

print(f"Created {len(chunks)} chunks from {len(all_documents)} pages across {len(pdf_files)} files.")

Created 1506 chunks from 535 pages across 1 files.


In [8]:
# Embed with Hugging Face (local, no API key needed)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # Fast, 384-dim; alternatives: "all-mpnet-base-v2" for higher quality
    model_kwargs={'device': 'cpu'},  # Use 'cuda' if GPU available
    encode_kwargs={'normalize_embeddings': True}
)

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Vectorize and store in FAISS (merges all into one index)
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 chunks

In [10]:
vectorstore.save_local("faiss_multi_pdf_index")
print("Unified vector store created and saved! Ready for RAG.")

Unified vector store created and saved! Ready for RAG.


**State Schema**

In [11]:
class AgentState(TypedDict):
    messages: Annotated[List[AIMessage | HumanMessage], add_messages]
    original_query: str
    sub_questions: List[str]
    current_question: str
    notes: List[str]
    bookmarks: List[str]
    iteration: int
    max_iterations: int
    converged: bool

**Prompts for Nodes**

In [12]:
# Planner: Generate sub-questions
planner_prompt = ChatPromptTemplate([
    ("system", """You are a research planner. Given the original query and any prior notes,
    generate 3-5 non-overlapping sub-questions to explore gaps. Focus on multi-hop aspects.
    Output only the list of questions, one per line."""),
    MessagesPlaceholder(variable_name="messages"),
    ("human", "{original_query}\nPrior notes:{notes}")
])

In [13]:
# Researcher/QA Agent: Retrieve and answer
researcher_prompt = ChatPromptTemplate([
    ("system", """You are a factual researcher. Use the provided context or search tool to answer the current question succinctly.
    Append notes on unknowns/gaps and bookmark relevant excerpts. Be rigorous, no hallucinations.
    If needed, call the search tool."""),
    MessagesPlaceholder(variable_name="messages"),
    ("human", "Current question: {current_question}\nContext: {context}")
])

In [14]:
# Picker/Director: Select next question
picker_prompt = ChatPromptTemplate([
    ("system", """You are a task director. From the list of unanswered sub-questions, select the most pertinent one
    to the original query. Output only the selected question."""),
    ("human", "Original: {original_query}\nUnanswered: {sub_questions}\nPrior notes: {notes}")
])

In [15]:
# Analyser/Manager: Check convergence
analyser_prompt = ChatPromptTemplate([
    ("system", """You are a research manager. Review the latest notes and iteration count. Decide if converged
    (enough depth, no major gaps). Output 'CONVERGE' or 'CONTINUE'."""),
    MessagesPlaceholder(variable_name="messages"),
    ("human", "Iteration: {iteration}\nNotes: {notes}")
])

In [16]:
# Compiler: Synthesize final report
compiler_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a report compiler. Synthesize all notes and bookmarks into a comprehensive, cited answer
    to the original query. Structure: Introduction, Key Findings, Conclusion. Include citations."""),
    ("human", "Original: {original_query}\nAll notes: {notes}\nBookmarks: {bookmarks}")
])

**Agent Functions (nodes)**

In [17]:
def planner_node(state:AgentState) -> AgentState:
    """Generate sub-questions if none exist."""
    if not state.get('sub_questions'):
        chain = planner_prompt | llm | StrOutputParser()
        response = chain.invoke({
            "messages": state.get('messages'),
            "original_query": state.get('original_query'),
            "notes": state.get('notes')
        })
        sub_questions = [q.strip() for q in response.split('\n') if q.strip()]
        state['sub_questions'] = sub_questions
        state['messages'].append(AIMessage(content=f"Generated sub-questions: {sub_questions}"))

    return state

In [18]:
def researcher_node(state: AgentState) -> AgentState:
    """Retrieve context and answer current question."""
    # Fixed RAG chain: Use itemgetter to feed only query to retriever
    rag_chain = (
        {
            "context": itemgetter("current_question") | retriever | format_docs,
            "current_question": itemgetter("current_question"),
            "messages": itemgetter("messages")
        }
        | researcher_prompt
        | llm.bind_tools([search_tool])  # Bind tool for web search if needed
        | StrOutputParser()
    )

    response = rag_chain.invoke({
        "messages": state["messages"],
        "current_question": state["current_question"]
    })

    # Parse response (assume: answer + notes + bookmarks; in prod, use structured parser)
    notes = [response]  # Simplified; extract gaps/unknowns from response
    bookmarks = ["Sample citation from corpus"]  # Enhance with retriever metadata (e.g., doc.metadata['source'])

    state["notes"].extend(notes)
    state["bookmarks"].extend(bookmarks)
    state["messages"].append(AIMessage(content=response))
    return state

In [19]:
def picker_node(state: AgentState) -> AgentState:
    """Select next sub-question."""
    if state["sub_questions"]:
        chain = picker_prompt | llm | StrOutputParser()
        response = chain.invoke({
            "original_query": state["original_query"],
            "sub_questions": state["sub_questions"],
            "notes": "\n".join(state["notes"])
        })
        state["current_question"] = response
        # Remove selected from list (simplified: pop first match)
        state["sub_questions"] = [q for q in state["sub_questions"] if q != response]
        state["messages"].append(AIMessage(content=f"Selected: {response}"))
    return state

In [20]:
def analyser_node(state: AgentState) -> AgentState:
    """Check if converged."""
    state["iteration"] += 1
    chain = analyser_prompt | llm | StrOutputParser()
    decision = chain.invoke({
        "messages": state["messages"],
        "iteration": state["iteration"],
        "notes": "\n".join(state["notes"])
    }).strip().upper()

    state["converged"] = (decision == "CONVERGE") or (state["iteration"] >= state["max_iterations"])
    state["messages"].append(AIMessage(content=f"Decision: {decision}"))
    return state

In [21]:
def compiler_node(state: AgentState) -> AgentState:
    """Compile final report."""
    chain = compiler_prompt | llm | StrOutputParser()
    report = chain.invoke({
        "original_query": state["original_query"],
        "notes": "\n".join(state["notes"]),
        "bookmarks": state["bookmarks"]
    })
    state["messages"].append(AIMessage(content=report))
    return state

**UTILS (Helper Function)**

In [22]:
def should_continue(state:AgentState) -> str:
    if state['converged'] or not state['sub_questions']:
        return "compiler"
    return "picker"

In [23]:
def format_docs(docs):
    """Format retrieved docs into a single string for context."""
    return "\n\n".join([doc.page_content for doc in docs])

**Build And Compile**

In [24]:
# Create the graph
workflow = StateGraph(AgentState)

# Add nodes
workflow.add_node("planner", planner_node)
workflow.add_node("picker", picker_node)
workflow.add_node("researcher", researcher_node)
workflow.add_node("analyser", analyser_node)
workflow.add_node("compiler", compiler_node)

# Edges: Start -> planner -> picker -> researcher -> analyser -> conditional
workflow.set_entry_point("planner")

workflow.add_edge("planner", "picker")
workflow.add_edge("picker", "researcher")
workflow.add_edge("researcher", "analyser")

workflow.add_conditional_edges("analyser", should_continue, {"compiler": "compiler", "picker":"picker"})
workflow.add_edge("compiler", END)

# Compile graph
app = workflow.compile()

print('Graph compiled! Ready to run.')

Graph compiled! Ready to run.


**Run the Agent**

In [27]:
user_query = input('Enter the query: ')

initial_state = AgentState(messages=HumanMessage(content=user_query),
                           original_query=user_query,
                           sub_questions=[],
                           current_question="",
                           notes=[],
                           bookmarks=[],
                           iteration=0,
                           max_iterations=3,
                           converged=False
                        )

config = {"configurable": {"thread_id": "trace-thread-1"}}

result = app.invoke(initial_state, config=config)

final_report = result["messages"][-1].content

print("Final Research Report: \n")
print(final_report)

Enter the query: what's the most beneficial application of ai
Final Research Report: 

## The Most Beneficial Application of AI: A Synthesis

### Introduction
This report synthesizes the provided notes and bookmarks to address the query regarding the most beneficial application of Artificial Intelligence (AI). Based on the available context, the analysis focuses on identified applications and their measurable impacts, while acknowledging the limitations in comprehensive coverage due to incomplete information.

### Key Findings
Among the applications discussed, **Coding** is highlighted as a highly popular and impactful use case for AI, particularly within the realm of generative AI [Sample citation from corpus]. Surveys consistently identify coding as "hands down the most popular use case" for generative AI [Sample citation from corpus].

A prime example of this impact is GitHub Copilot, a code completion tool that represents one of the earliest successful production applications of fo

In [26]:
# Image(app.get_graph().draw_mermaid_png())