<a href="https://colab.research.google.com/github/amalsalilan/Infosys-Springboard-Virtual-Internship-6.0-Open-Deep-Researcher-batch-2/blob/Nischay_Pandey/Milestone1_ScopePhase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cell 1 — Install required libraries

In [None]:
!pip install langgraph google-generativeai tavily-python


Cell 2 — API keys setup

In [None]:
import os
from tavily import TavilyClient
import google.generativeai as genai

# 🔑 Replace the placeholders (abc & xyz) with real keys, or export
# TAVILY_API_KEY / GEMINI_API_KEY before running this cell.
# setdefault() only fills in a value when the variable is missing, so keys that
# are already present in the environment are never overwritten by placeholders.
os.environ.setdefault("TAVILY_API_KEY", "abc")
os.environ.setdefault("GEMINI_API_KEY", "xyz")

# Fail loudly-but-softly: the clients below are still created, but the user is
# told up front why every API call is about to be rejected.
if os.environ["TAVILY_API_KEY"] == "abc" or os.environ["GEMINI_API_KEY"] == "xyz":
    print("⚠️ Placeholder API keys detected - set real keys before calling the APIs.")

# Module-level clients shared by the cells below.
tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
genai.configure(api_key=os.environ["GEMINI_API_KEY"])


Cell 3 — Research State

In [None]:
from typing import List, Optional
from pydantic import BaseModel

class ResearchState(BaseModel):
    """State object passed between the clarification, query and research steps.

    Only ``query`` is required; every other field starts at its default and is
    filled in by the step that produces it.
    """

    # Raw text entered by the user.
    query: str
    # Set by clarification_agent: True when follow-up questions should be asked.
    clarification_needed: bool = False
    # Follow-up questions proposed by clarification_agent (None when there are none).
    follow_up_questions: Optional[List[str]] = None
    # Query actually used for search/summarisation; query_generator falls back
    # to ``query`` when this is empty.
    clarified_query: Optional[str] = None
    # Result dicts taken from the Tavily response's "results" key; an empty
    # list when the web search is skipped.
    search_results: Optional[List[dict]] = None
    # Final summary text produced by Gemini in research_pipeline.
    summary: Optional[str] = None


Cell 4 — Utilities: rough token estimator and JSON pretty-printer

In [None]:
def estimate_tokens(text: str) -> int:
    """Approximate how many tokens ``text`` contains.

    The estimate is the number of whitespace-separated words multiplied by
    1.3, truncated to an integer.
    """
    word_count = len(text.split())
    scaled = word_count * 1.3
    return int(scaled)

def safe_print_json(data):
    """Pretty-print ``data`` (dicts/lists) to stdout as indented JSON.

    Uses a two-space indent and leaves non-ASCII characters unescaped.
    """
    import json

    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    print(rendered)


Cell 5 — Clarification agent

In [None]:
def clarification_agent(state: ResearchState) -> ResearchState:
    """
    Ask Gemini whether the user's query needs clarification.

    Sends ``state.query`` to the model and expects a JSON object with the keys
    ``clarification_needed`` and ``follow_up_questions``. The reply is parsed
    leniently: the JSON object is extracted even when the model wraps it in a
    Markdown code fence or surrounds it with prose.

    Args:
        state: Current research state; only ``state.query`` is read.

    Returns:
        The same ``state`` object with ``clarification_needed`` and
        ``follow_up_questions`` updated. When the reply cannot be read or
        parsed, the state falls back to "no clarification needed".
    """
    import json
    import re

    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = f"""
You are a helpful research assistant.
Given this user query:

"{state.query}"

Decide if clarification is needed.
If yes, ask **max 2 clear and specific follow-up questions**.
If not, confirm it's sufficient.

Respond in JSON:
{{
  "clarification_needed": true/false,
  "follow_up_questions": [..] or null
}}
"""
    response = model.generate_content(prompt)

    # Start from the fallback so every early return below leaves the state in
    # the "no clarification" configuration.
    state.clarification_needed = False
    state.follow_up_questions = None

    try:
        raw_reply = response.text.strip()
    except ValueError:
        # The text accessor raises when the response carries no usable text
        # (for example a blocked or empty candidate).
        return state

    # Models frequently answer with ```json ... ``` fences or extra prose;
    # grab everything from the first "{" to the last "}" before parsing.
    json_match = re.search(r"\{.*\}", raw_reply, re.DOTALL)
    if json_match is None:
        return state

    try:
        parsed_json = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return state
    if not isinstance(parsed_json, dict):
        return state

    # Normalise the flag: accept real booleans and the strings "true"/"false".
    needed = parsed_json.get("clarification_needed", False)
    if isinstance(needed, str):
        needed = needed.strip().lower() == "true"

    # Normalise the questions to a list of non-empty strings, capped at the
    # two questions the prompt asks for.
    questions = parsed_json.get("follow_up_questions")
    if isinstance(questions, str):
        questions = [questions]
    if isinstance(questions, list):
        questions = [str(q).strip() for q in questions if q is not None]
        questions = [q for q in questions if q][:2]
    else:
        questions = None

    state.clarification_needed = bool(needed)
    state.follow_up_questions = questions or None

    return state


Cell 6 — Query generator node

In [None]:
def query_generator(state: ResearchState) -> ResearchState:
    """
    Ensure the state carries a query to search with.

    A clarified query that is already present is kept untouched; otherwise the
    original user query is copied into ``clarified_query``.
    """
    if state.clarified_query:
        # Already populated by the clarification step - nothing to do.
        return state

    state.clarified_query = state.query
    return state


Cell 7 — Research pipeline

In [None]:
def research_pipeline(state: ResearchState) -> ResearchState:
    """
    Optionally run a Tavily web search, then summarise with Gemini.

    A web search is performed only when the query contains one of a small set
    of "research-heavy" keywords; otherwise Gemini answers without web context.

    Args:
        state: Current research state. ``clarified_query`` is used when set;
            otherwise the raw ``query`` is used so the function also works
            when called before a clarified query has been generated.

    Returns:
        The same ``state`` object with ``search_results`` (list of result
        dicts, possibly empty) and ``summary`` (Gemini's reply text) filled in.
    """
    # Fall back to the raw query instead of failing on None.lower().
    query = state.clarified_query or state.query

    # Simple rule: only call Tavily if query looks factual/research heavy
    keywords = ("latest", "statistics", "research", "compare", "trends", "report")
    lowered_query = query.lower()
    if any(kw in lowered_query for kw in keywords):
        print("🔎 Running Tavily search...")
        search = tavily_client.search(query, max_results=3)
        state.search_results = search.get("results", [])
    else:
        print("⚡ Skipping web search (not needed).")
        state.search_results = []

    # Summarize with Gemini
    model = genai.GenerativeModel("gemini-1.5-flash")

    # One bullet per result: title plus the first 200 characters of content.
    # `or ''` guards against keys that are present but hold None, which would
    # otherwise break the slice.
    context = "".join(
        f"- {r.get('title') or ''} :: {(r.get('content') or '')[:200]}\n"
        for r in (state.search_results or [])
    )

    summary_prompt = f"""
User query: {query}

Context (may be empty):
{context}

Provide a clear, agent-like research summary.
"""
    response = model.generate_content(summary_prompt)
    state.summary = response.text.strip()

    return state


Cell 8 — LangGraph Flow and compile

In [None]:
from langgraph.graph import StateGraph, END

# Build a linear LangGraph workflow whose shared state is a ResearchState.
workflow = StateGraph(ResearchState)

# Register one node per processing step; the string is the node's name used
# when wiring edges below.
workflow.add_node("clarification", clarification_agent)
workflow.add_node("query_gen", query_generator)
workflow.add_node("pipeline", research_pipeline)

# Wire the steps in a straight line:
# clarification -> query_gen -> pipeline -> END
workflow.set_entry_point("clarification")
workflow.add_edge("clarification", "query_gen")
workflow.add_edge("query_gen", "pipeline")
workflow.add_edge("pipeline", END)

# Compile the definition into the runnable graph object.
graph = workflow.compile()


Cell 9 — Placeholder helpers: search check and fact-extraction stub (Milestone 2+)

In [None]:
def decide_search(state: ResearchState) -> bool:
    """Report whether the state holds any search results."""
    if state.search_results:
        return True
    return False

def extract_facts_with_gemini(text: str):
    """Placeholder for fact extraction (milestone 2+); always returns an empty list."""
    facts = []
    return facts


Cell 10 — Continuous chatbot loop

In [None]:
def chatbot():
    """
    Run the interactive research loop on stdin/stdout.

    For each user query: ask Gemini whether clarification is needed, collect
    one free-text clarification if so, run the research pipeline and print the
    resulting summary. The loop ends when the user types ``exit`` or ``quit``,
    or when input is closed or interrupted (EOF / Ctrl-C).
    """
    print("🤖 Research Agent Ready (Milestone 1)\n")
    while True:
        try:
            query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of a traceback.
            print("\n👋 Ending session.")
            return

        if not query:
            # Nothing typed - re-prompt rather than sending an empty query
            # to the LLM.
            continue
        if query.lower() in ("exit", "quit"):
            print("👋 Ending session.")
            break

        state = ResearchState(query=query)

        # Step 1: Clarification
        state = clarification_agent(state)
        if state.clarification_needed and state.follow_up_questions:
            print("\n🤖 I need a bit more info:")
            for q in state.follow_up_questions:
                print(" -", q)
            try:
                ans = input("\nYour clarification: ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\n👋 Ending session.")
                return
            # An empty answer adds no information; keep the original query
            # instead of appending a dangling "| Clarified: " suffix.
            if ans:
                state.clarified_query = f"{state.query} | Clarified: {ans}"
            else:
                state.clarified_query = state.query
        else:
            state.clarified_query = state.query

        # Step 2: Query & Pipeline
        state = query_generator(state)
        state = research_pipeline(state)

        # Step 3: Final Output
        print("\n📌 Research Summary:")
        print(state.summary)
        print("\n---\n")

# Start the interactive loop only when this code is executed directly (as a
# notebook cell or script), not when the file is imported as a module -
# otherwise the import would block waiting on input().
if __name__ == "__main__":
    chatbot()
