In [None]:
# -----------------------------
# Config variables (inline instead of config.py)
# -----------------------------
# Prompt handed to the "stuff" QA chain; {context} and {question} are the
# placeholders declared in INPUT_VARIABLES below.
PROMPT_TEMPLATE = """
You are a great researcher. With the information provided understand in deep and try to answer the question.
If you cant answer the question based on the information either say you cant find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Dont generate irrelevant answers.

Context: {context}
Question: {question}
Do provide only helpful answers

Answer:
"""
INPUT_VARIABLES = ["context", "question"]

# Text splitting. `separators` of RecursiveCharacterTextSplitter is a list of
# strings; a bare "\n" only worked because iterating a one-character string
# yields that same character.
SEPARATORS = ["\n"]
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 500

# Embedding model name passed to HuggingFaceEmbeddings.
EMBEDDER = "BAAI/bge-base-en-v1.5"

# RetrievalQA settings: chain type and the retriever's search kwargs
# (k = number of chunks retrieved per query).
CHAIN_TYPE = "stuff"
SEARCH_KWARGS = {'k': 3}

# -----------------------------
# Imports
# -----------------------------
import os
import json
import requests
from dotenv import load_dotenv, find_dotenv

from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # updated

load_dotenv(find_dotenv())

# -----------------------------
# Researcher class
# -----------------------------
class Researcher:
    """Answer a question from live web search results.

    Pipeline: Serper web search -> pick result URLs -> load page text ->
    split into chunks -> FAISS index over HuggingFace embeddings ->
    RetrievalQA chain driven by a Groq chat model.
    """

    # Seconds to wait for the search API before giving up (requests has no
    # default timeout and would otherwise block indefinitely).
    SEARCH_TIMEOUT = 30

    # Returned when nothing could be loaded, instead of failing while
    # building an index over an empty corpus.
    NO_CONTENT_ANSWER = (
        "Unable to find an answer: no content could be retrieved for this query."
    )

    def __init__(self):
        """Read API keys from the environment and build splitter, LLM and embedder."""
        self.serper_api_key = os.getenv("SERPER_API_KEY")
        self.groq_api_key = os.getenv("GROQ_API_KEY")
        # Vector store; populated by research_given_query().
        self.db = None
        self.prompt_template = PromptTemplate(
            template=PROMPT_TEMPLATE,
            input_variables=INPUT_VARIABLES
        )
        # Accept either a single separator string or a list of separators.
        separators = [SEPARATORS] if isinstance(SEPARATORS, str) else list(SEPARATORS)
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

        # Use a current production Groq model
        self.llm = ChatGroq(
            temperature=0.5,
            model_name="groq/compound",  # updated model
            groq_api_key=self.groq_api_key
        )

        # CPU-friendly embeddings (safe for Colab free)
        self.hfembeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDER,
            model_kwargs={'device': 'cpu'}
        )

    def search_articles(self, query):
        """Query the Serper search API and return the decoded JSON response.

        Raises:
            requests.HTTPError: if the API answers with an error status
                (for example an invalid or missing API key).
            requests.Timeout: if no response arrives within SEARCH_TIMEOUT.
        """
        url = "https://google.serper.dev/search"
        headers = {
            'X-API-KEY': self.serper_api_key,
            'Content-Type': 'application/json'
        }
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps({"q": query}),
            timeout=self.SEARCH_TIMEOUT,
        )
        # Fail here rather than later on an error payload without results.
        response.raise_for_status()
        return response.json()

    def research_answerer(self):
        """Build the RetrievalQA chain over the current vector store.

        Raises:
            RuntimeError: if no vector store has been built yet.
        """
        if getattr(self, "db", None) is None:
            raise RuntimeError(
                "No vector index available; call research_given_query() first."
            )
        research_qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type=CHAIN_TYPE,
            retriever=self.db.as_retriever(search_kwargs=SEARCH_KWARGS),
            return_source_documents=True,
            verbose=True,
            chain_type_kwargs={"prompt": self.prompt_template}
        )
        return research_qa_chain

    def get_urls(self, articles, max_results=3):
        """Collect the answer-box link plus links of the first organic results.

        Args:
            articles: decoded search response (may be empty or None).
            max_results: how many leading organic results to consider.

        Returns:
            Unique URLs in ranking order; entries without a link are skipped.
        """
        if not articles:
            return []

        candidates = []
        answer_box = articles.get("answerBox")
        if isinstance(answer_box, dict):
            candidates.append(answer_box.get("link"))
        organic = articles.get("organic") or []
        candidates.extend(
            hit.get("link") for hit in organic[:max_results] if isinstance(hit, dict)
        )

        # The answer box often repeats the top organic hit; loading the same
        # page twice would only duplicate chunks in the index.
        urls = []
        for link in candidates:
            if link and link not in urls:
                urls.append(link)
        return urls

    def get_content_from_urls(self, urls):
        """Load the pages behind `urls`; return an empty list on failure."""
        if not urls:
            return []
        loader = UnstructuredURLLoader(urls=urls)
        try:
            research_content = loader.load()
        except Exception as e:
            print(f"Error loading URLs: {e}")
            research_content = []
        return research_content

    def research_given_query(self, research_objective, research_content):
        """Index `research_content` and answer `research_objective` from it.

        Stores the new FAISS index on `self.db`. Returns NO_CONTENT_ANSWER
        when there is nothing to index.
        """
        if not research_content:
            return Researcher.NO_CONTENT_ANSWER
        docs = self.text_splitter.split_documents(research_content)
        if not docs:
            return Researcher.NO_CONTENT_ANSWER
        self.db = FAISS.from_documents(documents=docs, embedding=self.hfembeddings)
        bot = self.research_answerer()
        research_out = bot.invoke({"query": research_objective})  # updated for latest LangChain
        return research_out["result"]

    def research(self, query):
        """Run the full pipeline for `query`: search, load, index, answer."""
        search_articles = self.search_articles(query)
        urls = self.get_urls(search_articles)
        research_content = self.get_content_from_urls(urls)
        answer = self.research_given_query(query, research_content)
        return answer

# -----------------------------
# Run example
# -----------------------------
if __name__ == "__main__":
    # Optionally paste API keys here (e.g. in Colab when no .env file exists).
    # Blank entries are skipped so keys already present in the environment
    # (including those loaded from .env) are not overwritten with "".
    manual_keys = {
        "SERPER_API_KEY": "",
        "GROQ_API_KEY": "",
    }
    for key_name, key_value in manual_keys.items():
        if key_value:
            os.environ[key_name] = key_value

    researcher = Researcher()
    query = "Who won Asia Cup 2025?"
    print("Searching and retrieving answer...")
    answer = researcher.research(query)
    print("\n===== Answer =====")
    print(answer)


Searching and retrieving answer...


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

===== Answer =====
Based on the provided context, the answer to the question "Who won Asia Cup 2025?" is:

India

This information is directly mentioned in the context, specifically in the article "Why did India refuse to accept the Asia Cup trophy after beating Pakistan?" which states: "Tensions between India and Pakistan scorched the world of sports on Sunday when the Indian cricket team refused to accept the Asia Cup trophy after beating their neighbours by five wickets in the final in Dubai." 

Additionally, the same information is also mentioned in the article "Asia Cup 2025: India and Pakistan turn cricket into militarised theatre" which states: "Despite being fined for making comments deemed political – dedicating India’s win to the victims of the Pahalgam attack and the Indian armed forces – at a post-match news conference on September 14, India’s captain, Suryakumar Yad

In [None]:
# Install the third-party packages imported by the code cells of this notebook (-q: quiet output).
!pip install -q langchain langchain_groq langchain_community langchain_huggingface langchain-text-splitters python-dotenv unstructured faiss-cpu sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m51.3 MB/s[0m eta [36m0:0

In [None]:
# -----------------------------
# Config variables (inline instead of config.py)
# -----------------------------
# Prompt handed to the "stuff" QA chain; {context} and {question} are the
# placeholders declared in INPUT_VARIABLES below.
PROMPT_TEMPLATE = """
You are a great researcher. With the information provided understand in deep and try to answer the question.
If you cant answer the question based on the information either say you cant find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Dont generate irrelevant answers.

Context: {context}
Question: {question}
Do provide only helpful answers

Answer:
"""
INPUT_VARIABLES = ["context", "question"]

# Text splitting. `separators` of RecursiveCharacterTextSplitter is a list of
# strings; a bare "\n" only worked because iterating a one-character string
# yields that same character.
SEPARATORS = ["\n"]
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 500

# Embedding model name passed to HuggingFaceEmbeddings.
EMBEDDER = "BAAI/bge-base-en-v1.5"

# RetrievalQA settings: chain type and the retriever's search kwargs
# (k = number of chunks retrieved per query).
CHAIN_TYPE = "stuff"
SEARCH_KWARGS = {'k': 3}

# -----------------------------
# Imports
# -----------------------------
import os
import json
import requests
from dotenv import load_dotenv, find_dotenv
from typing import List
import time
import statistics

from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings  # updated

load_dotenv(find_dotenv())

# -----------------------------
# Researcher class (with evaluation)
# -----------------------------
class Researcher:
    """Answer questions from live web search results and evaluate retrieval.

    Pipeline: Serper web search -> pick result URLs -> load page text ->
    split into chunks -> FAISS index over HuggingFace embeddings ->
    RetrievalQA chain driven by a Groq chat model.
    """

    # Seconds to wait for the search API before giving up (requests has no
    # default timeout and would otherwise block indefinitely).
    SEARCH_TIMEOUT = 30

    # Returned when nothing could be loaded, instead of failing while
    # building an index over an empty corpus.
    NO_CONTENT_ANSWER = (
        "Unable to find an answer: no content could be retrieved for this query."
    )

    def __init__(self):
        """Read API keys from the environment and build splitter, LLM and embedder."""
        self.serper_api_key = os.getenv("SERPER_API_KEY")
        self.groq_api_key = os.getenv("GROQ_API_KEY")
        # Vector store; populated by research_given_query().
        self.db = None
        self.prompt_template = PromptTemplate(
            template=PROMPT_TEMPLATE,
            input_variables=INPUT_VARIABLES
        )
        # Accept either a single separator string or a list of separators.
        separators = [SEPARATORS] if isinstance(SEPARATORS, str) else list(SEPARATORS)
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )

        # Use a current production Groq model
        self.llm = ChatGroq(
            temperature=0.5,
            model_name="groq/compound",
            groq_api_key=self.groq_api_key
        )

        # CPU-friendly embeddings (safe for Colab free)
        self.hfembeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDER,
            model_kwargs={'device': 'cpu'}
        )

    def search_articles(self, query: str) -> dict:
        """Query the Serper search API and return the decoded JSON response.

        Raises:
            requests.HTTPError: if the API answers with an error status.
            requests.Timeout: if no response arrives within SEARCH_TIMEOUT.
        """
        url = "https://google.serper.dev/search"
        headers = {
            'X-API-KEY': self.serper_api_key,
            'Content-Type': 'application/json'
        }
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps({"q": query}),
            timeout=self.SEARCH_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def research_answerer(self):
        """Build the RetrievalQA chain over the current vector store.

        Raises:
            RuntimeError: if no vector store has been built yet.
        """
        if getattr(self, "db", None) is None:
            raise RuntimeError(
                "No vector index available; call research_given_query() first."
            )
        research_qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type=CHAIN_TYPE,
            retriever=self.db.as_retriever(search_kwargs=SEARCH_KWARGS),
            return_source_documents=True,
            verbose=True,
            chain_type_kwargs={"prompt": self.prompt_template}
        )
        return research_qa_chain

    def get_urls(self, articles: dict, max_results: int = 3) -> List[str]:
        """Collect the answer-box link plus links of the first organic results.

        Args:
            articles: decoded search response (may be empty or None).
            max_results: how many leading organic results to consider.

        Returns:
            Unique URLs in ranking order; entries without a link are skipped.
        """
        if not articles:
            return []

        candidates = []
        answer_box = articles.get("answerBox")
        if isinstance(answer_box, dict):
            candidates.append(answer_box.get("link"))
        organic = articles.get("organic") or []
        candidates.extend(
            hit.get("link") for hit in organic[:max_results] if isinstance(hit, dict)
        )

        # The answer box often repeats the top organic hit; loading the same
        # page twice would only duplicate chunks in the index.
        urls = []
        for link in candidates:
            if link and link not in urls:
                urls.append(link)
        return urls

    def get_content_from_urls(self, urls: List[str]):
        """Load each URL separately so one failing page does not drop the rest."""
        documents = []
        for url in urls:
            try:
                loader = UnstructuredURLLoader(urls=[url])
                docs = loader.load()
                # attach url to metadata if loader didn't
                for d in docs:
                    if not d.metadata.get("source"):
                        d.metadata["source"] = url
                documents.extend(docs)
            except Exception as e:
                print(f"Skipping URL {url}: {e}")
        return documents

    def research_given_query(self, research_objective: str, research_content):
        """Index `research_content` and answer `research_objective` from it.

        Stores the new FAISS index on `self.db`. Returns NO_CONTENT_ANSWER
        when there is nothing to index.
        """
        if not research_content:
            return Researcher.NO_CONTENT_ANSWER
        docs = self.text_splitter.split_documents(research_content)
        if not docs:
            return Researcher.NO_CONTENT_ANSWER
        self.db = FAISS.from_documents(documents=docs, embedding=self.hfembeddings)
        bot = self.research_answerer()
        # Choose the call style by capability. Falling back on *any* exception
        # would run the chain (and the LLM call) a second time after a genuine
        # failure such as an API error.
        invoke = getattr(bot, "invoke", None)
        if callable(invoke):
            research_out = invoke({"query": research_objective})
        else:
            research_out = bot({"query": research_objective})
        return research_out["result"]

    def research(self, query: str):
        """Run the full pipeline for `query`: search, load, index, answer."""
        search_articles = self.search_articles(query)
        urls = self.get_urls(search_articles)
        research_content = self.get_content_from_urls(urls)
        answer = self.research_given_query(query, research_content)
        return answer

    # -----------------------------
    # Evaluation: retrieval accuracy + response time
    # -----------------------------
    @staticmethod
    def _document_source(doc) -> str:
        """Return the lower-cased source URL from a document's metadata, or ""."""
        for key in ("source", "url", "source_url"):
            value = doc.metadata.get(key)
            if value:
                return str(value).lower()
        return ""

    def _retrieve_for_evaluation(self, query: str, top_k: int):
        """Run search -> load -> split -> index -> retrieve for one query.

        Returns:
            (urls, retrieved_docs, failure_status, error) where failure_status
            is None on success or one of "search_failed", "no_docs",
            "retrieval_failed".
        """
        try:
            search_res = self.search_articles(query)
        except Exception as e:
            return [], [], "search_failed", e

        urls = self.get_urls(search_res)
        docs = self.get_content_from_urls(urls)
        if not docs:
            return urls, [], "no_docs", None

        try:
            # Split exactly like research_given_query() so the evaluation
            # measures retrieval over the same chunks the QA chain sees.
            chunks = self.text_splitter.split_documents(docs)
            if not chunks:
                return urls, [], "no_docs", None
            db = FAISS.from_documents(documents=chunks, embedding=self.hfembeddings)
            retriever = db.as_retriever(search_kwargs={"k": top_k})
            # Prefer invoke(); get_relevant_documents() is the older spelling.
            retrieve = getattr(retriever, "invoke", None)
            if not callable(retrieve):
                retrieve = retriever.get_relevant_documents
            return urls, retrieve(query), None, None
        except Exception as e:
            return urls, [], "retrieval_failed", e

    def evaluate_retriever(self, dataset: List[dict], top_k: int = 3):
        """Measure retrieval hit rate and latency over `dataset`.

        Args:
            dataset: list of {"query":..., "expected_source":..., "expected_answer":...}.
                A query counts as a hit when one of the top-k retrieved chunks
                has a source containing expected_source, or text containing
                expected_answer (both compared case-insensitively).
            top_k: number of chunks retrieved per query.

        Returns:
            (accuracy, results, times) where accuracy is a percentage,
            results holds one tuple per query of the form
            (query, found, urls, retrieved_sources, status, elapsed_seconds)
            with status in {"hit", "miss", "search_failed", "no_docs",
            "retrieval_failed"}, and times lists the elapsed seconds per query.
            The LLM is never invoked.
        """
        results = []
        times = []
        correct = 0
        total = len(dataset)

        for i, sample in enumerate(dataset, start=1):
            q = sample["query"]
            expected_source = (sample.get("expected_source") or "").lower()
            expected_answer = (sample.get("expected_answer") or "").lower()

            # perf_counter is monotonic, unlike wall-clock time.time()
            start_time = time.perf_counter()
            urls, retrieved, status, error = self._retrieve_for_evaluation(q, top_k)

            # Record every retrieved source, not only those up to the first hit.
            retrieved_sources = [self._document_source(d) for d in retrieved]
            found = False
            if status is None:
                for d, src in zip(retrieved, retrieved_sources):
                    content = (d.page_content or "").lower()
                    if expected_source and expected_source in src:
                        found = True
                        break
                    if expected_answer and expected_answer in content:
                        found = True
                        break
                status = "hit" if found else "miss"

            elapsed = time.perf_counter() - start_time
            times.append(elapsed)
            if found:
                correct += 1

            # Same tuple shape for every outcome so callers can unpack uniformly.
            results.append((q, found, urls, retrieved_sources, status, elapsed))

            if status == "search_failed":
                print(f"[{i}] Search failed for query: {q} -> {error} (time: {elapsed:.3f}s)")
                continue
            if status == "no_docs":
                print(f"[{i}] No documents loaded for query: {q} (urls: {urls}) (time: {elapsed:.3f}s)")
                continue
            if status == "retrieval_failed":
                print(f"[{i}] Retrieval failed for query: {q} -> {error} (time: {elapsed:.3f}s)")
                continue

            # print per-query summary
            print(f"[{i}] Query: {q}")
            print(f"     expected_source: {sample.get('expected_source')}")
            print(f"     urls fetched: {urls}")
            print(f"     retrieved sources: {retrieved_sources}")
            print(f"     result: {status}")
            print(f"     response_time: {elapsed:.3f}s\n")

        accuracy = correct / total * 100 if total else 0.0
        print(f"Retrieval accuracy @top-{top_k}: {accuracy:.2f}% ({correct}/{total})")

        # timing summary
        if times:
            mean_t = statistics.mean(times)
            median_t = statistics.median(times)
            # With fewer than 100 samples the maximum stands in for p95.
            p95 = statistics.quantiles(times, n=100)[94] if len(times) >= 100 else max(times)
            print(f"Response time summary (s): mean={mean_t:.3f}, median={median_t:.3f}, p95~={p95:.3f}")

        return accuracy, results, times


# -----------------------------
# Test dataset: 14 entries in total (the list below holds 14 cases, not 16).
# The last two entries are presumably the likely-paywalled/failure cases.
# Each entry provides "query", "expected_answer" and "expected_source".
# -----------------------------
TEST_DATA = [
    {"query": "Who is the CEO of NVIDIA?", "expected_answer": "jensen huang", "expected_source": "nvidia.com"},
    {"query": "What is the capital of Japan?", "expected_answer": "tokyo", "expected_source": "en.wikipedia.org/wiki/Tokyo"},
    {"query": "When was OpenAI founded?", "expected_answer": "december 2015", "expected_source": "en.wikipedia.org/wiki/OpenAI"},
    {"query": "What is LangChain used for?", "expected_answer": "framework for building applications", "expected_source": "langchain.com"},
    {"query": "Which company created the Mistral-7B model?", "expected_answer": "mistral ai", "expected_source": "mistral.ai"},
    {"query": "What is FAISS used for?", "expected_answer": "similarity search", "expected_source": "github.com/facebookresearch/faiss"},
    {"query": "Who invented the World Wide Web?", "expected_answer": "tim berners-lee", "expected_source": "en.wikipedia.org/wiki/Tim_Berners-Lee"},
    {"query": "What is HuggingFace Transformers?", "expected_answer": "library for state-of-the-art natural language processing models", "expected_source": "huggingface.co/transformers"},
    {"query": "What is the default port for HTTP?", "expected_answer": "port 80", "expected_source": "developer.mozilla.org"},
    {"query": "What programming language is TensorFlow written in?", "expected_answer": "c++", "expected_source": "tensorflow.org"},
    {"query": "What is the latest iPhone model?", "expected_answer": "iphone 16", "expected_source": "apple.com"},
    {"query": "Which algorithm does Google use for search ranking?", "expected_answer": "pagerank", "expected_source": "en.wikipedia.org/wiki/PageRank"},
    # NOTE(review): evaluate_retriever lower-cases expected_answer and does a
    # substring check on the retrieved text, so the one-letter answer "C" below
    # matches almost any page — consider a longer, distinctive snippet.
    {
    "query": "Let x₀ be the real number such that e^(x₀) + x₀ = 0. For a given real number α, define g(x) = (3 x e^x + 3 x - α e^x - α x) / (3 (e^x + 1)) for all real numbers x. Then which one of the following statements is TRUE? (A) For α = 2, lim x→x₀ |g(x) + e^(x₀)| / |x - x₀| = 0. (B) For α = 2, lim x→x₀ |g(x) + e^(x₀)| / |x - x₀| = 1. (C) For α = 3, lim x→x₀ |g(x) + e^(x₀)| / |x - x₀| = 0. (D) For α = 3, lim x→x₀ |g(x) + e^(x₀)| / |x - x₀| = 2/3.",
    "expected_answer": "C",
    "expected_source": "sarthaks.com"
    },
    {"query": "Give me Optiver OA questions that happened on 14th October 2025", "expected_answer": "Researchers are studying the behavior of squirrels in a forest, where they compete to hide and collect nuts in hidden locations. You are tasked with implementing a tracker that can simulate this behavior. Your system is initialized with a list of locations and their capacities, and should support the following actions: Register a nut has been hid by a squirrel in a location. Register how nuts are retrieved by a squirrel from a location. Complete the functions described below in the SquirrelResearch class."
    , "expected_source": "https://oahelper.in/questions/U2FsdGVkX18gVO7LQ0Ng-51OZtY-etlab8ViFBPseMw?company_id=U2FsdGVkX19j-__NQxYfL1WkTv_vpON0lGt2of6ikmU"}
]

# -----------------------------
# Run evaluation if executed directly
# -----------------------------
if __name__ == "__main__":
    # API keys come from the environment (or a .env file loaded above).
    # setdefault keeps any existing value and only fills in an empty
    # placeholder when a key is missing.
    os.environ.setdefault("SERPER_API_KEY", "")
    os.environ.setdefault("GROQ_API_KEY", "")

    # Build the researcher once: the constructor creates the embedding model,
    # so a second instantiation would only repeat that work.
    researcher = Researcher()

    # Run the retriever-only evaluation (does NOT invoke the LLM for scoring)
    acc, details, timings = researcher.evaluate_retriever(TEST_DATA, top_k=3)


[1] Query: Who is the CEO of NVIDIA?
     expected_source: nvidia.com
     urls fetched: ['https://en.wikipedia.org/wiki/Jensen_Huang', 'https://www.linkedin.com/in/jenhsunhuang', 'https://www.forbes.com/profile/jensen-huang/']
     retrieved sources: ['https://www.forbes.com/profile/jensen-huang/']
     result: hit
     response_time: 5.066s

[2] Query: What is the capital of Japan?
     expected_source: en.wikipedia.org/wiki/Tokyo
     urls fetched: ['https://en.wikipedia.org/wiki/Capital_of_Japan', 'https://clintonwhitehouse3.archives.gov/WH/New/Pacific/tokyo.html', 'https://www.britannica.com/place/Tokyo']
     retrieved sources: ['https://clintonwhitehouse3.archives.gov/wh/new/pacific/tokyo.html']
     result: hit
     response_time: 5.694s

[3] Query: When was OpenAI founded?
     expected_source: en.wikipedia.org/wiki/OpenAI
     urls fetched: ['https://en.wikipedia.org/wiki/OpenAI', 'https://www.lxahub.com/stories/the-history-of-openai', 'https://www.opb.org/article/2023/11/24/