This notebook contains the code for a bot, deployed with Streamlit, that answers your questions about ICS and related topics. It uses adaptive RAG: each query is first classified, and the answer is then produced with a retrieval strategy suited to that classification.

Run the following pip install commands to install the required libraries:

In [None]:
!pip install -U langchain_community --quiet
!pip install langchain_google_genai --quiet
!pip install langchain_text_splitter --quiet
!pip install chromadb --quiet
!pip install streamlit --quiet
!pip install pyngrok --quiet

The code below scrapes the ICS website: it collects every link on the base URL's page that points to the same host and extracts the text of each linked page. The text is then split into chunks of 500 characters with an overlap of 50 characters to maintain continuity between chunks.

In [None]:
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Example URL (replace with your target)
base_url = "https://www.iitk.ac.in/counsel/"

# Without a timeout a single unresponsive server blocks the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(base_url, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail loudly if the landing page is unavailable; otherwise an error page
# would be parsed and the crawl would silently produce nothing useful.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
# Text of the landing page itself. Note that it is not added to `total_text`;
# only the pages linked from the landing page are indexed below.
full_text = soup.get_text(separator="\n", strip=True)

# Collect every link on the landing page that stays on the same host.
base_host = urlparse(base_url).netloc
links = set()
for a_tag in soup.find_all("a", href=True):
    # Resolve relative hrefs and drop any "#fragment" so that anchors into the
    # same page do not cause that page to be fetched and indexed repeatedly.
    full_url, _fragment = urldefrag(urljoin(base_url, a_tag["href"]))
    if urlparse(full_url).netloc == base_host:
        links.add(full_url)

# One entry of extracted text per successfully fetched page.
total_text = []
for link in links:
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
        # Treat 4xx/5xx as failures instead of indexing the error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Couldnt load link : {link}")
        print(f"  reason: {e}")
        continue
    finally:
        # Be polite to the server: pause between consecutive requests.
        time.sleep(1)

    # Skip PDFs, images, etc.; parsing them as HTML only yields garbage text.
    content_type = response.headers.get("Content-Type", "")
    if content_type and "html" not in content_type.lower():
        print(f"Skipping non-HTML link : {link} ({content_type})")
        continue

    page_soup = BeautifulSoup(response.text, "html.parser")
    page_text = page_soup.get_text(separator="\n", strip=True)
    total_text.append(page_text)
    print(f"\n📄 {link}")
    print(page_text[:100])  # Print first 100 chars

docs = [Document(page_content=text) for text in total_text]

# Split on progressively finer separators so that chunks break at paragraph
# or sentence boundaries where possible.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
chunks = splitter.split_documents(docs)

In [None]:
import os

from google.colab import userdata

# Fetch the key stored under the name 'GOOGLE_API_KEY' via Colab's userdata
# helper, then export it as an environment variable; app.py reads it back
# with os.getenv("GOOGLE_API_KEY").
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

Here, we initialize the embeddings and store the chunks in a vector store persisted in the directory 'my_chroma_db', so that it can be accessed again from app.py without having to rebuild the vector store every time the app is launched.

In [None]:
# These classes are otherwise only imported inside the text of app.py, which
# is written to disk rather than executed in this kernel, so the notebook
# needs its own imports to avoid a NameError here.
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

texts = [chunk.page_content for chunk in chunks]
if not texts:
    # Building a store from nothing fails with an opaque error further down;
    # report the real cause (the scraping step produced no text) instead.
    raise ValueError("No text chunks to index; check that the scraping cell fetched any pages.")

# persist_directory makes the collection available to app.py, which reopens
# the same "my_chroma_db" directory.
vectorstore = Chroma.from_texts(texts, embeddings, persist_directory="my_chroma_db")

# Persist to disk. Recent Chroma versions write to persist_directory
# automatically and the explicit call is deprecated, so only call it when the
# installed wrapper still provides it.
if hasattr(vectorstore, "persist"):
    vectorstore.persist()

Here, we write the app.py file, which contains the Streamlit app together with the adaptive RAG functions.

In [None]:
%%writefile app.py
import streamlit as st
import time
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import numpy as np
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel,Field
from typing import List
import os
# API key for the Google Generative AI services, taken from the environment
# of the Streamlit process (None if the variable is not set).
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Embedding model used to embed queries against the stored collection.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model="models/embedding-001",
)

# Open the Chroma store kept in the "my_chroma_db" directory.
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="my_chroma_db",
)

# Retriever over the store; k=10 is passed through as a search argument.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Chat model shared by the classifier and by every answer chain below.
llm = ChatGoogleGenerativeAI(
    api_key=GOOGLE_API_KEY,
    temperature=0.1,
    model="gemini-1.5-flash",
)
# Structured-output schema for the query classifier: a single string field
# naming the category. Documented with comments rather than a class docstring
# so that the schema generated from this model stays exactly as it was.
class categories_options(BaseModel):
    category: str = Field(
        description="The category of the query, the options are: Factual, Analytical, Opinion, or Contextual",
        example="Factual",
    )
# Prompt asking the model to label a query with one of the four categories.
prompt = PromptTemplate(
    template="Classify the following query into one of these categories: Factual, Analytical, Opinion, or Contextual.\nQuery: {query}\nCategory:",
    input_variables=["query"],
)

# Classifier chain: the prompt is piped into the LLM, whose output is
# requested in the shape of `categories_options`.
classify_chain = prompt | llm.with_structured_output(categories_options)
# Structured-output schema for prompts that expand one query into several:
# a list of query strings. Documented with comments rather than a class
# docstring so that the schema generated from this model stays unchanged.
class SubQueries(BaseModel):
    sub_queries: List[str] = Field(
        description="List of sub-queries",
        example=[
            "What is the population of New York?",
            "What is the GDP of New York?",
        ],
    )
def Analytical(query):
    """Answer an analytical query by breaking it into sub-questions.

    The LLM is asked to generate 3 sub-questions for ``query`` (returned in
    the ``SubQueries`` structured-output shape). For each sub-question the
    retriever is queried, the retrieved text is given to the LLM as context,
    and the sub-question together with its answer is written to the
    Streamlit page.

    Args:
        query: The user's question.

    Returns:
        None. Output is produced through ``st.write``.
    """
    decompose_prompt = PromptTemplate(
        input_variables=["query"],
        template="Generate 3 sub-questions for: {query}",
    )
    decompose_chain = decompose_prompt | llm.with_structured_output(SubQueries)
    result = decompose_chain.invoke({"query": query})
    # The structured result may be None or carry an empty list when the model
    # does not follow the schema; fall back to answering the query as-is.
    sub_queries = (result.sub_queries if result is not None else None) or [query]

    response_prompt = PromptTemplate(
        # Must name the placeholders the template actually uses.
        input_variables=["question", "context"],
        template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. "
                 "If you don't know the answer, just say that you don't know. "
                 "Use three sentences maximum and keep the answer concise. "
                 "Question: {question} Context: {context}Answer:",
    )
    answer_chain = response_prompt | llm

    for sub_query in sub_queries:
        # `invoke` is the supported retriever entry point;
        # `get_relevant_documents` is deprecated.
        documents = retriever.invoke(sub_query)
        # Pass only the page text: interpolating the Document objects would
        # put their Python repr (metadata, quoting, escapes) into the prompt.
        context = "\n\n".join(doc.page_content for doc in documents)
        response = answer_chain.invoke({"context": context, "question": sub_query}).content
        st.write(f"Query : {sub_query} ; Answer : {response}\n")

def Factual(query: str):
    """Answer a factual query after rewriting it for better retrieval.

    The LLM first rewrites ``query`` into a single enhanced query. That
    rewritten query is used both to fetch documents from the retriever and as
    the question put to the LLM, whose answer is written to the Streamlit
    page.

    Args:
        query: The user's question.

    Returns:
        None. Output is produced through ``st.write``.
    """
    rewrite_prompt = PromptTemplate(
        input_variables=["query"],
        template="Enhance this factual query for better information retrieval but do not give options it should be a single query \nQuery: {query}\nNew_Query:",
    )
    new_query = (rewrite_prompt | llm).invoke({"query": query}).content

    response_prompt = PromptTemplate(
        # Must name the placeholders the template actually uses.
        input_variables=["question", "context"],
        template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. "
                 "If you don't know the answer, just say that you don't know. "
                 "Use three sentences maximum and keep the answer concise. "
                 "Question: {question} Context: {context}Answer:",
    )
    answer_chain = response_prompt | llm

    # `invoke` is the supported retriever entry point;
    # `get_relevant_documents` is deprecated.
    documents = retriever.invoke(new_query)
    # Pass only the page text: interpolating the Document objects would put
    # their Python repr (metadata, quoting, escapes) into the prompt.
    context = "\n\n".join(doc.page_content for doc in documents)
    response = answer_chain.invoke({"context": context, "question": new_query}).content
    st.write(response)
def Opinion(query: str):
    """Answer an opinion query from several viewpoints.

    The LLM is asked to create 3 new queries expressing different viewpoints
    on ``query`` (returned in the ``SubQueries`` structured-output shape).
    Each of them is answered from the documents retrieved for it, and each
    query/answer pair is written to the Streamlit page.

    Args:
        query: The user's question.

    Returns:
        None. Output is produced through ``st.write``.
    """
    viewpoint_prompt = PromptTemplate(
        input_variables=["query"],
        template="Create 3 new queries explaining different view points in relation to the given query: {query}",
    )
    viewpoint_chain = viewpoint_prompt | llm.with_structured_output(SubQueries)
    result = viewpoint_chain.invoke({"query": query})
    # The structured result may be None or carry an empty list when the model
    # does not follow the schema; fall back to answering the query as-is.
    viewpoint_queries = (result.sub_queries if result is not None else None) or [query]

    response_prompt = PromptTemplate(
        # Must name the placeholders the template actually uses.
        input_variables=["question", "context"],
        template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. "
                 "If you don't know the answer, just say that you don't know. "
                 "Use three sentences maximum and keep the answer concise. "
                 "Question: {question} Context: {context}Answer:",
    )
    answer_chain = response_prompt | llm

    for viewpoint_query in viewpoint_queries:
        # `invoke` is the supported retriever entry point;
        # `get_relevant_documents` is deprecated.
        documents = retriever.invoke(viewpoint_query)
        # Pass only the page text: interpolating the Document objects would
        # put their Python repr (metadata, quoting, escapes) into the prompt.
        context = "\n\n".join(doc.page_content for doc in documents)
        response = answer_chain.invoke({"context": context, "question": viewpoint_query}).content
        st.write(f"Query : {viewpoint_query} ; Answer : {response}\n")
def Contextual(query):
    """Answer a contextual query after reformulating it with retrieved text.

    Documents retrieved for the original ``query`` are given to the LLM as
    context for reformulating the query. The reformulated query is then used
    for a second retrieval and as the question put to the LLM, whose answer
    is written to the Streamlit page.

    Args:
        query: The user's question.

    Returns:
        None. Output is produced through ``st.write``.
    """

    def _retrieve_text(search_query):
        # `invoke` is the supported retriever entry point
        # (`get_relevant_documents` is deprecated). Only the page text is
        # returned: interpolating Document objects would put their Python
        # repr (metadata, quoting, escapes) into the prompt.
        documents = retriever.invoke(search_query)
        return "\n\n".join(doc.page_content for doc in documents)

    context_prompt = PromptTemplate(
        input_variables=["query", "context"],
        template="Given the user context: {context}\nReformulate the query to best address the user's needs: {query}",
    )
    context_chain = context_prompt | llm
    contextualized_query = context_chain.invoke(
        {"context": _retrieve_text(query), "query": query}
    ).content

    response_prompt = PromptTemplate(
        # Must name the placeholders the template actually uses.
        input_variables=["question", "context"],
        template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. "
                 "If you don't know the answer, just say that you don't know. "
                 "Use three sentences maximum and keep the answer concise. "
                 "Question: {question} Context: {context}Answer:",
    )
    answer_chain = response_prompt | llm
    response = answer_chain.invoke(
        {"context": _retrieve_text(contextualized_query), "question": contextualized_query}
    ).content
    st.write(response)
# Explicit allow-list mapping each category label to its handler. The label
# comes from the LLM, so it must not be used to look up arbitrary globals:
# an unexpected value would raise KeyError or call an unintended object.
_CATEGORY_HANDLERS = {
    "Factual": Factual,
    "Analytical": Analytical,
    "Opinion": Opinion,
    "Contextual": Contextual,
}

st.title("Hi, this is a bot to help you answer all your questions related to ICS and more!!!")
st.write("Welcome! Any question you have can be asked below:")
query = st.text_input("Enter your question or topic:")
if query:
    # Keep the spinner visible for the real work (classification, retrieval
    # and generation) rather than for an artificial delay.
    with st.spinner("Processing..."):
        classification = classify_chain.invoke({"query": query})
        # The structured result may be None if the model ignores the schema.
        raw_category = getattr(classification, "category", None) or ""
        # Tolerate casing/whitespace differences such as "factual " or "OPINION".
        category = raw_category.strip().capitalize()
        handler = _CATEGORY_HANDLERS.get(category)
        if handler is None:
            # Unknown label: fall back to the simplest strategy, not a crash.
            st.write(f"Could not classify your query (got {raw_category!r}); treating it as a Factual query")
            category = "Factual"
            handler = Factual
        else:
            st.write(f"Your query is a {category} query")
        handler(query)


Here, we launch the app. To open it, click on the link that is printed.

In [None]:
# Expose the Streamlit app through an ngrok tunnel and start the server.
from pyngrok import ngrok
import os
# `userdata` is not imported in this cell; it relies on the earlier
# `from google.colab import userdata` having been run in this kernel.
AUTH = userdata.get("NGROK")
ngrok.set_auth_token(AUTH)
# Presumably clears a Streamlit process and ngrok tunnel left over from a
# previous run of this cell, so port 8501 and the tunnel are free again.
!pkill streamlit
ngrok.kill()

# Open a tunnel to local port 8501, the port Streamlit is started on below.
public_url = ngrok.connect(8501)
print(f"🚀 Streamlit is live at: {public_url}")

# Start the app written by the %%writefile cell.
!streamlit run app.py --server.port 8501 --server.headless true --server.enableCORS false
