In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)
import subprocess

def run(commands):
    """Run shell commands one after another, echoing their output as it arrives.

    Each command is executed through the shell with stderr merged into stdout.
    Output is decoded as UTF-8 (undecodable bytes are replaced) and printed
    line by line.

    Args:
        commands: Iterable of shell command strings.

    Raises:
        RuntimeError: If any output line contains "undefined reference".
    """
    for command in commands:
        # Default buffering is used on purpose: ``bufsize=1`` (line buffering)
        # is only supported for text-mode pipes and triggers a RuntimeWarning
        # on the binary pipe opened here.
        with subprocess.Popen(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        ) as sp:
            for line in sp.stdout:
                line = line.decode("utf-8", errors="replace")
                if "undefined reference" in line:
                    raise RuntimeError("Failed Processing.")
                print(line, flush=True, end="")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install langchain langchain-core langchain-community langchain-groq langchain-google-genai langchain-ollama faiss-cpu pypdf gradio python-dotenv tavily-python tiktoken uuid

In [None]:
import uuid
import logging
import base64
from dotenv import load_dotenv
from typing import Annotated, Dict, Any, Optional, List, Sequence, Literal, TypedDict
from typing_extensions import TypedDict
from pydantic import BaseModel as PydanticBaseModel, Field # Alias Pydantic BaseModel

# --- LangChain Core Imports ---
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document

# --- Document Loaders and Splitters ---
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Vector Stores and Embeddings ---
from langchain_community.vectorstores import FAISS
# Ensure you have Ollama running and the model pulled
# ollama pull nomic-embed-text
from langchain_ollama import OllamaEmbeddings # Correct import

# --- LLMs ---
# Ensure you have GROQ and Google API keys set in your environment
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Tools ---
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools.retriever import create_retriever_tool

# --- LangGraph ---
from langgraph.graph import StateGraph, END, START
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver # In-memory checkpointer

# --- Gradio ---
import gradio as gr

In [None]:
# Install Ollama with the official install script (output is streamed by run()).
commands = [
        "curl -fsSL https://ollama.com/install.sh | sh",
]
run(commands)

import os
import time
import urllib.request

# Start the Ollama server in the background so the notebook can keep running.
os.system("/usr/local/bin/ollama serve &")

# The server needs a moment to start listening; the next cell runs
# `ollama pull`, which fails if the server is not up yet. Poll Ollama's
# default local address until it answers (or give up after ~30 seconds).
for _attempt in range(30):
    try:
        with urllib.request.urlopen("http://127.0.0.1:11434", timeout=1):
            break
    except OSError:
        time.sleep(1)
else:
    print("Warning: Ollama server did not respond within 30 seconds; later cells may fail.")

os.system("echo 'ollama test'")

In [None]:
# Download the embedding model through the Ollama CLI.
commands = ["ollama pull nomic-embed-text"]
run(commands)

In [None]:
# Embedding model served by the local Ollama instance; "nomic-embed-text" is
# the model pulled earlier in this notebook.
embed = OllamaEmbeddings(model="nomic-embed-text")

# SemanticChunker lives in the separate langchain-experimental package. If it
# is not installed, fall back to the already-imported character splitter so
# this cell still produces chunks.
try:
    from langchain_experimental.text_splitter import SemanticChunker
    chunks = SemanticChunker(embeddings=embed)
except ImportError:
    print("langchain-experimental is not installed; falling back to RecursiveCharacterTextSplitter.")
    chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# Load the PDF into documents, then split them with the chunker defined above.
pdf = PyPDFLoader("/kaggle/input/optopp/Book.pdf")
pdf_documents = pdf.load()
docs = chunks.split_documents(pdf_documents)

In [None]:
# Embed the chunks into a FAISS index and persist it in the "cadb" folder.
vectorstore = FAISS.from_documents(documents=docs, embedding=embed)
vectorstore.save_local(folder_path="cadb")
# Bare final expression: the notebook displays the resulting retriever.
vectorstore.as_retriever()

In [None]:
from kaggle_secrets import UserSecretsClient

# Notebook-wide logger; every later cell logs through `logger`.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Fetch API keys from Kaggle's secret store.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GOOGLE_API_KEY")
secret_value_1 = user_secrets.get_secret("GROQ_API_KEY")
secret_value_2 = user_secrets.get_secret("LANGCHAIN_API_KEY")
secret_value_3 = user_secrets.get_secret("TAVILY_API_KEY")

# Export the keys so the Groq, Google, LangChain and Tavily clients created
# later can pick them up from the environment (they are not passed explicitly).
for _env_name, _env_value in (
    ("GOOGLE_API_KEY", secret_value_0),
    ("GROQ_API_KEY", secret_value_1),
    ("LANGCHAIN_API_KEY", secret_value_2),
    ("TAVILY_API_KEY", secret_value_3),
):
    os.environ[_env_name] = _env_value

# Keep an already-configured project name; otherwise use the default below.
os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGCHAIN_PROJECT', 'Business Law RAG Agent')

# --- Global Variables & Configuration ---
VECTOR_STORE_PATH = "cadb"  # Folder holding the saved FAISS index
OLLAMA_EMBED_MODEL = "nomic-embed-text" # Ollama model for embeddings
GROQ_MODEL = 'meta-llama/llama-4-scout-17b-16e-instruct' # Groq model (ensure this is available)
GEMINI_MODEL = "gemini-2.0-flash" # Google model

# Global variable to hold the retriever once the PDF is processed
global_retriever = None
# True once a FAISS index has been loaded or built successfully
indexing_done = False


def load_retriever() -> Optional[Any]:
    """Load the saved FAISS index and return it as a retriever.

    Updates the module-level ``indexing_done`` flag to reflect the outcome.

    Returns:
        A retriever built from the index at ``VECTOR_STORE_PATH``, or ``None``
        when the path does not exist or loading fails.
    """
    global indexing_done

    index_missing = not os.path.exists(VECTOR_STORE_PATH)
    if index_missing:
        logger.warning(f"FAISS index not found at {VECTOR_STORE_PATH}. Please upload and index a PDF.")
        indexing_done = False
        return None

    try:
        logger.info(f"Loading FAISS index from {VECTOR_STORE_PATH}...")
        # NOTE: allow_dangerous_deserialization enables pickle loading; only
        # safe because the index is expected to be produced by this notebook.
        store = FAISS.load_local(
            VECTOR_STORE_PATH,
            OllamaEmbeddings(model=OLLAMA_EMBED_MODEL),
            allow_dangerous_deserialization=True,
        )
        logger.info("FAISS index loaded successfully.")
        indexing_done = True
        return store.as_retriever()
    except Exception as load_error:
        logger.error(f"Error loading FAISS index: {load_error}", exc_info=True)
        indexing_done = False
        return None

# --- Initialize LLMs and Tools ---
logger.info("Initializing LLMs and Tools...")
try:
    # Groq-hosted model for the lightweight steps (grading, query rewriting).
    llm_fast = ChatGroq(
        model=GROQ_MODEL,
        temperature=0.2,
    )
    # Gemini model for producing the final answer.
    llm_gen = ChatGoogleGenerativeAI(
        model=GEMINI_MODEL,
        temperature=0.3,
    )

    # Reuse a previously saved index when there is one (None otherwise).
    global_retriever = load_retriever()

    search_tool = TavilySearchResults(max_results=2, search_depth='basic')
    logger.info("LLMs and Tavily Search Tool initialized.")

except Exception as init_error:
    logger.error(f"Initialization Error: {init_error}", exc_info=True)
    # Nothing downstream can work without these components, so stop here.
    raise SystemExit("Failed to initialize core components.")

In [None]:
# --- RAG Graph Definition ---

class RAGGraphState(TypedDict):
    """State passed between the nodes of the RAG pipeline graph."""
    # The user's question as received by the pipeline.
    original_question: str
    # Query used for retrieval and web search (written by rewrite_query_node).
    question: str
    # Documents obtained from the vector store or from web search.
    documents: List[Document]
    # Grader verdict on the vector-store results. Spelled Optional[bool]
    # (not `bool | None`) to match the rest of the file and to stay valid on
    # Python versions that cannot evaluate PEP 604 unions at class creation.
    is_rag_relevant: Optional[bool]
    # Final answer text produced by generate_answer_node.
    generation: str
    # Which source supplied `documents`.
    source_used: Literal["Vector Store", "Web Search", "None"]

# Pydantic model for the grader
# Structured-output schema for the relevance grader: grade_documents_node
# passes this class to `llm_fast.with_structured_output(...)` and reads
# `binary_score` from the result.
# NOTE(review): the class docstring and the field description are presumably
# forwarded to the model as part of the schema, so treat them as prompt text
# rather than ordinary documentation — confirm before rewording.
class GradeDocuments(PydanticBaseModel):
    """Binary score for relevance check."""
    # Restricted to the two literal values the grading loop compares against.
    binary_score: Literal['yes', 'no'] = Field(description="Is the document relevant? ('yes' or 'no')")

# RAG Nodes (Adapted for Business Law Context)
def rewrite_query_node(state: RAGGraphState) -> RAGGraphState:
    """Rephrase the user's question into a retrieval-oriented legal query.

    Reads ``original_question`` from the state and asks the fast LLM to
    rewrite it. Returns a state update containing ``question`` and
    ``original_question``; if the LLM call fails, ``question`` is simply the
    original question.
    """
    logger.info("--- RAG Node: REWRITE QUERY ---")
    user_question = state['original_question']

    system_message = (
        "system",
        "You are a query rewriting expert specializing in Indian Business Law and the Companies Act. Rewrite the user's question to be precise and effective for retrieving relevant legal sections or explanations from a database. Focus on legal terms, section numbers (if mentioned), and core concepts. Do not answer the question, only rewrite.",
    )
    human_message = (
        "human",
        "Original question: {question}\n\nRewritten query for legal database:",
    )
    chain = ChatPromptTemplate.from_messages([system_message, human_message]) | llm_fast | StrOutputParser()

    try:
        query = chain.invoke({"question": user_question})
        logger.info(f"Rewritten Query: {query}")
    except Exception as rewrite_error:
        logger.error(f"Error rewriting query: {rewrite_error}", exc_info=True)
        # Fall back to the untouched question so the pipeline can continue.
        query = user_question

    return {"question": query, "original_question": user_question}


def retrieve_rag_node(state: RAGGraphState) -> RAGGraphState:
    """Fetch documents for the current query from the vector-store retriever.

    Returns a state update with ``documents`` and ``source_used``. When no
    retriever is loaded, or retrieval raises, the update carries an empty
    document list and ``source_used`` set to "None".
    """
    logger.info("--- RAG Node: RETRIEVE (Vector Store) ---")

    if global_retriever is None:
        logger.warning("Retriever not available. Skipping RAG retrieval.")
        return {"documents": [], "source_used": "None"}

    query = state['question']
    try:
        logger.info(f"Retrieving documents for: {query}")
        hits = global_retriever.invoke(query)
        logger.info(f"Retrieved {len(hits)} documents from vector store.")
    except Exception as retrieval_error:
        logger.error(f"Error during RAG retrieval: {retrieval_error}", exc_info=True)
        return {"documents": [], "source_used": "None"}

    return {"documents": hits, "source_used": "Vector Store"}

def grade_documents_node(state: RAGGraphState) -> RAGGraphState:
    """Decide whether the retrieved documents are relevant to the question.

    Only the first two documents are graded, each truncated to 1000
    characters. Grading stops at the first document scored 'yes'. A document
    whose grading raises is treated as not relevant.

    Returns:
        A state update with ``is_rag_relevant`` set to True or False.
    """
    logger.info("--- RAG Node: GRADE DOCUMENTS ---")
    question = state['question']
    documents = state['documents']
    if not documents:
        logger.warning("No documents to grade.")
        return {"is_rag_relevant": False}

    grader_instructions = """You are a grader assessing the relevance of a retrieved document snippet to a user question about Indian Business Law or the Companies Act.
    Focus ONLY on relevance. Does the document contain keywords, legal concepts, section numbers, or context that could potentially help answer the question?
    Output a binary score 'yes' or 'no' based SOLELY on relevance."""
    grader = ChatPromptTemplate.from_messages([
        ("system", grader_instructions),
        ("human", "Document snippet:\n\n{document}\n\nUser question: {question}"),
    ]) | llm_fast.with_structured_output(GradeDocuments)

    def _is_relevant(doc) -> bool:
        """Grade one document; any failure counts as 'not relevant'."""
        try:
            snippet = doc.page_content[:1000]  # Keep the grader's input small
            verdict = grader.invoke({"question": question, "document": snippet})
            if verdict.binary_score == 'yes':
                logger.info("--- GRADE: Relevant document found (Vector Store) ---")
                return True
            logger.info("--- GRADE: Document deemed NOT relevant (Vector Store) ---")
        except Exception as grading_error:
            logger.error(f"Error grading document: {grading_error}", exc_info=True)
        return False

    # any() short-circuits, so grading stops at the first relevant document.
    relevant_docs_found = any(_is_relevant(doc) for doc in documents[:2])

    logger.info(f"Vector Store Relevance Decision: {'Relevant' if relevant_docs_found else 'Not Relevant'}")
    return {"is_rag_relevant": relevant_docs_found}

def web_search_node(state: RAGGraphState) -> RAGGraphState:
    """Run a web search for the (rewritten) question and wrap hits as Documents.

    Results without content are skipped. If the search raises, a single
    placeholder Document describing the failure is returned instead.

    Returns:
        A state update with ``documents`` and ``source_used`` = "Web Search".
    """
    logger.info("--- RAG Node: WEB SEARCH ---")
    query = state['question']  # The rewritten question drives the search
    logger.info(f"Performing web search for: {query}")

    found = []
    try:
        results = search_tool.invoke({"query": query})
        if not (results and isinstance(results, list)):
            logger.warning(f"Web search returned unexpected or empty results: {results}")
        else:
            collected = []
            for hit in results:
                if hit.get("content"):
                    collected.append(
                        Document(
                            page_content=hit.get("content", ""),
                            metadata={"source": hit.get("url", "web")},
                        )
                    )
            found = collected
            logger.info(f"Found {len(found)} web results.")
    except Exception as search_error:
        logger.error(f"Error during web search: {search_error}", exc_info=True)
        found = [Document(page_content="Web search failed due to an error.", metadata={"source": "error"})]

    return {"documents": found, "source_used": "Web Search"}


def generate_answer_node(state: RAGGraphState) -> RAGGraphState:
    """Produce the final answer from the documents held in the state.

    The answer is generated for ``original_question`` using every document in
    ``documents`` as context. With no documents, a fixed "could not find"
    message is returned; if the LLM call raises, a fixed apology is returned.

    Returns:
        A state update with ``generation`` set to the answer text.
    """
    logger.info("--- RAG Node: GENERATE ANSWER ---")
    question = state['original_question']
    documents = state['documents']
    source = state.get('source_used', 'Unknown')  # Used when a document has no 'source' metadata

    if not documents:
        logger.warning("No documents available for generation.")
        return {"generation": "I could not find relevant information from the provided document or web search to answer your question."}

    # One "Source/Content" section per document, separated by blank lines.
    sections = []
    for doc in documents:
        origin = doc.metadata.get('source', source)
        sections.append(f"Source: {origin}\nContent: {doc.page_content}")
    context = "\n\n".join(sections)

    prompt_template = """You are an expert assistant specializing in Indian Business Law, particularly concepts covered in the provided study materials or relevant web searches.

CONTEXT INFORMATION:
{context}

USER QUESTION: {question}

INSTRUCTIONS:
1. Provide an accurate, clear, and concise answer based **strictly** on the CONTEXT INFORMATION provided above.
2. If the context comes from the "Vector Store" (the uploaded PDF), prioritize that information. If it comes from "Web Search", state that clearly (e.g., "According to web search results...").
3. Explain legal concepts in simple terms suitable for a foundation-level CA student. Define technical terms briefly if necessary.
4. If the context allows, mention relevant section numbers (e.g., "As per Section X of the Indian Contract Act...").
5. **Crucially:** If the context does **not** contain the information to answer the question, state clearly: "Based on the provided information, I cannot answer this question." Do **not** invent information or draw external knowledge not present in the context.
6. Format the answer for readability using paragraphs or bullet points.

Answer:
"""
    answer_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    answer_chain = answer_prompt | llm_gen | StrOutputParser()  # Generation uses llm_gen

    try:
        generation = answer_chain.invoke({"context": context, "question": question})
    except Exception as generation_error:
        logger.error(f"Error during generation: {generation_error}", exc_info=True)
        generation = "Sorry, an error occurred while generating the response."

    logger.info(f"Generated Answer (Source: {source}): {generation[:150]}...")
    return {"generation": generation}

def decide_search_or_generate(state: RAGGraphState) -> Literal["web_search", "generate_answer"]:
    """Route after grading: generate directly, or fall back to web search.

    Returns "generate_answer" when ``is_rag_relevant`` is truthy, otherwise
    "web_search" (including when the key is absent).
    """
    logger.info("--- RAG Edge: Decide Search or Generate ---")
    if not state.get('is_rag_relevant', False):
        logger.info("Decision: No relevant docs in vector store -> Web Search.")
        return "web_search"

    logger.info("Decision: Relevant docs found in vector store -> Generate Answer.")
    return "generate_answer"

In [None]:
logger.info("Building the RAG workflow graph...")
rag_workflow = StateGraph(RAGGraphState)

# Register the pipeline nodes.
for _node_name, _node_fn in (
    ("rewrite_query", rewrite_query_node),
    ("retrieve_rag", retrieve_rag_node),
    ("grade_documents", grade_documents_node),
    ("web_search", web_search_node),
    ("generate_answer", generate_answer_node),
):
    rag_workflow.add_node(_node_name, _node_fn)

# Linear part of the flow: rewrite -> retrieve -> grade.
rag_workflow.set_entry_point("rewrite_query")
for _edge_start, _edge_end in (
    ("rewrite_query", "retrieve_rag"),
    ("retrieve_rag", "grade_documents"),
):
    rag_workflow.add_edge(_edge_start, _edge_end)

# After grading, either answer from the vector store or go to web search.
rag_workflow.add_conditional_edges(
    "grade_documents",
    decide_search_or_generate,
    {"web_search": "web_search", "generate_answer": "generate_answer"},
)

# Web search feeds generation, and generation ends the run.
for _edge_start, _edge_end in (
    ("web_search", "generate_answer"),
    ("generate_answer", END),
):
    rag_workflow.add_edge(_edge_start, _edge_end)

try:
    rag_app = rag_workflow.compile()
    logger.info("RAG workflow graph compiled successfully.")
except Exception as compile_error:
    logger.error(f"Error compiling RAG graph: {compile_error}", exc_info=True)
    rag_app = None
    raise SystemExit("Failed to compile RAG workflow.")


# --- Conversational Graph Definition ---
class ConversationState(TypedDict):
    """Graph state of the conversational agent: the running message list."""
    # Annotated with add_messages as the update function for this key.
    messages: Annotated[List[BaseMessage], add_messages]

# Checkpointer for storing conversation history.
# Passed to conversation_graph.compile(checkpointer=memory) below; the chat
# handler selects a conversation through the "thread_id" config value.
# NOTE(review): MemorySaver is in-memory, so history is presumably lost when
# the kernel restarts.
memory = MemorySaver()

def call_rag_pipeline(state: ConversationState) -> ConversationState:
    """Answer the latest user message by running it through the RAG pipeline.

    Returns a state update holding a single AIMessage: the generated answer,
    or a fixed error message when the last message is not from the user, the
    RAG graph is not compiled, or the pipeline raises.
    """
    logger.info("--- Conversation Node: CALL RAG PIPELINE ---")

    latest = state['messages'][-1]
    if not isinstance(latest, HumanMessage):
        # Not expected in the normal flow; answer defensively instead of failing.
        logger.warning("Last message is not a HumanMessage, skipping RAG call.")
        return {"messages": [AIMessage(content="Internal error: Expected user input.")]}

    user_query = latest.content
    logger.info(f"Invoking RAG pipeline for query: {user_query}")

    if rag_app is None:
        logger.error("RAG App is not compiled. Cannot process query.")
        return {"messages": [AIMessage(content="Error: The document processing pipeline is not ready.")]}

    if not (indexing_done or global_retriever is not None):
        # No index available: the pipeline still runs and falls back to web search.
        logger.warning("PDF not indexed. RAG pipeline will rely on web search only.")

    try:
        pipeline_result = rag_app.invoke({"original_question": user_query})
        answer_text = pipeline_result.get("generation", "Sorry, I couldn't retrieve an answer.")
        logger.info("RAG pipeline finished.")
        # The answer goes back into the conversation state as an AI message.
        return {"messages": [AIMessage(content=answer_text)]}
    except Exception as pipeline_error:
        logger.error(f"Error invoking RAG pipeline: {pipeline_error}", exc_info=True)
        return {"messages": [AIMessage(content="Sorry, there was an error retrieving information.")]}

# Build the Conversational Graph
logger.info("Building the conversational workflow graph...")
# Single-node graph: every turn runs call_rag_pipeline and then ends.
conversation_graph = StateGraph(ConversationState)
conversation_graph.add_node("call_rag", call_rag_pipeline)
conversation_graph.set_entry_point("call_rag")
conversation_graph.add_edge("call_rag", END)

try:
    # The checkpointer keeps message history per thread.
    agent_executor = conversation_graph.compile(checkpointer=memory)
    logger.info("Conversational agent compiled successfully.")
except Exception as compile_error:
    logger.error(f"Error compiling conversational agent: {compile_error}", exc_info=True)
    agent_executor = None
    raise SystemExit("Failed to compile conversational agent.")


In [None]:
def handle_pdf_upload(pdf_file):
    """Index an uploaded PDF and install the resulting retriever.

    This is a generator: each yielded ``(status_text, chatbot_value)`` tuple
    updates the status box and the chatbot (``None`` clears the chatbot).
    On success the module-level ``global_retriever`` and ``indexing_done``
    are updated; on failure they are reset.

    Args:
        pdf_file: The uploaded file (an object with a ``name`` path attribute
            or a plain path string), or ``None`` when nothing was uploaded.

    Yields:
        ``(status_text, None)`` tuples describing progress and the outcome.
    """
    global global_retriever, indexing_done
    if pdf_file is None:
        # A generator must *yield* its output; a value passed to `return`
        # is discarded, which left the status box without any update.
        yield "Status: No PDF uploaded. Using existing index if available, or web search only.", None
        return

    # Accept both file objects exposing `.name` and plain path strings.
    file_path = getattr(pdf_file, "name", pdf_file)
    file_name = os.path.basename(file_path)
    logger.info(f"Processing uploaded PDF: {file_path}")
    yield f"Processing PDF: {file_name}...", None  # Update status, clear chatbot

    # NOTE(review): process_pdf is not defined in the visible part of this
    # notebook — confirm it exists. Any failure is reported through the
    # status box instead of escaping from the event handler.
    try:
        vectorstore = process_pdf(file_path)
    except Exception as e:
        logger.error(f"Error processing PDF {file_path}: {e}", exc_info=True)
        vectorstore = None

    if vectorstore:
        global_retriever = vectorstore.as_retriever()
        indexing_done = True
        logger.info("Retriever updated with new PDF.")
        yield f"Status: Successfully indexed {file_name}. Ready to chat.", None
    else:
        global_retriever = None  # Ensure retriever is None if indexing failed
        indexing_done = False
        logger.error("Retriever could not be created from PDF.")
        yield f"Status: Failed to index {file_name}. Please check logs. Will rely on web search.", None


def _messages_to_gradio_pairs(lc_messages):
    """Convert a LangChain message list into Gradio ``(user, assistant)`` tuples.

    Each HumanMessage opens a new pair; the next AIMessage fills the open
    pair. AI messages with no open pair are skipped, so the conversion does
    not depend on the messages alternating strictly.
    """
    pairs = []
    for lc_message in lc_messages:
        if isinstance(lc_message, HumanMessage):
            pairs.append((lc_message.content, None))
        elif isinstance(lc_message, AIMessage) and pairs and pairs[-1][1] is None:
            pairs[-1] = (pairs[-1][0], lc_message.content)
    return pairs


def chat_interface(message, history, thread_id_state):
    """Handle one chat turn: send the message to the agent and rebuild the history.

    Args:
        message: Text entered by the user.
        history: Current chatbot history as ``(user, assistant)`` tuples
            (``None`` is treated as empty).
        thread_id_state: Conversation thread id used as the checkpointer key.

    Returns:
        ``("", updated_history, thread_id_state)`` — an empty string to clear
        the textbox, the history to display, and the thread id.
    """
    logger.info(f"Received message for thread_id {thread_id_state}: {message}")

    history = list(history or [])
    if not thread_id_state:
        # Defensive: make sure the checkpointer always gets a usable key.
        thread_id_state = str(uuid.uuid4())

    if agent_executor is None:
        logger.error("Agent executor not available.")
        history.append((message, "Error: Chat agent is not ready."))
        return "", history, thread_id_state

    if not (message or "").strip():
        # Nothing to ask; leave the conversation untouched.
        return "", history, thread_id_state

    if not indexing_done and global_retriever is None:
        logger.warning("Chatting without an indexed PDF. Relying on web search.")

    # LangGraph configuration: the thread id selects the stored conversation.
    config = {"configurable": {"thread_id": thread_id_state}}

    # `result` must exist even when invoke() raises; otherwise the history
    # rebuild below fails with UnboundLocalError and hides the real error.
    result = None
    response_content = ""
    try:
        logger.info(f"Streaming response for thread_id {thread_id_state}")
        inputs = {"messages": HumanMessage(content=message)}

        # Invoke the agent; the result carries the thread's message history.
        result = agent_executor.invoke(inputs, config)

        if result and "messages" in result and len(result["messages"]) > 0:
            # The last message should be the AI's reply to the current input.
            ai_msg_obj = result["messages"][-1]
            if isinstance(ai_msg_obj, AIMessage):
                response_content = ai_msg_obj.content
            else:
                response_content = "Error: Agent returned an unexpected message type."
                logger.error(f"Unexpected message type from agent: {type(ai_msg_obj)}")
        else:
            response_content = "Sorry, I couldn't process that."
            logger.error(f"Agent did not return expected messages. Result: {result}")

        logger.info(f"Agent response received for thread_id {thread_id_state}")

    except Exception as e:
        logger.error(f"Error during agent invocation for thread_id {thread_id_state}: {e}", exc_info=True)
        response_content = f"An error occurred: {e}"

    lc_messages = result.get("messages") if result else None
    if lc_messages:
        gradio_history = _messages_to_gradio_pairs(lc_messages)
        if gradio_history and gradio_history[-1][1] is None and response_content:
            # Show the fallback/error text when the last turn has no AI reply.
            gradio_history[-1] = (gradio_history[-1][0], response_content)
    else:
        # Invocation failed or returned nothing usable: extend the existing
        # history with the message and whatever response text we have.
        history.append((message, response_content))
        gradio_history = history

    # Return empty string for the textbox, updated history, and the thread id.
    return "", gradio_history, thread_id_state


# --- Build Gradio App ---
logger.info("Building Gradio interface...")
with gr.Blocks(theme=gr.themes.Soft(), title="CA Business Law Chat") as demo:
    gr.Markdown("# Chat with CA Foundation Business Law Assistant")
    gr.Markdown("Upload your CA Foundation Business Law PDF (Syllabus 2022 context provided in the example) and ask questions.")

    # Hidden state for the conversation thread ID. A callable is passed so a
    # fresh ID is generated each time the app loads; a pre-computed string
    # would be shared by every session, mixing their conversation histories.
    thread_id_state = gr.State(value=lambda: str(uuid.uuid4()))

    with gr.Row():
        with gr.Column(scale=1):
            pdf_upload = gr.File(label="Upload Business Law PDF", file_types=[".pdf"])
            index_button = gr.Button("Index PDF")
            upload_status = gr.Textbox("Status: Please upload and index PDF.", label="Indexing Status", interactive=False)

        with gr.Column(scale=4):
            chatbot = gr.Chatbot(label="Chat History", height=600)
            msg_textbox = gr.Textbox(label="Your Question", placeholder="Ask about Contracts Act, Sale of Goods Act, etc.", scale=7)
            submit_button = gr.Button("Send", variant="primary", scale=1)
            clear_button = gr.ClearButton([msg_textbox, chatbot], value="Clear Chat")

    # Event Handlers
    upload_event = index_button.click(
        fn=handle_pdf_upload,
        inputs=[pdf_upload],
        outputs=[upload_status, chatbot] # Update status and clear chat on new index
    )

    # Handle message submission (using Enter key or Send button)
    msg_textbox.submit(
         fn=chat_interface,
         inputs=[msg_textbox, chatbot, thread_id_state],
         outputs=[msg_textbox, chatbot, thread_id_state] # Textbox cleared, chatbot updated
    )
    submit_button.click(
         fn=chat_interface,
         inputs=[msg_textbox, chatbot, thread_id_state],
         outputs=[msg_textbox, chatbot, thread_id_state]
    )

    # Clearing the chat must also start a new thread: chat_interface rebuilds
    # the displayed history from the stored thread, so without a new ID the
    # cleared conversation would reappear on the next message.
    clear_button.click(
        fn=lambda: str(uuid.uuid4()),
        inputs=None,
        outputs=[thread_id_state],
    )


logger.info("Gradio interface defined.")

# --- Launch the App ---
# Start the Gradio server when this code runs as the main module.
if __name__ == "__main__":
    logger.info("Launching Gradio App...")
    # share=False: no public share link is requested.
    demo.launch(share=False) # share=True to create public link if needed (use with caution)