In [None]:
%pip install langchain langchain-community langchain-google-genai chromadb docling python-dotenv

In [None]:
import os
from typing import List
import tempfile
from pathlib import Path
import uuid

# Core imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate

# LangGraph imports for modern memory management
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langgraph.prebuilt import create_react_agent

# Document processing
from docling.document_converter import DocumentConverter

# Set up environment
from dotenv import load_dotenv

load_dotenv()

# Configuration
# Read the key from the environment (populated by load_dotenv() above or by the
# shell) instead of overwriting it with an empty placeholder. Put
# GOOGLE_API_KEY=<your key> in a .env file or export it before running.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if GOOGLE_API_KEY:
    # Re-export so libraries that read the variable directly see the same value.
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


class RAGChatSystem:
    """Retrieval-augmented chat over user documents.

    Documents are converted to Markdown with Docling, split into overlapping
    chunks, embedded into a Chroma vector store and queried from a single-node
    LangGraph application whose state is checkpointed per ``thread_id``.
    """

    def __init__(self, temperature=0.5, top_p=0.9):
        """Initialize the RAG chat system with LangGraph persistence.

        Args:
            temperature: Sampling temperature forwarded to the Gemini LLM.
            top_p: Nucleus-sampling cutoff forwarded to the Gemini LLM.
        """
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.llm = GoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            # Honour the caller's value; it used to be hard-coded to 0.3,
            # which silently ignored the constructor argument.
            temperature=temperature,
            top_p=top_p,
            google_api_key=GOOGLE_API_KEY,
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )
        # Populated by create_vector_store().
        self.vectorstore = None
        self.document_converter = DocumentConverter()

        # Checkpointer handed to the compiled graph; conversations are keyed
        # by thread_id.
        self.memory = MemorySaver()
        # Compiled LangGraph app; stays None until a vector store exists.
        self.app = None
        self.thread_id = str(uuid.uuid4())

    def process_documents(self, file_paths: List[str]) -> List[Document]:
        """Process documents using Docling and return LangChain documents.

        Each file is converted to Markdown and wrapped in one ``Document``
        whose metadata carries ``source`` (the given path) and ``filename``.
        Files that fail to convert are reported on stdout and skipped.

        Args:
            file_paths: Paths of the files to convert.

        Returns:
            One ``Document`` per successfully converted file, in input order.
        """
        documents = []

        for file_path in file_paths:
            try:
                print(f"Processing: {file_path}")

                # Convert document using Docling
                result = self.document_converter.convert(file_path)

                # Extract text content
                text_content = result.document.export_to_markdown()

                # Create LangChain document
                documents.append(
                    Document(
                        page_content=text_content,
                        metadata={
                            "source": file_path,
                            "filename": Path(file_path).name,
                        },
                    )
                )

            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the batch.
                print(f"Error processing {file_path}: {str(e)}")
                continue

        return documents

    def create_vector_store(self, documents: List[Document]):
        """Create ChromaDB vector store from documents.

        Splits the documents into chunks, embeds them into a Chroma store
        persisted under ``./chroma_db`` and (re)builds the LangGraph app.
        Does nothing except print a message when there is nothing to index.

        Args:
            documents: Documents as returned by ``process_documents``.
        """
        if not documents:
            print("No documents to process!")
            return

        # Split documents into chunks
        texts = self.text_splitter.split_documents(documents)
        print(f"Created {len(texts)} text chunks")

        if not texts:
            # Documents without any text yield no chunks; skip the vector
            # store call instead of handing it an empty batch.
            print("No text chunks to index!")
            return

        # Create vector store
        self.vectorstore = Chroma.from_documents(
            documents=texts, embedding=self.embeddings, persist_directory="./chroma_db"
        )

        # Create LangGraph application
        self._create_langgraph_app()

        print("Vector store and LangGraph app created successfully!")

    def _create_langgraph_app(self):
        """Create LangGraph application with RAG capabilities.

        Builds a one-node graph (``START -> rag_agent -> END``) over
        ``MessagesState`` and compiles it with ``self.memory`` as checkpointer.
        Requires ``self.vectorstore`` to be set.
        """
        # Imported locally because the file-level import block only brings in START.
        from langgraph.graph import END

        # Build the retriever once per app instead of once per turn.
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 3})

        def rag_agent(state: MessagesState):
            """RAG agent that retrieves context and generates responses."""
            # Get the last human message.
            # NOTE: only this latest message is used for retrieval and for the
            # prompt; earlier turns are checkpointed but not sent to the LLM.
            last_message = state["messages"][-1]
            query = last_message.content

            # Retrieve relevant documents
            relevant_docs = retriever.invoke(query)

            # Create context from retrieved documents
            context = "\n\n".join(doc.page_content for doc in relevant_docs)

            # Create RAG prompt
            rag_prompt = f"""Based on the following context, answer the user's question:

Context:
{context}

Question: {query}

Answer based on the context provided. If the context doesn't contain relevant information, say so."""

            # Generate response using LLM
            response = self.llm.invoke(rag_prompt)

            # Return AIMessage with sources
            sources_info = f"\n\nSources: {len(relevant_docs)} documents used"
            return {"messages": [AIMessage(content=response + sources_info)]}

        # Create the graph
        graph_builder = StateGraph(MessagesState)
        graph_builder.add_node("rag_agent", rag_agent)
        graph_builder.add_edge(START, "rag_agent")
        # Terminate explicitly after the single node.
        graph_builder.add_edge("rag_agent", END)

        # Compile with checkpointer for memory
        self.app = graph_builder.compile(checkpointer=self.memory)

    def chat(self, question: str) -> dict:
        """Chat with the RAG system using LangGraph.

        Args:
            question: The user's message.

        Returns:
            ``{"answer": str, "thread_id": str}`` on success, or
            ``{"error": str}`` when no documents have been indexed yet or the
            graph invocation raises.
        """
        if not self.app:
            return {"error": "Please upload and process documents first!"}

        try:
            # Create config with thread ID for conversation persistence
            config = {"configurable": {"thread_id": self.thread_id}}

            # Invoke the graph with the user's question
            result = self.app.invoke(
                {"messages": [HumanMessage(content=question)]}, config=config
            )

            # Extract the response
            ai_message = result["messages"][-1]

            return {"answer": ai_message.content, "thread_id": self.thread_id}

        except Exception as e:
            # Surface failures to the caller as data rather than raising.
            return {"error": f"Error generating response: {str(e)}"}

    def reset_chat_history(self):
        """Reset conversation by creating a new thread ID."""
        self.thread_id = str(uuid.uuid4())
        print(f"Chat history cleared! New thread ID: {self.thread_id}")

    def get_chat_history(self):
        """Get current conversation history.

        Returns:
            The messages stored for the current thread, or an empty list when
            the app has not been built yet or no messages are stored.
        """
        if not self.app:
            return []

        config = {"configurable": {"thread_id": self.thread_id}}
        state = self.app.get_state(config)
        return state.values.get("messages", [])


# Module-level instance with default settings; interactive_chat() below talks
# to this object by name.
rag_system = RAGChatSystem()

In [None]:
def process_folder(
    self, folder_path: str, supported_extensions: List[str] = None
) -> List[Document]:
    """Process all documents in a specified folder.

    Only regular files directly inside ``folder_path`` are considered
    (no recursion). Matching files are handed to ``self.process_documents``
    in sorted filename order.

    NOTE(review): this function is defined at module level yet takes ``self``
    and calls ``self.process_documents``; it looks intended to be a method.
    As written it must be called as ``process_folder(system, path)``.

    Args:
        folder_path: Directory to scan.
        supported_extensions: File suffixes (with leading dot) to accept,
            matched case-insensitively. Defaults to a built-in list of
            common document formats.

    Returns:
        Whatever ``self.process_documents`` returns for the matching files,
        or an empty list when the folder is missing, is not a directory, or
        contains no supported files.
    """
    if supported_extensions is None:
        supported_extensions = [
            ".pdf",
            ".pptx",
            ".txt",
            ".docx",
            ".doc",
            ".md",
            ".html",
            ".csv",
            ".xlsx",
        ]

    # Normalise once: filenames are lower-cased before comparison, so the
    # accepted suffixes must be too (otherwise ".PDF" would never match).
    allowed = {ext.lower() for ext in supported_extensions}

    # isdir, not exists: a path to a regular file would pass an existence
    # check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        if os.path.exists(folder_path):
            print(f"Error: '{folder_path}' is not a folder!")
        else:
            print(f"Error: Folder '{folder_path}' does not exist!")
        return []

    # Collect supported regular files; sorted for a deterministic order.
    file_paths = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        if os.path.splitext(filename)[1].lower() in allowed:
            file_paths.append(file_path)

    print(f"Found {len(file_paths)} supported files in '{folder_path}':")
    for fp in file_paths:
        print(f"  - {os.path.basename(fp)}")

    if not file_paths:
        print("No supported files found in the folder!")
        return []

    # Process all found files
    return self.process_documents(file_paths)

In [None]:
def interactive_chat():
    """Simple interactive chat interface.

    Reads lines from stdin and forwards them to the module-level
    ``rag_system``. ``quit`` ends the loop, ``reset`` starts a fresh
    conversation thread, blank input is ignored. End-of-input or an interrupt
    while waiting for input also ends the loop.
    """
    print("RAG Chat System Ready! Type 'quit' to exit, 'reset' to clear history.")
    print("-" * 50)

    while True:
        try:
            question = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C / kernel interrupt: leave quietly
            # instead of dumping a traceback.
            print()
            break

        if not question:
            continue

        command = question.lower()
        if command == "quit":
            break
        if command == "reset":
            rag_system.reset_chat_history()
            continue

        response = rag_system.chat(question)

        if "error" in response:
            print(f"Error: {response['error']}")
            continue

        print(f"\nBot: {response['answer']}")

        # chat() returns only "answer" and "thread_id"; indexing
        # "source_documents" unconditionally raised KeyError after every
        # successful answer. Print the count only if the key is present.
        sources = response.get("source_documents")
        if sources is not None:
            print(f"\nSources used: {len(sources)} documents")


# Start interactive chat: runs as soon as this cell/script executes and blocks
# on user input until the chat loop exits.
interactive_chat()