In [3]:
from dotenv import load_dotenv
load_dotenv()
import os
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, UnstructuredPowerPointLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA

# Set your Google API Key
def set_api_key(api_key=None):
    """Export the Google API key into ``os.environ["GOOGLE_API_KEY"]``.

    Args:
        api_key: Key to export. When ``None``, the current value of the
            ``GOOGLE_API_KEY`` environment variable is used instead.

    Returns:
        None. The outcome is reported on stdout; when no non-empty key is
        available the environment is left untouched.
    """
    resolved_key = os.getenv("GOOGLE_API_KEY") if api_key is None else api_key

    if resolved_key:
        os.environ["GOOGLE_API_KEY"] = resolved_key
        print("API Key Successfully Completed")
    else:
        # Covers both a missing variable and an explicitly empty key.
        print("API key not found!")

# Start the chatbot with ability to add new files to existing DB
def initialize_chatbot(force_reindex=False):
    """Build the CTSE RetrievalQA chain backed by a persistent Chroma index.

    Lecture material (PDF and PPTX) is loaded from ``data_sets``. If a
    non-empty Chroma directory already exists at ``./Chroma_db`` and
    ``force_reindex`` is false, only documents whose ``source`` metadata is
    not yet present in the index are chunked and added. Otherwise the index
    is (re)built from every loaded document.

    Args:
        force_reindex: When true, discard the existing collection (if any)
            and embed all documents again.

    Returns:
        A ``RetrievalQA`` chain that returns source documents, or ``None``
        when the lecture directory is missing or contains no PDF/PPTX files.
    """
    print("Starting CTSE Chatbot...")

    # Define the prompt template
    template = """
    As an expert assistant for the CTSE course, provide concise, focused answers to course material, 
    avoiding making up answers if you don't know the answer.

    Context:
    {context}

    Question: {question}

    Answer:
    """

    prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )

    # Initialize the LLM
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.2,
    )

    # Setup paths and embedding model
    db_path = "./Chroma_db"
    lecture_notes_path = "data_sets"

    # Initialize embedding model
    print("Starting embedding model...")
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"}
    )

    # Load all PDF and PPTX documents from the directory
    print(f"Scanning {lecture_notes_path} folder for lecture materials...")

    if not os.path.exists(lecture_notes_path):
        print(f"Error: Lecture notes directory not found: {lecture_notes_path}")
        print("Please create this directory and add your lecture notes (PDFs or PPTXs).")
        return None

    all_documents = _load_lecture_documents(lecture_notes_path)

    if not all_documents:
        print("Error: No documents (PDF/PPTX) found in the lecture notes directory.")
        return None

    print(f"Total documents found: {len(all_documents)}")

    # isdir() guards os.listdir() against db_path being a regular file.
    db_exists = os.path.isdir(db_path) and len(os.listdir(db_path)) > 0

    # Process documents based on DB existence
    if db_exists and not force_reindex:
        print("Existing vector database found.")

        print("Loading existing Chroma DB...")
        vectorstore = Chroma(
            persist_directory=db_path,
            embedding_function=embedding_model
        )

        # Get list of documents already in the DB
        print("Checking for new documents to embed...")
        # "metadatas" is a list with one dict per stored chunk. Dicts are
        # unhashable, so the list must be iterated directly rather than
        # passed to set(). Only metadata is requested to avoid pulling every
        # chunk's text out of the store.
        stored_metadatas = vectorstore.get(include=["metadatas"])["metadatas"] or []
        existing_sources = {
            meta["source"]
            for meta in stored_metadatas
            if meta and "source" in meta
        }

        print(f"Found {len(existing_sources)} documents already embedded.")

        # Filter new documents
        new_documents = [
            doc for doc in all_documents
            if doc.metadata.get("source") not in existing_sources
        ]

        print(f"Found {len(new_documents)} new documents to embed.")

        if new_documents:
            print("Processing new documents...")
            new_splits = _split_into_chunks(new_documents)
            print(f"Created {len(new_splits)} chunks from new documents.")

            print("Adding new documents to Chroma DB...")
            vectorstore.add_documents(new_splits)
            print("Successfully added new documents to existing database...!")
        else:
            print("No new documents to add.")
    else:
        if force_reindex:
            print("Full reindexing initiated by user request.")
            if db_exists:
                # Chroma.from_documents() appends to whatever collection is
                # already persisted at db_path, so without this step every
                # forced reindex would store a second copy of each chunk.
                print("Removing previously indexed chunks...")
                Chroma(
                    persist_directory=db_path,
                    embedding_function=embedding_model
                ).delete_collection()
        else:
            print("No existing Chroma DB found or it's empty.")

        print("Creating new vector database from all documents...")

        print("Splitting documents into text chunks for vector embedding...")
        splits = _split_into_chunks(all_documents)
        print(f"Chunking complete: {len(splits)} sections generated from {len(all_documents)} documents.")

        print("Embedding documents (this may take some time)...")
        start_time = time.time()
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embedding_model,
            persist_directory=db_path
        )
        end_time = time.time()
        print(f"Embedding completed in {end_time - start_time:.2f}s.")

    # Create QA chain
    print("Building intelligent QA system from embedded knowledge...")
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )

    print("Chatbot Started successful and ready to answer queries....!")
    return qa_chain


def _load_lecture_documents(directory):
    """Load every PDF and PPTX found recursively under ``directory``."""
    pdf_docs = DirectoryLoader(
        directory,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader
    ).load()

    pptx_docs = DirectoryLoader(
        directory,
        glob="**/*.pptx",
        loader_cls=UnstructuredPowerPointLoader
    ).load()

    return pdf_docs + pptx_docs


def _split_into_chunks(documents):
    """Split documents into 1000-character chunks with a 200-character overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    return text_splitter.split_documents(documents)

# Function to ask a question and get an answer
def ask_question(qa_chain, question):
    """Run one question through the QA chain and pretty-print the outcome.

    Args:
        qa_chain: The chain returned by ``initialize_chatbot()``. Any object
            exposing ``invoke(inputs)`` is accepted; a plain callable taking
            the inputs dict is used as a fallback. ``None`` is reported as an
            error instead of being called.
        question: The user's question text.

    Returns:
        The chain's result dict (read here via its ``"result"`` and
        ``"source_documents"`` keys), or ``None`` if the chain is missing or
        raised while answering.
    """
    print("\n" + "=" * 60)
    print(" --- CTSE Chatbot ---")
    print("=" * 60)
    print(f"📌 Question: {question}")
    print("-" * 60)

    if qa_chain is None:
        # initialize_chatbot() returns None when no lecture material is found.
        print("❌ Error while processing your question: the chatbot is not initialized.")
        print("=" * 60)
        return None

    try:
        print("🔎 Scanning documents for the best match...")
        # Calling a chain directly (Chain.__call__) is deprecated in LangChain;
        # prefer invoke() and only fall back to calling the object itself.
        run_chain = getattr(qa_chain, "invoke", qa_chain)

        start_time = time.perf_counter()
        result = run_chain({"query": question})
        elapsed = time.perf_counter() - start_time

        answer = result["result"]
        source_docs = result.get("source_documents") or []

        print(f"\n✅ Answer: {answer}")
        print(f"\n --- Response generated in {elapsed:.2f} seconds ---")

        if source_docs:
            print("\n📚 Sources:")
            # dict.fromkeys de-duplicates while keeping retrieval order, so
            # the three names shown are the first distinct ones returned,
            # not an arbitrary subset picked by set iteration order.
            unique_sources = list(dict.fromkeys(
                os.path.basename(doc.metadata.get('source', 'Unknown'))
                for doc in source_docs
            ))
            for idx, source in enumerate(unique_sources[:3], 1):
                print(f"  {idx}. {source}")

        print("=" * 60)
        return result

    except Exception as e:
        # Deliberately broad: a failed query should not abort the notebook.
        print(f"❌ Error while processing your question: {str(e)}")
        print("=" * 60)
        return None

In [4]:
# SECURITY: never hardcode an API key in a notebook -- it ends up in version
# control and in every shared copy. The key that used to be written here must
# be treated as leaked and revoked in the Google console.
# Provide GOOGLE_API_KEY through the environment or a local .env file (read by
# load_dotenv() in the first cell) instead.
api_key = os.getenv("GOOGLE_API_KEY")
set_api_key(api_key)

qa_chain = initialize_chatbot(force_reindex=True)

API Key Successfully Completed
Starting CTSE Chatbot...
Starting embedding model...
Scanning data_sets folder for lecture materials...
Total documents found: 630
Full reindexing initiated by user request.
Creating new vector database from all documents...
Splitting documents into text chunks for vector embedding...
Chunking complete: 1102 sections generated from 630 documents.
Embedding documents (this may take some time)...
Embedding completed in 55.85s.
Building intelligent QA system from embedded knowledge...
Chatbot Started successful and ready to answer queries....!


In [6]:
# Sample query used to smoke-test the chain built in the previous cell.
question = "can you tell me what is Docker?"
result = ask_question(qa_chain=qa_chain, question=question)


 --- CTSE Chatbot ---
📌 Question: can you tell me what is Docker?
------------------------------------------------------------
🔎 Scanning documents for the best match...

✅ Answer: Docker is a container engine (runtime + tool for managing containers and images) that packages and runs applications in isolated environments called containers.  It provides a CLI tool, platform, and company offerings.

 --- Response generated in 3.03 seconds ---

📚 Sources:
  1. Containers 101.pptx
