# Reimplemented Code (Compartmentalized)
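This notebook reimplements the original code with the same functionality, split into separate cells: it loads Word documents, stores their embeddings in a Chroma vector store, visualizes the embeddings with t-SNE, and serves a Gradio chat interface over them.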

## (Optional) Installation Commands
Uncomment if you're in a fresh environment (e.g., Google Colab) or missing dependencies.

In [1]:
# !pip install python-dotenv langchain_openai langchain_chroma plotly gradio langchain_community unstructured python-docx
# !apt-get install -y libreoffice

In [2]:
# %pip install scikit-learn numpy

## Imports and Environment Setup
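Imports every dependency, loads environment variables from a `.env` file, and defines the configuration constants used by the rest of the notebook. Note that this cell does not mount Google Drive: if you are running on Google Colab with Drive, mount it yourself and point `db_name` at a path inside it.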

In [3]:
import os
import glob
import numpy as np
import plotly.graph_objects as go
import gradio as gr
import docx

from dotenv import load_dotenv
from langchain.document_loaders import (
    DirectoryLoader,
    TextLoader,
    UnstructuredWordDocumentLoader
)
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from sklearn.manifold import TSNE


# Load variables from a local .env file first so the settings below can read them.
load_dotenv()

# NOTE(review): MODEL is not referenced by the cells shown in this notebook; the
# main cell passes model_name="gpt-4o-mini" directly. Confirm which one is intended.
MODEL = "gpt-4o"

# Location of the persisted Chroma database. The VECTOR_DB_DIR environment
# variable overrides the original hard-coded path so the notebook can run on
# other machines without editing this cell.
db_name = os.getenv(
    "VECTOR_DB_DIR",
    "C:\\Users\\Ben.Ball\\Documents\\work_projects\\llm_engineering\\vector_db2",
)
persist_directory = db_name

# Folders to ingest when the vector store is rebuilt (load_from_scratch=True).
# Taken from the DOC_FOLDERS environment variable as an os.pathsep-separated
# list; empty when the variable is not set. The main cell reads `folders`
# whenever load_from_scratch is True, so it has to exist.
folders = [path for path in os.getenv("DOC_FOLDERS", "").split(os.pathsep) if path]

text_loader_kwargs = {"encoding": "utf-8"}
chunk_size = 2000
chunk_overlap = 400
load_from_scratch = False


## 1) Document Processing Function
Loads `.doc`/`.docx` files, filters out temp files, and splits them into smaller chunks.

In [4]:
def _find_word_files(folder, extension):
    """
    Recursively list files under ``folder`` whose names end in ``extension``.

    Temporary files whose base name starts with "~$" are excluded.
    """
    pattern = os.path.join(folder, f"**/*{extension}")
    matches = glob.glob(pattern, recursive=True)
    print(f"Found {len(matches)} files with extension {extension}: {matches}")
    kept = [path for path in matches if not os.path.basename(path).startswith("~$")]
    print(f"Filtered files: {kept}")
    return kept


def _load_word_file(path, doc_type, loader_kwargs):
    """Load one Word file and tag every resulting document with ``doc_type``."""
    loader = UnstructuredWordDocumentLoader(path, **loader_kwargs)
    loaded = loader.load()  # load() returns a list of documents
    for doc in loaded:
        doc.metadata["doc_type"] = doc_type
    return loaded


def process_folders(
    folders,
    text_loader_kwargs,
    chunk_size=2000,
    chunk_overlap=400
):
    """
    Load the Word documents found in ``folders`` and split them into chunks.

    Each folder is searched recursively for ``.docx`` files and then ``.doc``
    files. Every loaded document gets a ``doc_type`` metadata entry equal to
    the base name of the folder it came from. A file that fails to load is
    reported and skipped; the remaining files are still processed.

    Args:
        folders (list): Folder paths to process. ``None`` is treated as empty.
        text_loader_kwargs (dict | None): Extra keyword arguments passed to the
            Word document loader (e.g., encoding). ``None`` means no extras.
        chunk_size (int): Size of text chunks for splitting.
        chunk_overlap (int): Overlap between text chunks.

    Returns:
        tuple: A tuple containing:
            - documents (list): Loaded documents with metadata.
            - chunks (list): Split document chunks (empty when nothing loaded).
    """
    # `**None` would raise, and callers may legitimately pass None.
    loader_kwargs = dict(text_loader_kwargs or {})
    documents = []

    for folder in folders or []:
        print(f"--- Processing folder: {folder} ---")
        # normpath strips a trailing separator, which would otherwise make
        # basename() return an empty string.
        doc_type = os.path.basename(os.path.normpath(folder))
        print(f"Determined document type: {doc_type}")

        for extension in (".docx", ".doc"):
            print(f"Filtering {extension} files...")
            files = _find_word_files(folder, extension)
            print(f"Found {len(files)} valid {extension} files: {files}")

            for path in files:
                print(f"Processing {extension} file: {path}")
                # The try sits inside the loop so that one unreadable file
                # does not abort the rest of the folder.
                try:
                    loaded = _load_word_file(path, doc_type, loader_kwargs)
                except Exception as e:
                    print(f"Error processing {extension} file {path}: {e}")
                    continue
                documents.extend(loaded)
                print(f"Loaded {len(loaded)} documents from file: {path}")

    print("--- Completed processing folders ---")
    print(f"Total documents loaded: {len(documents)}")

    if not documents:
        print("[WARN] No documents were loaded; nothing to split.")
        return documents, []

    # Split the documents into smaller chunks
    print("Splitting documents into chunks...")
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(documents)
    print(f"Number of chunks created: {len(chunks)}")

    # Sorted so the summary line is stable between runs.
    doc_types = sorted({chunk.metadata["doc_type"] for chunk in chunks})
    print(f"Document types found: {', '.join(doc_types)}")

    return documents, chunks


## 2) Managing the Chroma Vector Store
Handles loading an existing DB or creating a new one from scratch.

In [5]:
def manage_vector_store(
    folders,
    embeddings,
    persist_directory,
    load_from_scratch=False,
    text_loader_kwargs=None,
    chunk_size=2000,
    chunk_overlap=400
):
    """
    Manage a Chroma vector store: load an existing one or build a new one.

    The existing store is loaded when ``load_from_scratch`` is False and
    ``persist_directory`` exists. Otherwise the documents in ``folders`` are
    processed and a new store is written to ``persist_directory``; any
    collection already persisted there is deleted first so repeated rebuilds
    do not accumulate duplicate chunks.

    Args:
        folders (list): List of folder paths to process. Only needed when the
            store has to be built.
        embeddings: Embedding function for Chroma.
        persist_directory (str): Directory to persist or load the vector store.
        load_from_scratch (bool): Whether to process documents from scratch.
        text_loader_kwargs (dict | None): Arguments for text loading
            (e.g., encoding). ``None`` means no extra arguments.
        chunk_size (int): Size of text chunks for splitting.
        chunk_overlap (int): Overlap between text chunks.

    Returns:
        Chroma: Loaded or created vector store.

    Raises:
        ValueError: If the store must be built but ``folders`` is empty/None,
            or if processing the folders produced no chunks.
    """
    if not load_from_scratch and os.path.exists(persist_directory):
        # Load existing vector store
        print("[INFO] Loading existing vector store...")
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        )
        print(f"Loaded vectorstore with {vectorstore._collection.count()} documents")
        return vectorstore

    # Building is required from here on; fail early with a clear message
    # instead of an opaque TypeError from iterating over None.
    if not folders:
        raise ValueError(
            "No folders were provided, but the vector store has to be built "
            f"(load_from_scratch={load_from_scratch}, persist_directory={persist_directory!r})."
        )

    print("[INFO] Loading documents and creating a new vector store...")
    documents, chunks = process_folders(
        folders=folders,
        text_loader_kwargs=text_loader_kwargs or {},
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    if not chunks:
        # Checked before touching the persisted data so a failed ingest never
        # wipes a working store.
        raise ValueError(f"No document chunks were produced from folders: {folders}")

    if os.path.exists(persist_directory):
        # Chroma.from_documents adds to whatever is already persisted, so drop
        # the old collection to make "from scratch" actually start empty.
        print("[INFO] Removing previously persisted collection...")
        Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings
        ).delete_collection()

    print("[INFO] Creating and saving vector store...")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    print(f"Vectorstore created with {vectorstore._collection.count()} documents")

    return vectorstore


## 3) The Chat Function
Simple wrapper to invoke the conversation chain.

In [6]:
def chat(message, history):
    """
    Answer one user message by running it through the shared conversation chain.

    `history` is part of the callback signature but is ignored here: the chain
    keeps track of earlier turns through its own memory.
    """
    payload = {"question": message}
    return conversation_chain.invoke(payload)["answer"]


## 4) Main Execution
Putting it all together: embedding initialization, vector store management, TSNE visualizations, and launching the Gradio interface.

In [7]:
# 'conversation_chain' lives at module level so the chat() callback can reach it.
# It stays None until the chain is built at the end of this cell.
conversation_chain = None

if __name__ == "__main__":
    # Step 1: Initialize embeddings
    print("[INFO] Initializing embeddings...")
    embeddings = OpenAIEmbeddings()
    print("[INFO] Embeddings initialized successfully.")

    # Step 2: Manage vector store.
    # `folders` is only evaluated when rebuilding, so it must be defined in the
    # configuration cell before load_from_scratch is switched on.
    vectorstore = manage_vector_store(
        folders=folders if load_from_scratch else None,
        embeddings=embeddings,
        persist_directory=persist_directory,
        load_from_scratch=load_from_scratch,
        text_loader_kwargs=text_loader_kwargs,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # Step 3: Fetch Collection and Extract Data
    print("[INFO] Fetching vector store collection...")
    collection = vectorstore._collection
    if collection.count() == 0:
        print("[ERROR] Vector store is empty. Exiting...")
        # The bare exit() helper is meant for interactive shells only; raising
        # SystemExit stops this cell (or script) with a non-zero status.
        raise SystemExit(1)

    result = collection.get(include=["embeddings", "documents", "metadatas"])
    vectors = np.array(result["embeddings"])
    documents = result["documents"]
    # Entries without a doc_type are labelled "unknown" instead of raising KeyError.
    doc_types = [
        (metadata or {}).get("doc_type", "unknown")
        for metadata in result["metadatas"]
    ]

    # Step 4: Assign one color per document type.
    # "procedures" stays first so it keeps its original color (blue); any other
    # type found in the store gets the next palette entry instead of raising.
    palette = ["blue", "green", "red", "orange", "purple", "brown", "teal", "gray"]
    categories = ["procedures"]
    for doc_type in doc_types:
        if doc_type not in categories:
            categories.append(doc_type)
    color_by_type = {
        doc_type: palette[index % len(palette)]
        for index, doc_type in enumerate(categories)
    }
    colors = [color_by_type[doc_type] for doc_type in doc_types]

    # Hover labels are identical for the 2D and 3D plots, so build them once.
    hover_text = [
        f"Type: {t}<br>Text: {(d or '')[:100]}..."
        for t, d in zip(doc_types, documents)
    ]

    n_samples = len(vectors)
    if n_samples < 2:
        # t-SNE needs 0 < perplexity < n_samples, which is impossible here.
        print("[WARN] Fewer than 2 vectors in the store; skipping t-SNE visualizations.")
    else:
        # Step 5: 2D Visualization with t-SNE
        print("[INFO] Reducing dimensionality to 2D for visualization...")
        perplexity = min(30, n_samples - 1)  # Perplexity must be < number of samples
        tsne_2d = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        reduced_vectors_2d = tsne_2d.fit_transform(vectors)

        print("[INFO] Creating 2D scatter plot...")
        fig_2d = go.Figure(data=[go.Scatter(
            x=reduced_vectors_2d[:, 0],
            y=reduced_vectors_2d[:, 1],
            mode="markers",
            marker=dict(size=5, color=colors, opacity=0.8),
            text=hover_text,
            hoverinfo="text"
        )])
        fig_2d.update_layout(
            title="2D Chroma Vector Store Visualization",
            xaxis_title="t-SNE Dimension 1",
            yaxis_title="t-SNE Dimension 2",
            width=800,
            height=600,
            margin=dict(r=20, b=10, l=10, t=40)
        )
        fig_2d.show()

        # Step 6: 3D Visualization with t-SNE
        print("[INFO] Reducing dimensionality to 3D for visualization...")
        tsne_3d = TSNE(n_components=3, random_state=42, perplexity=perplexity)
        reduced_vectors_3d = tsne_3d.fit_transform(vectors)

        print("[INFO] Creating 3D scatter plot...")
        fig_3d = go.Figure(data=[go.Scatter3d(
            x=reduced_vectors_3d[:, 0],
            y=reduced_vectors_3d[:, 1],
            z=reduced_vectors_3d[:, 2],
            mode="markers",
            marker=dict(size=5, color=colors, opacity=0.8),
            text=hover_text,
            hoverinfo="text"
        )])
        fig_3d.update_layout(
            title="3D Chroma Vector Store Visualization",
            scene=dict(
                xaxis_title="t-SNE Dimension 1",
                yaxis_title="t-SNE Dimension 2",
                zaxis_title="t-SNE Dimension 3"
            ),
            width=900,
            height=700,
            margin=dict(r=20, b=10, l=10, t=40)
        )
        fig_3d.show()

    # Step 7: Create a new Chat with OpenAI (GPT-4o-mini)
    # NOTE(review): the model name is hard-coded here; the MODEL constant from
    # the configuration cell is not consulted.
    print("[INFO] Initializing ChatOpenAI with streaming enabled...")
    llm = ChatOpenAI(
        temperature=0.7,          # Control randomness
        model_name="gpt-4o-mini", # Specify the model
        streaming=True            # Enable streaming
    )
    print("[INFO] LLM initialized successfully.")

    # Step 8: Set up conversation memory
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Create a retriever from our vectorstore (top 25 chunks per query)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

    # Build a ConversationalRetrievalChain from the LLM, retriever, and memory
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory
    )

    # Step 9: Launch Gradio Chat Interface
    print("[INFO] Launching Gradio chat interface...")
    # TODO: make sure there's a clear button to exit the chat interface
    gr.ChatInterface(chat, type="messages").launch(inbrowser=True)


[INFO] Initializing embeddings...
[INFO] Embeddings initialized successfully.
[INFO] Loading existing vector store...
Loaded vectorstore with 444 documents
[INFO] Fetching vector store collection...
[INFO] Reducing dimensionality to 2D for visualization...
[INFO] Creating 2D scatter plot...


[INFO] Reducing dimensionality to 3D for visualization...
[INFO] Creating 3D scatter plot...


[INFO] Initializing ChatOpenAI with streaming enabled...
[INFO] LLM initialized successfully.
[INFO] Launching Gradio chat interface...



Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
