In [1]:
!pip install python-dotenv langchain_openai langchain_chroma plotly gradio langchain_community unstructured python.docx

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting gradio
  Downloading gradio-5.11.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting unstructured
  Downloading unstructured-0.16.12-py3-none-any.whl.metadata (24 kB)
Collecting python.docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting chromadb!=0.5.10,!=0.5.11,!=0.5.12,!=0.5.4,!=0.5.5,!=0.5.7,!=0.5.9,<0.6.0,>=0.4.0 (from langchain_chroma)
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 

In [2]:
!apt-get install -y libreoffice


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor default-jre default-jre-headless dictionaries-common firebird3.0-common
  firebird3.0-common-doc firebird3.0-server-core firebird3.0-utils fonts-crosextra-caladea
  fonts-crosextra-carlito fonts-dejavu fonts-dejavu-core fonts-dejavu-extra fonts-liberation2
  fonts-linuxlibertine fonts-noto-core fonts-noto-extra fonts-noto-mono fonts-noto-ui-core
  fonts-opensymbol fonts-sil-gentium fonts-sil-gentium-basic gstreamer1.0-gl gstreamer1.0-gtk3
  hunspell-en-us libabsl20210324 libabw-0.1-1 libatk-wrapper-java libatk-wrapper-java-jni
  libbsh-java libcdr-0.1-1 libclucene-contribs1v5 libclucene-core1v5 libcolamd2 libe-book-0.1-1
  libel-api-java libeot0 libepubgen-0.1-1 libetonyek-0.1-1 libexttextcat-2.0-0 libexttextcat-data
  libfbclient2 libfontenc1 libfreehand-0.1-1 libgpgme11 libgpgmepp6 libgraphene-1.0-0
  libgstreamer-gl1.0-0 lib

In [3]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import docx
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from google.colab import drive, userdata
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


In [4]:
def initialize_environment():
    """Initialize environment variables and mount Google Drive."""
    MODEL = "gpt-4o-mini"
    db_name = "/content/drive/MyDrive/Pedigo/procedures/vector_db"
    drive.mount("/content/drive")
    folders = "/content/drive/MyDrive/Pedigo"

    load_dotenv()
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

    return MODEL, db_name, folders

In [5]:
def get_existing_vectorstore(db_name):
    """Load existing vector store if it exists."""
    if os.path.exists(db_name):
        embeddings = OpenAIEmbeddings()
        vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)
        print(f"Existing vector store loaded with {vectorstore._collection.count()} documents")
        return vectorstore
    return None


In [6]:
def filter_valid_files(folder, extensions):
    """Filter valid files in a folder with given extensions."""
    all_files = glob.glob(os.path.join(folder, "**/*"), recursive=True)
    return [
        file for file in all_files
        if file.endswith(extensions) and not os.path.basename(file).startswith("~$")
    ]

In [7]:
def load_documents(folders, text_loader_kwargs):
    """Load documents from specified folders with duplicate checking."""
    documents = []

    for folder in folders:
        doc_type = os.path.basename(folder)
        print(f"Processing folder: {folder}")

        # Process .docx files
        docx_files = filter_valid_files(folder, (".docx",))
        print(f"Found {len(docx_files)} valid .docx files.")
        for file in docx_files:
            try:
                loader = UnstructuredWordDocumentLoader(file, **text_loader_kwargs)
                docs = loader.load()
                for doc in docs:
                    doc.metadata["doc_type"] = doc_type
                    doc.metadata["source"] = file
                    documents.append(doc)
            except Exception as e:
                print(f"Error processing {file}: {e}")

        # Process .doc files
        doc_files = filter_valid_files(folder, (".doc",))
        print(f"Found {len(doc_files)} valid .doc files.")
        for file in doc_files:
            try:
                loader = UnstructuredWordDocumentLoader(file, **text_loader_kwargs)
                docs = loader.load()
                for doc in docs:
                    doc.metadata["doc_type"] = doc_type
                    doc.metadata["source"] = file
                    documents.append(doc)
            except Exception as e:
                print(f"Error processing {file}: {e}")

    return documents





In [8]:

def create_or_update_vectorstore(documents, db_name, embeddings):
    """Create or update vector store with document existence checking."""
    text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
    chunks = text_splitter.split_documents(documents)

    # Check if vector store exists
    if os.path.exists(db_name):
        print("Existing vector store found. Checking for new documents...")
        existing_store = Chroma(persist_directory=db_name, embedding_function=embeddings)

        # Get existing document sources
        existing_metadata = existing_store._collection.get(include=['metadatas'])
        existing_sources = set(meta.get('source', '') for meta in existing_metadata['metadatas'])

        # Filter out chunks from documents that already exist
        new_chunks = [chunk for chunk in chunks
                     if chunk.metadata.get('source', '') not in existing_sources]

        if new_chunks:
            print(f"Adding {len(new_chunks)} new document chunks to existing store...")
            existing_store.add_documents(new_chunks)
            vectorstore = existing_store
        else:
            print("No new documents to add.")
            vectorstore = existing_store
    else:
        print("Creating new vector store...")
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_name
        )

    return vectorstore


In [9]:
def visualize_vectorstore(collection, perplexity=30):
    """Create 2D and 3D visualizations of the vector store."""
    result = collection.get(include=['embeddings', 'documents', 'metadatas'])
    vectors = np.array(result['embeddings'])
    documents = result['documents']
    doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
    colors = [['blue'][['procedures'].index(t)] for t in doc_types]

    n_samples = len(vectors)
    perplexity = min(perplexity, n_samples - 1)

    # 2D and 3D visualizations (same as before)
    tsne_2d = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_vectors_2d = tsne_2d.fit_transform(vectors)

    fig_2d = go.Figure(data=[go.Scatter(
        x=reduced_vectors_2d[:, 0],
        y=reduced_vectors_2d[:, 1],
        mode='markers',
        marker=dict(size=5, color=colors, opacity=0.8),
        text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
        hoverinfo='text'
    )])

    fig_2d.update_layout(
        title='2D Chroma Vector Store Visualization',
        width=800,
        height=600,
        margin=dict(r=20, b=10, l=10, t=40)
    )

    tsne_3d = TSNE(n_components=3, random_state=42, perplexity=perplexity)
    reduced_vectors_3d = tsne_3d.fit_transform(vectors)

    fig_3d = go.Figure(data=[go.Scatter3d(
        x=reduced_vectors_3d[:, 0],
        y=reduced_vectors_3d[:, 1],
        z=reduced_vectors_3d[:, 2],
        mode='markers',
        marker=dict(size=5, color=colors, opacity=0.8),
        text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
        hoverinfo='text'
    )])

    fig_3d.update_layout(
        title='3D Chroma Vector Store Visualization',
        scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
        width=900,
        height=700,
        margin=dict(r=20, b=10, l=10, t=40)
    )

    return fig_2d, fig_3d

In [10]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain

def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

In [None]:

# Initialize environment
MODEL, db_name, folders = initialize_environment()

# Ask user if they want to load new documents
user_input = input("Do you want to load new documents? (yes/no): ").lower()

if user_input == 'no':
    # Just load existing vector store
    vectorstore = get_existing_vectorstore(db_name)
    if vectorstore is None:
        print("No existing vector store found. Please run with document loading enabled.")
else:
    # Load and process new documents
    text_loader_kwargs = {'encoding': 'utf-8'}
    documents = load_documents(folders, text_loader_kwargs)
    print(f"Total documents loaded: {len(documents)}")

    # Create embeddings
    embeddings = OpenAIEmbeddings()

    # Create or update vector store
    vectorstore = create_or_update_vectorstore(documents, db_name, embeddings)

# Get collection info and visualize
collection = vectorstore._collection
print(f"Vector store contains {collection.count()} documents")

# # Create visualizations
fig_2d, fig_3d = visualize_vectorstore(collection)
fig_2d.show()
fig_3d.show()

# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

Mounted at /content/drive
Do you want to load new documents? (yes/no): yes
Processing folder: /


In [None]:
embeddings = OpenAIEmbeddings()