In [1]:
# imports

import os
import glob
import chromadb
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from docx import Document
import uuid
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
import plotly.graph_objects as go
import glob
import gradio as gr
from io import StringIO
import logging
import sys
import json

In [2]:
class SentenceTransformerEmbeddings:
    """Small adapter around a ``SentenceTransformer`` exposing document and query embedding."""

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        # Build the underlying encoder once; both embed methods reuse it.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode every entry of ``texts`` and return the result of ``.tolist()`` on it."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Encode one query string and return its vector as a plain list."""
        # The query is wrapped in a one-element list, so row 0 is its embedding.
        vectors = self.model.encode([text])
        return vectors[0].tolist()

In [3]:
def load_docx_content(file_path):
    """Return the non-blank paragraphs of a DOCX file joined by newlines.

    Each paragraph's text is stripped of surrounding whitespace and paragraphs
    that are empty after stripping are dropped. Any exception raised while
    opening or reading the file is reported with ``print`` and an empty string
    is returned instead, so callers can treat ``""`` as "nothing usable".
    """
    try:
        stripped_paragraphs = (p.text.strip() for p in Document(file_path).paragraphs)
        return '\n'.join(line for line in stripped_paragraphs if line)
    except Exception as e:
        # Best-effort loader: report the problem and fall back to empty content.
        print(f"Error loading {file_path}: {e}")
        return ""

def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the one
    before it. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
            Without this check a non-advancing window would loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - chunk_overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, stop: another window
        # would only repeat a tail already contained in this chunk.
        if end >= text_length:
            break
    return chunks

# Load all DOCX files from the Documents folder.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# list is sorted to keep the order of all_chunks / all_metadatas / all_ids
# the same from one run to the next.
files = sorted(glob.glob("Documents/*.docx"))
all_chunks = []
all_metadatas = []
all_ids = []

if not files:
    # The pattern is relative to the current working directory; make an empty
    # match visible instead of silently building an empty vector store.
    print("Warning: no DOCX files matched 'Documents/*.docx'")

for file_path in files:
    # The file name without its extension serves as the document-type label.
    doc_type = os.path.splitext(os.path.basename(file_path))[0]
    content = load_docx_content(file_path)

    # load_docx_content returns "" when loading fails or no text is found.
    if not content:
        continue

    chunks = split_text(content)
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        all_metadatas.append({
            "doc_type": doc_type,
            "file_path": file_path,
            "chunk_index": i
        })
        # The random 8-hex-digit suffix means ids differ between runs.
        all_ids.append(f"{doc_type}_{i}_{uuid.uuid4().hex[:8]}")

print(f"Total number of chunks: {len(all_chunks)}")
print(f"Document types found: {set(meta['doc_type'] for meta in all_metadatas)}")

Total number of chunks: 17
Document types found: {'Process_Doc'}


In [5]:
# ChromaDB persistent client pointed at DB_PATH
DB_PATH = "agile_process"
client = chromadb.PersistentClient(path=DB_PATH)

# Embedding wrapper used to vectorise the document chunks
embeddings_model = SentenceTransformerEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2'
)

In [6]:
# (Re)build the vector store from scratch: every time this cell runs, any
# existing collection with this name is dropped and all chunks are re-embedded.

collection_name = "process_docs"

# Drop a previous copy of the collection; a failure here is only reported
try:
    client.delete_collection(name=collection_name)
    print("Existing collection deleted.")
except Exception as e:
    print(f"Collection didn't exist or couldn't be deleted: {e}")

# Fresh collection whose HNSW space is set to cosine
collection = client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# Insert the chunks in batches of batch_size (original note: ChromaDB has limits)
batch_size = 100
total_batches = (len(all_chunks) + batch_size - 1) // batch_size
for i in range(0, len(all_chunks), batch_size):
    # One slice object keeps the three parallel lists aligned
    window = slice(i, i + batch_size)
    batch_chunks = all_chunks[window]
    batch_metadatas = all_metadatas[window]
    batch_ids = all_ids[window]

    # Embed this batch with the notebook's embedding wrapper
    batch_embeddings = embeddings_model.embed_documents(batch_chunks)

    # Store texts, metadata, ids and vectors together
    collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        documents=batch_chunks,
        metadatas=batch_metadatas,
    )

    print(f"Added batch {i // batch_size + 1}/{total_batches}")

print(f"Vectorstore created with {collection.count()} documents")

# Test query (uncomment to test)
# NOTE(review): query_documents is not defined in the cells shown above - confirm it exists before enabling.
# results = query_documents("your query here")
# print("Query results:", results)

Collection didn't exist or couldn't be deleted: Collection [process_docs] does not exists
Added batch 1/1
Vectorstore created with 17 documents
