In [1]:
import os
import ollama
import chromadb

In [2]:
import os
import ollama
import chromadb

def load_text_files(text_dir):
    """Load every ``.txt`` file found directly inside ``text_dir``.

    The extension match is case-insensitive and the directory is not
    searched recursively.

    Args:
        text_dir: Path of the directory to scan.

    Returns:
        A list of ``(filename, content)`` tuples sorted by filename, where
        ``content`` is the whole file decoded as UTF-8.

    Raises:
        OSError: If ``text_dir`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    # os.listdir() returns entries in arbitrary order; sorting makes repeated
    # runs process the documents in the same sequence.
    for fname in sorted(os.listdir(text_dir)):
        if not fname.lower().endswith(".txt"):
            continue
        path = os.path.join(text_dir, fname)
        # A directory whose name ends in ".txt" would make open() fail.
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            texts.append((fname, f.read()))
    return texts

def get_embedding(text, model="nomic-embed-text:latest"):
    """Return the embedding vector Ollama produces for ``text``.

    Args:
        text: The text passed to Ollama as the prompt.
        model: Name of the Ollama embedding model to use.

    Returns:
        The value stored under the ``"embedding"`` key of the Ollama response.
    """
    return ollama.embeddings(prompt=text, model=model)["embedding"]

def create_chroma_collection(client, collection_name="policy_docs"):
    """Fetch the named collection from ``client``, creating it if needed.

    Args:
        client: Object exposing ``get_or_create_collection``.
        collection_name: Name of the collection to fetch or create.

    Returns:
        Whatever ``client.get_or_create_collection`` returns for that name.
    """
    collection = client.get_or_create_collection(collection_name)
    return collection

def embed_and_ingest(text_dir, chroma_path):
    """Embed every text file in ``text_dir`` and store it in ChromaDB.

    Each file becomes one record: its ID is the filename, its embedding comes
    from ``get_embedding``, its metadata is ``{"source": filename}`` and its
    document is the file content. A file whose embedding fails is reported
    on stdout and skipped; the remaining files are still ingested.

    Args:
        text_dir: Directory containing the ``.txt`` files to ingest.
        chroma_path: Directory of the persistent ChromaDB database.
    """
    # Initialize clients
    client = chromadb.PersistentClient(path=chroma_path)
    collection = create_chroma_collection(client)

    ids = []
    embeddings = []
    metadatas = []
    documents_text = []

    for filename, content in load_text_files(text_dir):
        try:
            emb = get_embedding(content)
        except Exception as e:
            # Deliberate best effort: one failing document must not abort
            # the whole ingestion run.
            print(f"Error embedding {filename}: {e}")
            continue
        ids.append(filename)
        embeddings.append(emb)
        metadatas.append({"source": filename})
        documents_text.append(content)
        print(f"Embedded and prepared document: {filename}")

    # Writing an empty batch to the collection is an error, so stop here when
    # there were no files or every embedding failed.
    if not ids:
        print(f"No documents were embedded; nothing stored in ChromaDB at {chroma_path}")
        return

    # Store in ChromaDB. upsert (rather than add) keeps re-runs idempotent:
    # IDs are filenames, so ingesting the same folder again refreshes the
    # existing records instead of colliding with them.
    collection.upsert(
        ids=ids,
        embeddings=embeddings,
        metadatas=metadatas,
        documents=documents_text,
    )
    print(f"Stored {len(ids)} documents into ChromaDB at {chroma_path}")

if __name__ == "__main__":
    # Both paths are relative; adjust them to match where this notebook lives.
    chroma_db_folder = "../../Chroma_db_database"
    text_folder = "../../Data/CleanedText"

    embed_and_ingest(text_dir=text_folder, chroma_path=chroma_db_folder)


Embedded and prepared document: press_release_ugc_net_june_2025.txt
Embedded and prepared document: public-notice-issuance-certificate-june-2025.txt
Stored 2 documents into ChromaDB at ../../Chroma_db_database
