In [None]:
import chromadb
from chromadb.config import Settings
import os
from chromadb.utils import embedding_functions
import hashlib

In [None]:
# Initialise the Chroma client, passing the sub-directory ("chroma_db",
# a relative path) that holds the database.
client = chromadb.PersistentClient(path="chroma_db")

In [None]:
# Embedding function backed by a local sentence-transformers model
# ("all-MiniLM-L6-v2"); this is not the OpenAI embedding function.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Fetch the collection if it already exists, otherwise create it.
collection_name = "genai"
collection = client.get_or_create_collection(
    name=collection_name, embedding_function=embedding_function
)

# Directory containing the text files to ingest.
directory_path = "data"

# Number of files actually written to the collection, used for the final report.
embedded_count = 0

# Scan the directory (sorted, so the processing order is reproducible) and
# ingest every regular file whose name ends with ".txt".
for file_name in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, file_name)
    if not (os.path.isfile(file_path) and file_name.endswith(".txt")):
        continue

    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

    # The ID is derived from the file NAME (not its content), so a given file
    # always maps to the same record. The scheme is kept unchanged so that
    # records written by earlier runs keep their IDs.
    file_id = hashlib.md5(file_name.encode("utf-8")).hexdigest()[:12]

    # upsert (rather than add) so that re-running this cell refreshes the
    # stored text and embedding of a file that changed on disk, instead of
    # leaving the stale record in place.
    collection.upsert(
        documents=[text_data], metadatas=[{"source": file_path}], ids=[file_id]
    )
    embedded_count += 1

if embedded_count:
    print(
        f"Collection '{collection_name}' created and all text files embedded successfully."
    )
else:
    # Do not claim success when nothing was ingested.
    print(f"No .txt files found in '{directory_path}'; nothing was embedded.")

In [None]:
# List the collections present in the database.
collections = client.list_collections()
print("Nombre de collections :", len(collections))

# Report the content of every collection: document count, IDs, metadata,
# a preview of the first document, and truncated embeddings.
for col_entry in collections:
    # Depending on the installed chromadb release, list_collections() yields
    # either collection names (str) or Collection objects; normalise to the
    # name so that get_collection() and the messages below work with both.
    col_name = col_entry if isinstance(col_entry, str) else col_entry.name
    col_obj = client.get_collection(col_name)
    data = col_obj.get(include=["documents", "metadatas", "embeddings"])

    # "or []" protects against a key that is present but set to None.
    documents = data.get("documents") or []
    doc_count = len(documents)
    print(f"Collection '{col_name}' - Nombre de documents : {doc_count}")

    # Display IDs in the collection.
    ids = data.get("ids") or []
    print(f"  IDs: {', '.join(ids)}")

    # Display metadata if available.
    metadatas = data.get("metadatas")
    if metadatas:
        print("  Metadata:")
        for i, metadata in enumerate(metadatas, start=1):
            print(f"    Document {i}: {metadata}")

    # Show a preview of the first document, truncated to 200 characters.
    # A record may carry no document text, hence the None check.
    if documents and documents[0] is not None:
        first_document = documents[0]
        preview = (
            first_document[:200] + "..." if len(first_document) > 200 else first_document
        )
        print(f"  Premier document (aperçu): {preview}")

    # Display embeddings if available. The explicit None/len test is kept
    # on purpose: it also works if embeddings is an array-like whose truth
    # value would be ambiguous.
    embeddings = data.get("embeddings")
    if embeddings is not None and len(embeddings) > 0:
        print("  Embeddings:")
        for i, embedding in enumerate(embeddings, start=1):
            # Only the first 10 components are shown, for readability.
            print(f"    Document {i}: {embedding[:10]}...")

    print("-" * 50)