In [1]:
import chromadb
from chromadb.config import Settings
import os
from chromadb.utils import embedding_functions
import hashlib

In [2]:
# Open (or create on first use) the persistent Chroma database stored in the
# "chroma_db" sub-directory, relative to the notebook's working directory.
CHROMA_DB_PATH = "chroma_db"
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

In [3]:
# Embed documents locally with a sentence-transformers model.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Open the target collection, creating it if it does not exist yet.
collection_name = "genai"
collection = client.get_or_create_collection(
    name=collection_name, embedding_function=embedding_function
)

# Directory containing text files
directory_path = "data"

# Scan the directory and store each text file as one document.
# sorted() makes the processing order deterministic across platforms.
embedded_count = 0
for file_name in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, file_name)
    if not (os.path.isfile(file_path) and file_name.endswith(".txt")):
        continue

    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

    # Stable ID derived from the file NAME (not its content): the same file
    # always maps to the same record. MD5 is used only as a short fingerprint,
    # not for security.
    file_id = hashlib.md5(file_name.encode("utf-8")).hexdigest()[:12]

    # upsert (rather than add) keeps this cell idempotent: re-running it
    # refreshes the stored text/embedding for an existing ID instead of
    # leaving stale content behind or failing on a duplicate ID.
    collection.upsert(
        documents=[text_data], metadatas=[{"source": file_path}], ids=[file_id]
    )
    embedded_count += 1

# Only claim success when something was actually embedded.
if embedded_count:
    print(
        f"Collection '{collection_name}' created and all text files embedded successfully."
    )
else:
    print(
        f"No .txt files found in '{directory_path}'; collection '{collection_name}' left unchanged."
    )

  from .autonotebook import tqdm as notebook_tqdm


Collection 'genai' created and all text files embedded successfully.


In [12]:
# List the collections present in the database
collections = client.list_collections()
print("Nombre de collections :", len(collections))

# For each collection, show its document count, IDs, metadata, a preview of
# the first document and the leading components of every embedding.
for col in collections:
    # Depending on the Chroma release, list_collections() yields either plain
    # collection names or Collection objects; normalise to the name so that
    # get_collection() works in both cases.
    col_name = col if isinstance(col, str) else col.name
    col_obj = client.get_collection(col_name)
    data = col_obj.get(include=["documents", "metadatas", "embeddings"])

    # `or []` guards against the key being present with a None value.
    documents = data.get("documents") or []
    doc_count = len(documents)
    print(f"Collection '{col_name}' - Nombre de documents : {doc_count}")

    # Display IDs in the collection
    ids = data.get("ids") or []
    print(f"  IDs: {', '.join(ids)}")

    # Display metadata if available
    metadatas = data.get("metadatas")
    if metadatas:
        print("  Metadata:")
        for i, metadata in enumerate(metadatas):
            print(f"    Document {i + 1}: {metadata}")

    # Show preview of first document (truncated to 200 characters)
    if documents:
        first_document = documents[0]
        preview = (
            first_document[:200] + "..."
            if len(first_document) > 200
            else first_document
        )
        print(f"  Premier document (aperçu): {preview}")

    # Display embeddings if available. The explicit None/len test is kept
    # because the embeddings container may be an array whose truth value is
    # ambiguous.
    embeddings = data.get("embeddings")
    if embeddings is not None and len(embeddings) > 0:
        print("  Embeddings:")
        for i, embedding in enumerate(embeddings):
            print(
                f"    Document {i + 1}: {embedding[:10]}..."
            )  # Truncate for readability

    print("-" * 50)

Nombre de collections : 1
Collection 'genai' - Nombre de documents : 1
  IDs: 291aad172699
  Metadata:
    Document 1: {'source': 'data\\paul_graham_essay.txt'}
  Premier document (aperçu): 

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed ...
  Embeddings:
    Document 1: [-0.03938635  0.03998332 -0.01110203 -0.00434901 -0.06904357 -0.08148401
 -0.04976932  0.05688301 -0.03658195  0.01380224]...
--------------------------------------------------


In [None]:
# First, install LlamaIndex and its Chroma / HuggingFace integrations if needed
# %pip install llama-index llama-index-vector-stores-chroma llama-index-embeddings-huggingface

# NOTE: this `Settings` is LlamaIndex's global configuration object; it shadows
# the `Settings` imported from chromadb.config in the first cell.
from llama_index.core import SimpleDirectoryReader, Document, Settings
from llama_index.core.indices import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
# HuggingFaceEmbedding ships in the separate HuggingFace embeddings integration
# package, not in llama_index.core.embeddings.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Use the same embedding model as the Chroma collection so that query vectors
# live in the same space as the stored document vectors. The fully qualified
# Hub id is used so the model resolves regardless of how the wrapper loads it.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.embed_model = embed_model

# Connect to the existing Chroma collection: the vector store wraps a Chroma
# collection object obtained from the already-open client.
chroma_collection = client.get_collection(collection_name)
chroma_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create a vector index on top of the Chroma store (no re-embedding of the
# stored documents); the embedding model is also passed explicitly so the
# index does not depend solely on global Settings.
vector_index = VectorStoreIndex.from_vector_store(
    chroma_store, embed_model=embed_model
)

# Create a simplified search function
def search_documents(query, top_k=5):
    """
    Run a similarity query against the notebook-level ``vector_index``.

    Args:
        query (str): The search query
        top_k (int): Passed to the query engine as ``similarity_top_k``

    Returns:
        The response object produced by the query engine. The calling cell
        prints it and iterates over its ``source_nodes``.

    NOTE(review): a query engine normally also needs an LLM configured to
    produce the answer text - confirm one is available before running.
    """
    # A new query engine is built on every call so top_k can vary per query.
    return vector_index.as_query_engine(similarity_top_k=top_k).query(query)

# Example search: run one query and show the answer plus its supporting sources
query = "startup technology"
print(f"\nSearching for: '{query}'")
results = search_documents(query)
print(f"Results: {results}")

# One entry per retrieved source: similarity score, origin file, text preview
print("\nSource Documents:")
for node in results.source_nodes:
    source = node.metadata.get('source', 'Unknown')
    preview = node.text[:200]
    print(f"\nScore: {node.score:.4f}")
    print(f"Source: {source}")
    print(f"Preview: {preview}...")
