**1. SETUP**

In [None]:
# ------------------------------------------------------------------------------
# 1.1. Install Dependencies
# ------------------------------------------------------------------------------

!pip install numpy Pillow torch sentence-transformers transformers accelerate google-generativeai requests huggingface_hub

# ------------------------------------------------------------------------------
# 1.2. Import Libraries
# ------------------------------------------------------------------------------
import numpy as np
import os
import sys
import time
from typing import List, Any, Optional
from PIL import Image
import torch
from sentence_transformers import SentenceTransformer
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, Gemma3ForConditionalGeneration
import google.generativeai as genai
import requests
from io import BytesIO
from huggingface_hub import login as hf_login
import pandas as pd

print("All base libraries imported.")

# ------------------------------------------------------------------------------
# 1.3. Hercules Path Setup
# ------------------------------------------------------------------------------
# Put the parent of the current working directory on sys.path (presumably the
# repository root when the notebook is run from its own folder), then try to
# import Hercules. A failed import is reported but does not stop the notebook.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
try:
    from pyhercules import Hercules
except ImportError as e:
    print(f"Error importing Hercules: {e}")
    # sys.exit(1)
else:
    print("Hercules imported successfully.")

# ------------------------------------------------------------------------------
# 1.4. Google API Key Configuration
# ------------------------------------------------------------------------------
# Load environment variables from a .env file if it exists
try:
    from dotenv import load_dotenv
    # load_dotenv() reports whether a .env file was found and loaded.
    print(
        "Loaded environment variables from .env file."
        if load_dotenv()
        else "No .env file found, relying on system environment variables."
    )
except ImportError:
    print("dotenv library not found. pip install python-dotenv")
print("Configuring Google API Key...")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# --- Preferred: provide GOOGLE_API_KEY through the environment (or .env) ---
# --- Fallback: hard-code the key below ---
# If the environment variable is not set you may uncomment the lines below and
# replace "YOUR_API_KEY_HERE" with a real key. Be careful not to commit a real
# key into a notebook.
# if not GOOGLE_API_KEY:
#     GOOGLE_API_KEY = "YOUR_API_KEY_HERE"
#     # os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY # Optionally set it for the current session

if GOOGLE_API_KEY and "YOUR_API_KEY_HERE" not in GOOGLE_API_KEY:
    # A usable-looking key is present: hand it to the Google GenAI SDK.
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        print("Google GenAI configured successfully.")
    except Exception as e:
        print(f"Error configuring Google GenAI (check API key validity and permissions): {e}")
        # sys.exit(1)
else:
    # Missing key or untouched placeholder: warn, but let the notebook go on.
    print("WARNING: GOOGLE_API_KEY environment variable not set or is a placeholder.")
    print("Google Cloud dependent examples will fail.")
    print("Please set the GOOGLE_API_KEY environment variable or update the cell above.")
    # To stop execution if key is absolutely mandatory for any part:
    # sys.exit(1)

# --- Hugging Face Hub Token ---
print("\nConfiguring Hugging Face Hub Token...")
HF_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")
if not HF_TOKEN:
    # No token configured: continue anonymously.
    print("HUGGINGFACE_HUB_TOKEN environment variable not found.")
    print("Proceeding without Hugging Face Hub login. This may be fine for public models.")
    print("For gated models or to ensure download rates, set the HUGGINGFACE_HUB_TOKEN environment variable.")
else:
    # A token is available: attempt to log in; a failed login is only reported.
    try:
        hf_login(token=HF_TOKEN)
        print("Successfully logged into Hugging Face Hub.")
    except Exception as e:
        print(f"Error logging into Hugging Face Hub: {e}")
        print("Some models from Hugging Face might require authentication to download.")

**2. CLIENT DEFINITIONS (SHARED ACROSS EXAMPLES)**

In [None]:
print("Defining client functions...")

# ------------------------------------------------------------------------------
# 2.1. Local Text Embedding Client (SentenceTransformer)
# ------------------------------------------------------------------------------
# Most recently loaded model, its embedding dimension, and the name it was
# loaded under. The name is tracked so that requesting a different model
# triggers a reload instead of silently reusing the first one.
_local_text_embedding_model = None
_local_text_embedding_dim = None
_local_text_embedding_model_name = None


def get_local_text_embedding_client(model_name="all-MiniLM-L6-v2"):
    """Return a function that embeds texts with a local SentenceTransformer.

    The model is loaded on first use and cached at module level. If a later
    call asks for a different ``model_name`` the cache is replaced by the newly
    requested model. A model placed in the cache without a recorded name is
    reused as-is.

    Args:
        model_name: SentenceTransformer model identifier to load.

    Returns:
        A callable ``embed_texts_local(texts) -> np.ndarray``. It returns an
        array of shape ``(0, dim)`` for empty input or when encoding fails
        (the failure is printed, not raised).

    Raises:
        Exception: whatever ``SentenceTransformer`` raises while loading is
            printed and re-raised.
    """
    global _local_text_embedding_model, _local_text_embedding_dim
    global _local_text_embedding_model_name

    needs_load = _local_text_embedding_model is None or (
        _local_text_embedding_model_name is not None
        and _local_text_embedding_model_name != model_name
    )
    if needs_load:
        try:
            print(f"Loading local text embedding model: {model_name}...")
            loaded_model = SentenceTransformer(model_name)
            loaded_dim = loaded_model.get_sentence_embedding_dimension()
            print(f"Local text embedding model '{model_name}' loaded. Dimension: {loaded_dim}")
        except Exception as e:
            print(f"Error loading SentenceTransformer model '{model_name}': {e}")
            raise # Re-raise to stop if model loading fails
        # Publish to the cache only after the load fully succeeded.
        _local_text_embedding_model = loaded_model
        _local_text_embedding_dim = loaded_dim
        _local_text_embedding_model_name = model_name

    # Bind this client's model and dimension locally so that a later call
    # requesting another model cannot change what this client embeds with.
    model = _local_text_embedding_model
    dim = _local_text_embedding_dim

    def embed_texts_local(texts: List[str]) -> np.ndarray:
        if model is None:
            raise RuntimeError("Local text embedding model not loaded.")
        if not texts:
            return np.empty((0, dim))
        try:
            return model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        except Exception as e:
            # Best-effort: report the failure and hand back an empty result.
            print(f"Error during local text embedding generation: {e}")
            return np.empty((0, dim))
    return embed_texts_local

# ------------------------------------------------------------------------------
# 2.2. Google Cloud Text Embedding Client
# ------------------------------------------------------------------------------
_google_embedding_model_name = "models/embedding-001"
_google_embedding_dim = 768 # Dimension for models/embedding-001


def embed_texts_google(texts: List[str]) -> np.ndarray:
    """Embed ``texts`` through the Google GenAI embedding endpoint.

    Returns the array built from the ``"embedding"`` entry of the API result.
    An empty ``(0, _google_embedding_dim)`` array is returned instead when the
    API key is missing/placeholder, when ``texts`` is empty, or when the API
    call raises (the error is printed, not propagated). A result whose shape
    differs from ``(len(texts), _google_embedding_dim)`` only triggers a
    warning; it is still returned.
    """
    no_embeddings = np.empty((0, _google_embedding_dim))

    if not (GOOGLE_API_KEY and "YOUR_API_KEY_HERE" not in GOOGLE_API_KEY):
        print("Error: Google API Key not configured. Cannot use Google embedding client.")
        return no_embeddings
    if not texts:
        return no_embeddings

    try:
        response = genai.embed_content(
            model=_google_embedding_model_name,
            content=texts,
            task_type="clustering",
        )
        vectors = np.array(response["embedding"])
        # One row per input text and one column per embedding dimension is expected.
        if vectors.shape != (len(texts), _google_embedding_dim):
            print(f"Warning: Unexpected Google embedding shape. Expected ({len(texts)}, {_google_embedding_dim}), Got {vectors.shape}")
        return vectors
    except Exception as e:
        print(f"Error calling Google Embedding API: {e}")
        return no_embeddings

# ------------------------------------------------------------------------------
# 2.3. Local LLM Client (Gemma)
# ------------------------------------------------------------------------------
_gemma_model = None
_gemma_processor = None
_gemma_model_id = "google/gemma-3-4b-it" # Using the 4B IT model from script


def get_gemma_llm_client(model_id=_gemma_model_id):
    """Return a ``prompt -> str`` function backed by a locally loaded Gemma model.

    The model and its processor are loaded on first use and cached at module
    level. Loading is (re)attempted whenever either of the two is missing, so
    a processor failure after a successful model load is retried on the next
    call instead of leaving the client permanently unusable.

    NOTE: the cache is not keyed on ``model_id``; once a model is cached,
    later calls reuse it whatever ``model_id`` they pass.

    Args:
        model_id: Hugging Face model identifier used for both model and processor.

    Returns:
        A callable taking a prompt string and returning the generated text
        (stripped), or ``""`` if generation raises (the error is printed).

    Raises:
        Exception: any error raised while loading is printed and re-raised.
    """
    global _gemma_model, _gemma_processor
    if _gemma_model is None or _gemma_processor is None:
        try:
            print(f"Loading Gemma model: {model_id}...")
            print("This may take a while and require significant RAM/VRAM.")
            # Determine data type based on GPU availability and capability
            dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32

            # Load only what is missing so an already loaded model is not
            # loaded a second time when just the processor has to be retried.
            if _gemma_model is None:
                _gemma_model = Gemma3ForConditionalGeneration.from_pretrained(
                    model_id,
                    device_map="auto", # Automatically uses GPU if available
                    torch_dtype=dtype  # Use bfloat16 if supported, else float32
                ).eval()
            if _gemma_processor is None:
                _gemma_processor = AutoProcessor.from_pretrained(model_id)
            print(f"Gemma model '{model_id}' loaded to device: {_gemma_model.device} with dtype: {dtype}.")
        except Exception as e:
            print(f"Error loading Gemma model '{model_id}': {e}")
            print("Ensure you have 'transformers', 'torch', and 'accelerate' installed.")
            print("A powerful GPU with sufficient VRAM is recommended for larger Gemma models.")
            raise

    def gemma_llm_function(prompt: str) -> str:
        if not _gemma_model or not _gemma_processor:
            raise RuntimeError("Gemma model/processor not loaded.")

        messages = [
            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
            {"role": "user", "content": [{"type": "text", "text": prompt}]}
        ]
        try:
            inputs = _gemma_processor.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True,
                return_dict=True, return_tensors="pt"
            ).to(_gemma_model.device) # Ensure inputs are on the same device as the model

            # Number of prompt tokens; used below to drop the prompt from the output.
            input_len = inputs["input_ids"].shape[-1]

            with torch.inference_mode():
                generation = _gemma_model.generate(**inputs, max_new_tokens=150, do_sample=False)
                # Keep only the newly generated tokens of the first sequence.
                generation = generation[0][input_len:]

            decoded = _gemma_processor.decode(generation, skip_special_tokens=True)
            return decoded.strip()
        except Exception as e:
            # Best-effort: report the failure and return an empty completion.
            print(f"Error during Gemma LLM generation: {e}")
            return ""
    return gemma_llm_function

# ------------------------------------------------------------------------------
# 2.4. Google Cloud LLM Client (Gemini)
# ------------------------------------------------------------------------------
_gemini_model_name = "gemini-2.0-flash"
_gemini_model_genai = None
# Model name the cached client was created for (None when unknown/unset).
_gemini_model_genai_name = None


def get_gemini_llm_client(model_name=_gemini_model_name):
    """Return a ``prompt -> str`` function backed by a Gemini model.

    The ``genai.GenerativeModel`` is created on first use and cached at module
    level; asking for a different ``model_name`` later replaces the cache.
    This factory never raises and never returns ``None``: when the API key is
    missing/placeholder or the model cannot be initialised, it returns a stub
    function that prints the reason and returns ``""``.

    Args:
        model_name: Gemini model name passed to ``genai.GenerativeModel``.

    Returns:
        A callable taking a prompt string and returning the stripped response
        text, or ``""`` when the response has no candidates or the call fails.
    """
    global _gemini_model_genai, _gemini_model_genai_name
    if not GOOGLE_API_KEY or "YOUR_API_KEY_HERE" in GOOGLE_API_KEY:
        print("Error: Google API Key not configured. Cannot initialize Gemini client.")
        # Return a dummy function that indicates error
        def error_gemini_fn(prompt: str) -> str:
            print("Gemini LLM client not available due to API key issue.")
            return ""
        return error_gemini_fn

    cache_is_stale = _gemini_model_genai is None or (
        _gemini_model_genai_name is not None
        and _gemini_model_genai_name != model_name
    )
    if cache_is_stale:
        try:
            print(f"Initializing Gemini model: {model_name}...")
            new_model = genai.GenerativeModel(model_name)
            print(f"Gemini model '{model_name}' initialized.")
        except Exception as e:
            print(f"Error initializing Gemini model '{model_name}': {e}")
            # Format the message now: Python unbinds `e` when the except clause
            # ends, so a closure that referenced `e` directly would raise
            # NameError when called later.
            init_error_message = f"Gemini LLM client not available due to initialization error: {e}"

            # Return a dummy function that indicates error
            def error_gemini_fn_init(prompt: str) -> str:
                print(init_error_message)
                return ""
            return error_gemini_fn_init
        _gemini_model_genai = new_model
        _gemini_model_genai_name = model_name

    # Bind this client's model locally so a later call requesting another
    # model name cannot change which model this client talks to.
    model = _gemini_model_genai

    def gemini_llm_function(prompt: str) -> str:
        if model is None:
            # This case should ideally be caught by the API key check or init check
            print("Gemini model not available (was not initialized).")
            return ""
        try:
            response = model.generate_content(prompt)
            if not response.candidates:
                print("Warning: Gemini response blocked or empty.")
                # You can inspect response.prompt_feedback or response.candidates[0].finish_reason for details
                return ""
            return response.text.strip()
        except Exception as e:
            print(f"Error calling Gemini API: {e}")
            return ""
    return gemini_llm_function

# ------------------------------------------------------------------------------
# 2.5. Local Image Embedding Client (CLIP)
# ------------------------------------------------------------------------------
DEFAULT_MAX_IMAGE_DIMENSION = 512 # Max width/height for images (pixels)
print(f"Default max image dimension set to: {DEFAULT_MAX_IMAGE_DIMENSION}px")


def resize_pil_image(image: Image.Image, max_dim: int = DEFAULT_MAX_IMAGE_DIMENSION) -> Image.Image:
    """Shrink a PIL image so it fits within ``max_dim`` x ``max_dim``.

    Uses ``Image.thumbnail``, which preserves the aspect ratio and modifies
    ``image`` IN PLACE so that it is no larger than the given size; the same
    object is returned for convenience.

    Args:
        image: The PIL image to shrink (mutated in place).
        max_dim: Maximum width/height in pixels.

    Returns:
        The same ``image`` object. If resizing raises, the error is printed
        and the image is returned as it was.
    """
    try:
        # Image.Resampling only exists from Pillow 9.1 onwards; older releases
        # expose the filter constants directly on the Image module. Without
        # this fallback an old Pillow would raise AttributeError here, which
        # the handler below would swallow, leaving every image unresized.
        resampling = getattr(Image, "Resampling", Image)
        image.thumbnail((max_dim, max_dim), resampling.LANCZOS) # LANCZOS is good for downscaling
    except Exception as e:
        print(f"Error during image resizing: {e}")
    return image

_clip_model = None


def get_clip_image_embedding_client(model_name="clip-ViT-B-32", max_dim: int = DEFAULT_MAX_IMAGE_DIMENSION):
    """Return a function that embeds images with a CLIP SentenceTransformer.

    The model is loaded once and cached in ``_clip_model``.
    NOTE(review): the cache is not keyed on ``model_name``; later calls reuse
    whatever model was loaded first.

    Args:
        model_name: SentenceTransformer model identifier to load.
        max_dim: Maximum width/height applied to images opened from a path.

    Returns:
        ``embed_images_clip(image_identifiers) -> np.ndarray``. Identifiers
        may be file paths (opened, converted to RGB and resized) or PIL images
        (converted to RGB only). Unsupported or unreadable entries are skipped
        with a warning.
        NOTE(review): skipped entries are simply left out, so the output rows
        no longer line up with the input list when that happens.

    Raises:
        Exception: a model loading error is printed and re-raised.
    """
    global _clip_model
    if _clip_model is None:
        try:
            print(f"Loading CLIP model: {model_name}...")
            _clip_model = SentenceTransformer(model_name)
            print(f"CLIP model '{model_name}' loaded.")
        except Exception as e:
            print(f"Error loading CLIP model '{model_name}': {e}")
            raise

    def embed_images_clip(image_identifiers: List[Any]) -> np.ndarray:
        if not _clip_model:
            raise RuntimeError("CLIP model not loaded.")
        clip_dim = _clip_model.get_sentence_embedding_dimension()
        if not image_identifiers:
            return np.empty((0, clip_dim))

        batch = []
        for item in image_identifiers:
            try:
                if isinstance(item, str):
                    # File path: open, normalise to RGB, then cap the size.
                    picture = resize_pil_image(Image.open(item).convert("RGB"), max_dim)
                elif isinstance(item, Image.Image):
                    # PIL images passed in directly are not resized here; the
                    # caller is assumed to have sized them as intended.
                    picture = item.convert("RGB")
                else:
                    print(f"Warning: Skipping unsupported image id type: {type(item)}")
                    continue
                batch.append(picture)
            except Exception as e:
                print(f"Warning: Error loading/processing image '{item}': {e}. Skipping.")

        if not batch:
            return np.empty((0, clip_dim))

        try:
            return _clip_model.encode(batch, batch_size=32, convert_to_numpy=True, show_progress_bar=False)
        except Exception as e:
            print(f"Error during CLIP embedding: {e}")
            return np.empty((0, clip_dim))

    return embed_images_clip

# ------------------------------------------------------------------------------
# 2.6. Local Image Captioning Client (BLIP)
# ------------------------------------------------------------------------------
_blip_processor = None
_blip_model = None
_blip_device = "cuda" if torch.cuda.is_available() else "cpu"


def get_blip_image_captioning_client(model_name="Salesforce/blip-image-captioning-large", max_dim: int = DEFAULT_MAX_IMAGE_DIMENSION):
    """Return a function that captions images with a BLIP model.

    Processor and model are loaded once, moved to ``_blip_device`` and cached
    at module level.
    NOTE(review): the cache is not keyed on ``model_name``; later calls reuse
    whatever was loaded first.

    Args:
        model_name: Hugging Face identifier of the BLIP checkpoint.
        max_dim: Maximum width/height applied to images opened from a path.

    Returns:
        ``caption_images_blip(image_identifiers, prompt=None)``, which returns
        one entry per input: the caption string, or ``None`` for inputs that
        were skipped. If captioning itself fails, every entry is ``None``.

    Raises:
        Exception: a loading error is printed and re-raised.
    """
    global _blip_processor, _blip_model
    if _blip_processor is None or _blip_model is None:
        try:
            print(f"Loading BLIP processor/model: {model_name} to device: {_blip_device}...")
            _blip_processor = BlipProcessor.from_pretrained(model_name)
            _blip_model = BlipForConditionalGeneration.from_pretrained(model_name).to(_blip_device)
            print(f"BLIP model '{model_name}' loaded to {_blip_device}.")
        except Exception as e:
            print(f"Error loading BLIP model/processor '{model_name}': {e}")
            raise

    def caption_images_blip(image_identifiers: List[Any], prompt: Optional[str] = None) -> List[Optional[str]]:
        if not (_blip_processor and _blip_model):
            raise RuntimeError("BLIP model/processor not loaded.")
        if not image_identifiers:
            return []

        # `positions` remembers which input slot each usable image came from,
        # so captions can be written back to the right place.
        batch, positions = [], []
        for position, item in enumerate(image_identifiers):
            try:
                if isinstance(item, str):
                    # File path: open, normalise to RGB, then cap the size.
                    picture = resize_pil_image(Image.open(item).convert("RGB"), max_dim)
                elif isinstance(item, Image.Image):
                    # PIL images passed in directly are not resized here.
                    picture = item.convert("RGB")
                else:
                    print(f"Warning: Skipping unsupported image id type: {type(item)}")
                    continue
                batch.append(picture)
                positions.append(position)
            except Exception as e:
                print(f"Warning: Error loading/processing image '{item}': {e}. Skipping.")

        final_captions = [None] * len(image_identifiers)
        if not batch:
            return final_captions

        try:
            processor_args = {"images": batch, "return_tensors": "pt", "padding": True}
            if prompt:
                # The same prompt is supplied for every image in the batch.
                processor_args["text"] = [prompt] * len(batch)
            inputs = _blip_processor(**processor_args).to(_blip_device)

            with torch.no_grad():
                outputs = _blip_model.generate(**inputs, max_length=75, num_beams=3)

            decoded = _blip_processor.batch_decode(outputs, skip_special_tokens=True)
            for batch_index, caption in enumerate(decoded):
                final_captions[positions[batch_index]] = caption.strip()
            return final_captions
        except Exception as e:
            print(f"Error during BLIP captioning: {e}")
            # Discard any partial results: one None per input.
            return [None] * len(image_identifiers)

    return caption_images_blip


print("All client functions defined.")

**3. HERCULES CLUSTERING EXAMPLES**

*Example 1: Local Text Clustering*

In [None]:
# Example 1: cluster a handful of short texts using only local models
# (SentenceTransformer embeddings, Gemma as the LLM passed to Hercules).
# Any exception raised in the cell is caught at the bottom and printed with a
# traceback, so the notebook keeps running.
print("\n--- EXAMPLE 1: LOCAL TEXT CLUSTERING ---")
try:
    # Instantiate clients for this example
    print("Instantiating clients for Local Text Clustering...")
    local_text_embed_client = get_local_text_embedding_client()
    # Note: Gemma model is large. Loading might take time and resources.
    # If you encounter issues, you might not have enough RAM/VRAM.
    try:
        local_llm_client_gemma = get_gemma_llm_client()
    except Exception as e:
        # Loading failed: continue with llm_client=None.
        print(f"Could not load Gemma LLM client for local text clustering: {e}")
        print("Proceeding without LLM for this example, or using a fallback if defined.")
        local_llm_client_gemma = None # Or a fallback if you have one

    # Sample Data
    sample_texts_local = [
        "Introduction to machine learning concepts.",
        "Advanced techniques in deep neural networks.",
        "A guide to Python programming for beginners.",
        "Web development using Flask and Jinja.",
        "Understanding gradient descent and backpropagation.",
        "Natural language processing with transformers.",
        "Getting started with SQL databases.",
        "Data structures and algorithms in Python."
    ]
    print(f"Using {len(sample_texts_local)} sample text documents.")

    # Define Hierarchy
    hierarchy_levels_local_text = [3, 2] # 3 top-level, then subdivide
    print(f"Target hierarchy levels: {hierarchy_levels_local_text}")

    # Instantiate Hercules
    print("Initializing Hercules for local text clustering...")
    hercules_local_text = Hercules(
        level_cluster_counts=hierarchy_levels_local_text,
        representation_mode="direct", # "direct" means use original item embeddings
        text_embedding_client=local_text_embed_client,
        llm_client=local_llm_client_gemma, # Using Gemma
        image_embedding_client=None,
        image_captioning_client=None,
    )
    print("Hercules initialized for local text clustering.")

    # Run Clustering
    print("\nStarting local text clustering process...")
    start_time_local_text = time.time()
    if local_llm_client_gemma is None: # Check if LLM client failed to load
        print("Warning: LLM client (Gemma) not available. Cluster names/summaries might be basic or missing.")

    top_clusters_local_text = hercules_local_text.cluster(sample_texts_local)
    end_time_local_text = time.time()

    # Report: print each returned top-level cluster's hierarchy, then the elapsed time.
    if top_clusters_local_text:
        print(f"\nFound {len(top_clusters_local_text)} top-level clusters for local text.")
        for i, cluster in enumerate(top_clusters_local_text):
            print(f"\n--- Top Cluster {i+1} (ID: {cluster.id}) ---")
            cluster.print_hierarchy(indent_increment=hercules_local_text.cluster_print_indent_increment)
    else:
        print("No clusters were formed for local text.")
    print(f"Local text clustering finished in {end_time_local_text - start_time_local_text:.2f} seconds.")

except Exception as e:
    # Cell-level boundary: report the failure without aborting the notebook.
    print(f"\nAn error occurred during Local Text Clustering example: {e}")
    import traceback
    traceback.print_exc()


*Example 2: Google Cloud Text Clustering*

In [None]:
# Example 2: cluster short texts using Google-hosted models (Google text
# embeddings, Gemini as the LLM passed to Hercules). The whole example is
# skipped when no usable GOOGLE_API_KEY is configured.
print("\n--- EXAMPLE 2: GOOGLE CLOUD TEXT CLUSTERING ---")
if not GOOGLE_API_KEY or "YOUR_API_KEY_HERE" in GOOGLE_API_KEY:
    print("Skipping Google Cloud Text Clustering example as GOOGLE_API_KEY is not set.")
else:
    try:
        # Instantiate clients for this example
        print("Instantiating clients for Google Cloud Text Clustering...")
        google_text_embed_client = embed_texts_google # Direct function, not factory
        google_llm_client_gemini = get_gemini_llm_client()

        # Sample Data (can reuse from local example or define new)
        sample_texts_google = [
            "The history of ancient Rome and its emperors.",
            "Exploring the cosmos: galaxies, stars, and black holes.",
            "Sustainable farming practices for a healthier planet.",
            "The impact of social media on modern society.",
            "Principles of quantum physics and string theory.",
            "Renewable energy sources: solar, wind, and hydro.",
            "Culinary arts: a journey through world cuisines.",
            "The rise of artificial intelligence and its ethical implications."
        ]
        print(f"Using {len(sample_texts_google)} sample text documents for Google Cloud.")

        # Define Hierarchy
        hierarchy_levels_google_text = [3, 2]
        print(f"Target hierarchy levels: {hierarchy_levels_google_text}")

        # Instantiate Hercules
        print("Initializing Hercules for Google Cloud text clustering...")
        hercules_google_text = Hercules(
            level_cluster_counts=hierarchy_levels_google_text,
            representation_mode="direct",
            text_embedding_client=google_text_embed_client,
            llm_client=google_llm_client_gemini, # Using Gemini
            image_embedding_client=None,
            image_captioning_client=None,
        )
        print("Hercules initialized for Google Cloud text clustering.")

        # Run Clustering
        print("\nStarting Google Cloud text clustering process...")
        start_time_google_text = time.time()
        top_clusters_google_text = hercules_google_text.cluster(sample_texts_google)
        end_time_google_text = time.time()

        # Report: print each returned top-level cluster's hierarchy, then the elapsed time.
        if top_clusters_google_text:
            print(f"\nFound {len(top_clusters_google_text)} top-level clusters for Google Cloud text.")
            for i, cluster in enumerate(top_clusters_google_text):
                print(f"\n--- Top Cluster {i+1} (ID: {cluster.id}) ---")
                cluster.print_hierarchy(indent_increment=hercules_google_text.cluster_print_indent_increment)
        else:
            print("No clusters were formed for Google Cloud text.")
        print(f"Google Cloud text clustering finished in {end_time_google_text - start_time_google_text:.2f} seconds.")

    except Exception as e:
        # Cell-level boundary: report the failure without aborting the notebook.
        print(f"\nAn error occurred during Google Cloud Text Clustering example: {e}")
        import traceback
        traceback.print_exc()

*Example 3: Local Image Clustering*

In [None]:
print("\n--- EXAMPLE 3: LOCAL IMAGE CLUSTERING (PEXELS URLS) ---")
def load_image_from_url(url: str, timeout: int = 20) -> Optional[Image.Image]:
    """Fetch ``url`` and return its content as an RGB ``PIL.Image``.

    Args:
        url: Address of the image to download.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded image converted to RGB, or ``None`` if the download times
        out, the request fails (including HTTP 4xx/5xx), the payload cannot be
        opened as an image, or any other error occurs. Each failure is printed.
    """
    try:
        print(f"Downloading image from: {url}")
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        picture = Image.open(BytesIO(reply.content)).convert("RGB")
        print(f"Successfully loaded image from: {url}")
        return picture
    # Timeout is a subclass of RequestException, so it has to be handled first.
    except requests.exceptions.Timeout:
        print(f"Timeout error downloading image from {url}")
    except requests.exceptions.RequestException as e:
        print(f"Request error downloading image from {url}: {e}")
    except OSError as e:
        print(f"Error opening image data from {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred with {url}: {e}")
    # Every failure path ends up here.
    return None
# Example 3: download a few images, then cluster them with local image models
# (CLIP embeddings, BLIP captions). Gemini is passed as the LLM when an API key
# is configured; otherwise llm_client is None. Any exception is caught at the
# bottom and printed with a traceback.
try:
    # Instantiate clients for this example
    print("Instantiating clients for Local Image Clustering...")
    # Image specific clients
    clip_image_embed_client = get_clip_image_embedding_client()
    blip_image_caption_client = get_blip_image_captioning_client()

    # Text embedding client handed to Hercules alongside the image clients.
    local_text_embed_for_image_example = get_local_text_embedding_client()

    if not GOOGLE_API_KEY or "YOUR_API_KEY_HERE" in GOOGLE_API_KEY:
        print("Warning: GOOGLE_API_KEY not set. LLM for image cluster naming will use a fallback (None) or fail if Gemini is strictly needed.")
        llm_for_image_example = None # Fallback: no LLM for naming
    else:
        llm_for_image_example = get_gemini_llm_client()


    # Sample Data: Pexels URLs
    # Using diverse images to encourage meaningful clusters
    pexels_image_urls = [
        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg",      # Cat 1
        "https://images.pexels.com/photos/1643457/pexels-photo-1643457.jpeg",     # Cat 2
        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",     # Dogs
        "https://images.pexels.com/photos/170811/pexels-photo-170811.jpeg",       # Car
        "https://images.pexels.com/photos/3763313/pexels-photo-3763313.jpeg",     # Dog
        "https://images.pexels.com/photos/346529/pexels-photo-346529.jpeg",     # Landscape (Mountains, Lake)
        "https://images.pexels.com/photos/1640777/pexels-photo-1640777.jpeg",     # Food (Salad Bowl)
        "https://images.pexels.com/photos/208701/pexels-photo-208701.jpeg",       # Architecture (Building)
        "https://images.pexels.com/photos/994605/pexels-photo-994605.jpeg"        # Landscape (Beach sunset)
    ]
    print(f"Defined {len(pexels_image_urls)} Pexels image URLs.")

    # Download and load images
    # URLs that fail to download or decode are dropped, so sample_pil_images
    # may end up shorter than pexels_image_urls.
    print("Downloading and loading images...")
    sample_pil_images = []
    for url in pexels_image_urls:
        img = load_image_from_url(url)
        if img:
            sample_pil_images.append(img)

    if not sample_pil_images:
        print("Error: No images were successfully loaded. Skipping image clustering example.")
    else:
        print(f"Successfully loaded {len(sample_pil_images)} images for clustering.")

        # Define Hierarchy
        hierarchy_levels_image = [3, 2] # Adjust as needed based on number of images
        print(f"Target hierarchy levels for images: {hierarchy_levels_image}")

        # Instantiate Hercules
        print("Initializing Hercules for local image clustering...")
        hercules_image = Hercules(
            level_cluster_counts=hierarchy_levels_image,
            representation_mode="direct", # Use direct image embeddings
            image_embedding_client=clip_image_embed_client,
            image_captioning_client=blip_image_caption_client, # Used if mode was 'caption' or for some internal processing
            llm_client=llm_for_image_example, # For cluster naming/summarization
            text_embedding_client=local_text_embed_for_image_example, # For processing any text (e.g. captions if used)
        )
        print("Hercules initialized for local image clustering.")

        # Run Clustering
        print("\nStarting image clustering process...")
        start_time_image = time.time()
        if llm_for_image_example is None:
            print("Warning: LLM client (Gemini) not available for image example. Cluster names/summaries might be basic or missing.")

        top_clusters_image = hercules_image.cluster(sample_pil_images)
        end_time_image = time.time()

        # Report: print each returned top-level cluster's hierarchy, then the elapsed time.
        if top_clusters_image:
            print(f"\nFound {len(top_clusters_image)} top-level clusters for images.")
            for i, cluster in enumerate(top_clusters_image):
                print(f"\n--- Top Cluster {i+1} (ID: {cluster.id}) ---")
                cluster.print_hierarchy(indent_increment=hercules_image.cluster_print_indent_increment)
        else:
            print("No clusters were formed for images.")
        print(f"Image clustering finished in {end_time_image - start_time_image:.2f} seconds.")

except Exception as e:
    # Cell-level boundary: report the failure without aborting the notebook.
    print(f"\nAn error occurred during Local Image Clustering example: {e}")
    import traceback
    traceback.print_exc()

*Example 4: Numeric Data Clustering*

In [None]:
# Example 4: cluster rows of a small numeric pandas DataFrame. The LLM passed to
# Hercules is Gemini when an API key is configured, otherwise Gemma, otherwise
# None. Any exception is caught at the bottom and printed with a traceback.
print("\n--- EXAMPLE 4: DATAFRAME (NUMERIC) CLUSTERING ---")
try:
    print("Instantiating clients for DataFrame Clustering...")
    local_text_embed_client_ex4 = get_local_text_embedding_client()

    llm_client_for_df_ex4 = None
    print("Attempting to use Gemini for DataFrame cluster naming (if API key available), else Gemma, else None.")
    if GOOGLE_API_KEY and "YOUR_API_KEY_HERE" not in GOOGLE_API_KEY:
        llm_client_for_df_ex4 = get_gemini_llm_client()

    # NOTE(review): get_gemini_llm_client, as defined in this notebook, returns
    # a stub function rather than None when initialisation fails, so this
    # fallback only runs when no API key is configured, not when Gemini
    # "failed".
    if llm_client_for_df_ex4 is None: # If Gemini not used or failed
        try:
            llm_client_for_df_ex4 = get_gemma_llm_client()
            print("Using Gemma for DataFrame clustering LLM.")
        except Exception as e_gemma_df:
            print(f"Could not load Gemma for DataFrame clustering: {e_gemma_df}. Proceeding without LLM.")
            llm_client_for_df_ex4 = None


    # Sample DataFrame: Customer Segments
    data = {
        'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'Age': [25, 65, 30, 70, 22, 58, 45, 48, 28, 68, 35, 62],
        'AnnualIncome_k': [40, 120, 50, 110, 35, 90, 70, 75, 45, 130, 60, 100],
        'SpendingScore_1_100': [80, 20, 75, 15, 85, 30, 55, 50, 82, 22, 70, 25],
        'YearsAsCustomer': [1, 10, 2, 12, 1, 8, 5, 6, 2, 11, 3, 9]
    }
    sample_df = pd.DataFrame(data)

    # CustomerID is left out: only the four descriptive columns are clustered.
    features_for_clustering_df = sample_df[['Age', 'AnnualIncome_k', 'SpendingScore_1_100', 'YearsAsCustomer']]

    print(f"Using a DataFrame with {features_for_clustering_df.shape[0]} samples and {features_for_clustering_df.shape[1]} features for clustering:")
    print(features_for_clustering_df.head())

    # Define Hierarchy
    hierarchy_levels_df = [3, 2] # Target: 3 top-level clusters, then attempt to subdivide
    print(f"Target hierarchy levels: {hierarchy_levels_df}")

    print("Initializing Hercules for DataFrame clustering...")
    hercules_df = Hercules(
        level_cluster_counts=hierarchy_levels_df,
        representation_mode="direct", # Use the numeric feature vectors directly
        text_embedding_client=local_text_embed_client_ex4,
        llm_client=llm_client_for_df_ex4,
        image_embedding_client=None,
        image_captioning_client=None,
    )
    print("Hercules initialized for DataFrame clustering.")

    print("\nStarting DataFrame clustering process...")
    if llm_client_for_df_ex4 is None: print("Warning: LLM client not available for DataFrame example. Cluster names might be basic.")
    start_time = time.time()

    # The DataFrame itself is handed to Hercules.
    top_clusters_df = hercules_df.cluster(features_for_clustering_df)

    end_time = time.time()

    # Report: print each returned top-level cluster's hierarchy, then the elapsed time.
    if top_clusters_df:
        print(f"\nFound {len(top_clusters_df)} top-level clusters for DataFrame data.")
        for i, cluster in enumerate(top_clusters_df):
            print(f"\n--- Top Cluster {i+1} (ID: {cluster.id}) ---")
            cluster.print_hierarchy(indent_increment=hercules_df.cluster_print_indent_increment)
    else:
        print("No clusters were formed for DataFrame data.")
    print(f"DataFrame clustering finished in {end_time - start_time:.2f} seconds.")

except Exception as e:
    # Cell-level boundary: report the failure without aborting the notebook.
    print(f"\nAn error occurred during DataFrame Clustering example: {e}")
    import traceback
    traceback.print_exc()