### Imports

In [None]:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt


In [6]:
# Import core components from ImageAtlas
from imageatlas import (
    ImageClusterer,
    ClusteringResults,
    FeaturePipeline,
    create_feature_extractor,
    create_reducer,
    PCAReducer,
    UMAPReducer,
    create_clustering_algorithm,
    GridVisualizer,
    create_cluster_grids
)

In [8]:
# Directory scanned for input images; the relative path is resolved against
# the notebook's current working directory.
IMAGE_DIR = "./input_images"
# Directory that receives the exported clustering results (CSV / JSON).
OUTPUT_DIR = "./output"

### High Level Workflow
This is the main entry point. It demonstrates:
- ImageClusterer
- ClusteringResults

In [10]:
print("--- 1. High-Level API: ImageClusterer ---")

# Initialize the main Clusterer: DINOv2 features -> PCA reduction -> KMeans.
clusterer = ImageClusterer(
    model='dinov2',
    n_clusters=5,
    clustering_method='kmeans',
    reducer='pca',                  # Optional dimensionality reduction
    n_components=100,
    device='auto',
    batch_size=4,
)

# Fit to the image directory
# This returns a ClusteringResults object
results = clusterer.fit(IMAGE_DIR)

# Results summary
print(results.summary())

# Export the clustering results.
# Create the output directory first so the exports do not fail on a fresh
# checkout where OUTPUT_DIR has not been created yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# To CSV (build the path once so the message reports the file actually written)
csv_path = os.path.join(OUTPUT_DIR, "clustering_results.csv")
results.to_csv(csv_path)
print(f"Results saved to: {csv_path}")

# To JSON
json_path = os.path.join(OUTPUT_DIR, "clustering_results.json")
results.to_json(json_path)
print(f"Results saved to: {json_path}")

--- 1. High-Level API: ImageClusterer ---
IMAGE CLUSTERING PIPELINE

Step 1: Feature Extraction.
Creating feature extractor: dinov2


Using cache found in /home/ahmad/.cache/torch/hub/facebookresearch_dinov2_main


Found 662 images in ./input_images


Extracting features: 100%|██████████| 166/166 [01:34<00:00,  1.75batch/s, processed=662, corrupted=0]



Extraction completed!
Feature Extraction Summary:
Model: dinov2feature(vits14)
Feature dimension: 384
Samples processed: 662
Extraction date: 2026-01-22T03:00:53.698923
Device: cpu
Batch size: 4
Total time: 94.71279859542847
Speed: 6.99 images/sec
  Extracted features: (662, 384)

Step 2: Dimensionality Reduction (PCA)
Creating dimensionality reducer: pca
   Reduced to: (662, 100)
  Variance explained: 90.86%

3. Step 3: Clustering (KMEANS)
Creating clusterer: kmeans
   Found 5 clusters
   Cluster sizes: {0: 130, 3: 164, 1: 116, 4: 171, 2: 81}

CLUSTERING COMPLETE

Clustering results summary:    Total Images: 662
   Number of clusters: 5
   Cluster sizes: {0: 130, 3: 164, 1: 116, 4: 171, 2: 81}
   Feature dimension: 384
   Reduced dimension: 100
   Clustering method: kmeans
   Model: dinov2
Clustering results summary:    Total Images: 662
   Number of clusters: 5
   Cluster sizes: {0: 130, 3: 164, 1: 116, 4: 171, 2: 81}
   Feature dimension: 384
   Reduced dimension: 100
   Clustering

### Feature extraction API

In [12]:
print("\n--- 2. Feature Extraction API ---")

# Build a standalone extractor instead of going through ImageClusterer.
# variants: 'vits14' (DINOV2), '50' (ResNet), 'b_16' (ViT)
extractor = create_feature_extractor(model_type='resnet', variant='50', device='cpu')

# Wrap the extractor in a batched pipeline and run it over the image directory.
pipeline = FeaturePipeline(extractor, batch_size=32)
pipeline.extract_from_directory(IMAGE_DIR)

# Pull the raw embeddings (NumPy array) and the matching file names
# back out of the pipeline.
embeddings, filenames = pipeline.get_features(), pipeline.get_filenames()

print(f"Extracted features shape: {embeddings.shape}")
print(f"First filename: {filenames[0]}")


--- 2. Feature Extraction API ---
Found 662 images in ./input_images


Extracting features: 100%|██████████| 21/21 [01:42<00:00,  4.87s/batch, processed=662, corrupted=0]


Extraction completed!
Feature Extraction Summary:
Model: resnet(50)
Feature dimension: 2048
Samples processed: 662
Extraction date: 2026-01-22T03:08:06.484052
Device: cpu
Batch size: 32
Total time: 102.31135082244873
Speed: 6.47 images/sec
Extracted features shape: (662, 2048)
First filename: input_images/000106949d.jpg





### Dimensionality Reduction API

This section shows how to use:
- create_reducer
- PCAReducer
- UMAPReducer

In [15]:
# Section banner, matching the numbered banners printed by the other API cells
# (1, 2 and 4) so the run log does not skip from "2." to "4.".
print("\n--- 3. Dimensionality Reduction API ---")

# Method 1: Using the factory function
reducer = create_reducer('pca', n_components=50)
reducer_output = reducer.fit_transform(embeddings)
print(f"Factory PCA output shape: {reducer_output.shape}")

# Method 2: Direct class Usage (PCAReducer)
pca = PCAReducer(n_components=5)
reduced_pca = pca.fit_transform(embeddings)
print(f"Direct PCAReducer output shape: {reduced_pca.shape}")

# NOTE(review): UMAPReducer is imported by this notebook but is not exercised
# in this cell — add a demonstration once its constructor arguments are confirmed.


Factory PCA output shape: (662, 50)
Direct PCAReducer output shape: (662, 5)


### Clustering Algorithm API

In [16]:
print("\n--- 4. Clustering Algorithm API ---")

# Standalone clustering: build a KMeans instance through the factory and run it
# directly on embeddings that were already computed earlier in the notebook.
algo = create_clustering_algorithm('kmeans', n_clusters=3)
result = algo.fit_predict(embeddings)

# Report the algorithm name, the number of clusters and the per-cluster sizes.
print(
    f"Algorithm used: {algo.get_algorithm_name()}",
    f"Found clusters: {result.n_clusters}",
    f"Cluster sizes: {result.get_cluster_sizes()}",
    sep="\n",
)


--- 4. Clustering Algorithm API ---
Algorithm used: KMeans
Found clusters: 3
Cluster sizes: {1: 139, 0: 260, 2: 263}
