# Data Sampling for Clustering Analysis

This notebook creates evenly distributed dataset samples from downloaded center images for clustering analysis.

Features:
- Analyzes tenant distribution in the input directory
- Creates evenly distributed samples (equal ratio per tenant)
- Resizes images to the size configured in `TARGET_IMAGE_SIZE` (width x height) for clustering
- Copies sampled images to a new directory
- Maintains original filename convention: {tenant}_{SID}_{filename}_C.png

In [31]:
import json
import os
import random
import shutil
import time
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from PIL import Image

In [None]:
# --- Sampling configuration ---------------------------------------------------
# Seed handed to random.seed() for reproducible sampling
RANDOM_SEED = 42
# Only files whose name ends with this suffix are considered
IMAGE_EXTENSION = "_C.png"
# Every sampled image is resized to this (width, height)
TARGET_IMAGE_SIZE = (224, 224)
# Total number of images in the sample (across all tenants)
SAMPLE_SIZE = 10_000
# Directory holding the downloaded images
INPUT_PATH = "./downloads/center_images"
# Destination directory of the sampled dataset; the sample size is part of its name
OUTPUT_PATH = f"./datasets/clustering_sample_{SAMPLE_SIZE}"

In [33]:
# Seed Python's global RNG (reproducible sampling)
random.seed(RANDOM_SEED)

# Make sure the destination exists: missing parent folders are created and an
# already existing directory is not an error
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Echo the effective configuration
print(f"Input directory: {INPUT_PATH}")
print(f"Output directory: {OUTPUT_PATH}")
print(f"Target sample size: {SAMPLE_SIZE} images")
print(f"Target image size: {TARGET_IMAGE_SIZE[0]}x{TARGET_IMAGE_SIZE[1]} pixels")

Input directory: ./downloads/center_images
Output directory: ./datasets/clustering_sample_5000
Target sample size: 5000 images
Target image size: 500x250 pixels


In [34]:
def parse_filename(
    filename: str, extension: Optional[str] = None
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Parse filename to extract tenant, SID, and original filename.
    Expected format: {tenant}_{SID}_{filename}{extension},
    e.g. ``ava_104_0000005580_C.png`` with extension ``_C.png``.

    Args:
        filename: Image filename (without directory component)
        extension: Suffix the filename must end with. When omitted, the
            notebook-level IMAGE_EXTENSION is used; it is looked up at call
            time so re-running the configuration cell takes effect immediately.

    Returns:
        Tuple of (tenant, sid, original_filename). All three entries are None
        when the filename does not end with the suffix or has fewer than three
        underscore-separated parts in front of it.
    """
    suffix = IMAGE_EXTENSION if extension is None else extension

    if not filename.endswith(suffix):
        return None, None, None

    # Slice up to an explicit end index: ``filename[:-len(suffix)]`` would
    # collapse to an empty string for an empty suffix.
    stem = filename[:len(filename) - len(suffix)]

    # maxsplit=2 leaves underscores inside the original filename untouched
    parts = stem.split('_', 2)
    if len(parts) < 3:
        return None, None, None

    tenant, sid, original_filename = parts
    return tenant, sid, original_filename

In [35]:
def analyze_dataset(input_path: str) -> Dict[str, List[str]]:
    """
    Analyze the input dataset to understand tenant distribution.

    Only directory entries ending with IMAGE_EXTENSION are considered. Entries
    for which parse_filename() yields no tenant are counted as invalid and the
    first five of them are printed.

    Args:
        input_path: Path to input directory

    Returns:
        Dictionary with tenant as key and list of filenames as value. The
        filenames of each tenant are in sorted order. An empty dictionary is
        returned when input_path does not exist.
    """
    tenant_files = defaultdict(list)
    invalid_files = []

    print("Analyzing dataset...")

    if not os.path.exists(input_path):
        print(f"Error: Input path {input_path} does not exist")
        return {}

    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # per-tenant lists deterministic, which seeded sampling from these lists
    # needs in order to be reproducible across machines and runs.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith(IMAGE_EXTENSION):
            continue

        # Only the tenant part is needed for grouping
        tenant = parse_filename(filename)[0]
        if tenant:
            tenant_files[tenant].append(filename)
        else:
            invalid_files.append(filename)

    print(f"Found {sum(len(files) for files in tenant_files.values())} valid images")
    print(f"Found {len(invalid_files)} invalid filenames")

    if invalid_files:
        print(f"Invalid files (first 5): {invalid_files[:5]}")

    # Return a plain dict so missing tenants raise KeyError instead of being
    # silently created by the defaultdict
    return dict(tenant_files)

In [36]:
def calculate_sample_distribution(tenant_files: Dict[str, List[str]], total_sample_size: int) -> Dict[str, int]:
    """
    Split the requested sample size evenly across tenants.

    Every tenant gets the integer share of ``total_sample_size``; the leftover
    of the division is handed out one image at a time to the first tenants in
    alphabetical order. A tenant's share is capped at the number of files it
    actually has, and the capped amount is not passed on to other tenants, so
    the shares may add up to less than ``total_sample_size``.

    Args:
        tenant_files: Dictionary of tenant -> list of files
        total_sample_size: Total number of images to sample

    Returns:
        Dictionary of tenant -> number of images to sample
        (empty when ``tenant_files`` is empty)
    """
    if not tenant_files:
        return {}

    share, leftover = divmod(total_sample_size, len(tenant_files))

    return {
        name: min(share + 1 if position < leftover else share, len(tenant_files[name]))
        for position, name in enumerate(sorted(tenant_files))
    }

In [37]:
def resize_image(input_path: str, output_path: str, target_size: Tuple[int, int]) -> bool:
    """
    Load an image, convert it to RGB if needed, scale it to ``target_size``
    and write the result as PNG.

    Any exception raised along the way is caught, reported via print() and
    turned into a False return value.

    Args:
        input_path: Path to input image
        output_path: Path to save resized image
        target_size: Target size (width, height)

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        with Image.open(input_path) as source:
            # Non-RGB modes (RGBA, grayscale, etc.) are converted first
            rgb = source if source.mode == 'RGB' else source.convert('RGB')

            # Scale with the LANCZOS filter and store as PNG
            scaled = rgb.resize(target_size, Image.Resampling.LANCZOS)
            scaled.save(output_path, 'PNG')
    except Exception as e:
        print(f"Error resizing {input_path}: {e}")
        return False

    return True

In [38]:
def create_sample_dataset(input_path: str, output_path: str, tenant_files: Dict[str, List[str]], 
                         sample_distribution: Dict[str, int], target_size: Tuple[int, int]) -> Dict[str, int]:
    """
    Create the sampled dataset by resizing the selected images into output_path.

    For every tenant a random subset of its files is drawn with the global
    ``random`` module and each drawn file is passed to resize_image(), which
    writes the resized copy under the same filename in output_path.

    Args:
        input_path: Input directory path
        output_path: Output directory path
        tenant_files: Dictionary of tenant -> list of files
        sample_distribution: Dictionary of tenant -> number to sample.
            Tenants with a count of zero or less are skipped. A count larger
            than the number of available files is capped to what is available.
        target_size: Target image size (width, height)

    Returns:
        Dictionary with statistics: 'total_copied', 'total_failed' and
        'tenant_stats' (tenant -> available / target / copied / failed)
    """
    stats = {
        'total_copied': 0,
        'total_failed': 0,
        'tenant_stats': {}
    }

    print("Creating sample dataset...")
    print("-" * 60)

    for tenant, target_count in sample_distribution.items():
        # Nothing to draw (a negative count would make random.sample raise)
        if target_count <= 0:
            continue

        # Sort so that the draw depends only on the seed and the set of files,
        # not on the order in which the caller happened to list them. A tenant
        # missing from tenant_files is treated as having no files.
        available_files = sorted(tenant_files.get(tenant, []))

        # random.sample raises ValueError when asked for more than is available
        sample_count = min(target_count, len(available_files))
        sampled_files = random.sample(available_files, sample_count)

        copied_count = 0
        failed_count = 0

        for filename in sampled_files:
            input_file_path = os.path.join(input_path, filename)
            output_file_path = os.path.join(output_path, filename)

            # Resize and copy image
            if resize_image(input_file_path, output_file_path, target_size):
                copied_count += 1
            else:
                failed_count += 1

        stats['total_copied'] += copied_count
        stats['total_failed'] += failed_count
        stats['tenant_stats'][tenant] = {
            'available': len(available_files),
            'target': target_count,
            'copied': copied_count,
            'failed': failed_count
        }

        print(f"{tenant}: {copied_count}/{target_count} images copied ({len(available_files)} available)")

    return stats

In [39]:
# Main execution
# start_time is read again by the summary cell to report the total duration
start_time = time.time()

# Step 1: Analyze input dataset
print("=" * 80)
print("STEP 1: ANALYZING INPUT DATASET")
print("=" * 80)

tenant_files = analyze_dataset(INPUT_PATH)

if not tenant_files:
    print("No valid images found. Exiting.")
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown (losing all state), whereas an exception simply stops
    # this cell and any "Run All" in progress.
    raise SystemExit(1)

# The grand total does not change inside the loop, so compute it once
total_images = sum(len(files) for files in tenant_files.values())

# Display tenant distribution
print("\nTenant distribution:")
for tenant, files in sorted(tenant_files.items()):
    count = len(files)
    percentage = (count / total_images) * 100
    print(f"  {tenant}: {count:,} images ({percentage:.1f}%)")

print(f"\nTotal images found: {total_images:,}")

STEP 1: ANALYZING INPUT DATASET
Analyzing dataset...
Found 250000 valid images
Found 0 invalid filenames

Tenant distribution:
  ava: 21,980 images (8.8%)
  bernmobil: 16,421 images (6.6%)
  bvb: 27,024 images (10.8%)
  cts: 30,913 images (12.4%)
  gent: 23,456 images (9.4%)
  gvb: 18,651 images (7.5%)
  retm: 65,297 images (26.1%)
  vbz: 46,258 images (18.5%)

Total images found: 250,000
Found 250000 valid images
Found 0 invalid filenames

Tenant distribution:
  ava: 21,980 images (8.8%)
  bernmobil: 16,421 images (6.6%)
  bvb: 27,024 images (10.8%)
  cts: 30,913 images (12.4%)
  gent: 23,456 images (9.4%)
  gvb: 18,651 images (7.5%)
  retm: 65,297 images (26.1%)
  vbz: 46,258 images (18.5%)

Total images found: 250,000


In [40]:
# Step 2: Calculate sample distribution
print("\n" + "=" * 80)
print("STEP 2: CALCULATING SAMPLE DISTRIBUTION")
print("=" * 80)

sample_distribution = calculate_sample_distribution(tenant_files, SAMPLE_SIZE)

print(f"Target sample size: {SAMPLE_SIZE:,} images")
print(f"Number of tenants: {len(tenant_files)}")
print("\nSample distribution:")

total_to_sample = 0
for tenant, count in sorted(sample_distribution.items()):
    available = len(tenant_files[tenant])
    # Guard against a configured sample size of zero
    percentage = (count / SAMPLE_SIZE) * 100 if SAMPLE_SIZE else 0.0
    total_to_sample += count
    print(f"  {tenant}: {count:,} images ({percentage:.1f}%) [available: {available:,}]")

print(f"\nTotal to sample: {total_to_sample:,} images")

# Check if we have enough images
# sample_distribution only holds the already capped counts, so the uncapped
# per-tenant quota is recomputed here with the same rule that
# calculate_sample_distribution applies (even split, remainder to the first
# tenants in sorted order). A tenant is limited only when it has strictly
# fewer files than that quota.
tenant_targets = {}
if tenant_files:
    base_quota, extra = divmod(SAMPLE_SIZE, len(tenant_files))
    tenant_targets = {
        tenant: base_quota + (1 if position < extra else 0)
        for position, tenant in enumerate(sorted(tenant_files))
    }

limited_tenants = [
    tenant for tenant in sorted(tenant_targets)
    if len(tenant_files[tenant]) < tenant_targets[tenant]
]
if limited_tenants:
    print(f"\nNote: {len(limited_tenants)} tenant(s) have fewer images than target sample size:")
    for tenant in limited_tenants:
        print(f"  {tenant}: {len(tenant_files[tenant])} available, {tenant_targets[tenant]} needed")
    print(f"The sample will contain {total_to_sample:,} of the requested {SAMPLE_SIZE:,} images.")


STEP 2: CALCULATING SAMPLE DISTRIBUTION
Target sample size: 5,000 images
Number of tenants: 8

Sample distribution:
  ava: 625 images (12.5%) [available: 21,980]
  bernmobil: 625 images (12.5%) [available: 16,421]
  bvb: 625 images (12.5%) [available: 27,024]
  cts: 625 images (12.5%) [available: 30,913]
  gent: 625 images (12.5%) [available: 23,456]
  gvb: 625 images (12.5%) [available: 18,651]
  retm: 625 images (12.5%) [available: 65,297]
  vbz: 625 images (12.5%) [available: 46,258]

Total to sample: 5,000 images


In [41]:
# Step 3: Create sample dataset
print("\n" + "=" * 80)
print("STEP 3: CREATING SAMPLE DATASET")
print("=" * 80)

# Draw the per-tenant samples and write the resized copies; the returned
# statistics are used by the summary and log cells below.
stats = create_sample_dataset(
    input_path=INPUT_PATH,
    output_path=OUTPUT_PATH,
    tenant_files=tenant_files,
    sample_distribution=sample_distribution,
    target_size=TARGET_IMAGE_SIZE,
)


STEP 3: CREATING SAMPLE DATASET
Creating sample dataset...
------------------------------------------------------------
ava: 625/625 images copied (21980 available)
ava: 625/625 images copied (21980 available)
bernmobil: 625/625 images copied (16421 available)
bernmobil: 625/625 images copied (16421 available)
bvb: 625/625 images copied (27024 available)
bvb: 625/625 images copied (27024 available)
cts: 625/625 images copied (30913 available)
cts: 625/625 images copied (30913 available)
gent: 625/625 images copied (23456 available)
gent: 625/625 images copied (23456 available)
gvb: 625/625 images copied (18651 available)
gvb: 625/625 images copied (18651 available)
retm: 625/625 images copied (65297 available)
retm: 625/625 images copied (65297 available)
vbz: 625/625 images copied (46258 available)
vbz: 625/625 images copied (46258 available)


In [42]:
# Step 4: Summary and statistics
# Duration is measured from start_time, which is set at the top of the Step 1 cell
end_time = time.time()
duration = end_time - start_time

print("\n" + "=" * 80)
print("SAMPLING SUMMARY")
print("=" * 80)

print(f"Execution time: {duration:.2f} seconds")
print(f"Total images copied: {stats['total_copied']:,}")
print(f"Total failed: {stats['total_failed']:,}")

# Nothing may have been processed at all (e.g. a sample size of 0); avoid
# dividing by zero in that case
total_attempted = stats['total_copied'] + stats['total_failed']
if total_attempted:
    print(f"Success rate: {(stats['total_copied'] / total_attempted * 100):.1f}%")
else:
    print("Success rate: n/a (no images were processed)")

print("\nDetailed tenant statistics:")
for tenant, tenant_stats in sorted(stats['tenant_stats'].items()):
    print(f"  {tenant}:")
    print(f"    Available: {tenant_stats['available']:,}")
    print(f"    Target: {tenant_stats['target']:,}")
    print(f"    Copied: {tenant_stats['copied']:,}")
    print(f"    Failed: {tenant_stats['failed']:,}")
    if tenant_stats['target'] > 0:
        success_rate = (tenant_stats['copied'] / tenant_stats['target']) * 100
        print(f"    Success rate: {success_rate:.1f}%")

print(f"\nSample dataset saved to: {OUTPUT_PATH}")
print(f"Images resized to: {TARGET_IMAGE_SIZE[0]}x{TARGET_IMAGE_SIZE[1]} pixels")


SAMPLING SUMMARY
Execution time: 156.16 seconds
Total images copied: 5,000
Total failed: 0
Success rate: 100.0%

Detailed tenant statistics:
  ava:
    Available: 21,980
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  bernmobil:
    Available: 16,421
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  bvb:
    Available: 27,024
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  cts:
    Available: 30,913
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  gent:
    Available: 23,456
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  gvb:
    Available: 18,651
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  retm:
    Available: 65,297
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%
  vbz:
    Available: 46,258
    Target: 625
    Copied: 625
    Failed: 0
    Success rate: 100.0%

Sample dataset saved to: ./datasets/clustering_sa

In [43]:
# Save sampling log
# The log is written next to the sampled images
log_file = os.path.join(OUTPUT_PATH, 'sampling_log.json')

# Everything needed to trace this run: when it ran, the configuration used,
# what the input looked like and what was actually written
log_data = {
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'config': {
        'input_path': INPUT_PATH,
        'output_path': OUTPUT_PATH,
        'sample_size': SAMPLE_SIZE,
        'target_image_size': TARGET_IMAGE_SIZE,
        'random_seed': RANDOM_SEED
    },
    'original_dataset': {
        'total_images': total_images,
        'tenant_distribution': {name: len(paths) for name, paths in tenant_files.items()}
    },
    'sample_dataset': {
        'total_copied': stats['total_copied'],
        'total_failed': stats['total_failed'],
        'execution_time_seconds': duration,
        'tenant_stats': stats['tenant_stats']
    }
}

# Writing the log is best effort: a failure is reported as a warning only
try:
    with open(log_file, 'w') as log_handle:
        json.dump(log_data, log_handle, indent=2)
    print(f"\nSampling log saved to: {log_file}")
except Exception as e:
    print(f"\nWarning: Could not save sampling log: {e}")


Sampling log saved to: ./datasets/clustering_sample_5000/sampling_log.json


In [44]:
# Verify the sample dataset
print("\n" + "=" * 80)
print("VERIFICATION")
print("=" * 80)

# Problems found below are collected here so the final message reflects the
# actual outcome instead of always announcing success
verification_issues = []

# Count files in output directory (sorted so the size spot check below does
# not depend on directory listing order)
output_files = sorted(f for f in os.listdir(OUTPUT_PATH) if f.endswith(IMAGE_EXTENSION))
print(f"Files in output directory: {len(output_files)}")

if stats['total_failed'] > 0:
    verification_issues.append(f"{stats['total_failed']} image(s) failed to resize")

# The output directory is created with exist_ok=True and never emptied by this
# notebook, so files from an earlier run can still be present
if len(output_files) != stats['total_copied']:
    verification_issues.append(
        f"{len(output_files)} image(s) in output directory, "
        f"but {stats['total_copied']} were written by this run"
    )

# Verify tenant distribution in output
output_tenant_count = Counter()
for filename in output_files:
    tenant, _, _ = parse_filename(filename)
    if tenant:
        output_tenant_count[tenant] += 1

print("\nFinal tenant distribution in sample:")
for tenant, count in sorted(output_tenant_count.items()):
    percentage = (count / len(output_files)) * 100 if output_files else 0
    print(f"  {tenant}: {count} images ({percentage:.1f}%)")

# Check image sizes (sample a few)
if output_files:
    print("\nVerifying image sizes (sampling 3 images)...")
    sample_files = random.sample(output_files, min(3, len(output_files)))
    for filename in sample_files:
        try:
            img_path = os.path.join(OUTPUT_PATH, filename)
            with Image.open(img_path) as img:
                print(f"  {filename}: {img.size[0]}x{img.size[1]} pixels")
                # Compare against the configured size rather than only printing it
                if tuple(img.size) != tuple(TARGET_IMAGE_SIZE):
                    verification_issues.append(
                        f"{filename} is {img.size[0]}x{img.size[1]}, "
                        f"expected {TARGET_IMAGE_SIZE[0]}x{TARGET_IMAGE_SIZE[1]}"
                    )
        except Exception as e:
            print(f"  {filename}: Error reading image - {e}")
            verification_issues.append(f"{filename} could not be read")

if verification_issues:
    print("\n✗ Dataset sampling finished with problems:")
    for issue in verification_issues:
        print(f"  - {issue}")
else:
    print("\n✓ Dataset sampling completed successfully!")


VERIFICATION
Files in output directory: 5000

Final tenant distribution in sample:
  ava: 625 images (12.5%)
  bernmobil: 625 images (12.5%)
  bvb: 625 images (12.5%)
  cts: 625 images (12.5%)
  gent: 625 images (12.5%)
  gvb: 625 images (12.5%)
  retm: 625 images (12.5%)
  vbz: 625 images (12.5%)

Verifying image sizes (sampling 3 images)...
  ava_104_0000005580_C.png: 500x250 pixels
  gent_59_0000001770_C.png: 500x250 pixels
  bernmobil_152_0000004800_C.png: 500x250 pixels

✓ Dataset sampling completed successfully!
