In [1]:
import os
import re
from collections import Counter

# Try different possible paths for images directory
possible_image_paths = [
    '../images',
    '../../images',
    './images',
    '../phase2_style_evaluation/images',
    'C:/Users/SouayedBelkiss/OneDrive - gae/Desktop/Thesis/phase2_style_evaluation/images'
]

# Find the correct path
base_path = None
for path in possible_image_paths:
    if os.path.exists(path) and os.path.isdir(path):
        if os.path.exists(os.path.join(path, 'by_style')) or os.path.exists(os.path.join(path, 'by_dataset')):
            base_path = path
            print(f"Found images directory at: {base_path}")
            break

if base_path is None:
    print("ERROR: Could not find the images directory. Please specify the path manually.")
    base_path = input("Enter the full path to your images directory: ")
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"The specified path does not exist: {base_path}")

def parse_filename(filename):
    """Parse filename to extract dataset, sample_id, and style"""
    patterns = [
        r"(\w+)_([A-Za-z0-9-]+)_(\w+)\.(png|jpg|jpeg)",  # dataset_sampleid_style.ext
        r"([A-Za-z0-9-]+)_(\w+)\.(png|jpg|jpeg)"         # sampleid_style.ext (missing dataset)
    ]
    
    for pattern in patterns:
        match = re.match(pattern, filename)
        if match:
            if len(match.groups()) == 4:  # dataset_sampleid_style.ext
                return match.group(1), match.group(2), match.group(3)
            elif len(match.groups()) == 3:  # sampleid_style.ext
                return None, match.group(1), match.group(2)
    
    return None, None, None

def count_images():
    """Count images by style and dataset"""
    total_count = 0
    styles_count = Counter()
    datasets_count = Counter()
    samples_count = Counter()
    
    # First check style directory
    style_path = os.path.join(base_path, 'by_style')
    if os.path.exists(style_path):
        print(f"\nCounting images in by_style directory...")
        styles = [d for d in os.listdir(style_path) if os.path.isdir(os.path.join(style_path, d))]
        
        for style in styles:
            style_dir = os.path.join(style_path, style)
            image_files = [f for f in os.listdir(style_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            
            style_count = len(image_files)
            total_count += style_count
            styles_count[style] += style_count
            
            for img_file in image_files:
                dataset, sample_id, _ = parse_filename(img_file)
                if dataset:
                    datasets_count[dataset] += 1
                if sample_id:
                    samples_count[sample_id] += 1
    
    # Then check dataset directory
    dataset_path = os.path.join(base_path, 'by_dataset')
    if os.path.exists(dataset_path):
        print(f"\nCounting images in by_dataset directory...")
        datasets = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
        
        dataset_total = 0
        for dataset in datasets:
            dataset_dir = os.path.join(dataset_path, dataset)
            image_files = [f for f in os.listdir(dataset_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            
            dataset_count = len(image_files)
            dataset_total += dataset_count
            datasets_count[dataset] += dataset_count
            
            for img_file in image_files:
                _, sample_id, style = parse_filename(img_file)
                if style:
                    styles_count[style] += 0  # Just to ensure it's in the counter
                if sample_id:
                    samples_count[sample_id] += 0  # Just to ensure it's in the counter
    
    return {
        'total': total_count,
        'styles': styles_count,
        'datasets': datasets_count,
        'unique_samples': len(samples_count)
    }

def main():
    """Main function to count and display image statistics"""
    print("Counting images in directory...")
    counts = count_images()
    
    print("\n===== IMAGE COUNT SUMMARY =====")
    print(f"Total images found: {counts['total']}")
    print(f"Unique samples: {counts['unique_samples']}")
    
    if counts['styles']:
        print("\nBreakdown by Style:")
        print("-----------------")
        for style, count in sorted(counts['styles'].items(), key=lambda x: x[1], reverse=True):
            print(f"{style}: {count} images")
    
    if counts['datasets']:
        print("\nBreakdown by Dataset:")
        print("-------------------")
        for dataset, count in sorted(counts['datasets'].items(), key=lambda x: x[1], reverse=True):
            print(f"{dataset}: {count} images")
    
    # Calculate expected total based on styles and samples
    if counts['styles'] and counts['unique_samples'] > 0:
        expected_total = counts['unique_samples'] * len(counts['styles'])
        completion_percent = (counts['total'] / expected_total) * 100 if expected_total > 0 else 0
        
        print("\nCompletion Analysis:")
        print("-------------------")
        print(f"Expected total (samples × styles): {expected_total}")
        print(f"Current completion: {completion_percent:.1f}%")
        print(f"Missing images: {expected_total - counts['total']}")

if __name__ == "__main__":
    main()

Found images directory at: ../images
Counting images in directory...

Counting images in by_style directory...

Counting images in by_dataset directory...

===== IMAGE COUNT SUMMARY =====
Total images found: 3935
Unique samples: 298

Breakdown by Style:
-----------------
geometric: 397 images
cartoon: 396 images
artistic: 395 images
storybook: 395 images
3d rendered: 393 images
digital art: 393 images
minimalistic: 393 images
technical: 393 images
realistic: 390 images
retro: 390 images

Breakdown by Dataset:
-------------------
simpa: 999 images
onestopenglish: 992 images
asset: 976 images
wikipedia: 968 images
simpa_simpa: 799 images
onestopenglish_onestop: 794 images
asset_asset: 782 images
wikipedia_wikipedia: 774 images

Completion Analysis:
-------------------
Expected total (samples × styles): 2980
Current completion: 132.0%
Missing images: -955
