# Complete Image Extraction from Amazing Logos V4

This notebook extracts all images present in metadata9.csv from the Amazing Logos V4 dataset:
- Loads the HuggingFace dataset from input/amazing_logos_v4/
- Loads metadata9.csv for complete image list
- Extracts all images that are present in the metadata (after cleanup)
- Resizes every image to 256×256 RGB and saves it as PNG in the total_after_cleanup folder
- No sampling - extracts every image from the cleaned metadata

In [1]:
# Extraction settings shared by the cells below
IMAGE_SIZE = (256, 256)  # target size passed to PIL's Image.resize for every saved image
OUTPUT_FORMAT = 'PNG'  # format name passed to Image.save; its lower-case form is the file extension

# Echo the settings so the run log records what was used
print(
    "Configuration:",
    "  Extracting ALL images from metadata9.csv",
    f"  Image size: {IMAGE_SIZE}",
    f"  Output format: {OUTPUT_FORMAT}",
    "  No sampling - complete extraction based on cleaned metadata",
    sep="\n",
)

Configuration:
  Extracting ALL images from metadata9.csv
  Image size: (256, 256)
  Output format: PNG
  No sampling - complete extraction based on cleaned metadata


In [None]:
import io
import json
import os
import re
from collections import defaultdict, Counter
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset
from datasets import load_from_disk
from PIL import Image
from tqdm import tqdm

# Setup paths (all relative to the notebook's working directory)
input_dataset_path = Path('../../input/amazing_logos_v4/train')
metadata_path = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata9.csv')
output_images_path = Path('../../output/amazing_logos_v4/images/total_after_cleanup')
output_metadata_path = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_image_prep/total_after_cleanup_metadata.csv')

# Create both output directories up front. The metadata CSV is written to a
# different directory than the images, so it needs its own mkdir; without it the
# final save would fail only after the long extraction has already finished.
output_images_path.mkdir(parents=True, exist_ok=True)
output_metadata_path.parent.mkdir(parents=True, exist_ok=True)

print(f"Input dataset: {input_dataset_path}")
print(f"Metadata file: {metadata_path}")
print(f"Output images: {output_images_path}")
print(f"Output metadata: {output_metadata_path}")

# Check if paths exist. Each required input is checked independently so that the
# success message is printed only when *both* are present (previously it was tied
# to the metadata check alone and could appear next to a missing-dataset error).
dataset_found = input_dataset_path.exists()
metadata_found = metadata_path.exists()

if not dataset_found:
    print(f"‚ùå Input dataset path does not exist: {input_dataset_path}")
if not metadata_found:
    print(f"‚ùå Metadata file does not exist: {metadata_path}")
if dataset_found and metadata_found:
    print("‚úÖ All required paths exist")

Input dataset: ..\..\input\amazing_logos_v4\train
Metadata file: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata9.csv
Output images: ..\..\output\amazing_logos_v4\images\total_after_cleanup
Output metadata: ..\..\output\amazing_logos_v4\data\total_after_cleanup_metadata.csv
‚úÖ All required paths exist


In [3]:
# Read the cleaned metadata; its rows define exactly which images get extracted
print("Loading complete metadata...")
try:
    # Restrict the read to the two columns the later cells use
    metadata_df = pd.read_csv(metadata_path, usecols=['id', 'category'])
    total_rows = len(metadata_df)
    print(f"‚úÖ Loaded metadata: {total_rows:,} total images to extract")

    # Per-category row counts (value_counts orders them most frequent first)
    category_counts = metadata_df['category'].value_counts()
    print(f"\nFound {len(category_counts)} unique categories")
    print("Top 10 categories:")
    for rank, (cat, count) in enumerate(category_counts.iloc[:10].items(), start=1):
        share = count / total_rows * 100
        print(f"  {rank:2d}. {cat}: {count:,} ({share:.1f}%)")

    print(f"\nTotal images to extract: {total_rows:,}")

except Exception as err:
    # Report the problem in the cell output, then re-raise so the run stops here
    print(f"‚ùå Error loading metadata: {err}")
    raise

Loading complete metadata...
‚úÖ Loaded metadata: 352,154 total images to extract

Found 109 unique categories
Top 10 categories:
   1. unclassified: 73,954 (21.0%)
   2. sports_recreation: 11,383 (3.2%)
   3. restaurant_dining: 10,963 (3.1%)
   4. real_estate_residential: 10,825 (3.1%)
   5. healthcare_general: 10,534 (3.0%)
   6. design_creative: 10,366 (2.9%)
   7. nonprofit_charity: 10,134 (2.9%)
   8. fashion_apparel: 9,174 (2.6%)
   9. education_k12: 8,572 (2.4%)
  10. music_industry: 7,318 (2.1%)

Total images to extract: 352,154
‚úÖ Loaded metadata: 352,154 total images to extract

Found 109 unique categories
Top 10 categories:
   1. unclassified: 73,954 (21.0%)
   2. sports_recreation: 11,383 (3.2%)
   3. restaurant_dining: 10,963 (3.1%)
   4. real_estate_residential: 10,825 (3.1%)
   5. healthcare_general: 10,534 (3.0%)
   6. design_creative: 10,366 (2.9%)
   7. nonprofit_charity: 10,134 (2.9%)
   8. fashion_apparel: 9,174 (2.6%)
   9. education_k12: 8,572 (2.4%)
  10. music_

In [4]:
# Every metadata row is extracted, so there is no sampling step
print("Extraction strategy: COMPLETE")
print(f"  Total images in metadata: {len(metadata_df):,}")
print("  All images will be extracted (no sampling)")

# Independent copy of the metadata; later cells read it under the name `sampled_df`
sampled_df = metadata_df.copy()
n_images = len(sampled_df)

print("\nDistribution of all images by category:")
category_counts = sampled_df['category'].value_counts()
for rank, (category, count) in enumerate(category_counts.iloc[:15].items(), start=1):
    print(f"  {rank:2d}. {category}: {count:,} ({count / n_images * 100:.1f}%)")

# Fold everything past the first 15 categories into one summary line
remaining = len(category_counts) - 15
if remaining > 0:
    remaining_count = category_counts.iloc[15:].sum()
    remaining_percentage = remaining_count / n_images * 100
    print(f"  ... and {remaining} more categories: {remaining_count:,} ({remaining_percentage:.1f}%)")

print(f"\nTotal images to process: {n_images:,}")

Extraction strategy: COMPLETE
  Total images in metadata: 352,154
  All images will be extracted (no sampling)

Distribution of all images by category:
   1. unclassified: 73,954 (21.0%)
   2. sports_recreation: 11,383 (3.2%)
   3. restaurant_dining: 10,963 (3.1%)
   4. real_estate_residential: 10,825 (3.1%)
   5. healthcare_general: 10,534 (3.0%)
   6. design_creative: 10,366 (2.9%)
   7. nonprofit_charity: 10,134 (2.9%)
   8. fashion_apparel: 9,174 (2.6%)
   9. education_k12: 8,572 (2.4%)
  10. music_industry: 7,318 (2.1%)
  11. marketing_advertising: 6,806 (1.9%)
  12. retail_general: 6,503 (1.8%)
  13. home_improvement: 6,200 (1.8%)
  14. entertainment_venues: 6,089 (1.7%)
  15. chemical_materials: 5,501 (1.6%)
  ... and 94 more categories: 157,832 (44.8%)

Total images to process: 352,154


In [5]:
# Collect the dataset's Arrow shard files, ordered by path
arrow_files = sorted(input_dataset_path.glob('data-*.arrow'))

# List only the first five shards to keep the output short
print(f"Found {len(arrow_files)} arrow files:")
for position, shard in enumerate(arrow_files[:5], start=1):
    print(f"  {position}. {shard.name}")
not_listed = len(arrow_files) - 5
if not_listed > 0:
    print(f"  ... and {not_listed} more")

# Alias under the older name (same list object) so code that still refers to
# `parquet_files` keeps working
parquet_files = arrow_files

Found 29 arrow files:
  1. data-00000-of-00029.arrow
  2. data-00001-of-00029.arrow
  3. data-00002-of-00029.arrow
  4. data-00003-of-00029.arrow
  5. data-00004-of-00029.arrow
  ... and 24 more


In [6]:
# Debug: inspect the dataset layout and how metadata ids encode a row index
import re

if arrow_files:
    print("Examining the structure of the first dataset file...")
    try:
        # Open only the first shard to look at its schema
        dataset = Dataset.from_file(str(arrow_files[0]))
        print(f"Dataset features: {dataset.features}")
        print(f"Dataset length: {len(dataset)}")

        if len(dataset) > 0:
            print("\nFirst example:")
            first_example = dataset[0]
            print("Keys:", list(first_example.keys()))

            for field_name, field_value in first_example.items():
                if field_name == 'image':
                    # Describe the image instead of printing the object itself
                    shown = f"{type(field_value)} - PIL Image with size {getattr(field_value, 'size', 'unknown')}"
                elif isinstance(field_value, str) and len(field_value) > 100:
                    # Long text is cut to its first 100 characters
                    shown = f"{field_value[:100]}..."
                else:
                    shown = f"{field_value}"
                print(f"  {field_name}: {shown}")

        # Check if the whole dataset directory can be opened as well
        print("\nTrying to load the full dataset from directory...")
        try:
            from datasets import load_from_disk
            full_dataset = load_from_disk(str(input_dataset_path.parent))
            print(f"Full dataset loaded: {full_dataset}")
            if 'train' in full_dataset:
                train_data = full_dataset['train']
                print(f"Train split: {len(train_data)} examples")
                print(f"Features: {train_data.features}")
        except Exception as load_error:
            print(f"Could not load full dataset: {load_error}")

    except Exception as shard_error:
        print(f"Error examining dataset file: {shard_error}")

# Examine sampled metadata ID format
banner = "=" * 50
print("\n" + banner)
print("EXAMINING SAMPLED METADATA ID FORMAT")
print(banner)

if len(sampled_df) == 0:
    print("No sampled metadata available for ID analysis!")
else:
    print("Sample of IDs from metadata:")
    sample_ids = sampled_df['id'].head(10).tolist()
    for position, id_val in enumerate(sample_ids, start=1):
        print(f"  {position:2d}. {id_val}")

    # Try each index-extraction rule on the sample and report which one applies
    print("\nAnalyzing ID format to extract indices...")
    id_patterns = []
    for id_val in sample_ids:
        if not isinstance(id_val, str):
            print(f"  {id_val} -> non-string ID")
            continue

        digit_runs = re.findall(r'\d+', id_val)
        if not digit_runs:
            print(f"  {id_val} -> no numbers found")
            continue
        print(f"  {id_val} -> all numbers: {digit_runs}")

        # Preferred rule: digits directly after 'amazing_logo_v4';
        # second choice: digits directly after any 'v4'
        hit = re.search(r'amazing_logo_v4(\d+)', id_val)
        rule_label = "after v4"
        if hit is None:
            hit = re.search(r'v4(\d+)', id_val)
            rule_label = "v4 pattern"

        if hit is not None:
            print(f"    -> extracted index ({rule_label}): {int(hit.group(1))}")
        else:
            # Last resort: largest number that is not the version number 4
            non_version = [int(n) for n in digit_runs if int(n) != 4]
            if non_version:
                print(f"    -> extracted index (largest non-4): {max(non_version)}")
            else:
                print("    -> no valid index found")

        id_patterns.append(digit_runs)

    print("\nCorrected approach: Extract numbers after 'v4' prefix, ignoring version number.")

Examining the structure of the first dataset file...
Dataset features: {'image': Image(mode=None, decode=True, id=None), 'text': Value(dtype='string', id=None)}
Dataset length: 13699

First example:
Keys: ['image', 'text']
  image: <class 'PIL.PngImagePlugin.PngImageFile'> - PIL Image with size (512, 512)
  text: Simple elegant logo for Mandarin Oriental, Fan Hong kong Lines Paper, Hospitality, successful vibe, ...

Trying to load the full dataset from directory...
Keys: ['image', 'text']
  image: <class 'PIL.PngImagePlugin.PngImageFile'> - PIL Image with size (512, 512)
  text: Simple elegant logo for Mandarin Oriental, Fan Hong kong Lines Paper, Hospitality, successful vibe, ...

Trying to load the full dataset from directory...


Loading dataset from disk:   0%|          | 0/29 [00:00<?, ?it/s]

Full dataset loaded: DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 397251
    })
})
Train split: 397251 examples
Features: {'image': Image(mode=None, decode=True, id=None), 'text': Value(dtype='string', id=None)}

EXAMINING SAMPLED METADATA ID FORMAT
Sample of IDs from metadata:
   1. amazing_logo_v4000000
   2. amazing_logo_v4000001
   3. amazing_logo_v4000003
   4. amazing_logo_v4000004
   5. amazing_logo_v4000005
   6. amazing_logo_v4000006
   7. amazing_logo_v4000007
   8. amazing_logo_v4000008
   9. amazing_logo_v4000009
  10. amazing_logo_v4000010

Analyzing ID format to extract indices...
  amazing_logo_v4000000 -> all numbers: ['4000000']
    -> extracted index (after v4): 0
  amazing_logo_v4000001 -> all numbers: ['4000001']
    -> extracted index (after v4): 1
  amazing_logo_v4000003 -> all numbers: ['4000003']
    -> extracted index (after v4): 3
  amazing_logo_v4000004 -> all numbers: ['4000004']
    -> extracted index (after v4):

In [7]:
# Complete image extraction from metadata
extracted_images = []
failed_extractions = []

# ID patterns, compiled once instead of being looked up again for every metadata row
_DIRECT_ID_RE = re.compile(r'amazing_logo_v4(\d+)')              # amazing_logo_v4XXXXXX
_VERSION_ID_RE = re.compile(r'v4(\d+)')                           # ...v4XXXXXX
_FILE_ROW_ID_RE = re.compile(r'amazing_logo_v4_(\d{5})_(\d{6})')  # amazing_logo_v4_FFFFF_RRRRRR
_DIGIT_RUN_RE = re.compile(r'\d+')


def _dataset_index_from_id(image_id, dataset_len, n_files):
    """Derive the candidate dataset row index encoded in a metadata id.

    Args:
        image_id: Value from the metadata 'id' column (string or number).
        dataset_len: Number of rows in the loaded dataset split.
        n_files: Number of arrow shard files; only used for the file/row id format.

    Returns:
        The candidate index as an int, or None when no index can be derived.
        The value is NOT range-checked here; the caller does that.
    """
    if isinstance(image_id, str):
        # Patterns 1 and 2: digits directly after 'amazing_logo_v4', else after any 'v4'
        match = _DIRECT_ID_RE.search(image_id) or _VERSION_ID_RE.search(image_id)
        if match:
            return int(match.group(1))

        # Pattern 3: file index + row index.
        # NOTE(review): this assumes every shard holds the same number of rows, so
        # the result is only an estimate and may point at a different image.
        match = _FILE_ROW_ID_RE.search(image_id)
        if match:
            estimated_rows_per_file = dataset_len // n_files
            return int(match.group(1)) * estimated_rows_per_file + int(match.group(2))

        # Pattern 4: largest number in the id that is not the version number 4,
        # falling back to the last number when every number is 4
        numbers = [int(n) for n in _DIGIT_RUN_RE.findall(image_id)]
        if not numbers:
            return None
        non_version = [n for n in numbers if n != 4]
        return max(non_version) if non_version else numbers[-1]

    # Numeric ids are used as the index directly (numpy scalars included)
    if isinstance(image_id, (int, float, np.integer, np.floating)):
        return int(image_id)

    return None


def _as_pil_image(image_data):
    """Return a PIL image for a dataset 'image' value, or None if its type is not handled."""
    if hasattr(image_data, 'save'):  # already a PIL Image
        return image_data
    if isinstance(image_data, dict) and 'bytes' in image_data:
        return Image.open(io.BytesIO(image_data['bytes']))
    if isinstance(image_data, bytes):
        return Image.open(io.BytesIO(image_data))
    return None


def _extract_one(dataset, image_id, category, n_files):
    """Resize and save the image for one metadata row.

    Returns:
        (record, None) on success, where record is the dict stored in
        `extracted_images`; (None, reason) when the row cannot be extracted.
        Unexpected exceptions propagate to the caller.
    """
    dataset_idx = _dataset_index_from_id(image_id, len(dataset), n_files)
    # Negative values are rejected too: they would silently index from the end
    # of the dataset and save the wrong image.
    if dataset_idx is None or not 0 <= dataset_idx < len(dataset):
        return None, f'Could not extract valid index (extracted: {dataset_idx}, max: {len(dataset)-1})'

    # Get the image directly by index
    item = dataset[dataset_idx]
    if 'image' not in item:
        return None, 'No image field in dataset item'

    image_data = item['image']
    image = _as_pil_image(image_data)
    if image is None:
        return None, f'Unknown image format: {type(image_data)}'

    # NOTE(review): convert('RGB') drops any alpha channel without compositing
    # onto a background - confirm that is acceptable for transparent logos.
    image = image.convert('RGB')
    image = image.resize(IMAGE_SIZE, Image.Resampling.LANCZOS)

    image_filename = f"{image_id}.{OUTPUT_FORMAT.lower()}"
    image.save(output_images_path / image_filename, OUTPUT_FORMAT)

    return {
        'id': image_id,
        'category': category,
        'filename': image_filename,
        'dataset_index': dataset_idx
    }, None


if len(sampled_df) == 0:
    print("‚ùå No metadata available for extraction!")
else:
    print(f"Starting complete extraction of {len(sampled_df):,} images...")

    # Load the full dataset once. Only the load itself is guarded, so that a
    # problem later in the loop is not misreported as a dataset loading failure.
    dataset = None
    try:
        print("Loading full dataset...")
        full_dataset = load_from_disk(str(input_dataset_path.parent))
    except Exception as e:
        print(f"‚ùå Could not load dataset: {e}")
        print("This approach requires the full dataset to be loadable.")
    else:
        if 'train' in full_dataset:
            dataset = full_dataset['train']
            print(f"‚úÖ Loaded dataset with {len(dataset):,} examples")
        else:
            print("‚ùå No 'train' split found in dataset")

    if dataset is not None:
        n_files = len(arrow_files)
        metadata_rows = zip(sampled_df['id'], sampled_df['category'])

        # Process each metadata row; tqdm advances once per row automatically
        with tqdm(metadata_rows, total=len(sampled_df), desc="Extracting all images") as pbar:
            for image_id, category in pbar:
                try:
                    record, reason = _extract_one(dataset, image_id, category, n_files)
                except Exception as e:
                    # Best effort: one bad row must not abort the whole run
                    record, reason = None, f'Error: {str(e)}'

                if record is None:
                    failed_extractions.append({'id': image_id, 'reason': reason})
                    continue

                extracted_images.append(record)

                # Update progress description every 1000 images
                if len(extracted_images) % 1000 == 0:
                    pbar.set_description(f"Extracted {len(extracted_images):,} images")

        print(f"\n‚úÖ Complete extraction finished!")
        print(f"  Successfully extracted: {len(extracted_images):,}")
        print(f"  Failed extractions: {len(failed_extractions):,}")
        print(f"  Success rate: {(len(extracted_images)/len(sampled_df)*100):.1f}%")

        # Always show a short sample of failures. Previously the sample was
        # printed only when there were at most 10 failures, which hid the
        # examples exactly when many rows failed.
        if failed_extractions:
            print(f"\nFirst few failures:")
            for i, failure in enumerate(failed_extractions[:5], 1):
                print(f"  {i}. {failure['id']}: {failure['reason']}")

Starting complete extraction of 352,154 images...
Loading full dataset...


Loading dataset from disk:   0%|          | 0/29 [00:00<?, ?it/s]

‚úÖ Loaded dataset with 397,251 examples


Extracted 352,000 images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 352154/352154 [49:31<00:00, 118.51it/s] 


‚úÖ Complete extraction finished!
  Successfully extracted: 352,154
  Failed extractions: 0
  Success rate: 100.0%





In [8]:
# Compare, category by category, the metadata rows against the extracted images
if not extracted_images:
    print("‚ùå No images were extracted!")
else:
    extracted_df = pd.DataFrame(extracted_images)
    category_counts_extracted = extracted_df['category'].value_counts()
    divider = "-" * 65

    print("\nComplete extraction results by category:")
    print(f"{'Category':<30} {'In Metadata':<12} {'Extracted':<10} {'%':<6}")
    print(divider)

    # One table row per metadata category, in alphabetical order
    metadata_counts = sampled_df['category'].value_counts()
    for category in sorted(metadata_counts.index):
        in_metadata = metadata_counts[category]
        extracted = category_counts_extracted.get(category, 0)  # 0 when nothing was extracted
        if in_metadata > 0:
            percentage = extracted / in_metadata * 100
        else:
            percentage = 0
        # Category names are cut to 30 characters to keep the columns aligned
        print(f"{category[:30]:<30} {in_metadata:<12} {extracted:<10} {percentage:>5.1f}%")

    total_in_metadata = len(sampled_df)
    total_extracted = len(extracted_images)
    if total_in_metadata > 0:
        overall_percentage = total_extracted / total_in_metadata * 100
    else:
        overall_percentage = 0

    print(divider)
    print(f"{'TOTAL':<30} {total_in_metadata:<12} {total_extracted:<10} {overall_percentage:>5.1f}%")

    # Largest categories among the extracted images
    print("\nTop 10 categories by extracted image count:")
    for rank, (category, count) in enumerate(category_counts_extracted.iloc[:10].items(), start=1):
        share = count / total_extracted * 100
        print(f"  {rank:2d}. {category}: {count:,} ({share:.1f}%)")


Complete extraction results by category:
Category                       In Metadata  Extracted  %     
-----------------------------------------------------------------
accounting_tax                 661          661        100.0%
aerospace_defense              229          229        100.0%
agriculture_farming            637          637        100.0%
animal_services                744          744        100.0%
arts_culture                   5195         5195       100.0%
auto_services                  392          392        100.0%
automotive_transport           2201         2201       100.0%
aviation_services              734          734        100.0%
bar_nightlife                  495          495        100.0%
beauty_cosmetics               1511         1511       100.0%
beverage_general               3916         3916       100.0%
brewery_alcohol                4233         4233       100.0%
cafe_coffee                    2911         2911       100.0%
catering_events         

In [9]:
# Save complete extraction metadata and summary
if extracted_images:
    extracted_df = pd.DataFrame(extracted_images)
    # The metadata CSV is written to a different directory than the images;
    # make sure it exists so the save cannot fail after the long extraction.
    output_metadata_path.parent.mkdir(parents=True, exist_ok=True)
    extracted_df.to_csv(output_metadata_path, index=False)
    print(f"\n‚úÖ Saved complete extraction metadata to: {output_metadata_path}")
    
    # Show sample of extracted data
    print(f"\nSample of extracted images:")
    print(extracted_df.head())
    
    # Save comprehensive summary statistics
    category_distribution = extracted_df['category'].value_counts().to_dict()
    
    summary_stats = {
        'extraction_type': 'complete_from_metadata9',
        'total_images_in_metadata': len(sampled_df),
        'total_images_extracted': len(extracted_images),
        'extraction_success_rate': (len(extracted_images) / len(sampled_df) * 100) if len(sampled_df) > 0 else 0,
        'categories_processed': len(category_distribution),
        'failed_extractions': len(failed_extractions),
        'image_size': IMAGE_SIZE,
        'output_format': OUTPUT_FORMAT,
        'output_folder': 'total_after_cleanup',
        'category_distribution': category_distribution,
        'source_metadata': 'metadata9.csv'
    }
    
    # The summary is written next to (not inside) the image folder
    summary_path = output_images_path.parent / 'complete_extraction_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary_stats, f, indent=2)
    
    print(f"\n‚úÖ Saved complete extraction summary to: {summary_path}")
    print(f"\nFinal Extraction Summary:")
    print(f"  Total images in metadata9.csv: {summary_stats['total_images_in_metadata']:,}")
    print(f"  Images successfully extracted: {summary_stats['total_images_extracted']:,}")
    print(f"  Extraction success rate: {summary_stats['extraction_success_rate']:.1f}%")
    print(f"  Categories processed: {summary_stats['categories_processed']}")
    print(f"  Failed extractions: {summary_stats['failed_extractions']}")
    print(f"  Output folder: {output_images_path}")
        
else:
    print("\n‚ùå No images were extracted!")
    if failed_extractions:
        print(f"Total failures: {len(failed_extractions)}")
        print("Sample failures:")
        for i, failure in enumerate(failed_extractions[:10], 1):
            print(f"  {i:2d}. {failure['id']}: {failure['reason']}")

# Save failed extractions for debugging if any. This runs whether or not any
# image was extracted: a run in which every row failed is exactly the case
# where the failure report is needed, and it was previously never written.
if failed_extractions:
    failed_df = pd.DataFrame(failed_extractions)
    failed_path = output_images_path.parent / 'failed_extractions_complete.csv'
    failed_df.to_csv(failed_path, index=False)
    print(f"  Failed extractions saved to: {failed_path}")
    
    # Show failure analysis: how often each failure reason occurred
    failure_reasons = failed_df['reason'].value_counts()
    print(f"\nFailure analysis:")
    for reason, count in failure_reasons.items():
        print(f"  {reason}: {count}")
            
print(f"\nüéØ Extraction complete! All images from metadata9.csv have been processed.")
print(f"Check the '{output_images_path.name}' folder for your extracted images.")


‚úÖ Saved complete extraction metadata to: ..\..\output\amazing_logos_v4\data\total_after_cleanup_metadata.csv

Sample of extracted images:
                      id            category                   filename  \
0  amazing_logo_v4000000      hotels_lodging  amazing_logo_v4000000.png   
1  amazing_logo_v4000001  chemical_materials  amazing_logo_v4000001.png   
2  amazing_logo_v4000003        unclassified  amazing_logo_v4000003.png   
3  amazing_logo_v4000004          film_video  amazing_logo_v4000004.png   
4  amazing_logo_v4000005        unclassified  amazing_logo_v4000005.png   

   dataset_index  
0              0  
1              1  
2              3  
3              4  
4              5  

‚úÖ Saved complete extraction summary to: ..\..\output\amazing_logos_v4\images\complete_extraction_summary.json

Final Extraction Summary:
  Total images in metadata9.csv: 352,154
  Images successfully extracted: 352,154
  Extraction success rate: 100.0%
  Categories processed: 109
  Failed e