# Balanced Image Extraction from Amazing Logos V4

This notebook extracts a balanced sample of images from the Amazing Logos V4 dataset:
- Loads the HuggingFace dataset from input/amazing_logos_v4/
- Loads metadata9.csv for category information
- Samples metadata with equal distribution across categories
- Extracts images efficiently using index-based access
- Configurable total number of images to extract

In [1]:
# Configuration parameters
TOTAL_IMAGES_TO_EXTRACT = 2000  # Total number of images to extract
IMAGE_SIZE = (512, 512)  # Size to save images
OUTPUT_FORMAT = 'PNG'  # Image format to save
OUTPUT_NAME = 'balanced_sample_2k_512x512' # Output folder and file name prefix

print(f"Configuration:")
print(f"  Total images to extract: {TOTAL_IMAGES_TO_EXTRACT:,}")
print(f"  Image size: {IMAGE_SIZE}")
print(f"  Output format: {OUTPUT_FORMAT}")
print(f"  Output name: {OUTPUT_NAME}")

Configuration:
  Total images to extract: 2,000
  Image size: (512, 512)
  Output format: PNG
  Output name: balanced_sample_2k_512x512


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from datasets import Dataset
from PIL import Image
import io
from collections import defaultdict, Counter
from tqdm import tqdm
import json
import os

# Setup paths
input_dataset_path = Path('../../input/amazing_logos_v4/train')
metadata_path = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_image_prep/metadata10.csv')
output_images_path = Path(f'../../output/amazing_logos_v4/images/{OUTPUT_NAME}')
output_metadata_path = Path(f'../../output/amazing_logos_v4/data/amazing_logos_v4_image_prep/{OUTPUT_NAME}_metadata.csv')

# Create output directory
output_images_path.mkdir(parents=True, exist_ok=True)

print(f"Input dataset: {input_dataset_path}")
print(f"Metadata file: {metadata_path}")
print(f"Output images: {output_images_path}")
print(f"Output metadata: {output_metadata_path}")

# Check if paths exist
if not input_dataset_path.exists():
    print(f"❌ Input dataset path does not exist: {input_dataset_path}")
if not metadata_path.exists():
    print(f"❌ Metadata file does not exist: {metadata_path}")
else:
    print("✅ All required paths exist")

  from .autonotebook import tqdm as notebook_tqdm


Input dataset: ..\..\input\amazing_logos_v4\train
Metadata file: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\metadata10.csv
Output images: ..\..\output\amazing_logos_v4\images\balanced_sample_2k_512x512
Output metadata: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\balanced_sample_2k_512x512_metadata.csv
✅ All required paths exist


In [3]:
# Load metadata to get category information
print("Loading metadata...")
try:
    # Load metadata in chunks due to large size
    metadata_chunks = []
    chunk_size = 50000
    
    for chunk in pd.read_csv(metadata_path, chunksize=chunk_size):
        metadata_chunks.append(chunk[['id', 'category_main']])  # Only load relevant columns
    
    metadata_df = pd.concat(metadata_chunks, ignore_index=True)
    print(f"✅ Loaded metadata: {len(metadata_df):,} rows")
    
    # Show category distribution
    category_counts = metadata_df['category_main'].value_counts()
    print(f"\nFound {len(category_counts)} unique categories")
    print(f"Top 10 categories:")
    for i, (cat, count) in enumerate(category_counts.head(10).items(), 1):
        percentage = (count / len(metadata_df)) * 100
        print(f"  {i:2d}. {cat}: {count:,} ({percentage:.1f}%)")
    
except Exception as e:
    print(f"❌ Error loading metadata: {e}")
    raise

Loading metadata...
✅ Loaded metadata: 177,224 rows

Found 10 unique categories
Top 10 categories:
   1. other: 65,033 (36.7%)
   2. tech: 17,631 (9.9%)
   3. retail_hospitality: 17,380 (9.8%)
   4. entertainment_sports_media: 15,427 (8.7%)
   5. food_beverage: 12,835 (7.2%)
   6. health: 12,277 (6.9%)
   7. professional_financial_legal: 11,931 (6.7%)
   8. real_estate_construction: 11,022 (6.2%)
   9. manufacturing_transport: 8,508 (4.8%)
  10. education: 5,180 (2.9%)
✅ Loaded metadata: 177,224 rows

Found 10 unique categories
Top 10 categories:
   1. other: 65,033 (36.7%)
   2. tech: 17,631 (9.9%)
   3. retail_hospitality: 17,380 (9.8%)
   4. entertainment_sports_media: 15,427 (8.7%)
   5. food_beverage: 12,835 (7.2%)
   6. health: 12,277 (6.9%)
   7. professional_financial_legal: 11,931 (6.7%)
   8. real_estate_construction: 11,022 (6.2%)
   9. manufacturing_transport: 8,508 (4.8%)
  10. education: 5,180 (2.9%)


In [4]:
# Sample metadata with balanced distribution across categories
unique_categories = metadata_df['category_main'].unique()
num_categories = len(unique_categories)
target_per_category = TOTAL_IMAGES_TO_EXTRACT // num_categories
remainder = TOTAL_IMAGES_TO_EXTRACT % num_categories

print(f"Balanced sampling strategy:")
print(f"  Total categories: {num_categories}")
print(f"  Base images per category: {target_per_category}")
print(f"  Remainder to distribute: {remainder}")

# Sample from metadata directly
sampled_metadata = []
for i, category in enumerate(unique_categories):
    target = target_per_category + (1 if i < remainder else 0)
    
    # Get all rows for this category
    category_rows = metadata_df[metadata_df['category_main'] == category]
    available = len(category_rows)
    actual_target = min(target, available)
    
    # Sample randomly from this category
    if actual_target > 0:
        sampled_rows = category_rows.sample(n=actual_target, random_state=42)
        sampled_metadata.append(sampled_rows)
        print(f"  {category}: sampled {actual_target}/{available}")

# Combine all sampled metadata
if sampled_metadata:
    sampled_df = pd.concat(sampled_metadata, ignore_index=True)
    print(f"\nTotal sampled metadata: {len(sampled_df):,} rows")
    
    # Show distribution
    sampled_counts = sampled_df['category_main'].value_counts()
    print(f"\nSampled distribution:")
    for category, count in sampled_counts.head(10).items():
        print(f"  {category}: {count}")
else:
    print("❌ No metadata could be sampled!")
    sampled_df = pd.DataFrame()

Balanced sampling strategy:
  Total categories: 10
  Base images per category: 200
  Remainder to distribute: 0
  retail_hospitality: sampled 200/17380
  manufacturing_transport: sampled 200/8508
  other: sampled 200/65033
  entertainment_sports_media: sampled 200/15427
  health: sampled 200/12277
  education: sampled 200/5180
  food_beverage: sampled 200/12835
  real_estate_construction: sampled 200/11022
  tech: sampled 200/17631
  professional_financial_legal: sampled 200/11931

Total sampled metadata: 2,000 rows

Sampled distribution:
  retail_hospitality: 200
  manufacturing_transport: 200
  other: 200
  entertainment_sports_media: 200
  health: 200
  education: 200
  food_beverage: 200
  real_estate_construction: 200
  tech: 200
  professional_financial_legal: 200


In [5]:
# Get list of arrow files
arrow_files = list(input_dataset_path.glob('data-*.arrow'))
arrow_files.sort()

print(f"Found {len(arrow_files)} arrow files:")
for i, file in enumerate(arrow_files[:5], 1):
    print(f"  {i}. {file.name}")
if len(arrow_files) > 5:
    print(f"  ... and {len(arrow_files) - 5} more")

# Update the variable name for consistency
parquet_files = arrow_files  # Keep the same variable name to avoid breaking other code

Found 29 arrow files:
  1. data-00000-of-00029.arrow
  2. data-00001-of-00029.arrow
  3. data-00002-of-00029.arrow
  4. data-00003-of-00029.arrow
  5. data-00004-of-00029.arrow
  ... and 24 more


In [6]:
# Debug: Examine the structure and ID format
if arrow_files:
    print("Examining the structure of the first dataset file...")
    try:
        # Try loading with datasets library
        dataset = Dataset.from_file(str(arrow_files[0]))
        print(f"Dataset features: {dataset.features}")
        print(f"Dataset length: {len(dataset)}")
        
        if len(dataset) > 0:
            print("\nFirst example:")
            first_example = dataset[0]
            print("Keys:", list(first_example.keys()))
            
            for key, value in first_example.items():
                if key == 'image':
                    print(f"  {key}: {type(value)} - PIL Image with size {value.size if hasattr(value, 'size') else 'unknown'}")
                elif isinstance(value, str) and len(value) > 100:
                    print(f"  {key}: {value[:100]}...")
                else:
                    print(f"  {key}: {value}")
                    
        # Check if we can load the full dataset
        print(f"\nTrying to load the full dataset from directory...")
        try:
            from datasets import load_from_disk
            full_dataset = load_from_disk(str(input_dataset_path.parent))
            print(f"Full dataset loaded: {full_dataset}")
            if 'train' in full_dataset:
                train_data = full_dataset['train']
                print(f"Train split: {len(train_data)} examples")
                print(f"Features: {train_data.features}")
        except Exception as e:
            print(f"Could not load full dataset: {e}")
            
    except Exception as e:
        print(f"Error examining dataset file: {e}")

# Examine sampled metadata ID format
print(f"\n" + "="*50)
print("EXAMINING SAMPLED METADATA ID FORMAT")
print("="*50)

if len(sampled_df) > 0:
    print(f"Sample of IDs from metadata:")
    sample_ids = sampled_df['id'].head(10).tolist()
    for i, id_val in enumerate(sample_ids, 1):
        print(f"  {i:2d}. {id_val}")
    
    # Try to extract index pattern
    print(f"\nAnalyzing ID format to extract indices...")
    id_patterns = []
    for id_val in sample_ids:
        if isinstance(id_val, str):
            # Try to extract numeric parts
            import re
            numbers = re.findall(r'\d+', id_val)
            if numbers:
                print(f"  {id_val} -> all numbers: {numbers}")
                
                # Try the corrected pattern for v4
                match = re.search(r'amazing_logo_v4(\d+)', id_val)
                if match:
                    extracted_idx = int(match.group(1))
                    print(f"    -> extracted index (after v4): {extracted_idx}")
                else:
                    # Alternative pattern
                    match = re.search(r'v4(\d+)', id_val)
                    if match:
                        extracted_idx = int(match.group(1))
                        print(f"    -> extracted index (v4 pattern): {extracted_idx}")
                    else:
                        # Filter out version number
                        filtered_numbers = [int(n) for n in numbers if int(n) != 4]
                        if filtered_numbers:
                            extracted_idx = max(filtered_numbers)
                            print(f"    -> extracted index (largest non-4): {extracted_idx}")
                        else:
                            print(f"    -> no valid index found")
                
                id_patterns.append(numbers)
            else:
                print(f"  {id_val} -> no numbers found")
        else:
            print(f"  {id_val} -> non-string ID")
    
    print(f"\nCorrected approach: Extract numbers after 'v4' prefix, ignoring version number.")
else:
    print("No sampled metadata available for ID analysis!")

Examining the structure of the first dataset file...
Dataset features: {'image': Image(mode=None, decode=True), 'text': Value('string')}
Dataset length: 13699

First example:
Keys: ['image', 'text']
  image: <class 'PIL.PngImagePlugin.PngImageFile'> - PIL Image with size (512, 512)
  text: Simple elegant logo for Mandarin Oriental, Fan Hong kong Lines Paper, Hospitality, successful vibe, ...

Trying to load the full dataset from directory...
Keys: ['image', 'text']
  image: <class 'PIL.PngImagePlugin.PngImageFile'> - PIL Image with size (512, 512)
  text: Simple elegant logo for Mandarin Oriental, Fan Hong kong Lines Paper, Hospitality, successful vibe, ...

Trying to load the full dataset from directory...
Full dataset loaded: DatasetDict({
    train: Dataset({
        features: ['image', 'text'],
        num_rows: 397251
    })
})
Train split: 397251 examples
Features: {'image': Image(mode=None, decode=True), 'text': Value('string')}

EXAMINING SAMPLED METADATA ID FORMAT
Sample of ID

In [7]:
# Efficient index-based image extraction
extracted_images = []
failed_extractions = []

if len(sampled_df) == 0:
    print("❌ No sampled metadata available for extraction!")
else:
    print(f"Starting efficient extraction of {len(sampled_df)} images...")
    
    # Load the full dataset once
    try:
        from datasets import load_from_disk
        print("Loading full dataset...")
        full_dataset = load_from_disk(str(input_dataset_path.parent))
        
        if 'train' in full_dataset:
            dataset = full_dataset['train']
            print(f"✅ Loaded dataset with {len(dataset)} examples")
            
            # Process each sampled metadata row
            for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc="Extracting images"):
                image_id = row['id']
                category = row['category_main']
                
                try:
                    # Extract dataset index from ID
                    dataset_idx = None
                    
                    if isinstance(image_id, str):
                        # Try different patterns to extract index
                        import re
                        
                        # Pattern 1: amazing_logo_v4XXXXXXX (digits after v4)
                        match = re.search(r'amazing_logo_v4(\d+)', image_id)
                        if match:
                            dataset_idx = int(match.group(1))
                        else:
                            # Pattern 2: Look for version pattern and extract number after it
                            match = re.search(r'v4(\d+)', image_id)
                            if match:
                                dataset_idx = int(match.group(1))
                            else:
                                # Pattern 3: amazing_logo_v4_XXXXX_XXXXXX (file_row format)
                                match = re.search(r'amazing_logo_v4_(\d{5})_(\d{6})', image_id)
                                if match:
                                    file_idx = int(match.group(1))
                                    row_idx = int(match.group(2))
                                    # Estimate dataset index (rough approximation)
                                    # This assumes each file has roughly the same number of rows
                                    estimated_rows_per_file = len(dataset) // len(arrow_files)
                                    dataset_idx = file_idx * estimated_rows_per_file + row_idx
                                else:
                                    # Pattern 4: Extract all numbers and use the last/largest one
                                    numbers = re.findall(r'\d+', image_id)
                                    if numbers:
                                        # Filter out the version number (4) and use the actual index
                                        filtered_numbers = [int(n) for n in numbers if int(n) != 4]
                                        if filtered_numbers:
                                            dataset_idx = max(filtered_numbers)  # Use largest number as index
                                        else:
                                            dataset_idx = int(numbers[-1])  # Fallback to last number
                    
                    elif isinstance(image_id, (int, float)):
                        dataset_idx = int(image_id)
                    
                    if dataset_idx is None or dataset_idx >= len(dataset):
                        failed_extractions.append({
                            'id': image_id,
                            'reason': f'Could not extract valid index (extracted: {dataset_idx}, max: {len(dataset)-1})'
                        })
                        continue
                    
                    # Get the image directly by index
                    item = dataset[dataset_idx]
                    
                    if 'image' not in item:
                        failed_extractions.append({
                            'id': image_id,
                            'reason': 'No image field in dataset item'
                        })
                        continue
                    
                    image_data = item['image']
                    
                    # Handle PIL Image (most common in HuggingFace datasets)
                    if hasattr(image_data, 'save'):  # PIL Image
                        image = image_data
                    elif isinstance(image_data, dict) and 'bytes' in image_data:
                        image_bytes = image_data['bytes']
                        image = Image.open(io.BytesIO(image_bytes))
                    elif isinstance(image_data, bytes):
                        image = Image.open(io.BytesIO(image_data))
                    else:
                        failed_extractions.append({
                            'id': image_id,
                            'reason': f'Unknown image format: {type(image_data)}'
                        })
                        continue
                    
                    # Resize image
                    image = image.convert('RGB')
                    image = image.resize(IMAGE_SIZE, Image.Resampling.LANCZOS)
                    
                    # Save image
                    image_filename = f"{image_id}.{OUTPUT_FORMAT.lower()}"
                    image_path = output_images_path / image_filename
                    image.save(image_path, OUTPUT_FORMAT)
                    
                    # Record successful extraction
                    extracted_images.append({
                        'id': image_id,
                        'category_main': category,
                        'filename': image_filename,
                        'dataset_index': dataset_idx
                    })
                    
                    # Progress update every 100 images
                    if len(extracted_images) % 100 == 0:
                        print(f"  Extracted {len(extracted_images)} images...")
                        
                except Exception as e:
                    failed_extractions.append({
                        'id': image_id,
                        'reason': f'Error: {str(e)}'
                    })
                    continue
            
            print(f"\n✅ Extraction completed!")
            print(f"  Successfully extracted: {len(extracted_images):,}")
            print(f"  Failed extractions: {len(failed_extractions):,}")
            
            if failed_extractions and len(failed_extractions) <= 10:
                print(f"\nFirst few failures:")
                for i, failure in enumerate(failed_extractions[:5], 1):
                    print(f"  {i}. {failure['id']}: {failure['reason']}")
                    
        else:
            print("❌ No 'train' split found in dataset")
            
    except Exception as e:
        print(f"❌ Could not load dataset: {e}")
        print("This approach requires the full dataset to be loadable.")

Starting efficient extraction of 2000 images...
Loading full dataset...
✅ Loaded dataset with 397251 examples
✅ Loaded dataset with 397251 examples


Extracting images:   6%|▌         | 112/2000 [00:01<00:20, 93.20it/s]

  Extracted 100 images...


Extracting images:  11%|█         | 211/2000 [00:02<00:20, 85.35it/s]

  Extracted 200 images...


Extracting images:  16%|█▌        | 315/2000 [00:03<00:18, 90.35it/s]

  Extracted 300 images...


Extracting images:  21%|██        | 419/2000 [00:04<00:17, 92.43it/s]

  Extracted 400 images...


Extracting images:  26%|██▌       | 512/2000 [00:05<00:17, 84.64it/s]

  Extracted 500 images...


Extracting images:  30%|███       | 608/2000 [00:06<00:14, 96.90it/s] 

  Extracted 600 images...


Extracting images:  36%|███▌      | 712/2000 [00:07<00:13, 96.19it/s]

  Extracted 700 images...


Extracting images:  41%|████      | 815/2000 [00:08<00:13, 88.87it/s]

  Extracted 800 images...


Extracting images:  46%|████▌     | 916/2000 [00:10<00:13, 82.38it/s]

  Extracted 900 images...


Extracting images:  51%|█████     | 1013/2000 [00:11<00:11, 88.06it/s]

  Extracted 1000 images...


Extracting images:  56%|█████▌    | 1111/2000 [00:12<00:10, 87.97it/s]

  Extracted 1100 images...


Extracting images:  61%|██████    | 1220/2000 [00:13<00:08, 93.63it/s]

  Extracted 1200 images...


Extracting images:  66%|██████▌   | 1317/2000 [00:14<00:06, 98.07it/s] 

  Extracted 1300 images...


Extracting images:  70%|███████   | 1409/2000 [00:15<00:06, 91.82it/s]

  Extracted 1400 images...


Extracting images:  76%|███████▌  | 1512/2000 [00:16<00:05, 92.64it/s]

  Extracted 1500 images...


Extracting images:  81%|████████  | 1613/2000 [00:17<00:04, 93.51it/s]

  Extracted 1600 images...


Extracting images:  86%|████████▌ | 1713/2000 [00:18<00:02, 96.16it/s]

  Extracted 1700 images...


Extracting images:  91%|█████████ | 1811/2000 [00:19<00:02, 87.71it/s]

  Extracted 1800 images...


Extracting images:  96%|█████████▌| 1913/2000 [00:21<00:00, 90.40it/s]

  Extracted 1900 images...


Extracting images: 100%|██████████| 2000/2000 [00:22<00:00, 90.86it/s] 

  Extracted 2000 images...

✅ Extraction completed!
  Successfully extracted: 2,000
  Failed extractions: 0





In [8]:
# Show extraction results by category
if extracted_images:
    extracted_df = pd.DataFrame(extracted_images)
    category_counts_extracted = extracted_df['category_main'].value_counts()
    
    print("\nExtraction results by category:")
    print(f"{'Category':<30} {'Target':<8} {'Extracted':<10} {'%':<6}")
    print("-" * 60)

    sampled_counts = sampled_df['category_main'].value_counts()
    for category in sorted(sampled_counts.index):
        target = sampled_counts[category]
        extracted = category_counts_extracted.get(category, 0)
        percentage = (extracted / target * 100) if target > 0 else 0
        print(f"{category[:30]:<30} {target:<8} {extracted:<10} {percentage:>5.1f}%")

    total_target = len(sampled_df)
    total_extracted = len(extracted_images)
    overall_percentage = (total_extracted / total_target * 100) if total_target > 0 else 0

    print("-" * 60)
    print(f"{'TOTAL':<30} {total_target:<8} {total_extracted:<10} {overall_percentage:>5.1f}%")
else:
    print("❌ No images were extracted!")


Extraction results by category:
Category                       Target   Extracted  %     
------------------------------------------------------------
education                      200      200        100.0%
entertainment_sports_media     200      200        100.0%
food_beverage                  200      200        100.0%
health                         200      200        100.0%
manufacturing_transport        200      200        100.0%
other                          200      200        100.0%
professional_financial_legal   200      200        100.0%
real_estate_construction       200      200        100.0%
retail_hospitality             200      200        100.0%
tech                           200      200        100.0%
------------------------------------------------------------
TOTAL                          2000     2000       100.0%


In [9]:
# Save extraction metadata and summary
if extracted_images:
    extracted_df = pd.DataFrame(extracted_images)
    extracted_df.to_csv(output_metadata_path, index=False)
    print(f"\n✅ Saved extraction metadata to: {output_metadata_path}")
    
    # Show sample of extracted data
    print(f"\nSample of extracted images:")
    print(extracted_df.head())
    
    # Save summary statistics
    category_distribution = extracted_df['category_main'].value_counts().to_dict()
    
    summary_stats = {
        'total_images_extracted': len(extracted_images),
        'total_images_target': TOTAL_IMAGES_TO_EXTRACT,
        'total_images_sampled': len(sampled_df),
        'extraction_percentage': (len(extracted_images) / len(sampled_df) * 100) if len(sampled_df) > 0 else 0,
        'categories_processed': len(category_distribution),
        'failed_extractions': len(failed_extractions),
        'image_size': IMAGE_SIZE,
        'output_format': OUTPUT_FORMAT,
        'category_distribution': category_distribution
    }
    
    summary_path = output_images_path.parent / 'extraction_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary_stats, f, indent=2)
    
    print(f"\n✅ Saved extraction summary to: {summary_path}")
    print(f"\nExtraction Summary:")
    print(f"  Images sampled from metadata: {summary_stats['total_images_sampled']:,}")
    print(f"  Images successfully extracted: {summary_stats['total_images_extracted']:,}")
    print(f"  Extraction success rate: {summary_stats['extraction_percentage']:.1f}%")
    print(f"  Categories: {summary_stats['categories_processed']}")
    print(f"  Failed extractions: {summary_stats['failed_extractions']}")
    
    # Save failed extractions for debugging if any
    if failed_extractions:
        failed_df = pd.DataFrame(failed_extractions)
        failed_path = output_images_path.parent / 'failed_extractions.csv'
        failed_df.to_csv(failed_path, index=False)
        print(f"  Failed extractions saved to: {failed_path}")
        
else:
    print("\n❌ No images were extracted!")
    if failed_extractions:
        print(f"Total failures: {len(failed_extractions)}")
        print("Sample failures:")
        for i, failure in enumerate(failed_extractions[:5], 1):
            print(f"  {i}. {failure['id']}: {failure['reason']}")


✅ Saved extraction metadata to: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\balanced_sample_2k_512x512_metadata.csv

Sample of extracted images:
                      id       category_main                   filename  \
0  amazing_logo_v4294142  retail_hospitality  amazing_logo_v4294142.png   
1  amazing_logo_v4201237  retail_hospitality  amazing_logo_v4201237.png   
2  amazing_logo_v4164292  retail_hospitality  amazing_logo_v4164292.png   
3  amazing_logo_v4280843  retail_hospitality  amazing_logo_v4280843.png   
4  amazing_logo_v4007417  retail_hospitality  amazing_logo_v4007417.png   

   dataset_index  
0         294142  
1         201237  
2         164292  
3         280843  
4           7417  

✅ Saved extraction summary to: ..\..\output\amazing_logos_v4\images\extraction_summary.json

Extraction Summary:
  Images sampled from metadata: 2,000
  Images successfully extracted: 2,000
  Extraction success rate: 100.0%
  Categories: 10
  Failed extractions: 0
