# Create metadata10.csv from total_filtered

This slim notebook reads metadata9 (`metadata9_with_main.csv`) and keeps only the rows whose `id` matches the file name (without extension) of an image in the `total_filtered` folder. The result is saved as `metadata10.csv`.

In [1]:
from pathlib import Path
import pandas as pd

# Input/output locations. All paths are relative and are meant to be
# interpreted from this notebook's folder.
base_output = Path('../../output/amazing_logos_v4')
data_root = base_output.joinpath('data')

images_dir = base_output.joinpath('images', 'total_filtered')
metadata9_path = data_root.joinpath('amazing_logos_v4_cleanup', 'metadata9_with_main.csv')
metadata10_dir = data_root.joinpath('amazing_logos_v4_image_prep')
metadata10_path = metadata10_dir.joinpath('metadata10.csv')

# Create the output folder (and any missing parents) before anything is written.
metadata10_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log shows what was read and written.
print(f'Images folder: {images_dir}')
print(f'Metadata9:    {metadata9_path}')
print(f'Output (metadata10): {metadata10_path}')

# Gather the IDs of the images in total_filtered. An image's ID is its file
# name with the extension removed; only the extensions below are recognised
# (compared case-insensitively).
valid_exts = {'.png', '.jpg', '.jpeg', '.webp'}

# Fail early if the images folder is missing.
if not images_dir.exists():
    raise FileNotFoundError(f'Images directory not found: {images_dir}')

image_ids = {
    entry.stem
    for entry in images_dir.iterdir()
    if entry.is_file() and entry.suffix.lower() in valid_exts
}

print(f'Found {len(image_ids):,} image IDs in total_filtered')

# Load metadata9 (it must contain at least an 'id' column).
if not metadata9_path.exists():
    raise FileNotFoundError(f'Metadata9 not found: {metadata9_path}')

# Read 'id' as text so pandas never infers a numeric dtype for it: a
# numeric-looking ID would lose leading zeros (or gain a '.0' suffix) and then
# fail the string comparison against image file names.
meta9 = pd.read_csv(metadata9_path, dtype={'id': str})
if 'id' not in meta9.columns:
    raise ValueError('metadata9.csv must contain an id column')

# Keep only the metadata rows whose id (compared as text) belongs to image_ids.
id_as_text = meta9['id'].astype(str)
has_image = id_as_text.isin(image_ids)
meta10 = meta9.loc[has_image].copy()  # copy: meta10 is independent of meta9
print(f'Kept {len(meta10):,} rows out of {len(meta9):,}')

# Write the filtered table without the DataFrame index column.
meta10.to_csv(metadata10_path, index=False)
print(f'✅ Saved metadata10.csv to: {metadata10_path}')

Images folder: ..\..\output\amazing_logos_v4\images\total_filtered
Metadata9:    ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata9_with_main.csv
Output (metadata10): ..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\metadata10.csv
Found 177,224 image IDs in total_filtered
Kept 177,224 rows out of 352,173
✅ Saved metadata10.csv to: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\metadata10.csv


In [2]:
# Display the filtered frame as the cell output (the rendered preview is
# truncated to the first and last rows).
meta10

Unnamed: 0,id,company,description,category,tags,category_main
0,amazing_logo_v4000000,Simple elegant logo for Mandarin Oriental,Fan Hong kong Lines Paper,hotels_lodging,"successful_vibe,minimalist,thoughtprovoking,ab...",retail_hospitality
1,amazing_logo_v4000001,Simple elegant logo for Alfa,Hexagon Poland Triangles,chemical_materials,"successful_vibe,minimalist,thoughtprovoking,ab...",manufacturing_transport
2,amazing_logo_v4000003,Simple elegant logo for Valwood Park,Lines Rounded United states V,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other
3,amazing_logo_v4000004,Simple elegant logo for Cinepaq,C Circle Film reel Spain Square,film_video,"successful_vibe,minimalist,thoughtprovoking,ab...",entertainment_sports_media
4,amazing_logo_v4000005,Simple elegant logo for Baumechanik Barleben,B Circle Germany,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other
...,...,...,...,...,...,...
352163,amazing_logo_v4397239,Simple elegant logo for Trammell Crow Resident...,Alexan Heartwood Tree H Green Issaquah Washing...,unclassified,"real_estate, successful_vibe, minimalist, thou...",other
352166,amazing_logo_v4397243,Simple elegant logo for Selkirk Sport,red s italic pickleball Selkirk Sport modern i...,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other
352167,amazing_logo_v4397244,Simple elegant logo for Eye To Eye,optometry care glasses sight vision eye,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other
352170,amazing_logo_v4397248,Simple elegant logo for Elite Angling,fishing fish boat truck drive,sports_recreation,"successful_vibe,minimalist,thoughtprovoking,ab...",entertainment_sports_media
