# Filter Metadata by Available Maps

This notebook:
1. Loads `metadata10.csv`.
2. Lists available map images in `balanced_sample_2k_512x512_maps`.
3. Filters the metadata to include only rows for which a map image exists.
4. Reorders the columns and saves the result as `metadata_filtered_by_maps.csv` in the `meta_postprep` subfolder of the output data directory.

In [1]:
# Imports & Paths
from pathlib import Path
import pandas as pd
import os

# Anchor every location to the notebook's folder. `__file__` is normally absent
# in a notebook kernel, in which case the current working directory is used.
BASE_PATH = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()
PROJECT_ROOT = BASE_PATH / '../..'
META_PATH = PROJECT_ROOT / 'output' / 'amazing_logos_v4' / 'data' / 'amazing_logos_v4_image_prep' / 'metadata10.csv'
MAPS_DIR = PROJECT_ROOT / 'output' / 'amazing_logos_v4' / 'images' / 'balanced_sample_2k_512x512_maps'
OUTPUT_DATA_DIR = PROJECT_ROOT / 'output' / 'amazing_logos_v4' / 'data'
FINAL_PATH = OUTPUT_DATA_DIR / 'meta_postprep' / 'metadata_filtered_by_maps.csv'

# The final CSV lives in a subfolder of OUTPUT_DATA_DIR, so that subfolder must
# exist as well: writing the CSV does not create missing parent directories.
OUTPUT_DATA_DIR.mkdir(parents=True, exist_ok=True)
FINAL_PATH.parent.mkdir(parents=True, exist_ok=True)

print('Metadata path:', META_PATH)
print('Maps dir     :', MAPS_DIR)
print('Output dir   :', OUTPUT_DATA_DIR)

# Load the full metadata table; the last expression previews the first rows.
metadata = pd.read_csv(META_PATH)
print('Loaded metadata rows:', len(metadata))
metadata.head()

Metadata path: c:\studium\master_thesis\data_prep\notebooks\meta_postprep\..\..\output\amazing_logos_v4\data\amazing_logos_v4_image_prep\metadata10.csv
Maps dir     : c:\studium\master_thesis\data_prep\notebooks\meta_postprep\..\..\output\amazing_logos_v4\images\balanced_sample_2k_512x512_maps
Output dir   : c:\studium\master_thesis\data_prep\notebooks\meta_postprep\..\..\output\amazing_logos_v4\data
Loaded metadata rows: 177224
Loaded metadata rows: 177224


Unnamed: 0,id,company,description,category,tags,category_main
0,amazing_logo_v4000000,Simple elegant logo for Mandarin Oriental,Fan Hong kong Lines Paper,hotels_lodging,"successful_vibe,minimalist,thoughtprovoking,ab...",retail_hospitality
1,amazing_logo_v4000001,Simple elegant logo for Alfa,Hexagon Poland Triangles,chemical_materials,"successful_vibe,minimalist,thoughtprovoking,ab...",manufacturing_transport
2,amazing_logo_v4000003,Simple elegant logo for Valwood Park,Lines Rounded United states V,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other
3,amazing_logo_v4000004,Simple elegant logo for Cinepaq,C Circle Film reel Spain Square,film_video,"successful_vibe,minimalist,thoughtprovoking,ab...",entertainment_sports_media
4,amazing_logo_v4000005,Simple elegant logo for Baumechanik Barleben,B Circle Germany,unclassified,"successful_vibe, minimalist, thoughtprovoking,...",other


In [2]:
# Gather the IDs (file stems) of all map images present in MAPS_DIR
SUPPORTED_EXTS = {'.png', '.jpg', '.jpeg', '.webp'}

# Fail early when the image folder is missing instead of yielding an empty list.
if not MAPS_DIR.exists():
    raise FileNotFoundError(f"Maps directory not found: {MAPS_DIR}")

# A set comprehension removes duplicate stems (e.g. the same ID saved with two
# extensions); sorting makes the resulting list deterministic.
map_ids = sorted(
    {
        entry.stem
        for entry in MAPS_DIR.iterdir()
        if entry.is_file() and entry.suffix.lower() in SUPPORTED_EXTS
    }
)
print('Found map images:', len(map_ids))
map_ids[:5]

Found map images: 1810


['amazing_logo_v4000185',
 'amazing_logo_v4000313',
 'amazing_logo_v4000469',
 'amazing_logo_v4000481',
 'amazing_logo_v4000523']

In [3]:
# Filter metadata rows to those with matching IDs
map_id_set = set(map_ids)
has_map = metadata['id'].isin(map_id_set)  # boolean mask, one entry per metadata row
filtered = metadata[has_map].copy()
print('Filtered rows:', len(filtered))

# Unique IDs that have no map image (kept as a set, as before).
missing_from_maps = set(metadata['id']) - set(filtered['id'])
# Count rows from the mask so the number matches its label even if an ID repeats.
print('Metadata rows without map image:', int((~has_map).sum()))

# Report map images that have no metadata row; the filter above would
# otherwise leave them out of the result without any indication.
maps_without_metadata = sorted(map_id_set - set(metadata['id']))
print('Map images without metadata row:', len(maps_without_metadata))
filtered.head()

Filtered rows: 1810
Metadata rows without map image: 175414


Unnamed: 0,id,company,description,category,tags,category_main
105,amazing_logo_v4000185,Simple elegant logo for Santa Fe By Design,Drop Oval United states Water,textiles_manufacturing,"successful_vibe,minimalist,thoughtprovoking,ab...",manufacturing_transport
174,amazing_logo_v4000313,Simple elegant logo for Bluebird Cafe,Bird Fork United states,restaurant_dining,"successful_vibe,minimalist,thoughtprovoking,ab...",food_beverage
257,amazing_logo_v4000469,Simple elegant logo for Collezione Nai,A I Italy N,home_improvement,"successful_vibe,minimalist,thoughtprovoking,ab...",retail_hospitality
264,amazing_logo_v4000481,Simple elegant logo for Renault Alpine,A Arrow France Line Triangle,automotive_transport,"successful_vibe,minimalist,thoughtprovoking,ab...",manufacturing_transport
289,amazing_logo_v4000523,Simple elegant logo for Rotation,Circle Germany Line Rotation Spiral Swirl,arts_culture,"successful_vibe,minimalist,thoughtprovoking,ab...",entertainment_sports_media


In [4]:
# Pick the output columns in their final order, then write the CSV
final_cols = [
    'id',
    'category_main',
    'category',
    'description',
    'tags',
    'company',
]

# Stop with a clear error if the filtered table lacks any expected column.
missing_cols = [name for name in final_cols if name not in filtered.columns]
if len(missing_cols) > 0:
    raise KeyError(f"Missing expected columns: {missing_cols}")

# Copy the selection so the saved frame is independent of `filtered`.
final_df = filtered.loc[:, final_cols].copy()
final_df.to_csv(FINAL_PATH, index=False)  # index=False: do not write the row index
print(f"Saved final metadata: {FINAL_PATH} ({len(final_df)} rows)")
final_df.head()

Saved final metadata: c:\studium\master_thesis\data_prep\notebooks\meta_postprep\..\..\output\amazing_logos_v4\data\meta_postprep\metadata_filtered_by_maps.csv (1810 rows)


Unnamed: 0,id,category_main,category,description,tags,company
105,amazing_logo_v4000185,manufacturing_transport,textiles_manufacturing,Drop Oval United states Water,"successful_vibe,minimalist,thoughtprovoking,ab...",Simple elegant logo for Santa Fe By Design
174,amazing_logo_v4000313,food_beverage,restaurant_dining,Bird Fork United states,"successful_vibe,minimalist,thoughtprovoking,ab...",Simple elegant logo for Bluebird Cafe
257,amazing_logo_v4000469,retail_hospitality,home_improvement,A I Italy N,"successful_vibe,minimalist,thoughtprovoking,ab...",Simple elegant logo for Collezione Nai
264,amazing_logo_v4000481,manufacturing_transport,automotive_transport,A Arrow France Line Triangle,"successful_vibe,minimalist,thoughtprovoking,ab...",Simple elegant logo for Renault Alpine
289,amazing_logo_v4000523,entertainment_sports_media,arts_culture,Circle Germany Line Rotation Spiral Swirl,"successful_vibe,minimalist,thoughtprovoking,ab...",Simple elegant logo for Rotation
