# Step 6 — Create main category column
Add a coarse `category_main` column mapped to 10 top-level groups.
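This notebook reads `metadata9.csv`, computes `category_main` using the helper in `utils/consolidation.py`, shows simple stats, and saves the result twice: back to `metadata9.csv` (overwriting the input file in place) and to a separate copy, `metadata9_with_main.csv`.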

In [2]:
from pathlib import Path
import pandas as pd
from pathlib import Path
import sys

# Make the project root and its utils/ folder importable from this notebook.
# Both locations are relative, so they resolve against the current working
# directory of the kernel.
utils_path = Path('../..')
utils_inner_path = Path('../../utils')

# Append only the entries that are missing: re-running this cell must not keep
# growing sys.path with duplicates.
for _search_dir in (str(utils_path), str(utils_inner_path)):
    if _search_dir not in sys.path:
        sys.path.append(_search_dir)

# Import helper added to utils/consolidation.py
from utils.consolidation import add_main_category_column, map_category_to_main


# Input locations, laid out the same way as in the previous notebooks
base_output = Path('../../output/amazing_logos_v4')
metadata9_path = base_output.joinpath('data', 'amazing_logos_v4_cleanup', 'metadata9.csv')

# Echo the resolved location so a wrong working directory is easy to spot
print('Metadata9: {}'.format(metadata9_path))

Metadata9: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata9.csv


In [3]:
# Read metadata9.csv into a DataFrame, failing fast when the file is absent
if metadata9_path.exists():
    meta9 = pd.read_csv(metadata9_path)
else:
    raise FileNotFoundError(f'metadata9.csv not found at {metadata9_path}')

# Report the table size and the columns that were loaded
row_total = len(meta9)
print(f'Total rows: {row_total:,}')
print('Columns:', list(meta9.columns))

Total rows: 352,154
Columns: ['id', 'company', 'description', 'category', 'tags', 'category_main']


In [4]:
# Build the frame carrying `category_main`, derived from the `category` column
# by the project helper (presumably returns a DataFrame — see utils/consolidation.py)
meta9_with_main = add_main_category_column(
    meta9,
    source_col='category',
    target_col='category_main',
)

# Frequency table of the main categories, limited to the 20 most common
main_counts = meta9_with_main['category_main'].value_counts()
print('Top 20 main categories:')
print(main_counts.head(20).to_string())

# Preview a few rows that landed in 'other'; useful input for refining the mapping
is_other = meta9_with_main['category_main'] == 'other'
print('Sample rows mapped to other:')
display(meta9_with_main[is_other].head(10))

KeyboardInterrupt: 

In [None]:
# Persist the table that carries `category_main` to the cleanup folder:
#   * metadata9_with_main.csv - a separate copy of the UPDATED table. Despite
#     the `backup_file` name it is not a backup of the original input.
#   * metadata9.csv           - the input file itself, overwritten in place.
out_dir = metadata9_path.parent
out_file = out_dir / 'metadata9.csv'
backup_file = out_dir / 'metadata9_with_main.csv'

# This cell replaces its own input, so refuse to write a frame that lacks the
# new column (e.g. a stale variable from an earlier kernel state).
if 'category_main' not in meta9_with_main.columns:
    raise ValueError(
        "meta9_with_main has no 'category_main' column; "
        "re-run the mapping cell before saving"
    )

meta9_with_main.to_csv(backup_file, index=False)

# Write to a temporary sibling first and then swap it in, so an interrupted
# write cannot leave a truncated metadata9.csv behind.
tmp_file = out_file.with_name(out_file.name + '.tmp')
meta9_with_main.to_csv(tmp_file, index=False)
tmp_file.replace(out_file)

print(f'Written updated metadata to: {out_file}')
print(f'Also wrote a backup copy to: {backup_file}')

Written updated metadata to: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata9.csv
Also wrote a backup copy to: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata9_with_main.csv


Next steps:
- Inspect samples mapped to `other` and extend the mapping in `utils/consolidation.py` if needed.
- Use `category_main` in the downstream image extraction / sampling notebooks to balance samples across the top-level categories.