# Amazing Logos V4 - Step 5: Categories Final Cleanup

This notebook performs final category cleanup:
- Loads metadata8.csv
- Sets every category that is not a key of `consolidation_map` to 'unclassified'
- For replaced categories that occur frequently (>5 occurrences), adds the original category name to the row's tags
- Saves the result as metadata9.csv

In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
from collections import Counter

# Add utils folder to path
utils_path = Path('../../utils')
sys.path.append(str(utils_path))

# Import consolidation functions
from consolidation import consolidate_categories, normalize_category, consolidation_map

# Input and output CSV locations for this step (relative paths, resolved
# against the current working directory).
input_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata8.csv')
output_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata9.csv')

print(f"Input metadata CSV: {input_metadata_csv}")
print(f"Output metadata CSV: {output_metadata_csv}")

# Report whether the input is present. A missing file is only reported here;
# nothing is raised by this check.
if input_metadata_csv.exists():
    print(f"Input metadata file exists.")
else:
    print(f"ERROR: Input file {input_metadata_csv} does not exist!")

Input metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata8.csv
Output metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata9.csv
Input metadata file exists.


In [10]:
# Read the input metadata CSV into a DataFrame.
print("Loading metadata...")
df = pd.read_csv(input_metadata_csv)

print(f"Loaded metadata: {len(df):,} rows")
print(f"Columns: {list(df.columns)}")

# Overview of the category column before any cleanup.
# value_counts() orders by frequency and leaves out missing values by default.
print(f"\nUnique categories before cleanup: {df['category'].nunique()}")
print(f"\nTop 10 current categories:")
category_counts = df['category'].value_counts()
top_categories = category_counts.head(10)
for rank, (name, n_rows) in enumerate(top_categories.items(), start=1):
    share = n_rows / len(df) * 100
    print(f"{rank:2d}. {name:<30} {n_rows:>7} ({share:>5.2f}%)")

# Number of rows whose category is missing.
na_count = df['category'].isna().sum()
print(f"\nNA categories: {na_count}")

Loading metadata...
Loaded metadata: 352,154 rows
Columns: ['id', 'company', 'description', 'category', 'tags']

Unique categories before cleanup: 1622

Top 10 current categories:
 1. unclassified                     29442 ( 8.36%)
 2. sports_recreation                11383 ( 3.23%)
 3. restaurant_dining                10963 ( 3.11%)
 4. real_estate_residential          10825 ( 3.07%)
 5. healthcare_general               10534 ( 2.99%)
 6. design_creative                  10366 ( 2.94%)
 7. nonprofit_charity                10134 ( 2.88%)
 8. fashion_apparel                   9174 ( 2.61%)
 9. education_k12                     8572 ( 2.43%)
10. food                              7866 ( 2.23%)

NA categories: 0


In [11]:
# A category counts as valid exactly when it is a key of consolidation_map.
valid_categories = set(consolidation_map.keys())
print(f"Valid categories from consolidation_map: {len(valid_categories)}")
print(f"\nValid categories list:")
for i, cat in enumerate(sorted(valid_categories), 1):
    print(f"{i:2d}. {cat}")

# Distinct non-missing categories currently present, compared as strings.
current_categories = set(df['category'].dropna().astype(str))
print(f"\nCurrent unique categories (non-NA): {len(current_categories)}")

# Categories that are already keys of consolidation_map.
already_valid = current_categories & valid_categories
print(f"Categories already in consolidation_map: {len(already_valid)}")

# Everything else will be replaced by 'unclassified'.
to_unclassify = current_categories - valid_categories
print(f"Categories to be changed to 'unclassified': {len(to_unclassify)}")

if to_unclassify:
    print(f"\nSample categories to be unclassified:")
    # Set iteration order is arbitrary (string hashing is randomized per
    # process), so slicing list(set) gave a different sample on every run.
    # Rank by frequency (descending), then by name, so the preview is
    # reproducible under Restart & Run All.
    sample_to_unclassify = sorted(
        to_unclassify,
        key=lambda name: (-category_counts.get(name, 0), name),
    )[:20]
    for cat in sample_to_unclassify:
        count = category_counts.get(cat, 0)
        print(f"  {cat} ({count} occurrences)")
    if len(to_unclassify) > 20:
        print(f"  ... and {len(to_unclassify) - 20} more")

Valid categories from consolidation_map: 109

Valid categories list:
 1. accounting_tax
 2. aerospace_defense
 3. agriculture_farming
 4. animal_services
 5. arts_culture
 6. auto_services
 7. automotive_transport
 8. aviation_services
 9. bar_nightlife
10. beauty_cosmetics
11. beverage_general
12. brewery_alcohol
13. cafe_coffee
14. catering_events
15. chemical_materials
16. childcare_youth
17. cleaning_maintenance
18. community_social
19. construction_general
20. construction_home
21. construction_materials
22. construction_specialty
23. consulting_business
24. data_analytics
25. dental_services
26. design_creative
27. ecommerce_online
28. education_k12
29. educational_services
30. energy_utilities
31. entertainment_venues
32. environmental_services
33. event_services
34. fashion_apparel
35. film_video
36. financial_services
37. fintech_crypto
38. fitness_health
39. food_production
40. footwear_leather
41. forestry_lumber
42. funeral_services
43. gambling_betting
44. gaming_entertain

In [12]:
# A to-be-unclassified category is "frequent" when it occurs MORE than this
# many times. The final summary cell prints the same threshold as a literal,
# so update that text too if this value changes.
TAG_FREQUENCY_THRESHOLD = 5

# Pair every to-be-unclassified category with its row count and keep the
# frequent ones. Ordering is by count (descending) and then by name: the
# candidates come from a set, whose iteration order is arbitrary, so without
# the name tie-break equal counts would be listed differently on each run.
candidate_counts = ((cat, category_counts.get(cat, 0)) for cat in to_unclassify)
frequent_unclassified = sorted(
    ((cat, count) for cat, count in candidate_counts if count > TAG_FREQUENCY_THRESHOLD),
    key=lambda item: (-item[1], item[0]),
)

print(f"Frequent categories (>{TAG_FREQUENCY_THRESHOLD} occurrences) that will become 'unclassified': {len(frequent_unclassified)}")
print(f"These will be added to tags list:\n")

for cat, count in frequent_unclassified:
    print(f"  {cat:<40} {count:>6} occurrences")

# Each row of a frequent category gets its original category added to tags.
total_logos_to_tag = sum(count for _, count in frequent_unclassified)
print(f"\nTotal logos that will get original category added to tags: {total_logos_to_tag:,}")

# Create set for quick membership tests while processing rows
frequent_unclassified_set = {cat for cat, _ in frequent_unclassified}

Frequent categories (>5 occurrences) that will become 'unclassified': 1101
These will be added to tags list:

  food                                       7866 occurrences
  financial                                  2263 occurrences
  inc                                         785 occurrences
  cannabis                                    735 occurrences
  engineering                                 719 occurrences
  tech                                        620 occurrences
  graphic                                     415 occurrences
  bank                                        340 occurrences
  toys                                        336 occurrences
  science                                     283 occurrences
  housing_development                         276 occurrences
  blog                                        225 occurrences
  corporate                                   211 occurrences
  auto                                        204 occurrences
  music_entertainment 

In [13]:
# Work on a separate frame so `df` keeps the data exactly as loaded.
df_processed = df.copy()

print("Processing categories and tags...")

# Make sure there is a tags column to write into.
if 'tags' not in df_processed:
    df_processed['tags'] = ''
    print("Created empty tags column")

# Counters that process_row updates as it handles each row; all start at zero.
stats = dict.fromkeys(
    (
        'total_processed',
        'kept_valid',
        'changed_to_unclassified',
        'added_to_tags',
        'na_categories',
    ),
    0,
)

def process_row(row):
    """Clean up the category (and, when needed, the tags) of a single row.

    Reads the module-level ``valid_categories`` and
    ``frequent_unclassified_set`` and updates the counters in ``stats``.

    Rules:
      * A missing category is left alone (counted in ``na_categories``).
      * A category found in ``valid_categories`` is kept (``kept_valid``).
      * Any other category is replaced by ``'unclassified'``
        (``changed_to_unclassified``). If it is in
        ``frequent_unclassified_set`` and not already among the row's
        comma-separated tags, it is appended to the tags (``added_to_tags``).

    Args:
        row: One row, as passed by ``DataFrame.apply(..., axis=1)``; must
            expose ``'category'`` and may expose ``'tags'``.

    Returns:
        The row itself when nothing changes, otherwise a modified copy.
        The object passed in is never mutated.
    """
    stats['total_processed'] += 1

    original_category = row['category']

    # Missing categories are passed through untouched.
    if pd.isna(original_category):
        stats['na_categories'] += 1
        return row

    original_category_str = str(original_category)

    # Valid categories are kept exactly as they are.
    if original_category_str in valid_categories:
        stats['kept_valid'] += 1
        return row

    # From here on the row changes. Work on a copy: pandas does not support
    # user functions that mutate the object handed to them by apply().
    row = row.copy()
    stats['changed_to_unclassified'] += 1
    row['category'] = 'unclassified'

    # Only frequent categories are preserved as a tag.
    if original_category_str not in frequent_unclassified_set:
        return row

    current_tags = row.get('tags', '')
    if pd.isna(current_tags):
        current_tags = ''

    # Tags are stored as a comma-separated string; drop empty fragments.
    existing_tags = [tag.strip() for tag in str(current_tags).split(',') if tag.strip()]

    if original_category_str not in existing_tags:
        existing_tags.append(original_category_str)
        row['tags'] = ', '.join(existing_tags)
        # Count only rows where a tag was really appended, so the reported
        # figure matches the number of rows whose tags changed.
        stats['added_to_tags'] += 1

    return row

# Run process_row over the frame row by row (axis=1 passes each row in turn).
print("Applying category and tag updates...")
df_processed = df_processed.apply(process_row, axis=1)

# Report the counters collected in `stats` during the pass above.
print(f"\n=== PROCESSING STATISTICS ===")
report_rows = (
    ("Total rows processed", 'total_processed'),
    ("Categories kept (valid)", 'kept_valid'),
    ("Categories changed to 'unclassified'", 'changed_to_unclassified'),
    ("Original categories added to tags", 'added_to_tags'),
    ("NA categories preserved", 'na_categories'),
)
for label, key in report_rows:
    print(f"{label}: {stats[key]:,}")

Processing categories and tags...
Applying category and tag updates...

=== PROCESSING STATISTICS ===
Total rows processed: 352,154
Categories kept (valid): 307,642
Categories changed to 'unclassified': 44,512
Original categories added to tags: 42,741
NA categories preserved: 0


In [14]:
# Category distribution after cleanup.
print("\n=== FINAL CATEGORY ANALYSIS ===")
final_category_counts = df_processed['category'].value_counts()
print(f"Final unique categories: {df_processed['category'].nunique()}")
print(f"\nFinal category distribution:")
for i, (cat, count) in enumerate(final_category_counts.items(), 1):
    percentage = (count / len(df_processed)) * 100
    print(f"{i:2d}. {cat:<30} {count:>7} ({percentage:>5.2f}%)")

# How many rows carry a non-empty tags value.
print(f"\n=== TAGS ANALYSIS ===")
rows_with_tags = df_processed['tags'].notna() & (df_processed['tags'] != '')
print(f"Rows with non-empty tags: {rows_with_tags.sum():,} ({(rows_with_tags.sum() / len(df_processed)) * 100:.2f}%)")

# Show some examples of rows that got tags added.
print(f"\nSample rows with added tags (from frequent unclassified categories):")
# Select rows whose ORIGINAL category (still available in `df`) was one of the
# frequent unclassified ones. Taking the first unclassified rows would also
# show rows that were already 'unclassified' and never had a tag added.
was_frequent = df['category'].notna() & df['category'].astype(str).isin(frequent_unclassified_set)
sample_tagged = df_processed.loc[was_frequent & rows_with_tags, ['category', 'tags']].head(10)
# Convert to str before slicing: a missing tags value is a float NaN, which
# is truthy and cannot be sliced.
for i, tags_text in enumerate(sample_tagged['tags'].astype(str), 1):
    print(f"{i:2d}. Tags: {tags_text[:80]}{'...' if len(tags_text) > 80 else ''}")

# Verify that only valid categories (or 'unclassified') remain.
final_categories = set(df_processed['category'].dropna().astype(str))
invalid_final = final_categories - valid_categories - {'unclassified'}
print(f"\nInvalid categories remaining (should be 0): {len(invalid_final)}")
if invalid_final:
    print(f"WARNING: Still have invalid categories: {invalid_final}")


=== FINAL CATEGORY ANALYSIS ===
Final unique categories: 109

Final category distribution:
 1. unclassified                     73954 (21.00%)
 2. sports_recreation                11383 ( 3.23%)
 3. restaurant_dining                10963 ( 3.11%)
 4. real_estate_residential          10825 ( 3.07%)
 5. healthcare_general               10534 ( 2.99%)
 6. design_creative                  10366 ( 2.94%)
 7. nonprofit_charity                10134 ( 2.88%)
 8. fashion_apparel                   9174 ( 2.61%)
 9. education_k12                     8572 ( 2.43%)
10. music_industry                    7318 ( 2.08%)
11. marketing_advertising             6806 ( 1.93%)
12. retail_general                    6503 ( 1.85%)
13. home_improvement                  6200 ( 1.76%)
14. entertainment_venues              6089 ( 1.73%)
15. chemical_materials                5501 ( 1.56%)
16. web_digital                       5470 ( 1.55%)
17. arts_culture                      5195 ( 1.48%)
18. food_production     

In [15]:
# Write the cleaned metadata to disk; the row index is not written out.
print(f"\nSaving processed metadata to {output_metadata_csv}...")
df_processed.to_csv(output_metadata_csv, index=False)
print(f"Successfully saved metadata9.csv!")

# Recap of the whole step, assembled first and printed in one go.
summary_lines = [
    f"\n=== FINAL SUMMARY ===",
    f"Input file: {input_metadata_csv}",
    f"Output file: {output_metadata_csv}",
    f"Total logos: {len(df_processed):,}",
    f"Valid categories in consolidation_map: {len(valid_categories)}",
    f"Final unique categories: {df_processed['category'].nunique()}",
    f"Categories changed to 'unclassified': {stats['changed_to_unclassified']:,}",
    f"Frequent unclassified categories added to tags: {len(frequent_unclassified)}",
    f"Logos with tags added: {stats['added_to_tags']:,}",
    f"Rows with non-empty tags: {rows_with_tags.sum():,}",
]
print("\n".join(summary_lines))

# Frequency threshold that was used, and the range of counts it selected.
print(f"\nFrequency threshold for adding to tags: >5 occurrences")
print(f"Categories above threshold: {len(frequent_unclassified)}")
if frequent_unclassified:
    frequencies = [count for _, count in frequent_unclassified]
    min_freq, max_freq = min(frequencies), max(frequencies)
    print(f"Range of frequencies added to tags: {min_freq} - {max_freq}")


Saving processed metadata to ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata9.csv...
Successfully saved metadata9.csv!

=== FINAL SUMMARY ===
Input file: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata8.csv
Output file: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata9.csv
Total logos: 352,154
Valid categories in consolidation_map: 109
Final unique categories: 109
Categories changed to 'unclassified': 44,512
Frequent unclassified categories added to tags: 1101
Logos with tags added: 42,741
Rows with non-empty tags: 352,154

Frequency threshold for adding to tags: >5 occurrences
Categories above threshold: 1101
Range of frequencies added to tags: 6 - 7866
