# Amazing Logos V4 - Step 4: Categories Consolidation Using Utils

This notebook consolidates categories using the consolidation utility:
- Loads metadata5.csv and analyzes categories directly
- Uses utils/consolidation.py to map categories to consolidated groups
- Creates categories_analysis3.json with consolidated category analysis
- Replaces categories in metadata with consolidated ones
- Creates metadata6.csv with updated consolidated categories

In [1]:
import json
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

# Add utils folder to path.
# The path is relative, so it is resolved against the current working directory.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
utils_path = Path('../../utils')
if str(utils_path) not in sys.path:
    sys.path.append(str(utils_path))

# Import consolidation functions
from consolidation import consolidate_categories, normalize_category, consolidation_map

# Paths
# NOTE: `output_csv` points at the JSON analysis file despite its name; the name
# is kept because later cells reference it.
input_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata5.csv')
output_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/categories_analysis3.json')
output_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata6.csv')

print(f"Input metadata CSV: {input_metadata_csv}")
print(f"Output categories JSON: {output_csv}")
print(f"Output metadata CSV: {output_metadata_csv}")

# Check if metadata input exists. This only reports the problem; it does not
# stop execution, so the load cell will still fail if the file is missing.
if not input_metadata_csv.exists():
    print(f"ERROR: Input file {input_metadata_csv} does not exist!")
    # Name the file that is actually configured above instead of a hard-coded one.
    print(f"Please run previous steps to create {input_metadata_csv.name}.")
else:
    print("Input metadata file exists.")

Input metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata5.csv
Output categories JSON: ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis3.json
Output metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata6.csv
Input metadata file exists.


In [2]:
# Display the keys of the imported consolidation_map (the target group names).
consolidation_map.keys()

dict_keys(['real_estate_residential', 'real_estate_commercial', 'real_estate_development', 'construction_general', 'construction_home', 'construction_specialty', 'property_management', 'restaurant_dining', 'cafe_coffee', 'bar_nightlife', 'brewery_alcohol', 'catering_events', 'food_production', 'grocery_retail', 'beverage_general', 'software_development', 'web_digital', 'it_services', 'telecommunications', 'fintech_crypto', 'gaming_entertainment', 'data_analytics', 'tech_hardware', 'healthcare_general', 'dental_services', 'medical_specialty', 'wellness_fitness', 'mental_health', 'veterinary', 'education_k12', 'higher_education', 'training_development', 'childcare_youth', 'educational_services', 'music_industry', 'film_video', 'arts_culture', 'entertainment_venues', 'sports_recreation', 'media_publishing', 'radio_podcast', 'fashion_apparel', 'retail_general', 'ecommerce_online', 'jewelry_accessories', 'footwear_leather', 'textiles_manufacturing', 'legal_services', 'financial_services', '

In [3]:
# Load metadata and create category analysis
print("Loading metadata and analyzing categories...")
df_metadata = pd.read_csv(input_metadata_csv)

print(f"Loaded metadata: {len(df_metadata):,} rows")
print(f"Columns: {list(df_metadata.columns)}")

# Find categories that are only digits. Values are cast to str first, so missing
# entries become text that does not match the digit pattern and are not counted.
numeric_mask = df_metadata['category'].astype(str).str.match(r'^\d+$', na=False)
numeric_categories = df_metadata.loc[numeric_mask, 'category'].tolist()
numeric_count = int(numeric_mask.sum())

if numeric_count > 0:
    print(f"Found {numeric_count} numeric-only categories")

    # Replace numeric categories with NA
    df_metadata.loc[numeric_mask, 'category'] = pd.NA
    print(f"\nReplaced {numeric_count} numeric categories with NA")
else:
    print("No numeric-only categories found")

# Set every missing category to the 'na' string for consolidation. This runs
# whether or not numeric categories were found, so categories that were already
# missing in the input are also replaced before consolidation.
df_metadata.loc[df_metadata['category'].isna(), 'category'] = 'na'

# Independent copy handed to the consolidation step.
df_for_consolidation = df_metadata.copy()

Loading metadata and analyzing categories...
Loaded metadata: 393,298 rows
Columns: ['id', 'company', 'description', 'category', 'tags']
Found 632 numeric-only categories

Replaced 632 numeric categories with NA


In [4]:
# Run the shared consolidation utility over the cleaned frame and report a summary.
print("\n=== CONSOLIDATING CATEGORIES ===")
print("Using consolidation.py utility...")

# The utility returns the consolidated frame, the unmatched categories and a
# per-category mapping whose values carry 'count' and 'original_categories'.
df_consolidated, unmatched_categories, consolidation_mapping = consolidate_categories(
    df_for_consolidation
)

print("\nConsolidation complete!")
print(f"Consolidated categories: {len(consolidation_mapping)}")
print(f"Unmatched categories: {len(unmatched_categories)}")
print(f"Total logos processed: {len(df_consolidated):,}")

# Sum of the per-category counts, used as the denominator for the shares below.
total_logos = sum(entry['count'] for entry in consolidation_mapping.values())
print(f"Total logos in consolidation mapping: {total_logos:,}")

print("\n=== TOP 10 CONSOLIDATED CATEGORIES ===")
sorted_categories = sorted(
    consolidation_mapping.items(),
    key=lambda item: item[1]['count'],
    reverse=True,
)
for rank, (name, entry) in enumerate(sorted_categories[:10], start=1):
    # Fall back to 0 when there is nothing to divide by.
    if total_logos > 0:
        share = (entry['count'] / total_logos) * 100
    else:
        share = 0
    n_orig = len(entry['original_categories'])
    print(f"{rank:2d}. {name:<30} {entry['count']:>7} ({share:>5.2f}%) [{n_orig:>3} orig]")


=== CONSOLIDATING CATEGORIES ===
Using consolidation.py utility...
DEBUG TAG ADD: 'adhesives' -> 'chemical_materials' | Tags: ['successful_vibe', 'minimalist', 'thoughtprovoking', 'abstract', 'recognizable', 'relatable', 'sharp', 'vector_art', 'even_edges', 'black_and_white', 'adhesives']
DEBUG TAG ADD: 'design' -> 'design_creative' | Tags: ['successful_vibe', 'minimalist', 'thoughtprovoking', 'abstract', 'recognizable', 'relatable', 'sharp', 'vector_art', 'even_edges', 'black_and_white', 'design']
DEBUG TAG ADD: 'meat_processing_machinery' -> 'food_production' | Tags: ['successful_vibe', 'minimalist', 'thoughtprovoking', 'abstract', 'recognizable', 'relatable', 'sharp', 'vector_art', 'even_edges', 'black_and_white', 'meat_processing_machinery']
DEBUG TAG ADD: 'design' -> 'design_creative' | Tags: ['successful_vibe', 'minimalist', 'thoughtprovoking', 'abstract', 'recognizable', 'relatable', 'sharp', 'vector_art', 'even_edges', 'black_and_white', 'design']
DEBUG TAG ADD: 'design' -> 'd

KeyboardInterrupt: 

In [None]:
# Create the final DataFrame with required format
print("\n=== CREATING FINAL OUTPUT ===")

# One record per consolidated category. The original category names are
# de-duplicated, sorted and joined with ', ' -- later cells split the
# 'original_categories' column on that same separator.
final_data = [
    {
        'category': category,
        'count': data['count'],
        'original_categories': ', '.join(sorted(set(data['original_categories']))),
    }
    for category, data in consolidation_mapping.items()
]

# Create DataFrame and sort by count. Naming the columns explicitly keeps the
# frame well-formed (and sortable) even when the mapping is empty.
df_final = pd.DataFrame(final_data, columns=['category', 'count', 'original_categories'])
df_final = df_final.sort_values('count', ascending=False).reset_index(drop=True)

# Computed once; it is the denominator for every percentage printed below.
final_count_total = df_final['count'].sum()

print(f"Created final DataFrame with {len(df_final)} consolidated categories")
print(f"Total logos: {final_count_total:,}")

# Show top consolidated categories
print("\n=== TOP 20 CONSOLIDATED CATEGORIES ===")
for i, row in df_final.head(20).iterrows():
    percentage = (row['count'] / final_count_total) * 100
    num_originals = len(row['original_categories'].split(', '))
    # The index was reset after sorting, so i + 1 is the rank.
    print(f"{i+1:2d}. {row['category']:<30} {row['count']:>7} ({percentage:>5.2f}%) [{num_originals:>3} orig]")


=== CREATING FINAL OUTPUT ===
Created final DataFrame with 41807 consolidated categories
Total logos: 393,297

=== TOP 20 CONSOLIDATED CATEGORIES ===
 1. unclassified                     27314 ( 6.94%) [ 19 orig]
 2. real_estate_residential          10515 ( 2.67%) [212 orig]
 3. restaurant_dining                10304 ( 2.62%) [176 orig]
 4. healthcare_general                9870 ( 2.51%) [ 94 orig]
 5. nonprofit_charity                 9831 ( 2.50%) [205 orig]
 6. design_creative                   9756 ( 2.48%) [206 orig]
 7. sports_recreation                 9274 ( 2.36%) [ 24 orig]
 8. education_k12                     8275 ( 2.10%) [ 38 orig]
 9. fashion_apparel                   8054 ( 2.05%) [ 37 orig]
10. food                              7863 ( 2.00%) [  1 orig]
11. medical_specialty                 6652 ( 1.69%) [ 44 orig]
12. music_industry                    6628 ( 1.69%) [ 53 orig]
13. retail_general                    5682 ( 1.44%) [ 36 orig]
14. construction_general      

In [None]:
# Save the consolidated analysis
print(f"\nSaving consolidated analysis to {output_csv}...")

# Number of distinct original category names across all consolidated groups
original_categories_count = len({
    cat
    for data in consolidation_mapping.values()
    for cat in data['original_categories']
})

# Values reused several times below; computed once instead of once per row.
consolidated_count = len(df_final)
final_total_logos = int(df_final['count'].sum())
removed_numeric = int(numeric_count) if 'numeric_count' in locals() else 0
# Guard against an empty result so the summary cannot fail with a division by zero.
reduction_factor = original_categories_count / consolidated_count if consolidated_count else 0.0

# Create JSON-friendly output with explicit type conversion to avoid int64 serialization issues
output_data = {
    "metadata": {
        "original_categories_count": int(original_categories_count),
        "numeric_categories_removed": removed_numeric,
        # NOTE(review): this is the number of rows in df_for_consolidation, not a
        # count of distinct categories -- confirm which one is intended.
        "categories_after_cleaning": int(len(df_for_consolidation)),
        "final_consolidated_categories": int(consolidated_count),
        "total_reduction_factor": float(round(reduction_factor, 1)),
        "total_logos": final_total_logos,
        "consolidation_timestamp": pd.Timestamp.now().isoformat()
    },
    "consolidated_categories": []
}

# Add consolidated categories data with explicit type conversion
for _, row in df_final.iterrows():
    share = (row['count'] / final_total_logos) * 100 if final_total_logos else 0.0
    output_data["consolidated_categories"].append({
        "category": str(row['category']),
        "count": int(row['count']),
        "percentage": float(round(share, 2)),
        # The column stores the names joined with ', '; split them back into a list.
        "original_categories": [str(cat.strip()) for cat in row['original_categories'].split(', ')]
    })

# Save as JSON (ensure_ascii=False keeps non-ASCII category names readable)
with open(output_csv, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print("Consolidated analysis saved successfully as JSON!")

# Show final statistics
print("\n=== FINAL STATISTICS ===")
print(f"Original categories (before cleaning): {original_categories_count:,}")
print(f"Numeric categories removed: {removed_numeric}")
print(f"Categories after cleaning: {len(df_for_consolidation):,}")
print(f"Final consolidated categories: {consolidated_count}")
print(f"Total reduction factor: {reduction_factor:.1f}x")
print(f"Total logos: {final_total_logos:,}")
print(f"Output file: {output_csv}")
print(f"Columns: {list(df_final.columns)}")

# Show some examples of the mapping
print("\n=== SAMPLE MAPPING EXAMPLES ===")
for i, row in df_final.head(5).iterrows():
    originals = row['original_categories'].split(', ')
    print(f"\n{row['category']} ({row['count']} logos):")
    # Show first 5 original categories
    for orig in originals[:5]:
        print(f"  - {orig}")
    if len(originals) > 5:
        print(f"  ... and {len(originals) - 5} more")


Saving consolidated analysis to ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis3.json...


Consolidated analysis saved successfully as JSON!

=== FINAL STATISTICS ===
Original categories (before cleaning): 46,070
Numeric categories removed: 617
Categories after cleaning: 393,297
Final consolidated categories: 41807
Total reduction factor: 1.1x
Total logos: 393,297
Output file: ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis3.json
Columns: ['category', 'count', 'original_categories']

=== SAMPLE MAPPING EXAMPLES ===

unclassified (27314 logos):
  - all
  - call
  - defined
  - mall
  - misc
  ... and 14 more

real_estate_residential (10515 logos):
  - adult_living
  - apartment
  - apartment_real_estate
  - apartments
  - architectural_real_estate
  ... and 207 more

restaurant_dining (10304 logos):
  - airline_food_service
  - asian_cuisine
  - asian_fast_food
  - bakery_food_service
  - bar_and_restaurant
  ... and 171 more

healthcare_general (9870 logos):
  - ag_medicine
  - air_medical_services
  - child_health_care
  - child_health_services
  - clin

In [None]:
# Apply the consolidated categories to the loaded metadata and write the result
# to output_metadata_csv.
print("\n=== APPLYING CONSOLIDATION TO METADATA ===")
print(f"Using metadata already loaded from {input_metadata_csv}...")

print(f"Metadata: {len(df_metadata):,} rows")
print(f"Columns: {list(df_metadata.columns)}")

# Create mapping dictionary from original to consolidated categories.
# df_final stores the original names joined with ', ', so they are split back here.
# If a name appears under several consolidated categories, the last one wins.
category_mapping = {}
for consolidated_cat, originals_joined in zip(df_final['category'], df_final['original_categories']):
    for orig_cat in originals_joined.split(', '):
        category_mapping[orig_cat.strip()] = consolidated_cat

# Also handle numeric categories (only when such a consolidated group exists)
if df_final['category'].eq('numeric_categories').any():
    for num_cat in numeric_categories:
        category_mapping[str(num_cat)] = 'numeric_categories'

print(f"Created mapping for {len(category_mapping)} original categories")

# Apply consolidation to metadata
# NOTE(review): the frame returned by consolidate_categories (df_consolidated) is
# not used in this cell; the mapping is re-applied to df_metadata instead --
# confirm that this is intended.
df_metadata_consolidated = df_metadata.copy()

# Track consolidation statistics
original_categories_in_metadata = df_metadata_consolidated['category'].value_counts()
consolidation_stats = {'mapped': 0, 'unmapped': 0, 'na_values': 0}

def consolidate_category(category):
    """Apply consolidation mapping to a single category.

    Args:
        category: Raw category value taken from the metadata 'category' column.

    Returns:
        pd.NA when the value is missing, the consolidated name when the stripped
        string form is a key of ``category_mapping``, otherwise 'unclassified'.

    Side effects:
        Increments the matching counter in ``consolidation_stats``.
    """
    if pd.isna(category):
        consolidation_stats['na_values'] += 1
        return pd.NA

    category_str = str(category).strip()
    if category_str in category_mapping:
        consolidation_stats['mapped'] += 1
        return category_mapping[category_str]

    consolidation_stats['unmapped'] += 1
    return 'unclassified'

# Apply consolidation
print("Applying category consolidation to metadata...")
df_metadata_consolidated['category'] = df_metadata_consolidated['category'].apply(consolidate_category)

print("\n=== CONSOLIDATION STATISTICS ===")
print(f"Categories mapped to consolidated: {consolidation_stats['mapped']:,}")
print(f"Categories mapped to 'unclassified': {consolidation_stats['unmapped']:,}")
print(f"NA values preserved: {consolidation_stats['na_values']:,}")
print(f"Total rows processed: {len(df_metadata_consolidated):,}")

# Show before/after category distribution
print("\n=== CATEGORY DISTRIBUTION COMPARISON ===")
print(f"Original unique categories: {df_metadata['category'].nunique()}")
print(f"Consolidated unique categories: {df_metadata_consolidated['category'].nunique()}")

print("\nTop 10 consolidated categories in metadata:")
consolidated_counts = df_metadata_consolidated['category'].value_counts().head(10)
for i, (cat, count) in enumerate(consolidated_counts.items(), 1):
    percentage = (count / len(df_metadata_consolidated)) * 100
    print(f"{i:2d}. {cat:<25} {count:>7} ({percentage:>5.2f}%)")

# Save consolidated metadata
print(f"\nSaving consolidated metadata to {output_metadata_csv}...")
df_metadata_consolidated.to_csv(output_metadata_csv, index=False)
print("Consolidated metadata saved successfully!")

print("\n=== FINAL OUTPUT FILES ===")
print(f"Categories analysis (JSON): {output_csv}")
print(f"Consolidated metadata (CSV): {output_metadata_csv}")
print(f"Total logos with consolidated categories: {len(df_metadata_consolidated):,}")


=== APPLYING CONSOLIDATION TO METADATA ===
Using metadata already loaded from ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata5.csv...
Metadata: 393,297 rows
Columns: ['id', 'company', 'description', 'category', 'tags']
Created mapping for 46070 original categories
Applying category consolidation to metadata...
Created mapping for 46070 original categories
Applying category consolidation to metadata...

=== CONSOLIDATION STATISTICS ===
Categories mapped to consolidated: 393,297
Categories mapped to 'unclassified': 0
NA values preserved: 0
Total rows processed: 393,297

=== CATEGORY DISTRIBUTION COMPARISON ===
Original unique categories: 46070
Consolidated unique categories: 41807

Top 10 consolidated categories in metadata:
 1. unclassified                27314 ( 6.94%)
 2. real_estate_residential     10515 ( 2.67%)
 3. restaurant_dining           10304 ( 2.62%)
 4. healthcare_general           9870 ( 2.51%)
 5. nonprofit_charity            9831 ( 2.50%)
 6. design_creative  