In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import sys

# Make the shared utils folder importable. Skip the append when the entry is
# already on sys.path so re-running this cell does not add duplicates.
utils_path = Path('../../utils')
if str(utils_path) not in sys.path:
    sys.path.append(str(utils_path))

# Paths (relative, so they resolve against the kernel's working directory)
input_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata6.csv')
output_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata7.csv')

print(f"Input metadata CSV: {input_metadata_csv}")
print(f"Output metadata CSV: {output_metadata_csv}")

# Fail fast when the input is missing: stop here with a clear message instead
# of printing a warning and letting pd.read_csv fail a few lines further down.
if not input_metadata_csv.exists():
    raise FileNotFoundError(f"ERROR: Input file {input_metadata_csv} does not exist!")
print("Input metadata file exists.")

# Read metadata6.csv
print("Loading metadata6.csv...")
df_metadata = pd.read_csv(input_metadata_csv)
print(f"Loaded {len(df_metadata):,} rows")
print(f"Columns: {list(df_metadata.columns)}")
print("First few rows:")
print(df_metadata.head())

Input metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata6.csv
Output metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata7.csv
Input metadata file exists.
Loading metadata6.csv...
Loaded 393,297 rows
Columns: ['id', 'company', 'description', 'category', 'tags']
First few rows:
                      id                                    company  \
0  amazing_logo_v4000000  Simple elegant logo for Mandarin Oriental   
1  amazing_logo_v4000001               Simple elegant logo for Alfa   
2  amazing_logo_v4000002            Simple elegant logo for Kuraray   
3  amazing_logo_v4000003       Simple elegant logo for Valwood Park   
4  amazing_logo_v4000004            Simple elegant logo for Cinepaq   

                       description              category  \
0        Fan Hong kong Lines Paper  hospitality_services   
1         Hexagon Poland Triangles    chemical_materials   
2                G Japan K Outline           safty_glass   
3    Lines Roun

In [5]:
# Rank every real category by how often it occurs (most frequent first)
print("\nAnalyzing categories...")

# Leave the 'unclassified' placeholder out of the tally
is_classified = df_metadata['category'] != 'unclassified'
category_counts = df_metadata.loc[is_classified, 'category'].value_counts()
print(f"Found {len(category_counts)} unique categories")

# value_counts() orders by descending count, so its index is already the ranking
sorted_categories = list(category_counts.index)
print(f"\nCategories sorted by frequency: {sorted_categories[:10]}...")


Analyzing categories...
Found 40499 unique categories

Categories sorted by frequency: ['real_estate_residential', 'restaurant_dining', 'healthcare_general', 'nonprofit_charity', 'design_creative', 'sports_recreation', 'education_k12', 'fashion_apparel', 'food', 'medical_specialty']...


In [6]:
# Process unclassified rows: when one of a row's tags is itself a known
# category name, promote the most frequent such tag to the row's category and
# drop it from the row's tags.
print("\nProcessing unclassified rows...")

# Work on a copy so df_metadata stays untouched and this cell can be re-run
df_result = df_metadata.copy()

# Count unclassified rows
unclassified_mask = df_result['category'] == 'unclassified'
unclassified_count = unclassified_mask.sum()
print(f"Found {unclassified_count:,} unclassified rows")

if unclassified_count > 0:
    # Map each category to its position in the frequency ranking (0 = most
    # frequent). A dict gives constant-time membership and rank lookups; the
    # previous list-based `in` / `.index()` rescanned the whole category list
    # for every tag of every row.
    category_rank = {
        category: position for position, category in enumerate(sorted_categories)
    }

    # Counter for tracking changes
    changes_made = 0

    # For each row where category is unclassified
    for idx in df_result[unclassified_mask].index:
        row_tags = df_result.loc[idx, 'tags']

        # Skip missing tags
        if pd.isna(row_tags):
            continue

        # Coerce to text before any string method is called, so a non-string
        # value (e.g. a tag cell parsed as a number) cannot raise on .strip()
        row_tags = str(row_tags)
        if not row_tags.strip():
            continue

        # Split tags by comma and clean them
        tag_list = [tag.strip() for tag in row_tags.split(',')]

        # Keep only the tags that are known category names
        matching_tags = [tag for tag in tag_list if tag and tag in category_rank]
        if not matching_tags:
            continue

        # The lowest rank is the most frequent category; on a repeated tag,
        # min() keeps the first occurrence
        best_category = min(matching_tags, key=category_rank.__getitem__)

        # Assign the category
        df_result.loc[idx, 'category'] = best_category

        # Remove the promoted tag (every occurrence) from the tags list
        updated_tags = [tag for tag in tag_list if tag != best_category]
        df_result.loc[idx, 'tags'] = ','.join(updated_tags)

        changes_made += 1

    print(f"Made {changes_made:,} category assignments from tags")
else:
    print("No unclassified rows to process")


Processing unclassified rows...
Found 27,335 unclassified rows
Made 129 category assignments from tags
Made 129 category assignments from tags


In [7]:
# Report how many rows were reclassified, then persist the result
print("\nResults Summary:")
print(f"Original unclassified count: {unclassified_count:,}")

final_unclassified_count = df_result['category'].eq('unclassified').sum()
print(f"Final unclassified count: {final_unclassified_count:,}")
print(f"Rows reclassified: {unclassified_count - final_unclassified_count:,}")

# Category distribution after the reassignment (top 15)
print("\nUpdated category distribution:")
updated_category_counts = df_result['category'].value_counts()
print(updated_category_counts.head(15))

# Write the updated frame to metadata7.csv (without the index column)
print(f"\nSaving results to {output_metadata_csv}")
df_result.to_csv(output_metadata_csv, index=False)
print(f"Successfully saved {len(df_result):,} rows to metadata7.csv")

# Sanity-check the written file
if output_metadata_csv.exists():
    output_size_bytes = output_metadata_csv.stat().st_size
    print(f"Output file size: {output_size_bytes:,} bytes")

    # Reload the whole file to confirm the row count, then preview the head
    df_verify = pd.read_csv(output_metadata_csv)
    print(f"Verification: Read back {len(df_verify):,} rows")
    print("Sample of results:")
    print(df_verify.head())


Results Summary:
Original unclassified count: 27,335
Final unclassified count: 27,206
Rows reclassified: 129

Updated category distribution:
category
unclassified               27206
real_estate_residential    10504
restaurant_dining          10300
healthcare_general          9858
nonprofit_charity           9791
design_creative             9718
sports_recreation           9274
education_k12               8269
fashion_apparel             8054
food                        7858
medical_specialty           6633
music_industry              6625
retail_general              5665
construction_general        5500
entertainment_venues        5437
Name: count, dtype: int64

Saving results to ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata7.csv
Successfully saved 393,297 rows to metadata7.csv
Output file size: 95,198,168 bytes
Successfully saved 393,297 rows to metadata7.csv
Output file size: 95,198,168 bytes
Verification: Read back 393,297 rows
Sample of results:
                      