# Amazing Logos V4 - Step 4: Data Cleaning

This notebook cleans the structured data by setting category-tag overlaps to NA:
- Loads amazing_logos_v4_metadata2.csv from Step 2
- Loads amazing_logos_v4_tags_analysis.csv from Step 3  
- Sets category to NA where it matches any of the top 10 tags
- Sets description to NA where it contains any of the top 10 tags
- Saves cleaned data as amazing_logos_v4_metadata3.csv

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths
input_metadata_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata4.csv')
input_tags_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/tags_analysis.csv')
output_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata5.csv')

print(f"Input metadata CSV: {input_metadata_csv}")
print(f"Input tags analysis CSV: {input_tags_csv}")
print(f"Output filtered CSV: {output_csv}")

# Check if inputs exist
if not input_metadata_csv.exists():
    print(f"ERROR: Input file {input_metadata_csv} does not exist!")
    print("Please run Step 2 notebook first.")
if not input_tags_csv.exists():
    print(f"ERROR: Input file {input_tags_csv} does not exist!")
    print("Please run Step 3 tags notebook first.")
else:
    print(f"All input files exist.")

Input metadata CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata4.csv
Input tags analysis CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_tags_analysis.csv
Output filtered CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata5.csv
All input files exist.


In [2]:
# Load the metadata CSV
print("Loading metadata CSV...")
df_metadata = pd.read_csv(input_metadata_csv)

print(f"Loaded metadata: {len(df_metadata)} rows")
print(f"Columns: {list(df_metadata.columns)}")
print(f"\nFirst 5 rows:")
print(df_metadata.head())

Loading metadata CSV...
Loaded metadata: 393297 rows
Columns: ['id', 'company', 'description', 'category', 'tags']

First 5 rows:
                      id                                    company  \
0  amazing_logo_v4000000  Simple elegant logo for Mandarin Oriental   
1  amazing_logo_v4000001               Simple elegant logo for Alfa   
2  amazing_logo_v4000002            Simple elegant logo for Kuraray   
3  amazing_logo_v4000003       Simple elegant logo for Valwood Park   
4  amazing_logo_v4000004            Simple elegant logo for Cinepaq   

                       description     category  \
0        Fan Hong kong Lines Paper  hospitality   
1         Hexagon Poland Triangles    chemicals   
2                G Japan K Outline  safty_glass   
3    Lines Rounded United states V         park   
4  C Circle Film reel Spain Square         film   

                                                tags  
0  successful_vibe,minimalist,thoughtprovoking,ab...  
1  successful_vibe,minimal

In [3]:
# Load the tags analysis CSV
print("Loading tags analysis CSV...")
df_tags = pd.read_csv(input_tags_csv)

print(f"Loaded tags analysis: {len(df_tags)} rows")
print(f"Columns: {list(df_tags.columns)}")

# Get top 10 tags
top_10_tags = df_tags.head(10)['tag'].tolist()

print(f"\n=== TOP 10 TAGS TO FILTER ===") 
for i, (_, row) in enumerate(df_tags.head(10).iterrows()):
    print(f"{i+1:2d}. {row['tag']:<20} {row['count']:>8} occurrences")

print(f"\nTags to filter out: {top_10_tags}")

Loading tags analysis CSV...
Loaded tags analysis: 6525 rows
Columns: ['tag', 'count']

=== TOP 10 TAGS TO FILTER ===
 1. sharp                  393297 occurrences
 2. vector_art             393297 occurrences
 3. even_edges             393297 occurrences
 4. thoughtprovoking       393297 occurrences
 5. abstract               393297 occurrences
 6. recognizable           393297 occurrences
 7. relatable              393297 occurrences
 8. minimalist             367639 occurrences
 9. successful_vibe        367262 occurrences
10. black_and_white          9936 occurrences

Tags to filter out: ['sharp', 'vector_art', 'even_edges', 'thoughtprovoking', 'abstract', 'recognizable', 'relatable', 'minimalist', 'successful_vibe', 'black_and_white']


In [4]:
# Analyze current categories vs top tags overlap
print("=== CATEGORY-TAG OVERLAP ANALYSIS ===")

# Check how many rows have categories that match top 10 tags
overlap_counts = {}
total_to_remove = 0

for tag in top_10_tags:
    # Count rows where category matches this tag (case-insensitive)
    mask = df_metadata['category'].str.lower() == tag.lower()
    count = mask.sum()
    overlap_counts[tag] = count
    total_to_remove += count
    
    if count > 0:
        print(f"'{tag}' appears as category: {count} rows")

# Also check description column
description_overlap_counts = {}
total_description_to_clean = 0

print(f"\n=== DESCRIPTION-TAG OVERLAP ANALYSIS ===")
for tag in top_10_tags:
    # Count rows where description contains this tag (case-insensitive)
    mask = df_metadata['description'].str.lower().str.contains(tag.lower(), na=False, regex=False)
    count = mask.sum()
    description_overlap_counts[tag] = count
    total_description_to_clean += count
    
    if count > 0:
        print(f"'{tag}' appears in description: {count} rows")

print(f"\nTotal category values to set to NA: {total_to_remove}")
print(f"Total description values to set to NA: {total_description_to_clean}")
print(f"Original dataset size: {len(df_metadata)}")
print(f"Rows with category overlaps: {total_to_remove} ({(total_to_remove / len(df_metadata)) * 100:.2f}%)")
print(f"Rows with description overlaps: {total_description_to_clean} ({(total_description_to_clean / len(df_metadata)) * 100:.2f}%)")

=== CATEGORY-TAG OVERLAP ANALYSIS ===
'minimalist' appears as category: 25658 rows
'successful_vibe' appears as category: 377 rows

=== DESCRIPTION-TAG OVERLAP ANALYSIS ===
'sharp' appears in description: 1387 rows
'minimalist' appears as category: 25658 rows
'successful_vibe' appears as category: 377 rows

=== DESCRIPTION-TAG OVERLAP ANALYSIS ===
'sharp' appears in description: 1387 rows
'abstract' appears in description: 4210 rows
'recognizable' appears in description: 4 rows
'relatable' appears in description: 10 rows
'abstract' appears in description: 4210 rows
'recognizable' appears in description: 4 rows
'relatable' appears in description: 10 rows
'minimalist' appears in description: 1317 rows

Total category values to set to NA: 26035
Total description values to set to NA: 6928
Original dataset size: 393297
Rows with category overlaps: 26035 (6.62%)
Rows with description overlaps: 6928 (1.76%)
'minimalist' appears in description: 1317 rows

Total category values to set to NA: 26

In [5]:
# Clean the data by setting overlapping values to NA and preserving tags
print("Cleaning data to set category-tag overlaps to NA and preserve tags...")

# Create a copy of the data for cleaning
df_cleaned = df_metadata.copy()

# Track tags that need to be preserved
tags_to_preserve = []

# Process categories that match top 10 tags
category_mask = df_cleaned['category'].str.lower().isin([tag.lower() for tag in top_10_tags])
categories_to_preserve = df_cleaned.loc[category_mask, 'category'].tolist()

# Set categories that match top 10 tags to NA
df_cleaned.loc[category_mask, 'category'] =  'na'

print(f"Categories set to NA: {category_mask.sum()}")
print(f"Categories to preserve as tags: {len(categories_to_preserve)}")

# Process descriptions that contain top 10 tags and preserve the found tags
descriptions_processed = 0
for tag in top_10_tags:
    description_mask = df_cleaned['description'].str.lower().str.contains(tag.lower(), na=False, regex=False)
    if description_mask.sum() > 0:
        # For each row where description contains this tag, add tag to preserve list
        for idx in df_cleaned[description_mask].index:
            tags_to_preserve.append((idx, tag))
        
        # Set descriptions that contain this tag to NA
        df_cleaned.loc[description_mask, 'description'] = 'na'
        descriptions_processed += description_mask.sum()
        print(f"Description containing '{tag}' set to NA: {description_mask.sum()} rows")

print(f"Total descriptions set to NA: {descriptions_processed}")
print(f"Tags to preserve from descriptions: {len(tags_to_preserve)}")

# Now update the tags column to preserve the removed category/description tags
def add_tag_to_tags_column(existing_tags, new_tag):
    """Add a new tag to the existing tags string if not already present"""
    if pd.isna(existing_tags) or existing_tags == '':
        return new_tag
    
    # Split existing tags and normalize
    existing_list = [t.strip().lower() for t in str(existing_tags).split(',')]
    new_tag_lower = new_tag.lower()
    
    # Only add if not already present
    if new_tag_lower not in existing_list:
        return f"{existing_tags}, {new_tag}"
    else:
        return existing_tags

# Add preserved categories to tags column
for idx, original_category in enumerate(categories_to_preserve):
    if pd.notna(original_category):
        row_idx = df_cleaned[category_mask].index[idx]
        current_tags = df_cleaned.loc[row_idx, 'tags']
        df_cleaned.loc[row_idx, 'tags'] = add_tag_to_tags_column(current_tags, original_category)

# Add preserved description tags to tags column
for row_idx, tag in tags_to_preserve:
    current_tags = df_cleaned.loc[row_idx, 'tags']
    df_cleaned.loc[row_idx, 'tags'] = add_tag_to_tags_column(current_tags, tag)

print(f"\n=== CLEANING RESULTS ===")
print(f"Total rows: {len(df_cleaned)}")
print(f"Categories set to NA: {category_mask.sum()}")
print(f"Descriptions set to NA: {descriptions_processed}")
print(f"Tags preserved in tags column: {len(categories_to_preserve) + len(tags_to_preserve)}")

# Show statistics about the cleaned data
print(f"\n=== CLEANED DATA STATISTICS ===")
print(f"Rows with valid categories: {df_cleaned['category'].notna().sum()}")
print(f"Rows with valid descriptions: {df_cleaned['description'].notna().sum()}")
print(f"Rows with valid companies: {df_cleaned['company'].notna().sum()}")
print(f"Rows with valid tags: {df_cleaned['tags'].notna().sum()}")

print(f"Percentage with valid categories: {(df_cleaned['category'].notna().sum() / len(df_cleaned)) * 100:.2f}%")
print(f"Percentage with valid descriptions: {(df_cleaned['description'].notna().sum() / len(df_cleaned)) * 100:.2f}%")

# Show sample of cleaned data
print(f"\n=== SAMPLE CLEANED DATA ===")
print(df_cleaned.head(10))

Cleaning data to set category-tag overlaps to NA and preserve tags...
Categories set to NA: 26035
Categories to preserve as tags: 26035
Description containing 'sharp' set to NA: 1387 rows
Description containing 'abstract' set to NA: 4174 rows
Description containing 'recognizable' set to NA: 4 rows
Description containing 'relatable' set to NA: 10 rows
Description containing 'abstract' set to NA: 4174 rows
Description containing 'recognizable' set to NA: 4 rows
Description containing 'relatable' set to NA: 10 rows
Description containing 'minimalist' set to NA: 1239 rows
Total descriptions set to NA: 6814
Tags to preserve from descriptions: 6814
Description containing 'minimalist' set to NA: 1239 rows
Total descriptions set to NA: 6814
Tags to preserve from descriptions: 6814

=== CLEANING RESULTS ===
Total rows: 393297
Categories set to NA: 26035
Descriptions set to NA: 6814
Tags preserved in tags column: 32849

=== CLEANED DATA STATISTICS ===
Rows with valid categories: 393261
Rows with

In [6]:
# Save the cleaned data
print(f"Saving cleaned data to {output_csv}...")
df_cleaned.to_csv(output_csv, index=False)

print(f"Cleaned data saved successfully!")

# Show final statistics
print(f"\n=== FINAL STATISTICS ===")
print(f"Total dataset rows: {len(df_cleaned):,}")
print(f"Rows with valid categories: {df_cleaned['category'].notna().sum():,}")
print(f"Rows with valid descriptions: {df_cleaned['description'].notna().sum():,}")
print(f"Categories set to NA: {category_mask.sum():,}")
print(f"Descriptions set to NA: {descriptions_processed:,}")
print(f"Category NA rate: {(category_mask.sum() / len(df_cleaned)) * 100:.2f}%")
print(f"Description NA rate: {(descriptions_processed / len(df_cleaned)) * 100:.2f}%")
print(f"Tags preserved: {len(categories_to_preserve) + len(tags_to_preserve):,}")
print(f"Output file: {output_csv}")
print(f"Columns: {list(df_cleaned.columns)}")

# Verify the cleaning worked
print(f"\n=== VERIFICATION ===")
print("Checking if any of the top 10 tags still appear as categories:")
for tag in top_10_tags:
    remaining = (df_cleaned['category'].str.lower() == tag.lower()).sum()
    print(f"  '{tag}': {remaining} rows (should be 0)")

print("\nChecking NA values in cleaned data:")
print(f"  Category NA count: {df_cleaned['category'].isna().sum()}")
print(f"  Description NA count: {df_cleaned['description'].isna().sum()}")
print(f"  Company NA count: {df_cleaned['company'].isna().sum()}")
print(f"  Tags NA count: {df_cleaned['tags'].isna().sum()}")

# Verify tag preservation
print(f"\n=== TAG PRESERVATION VERIFICATION ===")
print("Checking if removed category/description tags were preserved in tags column:")
sample_preserved = 0
for tag in top_10_tags[:5]:  # Check first 5 tags as example
    # Count how many rows have this tag in their tags column
    has_tag = df_cleaned['tags'].str.lower().str.contains(tag.lower(), na=False, regex=False).sum()
    print(f"  '{tag}' now in tags column: {has_tag} rows")
    sample_preserved += has_tag

print(f"Sample verification shows {sample_preserved} preserved tags in first 5 categories")

Saving cleaned data to ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata5.csv...
Cleaned data saved successfully!

=== FINAL STATISTICS ===
Total dataset rows: 393,297
Rows with valid categories: 393,261
Rows with valid descriptions: 393,297
Categories set to NA: 26,035
Descriptions set to NA: 6,814
Category NA rate: 6.62%
Description NA rate: 1.73%
Tags preserved: 32,849
Output file: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata5.csv
Columns: ['id', 'company', 'description', 'category', 'tags']

=== VERIFICATION ===
Checking if any of the top 10 tags still appear as categories:
  'sharp': 0 rows (should be 0)
  'vector_art': 0 rows (should be 0)
  'even_edges': 0 rows (should be 0)
  'thoughtprovoking': 0 rows (should be 0)
Cleaned data saved successfully!

=== FINAL STATISTICS ===
Total dataset rows: 393,297
Rows with valid categories: 393,261
Rows with valid descriptions: 393,297
Categories set to NA: 26,035
Descriptions set to NA: 6,814
Category NA rate: 6.62%
D

In [7]:
# Additional analysis of the cleaned dataset
print("=== CLEANED DATASET ANALYSIS ===")

# Top categories in cleaned data (excluding NA)
top_categories_cleaned = df_cleaned['category'].value_counts().head(20)
print(f"\n=== TOP 20 CATEGORIES IN CLEANED DATA ===")
for i, (category, count) in enumerate(top_categories_cleaned.items()):
    percentage = (count / df_cleaned['category'].notna().sum()) * 100
    print(f"{i+1:2d}. {category:<25} {count:>6} logos ({percentage:>5.2f}%)")

# Compare category distribution before and after cleaning
print(f"\n=== CATEGORY DISTRIBUTION COMPARISON ===")
print(f"Original unique categories: {df_metadata['category'].nunique()}")
print(f"Cleaned unique categories (excl. NA): {df_cleaned['category'].nunique()}")
print(f"NA categories added: {df_cleaned['category'].isna().sum()}")

# Check data quality
print(f"\n=== DATA QUALITY CHECK ===")
print(f"Total rows: {len(df_cleaned)}")
print(f"Rows with valid company: {df_cleaned['company'].notna().sum()}")
print(f"Rows with valid description: {df_cleaned['description'].notna().sum()}")
print(f"Rows with valid category: {df_cleaned['category'].notna().sum()}")
print(f"Rows with valid tags: {df_cleaned['tags'].notna().sum()}")

# Show data completeness percentages
print(f"\n=== DATA COMPLETENESS ===")
print(f"Company completeness: {(df_cleaned['company'].notna().sum() / len(df_cleaned)) * 100:.2f}%")
print(f"Description completeness: {(df_cleaned['description'].notna().sum() / len(df_cleaned)) * 100:.2f}%")
print(f"Category completeness: {(df_cleaned['category'].notna().sum() / len(df_cleaned)) * 100:.2f}%")
print(f"Tags completeness: {(df_cleaned['tags'].notna().sum() / len(df_cleaned)) * 100:.2f}%")

# Show some examples of cleaned rows
print(f"\n=== EXAMPLES OF CLEANED ROWS ===")
print("Rows with NA categories (and their preserved tags):")
na_category_examples = df_cleaned[df_cleaned['category'].isna()].head(5)
if len(na_category_examples) > 0:
    for i, (_, row) in enumerate(na_category_examples.iterrows()):
        tags_preview = str(row['tags'])[:100] + "..." if len(str(row['tags'])) > 100 else str(row['tags'])
        print(f"  {i+1}. Company: {row['company']}")
        print(f"      Tags: {tags_preview}")

print("\nRows with NA descriptions (and their preserved tags):")
na_description_examples = df_cleaned[df_cleaned['description'].isna()].head(5)
if len(na_description_examples) > 0:
    for i, (_, row) in enumerate(na_description_examples.iterrows()):
        tags_preview = str(row['tags'])[:100] + "..." if len(str(row['tags'])) > 100 else str(row['tags'])
        print(f"  {i+1}. Company: {row['company']}, Category: {row['category']}")
        print(f"      Tags: {tags_preview}")

# Show examples of tag preservation
print(f"\n=== TAG PRESERVATION EXAMPLES ===")
for tag in top_10_tags[:3]:  # Show examples for first 3 tags
    # Find rows where this tag was likely preserved
    has_tag_mask = df_cleaned['tags'].str.lower().str.contains(tag.lower(), na=False, regex=False)
    na_category_mask = df_cleaned['category'].isna()
    
    preserved_examples = df_cleaned[has_tag_mask & na_category_mask].head(2)
    if len(preserved_examples) > 0:
        print(f"\nExamples where '{tag}' was preserved in tags:")
        for i, (_, row) in enumerate(preserved_examples.iterrows()):
            print(f"  {i+1}. Company: {row['company']}")
            print(f"      Tags: {row['tags']}")

=== CLEANED DATASET ANALYSIS ===

=== TOP 20 CATEGORIES IN CLEANED DATA ===
 1. na                         26043 logos ( 6.62%)
 2. food                        7863 logos ( 2.00%)
 3. real_estate                 7809 logos ( 1.99%)
 4. education                   7667 logos ( 1.95%)
 5. restaurant                  7338 logos ( 1.87%)
 6. sports                      7013 logos ( 1.78%)
 7. non_profit                  5917 logos ( 1.50%)
 8. music                       5885 logos ( 1.50%)
 9. retail                      5132 logos ( 1.30%)
10. entertainment               5021 logos ( 1.28%)
11. technology                  4952 logos ( 1.26%)
12. construction                4462 logos ( 1.13%)
13. design                      3850 logos ( 0.98%)
14. photography                 3727 logos ( 0.95%)
15. fashion                     3481 logos ( 0.89%)
16. healthcare                  3399 logos ( 0.86%)
17. hospitality                 3197 logos ( 0.81%)

=== TOP 20 CATEGORIES IN CLEANED DATA =