# Amazing Logos V4 - Step 3: Category Analysis

This notebook performs deeper analysis of the categories from the structured data:
- Loads amazing_logos_v4_metadata2.csv from Step 2
- Analyzes category distribution and frequency
- Creates category statistics and insights
- Outputs a CSV with category counts for further analysis

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Paths
input_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata2.csv')
output_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/categories_analysis.csv')
output_csv2 = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata3.csv')

print(f"Input CSV: {input_csv}")
print(f"Output CSV: {output_csv}")

# Check if input exists
if not input_csv.exists():
    print(f"ERROR: Input file {input_csv} does not exist!")
    print("Please run Step 2 notebook first to generate the structured data.")
else:
    print(f"Input file exists.")

Input CSV: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata2.csv
Output CSV: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\categories_analysis.csv
Input file exists.


In [2]:
# Load the structured metadata CSV
print("Loading structured metadata CSV...")
df = pd.read_csv(input_csv)

print(f"Loaded {len(df)} rows")
print(f"Columns: {list(df.columns)}")
print(f"\nDataset info:")
print(df.info())

print(f"\nFirst 5 rows:")
print(df.head())

Loading structured metadata CSV...
Loaded 393298 rows
Columns: ['id', 'company', 'description', 'category', 'tags']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393298 entries, 0 to 393297
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           393298 non-null  object
 1   company      393298 non-null  object
 2   description  393298 non-null  object
 3   category     393264 non-null  object
 4   tags         393298 non-null  object
dtypes: object(5)
memory usage: 15.0+ MB
None

First 5 rows:
                      id                                    company  \
0  amazing_logo_v4000000  Simple elegant logo for Mandarin Oriental   
1  amazing_logo_v4000001               Simple elegant logo for Alfa   
2  amazing_logo_v4000002            Simple elegant logo for Kuraray   
3  amazing_logo_v4000003       Simple elegant logo for Valwood Park   
4  amazing_logo_v4000004            Simple elegan

In [3]:
def normalize_category(category):
    """Normalisiert Kategorienamen für besseren Vergleich"""
    category = re.sub(r'[^\w\s]', '', str(category).lower().strip())
    category = re.sub(r'\s+', ' ', category)  # Mehrere Leerzeichen zu einem reduzieren
    return category.replace(' ', '_')

In [4]:
# Normalize categories
df['category'] = df['category'].apply(normalize_category)
df.loc[df.category.isna(), 'category'] = 'na'

In [5]:
df.to_csv(output_csv2, index=False)

In [6]:
# Analyze categories
print("=== CATEGORY ANALYSIS ===")

# Get category counts
category_counts = df['category'].value_counts().reset_index()
category_counts.columns = ['category', 'count']

print(f"Total unique categories: {len(category_counts)}")
print(f"Total logos analyzed: {category_counts['count'].sum()}")

# Basic statistics
print(f"\n=== CATEGORY STATISTICS ===")
print(f"Mean logos per category: {category_counts['count'].mean():.1f}")
print(f"Median logos per category: {category_counts['count'].median():.1f}")
print(f"Standard deviation: {category_counts['count'].std():.1f}")
print(f"Min logos in category: {category_counts['count'].min()}")
print(f"Max logos in category: {category_counts['count'].max()}")

# Show top 20 categories
print(f"\n=== TOP 20 CATEGORIES ===")
for i, row in category_counts.head(20).iterrows():
    percentage = (row['count'] / len(df)) * 100
    print(f"{i+1:2d}. {row['category']:<25} {row['count']:>6} logos ({percentage:>5.2f}%)")

=== CATEGORY ANALYSIS ===
Total unique categories: 44811
Total logos analyzed: 393298

=== CATEGORY STATISTICS ===
Mean logos per category: 8.8
Median logos per category: 1.0
Standard deviation: 170.3
Min logos in category: 1
Max logos in category: 25630

=== TOP 20 CATEGORIES ===
 1. minimalist                 25630 logos ( 6.52%)
 2. food                        7858 logos ( 2.00%)
 3. real_estate                 7807 logos ( 1.99%)
 4. education                   7664 logos ( 1.95%)
 5. restaurant                  7340 logos ( 1.87%)
 6. sports                      7012 logos ( 1.78%)
 7. non_profit                  5896 logos ( 1.50%)
 8. music                       5882 logos ( 1.50%)
 9. retail                      5116 logos ( 1.30%)
10. entertainment               5008 logos ( 1.27%)
11. technology                  4940 logos ( 1.26%)
12. construction                4421 logos ( 1.12%)
13. design                      3844 logos ( 0.98%)
14. photography                 3727 logos

In [7]:
# Analyze category distribution patterns
print("=== CATEGORY DISTRIBUTION ANALYSIS ===")

# Categories with different count ranges
high_volume = category_counts[category_counts['count'] >= 1000]
medium_volume = category_counts[(category_counts['count'] >= 100) & (category_counts['count'] < 1000)]
low_volume = category_counts[(category_counts['count'] >= 10) & (category_counts['count'] < 100)]
very_low_volume = category_counts[category_counts['count'] < 10]

print(f"High volume categories (≥1000 logos): {len(high_volume)} categories")
print(f"Medium volume categories (100-999 logos): {len(medium_volume)} categories")
print(f"Low volume categories (10-99 logos): {len(low_volume)} categories")
print(f"Very low volume categories (<10 logos): {len(very_low_volume)} categories")

# Cumulative analysis
total_logos = len(df)
top_10_coverage = category_counts.head(10)['count'].sum()
top_50_coverage = category_counts.head(50)['count'].sum()
top_100_coverage = category_counts.head(100)['count'].sum()

print(f"\n=== COVERAGE ANALYSIS ===")
print(f"Top 10 categories cover: {top_10_coverage:,} logos ({top_10_coverage/total_logos*100:.1f}%)")
print(f"Top 50 categories cover: {top_50_coverage:,} logos ({top_50_coverage/total_logos*100:.1f}%)")
print(f"Top 100 categories cover: {top_100_coverage:,} logos ({top_100_coverage/total_logos*100:.1f}%)")

=== CATEGORY DISTRIBUTION ANALYSIS ===
High volume categories (≥1000 logos): 52 categories
Medium volume categories (100-999 logos): 308 categories
Low volume categories (10-99 logos): 2541 categories
Very low volume categories (<10 logos): 41910 categories

=== COVERAGE ANALYSIS ===
Top 10 categories cover: 85,213 logos (21.7%)
Top 50 categories cover: 166,233 logos (42.3%)
Top 100 categories cover: 200,592 logos (51.0%)


In [8]:
# Save category analysis to CSV
print(f"Saving category analysis to {output_csv}...")
category_counts.to_csv(output_csv, index=False)

print(f"Category analysis saved successfully!")

# Show final statistics
print(f"\n=== FINAL STATISTICS ===")
print(f"Total categories analyzed: {len(category_counts)}")
print(f"Total logos: {category_counts['count'].sum():,}")
print(f"Output file: {output_csv}")
print(f"Columns in output: {list(category_counts.columns)}")

# Show sample of saved data
print(f"\n=== SAMPLE OUTPUT DATA ===")
print(category_counts.head(15))

Saving category analysis to ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\categories_analysis.csv...
Category analysis saved successfully!

=== FINAL STATISTICS ===
Total categories analyzed: 44811
Total logos: 393,298
Output file: ..\..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\categories_analysis.csv
Columns in output: ['category', 'count']

=== SAMPLE OUTPUT DATA ===
         category  count
0      minimalist  25630
1            food   7858
2     real_estate   7807
3       education   7664
4      restaurant   7340
5          sports   7012
6      non_profit   5896
7           music   5882
8          retail   5116
9   entertainment   5008
10     technology   4940
11   construction   4421
12         design   3844
13    photography   3727
14        fashion   3481


In [9]:
# Additional insights and patterns
print("=== CATEGORY INSIGHTS ===")

# Find categories that might be related or similar
print("\n=== BUSINESS SECTORS ANALYSIS ===")

# Define some business sector keywords to group categories
business_sectors = {
    'Technology': ['tech', 'software', 'app', 'digital', 'computer', 'internet', 'web', 'IT'],
    'Food & Beverage': ['food', 'restaurant', 'cafe', 'coffee', 'bar', 'beverage', 'drink', 'kitchen'],
    'Healthcare': ['health', 'medical', 'hospital', 'clinic', 'care', 'wellness', 'fitness'],
    'Finance': ['bank', 'financial', 'finance', 'investment', 'money', 'credit', 'insurance'],
    'Education': ['education', 'school', 'university', 'learning', 'training', 'academic'],
    'Entertainment': ['entertainment', 'music', 'game', 'sport', 'media', 'film', 'movie'],
    'Retail': ['shop', 'store', 'retail', 'fashion', 'clothing', 'accessories'],
    'Services': ['service', 'consulting', 'agency', 'marketing', 'design', 'cleaning']
}

# Count categories by sector
sector_counts = {}
for sector, keywords in business_sectors.items():
    count = 0
    for _, row in category_counts.iterrows():
        category_lower = row['category'].lower()
        if any(keyword in category_lower for keyword in keywords):
            count += row['count']
    sector_counts[sector] = count

# Display sector analysis
print("Business sector distribution (approximate):")
for sector, count in sorted(sector_counts.items(), key=lambda x: x[1], reverse=True):
    if count > 0:
        percentage = (count / total_logos) * 100
        print(f"  {sector:<15}: {count:>6,} logos ({percentage:>5.1f}%)")

print(f"\n=== BOTTOM 10 CATEGORIES ===")
print("Least common categories:")
for i, row in category_counts.tail(10).iterrows():
    print(f"  {row['category']:<30} {row['count']:>3} logos")

=== CATEGORY INSIGHTS ===

=== BUSINESS SECTORS ANALYSIS ===
Business sector distribution (approximate):
  Entertainment  : 33,939 logos (  8.6%)
  Food & Beverage: 31,392 logos (  8.0%)
  Services       : 27,730 logos (  7.1%)
  Technology     : 22,671 logos (  5.8%)
  Healthcare     : 22,514 logos (  5.7%)
  Retail         : 17,929 logos (  4.6%)
  Education      : 12,315 logos (  3.1%)
  Finance        : 10,124 logos (  2.6%)

=== BOTTOM 10 CATEGORIES ===
Least common categories:
  raw_materials_supplier           1 logos
  surfing_tourism                  1 logos
  online_tool_for_charitable_organizations   1 logos
  stock_clothing                   1 logos
  animal_veterinary                1 logos
  control                          1 logos
  gym_supplement                   1 logos
  industrial_fabric_agent          1 logos
  film_photography_production      1 logos
  secure_legal_document_storage    1 logos


In [10]:
mask = df['category'] == 'minimalist'
df[mask].head(10)

Unnamed: 0,id,company,description,category,tags
9937,amazing_logo_v4010124,Simple elegant logo for Elka Minimalista,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
9948,amazing_logo_v4010135,Simple elegant logo for parrot,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
9965,amazing_logo_v4010152,Simple elegant logo for MIXINGLE,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
9969,amazing_logo_v4010156,Simple elegant logo for Duck,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
9994,amazing_logo_v4010181,Simple elegant logo for Black Knight,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
10008,amazing_logo_v4010195,Simple elegant logo for David,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
10015,amazing_logo_v4010202,Simple elegant logo for Reisebuero Boehm,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
10016,amazing_logo_v4010203,Simple elegant logo for Lotus Massage Therapy,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
10029,amazing_logo_v4010216,Simple elegant logo for Urbanscape8jpg,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
10097,amazing_logo_v4010284,Simple elegant logo for NC Quilt Symposium,successful vibe,minimalist,"thought-provoking, abstract, recognizable, rel..."
