# Amazing Logos V4 - Step 4: Categories Analysis (Post-Filtering)

This notebook analyzes the categories in the filtered dataset:
- Loads `metadata3.csv` (in `amazing_logos_v4_cleanup/`) produced by the Step 4 filtering notebook
- Analyzes category distribution and frequency
- Computes summary statistics for the per-category counts
- Saves the category counts to `categories_analysis2.csv`

In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input/output locations (relative paths; both files live in the same cleanup folder)
input_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata3.csv')
output_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/categories_analysis2.csv')

# Echo both paths so the run log records which files were used
print(f"Input CSV: {input_csv}", f"Output CSV: {output_csv}", sep="\n")

# Report whether the filtered metadata is available. This only prints a
# message; execution is not stopped here when the file is missing.
if input_csv.exists():
    print("Input file exists.")
else:
    print(f"ERROR: Input file {input_csv} does not exist!")
    print("Please run Step 4 filtering notebook first.")

Input CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata3.csv
Output CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis2.csv
Input file exists.


In [7]:
# Read the filtered metadata into a DataFrame using pandas' default CSV parsing
print("Loading filtered metadata CSV...")
df = pd.read_csv(input_csv)

# Quick sanity check: row count, column names, and a peek at the first rows
print(f"Loaded {df.shape[0]} rows")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 5 rows:")
print(df.head(5))

Loading filtered metadata CSV...
Loaded 367266 rows
Columns: ['id', 'company', 'description', 'category', 'tags']

First 5 rows:
                      id                                    company  \
0  amazing_logo_v4000000  Simple elegant logo for Mandarin Oriental   
1  amazing_logo_v4000001               Simple elegant logo for Alfa   
2  amazing_logo_v4000002            Simple elegant logo for Kuraray   
3  amazing_logo_v4000003       Simple elegant logo for Valwood Park   
4  amazing_logo_v4000004            Simple elegant logo for Cinepaq   

                       description     category  \
0        Fan Hong kong Lines Paper  Hospitality   
1         Hexagon Poland Triangles    Chemicals   
2                G Japan K Outline  Safty Glass   
3    Lines Rounded United states V         Park   
4  C Circle Film reel Spain Square         Film   

                                                tags  
0  successful vibe, minimalist, thought-provoking...  
1  successful vibe, minimal

In [8]:
# Analyze categories
print("=== CATEGORY ANALYSIS (POST-FILTERING) ===")

# Count logos per category. value_counts() sorts by descending frequency and
# drops missing values by default, so rows without a category are NOT part of
# category_counts.
category_counts = (
    df['category']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

total_rows = len(df)
categorized_rows = int(category_counts['count'].sum())
missing_category_rows = int(df['category'].isna().sum())

print(f"Total unique categories: {len(category_counts)}")
print(f"Total logos analyzed: {categorized_rows}")
# Make the gap between loaded rows and analyzed rows explicit instead of
# leaving it as an unexplained difference between the two totals.
if missing_category_rows:
    print(
        f"Logos without a category (excluded from counts): "
        f"{missing_category_rows} of {total_rows}"
    )

# Basic statistics over the per-category counts
print("\n=== CATEGORY STATISTICS ===")
print(f"Mean logos per category: {category_counts['count'].mean():.1f}")
print(f"Median logos per category: {category_counts['count'].median():.1f}")
print(f"Standard deviation: {category_counts['count'].std():.1f}")
print(f"Min logos in category: {category_counts['count'].min()}")
print(f"Max logos in category: {category_counts['count'].max()}")

# Show top 20 categories. Percentages are relative to ALL loaded rows
# (including rows without a category), not only the categorized ones.
print("\n=== TOP 20 CATEGORIES (FILTERED DATA) ===")
top_categories = category_counts.head(20)
for rank, (category, count) in enumerate(
    zip(top_categories['category'], top_categories['count']), start=1
):
    percentage = count / total_rows * 100
    print(f"{rank:2d}. {category:<25} {count:>6} logos ({percentage:>5.2f}%)")

=== CATEGORY ANALYSIS (POST-FILTERING) ===
Total unique categories: 54448
Total logos analyzed: 367232

=== CATEGORY STATISTICS ===
Mean logos per category: 6.7
Median logos per category: 1.0
Standard deviation: 90.9
Min logos in category: 1
Max logos in category: 6525

=== TOP 20 CATEGORIES (FILTERED DATA) ===
 1. Real Estate                 6525 logos ( 1.78%)
 2. Education                   6502 logos ( 1.77%)
 3. Restaurant                  6278 logos ( 1.71%)
 4. Food                        6031 logos ( 1.64%)
 5. Sports                      5800 logos ( 1.58%)
 6. Music                       4687 logos ( 1.28%)
 7. Retail                      4170 logos ( 1.14%)
 8. Entertainment               4125 logos ( 1.12%)
 9. Technology                  4109 logos ( 1.12%)
10. Construction                3797 logos ( 1.03%)
11. Design                      3734 logos ( 1.02%)
12. Healthcare                  3010 logos ( 0.82%)
13. Hospitality                 2950 logos ( 0.80%)
14. Photogr

In [9]:
# Persist the per-category counts (the DataFrame index is not written)
print(f"Saving category analysis to {output_csv}...")
category_counts.to_csv(output_csv, index=False)
print("Category analysis saved successfully!")

# Recap what was written
print("\n=== FINAL STATISTICS ===")
print(f"Total categories analyzed: {category_counts.shape[0]}")
print(f"Total logos: {category_counts['count'].sum():,}")
print(f"Output file: {output_csv}")
print(f"Columns in output: {category_counts.columns.tolist()}")

# Preview the first 15 rows of the table that was saved
print("\n=== SAMPLE OUTPUT DATA ===")
print(category_counts.iloc[:15])

Saving category analysis to ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis2.csv...
Category analysis saved successfully!

=== FINAL STATISTICS ===
Total categories analyzed: 54448
Total logos: 367,232
Output file: ..\output\amazing_logos_v4\data\amazing_logos_v4_categories_analysis2.csv
Columns in output: ['category', 'count']

=== SAMPLE OUTPUT DATA ===
         category  count
0     Real Estate   6525
1       Education   6502
2      Restaurant   6278
3            Food   6031
4          Sports   5800
5           Music   4687
6          Retail   4170
7   Entertainment   4125
8      Technology   4109
9    Construction   3797
10         Design   3734
11     Healthcare   3010
12    Hospitality   2950
13    Photography   2898
14     Non Profit   2829


In [10]:
# Additional distribution analysis
print("=== CATEGORY DISTRIBUTION ANALYSIS ===")

# Bucket categories by the number of logos each one contains
logos_per_category = category_counts['count']
high_volume = category_counts[logos_per_category.ge(1000)]
medium_volume = category_counts[logos_per_category.ge(100) & logos_per_category.lt(1000)]
low_volume = category_counts[logos_per_category.ge(10) & logos_per_category.lt(100)]
very_low_volume = category_counts[logos_per_category.lt(10)]

print(f"High volume categories (≥1000 logos): {len(high_volume)} categories")
print(f"Medium volume categories (100-999 logos): {len(medium_volume)} categories")
print(f"Low volume categories (10-99 logos): {len(low_volume)} categories")
print(f"Very low volume categories (<10 logos): {len(very_low_volume)} categories")

# Coverage: logos accounted for by the first N rows of category_counts.
# NOTE(review): "top N" assumes category_counts is ordered by descending count,
# as produced by value_counts() — confirm if the table is rebuilt differently.
# The denominator is every loaded row in df.
total_logos = len(df)
top_10_coverage = logos_per_category.head(10).sum()
top_50_coverage = logos_per_category.head(50).sum()
top_100_coverage = logos_per_category.head(100).sum()

print("\n=== COVERAGE ANALYSIS ===")
print(f"Top 10 categories cover: {top_10_coverage:,} logos ({top_10_coverage/total_logos*100:.1f}%)")
print(f"Top 50 categories cover: {top_50_coverage:,} logos ({top_50_coverage/total_logos*100:.1f}%)")
print(f"Top 100 categories cover: {top_100_coverage:,} logos ({top_100_coverage/total_logos*100:.1f}%)")

=== CATEGORY DISTRIBUTION ANALYSIS ===
High volume categories (≥1000 logos): 48 categories
Medium volume categories (100-999 logos): 316 categories
Low volume categories (10-99 logos): 2765 categories
Very low volume categories (<10 logos): 51319 categories

=== COVERAGE ANALYSIS ===
Top 10 categories cover: 52,024 logos (14.2%)
Top 50 categories cover: 118,982 logos (32.4%)
Top 100 categories cover: 153,361 logos (41.8%)
