# Amazing Logos V4 - Step 3: Tags Analysis

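This notebook performs deeper analysis of the tags from the structured data:
- Loads the structured metadata CSV (`metadata3.csv`) produced by the earlier cleanup steps
- Normalizes the tags and saves the updated metadata as `metadata5.csv`
- Analyzes tag distribution and frequency
- Creates tag statistics and insights
- Outputs a CSV with tag counts (`tags_analysis.csv`) for further analysis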

In [None]:
import pandas as pd
import re
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import sys

# Add utils folder to path
utils_path = Path('../../utils')
sys.path.append(str(utils_path))

# Import consolidation functions
from text import normalize_tags

# File locations: the metadata read by this notebook and the two CSVs it writes
input_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata3.csv')
output_csv = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/tags_analysis.csv')
output_csv2 = Path('../../output/amazing_logos_v4/data/amazing_logos_v4_cleanup/metadata5.csv')

print(f"Input CSV: {input_csv}")
print(f"Output CSV: {output_csv}")

# Report whether the input is present; a missing file is only announced here,
# execution is not stopped
input_found = input_csv.exists()
if input_found:
    print(f"Input file exists.")
else:
    print(f"ERROR: Input file {input_csv} does not exist!")
    print("Please run Step 2 notebook first to generate the structured data.")

Input CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_metadata3.csv
Output CSV: ..\output\amazing_logos_v4\data\amazing_logos_v4_tags_analysis.csv
Input file exists.


In [2]:
# Load the structured metadata CSV into `df` and print a quick overview of it
print("Loading structured metadata CSV...")
df = pd.read_csv(input_csv)

print(f"Loaded {len(df)} rows")
print(f"Columns: {list(df.columns)}")
print(f"\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Preview the tags of the first few rows (fewer if the frame is shorter)
print(f"\nFirst 5 rows (tags column):")
for position, tags in enumerate(df['tags'].head(5), start=1):
    print(f"{position}. {tags}")

Loading structured metadata CSV...
Loaded 393297 rows
Columns: ['id', 'company', 'description', 'category', 'tags']

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393297 entries, 0 to 393296
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           393297 non-null  object
 1   company      393297 non-null  object
 2   description  393297 non-null  object
 3   category     393261 non-null  object
 4   tags         393297 non-null  object
dtypes: object(5)
memory usage: 15.0+ MB
None

First 5 rows (tags column):
1. successful vibe, minimalist, thought-provoking, abstract, recognizable, relatable, sharp, vector art, even edges, black and white
2. successful vibe, minimalist, thought-provoking, abstract, recognizable, relatable, sharp, vector art, even edges, black and white
3. successful vibe, minimalist, thought-provoking, abstract, recognizable, relatable, sharp, vector art, even edges, black a

In [4]:
# Normalize the tags column with the imported normalize_tags helper, then
# replace any value that is still missing with the 'na' placeholder
normalized_tags = df['tags'].apply(normalize_tags)
df['tags'] = normalized_tags.where(normalized_tags.notna(), 'na')

In [5]:
# Persist the metadata with its normalized tags column (row index is not written)
df.to_csv(path_or_buf=output_csv2, index=False)

In [6]:
# Tag frequency analysis over the comma-separated `tags` column
print("=== TAGS ANALYSIS ===")

# Flatten every row's tag string into one list of trimmed, non-empty tags;
# rows whose value is missing or blank contribute nothing
all_tags = [
    piece.strip()
    for raw_tags in df['tags']
    if pd.notna(raw_tags) and raw_tags.strip()
    for piece in raw_tags.split(',')
    if piece.strip()
]

print(f"Total tag instances: {len(all_tags)}")

# Tally each tag and rank the result from most to least frequent
tag_counts = Counter(all_tags)
tag_counts_df = (
    pd.DataFrame(list(tag_counts.items()), columns=['tag', 'count'])
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

print(f"Total unique tags: {len(tag_counts_df)}")
print(f"Total tag instances: {tag_counts_df['count'].sum()}")

# Summary statistics of the per-tag occurrence counts
occurrence_counts = tag_counts_df['count']
print(f"\n=== TAG STATISTICS ===")
print(f"Mean occurrences per tag: {occurrence_counts.mean():.1f}")
print(f"Median occurrences per tag: {occurrence_counts.median():.1f}")
print(f"Standard deviation: {occurrence_counts.std():.1f}")
print(f"Min occurrences: {occurrence_counts.min()}")
print(f"Max occurrences: {occurrence_counts.max()}")

# List the 30 most frequent tags with their share of all tag instances
print(f"\n=== TOP 30 TAGS ===")
top_tags = tag_counts_df.head(30)
for rank, (tag, count) in enumerate(zip(top_tags['tag'], top_tags['count']), start=1):
    percentage = (count / len(all_tags)) * 100
    print(f"{rank:2d}. {tag:<30} {count:>7} occurrences ({percentage:>5.2f}%)")

=== TAGS ANALYSIS ===
Total tag instances: 3539625
Total unique tags: 6525
Total tag instances: 3539625

=== TAG STATISTICS ===
Mean occurrences per tag: 542.5
Median occurrences per tag: 1.0
Standard deviation: 14390.3
Min occurrences: 1
Max occurrences: 393297

=== TOP 30 TAGS ===
 1. sharp                           393297 occurrences (11.11%)
 2. vector_art                      393297 occurrences (11.11%)
 3. even_edges                      393297 occurrences (11.11%)
 4. thoughtprovoking                393297 occurrences (11.11%)
 5. abstract                        393297 occurrences (11.11%)
 6. recognizable                    393297 occurrences (11.11%)
 7. relatable                       393297 occurrences (11.11%)
 8. minimalist                      367639 occurrences (10.39%)
 9. successful_vibe                 367262 occurrences (10.38%)
10. black_and_white                   9936 occurrences ( 0.28%)
11. entertainment                      841 occurrences ( 0.02%)
12. food    

In [7]:
# How the tags spread across frequency bands, and how much the top tags cover
print("=== TAG DISTRIBUTION ANALYSIS ===")

# Split the ranked tags into four bands by occurrence count
per_tag_count = tag_counts_df['count']
high_freq_tags = tag_counts_df[per_tag_count.ge(50000)]
medium_freq_tags = tag_counts_df[per_tag_count.ge(10000) & per_tag_count.lt(50000)]
low_freq_tags = tag_counts_df[per_tag_count.ge(1000) & per_tag_count.lt(10000)]
very_low_freq_tags = tag_counts_df[per_tag_count.lt(1000)]

print(f"High frequency tags (≥50,000 occurrences): {len(high_freq_tags)} tags")
print(f"Medium frequency tags (10,000-49,999 occurrences): {len(medium_freq_tags)} tags")
print(f"Low frequency tags (1,000-9,999 occurrences): {len(low_freq_tags)} tags")
print(f"Very low frequency tags (<1,000 occurrences): {len(very_low_freq_tags)} tags")

# Instances accounted for by the N most frequent tags (the frame is already
# sorted by count, descending)
total_tag_instances = len(all_tags)
top_10_tag_coverage = per_tag_count.head(10).sum()
top_50_tag_coverage = per_tag_count.head(50).sum()
top_100_tag_coverage = per_tag_count.head(100).sum()

print(f"\n=== TAG COVERAGE ANALYSIS ===")
print(f"Top 10 tags cover: {top_10_tag_coverage:,} instances ({top_10_tag_coverage/total_tag_instances*100:.1f}%)")
print(f"Top 50 tags cover: {top_50_tag_coverage:,} instances ({top_50_tag_coverage/total_tag_instances*100:.1f}%)")
print(f"Top 100 tags cover: {top_100_tag_coverage:,} instances ({top_100_tag_coverage/total_tag_instances*100:.1f}%)")

=== TAG DISTRIBUTION ANALYSIS ===
High frequency tags (≥50,000 occurrences): 9 tags
Medium frequency tags (10,000-49,999 occurrences): 0 tags
Low frequency tags (1,000-9,999 occurrences): 1 tags
Very low frequency tags (<1,000 occurrences): 6515 tags

=== TAG COVERAGE ANALYSIS ===
Top 10 tags cover: 3,497,916 instances (98.8%)
Top 50 tags cover: 3,510,895 instances (99.2%)
Top 100 tags cover: 3,516,048 instances (99.3%)


In [8]:
# Write the ranked tag counts to disk and echo a short summary of what was saved
print(f"Saving tag analysis to {output_csv}...")
tag_counts_df.to_csv(path_or_buf=output_csv, index=False)

print(f"Tag analysis saved successfully!")

# Summary of the saved table
saved_columns = tag_counts_df.columns.tolist()
saved_instances = tag_counts_df['count'].sum()
print(f"\n=== FINAL STATISTICS ===")
print(f"Total tags analyzed: {len(tag_counts_df)}")
print(f"Total tag instances: {saved_instances:,}")
print(f"Output file: {output_csv}")
print(f"Columns in output: {saved_columns}")

# First 15 rows of the saved table, i.e. the most frequent tags
print(f"\n=== SAMPLE OUTPUT DATA ===")
print(tag_counts_df.iloc[:15])

Saving tag analysis to ..\output\amazing_logos_v4\data\amazing_logos_v4_tags_analysis.csv...
Tag analysis saved successfully!

=== FINAL STATISTICS ===
Total tags analyzed: 6525
Total tag instances: 3,539,625
Output file: ..\output\amazing_logos_v4\data\amazing_logos_v4_tags_analysis.csv
Columns in output: ['tag', 'count']

=== SAMPLE OUTPUT DATA ===
                 tag   count
0              sharp  393297
1         vector_art  393297
2         even_edges  393297
3   thoughtprovoking  393297
4           abstract  393297
5       recognizable  393297
6          relatable  393297
7         minimalist  367639
8    successful_vibe  367262
9    black_and_white    9936
10     entertainment     841
11              food     780
12            retail     765
13        restaurant     676
14         education     616


In [9]:
# Additional insights: rough theme grouping, the rarest tags, and per-logo usage
print("=== TAG INSIGHTS ===")

# Analyze tag themes
print("\n=== TAG THEMES ANALYSIS ===")

# Keyword lists used to bucket tags into rough themes
tag_themes = {
    'Style & Aesthetics': ['minimalist', 'elegant', 'modern', 'stylish', 'sleek', 'sophisticated', 'clean', 'simple', 'bold', 'artistic'],
    'Emotions & Feelings': ['professional', 'confident', 'successful', 'trustworthy', 'friendly', 'creative', 'innovative', 'energetic', 'calm', 'warm'],
    'Visual Elements': ['geometric', 'abstract', 'colorful', 'monochrome', 'gradient', 'textual', 'iconic', 'symbolic', 'illustrative', 'typographic'],
    'Business Attributes': ['corporate', 'startup', 'established', 'premium', 'luxury', 'affordable', 'local', 'global', 'sustainable', 'eco-friendly'],
    'Design Qualities': ['memorable', 'versatile', 'scalable', 'timeless', 'unique', 'recognizable', 'distinctive', 'cohesive', 'balanced', 'harmonious'],
    'Communication': ['clear', 'direct', 'subtle', 'expressive', 'informative', 'persuasive', 'engaging', 'approachable', 'authoritative', 'welcoming']
}

# Sum the occurrences of every tag that contains any keyword of a theme.
# Matching is by substring, so totals are approximate and one tag can count
# toward several themes.
tags_lower = tag_counts_df['tag'].astype(str).str.lower()
theme_counts = {}
for theme, keywords in tag_themes.items():
    # NOTE(review): the tags shown in this notebook's output carry no hyphens
    # (e.g. 'thoughtprovoking'), so it looks like normalize_tags strips them —
    # confirm. Each keyword is therefore also tried in its hyphen-free form,
    # otherwise 'eco-friendly' could never match.
    variants = sorted(set(keywords) | {keyword.replace('-', '') for keyword in keywords})
    # Keywords are escaped so they are matched literally, not as regex syntax
    pattern = '|'.join(re.escape(variant) for variant in variants)
    matches = tags_lower.str.contains(pattern, regex=True)
    theme_counts[theme] = int(tag_counts_df.loc[matches, 'count'].sum())

# Display theme analysis, largest theme first; themes with no matches are skipped
print("Tag theme distribution (approximate):")
for theme, count in sorted(theme_counts.items(), key=lambda x: x[1], reverse=True):
    if count > 0:
        percentage = (count / total_tag_instances) * 100
        print(f"  {theme:<20}: {count:>7,} instances ({percentage:>5.1f}%)")

# The frame is sorted by count descending, so its tail holds the rarest tags
print(f"\n=== BOTTOM 20 TAGS ===")
print("Least common tags:")
rarest_tags = tag_counts_df.tail(20)
for tag, count in zip(rarest_tags['tag'], rarest_tags['count']):
    print(f"  {tag:<40} {count:>3} occurrences")

# Average tags per logo (0.0 for an empty dataset instead of dividing by zero)
total_logos = len(df)
avg_tags_per_logo = total_tag_instances / total_logos if total_logos else 0.0

# Missing tags are replaced with the 'na' placeholder when the tags column is
# normalized, so a row is untagged when its value is missing, blank, or 'na';
# comparing against '' alone would miss the placeholder rows.
untagged = df['tags'].isna() | df['tags'].astype(str).str.strip().isin(['', 'na'])

print(f"\n=== USAGE STATISTICS ===")
print(f"Average tags per logo: {avg_tags_per_logo:.1f}")
print(f"Total logos with tags: {int((~untagged).sum())}")
print(f"Logos without tags: {int(untagged.sum())}")

=== TAG INSIGHTS ===

=== TAG THEMES ANALYSIS ===
Tag theme distribution (approximate):
  Visual Elements     : 393,312 instances ( 11.1%)
  Design Qualities    : 393,297 instances ( 11.1%)
  Style & Aesthetics  : 367,723 instances ( 10.4%)
  Emotions & Feelings : 367,538 instances ( 10.4%)
  Business Attributes :     160 instances (  0.0%)
  Communication       :      37 instances (  0.0%)

=== BOTTOM 20 TAGS ===
Least common tags:
  memorial                                   1 occurrences
  technology_research                        1 occurrences
  engineering_automation                     1 occurrences
  publishing_and_packaging                   1 occurrences
  information_film                           1 occurrences
  industry_machinery_manufacturing           1 occurrences
  bathing_suit                               1 occurrences
  music_sales                                1 occurrences
  cuisine_master_classes                     1 occurrences
  commercial_cleaning           