# Logo Data Processor

This notebook downloads the logo dataset in parquet format, adds a unique ID to each record, and saves the logo metadata as CSV files and the logo images as JPEGs resized to 256x256, 512x512, and 1024x1024.

In [None]:
import pandas as pd
import os
from PIL import Image
import requests
from io import BytesIO
from pathlib import Path
import base64

In [None]:
def decode_image_data(image_data):
    """Decode an image payload into a PIL Image.

    Supported inputs:
      * dict with a 'bytes' entry (raw bytes or a base64 string) and/or a
        'path' entry pointing at an image file;
      * base64 string, optionally carrying a ``data:image...,`` prefix;
      * raw bytes-like object (bytes, bytearray, memoryview).

    Args:
        image_data: The payload to decode.

    Returns:
        The opened PIL Image, or None when the payload has an unsupported
        shape or decoding raises (a message is printed in either case).
    """
    raw_types = (bytes, bytearray, memoryview)
    try:
        # Case 1: Image data is a dictionary (HuggingFace datasets format)
        if isinstance(image_data, dict):
            image_bytes = image_data.get('bytes')
            image_path = image_data.get('path')

            if isinstance(image_bytes, str):
                # Base64 encoded bytes
                image_bytes = base64.b64decode(image_bytes)
            if isinstance(image_bytes, raw_types) and len(image_bytes):
                return Image.open(BytesIO(image_bytes))

            # The 'bytes' key can be present but empty (None) while 'path'
            # carries the image location, so try the path before giving up.
            if image_path:
                return Image.open(image_path)

            if 'bytes' in image_data or 'path' in image_data:
                print(f"Image dict has no usable 'bytes' or 'path' value: {list(image_data.keys())}")
            else:
                print(f"Unknown dict format: {list(image_data.keys())}")
            return None

        # Case 2: Image data is a base64 string
        elif isinstance(image_data, str):
            if image_data.startswith('data:image'):
                # Keep only the payload that follows the data URL header
                image_data = image_data.split(',', 1)[1]
            return Image.open(BytesIO(base64.b64decode(image_data)))

        # Case 3: Image data is already a bytes-like object
        elif isinstance(image_data, raw_types):
            return Image.open(BytesIO(image_data))

        else:
            print(f"Unknown image data type: {type(image_data)}")
            return None

    except Exception as e:
        # Best effort: one undecodable record must not abort the whole run.
        print(f"Error decoding image: {e}")
        return None

print("Image decoding function defined")

Image decoding function defined


In [None]:
# Define output directories
output_dir = Path('../output')
data_dir = output_dir / 'data'
images_dir = output_dir / 'images'
sizes = [256, 512, 1024]

# Create every directory the notebook writes into. The CSV exports go to
# data_dir, which the per-size image folders below do not create implicitly.
data_dir.mkdir(parents=True, exist_ok=True)
for size in sizes:
    (images_dir / f'{size}x{size}').mkdir(parents=True, exist_ok=True)

# Download the parquet data
url = 'https://huggingface.co/datasets/logo-wizard/modern-logo-dataset/resolve/main/data/train-00000-of-00001-b64601da56687a05.parquet'
parquet_path = Path('logo_dataset.parquet')
# Bound the wait and fail loudly on HTTP errors, so an error page is never
# written to disk and then handed to read_parquet.
response = requests.get(url, timeout=60)
response.raise_for_status()
parquet_path.write_bytes(response.content)

# Load the dataset
df = pd.read_parquet(parquet_path)

# Add unique ID with prefix 'logowiz' followed by a zero-padded row number
df['unique_id'] = [f'logowiz{i:06d}' for i in range(len(df))]

In [7]:
# Export every column except the image payload as the metadata CSV
metadata_file = data_dir / 'logowiz_logo_metadata.csv'
logo_metadata = df.drop(columns=['image'])
logo_metadata.to_csv(metadata_file, index=False)
print(f'Saved logo metadata to {metadata_file}')

Saved logo metadata to ..\output\data\logowiz_logo_metadata.csv


In [8]:
# Save logo bytes as CSV
# This cell uses its own variable instead of reassigning `metadata_file`, so
# that name keeps referring to the metadata CSV if cells are re-run out of order.
# NOTE(review): the `image` column is written via its string form; presumably
# it holds dicts with raw bytes - confirm downstream readers can parse that.
bytes_file = data_dir / 'logowiz_logo_bytes.csv'
df.drop(columns='text').to_csv(bytes_file, index=False)
print(f'Saved logo bytes to {bytes_file}')

Saved logo bytes to ..\output\data\logowiz_logo_bytes.csv


In [None]:


# Function to save images in different sizes
def save_image(image_data, unique_id):
    """Decode one logo and write it as a JPEG at every configured size.

    The payload is decoded with ``decode_image_data``. Images in a mode that
    can carry transparency are flattened onto a white background, because
    JPEG has no alpha channel; any other non-RGB mode is converted to RGB.
    The result is resized to each square edge length in the module-level
    ``sizes`` list and saved to
    ``images_dir / '<size>x<size>' / '<unique_id>.jpg'``.

    Args:
        image_data: Image payload in any form ``decode_image_data`` accepts.
        unique_id: Identifier used as the output file stem.

    Returns:
        None. An undecodable payload is skipped; any other failure is printed
        with the offending ``unique_id`` and swallowed so a batch can continue.
    """
    try:
        image = decode_image_data(image_data)
        if image is None:
            # decode_image_data has already reported why decoding failed.
            return

        if image.mode in ('RGBA', 'LA', 'P'):
            # Normalise to RGBA first so the alpha band is always available as
            # the paste mask. Pasting an 'LA' image without a mask ignores its
            # alpha, so transparent areas would keep their raw grey value
            # instead of showing the white background.
            rgba = image.convert('RGBA')
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            background.paste(rgba, mask=rgba.split()[-1])
            image = background
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        for size in sizes:
            img_resized = image.resize((size, size), Image.LANCZOS)
            img_resized.save(images_dir / f'{size}x{size}' / f'{unique_id}.jpg')
    except Exception as e:
        print(f'Error processing image for {unique_id}: {e}')

# Walk the image payloads and their IDs in lockstep and export each logo
for image_data, unique_id in zip(df['image'], df['unique_id']):
    save_image(image_data, unique_id)

print('Image processing completed.')

Saved logo metadata to ..\output\data\logowiz_logo_metadata.csv
Image processing completed.
Image processing completed.
