In [1]:
import os
import numpy as np
import pandas as pd
import re
from PIL import Image
from io import BytesIO
from tqdm.auto import tqdm
import random
import json
from collections import Counter
from google.cloud import storage
import logging
import time

In [2]:
# Logging: timestamped INFO-level messages for the whole notebook
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Google Cloud Storage destination (bucket and the folder inside it)
BUCKET_NAME = "logo-generation-dataset"
DATASET_FOLDER = "modern_logo_dataset"

# Local working directories
LOCAL_DATA_DIR = "data"
LOCAL_PROCESSED_DIR = os.path.join(LOCAL_DATA_DIR, "processed_modern")

# Make sure every directory the pipeline writes to exists before it runs
for _required_dir in (
    LOCAL_DATA_DIR,
    LOCAL_PROCESSED_DIR,
    os.path.join(LOCAL_PROCESSED_DIR, "images"),
    os.path.join(LOCAL_PROCESSED_DIR, "augmented_images"),
):
    os.makedirs(_required_dir, exist_ok=True)

# GCS handles shared by the upload helper
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

In [3]:
def upload_to_gcs(local_path, gcs_path):
    """Upload one local file to the configured GCS bucket.

    Args:
        local_path: Path of the file on the local filesystem.
        gcs_path: Object name (key) to create inside the module-level ``bucket``.

    The upload is logged at INFO level once it has completed.
    """
    bucket.blob(gcs_path).upload_from_filename(local_path)
    logger.info(f"Uploaded {local_path} to gs://{BUCKET_NAME}/{gcs_path}")

In [4]:
def convert_to_pil_image(image_data):
    """Safely convert the supported image representations to a PIL Image.

    Accepted inputs:
      * a PIL ``Image.Image`` (returned unchanged);
      * a dict with a ``'bytes'`` entry holding encoded image data; when that
        entry is missing, ``None`` or empty, the dict's ``'path'`` entry is
        tried instead;
      * raw encoded image data (``bytes``, ``bytearray`` or ``memoryview``);
      * a filesystem path (``str`` or ``os.PathLike``);
      * any object exposing ``shape`` and ``dtype`` (e.g. a numpy array).

    Args:
        image_data: The image in one of the representations listed above.

    Returns:
        A PIL Image, or ``None`` when the input is unsupported or cannot be
        opened. Failures are logged, never raised.
    """
    try:
        # 1. Already a PIL Image: nothing to do
        if isinstance(image_data, Image.Image):
            return image_data

        # 2. Dictionary: prefer the embedded bytes, fall back to the file path
        if isinstance(image_data, dict):
            raw = image_data.get('bytes')
            if raw is not None and len(raw) > 0:
                return Image.open(BytesIO(raw))

            path = image_data.get('path')
            if isinstance(path, (str, os.PathLike)) and path:
                img = Image.open(path)
                img.load()  # read the pixel data now instead of keeping the file open lazily
                return img

            logger.error(f"Unknown image format: {type(image_data)}")
            logger.error(f"Dictionary keys: {image_data.keys()}")
            return None

        # 3. Encoded image data held directly in a bytes-like object
        if isinstance(image_data, (bytes, bytearray, memoryview)):
            return Image.open(BytesIO(image_data))

        # 4. A path to an image file on disk
        if isinstance(image_data, (str, os.PathLike)):
            img = Image.open(image_data)
            img.load()  # read the pixel data now instead of keeping the file open lazily
            return img

        # 5. Array-like pixel data (anything with shape/dtype, e.g. numpy)
        if hasattr(image_data, 'shape') and hasattr(image_data, 'dtype'):
            return Image.fromarray(np.array(image_data))

        # Unsupported representation
        logger.error(f"Unknown image format: {type(image_data)}")
        return None

    except Exception as e:
        # Deliberate best-effort: a bad sample is reported and skipped by callers
        logger.error(f"Error converting image: {e}")
        return None

In [5]:
def process_image(img, target_size=1024, margin=0.85):
    """Remove the background of a logo and centre it on a square transparent canvas.

    Args:
        img: Image in any representation accepted by ``convert_to_pil_image``.
        target_size: Side length, in pixels, of the square RGBA output canvas.
        margin: Fraction of the canvas side that the logo's longest side may
            occupy (the default 0.85 leaves a 15% border). Values above 1
            make the logo larger than the canvas, so it gets cropped.

    Returns:
        A ``target_size`` x ``target_size`` RGBA PIL Image, or ``None`` when
        the image cannot be converted or processed (the error is logged).
    """
    try:
        # Converting to PIL Image if needed
        img = convert_to_pil_image(img)
        if img is None:
            return None

        # Ensuring image is in RGBA mode
        if img.mode != 'RGBA':
            img = img.convert("RGBA")

        # Removing background. The session is created once and reused so that
        # rembg does not set up a fresh model session for every single image.
        from rembg import remove
        session = _get_rembg_session()
        img_no_bg = remove(img, session=session) if session is not None else remove(img)

        # Scale to fit inside the canvas while preserving the aspect ratio
        width, height = img_no_bg.size
        ratio = min(target_size / width, target_size / height) * margin
        # Clamp to 1px: a very elongated image would otherwise round to a
        # zero-sized dimension, which makes resize() fail.
        new_width = max(1, int(width * ratio))
        new_height = max(1, int(height * ratio))

        # LANCZOS resampling for high quality resizing
        img_resized = img_no_bg.resize((new_width, new_height), Image.LANCZOS)

        # Paste onto a fully transparent canvas, centred, using the image's
        # own alpha channel as the paste mask
        new_img = Image.new("RGBA", (target_size, target_size), (0, 0, 0, 0))
        x_offset = (target_size - new_width) // 2
        y_offset = (target_size - new_height) // 2
        new_img.paste(img_resized, (x_offset, y_offset), img_resized)

        return new_img
    except Exception as e:
        # Deliberate best-effort: callers skip samples for which None is returned
        logger.error(f"Error processing image: {e}")
        return None


# Lazily created rembg session shared by all process_image() calls
_REMBG_SESSION = None


def _get_rembg_session():
    """Return the shared rembg session, or None if this rembg has no ``new_session``."""
    global _REMBG_SESSION
    if _REMBG_SESSION is None:
        try:
            from rembg import new_session
        except ImportError:
            # Older rembg releases: fall back to the plain remove(img) call
            return None
        _REMBG_SESSION = new_session()
    return _REMBG_SESSION

In [6]:
def generate_company_name(business_type):
    """Generate a plausible company name based on business type.

    The business type is matched case-insensitively against a small table of
    known categories (the category key must occur inside the given text).
    A prefix, a suffix and a format string are then drawn at random from the
    matching category, or from generic defaults when nothing matches.

    Args:
        business_type: Free-text business description. ``None``, an empty
            string or a non-string value (e.g. NaN) selects the defaults.

    Returns:
        The generated company name as a string.
    """
    # Dictionary of common prefixes and suffixes for different business types
    business_names = {
        "technology": {
            "prefixes": ["Tech", "Byte", "Data", "Cyber", "Digital", "Smart", "Cloud", "Quantum", "Nexus", "Nova"],
            "suffixes": ["Solutions", "Systems", "Tech", "Technologies", "Innovations", "Labs", "Network", "Dynamics", "Logic", "Wave"],
            "formats": ["{}{}"]  # Format strings for combining parts
        },
        "coffee shop": {
            "prefixes": ["Bean", "Brew", "Coffee", "Café", "Morning", "Daily", "Urban", "Roast", "Aroma", "Java"],
            "suffixes": ["Coffee", "Café", "Roasters", "Brews", "Cup", "House", "Co.", "Corner", "Fix", "Express"],
            "formats": ["{} {}", "{}'s"]
        },
        "restaurant": {
            "prefixes": ["Taste", "Flavor", "The", "Royal", "Urban", "Golden", "Silver", "Blue", "Green", "Red"],
            "suffixes": ["Table", "Kitchen", "Bistro", "Grill", "Eatery", "Cuisine", "Dining", "Restaurant", "House", "Garden"],
            "formats": ["The {} {}", "{} {}", "{}'s"]
        },
        "fitness": {
            "prefixes": ["Flex", "Power", "Strong", "Elite", "Prime", "Peak", "Core", "Iron", "Vital", "Active"],
            "suffixes": ["Fitness", "Gym", "Athletics", "Strength", "Health", "Training", "Performance", "Fit", "Wellness", "Body"],
            "formats": ["{} {}", "{}-{}"]
        },
        "salon": {
            "prefixes": ["Style", "Beauty", "Chic", "Elegant", "Divine", "Classic", "Modern", "Elite", "Pure", "Luxe"],
            "suffixes": ["Salon", "Beauty", "Styles", "Hair", "Cuts", "Studio", "Spa", "Stylists", "Lounge", "Parlor"],
            "formats": ["{} {}", "{}'s", "{} & {}"]
        },
        "design agency": {
            "prefixes": ["Creative", "Design", "Pixel", "Visual", "Idea", "Art", "Studio", "Brand", "Concept", "Identity"],
            "suffixes": ["Design", "Studio", "Creatives", "Agency", "Partners", "Works", "Lab", "Group", "Collective", "Designers"],
            "formats": ["{} {}", "{} + {}", "{} & {}"]
        }
    }

    # Default components if business type isn't in our dictionary
    default_components = {
        "prefixes": ["Alpha", "Omega", "Prime", "Core", "Global", "First", "United", "National", "Metro", "City"],
        "suffixes": ["Group", "Inc", "Co", "Company", "Corporation", "Services", "Solutions", "Enterprises", "International", "Associates"],
        "formats": ["{} {}", "{}, {}"]
    }

    # Normalizing business type; anything that is not a non-empty string
    # (None, "", NaN, ...) is treated as a generic business
    if isinstance(business_type, str) and business_type:
        normalized_type = business_type.lower()
    else:
        normalized_type = "business"

    # First category whose key occurs in the text wins; otherwise the defaults
    components = next(
        (parts for key, parts in business_names.items() if key in normalized_type),
        default_components,
    )

    # Generating name (three draws, always in this order)
    prefix = random.choice(components["prefixes"])
    suffix = random.choice(components["suffixes"])
    name_format = random.choice(components["formats"])

    # The article "The" only reads correctly in the plain "<prefix> <suffix>"
    # pattern; the other patterns would produce "The The Kitchen" or "The's".
    if prefix == "The" and name_format != "{} {}":
        name_format = "{} {}"

    # Formats that use a single placeholder simply ignore the suffix
    return name_format.format(prefix, suffix)

def extract_company_name(text):
    """Extract a company name from a free-text logo description.

    Patterns are tried in order and the first acceptable match is returned:
    text in double quotes, text in single quotes, a capitalised phrase after
    "called" or "named", and a capitalised word after "for".

    Args:
        text: The description to search. Non-string or empty input yields
            ``None`` instead of raising.

    Returns:
        The cleaned-up name, or ``None`` when no pattern produces a usable name.
    """
    if not isinstance(text, str) or not text:
        return None

    patterns = [
        r'"([^"]+)"',  # Text in double quotes
        # Text in single quotes. The quotes must not touch a word character on
        # their outer side, so apostrophes ("it's", "owner's") are not taken
        # for quotes; apostrophes inside the name ('Joe's Diner') are allowed.
        r"(?<!\w)'((?:[^']|(?<=\w)'(?=\w))+)'(?!\w)",
        r'called ([A-Z][A-Za-z0-9\s\-&]+)',  # Words after "called" starting with capital letter
        r'named ([A-Z][A-Za-z0-9\s\-&]+)',   # Words after "named" starting with capital letter
        r'for ([A-Z][A-Za-z0-9\s\-&]{2,}?)[ \.,]'  # Words after "for" starting with capital letter
    ]
    # Matches that are clearly not names; a rejected match falls through to
    # the next pattern
    rejected = ('a', 'an', 'the', 'this', 'that', 'logo')

    for pattern in patterns:
        match = re.search(pattern, text)
        if not match:
            continue
        name = match.group(1).strip()
        name = re.sub(r'[\.,]$', '', name)  # Removing trailing punctuation
        name = re.sub(r'\s+', ' ', name)    # Normalizing whitespace
        if len(name) > 1 and name.lower() not in rejected:
            return name

    return None

In [7]:
def parse_description(text):
    """Extract key elements from a text description of a logo.

    Keywords are matched at the start of a word, so "round" matches "round"
    and "rounded" but not "background", and "symmetrical" no longer matches
    "asymmetrical". Colour names must be whole words (an "-en"/"-ish" ending
    is tolerated, e.g. "golden").

    Args:
        text: The description. Empty or non-string input (``None``, NaN)
            returns the default element set.

    Returns:
        A dict with the keys ``business_type``, ``style``, ``composition``,
        ``color_scheme``, ``iconography``, ``layout`` and ``company_name``.
        Values are strings; an element that was not detected is ``""``.
        ``iconography`` is never filled in by this function.
    """
    elements = {
        "business_type": "",
        "style": "",
        "composition": "",
        "color_scheme": "",
        "iconography": "",
        "layout": "",
        "company_name": ""
    }

    # Nothing to parse for empty or non-text input
    if not isinstance(text, str) or not text:
        return elements

    # Extracting company name first (case matters for it); keep "" when absent
    # so every value in the result is a string
    elements["company_name"] = extract_company_name(text) or ""

    # Converting to lowercase for pattern matching
    text_lower = text.lower()

    # Business type extraction - refined for Modern Logo dataset patterns
    business_match = re.search(r'logo of(?: a| an)? ([a-zA-Z0-9 ]+)', text_lower)
    if business_match:
        elements["business_type"] = business_match.group(1).strip()
    else:
        # Try alternative pattern: "[business type] logo"
        alt_match = re.search(r'([a-zA-Z0-9 ]+) logo', text_lower)
        if alt_match:
            elements["business_type"] = alt_match.group(1).strip()

    # Style detection (first style with a matching keyword wins)
    style_keywords = {
        "minimalist": ["minimalist", "minimal", "simplistic", "clean lines"],
        "modern": ["modern", "contemporary", "sleek", "current"],
        "vintage": ["vintage", "retro", "classic", "old-fashioned", "nostalgic"],
        "abstract": ["abstract", "artistic", "conceptual", "non-figurative"],
        "geometric": ["geometric", "shapes", "symmetrical", "structured"],
        "elegant": ["elegant", "sophisticated", "refined", "graceful"],
        "bold": ["bold", "strong", "powerful", "striking"],
        "clean": ["clean", "simple", "neat", "uncluttered"],
        "playful": ["playful", "fun", "friendly", "lively", "cheerful"]
    }
    elements["style"] = _first_matching_label(text_lower, style_keywords)

    # Composition type. "logotype" is listed explicitly because "type" now
    # has to start a word to count.
    text_words = ["wordmark", "text", "letter", "font", "typography", "type", "logotype"]
    graphic_words = ["emblem", "icon", "symbol", "pictorial", "mascot"]
    if _mentions_any(text_lower, text_words):
        elements["composition"] = "text-based"
    elif _mentions_any(text_lower, graphic_words):
        elements["composition"] = "graphic-based"
    else:
        elements["composition"] = "combination"

    # Color scheme
    color_keywords = {
        "monochrome": ["monochrome", "black and white", "grayscale", "single color"],
        "gradient": ["gradient", "gradation", "fading", "transition"],
        "colorful": ["colorful", "vibrant", "multicolor", "multi-colored"],
        "primary": ["primary colors", "blue and red", "yellow and blue"]
    }
    elements["color_scheme"] = _first_matching_label(text_lower, color_keywords)

    # Specific colors mentioned take precedence over the generic scheme above
    colors = re.findall(
        r'\b(red|blue|green|yellow|purple|orange|black|white|gray|brown|pink|gold|silver|teal|navy|olive|maroon|turquoise)(?:en|ish)?\b',
        text_lower
    )
    if colors:
        elements["color_scheme"] = ", ".join(sorted(set(colors)))

    # Layout detection
    layout_keywords = {
        "circular": ["circular", "round", "circle", "spherical", "radial"],
        "horizontal": ["horizontal", "side by side", "landscape", "wide"],
        "vertical": ["vertical", "stacked", "portrait", "tall"],
        "square": ["square", "box", "quadratic", "equilateral"],
        "diagonal": ["diagonal", "slanted", "angled", "tilted"],
        "asymmetric": ["asymmetric", "dynamic", "irregular", "uneven"]
    }
    elements["layout"] = _first_matching_label(text_lower, layout_keywords)

    return elements


def _mentions_any(text_lower, keywords):
    """Return True if any keyword occurs in text_lower starting at a word boundary."""
    return any(re.search(r'\b' + re.escape(keyword), text_lower) for keyword in keywords)


def _first_matching_label(text_lower, keyword_table):
    """Return the first label of keyword_table with a keyword in text_lower, else ""."""
    for label, keywords in keyword_table.items():
        if _mentions_any(text_lower, keywords):
            return label
    return ""

In [8]:
def get_capitalization_variation(name):
    """Return a randomly chosen capitalization variant of a company name.

    One draw from ``random.random()`` selects the variant:
    ALL CAPS (15%), all lowercase (10%), CamelCase without spaces (15%),
    alternating case (10%), first word capitalized with the rest lowercase
    (10%, multi-word names only) and Title Case for everything else.
    Empty or ``None`` names are returned unchanged without drawing.
    """
    if not name:
        return name

    roll = random.random()

    if roll < 0.15:
        # ALL CAPS
        return name.upper()

    if roll < 0.25:
        # all lowercase
        return name.lower()

    if roll < 0.4:
        # CamelCase (no spaces)
        return ''.join(part.capitalize() for part in name.split())

    if roll < 0.5:
        # Alternating case, counted over every character including spaces
        return ''.join(
            ch.lower() if position % 2 else ch.upper()
            for position, ch in enumerate(name)
        )

    parts = name.split()
    if roll < 0.6 and len(parts) > 1:
        # First word capitalized, remaining words lowercase
        lowered_rest = ' '.join(part.lower() for part in parts[1:])
        return parts[0].capitalize() + ' ' + lowered_rest

    # Default (also single-word names in the previous band): Title Case
    return ' '.join(part.capitalize() for part in parts)

In [9]:
def load_modern_logo_dataset(repo_id="logo-wizard/modern-logo-dataset"):
    """Download a Hugging Face dataset file and load it into a pandas DataFrame.

    The repository's files are listed, the first ``.parquet`` or ``.csv`` file
    is downloaded through ``huggingface_hub`` and read with pandas. Basic
    structure information about the loaded frame is logged.

    Args:
        repo_id: Hugging Face dataset repository to load. Defaults to the
            Modern Logo Dataset.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the repository contains no parquet/CSV file.
        Exception: Any download or parsing error is logged and re-raised.
    """
    logger.info("Loading Modern Logo Dataset...")

    try:
        # Using huggingface_hub to download the dataset file
        from huggingface_hub import hf_hub_download, list_repo_files

        # Listing files in the repository
        logger.info("Listing files in the HuggingFace repository...")
        files = list_repo_files(repo_id, repo_type="dataset")

        # Looking for data files
        data_files = [f for f in files if f.endswith(('.parquet', '.csv'))]
        logger.info(f"Found data files: {data_files}")

        if not data_files:
            raise ValueError("No data files found in repository")

        # Only the first data file is loaded; say so instead of silently
        # dropping the remaining shards
        selected_file = data_files[0]
        if len(data_files) > 1:
            logger.warning(
                f"Repository has {len(data_files)} data files; only {selected_file} is loaded"
            )

        file_path = hf_hub_download(
            repo_id=repo_id,
            filename=selected_file,
            repo_type="dataset"
        )
        logger.info(f"Downloaded {selected_file} to {file_path}")

        # Loading based on file type
        if selected_file.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        elif selected_file.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            # Unreachable with the filter above; kept so `df` can never be unbound
            raise ValueError(f"Unsupported data file type: {selected_file}")

        # Log the structure
        logger.info(f"DataFrame shape: {df.shape}")
        logger.info(f"DataFrame columns: {df.columns.tolist()}")

        # Log the value types of the first sample to document the schema
        if not df.empty:
            sample = df.iloc[0]
            logger.info("First sample:")
            for col in df.columns:
                logger.info(f"  {col}: {type(sample[col])}")
                if col == 'image' and isinstance(sample[col], dict):
                    logger.info(f"  image keys: {sample[col].keys()}")

        logger.info(f"Loaded {len(df)} samples")
        return df

    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

In [10]:
def main():
    """Run the simplified data preparation pipeline end to end.

    Steps:
      1. Load the Modern Logo Dataset.
      2. Process every image (background removal, resize, centring), save it
         locally and build one metadata record per logo.
      3. Standardize business types, fill blank attributes with defaults and
         build a text prompt per logo.
      4. Create augmented image variants with their own metadata and prompts.
      5. Save metadata CSV files and the training-data JSON locally.
      6. Upload images and metadata files to GCS.

    Returns:
        None. If the dataset cannot be loaded the error is logged and the
        function returns early.
    """
    # Start timing
    start_time = time.time()
    logger.info("Starting Simplified Modern Logo Dataset Preparation")

    # Setting output directories
    output_dir = LOCAL_PROCESSED_DIR
    images_dir = os.path.join(output_dir, "images")
    augmented_dir = os.path.join(output_dir, "augmented_images")

    # Step 1: Loading the dataset
    try:
        df = load_modern_logo_dataset()
        logger.info(f"Successfully loaded dataset with {len(df)} samples")

        # Sample entry for diagnostics
        logger.info("\nDiagnostic information for first sample:")
        sample = df.iloc[0]
        logger.info(f"Text: {sample.get('text', 'No text')}")
        if 'image' in sample:
            if isinstance(sample['image'], dict):
                logger.info(f"Image type: dict with keys {sample['image'].keys()}")
            else:
                logger.info(f"Image type: {type(sample['image'])}")
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}")
        return

    # Step 2: Processing images and create metadata
    logger.info("\nProcessing images and creating metadata...")
    metadata_df = pd.DataFrame(_process_logos(df, images_dir))
    logger.info(f"Created metadata for {len(metadata_df)} images")

    # Step 3: Enhance metadata
    logger.info("\nEnhancing metadata...")
    metadata_df = _enhance_metadata(metadata_df)

    # Step 4: Creating augmentations
    logger.info("\nCreating augmentations...")
    aug_df = pd.DataFrame(_augment_logos(metadata_df, images_dir, augmented_dir))
    logger.info(f"Created {len(aug_df)} augmentations")

    # Combine original and augmented metadata
    combined_df = pd.concat([metadata_df, aug_df], ignore_index=True)
    # Original rows have no 'augmented' value after the concat (NaN), and
    # bool(NaN) is True, so turn the column into a real boolean flag.
    if 'augmented' in combined_df.columns:
        combined_df['augmented'] = combined_df['augmented'].eq(True)
    else:
        combined_df['augmented'] = False
    logger.info(f"Total dataset size: {len(combined_df)} logos")

    # Step 5: Saving all results
    logger.info("\nSaving results...")

    # Saving metadata CSV files
    metadata_df.to_csv(os.path.join(output_dir, "modern_metadata.csv"), index=False)
    aug_df.to_csv(os.path.join(output_dir, "modern_augmented_metadata.csv"), index=False)
    combined_df.to_csv(os.path.join(output_dir, "modern_combined_with_augmentations.csv"), index=False)

    # Saving training data JSON
    training_data = _build_training_records(combined_df)
    with open(os.path.join(output_dir, "modern_training_data.json"), 'w') as f:
        json.dump(training_data, f, indent=2)

    # Step 6: Uploading results to GCS
    logger.info("\nUploading results to GCS...")

    logger.info("Uploading processed images...")
    _upload_image_dir(images_dir, f"{DATASET_FOLDER}/images", "Uploading images")

    logger.info("Uploading augmented images...")
    _upload_image_dir(augmented_dir, f"{DATASET_FOLDER}/augmented_images", "Uploading augmented images")

    # Uploading metadata files
    for result_file in (
        "modern_metadata.csv",
        "modern_augmented_metadata.csv",
        "modern_combined_with_augmentations.csv",
        "modern_training_data.json",
    ):
        upload_to_gcs(os.path.join(output_dir, result_file), f"{DATASET_FOLDER}/{result_file}")

    # Calculating execution time
    execution_time = time.time() - start_time
    hours, remainder = divmod(execution_time, 3600)
    minutes, seconds = divmod(remainder, 60)

    logger.info(f"\nData preparation completed successfully!")
    logger.info(f"Total execution time: {int(hours)}h {int(minutes)}m {int(seconds)}s")
    logger.info(f"Processed {len(metadata_df)} original logos and created {len(aug_df)} augmentations")
    logger.info(f"Final dataset contains {len(combined_df)} logos")
    logger.info(f"Data saved locally to {output_dir} and synced to GCS: gs://{BUCKET_NAME}/{DATASET_FOLDER}")


# Ordered (regex, canonical business type) rules; the first matching rule wins.
# Patterns are case-insensitive substring matches, except "it", which must be a
# whole word: as a substring it also hits "fitness", "kitchen", "architecture".
_BUSINESS_TYPE_RULES = (
    ('coffee', 'coffee shop'),
    ('cafe', 'coffee shop'),
    ('tech', 'technology'),
    (r'\bit\b', 'technology'),
    ('electronic', 'technology'),
    ('hair', 'salon'),
    ('barbershop', 'salon'),
    ('beauty', 'salon'),
    ('food', 'restaurant'),
    ('design', 'design agency'),
    ('fitness', 'fitness'),
    ('gym', 'fitness'),
    ('hotel', 'hospitality'),
    ('resort', 'hospitality'),
)

# Defaults for attributes that parse_description() could not detect
_ATTRIBUTE_DEFAULTS = {
    'style': 'modern',
    'composition': 'combination',
    'color_scheme': 'professional',
}

# Endings appended at random to the prompts of augmented logos
_STYLE_VARIATIONS = (
    ", with fine details",
    ", with precise elements",
    ", with professional finish",
    ", with balanced composition",
    ", with premium appearance",
)


def _text_or(value, default):
    """Return value if it is a non-blank string, otherwise default."""
    return value if isinstance(value, str) and value.strip() else default


def _process_logos(df, images_dir):
    """Process and save every logo of df; return one metadata dict per saved image."""
    metadata = []

    progress = tqdm(df.iterrows(), total=len(df), desc="Processing logos")
    for position, (i, row) in enumerate(progress, start=1):
        try:
            # Extract image and text; a missing (None/NaN) text counts as empty
            img = row.get('image')
            text = _text_or(row.get('text', ''), '')

            processed_img = process_image(img)
            if processed_img is None:
                continue

            # Parse before saving, so a parsing failure cannot leave an image
            # on disk that has no metadata record
            elements = parse_description(text)

            # Generating company name if none found
            if not elements["company_name"]:
                elements["company_name"] = generate_company_name(elements["business_type"])

            # Save processed image
            output_filename = f"modern_{i}.png"
            processed_img.save(os.path.join(images_dir, output_filename))

            # Add metadata
            elements["original_text"] = text
            elements["dataset"] = "modern_logo"
            elements["filename"] = output_filename
            elements["image_id"] = i
            metadata.append(elements)

            # Log progress at intervals (by row position, so it also works
            # for a non-integer index)
            if position % 100 == 0:
                logger.info(f"Processed {position} images so far")
        except Exception as e:
            logger.error(f"Error processing sample {i}: {e}")
            continue

    return metadata


def _standardize_business_types(business_types):
    """Map free-text business types onto canonical names using _BUSINESS_TYPE_RULES."""
    source = business_types.fillna('').astype(str)
    standardized = business_types.copy()
    unassigned = pd.Series(True, index=business_types.index)

    for pattern, canonical in _BUSINESS_TYPE_RULES:
        hits = source.str.contains(pattern, case=False, regex=True, na=False) & unassigned
        standardized.loc[hits] = canonical
        unassigned &= ~hits

    return standardized


def _build_prompt(row):
    """Build the standardized text prompt for one metadata row."""
    iconography = row.get('iconography')
    layout = row.get('layout')
    return (
        f"A logo for {row['business_type'] or 'a business'} named \"{row['company_name']}\", " +
        f"{row['style']} style, " +
        f"{row['composition']} design with " +
        f"{row['color_scheme']}" +
        (f", featuring {iconography} elements" if pd.notna(iconography) and iconography else "") +
        (f", in a {layout} layout" if pd.notna(layout) and layout else "") +
        ", with clear English text and professional appearance"
    )


def _enhance_metadata(metadata_df):
    """Return a copy of metadata_df with standardized types, defaults and prompts."""
    enhanced = metadata_df.copy()

    # Standardizing business types
    if 'business_type' in enhanced.columns:
        enhanced['business_type'] = _standardize_business_types(enhanced['business_type'])

    # Filling missing values with reasonable defaults. Undetected attributes
    # arrive as empty strings rather than NaN, so blanks are replaced as well.
    for column, default in _ATTRIBUTE_DEFAULTS.items():
        if column in enhanced.columns:
            values = enhanced[column]
            is_blank = values.isna() | values.astype(str).str.strip().eq('')
            enhanced[column] = values.mask(is_blank, default)

    # Creating standardized prompt format
    required = ['business_type', 'company_name', 'style', 'composition', 'color_scheme']
    if all(col in enhanced.columns for col in required):
        enhanced['prompt'] = enhanced.apply(_build_prompt, axis=1)

    return enhanced


def _augment_image(img, variant):
    """Return a mildly perturbed copy of img; variant picks the kind of perturbation."""
    from PIL import ImageEnhance

    aug_img = img.copy()
    if variant % 3 == 0:
        # Brightness and contrast adjustments
        aug_img = ImageEnhance.Brightness(aug_img).enhance(random.uniform(0.92, 1.08))
        aug_img = ImageEnhance.Contrast(aug_img).enhance(random.uniform(0.95, 1.05))
    elif variant % 3 == 1:
        # Slight rotation
        angle = random.uniform(-3, 3)
        aug_img = aug_img.rotate(angle, resample=Image.BICUBIC, expand=False)
    else:
        # Color adjustments
        aug_img = ImageEnhance.Color(aug_img).enhance(random.uniform(0.95, 1.05))
    return aug_img


def _augment_logos(metadata_df, images_dir, augmented_dir, augmentations_per_logo=3):
    """Save augmented variants of every logo; return one metadata dict per variant."""
    augmented_metadata = []

    rows = tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Generating augmentations")
    for _, row in rows:
        try:
            # Loading original image; the context manager closes the file
            # once the RGBA copy has been made
            with Image.open(os.path.join(images_dir, row['filename'])) as source:
                img = source.convert("RGBA")

            base_name, ext = os.path.splitext(row['filename'])

            for variant in range(augmentations_per_logo):
                aug_img = _augment_image(img, variant)

                # Saving augmented image
                aug_filename = f"{base_name}_aug{variant + 1}{ext}"
                aug_img.save(os.path.join(augmented_dir, aug_filename), format='PNG')

                # Creating metadata for augmentation
                aug_entry = row.to_dict()
                aug_entry['filename'] = aug_filename
                aug_entry['augmented'] = True
                aug_entry['original_filename'] = row['filename']

                # Vary company name capitalization
                company_name = aug_entry.get('company_name')
                if isinstance(company_name, str) and company_name:
                    aug_entry['company_name'] = get_capitalization_variation(company_name)

                # Updating prompt with new company name and style variation.
                # The keys always exist here, so blank values have to be
                # replaced explicitly (a .get() default would never apply).
                if 'prompt' in aug_entry and 'company_name' in aug_entry:
                    business_type = _text_or(aug_entry.get('business_type'), 'a business')
                    style = _text_or(aug_entry.get('style'), 'modern')
                    composition = _text_or(aug_entry.get('composition'), 'combination')
                    color_scheme = _text_or(aug_entry.get('color_scheme'), 'professional colors')

                    aug_entry['prompt'] = (
                        f"A logo for {business_type} named \"{aug_entry['company_name']}\", "
                        f"{style} style, "
                        f"{composition} design with {color_scheme}"
                        + random.choice(_STYLE_VARIATIONS)
                    )

                augmented_metadata.append(aug_entry)
        except Exception as e:
            logger.error(f"Error augmenting {row['filename']}: {e}")
            continue

    return augmented_metadata


def _build_training_records(combined_df):
    """Return the training-data records (file name, prompt, augmented flag) for the JSON file."""
    training_data = []
    for _, row in combined_df.iterrows():
        prompt = row.get('prompt')
        if not isinstance(prompt, str):
            # No prompt column (or no value): fall back to a minimal prompt
            prompt = f"A logo for {row.get('business_type', 'a business')}"

        # The image folder is implied by the flag: "augmented_images" for
        # augmented entries, "images" otherwise.
        training_data.append({
            "file_name": row['filename'],
            "prompt": prompt,
            "augmented": bool(row['augmented'])
        })
    return training_data


def _upload_image_dir(local_dir, gcs_prefix, description):
    """Upload every .png/.jpg file found in local_dir to gcs_prefix in the bucket."""
    for file in tqdm(os.listdir(local_dir), desc=description):
        if file.endswith(('.png', '.jpg')):
            upload_to_gcs(os.path.join(local_dir, file), f"{gcs_prefix}/{file}")

# Entry point: run the pipeline and log any unexpected failure with its traceback
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback
        logger.exception(f"Error in data preparation: {e}")

2025-04-30 16:12:40,343 - INFO - Starting Simplified Modern Logo Dataset Preparation
2025-04-30 16:12:40,344 - INFO - Loading Modern Logo Dataset...
2025-04-30 16:12:40,949 - INFO - Listing files in the HuggingFace repository...
2025-04-30 16:12:41,071 - INFO - Found data files: ['data/train-00000-of-00001-b64601da56687a05.parquet']
2025-04-30 16:12:41,146 - INFO - Downloaded data/train-00000-of-00001-b64601da56687a05.parquet to /home/jupyter/.cache/huggingface/hub/datasets--logo-wizard--modern-logo-dataset/snapshots/82051101cfdf253729e5f418c38d2d1305ef0a91/data/train-00000-of-00001-b64601da56687a05.parquet
2025-04-30 16:12:44,075 - INFO - DataFrame shape: (803, 2)
2025-04-30 16:12:44,076 - INFO - DataFrame columns: ['image', 'text']
2025-04-30 16:12:44,077 - INFO - First sample:
2025-04-30 16:12:44,077 - INFO -   image: <class 'dict'>
2025-04-30 16:12:44,078 - INFO -   image keys: dict_keys(['bytes', 'path'])
2025-04-30 16:12:44,079 - INFO -   text: <class 'str'>
2025-04-30 16:12:44,0

Processing logos:   0%|          | 0/803 [00:00<?, ?it/s]

2025-04-30 16:14:39,365 - INFO - Processed 100 images so far
2025-04-30 16:16:27,404 - INFO - Processed 200 images so far
2025-04-30 16:18:14,556 - INFO - Processed 300 images so far
2025-04-30 16:20:04,959 - INFO - Processed 400 images so far
2025-04-30 16:21:52,389 - INFO - Processed 500 images so far
2025-04-30 16:23:40,405 - INFO - Processed 600 images so far
2025-04-30 16:25:29,262 - INFO - Processed 700 images so far
2025-04-30 16:27:17,337 - INFO - Processed 800 images so far
2025-04-30 16:27:20,527 - INFO - Created metadata for 803 images
2025-04-30 16:27:20,528 - INFO - 
Enhancing metadata...
2025-04-30 16:27:20,567 - INFO - 
Creating augmentations...


Generating augmentations:   0%|          | 0/803 [00:00<?, ?it/s]

2025-04-30 16:33:40,073 - INFO - Created 2409 augmentations
2025-04-30 16:33:40,076 - INFO - Total dataset size: 3212 logos
2025-04-30 16:33:40,077 - INFO - 
Saving results...
2025-04-30 16:33:40,450 - INFO - 
Uploading results to GCS...
2025-04-30 16:33:40,451 - INFO - Uploading processed images...


Uploading images:   0%|          | 0/803 [00:00<?, ?it/s]

2025-04-30 16:33:40,527 - INFO - Uploaded data/processed_modern/images/modern_272.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_272.png
2025-04-30 16:33:40,584 - INFO - Uploaded data/processed_modern/images/modern_25.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_25.png
2025-04-30 16:33:40,647 - INFO - Uploaded data/processed_modern/images/modern_589.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_589.png
2025-04-30 16:33:40,706 - INFO - Uploaded data/processed_modern/images/modern_183.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_183.png
2025-04-30 16:33:40,770 - INFO - Uploaded data/processed_modern/images/modern_466.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_466.png
2025-04-30 16:33:40,829 - INFO - Uploaded data/processed_modern/images/modern_598.png to gs://logo-generation-dataset/modern_logo_dataset/images/modern_598.png
2025-04-30 16:33:40,883 - INFO - Uploaded 

Uploading augmented images:   0%|          | 0/2409 [00:00<?, ?it/s]

2025-04-30 16:34:25,734 - INFO - Uploaded data/processed_modern/augmented_images/modern_488_aug1.png to gs://logo-generation-dataset/modern_logo_dataset/augmented_images/modern_488_aug1.png
2025-04-30 16:34:25,792 - INFO - Uploaded data/processed_modern/augmented_images/modern_504_aug1.png to gs://logo-generation-dataset/modern_logo_dataset/augmented_images/modern_504_aug1.png
2025-04-30 16:34:25,864 - INFO - Uploaded data/processed_modern/augmented_images/modern_368_aug3.png to gs://logo-generation-dataset/modern_logo_dataset/augmented_images/modern_368_aug3.png
2025-04-30 16:34:25,917 - INFO - Uploaded data/processed_modern/augmented_images/modern_412_aug1.png to gs://logo-generation-dataset/modern_logo_dataset/augmented_images/modern_412_aug1.png
2025-04-30 16:34:25,972 - INFO - Uploaded data/processed_modern/augmented_images/modern_641_aug3.png to gs://logo-generation-dataset/modern_logo_dataset/augmented_images/modern_641_aug3.png
2025-04-30 16:34:26,021 - INFO - Uploaded data/pro