# Preprocess Logos

Acest notebook prelucreaza logo-urile din folderul 'dataset/logos_raw/' si le salveaza in 'dataset/logos_processed/'. Imaginile sunt redimensionate si asezate pe un fundal alb. De asemenea, pixelii prea deschisi la culoare sunt intunecati, pentru ca logo-ul sa ramana vizibil pe fundalul alb. Problema a aparut in special la fisierele SVG, pe care le-am curatat inainte de conversie.

In [None]:
import os
import io
import numpy as np
from PIL import Image, ImageOps
from tqdm import tqdm
import cairosvg
import html
from lxml import etree, html as lxml_html
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Folder scanned (recursively) for the raw logo files.
RAW_DIR = "dataset/logos_raw"
# Folder that receives the processed PNG files and the failure log.
OUT_DIR = "dataset/logos_processed"
# Output size in pixels handed to preprocess_image for every logo.
TARGET_SIZE = (256, 256)  
# RGB colour of the canvas the logo is flattened onto (white).
BACKGROUND = (255, 255, 255)  
# Intended upper bound on the number of worker threads.
MAX_WORKERS = 8  
# Path of the failure log inside OUT_DIR. Not referenced by the other visible
# cells: process_all_images builds the same file name from its own output_dir.
LOG_FAIL = os.path.join(OUT_DIR, "failed_images.txt")

In [None]:
def sanitize_svg(svg_text):
    """Extract a standalone ``<svg>`` element from markup that may wrap it.

    The text is parsed with lxml's lenient HTML parser so that an SVG embedded
    in other markup can be isolated.

    Args:
        svg_text: Markup read from an ``.svg`` file.

    Returns:
        The serialized first ``<svg>`` element when it is nested inside other
        markup. ``svg_text`` is returned unchanged when the text already is a
        single ``<svg>`` element, when no ``<svg>`` is found, or when parsing
        fails for any reason (this function never raises).
    """
    try:
        parser = lxml_html.HTMLParser()
        root = lxml_html.fromstring(svg_text, parser=parser)

        # The text already is one <svg> element: keep it verbatim. Searching
        # its descendants would replace the drawing with a nested <svg> child.
        if root.tag == "svg":
            return svg_text

        svg_elem = root.find(".//svg")
        if svg_elem is not None:
            # with_tail=False: do not append the wrapper text that follows
            # </svg>, which would make the result an invalid XML document.
            return etree.tostring(svg_elem, encoding="unicode", with_tail=False)
    except Exception:
        # Deliberate best effort: any parsing problem falls back to the input.
        return svg_text
    return svg_text

In [None]:
def convert_svg_to_png_bytes(svg_path):
    """Render an SVG file to PNG bytes.

    The file is first rendered after HTML-unescaping and sanitizing its text.
    Unescaping can corrupt an SVG that was already well formed (``&amp;`` and
    ``&lt;`` become raw ``&`` and ``<``), so when that attempt fails the
    untouched file content is rendered as a second chance.

    Args:
        svg_path: Path of the SVG file to convert.

    Returns:
        The PNG data as ``bytes``, or ``None`` when the file cannot be read or
        no variant of its content can be rendered.
    """
    try:
        with open(svg_path, "r", encoding="utf-8", errors="ignore") as f:
            raw_text = f.read()
    except Exception:
        return None

    candidates = []
    try:
        candidates.append(sanitize_svg(html.unescape(raw_text)))
    except Exception:
        # Cleaning is best effort; the raw text below is still attempted.
        pass
    if raw_text not in candidates:
        candidates.append(raw_text)

    for svg_text in candidates:
        try:
            return cairosvg.svg2png(bytestring=svg_text.encode("utf-8"))
        except Exception:
            # Try the next variant; None is returned once all have failed.
            continue
    return None

In [None]:
def load_image(path):
    """Load a raster or SVG logo as an RGBA image.

    Files with an ``.svg`` extension are rendered to PNG first; everything else
    is opened directly.

    Args:
        path: Path of the image file.

    Returns:
        An RGBA ``PIL.Image.Image``, or ``None`` when the file cannot be
        converted, opened or decoded.
    """
    try:
        if os.path.splitext(path)[1].lower() == ".svg":
            png_bytes = convert_svg_to_png_bytes(path)
            if png_bytes is None:
                return None
            source = io.BytesIO(png_bytes)
        else:
            source = path

        # convert() returns a new, fully loaded image, so the handle opened
        # here can be released as soon as the block exits.
        with Image.open(source) as img:
            return img.convert("RGBA")
    except Exception:
        # Same contract for both branches: failures are reported as None.
        return None

In [None]:
def dim_pure_white_pixels(img, threshold=250, new_color=(200,200,200)):
    """Replace near-white pixels with a darker colour.

    A pixel is replaced when all three of its colour channels are strictly
    greater than ``threshold``. The alpha channel, when present, is neither
    inspected nor modified.

    Args:
        img: Source image. Images whose mode is not RGB or RGBA are converted
            to RGBA first, because the channel mask below needs a
            (height, width, channels) pixel array.
        threshold: Channel value above which a channel counts as "white".
        new_color: RGB triple written into the matching pixels.

    Returns:
        A new ``PIL.Image.Image`` with the matching pixels recoloured.
    """
    mode = getattr(img, "mode", None)
    if mode is not None and mode not in ("RGB", "RGBA"):
        # Guessing the layout from the array shape misreads e.g. a greyscale
        # image that is 4 pixels wide as RGBA, so normalise the mode instead.
        img = img.convert("RGBA")

    arr = np.array(img)

    # View onto the colour channels: writing through it updates `arr` in place.
    rgb = arr[..., :3]
    mask = np.all(rgb > threshold, axis=-1)
    rgb[mask] = new_color

    return Image.fromarray(arr.astype(np.uint8))

In [None]:
def preprocess_image(img, size=(256, 256), background=(255,255,255)):
    """Crop, fit, dim and flatten a logo onto a solid background.

    Steps: crop to the box reported by ``getbbox()`` (when there is one), pad
    to ``size`` with transparent borders using Lanczos resampling, dim the
    near-white pixels, then composite the result onto ``background``.

    Args:
        img: Source image.
        size: Target size passed to ``ImageOps.pad``.
        background: RGB colour of the canvas used for RGBA results.

    Returns:
        The processed image in RGB mode.
    """
    content_box = img.getbbox()
    cropped = img.crop(content_box) if content_box else img

    fitted = ImageOps.pad(
        cropped,
        size,
        method=Image.Resampling.LANCZOS,
        color=(0,0,0,0),
    )
    dimmed = dim_pure_white_pixels(fitted)

    # Anything without an alpha channel only needs a mode conversion.
    if dimmed.mode != "RGBA":
        return dimmed.convert("RGB")

    # Use the alpha band as the paste mask so transparent areas show the canvas.
    canvas = Image.new("RGB", dimmed.size, background)
    alpha_band = dimmed.split()[3]
    canvas.paste(dimmed, mask=alpha_band)
    return canvas

In [8]:
def process_all_images(input_dir=RAW_DIR, output_dir=OUT_DIR):
    """Preprocess every logo under ``input_dir`` and save it as a PNG.

    Files ending in .png, .jpg, .jpeg, .webp or .svg (case-insensitive) are
    collected recursively and processed in a thread pool. Each result is
    written to ``output_dir`` as ``<input stem>.png``. Inputs that could not be
    loaded or processed are listed, one path per line, in
    ``failed_images.txt`` inside ``output_dir``.

    Args:
        input_dir: Root folder searched for logo files.
        output_dir: Folder that receives the PNG files and the failure log;
            created when missing.

    Returns:
        None. Progress and a summary are printed.
    """
    extensions = (".png", ".jpg", ".jpeg", ".webp", ".svg")
    files = [
        os.path.join(root, fn)
        for root, _, fnames in os.walk(input_dir)
        for fn in fnames
        if fn.lower().endswith(extensions)
    ]

    print(f"Total imagini de procesat: {len(files)}")

    # Without this, a custom output_dir makes every save (and the log) fail.
    os.makedirs(output_dir, exist_ok=True)

    # Honour the module-level MAX_WORKERS setting, capped by the CPU count.
    n_workers = max(1, min(MAX_WORKERS, os.cpu_count() or 4))

    def worker(path):
        """Process one file; return (path, success flag) and never raise."""
        try:
            img = load_image(path)
            if img is None:
                return path, False
            img_proc = preprocess_image(img, TARGET_SIZE, BACKGROUND)
            # NOTE(review): the output name uses only the file stem, so inputs
            # sharing a stem (e.g. a.svg and a.png, or equal names in different
            # subfolders) overwrite each other; confirm this is acceptable.
            out_name = os.path.splitext(os.path.basename(path))[0] + ".png"
            out_path = os.path.join(output_dir, out_name)
            img_proc.save(out_path, "PNG")
            return path, True
        except Exception:
            return path, False

    failed = []
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(worker, path) for path in files]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing logos", dynamic_ncols=True):
            path, ok = future.result()
            if not ok:
                failed.append(path)

    if failed:
        # Completion order varies between runs; sort for a reproducible log.
        failed.sort()
        log_path = os.path.join(output_dir, "failed_images.txt")
        with open(log_path, "w", encoding="utf-8") as f:
            f.write("\n".join(failed))
        print(f"\n{len(failed)} imagini esuate.")

    print("Procesare completa")

In [9]:
# Entry point: make sure the default output folder exists, then process every
# logo using the default input and output directories.
if __name__ == "__main__":
    os.makedirs(OUT_DIR, exist_ok=True)
    process_all_images()

Total imagini de procesat: 2356


Processing logos: 100%|██████████| 2356/2356 [00:19<00:00, 119.79it/s]


42 imagini esuate.
Procesare completa



