# Logo Similarity Project

In [2]:
import os
import pandas as pd
import requests
from tqdm import tqdm
from PIL import Image, ImageDraw
import imagehash
from collections import defaultdict
import json


## Load Dataset
We start by loading the Parquet dataset, exporting it to CSV, and reading the list of unique website domains from the CSV copy.

In [4]:
# Read the Parquet source, mirror it to CSV, then work from the CSV copy.
parquet_frame = pd.read_parquet("logos.snappy.parquet")
parquet_frame.to_csv("logos.csv", index=False)
df = pd.read_csv("logos.csv")

# Unique, non-null domains in order of first appearance.
domain_column = df["domain"].dropna()
domains = domain_column.drop_duplicates().tolist()
print(f"Loaded {len(domains)} domains.")


Loaded 3416 domains.


## Download Logos

Download each logo using: 
1. [Clearbit Logo API](https://clearbit.com/docs#logo-api)
2. Google’s favicon service (as a backup)

In [5]:
# Make sure the directory that receives the downloaded logos exists
# (exist_ok=True makes re-running this cell harmless).
os.makedirs("logos", exist_ok=True)

def fetch_logo(domain, save_path):
    """Download a logo for ``domain`` and write it to ``save_path``.

    The sources are tried in order: the Clearbit logo endpoint first, then
    Google's favicon endpoint. The first response with HTTP status 200 and a
    non-empty body is written to ``save_path`` verbatim (no image validation
    is performed here).

    Args:
        domain: Domain name interpolated into the source URLs.
        save_path: Path of the file the raw response bytes are written to.

    Returns:
        True if a response was saved, False if every source failed.
    """
    sources = [
        f"https://logo.clearbit.com/{domain}",
        f"https://www.google.com/s2/favicons?sz=128&domain={domain}"
    ]
    for url in sources:
        try:
            resp = requests.get(url, timeout=5)
            if resp.status_code == 200 and resp.content:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                return True
        except Exception:
            # Best effort: any ordinary failure (network error, bad URL,
            # unwritable path) just means "try the next source". Unlike a
            # bare ``except:``, this does not swallow KeyboardInterrupt or
            # SystemExit, so a long download run can still be stopped.
            continue
    return False

# Per-domain download outcome, plus a running tally of the successes.
logo_status = {}
success_count = 0

for domain in tqdm(domains, desc="Downloading logos"):
    path = f"logos/{domain}.png"
    # A logo already on disk counts as a success and is not fetched again;
    # otherwise the outcome is whatever the download attempt reports.
    have_logo = os.path.exists(path) or fetch_logo(domain, path)
    logo_status[domain] = True if have_logo else False
    if have_logo:
        success_count += 1


Downloading logos: 100%|██████████| 3416/3416 [44:07<00:00,  1.29it/s]    


### Check how many logos were successfully downloaded




In [6]:
# Report the share of domains for which a logo file is available.
total = len(domains)
summary = f"Successfully downloaded: {success_count}/{total} logos ({success_count / total:.2%})"
print(summary)


Successfully downloaded: 3282/3416 logos (96.08%)


## Hashing Logos

We use perceptual hashing to create a fingerprint of each logo image.
This allows us to compare how visually similar two logos are.


In [7]:
# Perceptual hash per domain; domains without a file on disk are skipped.
hashes = {}
for domain in domains:
    path = f"logos/{domain}.png"
    if not os.path.exists(path):
        continue
    try:
        # Open inside a context manager so the underlying file handle is
        # released deterministically, even if decoding fails part-way.
        with Image.open(path) as raw:
            # Normalize to grayscale at a fixed 64x64 size before hashing.
            img = raw.convert("L").resize((64, 64))
        h = imagehash.phash(img)
        hashes[domain] = h
    except Exception as e:
        # A file that cannot be opened or hashed is reported and left out
        # of `hashes`; the remaining domains are still processed.
        print(f"Error processing {domain}: {e}")




## Group Similar Logos

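We compare hashes using the Hamming distance (the number of bits that differ).  
Grouping is greedy: each logo that is not yet in a group starts a new one, and every remaining ungrouped logo whose distance to that first logo is 5 or less joins it. Domains for which no logo could be hashed each end up in a group of their own.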

In [8]:
# Maximum hash distance at which two logos are put in the same group.
SIMILARITY_THRESHOLD = 5
visited = set()
groups = []

# Greedy, seed-based grouping: the first ungrouped domain seeds a group and
# every later ungrouped domain within the threshold of that seed joins it.
# Members are compared with the seed only, not with each other.
domain_list = list(hashes.keys())
for i, d1 in enumerate(domain_list):
    if d1 in visited:
        continue
    group = [d1]
    visited.add(d1)
    for j in range(i + 1, len(domain_list)):
        d2 = domain_list[j]
        if d2 in visited:
            continue
        if hashes[d1] - hashes[d2] <= SIMILARITY_THRESHOLD:
            group.append(d2)
            visited.add(d2)
    groups.append(group)

# Domains that never received a hash become singleton groups. `visited`
# already holds exactly the domains placed in a group above, so a set lookup
# replaces re-flattening `groups` for every domain (which was quadratic).
grouped = set(visited)
for domain in domains:
    if domain not in grouped:
        groups.append([domain])
        grouped.add(domain)

print(f"Created {len(groups)} groups.")


Created 1929 groups.


## Visualize Logo Groups

We create a small preview image for each group, showing all logos in that group.


In [9]:
# Make sure the directory that receives the group preview images exists
# (exist_ok=True makes re-running this cell harmless).
os.makedirs("group_previews", exist_ok=True)

def create_group_preview(group, index):
    """Render the logos of one group side by side and save them as a PNG.

    Each logo found under ``logos/<domain>.png`` is converted to RGBA,
    resized to 128x128 and labelled with the first 15 characters of its
    domain. The tiles are pasted left to right onto one strip, which is
    saved as ``group_previews/group_<index>.png`` (index zero-padded to at
    least three digits).

    Args:
        group: Iterable of domain names belonging to the group.
        index: Integer used in the output file name.

    Returns:
        None. Nothing is written when no logo of the group could be loaded.
    """
    images = []
    for domain in group:
        path = f"logos/{domain}.png"
        if not os.path.exists(path):
            continue
        try:
            # Context manager releases the file handle deterministically;
            # the converted/resized copy stays usable after the file closes.
            with Image.open(path) as raw:
                img = raw.convert("RGBA").resize((128, 128))
            draw = ImageDraw.Draw(img)
            draw.text((5, 5), domain[:15], fill="black")
            images.append(img)
        except Exception:
            # Skip logos that cannot be loaded or drawn on, but (unlike a
            # bare ``except:``) let KeyboardInterrupt / SystemExit through.
            continue
    if images:
        width = 128 * len(images)
        combined = Image.new("RGBA", (width, 128), (255, 255, 255, 0))
        for i, img in enumerate(images):
            combined.paste(img, (i * 128, 0))
        combined.save(f"group_previews/group_{index:03d}.png")

# Render one preview per group; a group's position in `groups` is its file index.
for group_index, members in enumerate(groups):
    create_group_preview(members, group_index)

print("Previews saved in 'group_previews/' folder.")


Previews saved in 'group_previews/' folder.


## Save Group Results

We save all grouped domains to a JSON file.


In [10]:
# Persist the groups (a list of lists of domain names) as indented JSON.
with open("similar_logo_groups.json", "w") as out_file:
    out_file.write(json.dumps(groups, indent=2))
print("Saved as 'similar_logo_groups.json'")


Saved as 'similar_logo_groups.json'
