In [2]:
# # install al lthis stuff
# !pip install opencv-python
# !pip install ultralytics
# !pip install pillow
# !pip install faiss-cpu
# !pip install transformers

In [3]:
import torch
from PIL import Image
from transformers import BeitImageProcessor, BeitModel
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
from ultralytics import YOLO
import cv2
from torchvision import models, transforms

In [4]:
# BEiT Embedding because this article used it:
# https://medium.com/@nathanjjacob/how-to-build-a-logo-detection-recognition-system-at-scale-c2b094ae4fd2

# This basically turns an image into a vector
class BEiTEmbedding:
    def __init__(self, model_name="microsoft/beit-base-patch16-224"):
        # Load the BEiT model from Hugging Face
        self.feature_extractor = BeitImageProcessor.from_pretrained(model_name)  # Corrected class name
        self.model = BeitModel.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.feature_extractor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Use average pooling
        return embedding


class CLIPEmbedding:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.get_image_features(**inputs)
        return outputs.squeeze().numpy()
# ResNet Embedding
class ResNetEmbedding:
    def __init__(self, model_name="resnet50"):
        # Load the ResNet model from torchvision
        self.model = models.resnet50(pretrained=True)
        self.model.eval()  # Set the model to evaluation mode
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def extract_embedding(self, img):
        img = self.transform(img).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            embedding = self.model(img)  # Forward pass to get the embedding
        return embedding.squeeze().cpu().numpy()  # Remove batch dimension and return as numpy array


In [5]:
def compute_cosine_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)

    # this function is from sklearn. Thank you sklearn :)
    # this does the math for us
    return cosine_similarity(embedding1, embedding2)[0][0]
    # return euclidean_distances(embedding1, embedding2)
  
def compute_euclidean_distances(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)

    # this function is from sklearn. Thank you sklearn :)
    # this does the math for us
    # return cosine_similarity(embedding1, embedding2)
    return euclidean_distances(embedding1, embedding2)[0][0]
  


In [6]:
def extract_logo_regions(image_path, model, bounding_box_threshold=0.0):
    # Returns all of the logos found within an image

    img = cv2.imread(image_path)  # Read the image here
    results = model(img)
    logo_regions = []
    bounding_boxes = []
    counter = 1
    for box in results[0].boxes:
        xyxy = box.xyxy[0].tolist()
        x1, y1, x2, y2 = map(int, xyxy)
        confidence = box.conf[0].item()  # Get the confidence score for the detection

        # print(f'bounding box confidence: {confidence}')
        if confidence >= bounding_box_threshold:
          cropped_logo = img[y1:y2, x1:x2]  # Extract detected region
        

          if cropped_logo.size > 0:
            
            height, width = cropped_logo.shape[:2]

            # Calculate new width while maintaining aspect ratio
            new_height = 224
            new_width = int((new_height / height) * width)
            
            # Resize while keeping aspect ratio
            resized_logo = cv2.resize(cropped_logo, (new_width, new_height))

            cv2.imwrite(f'./cropped/cropped_{counter}_{image_path}', resized_logo)
            counter += 1
            # Convert grayscale to 3-channel (RGB-like) image
            # three_channel_logo = cv2.cvtColor(resized_logo, cv2.COLOR_GRAY2BGR)

            logo_regions.append(resized_logo)
            bounding_boxes.append((x1, y1, x2, y2))

    return logo_regions, bounding_boxes, img  # Return img here

def compare_logo_embeddings(input_path, reference_path, model, feature_extractor, similarity_threshold=0.0):
    #compare logos using embeddings
    
    # Extract logos and bounding boxes, also get the image
    input_logos, input_bboxes, input_img = extract_logo_regions(input_path, model)
    reference_logos, reference_bboxes, reference_img = extract_logo_regions(reference_path, model)
    
    if not input_logos or not reference_logos:
        print("No logos detected in one or both images.")
        return
    
    # get embeddings (vectors) from the reference and input image
    input_embeddings = [feature_extractor.extract_embedding(Image.fromarray(input_logo)) for input_logo in input_logos]
    reference_embeddings = [feature_extractor.extract_embedding(Image.fromarray(reference_logo)) for reference_logo in reference_logos]
    
    # Compare logos using cosine similarity
    for index, input_embedding in enumerate(input_embeddings):
        for ref_index, reference_embedding in enumerate(reference_embeddings):
            cos_similarity = compute_cosine_similarity(input_embedding, reference_embedding)
            euc_dist = compute_euclidean_distances(input_embedding, reference_embedding)

            print(f'cosine similarity score: {cos_similarity}')
            print(f'euclidean distance score: {euc_dist}\n')
            # If similarity is above the threshold, then they match
            if cos_similarity >= similarity_threshold:
                # Get the bounding box for the matching input logo
                x1, y1, x2, y2 = input_bboxes[index]
                color = [255, 255, 255]
                cv2.rectangle(input_img, (x1, y1), (x2, y2), color, 2)  # Draw on the input image

    # Save the processed image after drawing the rectangles
    cv2.imwrite("output_image.jpg", input_img)
    print("Processed image saved.")

In [32]:
input_image_path = "starbucks2.jpg"  # Change this to your image file
reference_image_path = "starbucks4.png"
yolo_model = YOLO("../488_back/best.pt")  # Load your YOLO model

print('BEiT')
compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, BEiTEmbedding())

print(f'\nCLIP \n---------------------------------------------------------------------------')
compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, CLIPEmbedding())

print(f'\nResNet \n-------------------------------------------------------------------------')
compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, ResNetEmbedding())


BEiT


  return func(*args, **kwargs)



0: 448x640 1 logo, 107.5ms
Speed: 6.1ms preprocess, 107.5ms inference, 1.1ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 1 logo, 125.4ms
Speed: 6.3ms preprocess, 125.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)
cosine similarity score: 0.23492184281349182
euclidean distance score: 130.15493774414062

Processed image saved.

CLIP 
---------------------------------------------------------------------------

0: 448x640 1 logo, 118.6ms
Speed: 5.3ms preprocess, 118.6ms inference, 1.3ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 1 logo, 126.8ms
Speed: 6.5ms preprocess, 126.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
cosine similarity score: 0.8996052742004395
euclidean distance score: 5.047374725341797

Processed image saved.

ResNet 
-------------------------------------------------------------------------





0: 448x640 1 logo, 121.9ms
Speed: 4.3ms preprocess, 121.9ms inference, 1.3ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 1 logo, 125.2ms
Speed: 5.4ms preprocess, 125.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)
cosine similarity score: 0.7854627370834351
euclidean distance score: 48.24747085571289

Processed image saved.
