In [2]:
# # install al lthis stuff
# !pip install opencv-python
# !pip install ultralytics
# !pip install pillow
# !pip install faiss-cpu
# !pip install transformers

In [7]:
import torch
from PIL import Image
from transformers import BeitImageProcessor, BeitModel
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from ultralytics import YOLO
import cv2

In [8]:
# BEiT Embedding because this article used it:
# https://medium.com/@nathanjjacob/how-to-build-a-logo-detection-recognition-system-at-scale-c2b094ae4fd2

# This basically turns an image into a vector
class BEiTEmbedding:
    def __init__(self, model_name="microsoft/beit-base-patch16-224"):
        # Load the BEiT model from Hugging Face
        self.feature_extractor = BeitImageProcessor.from_pretrained(model_name)  # Corrected class name
        self.model = BeitModel.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.feature_extractor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Use average pooling
        return embedding


class CLIPEmbedding:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.get_image_features(**inputs)
        return outputs.squeeze().numpy()



In [9]:
def compute_cosine_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)
    # this function is from sklearn. Thank you sklearn :)
    # this does the math for us
    return cosine_similarity(embedding1, embedding2)


In [30]:
def extract_logo_regions(image_path, model):
    # Returns all of the logos found within an image

    img = cv2.imread(image_path)  # Read the image here
    results = model(img)
    logo_regions = []
    bounding_boxes = []
    counter = 1
    for box in results[0].boxes:
        xyxy = box.xyxy[0].tolist()
        x1, y1, x2, y2 = map(int, xyxy)
        cropped_logo = img[y1:y2, x1:x2]  # Extract detected region

        if cropped_logo.size > 0:
            
            height, width = cropped_logo.shape[:2]

            # Calculate new width while maintaining aspect ratio
            new_height = 128
            new_width = int((new_height / height) * width)
            
            # Resize while keeping aspect ratio
            resized_logo = cv2.resize(cropped_logo, (new_width, new_height))

            cv2.imwrite(f'./cropped/cropped_{counter}_{image_path}', resized_logo)
            counter += 1
            # Convert grayscale to 3-channel (RGB-like) image
            # three_channel_logo = cv2.cvtColor(resized_logo, cv2.COLOR_GRAY2BGR)

            logo_regions.append(resized_logo)
            bounding_boxes.append((x1, y1, x2, y2))

    return logo_regions, bounding_boxes, img  # Return img here

def compare_logo_embeddings(input_path, reference_path, model, feature_extractor, similarity_threshold=0.0):
    #compare logos using embeddings
    
    # Extract logos and bounding boxes, also get the image
    input_logos, input_bboxes, input_img = extract_logo_regions(input_path, model)
    reference_logos, reference_bboxes, reference_img = extract_logo_regions(reference_path, model)
    
    if not input_logos or not reference_logos:
        print("No logos detected in one or both images.")
        return
    
    # get embeddings (vectors) from the reference and input image
    input_embeddings = [feature_extractor.extract_embedding(Image.fromarray(input_logo)) for input_logo in input_logos]
    reference_embeddings = [feature_extractor.extract_embedding(Image.fromarray(reference_logo)) for reference_logo in reference_logos]
    
    # Compare logos using cosine similarity
    for index, input_embedding in enumerate(input_embeddings):
        for ref_index, reference_embedding in enumerate(reference_embeddings):
            similarity = compute_cosine_similarity(input_embedding, reference_embedding)

            print(f'similarity score: {similarity}')
            # If similarity is above the threshold, then they match
            if similarity >= similarity_threshold:
                # Get the bounding box for the matching input logo
                x1, y1, x2, y2 = input_bboxes[index]
                color = [255, 255, 255]
                cv2.rectangle(input_img, (x1, y1), (x2, y2), color, 2)  # Draw on the input image

    # Save the processed image after drawing the rectangles
    cv2.imwrite("output_image.jpg", input_img)
    print("Processed image saved.")

In [35]:
input_image_path = "starbucks.jpg"  # Change this to your image file
reference_image_path = "reference.jpg"
yolo_model = YOLO("../488_back/best.pt")  # Load your YOLO model
compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, BEiTEmbedding())

  return func(*args, **kwargs)



0: 448x640 3 logos, 20.2ms
Speed: 1.2ms preprocess, 20.2ms inference, 0.4ms postprocess per image at shape (1, 3, 448, 640)

0: 320x640 1 logo, 17.0ms
Speed: 0.9ms preprocess, 17.0ms inference, 0.4ms postprocess per image at shape (1, 3, 320, 640)
similarity score: [[    0.10802]]
similarity score: [[    0.13226]]
similarity score: [[    0.46391]]
Processed image saved.
