In [2]:
# # install al lthis stuff
# !pip install opencv-python
# !pip install ultralytics
# !pip install pillow
# !pip install faiss-cpu
# !pip install transformers

In [11]:
import torch
from PIL import Image
from transformers import BeitImageProcessor, BeitModel
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
from ultralytics import YOLO
import cv2
from torchvision import models, transforms

In [1]:
# BEiT Embedding because this article used it:
# https://medium.com/@nathanjjacob/how-to-build-a-logo-detection-recognition-system-at-scale-c2b094ae4fd2

# This basically turns an image into a vector
class BEiTEmbedding:
    def __init__(self, model_name="microsoft/beit-base-patch16-224"):
        # Load the BEiT model from Hugging Face
        self.feature_extractor = BeitImageProcessor.from_pretrained(model_name)  # Corrected class name
        self.model = BeitModel.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.feature_extractor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Use average pooling
        return embedding


class CLIPEmbedding:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def extract_embedding(self, img):
        inputs = self.processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.get_image_features(**inputs)
        return outputs.squeeze().numpy()
# ResNet Embedding
class ResNetEmbedding:
    def __init__(self, model_name="resnet50"):
        # Load the ResNet model from torchvision
        self.model = models.resnet50(pretrained=True)
        self.model.eval()  # Set the model to evaluation mode
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def extract_embedding(self, img):
        img = self.transform(img).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            embedding = self.model(img)  # Forward pass to get the embedding
        return embedding.squeeze().cpu().numpy()  # Remove batch dimension and return as numpy array


In [3]:
def compute_cosine_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)

    # this function is from sklearn. Thank you sklearn :)
    # this does the math for us
    return cosine_similarity(embedding1, embedding2)[0][0]
    # return euclidean_distances(embedding1, embedding2)
  
def compute_euclidean_distances(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)

    # this function is from sklearn. Thank you sklearn :)
    # this does the math for us
    # return cosine_similarity(embedding1, embedding2)
    return euclidean_distances(embedding1, embedding2)[0][0]
  


In [5]:
def extract_logo_regions(image_path, model, bounding_box_threshold=0.0):
    # Returns all of the logos found within an image

    img = cv2.imread(image_path)  # Read the image here
    results = model(img)
    logo_regions = []
    bounding_boxes = []
    counter = 1
    for box in results[0].boxes:
        xyxy = box.xyxy[0].tolist()
        x1, y1, x2, y2 = map(int, xyxy)
        confidence = box.conf[0].item()  # Get the confidence score for the detection

        # print(f'bounding box confidence: {confidence}')
        if confidence >= bounding_box_threshold:
          cropped_logo = img[y1:y2, x1:x2]  # Extract detected region
        

          if cropped_logo.size > 0:
            
            height, width = cropped_logo.shape[:2]

            # Calculate new width while maintaining aspect ratio
            new_height = 224
            new_width = int((new_height / height) * width)
            
            # Resize while keeping aspect ratio
            resized_logo = cv2.resize(cropped_logo, (new_width, new_height))

            cv2.imwrite(f'./cropped/cropped_{counter}_{image_path}', resized_logo)
            counter += 1
            # Convert grayscale to 3-channel (RGB-like) image
            # three_channel_logo = cv2.cvtColor(resized_logo, cv2.COLOR_GRAY2BGR)

            logo_regions.append(resized_logo)
            bounding_boxes.append((x1, y1, x2, y2))

    return logo_regions, bounding_boxes, img  # Return img here

In [21]:
def compare_logo_embeddings(input_path, reference_path, model, embedding_models, thresholds, score_threshold):

  '''
  input_path is the file location of the main image
  reference path is the file location of the reference image
  model is what we use for general logo detection (this will always be YOLO in our case)
  embedding_models will be an array of classes used to make our images vectors
  thresholds is a dict of the cosine and euclidean thresholds the image embeddings should meet for each embedding model
  score_threshold is a number of how many points an image needs to recieve for it to 'pass'
  '''
  # Extract logos and bounding boxes
  input_logos, input_bboxes, input_img = extract_logo_regions(input_path, model)
  reference_logos, _, _ = extract_logo_regions(reference_path, model)

  if not input_logos or not reference_logos:
    print("No logos detected in one or both images.")
    return

  # Initialize score tracker
  # Matrix of len(reference_logos) x len(input_logos). 
  # This keeps the scores separate for each reference logo found
  scores = [[0] * len(input_logos) for _ in range(len(reference_logos))]

  # For each embedding model
  for feature_extractor in embedding_models:
    # Get the name of the embedding model so we can index the thresholds dict
    model_name = type(feature_extractor).__name__

    # Get model-specific thresholds
    cosine_threshold = thresholds[model_name]["cosine"]
    euclidean_threshold = thresholds[model_name]["euclidean"]

    # Maybe send to user 'Embedding images' with a loading screen here
    # Compute embeddings and put them into an array
    input_embeddings = [feature_extractor.extract_embedding(Image.fromarray(logo)) for logo in input_logos]
    reference_embeddings = [feature_extractor.extract_embedding(Image.fromarray(logo)) for logo in reference_logos]

    # Maybe send to user 'iterating through images'
    # Iterate through each embeddings (basically each logo)
    for i, ref_embedding in enumerate(reference_embeddings):
      for j, input_embedding in enumerate(input_embeddings):
        # Compute similarity scores
        cosine_sim = compute_cosine_similarity(input_embedding, ref_embedding)
        euclidean_dist = compute_euclidean_distances(input_embedding, ref_embedding)

        # Check if similarities meet the model specific thresholds
        # Again, scores is a 2d array. Rows = num of reference images, cols = num of main image
        if cosine_sim >= cosine_threshold:
          scores[i][j] += 1 # Plus 1 if cosine sim is met
        if euclidean_dist <= euclidean_threshold:
          scores[i][j] += 1 # Plus 1 if euclidean distance is met

        # Print what the scores are
        print(f'{model_name} score: {scores[i][j]}')



  # Final decision: Classify as match if score is at least 4/6
  for i in range(len(reference_logos)):  # Iterate over reference logos (rows)
    for j in range(len(input_logos)):  # Iterate over input logos (columns)
      
      if scores[i][j] >= score_threshold:  # Check per reference logo
        x1, y1, x2, y2 = input_bboxes[j]
        color = (255, 255, 255)  # White bounding box. We can change this later if needed
        cv2.rectangle(input_img, (x1, y1), (x2, y2), color, 2)

  # Save output image with bounding boxes
  cv2.imwrite("output_image.jpg", input_img)
  print("Processed image saved.")


In [26]:
input_image_path = "starbucks.jpg"  # Change this to your image file
reference_image_path = "starbucks4.png"
yolo_model = YOLO("../488_back/best.pt")  # Load your YOLO model

embedding_models = [BEiTEmbedding(), CLIPEmbedding(), ResNetEmbedding()]
thresholds = {
  'BEiTEmbedding': {'cosine': .3, 'euclidean': 110},
  'CLIPEmbedding': {'cosine': .65, 'euclidean': 7.5},
  'ResNetEmbedding': {'cosine': .75, 'euclidean': 50}
}


print('BEiT')
compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, embedding_models, thresholds, 4)

# print(f'\nCLIP \n---------------------------------------------------------------------------')
# compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, CLIPEmbedding())

# print(f'\nResNet \n-------------------------------------------------------------------------')
# compare_logo_embeddings(input_image_path, reference_image_path, yolo_model, ResNetEmbedding())


  return func(*args, **kwargs)


BEiT

0: 448x640 3 logos, 77.7ms
Speed: 4.0ms preprocess, 77.7ms inference, 1.2ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 1 logo, 94.4ms
Speed: 5.0ms preprocess, 94.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)
BEiTEmbedding score: 0
BEiTEmbedding score: 0
BEiTEmbedding score: 1
CLIPEmbedding score: 0
CLIPEmbedding score: 0
CLIPEmbedding score: 3
ResNetEmbedding score: 0
ResNetEmbedding score: 0
ResNetEmbedding score: 3
Processed image saved.
