<a href="https://colab.research.google.com/github/caleb-stewart/Trademark-Analysis-Identification-Tool/blob/main/video_trait.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
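# Uncomment the lines below to install dependencies in a fresh environment (e.g. Colab).
# Note: the import cell also requires torch, torchvision and transformers, which are not listed here.
# !pip install opencv-python
# !pip install ultralytics
# !pip install pillow
# !pip install faiss-cpu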

Collecting ultralytics
  Downloading ultralytics-8.3.99-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

In [1]:
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as tr
import cv2
import faiss
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BeitFeatureExtractor, BeitModel
from torchvision.models.feature_extraction import create_feature_extractor
from ultralytics import YOLO
import os

In [2]:
class EmbeddingExtractor:
    """Extract image embeddings with ResNet-50, CLIP or BEiT and compare them.

    All three models are loaded in ``__init__``, moved to ``self.device`` and
    put in eval mode. ``get_embedding`` returns a NumPy array on the CPU.

    NOTE(review): ``get_embedding`` hands the array to ``Image.fromarray``,
    which interprets a 3-channel array as RGB. Arrays coming straight from
    OpenCV are BGR — confirm whether callers should convert first.
    """

    def __init__(self, device=None):
        """Load the three backbone models.

        Args:
            device: Torch device (e.g. ``'cpu'`` or ``'cuda'``). When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        self.device = self._resolve_device(device)

        # ResNet-50 with the final fully connected layer replaced by Identity,
        # so the forward pass returns the features that would feed that layer.
        # NOTE(review): recent torchvision releases deprecate `pretrained=` in
        # favour of `weights=` — consider switching once the minimum
        # torchvision version is known.
        self.resnet = torchvision.models.resnet50(pretrained=True)
        self.resnet.fc = nn.Identity()
        self.resnet = self.resnet.to(self.device).eval()

        # The ResNet preprocessing pipeline is constant, so build it once here
        # instead of on every preprocess_resnet() call.
        self._resnet_transform = tr.Compose([
            tr.Resize((224, 224)),
            tr.ToTensor(),
            tr.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # CLIP model and its matching processor
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device).eval()
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        # BEiT model and its matching feature extractor
        self.beit_model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224").to(self.device).eval()
        self.beit_processor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224")

    @staticmethod
    def _resolve_device(device=None):
        """Return ``device`` if given, else ``'cuda'`` when available or ``'cpu'``."""
        if device is not None:
            return device
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def preprocess_resnet(self, img):
        """Resize, tensorize and normalize a PIL image; return a 1-image batch on ``self.device``."""
        return self._resnet_transform(img).unsqueeze(0).to(self.device)

    def preprocess_clip(self, img):
        """Run the CLIP processor and return its ``pixel_values`` tensor on ``self.device``."""
        return self.clip_processor(images=img, return_tensors="pt")["pixel_values"].to(self.device)

    def preprocess_beit(self, img):
        """Run the BEiT processor and return its ``pixel_values`` tensor on ``self.device``."""
        return self.beit_processor(images=img, return_tensors="pt")["pixel_values"].to(self.device)

    def get_embedding(self, img, model_name="resnet"):
        """Compute an embedding for one image.

        Args:
            img: Image as an array accepted by ``PIL.Image.fromarray``.
            model_name: One of ``"resnet"``, ``"clip"`` or ``"beit"``.

        Returns:
            The embedding as a NumPy array on the CPU.

        Raises:
            ValueError: If ``model_name`` is not one of the supported names.
        """
        # Force three channels so grayscale or RGBA arrays do not break the
        # 3-channel normalization used for ResNet.
        img = Image.fromarray(img).convert("RGB")

        with torch.no_grad():
            if model_name == "resnet":
                features = self.resnet(self.preprocess_resnet(img))
            elif model_name == "clip":
                features = self.clip_model.get_image_features(pixel_values=self.preprocess_clip(img))
            elif model_name == "beit":
                # Mean-pool the token dimension of the last hidden state.
                features = self.beit_model(pixel_values=self.preprocess_beit(img)).last_hidden_state.mean(dim=1)
            else:
                raise ValueError("Invalid model name. Choose from: resnet, clip, beit.")

        return features.cpu().numpy()

    @staticmethod
    def cosine_similarity(emb1, emb2):
        """Return the cosine similarity of two embeddings as a Python float.

        Uses ``torch.nn.functional.cosine_similarity`` with its default
        dimension, then ``.item()``, so the result must hold exactly one value.
        Presumably both inputs are single-row 2-D arrays as produced by
        ``get_embedding`` — verify against callers.
        """
        return torch.nn.functional.cosine_similarity(torch.tensor(emb1), torch.tensor(emb2)).item()

    @staticmethod
    def euclidean_distance(emb1, emb2):
        """Return ``np.linalg.norm(emb1 - emb2)`` for two NumPy embeddings."""
        return np.linalg.norm(emb1 - emb2)

In [6]:
# Load the YOLO detector from the local weights file "best.pt".
# extract_logo_regions() reads this module-level `model`.
model = YOLO("best.pt")
# Build the embedding extractor; its constructor loads the ResNet-50, CLIP and BEiT models.
# process_video() reads this module-level `similarity_checker`.
similarity_checker = EmbeddingExtractor()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return func(*args, **kwargs)


In [8]:
def extract_logo_regions(image, save_crop=False, output_dir="cropped_logos"):
    """Run the YOLO detector on an image and return the detected regions.

    Args:
        image: Either a file path (``str``), which is read with
            ``cv2.imread``, or an already loaded image array.
        save_crop: When True, every non-empty crop is also written to
            ``output_dir`` as ``cropped_logo_<idx>.jpg``.
        output_dir: Directory for saved crops; created if it does not exist.

    Returns:
        ``(logo_regions, bounding_boxes)`` — two parallel lists. Each region
        is a slice of the input array and each box is an
        ``(x1, y1, x2, y2)`` tuple of ints. Both lists are empty when the
        image could not be loaded.
    """
    # Accept either a path on disk or an in-memory image array.
    img = cv2.imread(image) if isinstance(image, str) else image

    if img is None:
        print("Error: Could not load image.")
        return [], []

    results = model(img)
    img_height, img_width = img.shape[:2]
    logo_regions = []
    bounding_boxes = []

    if save_crop and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for idx, box in enumerate(results[0].boxes):
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())

        # Clamp to the image bounds: a negative coordinate would otherwise be
        # treated as "index from the end" by the slice below and yield a
        # wrong or empty crop.
        x1, x2 = max(0, min(x1, img_width)), max(0, min(x2, img_width))
        y1, y2 = max(0, min(y1, img_height)), max(0, min(y2, img_height))

        cropped_logo = img[y1:y2, x1:x2]  # extract detected region
        if cropped_logo.size == 0:
            # Degenerate box: nothing to save or report.
            continue

        if save_crop:
            cropped_logo_path = os.path.join(output_dir, f"cropped_logo_{idx}.jpg")
            # cv2.imwrite signals failure through its return value rather
            # than raising, so only report success when it returns True.
            if cv2.imwrite(cropped_logo_path, cropped_logo):
                print(f"Logo {idx} saved: {cropped_logo_path}")
            else:
                print(f"Warning: Could not save logo {idx} to {cropped_logo_path}")

        logo_regions.append(cropped_logo)
        bounding_boxes.append((x1, y1, x2, y2))
        print(f"Logo {idx} detected at coordinates: ({x1}, {y1}) -> ({x2}, {y2})")

    return logo_regions, bounding_boxes

In [48]:
import cv2
import torch
import torchvision.transforms as tr
import numpy as np
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BeitModel, BeitFeatureExtractor

# Per-model decision thresholds used when process_video votes on a match.
# Keys are the embedding model names; for each model:
#   'cosine'    - a vote is cast when the cosine similarity is >= this value
#   'euclidean' - a vote is cast when the Euclidean distance is <= this value
# NOTE(review): the values look empirically tuned per model — confirm where they came from.
thresholds = {
  'beit': {'cosine': .3, 'euclidean': 110},
  'clip': {'cosine': .65, 'euclidean': 7.5},
  'resnet': {'cosine': .75, 'euclidean': 50}
}



def _logo_matches_reference(input_logo, reference_embeddings, min_votes=2):
    """Return True once ``input_logo`` collects ``min_votes`` similarity votes.

    For every model in ``reference_embeddings`` and every reference embedding
    of that model, one vote is cast when the cosine similarity reaches the
    model's 'cosine' threshold and one when the Euclidean distance is within
    its 'euclidean' threshold. Embeddings are computed per model only when
    needed, and the search stops as soon as the outcome is decided.
    """
    votes = 0
    for model_name, ref_embeddings in reference_embeddings.items():
        if not ref_embeddings:
            continue

        input_embedding = similarity_checker.get_embedding(input_logo, model_name)
        model_thresholds = thresholds[model_name]

        for ref_embedding in ref_embeddings:
            cosine_sim = similarity_checker.cosine_similarity(input_embedding, ref_embedding)
            euclidean_dist = similarity_checker.euclidean_distance(input_embedding, ref_embedding)

            if cosine_sim >= model_thresholds['cosine']:
                votes += 1
            if euclidean_dist <= model_thresholds['euclidean']:
                votes += 1

            if votes >= min_votes:
                return True

    return False


def process_video(input_video_path, output_video_path, reference_image_path, similarity_threshold=0.50, frame_skip=5):
    """Annotate a video with a box around the first logo matching a reference image.

    Logos are detected in the reference image and embedded once with the
    resnet, beit and clip models. Every ``frame_skip``-th frame is then run
    through the detector; the first detected logo that matches any reference
    logo gets a white rectangle drawn on the frame. Every frame, processed or
    not, is written to the output video.

    Args:
        input_video_path: Path of the video to read.
        output_video_path: Path of the annotated video to write (mp4v codec).
        reference_image_path: Image containing the reference logo(s).
        similarity_threshold: Currently unused; the per-model values in the
            module-level ``thresholds`` dict decide matches. Kept so existing
            callers keep working.
        frame_skip: Run detection on every ``frame_skip``-th frame.

    Raises:
        ValueError: If ``frame_skip`` is smaller than 1.
    """
    if frame_skip < 1:
        # Fail early: 0 would raise ZeroDivisionError in the modulo below,
        # after the capture and writer have already been opened.
        raise ValueError("frame_skip must be a positive integer.")

    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Error: Could not open video {input_video_path}.")
        return

    out = None
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the playback speed/duration of the output.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            # NOTE(review): fallback for sources that report no frame rate;
            # 30.0 is an arbitrary default — adjust if a better value is known.
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

        # Embed every reference logo once, grouped by model name.
        reference_logos, _ = extract_logo_regions(reference_image_path, save_crop=False)
        reference_embeddings = {
            model_name: [similarity_checker.get_embedding(ref_logo, model_name) for ref_logo in reference_logos]
            for model_name in ("resnet", "beit", "clip")
        }

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # stop if video ends

            if frame_idx % frame_skip == 0:  # process every frame_skip-th frame
                print(f"Processing frame {frame_idx}")

                # extract detected logos from the current frame
                input_logos, input_bboxes = extract_logo_regions(frame, save_crop=False)

                if not input_logos or not reference_logos:
                    print("No logos detected in one or both images.")
                else:
                    for bbox, input_logo in zip(input_bboxes, input_logos):
                        if _logo_matches_reference(input_logo, reference_embeddings):
                            x1, y1, x2, y2 = bbox
                            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 5)
                            print(f"Match found in frame {frame_idx}!")
                            # Only the first matching logo per frame is marked.
                            break

            out.write(frame)  # write processed frame to output
            frame_idx += 1
    finally:
        # Release the capture and writer even if detection or embedding fails.
        cap.release()
        if out is not None:
            out.release()

    print(f"Processed video saved as {output_video_path}")



In [50]:

# Example run: annotate a video with matches against a reference logo image.
# All three paths are relative to the notebook's working directory.
input_video_path = "starbucks_video.mp4"  # path to input video
output_video_path = "output_video.mp4"  # output processed video
reference_image_path = "starbucks4.png"  # reference image for logo detection

# Uses the default frame_skip, so detection runs on every 5th frame.
process_video(input_video_path, output_video_path, reference_image_path)


0: 640x640 1 logo, 131.2ms
Speed: 7.6ms preprocess, 131.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)
Logo 0 detected at coordinates: (2, 6) -> (221, 224)
Processing frame 0

0: 384x640 1 logo, 120.6ms
Speed: 3.8ms preprocess, 120.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)
Logo 0 detected at coordinates: (1128, 503) -> (1345, 654)
Match found in frame 0!
Processing frame 5

0: 384x640 1 logo, 82.5ms
Speed: 2.9ms preprocess, 82.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
Logo 0 detected at coordinates: (1129, 504) -> (1343, 654)
Match found in frame 5!
Processing frame 10

0: 384x640 1 logo, 98.8ms
Speed: 3.5ms preprocess, 98.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
Logo 0 detected at coordinates: (1129, 503) -> (1342, 653)
Match found in frame 10!
Processing frame 15

0: 384x640 1 logo, 87.5ms
Speed: 3.3ms preprocess, 87.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 