<a href="https://colab.research.google.com/github/caleb-stewart/Trademark-Analysis-Identification-Tool/blob/main/video_trait.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install opencv-python
!pip install ultralytics
!pip install pillow
!pip install faiss-cpu

Collecting ultralytics
  Downloading ultralytics-8.3.99-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

In [4]:
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as tr
import cv2
import faiss
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BeitFeatureExtractor, BeitModel
from torchvision.models.feature_extraction import create_feature_extractor
from ultralytics import YOLO
import os

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [5]:
class EmbeddingExtractor:
    """Class for extracting image embeddings using ResNet-50, CLIP, and BEiT."""

    def __init__(self, device=None):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Load ResNet-50 model
        self.resnet = torchvision.models.resnet50(pretrained=True)
        self.resnet.fc = nn.Identity()
        self.resnet = self.resnet.to(self.device).eval()

        # Load CLIP model
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device).eval()
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        # Load BEiT model
        self.beit_model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224").to(self.device).eval()
        self.beit_processor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224")

    def preprocess_resnet(self, img):
        transformations = tr.Compose([
            tr.Resize((224, 224)),
            tr.ToTensor(),
            tr.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
        img = transformations(img).unsqueeze(0).to(self.device)
        return img

    def preprocess_clip(self, img):
        return self.clip_processor(images=img, return_tensors="pt")["pixel_values"].to(self.device)

    def preprocess_beit(self, img):
        return self.beit_processor(images=img, return_tensors="pt")["pixel_values"].to(self.device)

    def get_embedding(self, img, model_name="resnet"):

        img = Image.fromarray(img)

        if model_name == "resnet":
            img_tensor = self.preprocess_resnet(img)
            with torch.no_grad():
                embedding = self.resnet(img_tensor).cpu().numpy()

        elif model_name == "clip":
            img_tensor = self.preprocess_clip(img)
            with torch.no_grad():
                embedding = self.clip_model.get_image_features(img_tensor).cpu().numpy()

        elif model_name == "beit":
            img_tensor = self.preprocess_beit(img)
            with torch.no_grad():
                embedding = self.beit_model(img_tensor).last_hidden_state.mean(dim=1).cpu().numpy()

        else:
            raise ValueError("Invalid model name. Choose from: resnet, clip, beit.")

        return embedding

    @staticmethod
    def cosine_similarity(emb1, emb2):

        return torch.nn.functional.cosine_similarity(torch.tensor(emb1), torch.tensor(emb2)).item()

    @staticmethod
    def euclidean_distance(emb1, emb2):

        return np.linalg.norm(emb1 - emb2)

In [8]:
model = YOLO("best.pt")
similarity_checker = EmbeddingExtractor()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 125MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/350M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

  return func(*args, **kwargs)


In [9]:
def extract_logo_regions(image, save_crop=False, output_dir="cropped_logos"):
    """Runs YOLO on an image and extracts detected logo regions."""

    # Check if input is a file path or an image array
    if isinstance(image, str):
        img = cv2.imread(image)
    else:
        img = image

    if img is None:
        print("Error: Could not load image.")
        return [], []

    results = model(img)
    logo_regions = []
    bounding_boxes = []

    if save_crop and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for idx, box in enumerate(results[0].boxes):
        xyxy = box.xyxy[0].tolist()
        x1, y1, x2, y2 = map(int, xyxy)
        cropped_logo = img[y1:y2, x1:x2]  # extract detected region

        if save_crop and cropped_logo.size > 0:
            cropped_logo_path = os.path.join(output_dir, f"cropped_logo_{idx}.jpg")
            cv2.imwrite(cropped_logo_path, cropped_logo)
            print(f"Logo {idx} saved: {cropped_logo_path}")

        if cropped_logo.size > 0:
            logo_regions.append(cropped_logo)
            bounding_boxes.append((x1, y1, x2, y2))
            print(f"Logo {idx} detected at coordinates: ({x1}, {y1}) -> ({x2}, {y2})")

    return logo_regions, bounding_boxes

In [10]:
import cv2
import torch
import torchvision.transforms as tr
import numpy as np
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BeitModel, BeitFeatureExtractor



def process_video(input_video_path, output_video_path, reference_image_path, similarity_threshold=0.50, frame_skip=5):
    cap = cv2.VideoCapture(input_video_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    frame_idx = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # stop if video ends

        if frame_idx % frame_skip == 0:  # process every 5th frame
            print(f"Processing frame {frame_idx}")

            # extract detected logos from the current frame
            input_logos, input_bboxes = extract_logo_regions(frame, save_crop=False)
            reference_logos, _ = extract_logo_regions(reference_image_path, save_crop=False)

            if not input_logos or not reference_logos:
                print("No logos detected in one or both images.")
            else:
                save_frame = False
                for idx, input_logo in enumerate(input_logos):
                    votes = 0
                    for ref_logo in reference_logos:
                        for model_name in ["resnet" ]:
                            input_embedding = similarity_checker.get_embedding(input_logo, model_name)
                            ref_embedding = similarity_checker.get_embedding(ref_logo, model_name)

                            cosine_sim = similarity_checker.cosine_similarity(input_embedding, ref_embedding)
                            euclidean_dist = similarity_checker.euclidean_distance(input_embedding, ref_embedding)

                            if cosine_sim >= similarity_threshold:
                                votes += 1
                            if euclidean_dist <= 20:
                                votes += 1

                        if votes >= 0:
                            x1, y1, x2, y2 = input_bboxes[idx]
                            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 255), 5)
                            save_frame = True
                            break

                if save_frame:
                    print(f"Match found in frame {frame_idx}!")

        out.write(frame)  # write processed frame to output

        frame_idx += 1

    cap.release()
    out.release()
    print(f"Processed video saved as {output_video_path}")



In [11]:

input_video_path = "video.mp4"  # path to input video
output_video_path = "output_video.mp4"  # output processed video
reference_image_path = "star2.png"  # reference image for logo detection

process_video(input_video_path, output_video_path, reference_image_path)

Processing frame 0

0: 640x384 1 logo, 289.5ms
Speed: 17.7ms preprocess, 289.5ms inference, 26.7ms postprocess per image at shape (1, 3, 640, 384)
Logo 0 detected at coordinates: (882, 1642) -> (1504, 2290)

0: 384x640 1 logo, 140.9ms
Speed: 3.4ms preprocess, 140.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
Logo 0 detected at coordinates: (84, 15) -> (213, 143)
Match found in frame 0!
Processing frame 5

0: 640x384 1 logo, 185.2ms
Speed: 6.7ms preprocess, 185.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)
Logo 0 detected at coordinates: (881, 1644) -> (1505, 2292)

0: 384x640 1 logo, 132.7ms
Speed: 2.9ms preprocess, 132.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Logo 0 detected at coordinates: (84, 15) -> (213, 143)
Match found in frame 5!
Processing frame 10

0: 640x384 1 logo, 152.6ms
Speed: 4.6ms preprocess, 152.6ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
Logo 0 detected at coordinates: 