# Prediction Pipeline using YOLO + Siamese

### Import libraries

In [1]:
from ultralytics import YOLO
from PIL import Image
import os
import cv2
import os
import numpy as np
import datetime
import tensorflow as tf
import csv
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Layer, Conv2D, Dense, MaxPooling2D, Input, Flatten

from tensorflow.keras import backend, layers, metrics
from tensorflow.keras.applications.xception import preprocess_input as xception_preprocess_input
from tensorflow.keras.applications import Xception
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.models import load_model


# Project root: the parent directory of the current working directory.
HOME = os.path.dirname(os.getcwd())
print(HOME)

e:\Code\CowId


### Load YOLO model

In [2]:
# Load the YOLO detection model from the train11 weights file (best.pt)
# located under HOME/data/yolo_models.

yolo_model = YOLO(f"{HOME}/data/yolo_models/train11/weights/best.pt")


### Load Siamese model

In [3]:
def classify_image(encoder, cow1, cow2, threshold=1.3):
    """Return the squared Euclidean distance between two encoded images.

    Each image is wrapped in a batch of one and passed through
    ``encoder.predict``; the element-wise squared difference of the two
    encodings is summed over the last axis.

    Args:
        encoder: Object exposing ``predict(batch)`` that returns an array
            of encodings, one row per batch item.
        cow1: First image, in whatever form the encoder accepts.
        cow2: Second image, same form as ``cow1``.
        threshold: Accepted for interface compatibility; not used by this
            function (callers do their own thresholding).

    Returns:
        The distance for the single image pair (a scalar).
    """
    first_batch = np.array([cow1])
    second_batch = np.array([cow2])

    encoding_a = encoder.predict(first_batch)
    encoding_b = encoder.predict(second_batch)

    difference = encoding_a - encoding_b
    squared_distance = np.sum(np.square(difference), axis=-1)
    # Only one pair was encoded, so unwrap the batch dimension.
    return squared_distance[0]

In [4]:
# Create bounding square to match siamese model input

def expand_rectangle_to_square(x1, y1, x2, y2):
    """Grow a rectangle into the smallest square sharing its center.

    The square's side equals the longer of the rectangle's two sides, and
    it is centered on the rectangle's center. The result is not clamped,
    so coordinates may fall outside the original image bounds.

    Args:
        x1, y1: One corner of the rectangle.
        x2, y2: The opposite corner of the rectangle.

    Returns:
        Tuple ``(new_x1, new_y1, new_x2, new_y2)`` of the square's
        top-left and bottom-right corners.
    """
    side = max(abs(x2 - x1), abs(y2 - y1))
    half_side = side / 2

    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2

    return (
        mid_x - half_side,
        mid_y - half_side,
        mid_x + half_side,
        mid_y + half_side,
    )

In [5]:
# Crop to correct size

def crop_to_square(image, x1, y1, x2, y2):
    """Cut a box out of an image array and scale it to 128x128.

    Args:
        image: Image array accepted by ``PIL.Image.fromarray``.
        x1, y1: Top-left corner of the crop box.
        x2, y2: Bottom-right corner of the crop box.

    Returns:
        A PIL image of size 128x128 containing the cropped region.
    """
    pil_image = Image.fromarray(image)
    region = pil_image.crop((x1, y1, x2, y2))
    return region.resize((128, 128))

In [6]:
def seconds_to_time(seconds):
    """Format a duration in seconds as an ``HH:MM:SS`` string.

    The hour field is computed from the full duration, so values of 24
    hours or more keep counting (e.g. ``25:00:00``) instead of wrapping
    back to zero. Fractional seconds are truncated. Negative durations are
    rendered with a leading ``-``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted time string, zero-padded to at least two digits per
        field.
    """
    # timedelta normalizes the input (including microsecond rounding);
    # total_seconds() keeps the days component that .seconds would drop.
    total = int(datetime.timedelta(seconds=seconds).total_seconds())
    sign = '-' if total < 0 else ''

    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)

    return f'{sign}{hours:02}:{minutes:02}:{secs:02}'

In [7]:
def preload_validation_images(db_loc):
    """Load every readable image in a directory, ready for comparison.

    Each image is read with OpenCV, converted from BGR to RGB, and resized
    to 128x128. Both the Xception-preprocessed array and the plain resized
    RGB array are kept.

    Entries that OpenCV cannot decode (sub-directories, non-image files,
    corrupt images) are skipped with a printed notice instead of crashing
    the whole load.

    Args:
        db_loc: Path of the directory holding the reference images.

    Returns:
        Dict mapping file name to ``(processed_img, img)``, where
        ``processed_img`` is the preprocessed array and ``img`` is the
        resized RGB array.
    """
    validation_images = {}
    for image in os.listdir(db_loc):
        img = cv2.imread(os.path.join(db_loc, image))
        if img is None:
            # cv2.imread signals failure by returning None rather than
            # raising, which would otherwise blow up inside cvtColor.
            print(f"Skipping unreadable file: {image}")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (128, 128))
        # Preprocess a copy so the displayable RGB image stays untouched.
        processed_img = xception_preprocess_input(img.copy())
        validation_images[image] = (processed_img, img)
    return validation_images


In [8]:
def verify(model, input_image, validation_images, detection_threshold):
    """Find the reference image closest to ``input_image``.

    The input image is encoded once and all reference images are encoded
    in a single batched ``model.predict`` call, rather than re-encoding
    the input for every reference. The distance is the squared Euclidean
    distance between encodings; a reference only counts as a match when
    its distance is strictly below ``detection_threshold``.

    Args:
        model: Encoder exposing ``predict(batch)``.
        input_image: Query image (anything ``np.array`` can convert).
        validation_images: Dict mapping name to
            ``(processed_img, unprocessed_img)``, as produced by
            ``preload_validation_images``.
        detection_threshold: Maximum (exclusive) distance for a match.

    Returns:
        Tuple ``(cow, best, input_image, best_val_img_unprocessed)``:
        the matched name (or the string ``"None"``), the best distance
        (or ``detection_threshold`` when nothing matched), the unchanged
        input image, and the unprocessed matched reference image (or
        ``None``).
    """
    best = detection_threshold
    cow = "None"
    best_val_img_unprocessed = None

    # Nothing to compare against: report "no match" without running the model.
    if not validation_images:
        return cow, best, input_image, best_val_img_unprocessed

    input_img = xception_preprocess_input(np.array(input_image))
    input_encoding = model.predict(np.array([input_img]))

    names = list(validation_images)
    reference_batch = np.array([validation_images[name][0] for name in names])
    reference_encodings = model.predict(reference_batch)

    # Broadcasting compares the single input encoding with every reference.
    distances = np.sum(np.square(input_encoding - reference_encodings), axis=-1)

    # `best` starts at the threshold and only decreases, so one comparison
    # enforces both "below threshold" and "better than the current best".
    for name, distance in zip(names, distances):
        if distance < best:
            best = distance
            cow = name
            best_val_img_unprocessed = validation_images[name][1]

    return cow, best, input_image, best_val_img_unprocessed

In [9]:

def video_to_frames_with_prediction(yolo_m, siamese_m, input_loc, output_loc, folder, db_loc, confidence, yolo_conf=0.5):
    """Detect and identify cows in a video, saving matches to disk.

    Every other frame is run through the YOLO model. Each detected box is
    expanded to a square, cropped to 128x128, and compared against the
    reference images with ``verify``. For every match, a side-by-side PNG
    of the crop and the matched reference is saved, and a description line
    is recorded. All description lines are written to ``data.csv`` at the
    end.

    Args:
        yolo_m: Callable YOLO model; called as ``yolo_m(image, conf=...)``.
        siamese_m: Encoder passed to ``verify``.
        input_loc: Path of the input video.
        output_loc: Parent directory for results.
        folder: Sub-directory of ``output_loc`` for this run (created if
            missing).
        db_loc: Directory of reference images.
        confidence: Distance threshold passed to ``verify``; matches must
            have a distance strictly below it.
        yolo_conf: Confidence threshold passed to the YOLO model.

    Raises:
        IOError: If the video cannot be opened.
    """
    output_loc = os.path.join(output_loc, folder)
    os.makedirs(output_loc, exist_ok=True)

    vidcap = cv2.VideoCapture(input_loc)
    if not vidcap.isOpened():
        vidcap.release()
        raise IOError(f"Could not open video: {input_loc}")

    detections = []
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        validation_images = preload_validation_images(db_loc)

        count = 0
        while True:
            success, frame = vidcap.read()
            if not success:
                # End of stream (or read error): the frame is None, so stop
                # before handing it to cvtColor.
                break
            count += 1
            # Process only every other frame.
            if count % 2 == 0:
                continue

            image = np.uint8(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            res = yolo_m(image, conf=yolo_conf)

            for x1, y1, x2, y2 in res[0].boxes.xyxy.tolist():
                x1, y1, x2, y2 = expand_rectangle_to_square(x1, y1, x2, y2)
                cropped_image = crop_to_square(image, x1, y1, x2, y2)
                verified_result, distance, cow_a, cow_b = verify(siamese_m, cropped_image, validation_images, confidence)
                if verified_result == "None":
                    continue

                timestamp = seconds_to_time(count/fps)
                detection = f"{verified_result.split('.', 1)[0]} detected at time {timestamp} with distance of {distance}"
                detections.append(detection)
                print(detection)

                fig, (ax1, ax2) = plt.subplots(ncols=2)
                ax1.imshow(cow_a)
                ax2.imshow(cow_b)
                fig.savefig(os.path.join(output_loc, f'{timestamp.replace(":", "")}-{distance}.png'))
                # Close the figure so memory does not grow with each detection.
                plt.close(fig)
    finally:
        vidcap.release()

    with open(os.path.join(output_loc, 'data.csv'), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for row in detections:
            writer.writerow([row])

In [71]:
%%capture

# Run the pipeline with the encoder stored under optuna_siamese_runs/176,
# using a match distance threshold of 3. Paths are built with os.path.join
# so they do not depend on a hard-coded Windows separator.
siamese_model = load_model(os.path.join(HOME, "data", "optuna_siamese_runs", "176", "encoder.h5"))

video_to_frames_with_prediction(
    yolo_model,
    siamese_model,
    os.path.join(HOME, "data", "videos_230305", "video_cc_1.mp4"),
    os.path.join(HOME, "data", "yolo_detections_2"),
    "176-3",
    os.path.join(HOME, "data", "video_validation"),
    3,
)




0: 384x640 (no detections), 186.2ms
Speed: 1.0ms preprocess, 186.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 5.0ms
Speed: 0.5ms preprocess, 5.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.0ms
Speed: 0.0ms preprocess, 6.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 51.6ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 1.0ms preprocess, 4.0ms inference, 0

In [12]:
%%capture

# Run the pipeline with the encoder stored under optuna_siamese_runs/148,
# using a match distance threshold of 3. Paths are built with os.path.join
# so they do not depend on a hard-coded Windows separator.
siamese_model = load_model(os.path.join(HOME, "data", "optuna_siamese_runs", "148", "encoder.h5"))

video_to_frames_with_prediction(
    yolo_model,
    siamese_model,
    os.path.join(HOME, "data", "videos_230305", "video_cc_1.mp4"),
    os.path.join(HOME, "data", "yolo_detections_2"),
    "148-3",
    os.path.join(HOME, "data", "video_validation"),
    3,
)




0: 384x640 (no detections), 168.1ms
Speed: 5.0ms preprocess, 168.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 3.0ms
Speed: 1.0ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.0ms
Speed: 0.0ms preprocess, 6.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 1.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 0.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 5.0ms
Speed: 2.0ms preprocess, 5.0ms inference, 1.

In [10]:
%%capture

# Run the pipeline with the encoder stored under optuna_siamese_runs/199,
# using a match distance threshold of 5. Paths are built with os.path.join
# so they do not depend on a hard-coded Windows separator.
siamese_model = load_model(os.path.join(HOME, "data", "optuna_siamese_runs", "199", "encoder.h5"))

video_to_frames_with_prediction(
    yolo_model,
    siamese_model,
    os.path.join(HOME, "data", "videos_230305", "video_cc_1.mp4"),
    os.path.join(HOME, "data", "yolo_detections_2"),
    "199-5",
    os.path.join(HOME, "data", "video_validation"),
    5,
)




0: 384x640 (no detections), 84.0ms
Speed: 7.6ms preprocess, 84.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 12.0ms
Speed: 1.3ms preprocess, 12.0ms inference, 6.9ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 17.0ms
Speed: 1.0ms preprocess, 17.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 10.0ms
Speed: 1.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 9.7ms
Speed: 1.0ms preprocess, 9.7ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 9.5ms
Speed: 1.0ms preprocess, 9.5ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 11.0ms
Speed: 1.1ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.5ms
Speed: 1.0ms preprocess, 7.5ms inferen

In [11]:
%%capture

# Run the pipeline with the encoder stored under optuna_siamese_runs/155,
# using a match distance threshold of 2.5. Paths are built with
# os.path.join so they do not depend on a hard-coded Windows separator.
siamese_model = load_model(os.path.join(HOME, "data", "optuna_siamese_runs", "155", "encoder.h5"))

video_to_frames_with_prediction(
    yolo_model,
    siamese_model,
    os.path.join(HOME, "data", "videos_230305", "video_cc_1.mp4"),
    os.path.join(HOME, "data", "yolo_detections_2"),
    "155-2-5",
    os.path.join(HOME, "data", "video_validation"),
    2.5,
)




0: 384x640 (no detections), 70.0ms
Speed: 0.0ms preprocess, 70.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 20.0ms
Speed: 1.0ms preprocess, 20.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 31.0ms
Speed: 2.0ms preprocess, 31.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 11.0ms
Speed: 3.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 15.0ms
Speed: 0.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 13.0ms
Speed: 1.2ms preprocess, 13.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 17.0ms
Speed: 1.0ms preprocess, 17.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 0.0ms preprocess, 7.0ms inf