# Prediction Pipeline using YOLO + Siamese

### Import libraries

In [1]:
from ultralytics import YOLO
from PIL import Image
import os
import cv2
import numpy as np
import datetime
import csv
import matplotlib.pyplot as plt

from tensorflow.keras.applications.xception import preprocess_input as xception_preprocess_input
from tensorflow.keras.models import load_model


# Project root: the parent directory of the current working directory.
HOME = os.path.dirname(os.getcwd())
print(HOME)

e:\Code\CowId


### Helper functions

Compute the squared Euclidean distance between the embeddings of two cow images (a smaller distance means the two images are more likely the same cow)

In [2]:
def classify_image(encoder, cow1, cow2, threshold=1.3):
    """Return the squared Euclidean distance between two image embeddings.

    Each image is wrapped in a batch of one and passed to ``encoder.predict``;
    the element-wise squared difference of the two results is summed over the
    last axis.

    Args:
        encoder: Object exposing a ``predict`` method that accepts a batch.
        cow1: First image, in whatever form ``encoder.predict`` expects.
        cow2: Second image, same form as ``cow1``.
        threshold: Unused by this function; the raw distance is returned and
            the caller decides what counts as a match.

    Returns:
        The distance for the single pair (element 0 of the batch result).
    """
    # Encode each image as its own one-element batch.
    first_embedding = encoder.predict(np.array([cow1]))
    second_embedding = encoder.predict(np.array([cow2]))

    squared_diff = np.square(first_embedding - second_embedding)
    per_pair_distance = np.sum(squared_diff, axis=-1)
    return per_pair_distance[0]

Create bounding square to match siamese model input

In [3]:
def expand_rectangle_to_square(x1, y1, x2, y2):
    """Grow a rectangle into the smallest square that shares its centre.

    The side of the square equals the longer side of the rectangle; the
    centre point is unchanged.

    Args:
        x1, y1: One corner of the rectangle.
        x2, y2: The opposite corner of the rectangle.

    Returns:
        Tuple ``(new_x1, new_y1, new_x2, new_y2)`` with the square's corners.
        The result may extend beyond the original rectangle on the short axis.
    """
    half_side = max(abs(x2 - x1), abs(y2 - y1)) / 2
    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2
    return (
        mid_x - half_side,
        mid_y - half_side,
        mid_x + half_side,
        mid_y + half_side,
    )

Crop the image to match input size

In [4]:
def crop_to_square(image, x1, y1, x2, y2, size=(128, 128)):
    """Crop a region out of an image array and resize it to a fixed size.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``.
        x1, y1, x2, y2: Crop box, passed to ``Image.crop`` as
            ``(left, upper, right, lower)``.
        size: ``(width, height)`` of the returned image. Defaults to
            ``(128, 128)``, the size used for the database images in this
            notebook.

    Returns:
        A ``PIL.Image.Image`` of the requested size.
    """
    region = Image.fromarray(image).crop((x1, y1, x2, y2))
    return region.resize(size)

Translate seconds to readable time

In [5]:
def seconds_to_time(seconds):
    """Format a duration in seconds as ``HH:MM:SS``.

    Fractional seconds are dropped. Durations of 24 hours or more keep
    counting hours (e.g. ``25:00:00``) instead of wrapping back to ``00``.
    Negative durations are rendered with a leading ``-``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted time string.
    """
    td = datetime.timedelta(seconds=seconds)
    # ``td.seconds`` only holds the sub-day part, so use the full total to
    # avoid silently discarding whole days.
    total = int(td.total_seconds())
    sign = "-" if total < 0 else ""
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours:02}:{minutes:02}:{secs:02}"

Load test images in the database

In [6]:
def preload_test_images(db_loc):
    """Load every readable image in ``db_loc`` for later comparison.

    Each image is read with OpenCV, converted from BGR to RGB and resized to
    128x128. Both the Xception-preprocessed array and the plain resized array
    are kept.

    Args:
        db_loc: Directory containing the database images.

    Returns:
        Dict mapping file name to ``(processed_img, img)``, where
        ``processed_img`` is the output of ``xception_preprocess_input`` and
        ``img`` is the resized RGB image before preprocessing.
    """
    validation_images = {}
    # Sorted so the iteration order (and therefore tie-breaking between
    # equally distant matches) does not depend on the file system.
    for image in sorted(os.listdir(db_loc)):
        img = cv2.imread(os.path.join(db_loc, image))
        if img is None:
            # cv2.imread returns None for sub-directories and files it cannot
            # decode; skip them instead of crashing in cvtColor.
            print(f"Skipping unreadable database entry: {image}")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (128, 128))
        # Preprocess a copy so the unprocessed image stays untouched.
        processed_img = xception_preprocess_input(img.copy())
        validation_images[image] = (processed_img, img)
    return validation_images


Compare an input image with all the database images

In [7]:
def verify(model, input_image, validation_images, detection_threshold):
    """Find the database image closest to ``input_image``.

    The input image and all database images are encoded with ``model`` and
    compared by squared Euclidean distance (the same metric as
    ``classify_image``). The input is encoded once and the database images
    are encoded in a single batched ``predict`` call rather than one pair of
    calls per database image.

    Args:
        model: Encoder exposing ``predict``.
        input_image: Image to identify; converted with ``np.array`` and
            passed through ``xception_preprocess_input``.
        validation_images: Dict mapping name to
            ``(processed_img, unprocessed_img)``.
        detection_threshold: A match is accepted only if its distance is
            strictly below this value.

    Returns:
        Tuple ``(cow, best, input_image, best_val_img_unprocessed)``:
        ``cow`` is the key of the closest match, or the string ``"None"`` if
        nothing is below the threshold; ``best`` is the matching distance, or
        ``detection_threshold`` when there is no match; ``input_image`` is
        returned unchanged; the last element is the unprocessed database
        image of the match, or ``None``.
    """
    best = detection_threshold
    cow = "None"
    best_val_img_unprocessed = None

    if not validation_images:
        # Nothing to compare against.
        return cow, best, input_image, best_val_img_unprocessed

    input_img = xception_preprocess_input(np.array(input_image))
    input_embedding = model.predict(np.array([input_img]))

    names = list(validation_images)
    database_batch = np.stack([validation_images[name][0] for name in names])
    database_embeddings = model.predict(database_batch)

    # Broadcasting compares the single input embedding against every row.
    distances = np.sum(
        np.square(input_embedding - database_embeddings), axis=-1
    )

    # argmin returns the first minimum, matching a strict ``<`` scan in
    # dictionary order.
    closest = int(np.argmin(distances))
    if distances[closest] < detection_threshold:
        best = distances[closest]
        cow = names[closest]
        best_val_img_unprocessed = validation_images[cow][1]

    return cow, best, input_image, best_val_img_unprocessed

Main function that takes a video and creates predictions

In [9]:

def video_to_frames_with_prediction(
    yolo_m,
    siamese_m,
    input_loc,
    output_loc,
    folder,
    db_loc,
    confidence,
    yolo_confidence=0.5,
):
    """Run detection and identification over a video and save the results.

    For each processed frame, ``yolo_m`` proposes boxes; each box is expanded
    to a square, cropped, and compared against the database images with
    ``verify``. Every match is printed, saved as a side-by-side PNG (crop
    next to the matched database image), and finally written to ``data.csv``
    in the output folder.

    Args:
        yolo_m: Callable detector; called as ``yolo_m(image, conf=...)``.
        siamese_m: Encoder passed to ``verify``.
        input_loc: Path of the input video.
        output_loc: Parent directory for results.
        folder: Sub-directory of ``output_loc`` that receives the results;
            created if missing.
        db_loc: Directory of database images, loaded with
            ``preload_test_images``.
        confidence: Distance threshold passed to ``verify``.
        yolo_confidence: Confidence passed to the detector. Defaults to 0.5.

    Returns:
        None. Output is written to disk.
    """
    output_loc = os.path.join(output_loc, folder)
    os.makedirs(output_loc, exist_ok=True)

    validation_images = preload_test_images(db_loc)
    detections = []

    vidcap = cv2.VideoCapture(input_loc)
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        count = 0
        while True:
            success, image = vidcap.read()
            if not success:
                # End of stream (or unreadable video): stop cleanly instead
                # of relying on an exception from cvtColor(None).
                break
            count += 1
            # NOTE(review): this skips every 4th frame and processes the
            # other three; confirm this is intended rather than
            # "process every 4th frame".
            if count % 4 == 0:
                continue
            try:
                image = np.uint8(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                res = yolo_m(image, conf=yolo_confidence)
                boxes = res[0].boxes.xyxy.tolist()
                for box in boxes:
                    x1, y1, x2, y2 = expand_rectangle_to_square(*box)
                    cropped_image = crop_to_square(image, x1, y1, x2, y2)
                    verified_result, distance, cow_a, cow_b = verify(
                        siamese_m, cropped_image, validation_images, confidence
                    )
                    if verified_result == "None":
                        continue
                    # Guard against containers that report no frame rate.
                    elapsed = count / fps if fps > 0 else 0
                    timestamp = seconds_to_time(elapsed)
                    detection = f"{verified_result.split('.', 1)[0]} detected at time {timestamp} with distance of {distance}"
                    detections.append(detection)
                    print(detection)
                    fig, (ax1, ax2) = plt.subplots(ncols=2)
                    try:
                        ax1.imshow(cow_a)
                        ax2.imshow(cow_b)
                        stamp = timestamp.replace(":", "")
                        fig.savefig(
                            os.path.join(output_loc, f"{stamp}-{distance}.png")
                        )
                    finally:
                        # Close each figure; otherwise they accumulate in
                        # memory for the whole video.
                        plt.close(fig)
            except Exception as e:
                # Best effort: report the failure and stop, but still write
                # the detections collected so far.
                print(e)
                break
    finally:
        vidcap.release()

    with open(os.path.join(output_loc, 'data.csv'), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for row in detections:
            writer.writerow([row])

Load the YOLO model

In [11]:
# yolo_model = YOLO(f"{HOME}/data/yolo_models/train11/weights/best.pt")

# Detector weights. The path previously pointed at a file named
# "siamese_encoder.pt", which looks like a mix-up with the Siamese cell.
# NOTE(review): confirm the file name on disk.
yolo_model = YOLO(f"{HOME}/models/yolo_detection.pt")

Load the Siamese model

In [10]:
# Siamese encoder (Keras .h5). The path previously pointed at a file named
# "yolo_detection.h5", which looks like a mix-up with the YOLO cell.
# NOTE(review): confirm the file name on disk.
siamese_model = load_model(os.path.join(HOME, "models", "siamese_encoder.h5"))



Running main function

In [12]:
%%capture

# Build paths with os.path.join so the cell is not tied to Windows
# separators; keyword arguments make each positional value's role explicit.
video_to_frames_with_prediction(
    yolo_m=yolo_model,
    siamese_m=siamese_model,
    input_loc=os.path.join(HOME, "raw_videos", "T1VA.mp4"),
    output_loc=os.path.join(HOME, "data", "yolo_detections_2"),
    folder="148-3",
    db_loc=os.path.join(HOME, "data", "video_validation_2"),
    confidence=3,  # distance threshold passed to verify()
)




0: 384x640 (no detections), 168.1ms
Speed: 5.0ms preprocess, 168.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 3.0ms
Speed: 1.0ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.0ms
Speed: 0.0ms preprocess, 6.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 1.0ms preprocess, 4.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 0.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 9.0ms
Speed: 2.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 5.0ms
Speed: 2.0ms preprocess, 5.0ms inference, 1.