# Prediction Pipeline using YOLO + Siamese

### Import libraries

In [1]:
from ultralytics import YOLO
from PIL import Image
import os
import cv2
import os
import numpy as np
import datetime
import tensorflow as tf
import csv

from tensorflow.keras.layers import Layer, Conv2D, Dense, MaxPooling2D, Input, Flatten

from tensorflow.keras import backend, layers, metrics
from tensorflow.keras.applications import Xception
from tensorflow.keras.models import Model, Sequential


# Parent directory of the current working directory; used below as the base
# for the model weights, video and database paths.
HOME = os.path.split(os.getcwd())[0]
print(HOME)

c:\Users\arihs\Documents\Thesis\CowId


### Load YOLO model

In [2]:
# Load the YOLO detector from the weights file stored under
# HOME/notebooks/runs/detect/train6/weights.

yolo_model = YOLO(f"{HOME}/notebooks/runs/detect/train6/weights/best.pt")


### Load Siamese model

In [4]:
class L1Dist(Layer):
    """Keras layer that returns the element-wise absolute difference of two embeddings."""

    def __init__(self, **kwargs):
        # Forward the keyword arguments (e.g. name, dtype, trainable) to the
        # base Layer. Keras rebuilds custom layers from their saved config by
        # passing it as keyword arguments, so dropping them here would discard
        # the stored layer settings when the model is loaded.
        super().__init__(**kwargs)

    def call(self, input_embedding, validation_embedding):
        """Return ``|input_embedding - validation_embedding|`` element-wise."""
        return tf.math.abs(input_embedding - validation_embedding)

# Load the saved Siamese model from its HDF5 file; `custom_objects` maps the
# names 'L1Dist' and 'BinaryCrossentropy' to the objects Keras should use
# when it encounters them while deserializing.
siamese_model = tf.keras.models.load_model('siamese_model_v3.h5', custom_objects={'L1Dist':L1Dist, 'BinaryCrossentropy':tf.losses.BinaryCrossentropy})



In [5]:
# Image preprocessor for siamese model

def preprocess(img):
    """Resize an image to 128x128 and divide its values by 255.

    Args:
        img: Image tensor or array accepted by ``tf.image.resize``.

    Returns:
        The resized image with every value divided by 255.0.
    """
    resized = tf.image.resize(img, (128, 128))
    return resized / 255.0

In [6]:
def get_encoder(input_shape):
    """Build the embedding network: an Xception backbone followed by a dense head.

    Args:
        input_shape: Input shape passed to ``Xception`` (e.g. ``(128, 128, 3)``).

    Returns:
        A ``Sequential`` model named ``"Encode_Model"`` whose final layer
        L2-normalizes a 256-unit dense output along axis 1.
    """
    backbone = Xception(
        input_shape=input_shape,
        weights='imagenet',
        include_top=False,
        pooling='avg',
    )

    # Freeze every backbone layer except the last 27.
    for frozen_layer in backbone.layers[:-27]:
        frozen_layer.trainable = False

    head = [
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(256, activation="relu"),
        layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
    ]
    return Sequential([backbone, *head], name="Encode_Model")

In [7]:
def classify_image(encoder, cow1, cow2, threshold=1.3):
    """Return the squared Euclidean distance between the encodings of two images.

    Args:
        encoder: Object exposing ``predict(batch)`` that returns one encoding
            per batch item.
        cow1: First image (no batch dimension); wrapped in a batch of one.
        cow2: Second image (no batch dimension); wrapped in a batch of one.
        threshold: Not used by this function; kept as part of the signature.

    Returns:
        The sum of squared differences between the two encodings.
    """
    first_batch = np.array([cow1])
    second_batch = np.array([cow2])

    embedding_a = encoder.predict(first_batch)
    embedding_b = encoder.predict(second_batch)

    squared_diff = np.square(embedding_a - embedding_b)
    # One distance per batch item; the batch holds a single pair.
    return np.sum(squared_diff, axis=-1)[0]

In [8]:
# Prediction for siamese model

def verify(model, input_image, db_loc, detection_threshold):
    """Find the database image whose encoding is closest to ``input_image``.

    Args:
        model: Encoder forwarded to ``classify_image`` to embed both images.
        input_image: Query image; anything ``np.array`` can convert.
        db_loc: Directory whose files are read and decoded as JPEG reference
            images.
        detection_threshold: Upper bound on the distance; only reference
            images strictly closer than this can match.

    Returns:
        A ``(name, distance)`` tuple: the file name of the closest reference
        image and its distance, or ``("None", detection_threshold)`` when no
        reference image is closer than the threshold.
    """
    # The query is the same for every comparison, so convert and preprocess
    # it once instead of once per database image.
    query = tf.convert_to_tensor(np.array(input_image), dtype=tf.float32)
    query = preprocess(query)

    best = detection_threshold
    cow = "None"

    for image in os.listdir(db_loc):
        byte_img = tf.io.read_file(os.path.join(db_loc, image))
        # Force RGB output so grayscale JPEGs in the database do not decode
        # to single-channel tensors.
        candidate = preprocess(tf.io.decode_jpeg(byte_img, channels=3))

        distance = classify_image(model, query, candidate)
        # `best` starts at the threshold and only ever decreases, so this one
        # comparison enforces both "below threshold" and "best so far".
        if distance < best:
            best = distance
            cow = image
    return cow, best

In [9]:
# Create bounding square to match siamese model input

def expand_rectangle_to_square(x1, y1, x2, y2):
    """Grow a rectangle into the square that shares its center.

    The square's side equals the longer of the rectangle's width and height.

    Args:
        x1, y1: One corner of the rectangle.
        x2, y2: The opposite corner of the rectangle.

    Returns:
        ``(new_x1, new_y1, new_x2, new_y2)`` for the centered square, with
        ``new_x1 <= new_x2`` and ``new_y1 <= new_y2``.
    """
    side = max(abs(x2 - x1), abs(y2 - y1))
    half = side / 2
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    return cx - half, cy - half, cx + half, cy + half

In [10]:
# Crop to correct size

def crop_to_square(image, x1, y1, x2, y2):
    """Crop the given box out of an image array and resize it to 128x128.

    Args:
        image: Array accepted by ``Image.fromarray``.
        x1, y1, x2, y2: Crop box as (left, upper, right, lower).

    Returns:
        The cropped region as a PIL image resized to 128x128.
    """
    box = (x1, y1, x2, y2)
    pil_image = Image.fromarray(image)
    return pil_image.crop(box).resize((128, 128))

In [11]:
def seconds_to_time(seconds):
    """Format a non-negative duration in seconds as ``HH:MM:SS``.

    Fractional seconds are dropped. Durations of 24 hours or more keep
    counting hours (e.g. ``25:00:00``) instead of wrapping back to zero.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The duration as a zero-padded ``HH:MM:SS`` string.
    """
    td = datetime.timedelta(seconds=seconds)
    # `td.seconds` alone excludes whole days, so add them back explicitly.
    whole_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02}:{minutes:02}:{secs:02}'

In [12]:
# Build the encoder for 128x128 inputs with 3 channels, then load its
# weights from the "encoder" path relative to the working directory.
encoder = get_encoder((128, 128, 3))
encoder.load_weights("encoder")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1bc9c6725e0>

In [13]:
# Print the encoder's layers, output shapes and parameter counts.
encoder.summary()

Model: "Encode_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 xception (Functional)       (None, 2048)              20861480  
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 512)               1049088   
                                                                 
 batch_normalization_4 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 lambda (Lambda)             (None, 256)               0         
                                                      

In [14]:

def video_to_frames_with_prediction(yolo_m, siamese_m, input_loc, output_loc, db_loc, confidence):
    """Detect and identify cows in every other video frame and write the matches to CSV.

    Odd-numbered frames (1, 3, 5, ...) are converted to RGB and passed to the
    detector. Each detected box is expanded to a square, cropped, and matched
    against the reference images via ``verify``. Every match is printed and
    collected, and all matches are written to ``output_loc`` once the video
    ends.

    Args:
        yolo_m: Detector called as ``yolo_m(image, conf=0.5)``; the boxes are
            read from ``result[0].boxes.xyxy``.
        siamese_m: Embedding model forwarded to ``verify``.
        input_loc: Path of the video to read.
        output_loc: Path of the CSV file to write, one detection message per row.
        db_loc: Directory of reference images forwarded to ``verify``.
        confidence: Distance threshold forwarded to ``verify``. It is not the
            detector confidence, which is fixed at 0.5.

    Raises:
        OSError: If the video at ``input_loc`` cannot be opened.
    """
    vidcap = cv2.VideoCapture(input_loc)
    if not vidcap.isOpened():
        vidcap.release()
        raise OSError(f"Could not open video: {input_loc}")

    detections = []
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        count = 0
        while True:
            success, frame = vidcap.read()
            if not success:
                # End of stream (or read failure): no frame was returned, so
                # stop before handing it to OpenCV.
                break
            count += 1
            if count % 2 == 0:
                continue

            image = np.uint8(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            res = yolo_m(image, conf=0.5)
            for box in res[0].boxes.xyxy.tolist():
                x1, y1, x2, y2 = expand_rectangle_to_square(*box)
                cropped_image = crop_to_square(image, x1, y1, x2, y2)
                verified_result, distance = verify(siamese_m, cropped_image, db_loc, confidence)
                if verified_result == "None":
                    continue
                timestamp = seconds_to_time(count / fps)
                detection = f"{verified_result.split('.', 1)[0]} detected at time {timestamp} with distance of {distance}"
                detections.append(detection)
                print(detection)
    finally:
        # Always free the capture handle, even if a frame fails mid-run.
        vidcap.release()

    with open(output_loc, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for row in detections:
            writer.writerow([row])

In [17]:
%%capture

# Run the pipeline on CC001.mp4 and write the detections to data.csv. The
# encoder (not siamese_model) is passed as the embedding model, and the final
# argument 1 is the distance threshold handed to verify().
video_to_frames_with_prediction(yolo_model, encoder, f'{HOME}\\CC001.mp4', f'{HOME}\\data.csv', f"{HOME}\\db", 1)


0: 384x640 (no detections), 158.1ms
Speed: 2756.1ms preprocess, 158.1ms inference, 46.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.0ms
Speed: 1.0ms preprocess, 6.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 9.0ms
Speed: 0.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.0ms
Speed: 1.0ms preprocess, 6.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 8.0ms
Speed: 1.0ms preprocess, 8.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 7.0ms
Speed: 0.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 15.0ms
Speed: 0.0ms preprocess, 15.0ms inferen