# Prediction Pipeline using YOLO + Siamese

### Import libraries

In [15]:
from ultralytics import YOLO
from PIL import Image
import os
import cv2
import os
import numpy as np
import datetime
import tensorflow as tf
import csv
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Layer, Conv2D, Dense, MaxPooling2D, Input, Flatten

from tensorflow.keras import backend, layers, metrics
from tensorflow.keras.applications import ResNet152
from tensorflow.keras.models import Model, Sequential


# Project root: the parent directory of the current working directory.
HOME = os.path.dirname(os.getcwd())
print(HOME)

d:\Code\CowId


### Load YOLO model

In [16]:
# Load the YOLO model from the `best.pt` weights file of run `train11`,
# located under <HOME>/data/yolo_models.

yolo_model = YOLO(f"{HOME}/data/yolo_models/train11/weights/best.pt")


### Load Siamese model

In [17]:
# Directory '27' under <HOME>/data/optuna_siamese_resnet; the Siamese .h5 model
# and the encoder weights used in this notebook are both loaded from it.
siamese_model_loc = os.path.join(HOME, 'data', 'optuna_siamese_resnet', '27')

class L1Dist(Layer):
    """Keras layer returning the element-wise absolute difference of two embeddings."""

    def __init__(self, **kwargs):
        # Forward the keyword arguments to the base Layer instead of dropping
        # them: when Keras rebuilds a custom layer from a saved config it passes
        # the stored settings (such as the layer name) through these kwargs.
        super().__init__(**kwargs)

    def call(self, input_embedding, validation_embedding):
        """Return ``|input_embedding - validation_embedding|`` element-wise."""
        return tf.math.abs(input_embedding - validation_embedding)

# Load the saved Siamese model from siamese_model.h5. The custom L1Dist layer and
# the BinaryCrossentropy loss are registered through `custom_objects` so Keras can
# resolve them by name while deserializing the file.
siamese_model = tf.keras.models.load_model(os.path.join(siamese_model_loc, 'siamese_model.h5'), custom_objects={'L1Dist':L1Dist, 'BinaryCrossentropy':tf.losses.BinaryCrossentropy})



In [18]:
# Image preprocessor for siamese model

def preprocess(img):
    """Resize *img* to 128x128 and divide its values by 255.

    Args:
        img: Image data accepted by ``tf.image.resize``.

    Returns:
        The resized image with every value divided by 255.0.
    """
    resized = tf.image.resize(img, (128, 128))
    return resized / 255.0

In [19]:
def get_encoder(input_shape):
    """Build the embedding network: a ResNet152 backbone followed by a dense head.

    Args:
        input_shape: Input shape handed to ``ResNet152``.

    Returns:
        A ``Sequential`` model named ``"Encode_Model"`` whose last layer
        L2-normalizes the 256-unit output along axis 1.
    """
    backbone = ResNet152(
        weights='imagenet',
        include_top=False,
        pooling='avg',
        input_shape=input_shape,
    )

    # Freeze every backbone layer except the final 12.
    for frozen_layer in backbone.layers[:-12]:
        frozen_layer.trainable = False

    # The head layers are created in the same order as they are stacked.
    head = [
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(256, activation="relu"),
        layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
    ]
    return Sequential([backbone, *head], name="Encode_Model")

In [20]:
def classify_image(encoder, cow1, cow2, threshold=1.3):
    """Return the squared Euclidean distance between the encodings of two images.

    Args:
        encoder: Object exposing ``predict(batch)``.
        cow1: First image; wrapped into a batch of one before encoding.
        cow2: Second image; wrapped into a batch of one before encoding.
        threshold: Accepted for interface compatibility; not used in the
            computation.

    Returns:
        The sum of squared differences between the two encodings, taken over
        the last axis, for the single batch element.
    """
    embedding_a = encoder.predict(np.array([cow1]))
    embedding_b = encoder.predict(np.array([cow2]))

    delta = embedding_a - embedding_b
    squared_distances = np.sum(np.square(delta), axis=-1)
    return squared_distances[0]

In [21]:
# Prediction for siamese model

def verify(model, input_image, db_loc, detection_threshold):
    """Find the database image whose encoding is closest to *input_image*.

    Every entry of ``db_loc`` is read, decoded as a JPEG, preprocessed and
    compared with the preprocessed query through ``classify_image``.

    Args:
        model: Encoder passed on to ``classify_image``.
        input_image: Query image; converted to a float32 tensor via NumPy.
        db_loc: Directory whose entries are the reference images.
        detection_threshold: Upper bound on the distance; only entries with a
            strictly smaller distance can match.

    Returns:
        A tuple ``(cow, best, input_img, best_val_img)``: the file name of the
        closest entry (the string ``"None"`` when nothing beats the threshold),
        its distance (``detection_threshold`` when nothing matched), the
        preprocessed query, and the preprocessed matching image (``None`` when
        nothing matched).
    """
    query = tf.convert_to_tensor(np.array(input_image), dtype=tf.float32)
    # Preprocess the query once, before the loop: it does not depend on the
    # database entry, and binding it here keeps the return value defined even
    # when `db_loc` contains no entries.
    input_img = preprocess(query)

    cow = "None"
    best = detection_threshold
    best_val_img = None

    for image in os.listdir(db_loc):
        byte_img = tf.io.read_file(os.path.join(db_loc, image))
        # NOTE(review): no `channels` argument is given, so presumably the
        # channel count follows the file - confirm the database has no
        # grayscale images.
        validation_img = preprocess(tf.io.decode_jpeg(byte_img))

        distance = classify_image(model, input_img, validation_img)
        # `best` starts at the threshold and only ever decreases, so this one
        # comparison also enforces `distance < detection_threshold`.
        if distance < best:
            best = distance
            cow = image
            best_val_img = validation_img
    return cow, best, input_img, best_val_img

In [22]:
# Create bounding square to match siamese model input

def expand_rectangle_to_square(x1, y1, x2, y2):
    """Grow a rectangle into the square sharing its center.

    The side of the square equals the longer side of the rectangle.

    Args:
        x1, y1: One corner of the rectangle.
        x2, y2: The opposite corner of the rectangle.

    Returns:
        A tuple ``(new_x1, new_y1, new_x2, new_y2)`` with the corners of the
        square; the values are floats and may fall outside the original
        rectangle.
    """
    half_side = max(abs(x2 - x1), abs(y2 - y1)) / 2
    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2
    return (
        mid_x - half_side,
        mid_y - half_side,
        mid_x + half_side,
        mid_y + half_side,
    )

In [23]:
# Crop to correct size

def crop_to_square(image, x1, y1, x2, y2):
    """Cut the box ``(x1, y1, x2, y2)`` out of *image* and resize it to 128x128.

    Args:
        image: Array converted to a PIL image with ``Image.fromarray``.
        x1, y1, x2, y2: Crop box handed to ``Image.crop``.

    Returns:
        The cropped region as a 128x128 PIL image.
    """
    box = (x1, y1, x2, y2)
    pil_image = Image.fromarray(image)
    return pil_image.crop(box).resize((128, 128))

In [24]:
def seconds_to_time(seconds):
    """Format a duration in seconds as an ``HH:MM:SS`` string.

    Fractional seconds are dropped from the output. Durations of 24 hours or
    more keep counting hours (e.g. ``25:00:00``) instead of wrapping around.
    Intended for non-negative durations; negative values get no special
    formatting.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The duration as a zero-padded ``HH:MM:SS`` string.
    """
    td = datetime.timedelta(seconds=seconds)
    # `td.seconds` alone only covers the part below one day, so add the whole
    # days back in to avoid wrapping at 24 hours.
    whole_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02}:{minutes:02}:{secs:02}'

In [25]:
# Build the encoder for input shape (128, 128, 3) and load its weights from the
# "encoder" files in the Siamese model directory.
encoder = get_encoder((128, 128, 3))
encoder.load_weights(os.path.join(siamese_model_loc, "encoder"))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2468e4bc520>

In [26]:
# Print the layer-by-layer summary of the encoder.
encoder.summary()

Model: "Encode_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet152 (Functional)      (None, 2048)              58370944  
                                                                 
 flatten_1 (Flatten)         (None, 2048)              0         
                                                                 
 dense_2 (Dense)             (None, 512)               1049088   
                                                                 
 batch_normalization_5 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_3 (Dense)             (None, 256)               131328    
                                                                 
 lambda_1 (Lambda)           (None, 256)               0         
                                                      

In [27]:

def _next_run_index(runs_dir):
    """Return one more than the highest numerically named sub-folder of *runs_dir*."""
    used = [
        int(name)
        for name in os.listdir(runs_dir)
        if name.isdecimal() and os.path.isdir(os.path.join(runs_dir, name))
    ]
    return max(used, default=0) + 1


def video_to_frames_with_prediction(yolo_m, siamese_m, input_loc, output_loc, db_loc, confidence):
    """Detect objects in a video with YOLO and match each one against a database.

    Every second frame of the video is passed to ``yolo_m`` (with a fixed YOLO
    confidence of 0.5). Each detected box is expanded to a square, cropped to
    128x128 and handed to ``verify``. For every match a line is printed, a
    side-by-side PNG of the crop and the matched image is saved, and finally
    all match lines are written to ``data.csv``. All output goes into a new,
    numbered sub-folder of ``output_loc``.

    Args:
        yolo_m: Callable detector; ``yolo_m(image, conf=...)`` must return a
            sequence whose first element exposes ``boxes.xyxy``.
        siamese_m: Encoder forwarded to ``verify``.
        input_loc: Path of the input video.
        output_loc: Base directory for run folders; created if missing.
        db_loc: Directory of reference images forwarded to ``verify``.
        confidence: Forwarded to ``verify`` as its distance threshold.

    Returns:
        None.
    """
    # Create the base folder first so that listing it cannot fail on a fresh
    # setup, then pick the next run number numerically (a string sort would
    # rank "9" above "10" and reuse an existing run folder).
    os.makedirs(output_loc, exist_ok=True)
    output_loc = os.path.join(output_loc, str(_next_run_index(output_loc)))
    os.makedirs(output_loc, exist_ok=True)

    detections = []
    vidcap = cv2.VideoCapture(input_loc)
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        count = 0
        while True:
            success, image = vidcap.read()
            if not success:
                # No frame was returned (end of video or unreadable input):
                # stop before the missing frame reaches cvtColor.
                break
            count += 1
            # Only frames with an odd counter are analysed.
            if count % 2 == 0:
                continue

            image = np.uint8(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            res = yolo_m(image, conf=0.5)
            for x1, y1, x2, y2 in res[0].boxes.xyxy.tolist():
                x1, y1, x2, y2 = expand_rectangle_to_square(x1, y1, x2, y2)
                cropped_image = crop_to_square(image, x1, y1, x2, y2)
                verified_result, distance, cow_a, cow_b = verify(siamese_m, cropped_image, db_loc, confidence)
                if verified_result == "None":
                    continue

                timestamp = seconds_to_time(count / fps)
                detection = f"{verified_result.split('.', 1)[0]} detected at time {timestamp} with distance of {distance}"
                detections.append(detection)
                print(detection)

                fig, (ax1, ax2) = plt.subplots(ncols=2)
                ax1.imshow(cow_a)
                ax2.imshow(cow_b)
                fig.savefig(os.path.join(output_loc, f'{timestamp.replace(":", "")}-{distance}.png'))
                # Close the figure so figures do not pile up in memory over a
                # long video.
                plt.close(fig)
    finally:
        # Always hand the video back, even if a frame fails mid-run.
        vidcap.release()

    with open(os.path.join(output_loc, 'data.csv'), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for row in detections:
            writer.writerow([row])

In [31]:
%%capture

# Run the pipeline on video_cc_1.mp4 with the YOLO model and the encoder: results
# go to a new run folder under data\yolo_detections, reference images are read
# from data\video_validation, and 0.005 is the distance threshold for a match.
video_to_frames_with_prediction(yolo_model, encoder, f'{HOME}\\data\\video_cc_1.mp4', f'{HOME}\\data\\yolo_detections', f"{HOME}\\data\\video_validation", 0.005)


0: 384x640 (no detections), 316.5ms
Speed: 2.0ms preprocess, 316.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 3.0ms
Speed: 0.0ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.0ms
Speed: 0.0ms preprocess, 4.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 3.0ms
Speed: 0.0ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 6.5ms
Speed: 1.0ms preprocess, 6.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.5ms
Speed: 0.0ms preprocess, 4.5ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 5.0ms
Speed: 13.5ms preprocess, 5.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 4.3ms
Speed: 1.0ms preprocess, 4.3ms inference, 0