In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Lambda
import numpy as np
import random
from PIL import Image
from io import BytesIO
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
# Constants
IMG_SHAPE = (224, 224, 3)  # Shape handed to layers.Input for every image (presumably height, width, channels)
EMBED_DIM = 128  # Default `embedding_size` of the embedding network's Dense layers
BATCH_SIZE = 16  # Not referenced in the cells shown here; presumably the training batch size (confirm)
MARGIN = 0.2  # Not referenced in the cells shown here; presumably the triplet-loss margin (confirm)

# Siamese Network Architecture

## Custom Siamese Network Backbone (CNNs)

In [8]:
def build_siamese_backbone(input_shape=IMG_SHAPE, embedding_size=EMBED_DIM):
    """
    Builds the convolutional backbone of the Siamese network (no fully connected layers).

    The architecture is an AlexNet-inspired stack of convolution, batch
    normalization, max-pooling and dropout layers whose final feature map is
    flattened into a single vector per image. A projection head is expected to
    be stacked on top of it by the caller.

    Args:
        input_shape (tuple): Shape of one input image, without the batch
            dimension. Defaults to `IMG_SHAPE`.
        embedding_size (int): Unused by the backbone (it has no dense layers);
            accepted only so existing callers keep working.

    Returns:
        keras.Model: CNN feature extractor named "custom_siamese_backbone"
        mapping an image batch to flattened convolutional features.
    """
    # Honor the caller-supplied shape instead of always using the global
    # IMG_SHAPE, and avoid shadowing the builtin `input`.
    image_input = layers.Input(shape=input_shape)

    # Block 1: large-kernel convolution without padding, then downsample.
    x = layers.Conv2D(96, (11, 11), strides=1, padding="valid", activation="relu")(image_input)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(3, 3), strides=2)(x)

    # Block 2: 5x5 convolution, downsample, regularize.
    x = layers.Conv2D(256, (5, 5), strides=1, padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=3, strides=2)(x)
    x = layers.Dropout(0.3)(x)

    # Block 3: two stacked 3x3 convolutions, downsample, regularize.
    x = layers.Conv2D(384, (3, 3), strides=1, padding="same", activation="relu")(x)
    x = layers.Conv2D(256, (3, 3), strides=1, padding="same", activation="relu")(x)
    x = layers.MaxPooling2D(pool_size=3, strides=2)(x)
    x = layers.Dropout(0.3)(x)

    # Flatten the final feature map. For a (224, 224, 3) input this is
    # 25 * 25 * 256 = 160,000 features, so the first Dense layer placed on top
    # of this backbone will be large.
    features = layers.Flatten()(x)
    return Model(inputs=image_input, outputs=features, name="custom_siamese_backbone")

In [11]:
# Build the convolutional backbone with its default arguments and print its layer summary.
siamese_backbone = build_siamese_backbone()
siamese_backbone.summary()

## Construct Siamese Network (freeze CNN layers, only train on fully connected layers)

In [9]:
def construct_siamese(embedding_size=EMBED_DIM, freeze_backbone=True):
    """
    Constructs the full Siamese embedding network on top of the custom CNN backbone.

    The backbone returned by `build_siamese_backbone` is followed by a fully
    connected head (Dense -> BatchNormalization -> ReLU -> Dropout -> Dense)
    and an L2-normalization step along the feature axis.

    Note:
        The backbone is created fresh inside this function and no weights are
        loaded into it. With `freeze_backbone=True` (the default, kept for
        backward compatibility) its convolutional layers therefore stay at
        their initial values and only the head is trained. Pass
        `freeze_backbone=False` to train the convolutional layers as well.

    Args:
        embedding_size (int): Width of both Dense layers, and therefore the
            size of the final L2-normalized embedding vector.
        freeze_backbone (bool): If True, mark the whole backbone as
            non-trainable; if False, leave it trainable.

    Returns:
        keras.Model: Model named "siamese_net_embedder" that takes an image of
        shape `IMG_SHAPE` and outputs an L2-normalized embedding vector of
        size `embedding_size`.
    """
    base_model = build_siamese_backbone(IMG_SHAPE)
    base_model.trainable = not freeze_backbone

    # Fully connected projection head on top of the backbone features.
    # `image_input` avoids shadowing the builtin `input`.
    image_input = layers.Input(shape=IMG_SHAPE)
    x = base_model(image_input)
    x = layers.Dense(embedding_size)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(embedding_size)(x)

    # Normalize each embedding to unit L2 norm along the feature axis.
    outputs = layers.Lambda(lambda t: tf.math.l2_normalize(t, axis=1))(x)

    return Model(inputs=image_input, outputs=outputs, name="siamese_net_embedder")

In [10]:
# Build the embedding network with its default embedding size and print its layer summary.
siamese_net = construct_siamese()
siamese_net.summary()

In [None]:
def build_triplet_model():
    '''
    Assemble the three-branch model used to train the embedding network.

    A single embedder (from `construct_siamese`) is applied to each of the
    three image inputs, so all branches share the same weights.

    Parameters:
        None
    Returns:
        A Keras Model whose inputs are [scene, positive, negative] images of
        shape `IMG_SHAPE` and whose outputs are the corresponding embeddings,
        in the same order.
    '''
    shared_embedder = construct_siamese()

    # Branch order is fixed: scene (anchor), positive, negative.
    triplet_inputs = [tf.keras.Input(shape=IMG_SHAPE) for _ in range(3)]

    # Reusing one embedder instance for every branch is what ties the weights.
    triplet_embeddings = [shared_embedder(branch) for branch in triplet_inputs]

    return Model(inputs=triplet_inputs, outputs=triplet_embeddings)

The rest of the notebook is the same as Shameek's code.