In [None]:
#pip install tensorflow pandas matplotlib scikit-learn pillow requests

# Setup & Configuration


In [1]:
import ssl
import urllib.request

ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Lambda
import numpy as np
import random
from PIL import Image
from io import BytesIO
import requests
import pandas as pd
from tqdm import tqdm

In [9]:
# Constants
IMG_SHAPE = (224, 224, 3)
EMBED_DIM = 128
BATCH_SIZE = 16
MARGIN = 0.2

# ResNet50 Model


- Pretrained ResNet50 or ResNet50V2 from Keras (without classification head)
- Applies global average pooling
- Adds a projection head to reduce to 128D
- Applies L2 normalization to create unit-length embeddings

The model is used to embed both **scene images** and **product images** into the same vector space for comparison L2 similarity. (or cosine)


- **Parameters:**
  - `embedding_dim`: Output dimension of the embedding vector (default is 128).
  - `backbone`: Choice of ResNet variant. Supports `'resnet50'` and `'resnet50v2'`.


In [4]:
def build_resnet_embedding_model(embedding_dim=EMBED_DIM, backbone='resnet50'):
    '''
    Build a ResNet-based embedding model for image classification, according to the Complete the Look paper.
    
    Parameters:
        `embedding_dim`: Output dimension of the embedding vector (default is 128)
        `backbone`: Choice of ResNet variant. Supports 'resnet50' and 'resnet50v2'
    Returns:
        A Keras Model that takes an image of size (224, 224, 3) as input and outputs an embedding vector of size 'embedding_dim'
    
    '''
    IMG_SHAPE = (224, 224, 3)
    
    if backbone == 'resnet50':
        base_model = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', input_shape=IMG_SHAPE)
    elif backbone == 'resnet50v2':
        base_model = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet', input_shape=IMG_SHAPE)
    else:
        raise ValueError("Unsupported backbone")

    base_model.trainable = False  # freeze for baseline

    inputs = Input(shape=IMG_SHAPE) 
    if backbone == 'resnet50':
        x = tf.keras.applications.resnet.preprocess_input(inputs)
    elif backbone == 'resnet50v2':
        x = tf.keras.applications.resnet_v2.preprocess_input(inputs) 
    x = base_model(x, training=False) 
    x = GlobalAveragePooling2D()(x) 
    x = Dense(embedding_dim)(x) 
    x = Lambda(lambda t: tf.math.l2_normalize(t, axis=1))(x) # L2 normalization
    return Model(inputs, x, name="resnet50_embedder")

In [5]:
embedder = build_resnet_embedding_model()
embedder.summary()

2025-04-14 01:12:47.369530: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-04-14 01:12:47.369562: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-04-14 01:12:47.369570: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-04-14 01:12:47.369600: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-14 01:12:47.369631: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
embedder_v2 = build_resnet_embedding_model(backbone = 'resnet50v2')
embedder_v2.summary()

# Triplet Model

This wraps the shared embedding model to take in a scene image, a positive (compatible) product image, and a negative (incompatible) product image. 

It outputs the three corresponding embeddings which will be passed to the triplet loss during training.


In [None]:
def build_triplet_model():
    '''
    Build a triplet model for training the embedding model.
    The model takes three inputs: scene, positive, and negative images.
    It outputs the embeddings for each of these images.

    Parameters:
        None
    Returns:
        A Keras Model that takes three images as input and outputs their embeddings.
    '''
    embedder = build_resnet_embedding_model()
    scene_input = tf.keras.Input(shape=IMG_SHAPE)
    pos_input = tf.keras.Input(shape=IMG_SHAPE)
    neg_input = tf.keras.Input(shape=IMG_SHAPE)

    scene_emb = embedder(scene_input)
    pos_emb = embedder(pos_input)
    neg_emb = embedder(neg_input)

    return Model(inputs=[scene_input, pos_input, neg_input], outputs=[scene_emb, pos_emb, neg_emb])

# Triplet Loss and Training Model

- Pulls scene and positive closer
- Pushes scene and negative farther

In [None]:
def triplet_loss(anchor, positive, negative, margin=MARGIN):
    '''
    triplet loss function for training the embedding model.
    the loss function encourages the positive example to be closer to the anchor than the negative example by a margin.
    The loss is defined as the maximum of 0 and the difference between the distance of the anchor to the positive and the distance of the anchor to the negative plus a margin.
    Parameters:
        anchor: The embedding of the anchor image.
        positive: The embedding of the positive image.
        negative: The embedding of the negative image.
        margin: The margin to be enforced between the positive and negative distances.
    Returns:
        The triplet loss value.
    '''
    pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=1)
    neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=1)
    return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + margin, 0.0))

class TripletModel(tf.keras.Model):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    def compile(self, optimizer):
        super().compile()
        self.optimizer = optimizer

    def call(self, inputs):
        return self.model(inputs)

    def train_step(self, data):
        scene, pos, neg = data
        with tf.GradientTape() as tape:
            scene_emb, pos_emb, neg_emb = self.model([scene, pos, neg], training=True)
            loss = triplet_loss(scene_emb, pos_emb, neg_emb)
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        scene, pos, neg = data
        scene_emb, pos_emb, neg_emb = self.model([scene, pos, neg], training=False)
        loss = triplet_loss(scene_emb, pos_emb, neg_emb)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}


# Triplet Dataset Loader (from local image paths)

This function builds a `tf.data.Dataset` that generates batches of (scene, positive, negative) triplets from local file paths.

It:
- Randomly samples a scene and one of its matching products
- Randomly picks a product that is *not* a match (negative)
- Loads all 3 images from disk and applies augmentation
This dataset feeds directly into our training loop.


In [15]:
def create_triplet_dataset_from_local(df, batch_size=16):
    '''
    create a triplet dataset from local images.
    The dataset is created from a pandas DataFrame containing the paths to the images.
    The DataFrame should contain two columns: 'scene_path' and 'product_path'.
    The 'scene_path' column contains the paths to the scene images, and the 'product_path' column contains the paths to the product images.
    The dataset is created by randomly selecting a scene image and a positive product image from the same scene, and a negative product image from a different scene.
    The dataset is then preprocessed by loading the images, resizing them to (224, 224), and normalizing them to the range [0, 1].
    Parameters:
        df: A pandas DataFrame containing the paths to the images.
        batch_size: The batch size for the dataset.
    Returns:
        A TensorFlow dataset containing the triplet images.
    '''
    scene_to_pos = df.groupby("scene_path")["product_path"].apply(list).to_dict()
    all_products = df["product_path"].tolist()
    scene_paths = list(scene_to_pos.keys())

    def get_triplet(scene_path):
        pos = random.choice(scene_to_pos[scene_path])
        while True:
            neg = random.choice(all_products)
            if neg not in scene_to_pos[scene_path]:
                break
        return scene_path.encode(), pos.encode(), neg.encode()

    def generator():
        while True:
            scene = random.choice(scene_paths)
            yield get_triplet(scene)

    def load_local_image(path):
        image = tf.io.read_file(path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, [256, 256])
        image = tf.image.random_crop(image, [224, 224, 3])
        image = tf.image.random_flip_left_right(image)
        return tf.cast(image, tf.float32) / 255.0

    def map_paths(scene_path, pos_path, neg_path):
        s = load_local_image(scene_path)
        p = load_local_image(pos_path)
        n = load_local_image(neg_path)
        return s, p, n

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(), dtype=tf.string),
        )
    )

    return dataset.map(map_paths).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [20]:
fashion_train_df = pd.read_csv("/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/fashion_train_data.csv")
fashion_val_df = pd.read_csv("/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/fashion_validate_data.csv")

fashion_train_ds = create_triplet_dataset_from_local(fashion_train_df)
fashion_val_ds = create_triplet_dataset_from_local(fashion_val_df)

In [26]:
home_train_df = pd.read_csv("/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/home_train_data.csv")
home_val_df = pd.read_csv("/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/home_validate_data.csv")

home_train_ds = create_triplet_dataset_from_local(home_train_df)
home_val_ds = create_triplet_dataset_from_local(home_val_df)


In [None]:
triplet_net = build_triplet_model()
trainer = TripletModel(triplet_net)
trainer.compile(optimizer=tf.keras.optimizers.Adam(1e-4))

trainer.fit(fashion_train_ds, validation_data=fashion_val_ds, steps_per_epoch=100, validation_steps=20, epochs=10)

In [22]:
fashion_model = TripletModel(build_triplet_model())
fashion_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4))

fashion_model.fit(fashion_train_ds, validation_data=fashion_val_ds,
                  steps_per_epoch=100, validation_steps=20, epochs=10)


Epoch 1/10


2025-04-14 08:26:45.379369: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 550ms/step - loss: 0.1974 - val_loss: 0.2063
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 457ms/step - loss: 0.1997 - val_loss: 0.2024
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 453ms/step - loss: 0.1980 - val_loss: 0.1938
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 466ms/step - loss: 0.2000 - val_loss: 0.1974
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 512ms/step - loss: 0.1951 - val_loss: 0.2042
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 626ms/step - loss: 0.1963 - val_loss: 0.2006
Epoch 7/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 635ms/step - loss: 0.1981 - val_loss: 0.1966
Epoch 8/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 627ms/step - loss: 0.1966 - val_loss: 0.1963
Epoch 9/10
[1m100/100[0m 

<keras.src.callbacks.history.History at 0x3473e7550>

In [27]:
home_model = TripletModel(build_triplet_model())
home_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4))

home_model.fit(home_train_ds, validation_data=home_val_ds,
               steps_per_epoch=100, validation_steps=20, epochs=10)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 595ms/step - loss: 0.2019 - val_loss: 0.1985
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 483ms/step - loss: 0.1991 - val_loss: 0.1983
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 890ms/step - loss: 0.1975 - val_loss: 0.1932
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 890ms/step - loss: 0.1954 - val_loss: 0.1877
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 797ms/step - loss: 0.1766 - val_loss: 0.1892
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 805ms/step - loss: 0.1881 - val_loss: 0.1861
Epoch 7/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 920ms/step - loss: 0.1867 - val_loss: 0.1986
Epoch 8/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 1s/step - loss: 0.1829 - val_loss: 0.1895
Epoch 9/10
[1m100

<keras.src.callbacks.history.History at 0x320325bb0>

In [28]:
fashion_model.save("fashion_triplet_model.keras")
home_model.save("home_triplet_model.keras")

  return saving_lib.save_model(model, filepath)


In [30]:
sample = next(iter(fashion_val_ds.take(1)))
scene, pos, neg = sample
embedder = fashion_model.model.get_layer("resnet50_embedder")

print(embedder(scene).shape)
print(embedder(pos).shape)
print(embedder(neg).shape)

(16, 128)
(16, 128)
(16, 128)


In [31]:
sample = next(iter(fashion_val_ds.take(1)))
scene, pos, neg = sample
embedder = fashion_model.model.get_layer("resnet50_embedder")

print(embedder(scene).shape)
print(embedder(pos).shape)
print(embedder(neg).shape)

(16, 128)
(16, 128)
(16, 128)


In [None]:
for layer in fashion_model.model.layers:
    print(layer.name)

In [33]:
import numpy as np
def compute_avg_distances(embedder, dataset, steps=50):
    pos_dists, neg_dists = [], []

    for batch in dataset.take(steps):
        scene, pos, neg = batch
        s_emb = embedder(scene)
        p_emb = embedder(pos)
        n_emb = embedder(neg)

        pos_dist = tf.norm(s_emb - p_emb, axis=1).numpy()
        neg_dist = tf.norm(s_emb - n_emb, axis=1).numpy()

        pos_dists.extend(pos_dist.tolist())
        neg_dists.extend(neg_dist.tolist())

    return np.mean(pos_dists), np.mean(neg_dists)

# Example:
embedder = fashion_model.model.get_layer("resnet50_embedder")
avg_pos, avg_neg = compute_avg_distances(embedder, fashion_val_ds)
print(f"Avg positive distance: {avg_pos:.4f}")
print(f"Avg negative distance: {avg_neg:.4f}")


Avg positive distance: 0.2619
Avg negative distance: 0.2768


2025-04-14 08:57:07.610017: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
embedder = home_model.model.get_layer("resnet50_embedder")
avg_pos, avg_neg = compute_avg_distances(embedder, home_val_ds)
print(f"Avg positive distance: {avg_pos:.4f}")
print(f"Avg negative distance: {avg_neg:.4f}")

Avg positive distance: 0.3930
Avg negative distance: 0.4110


2025-04-14 08:58:30.133661: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
