# How to run

The directory containing this notebook is expected to contain:
- train.csv (note: the data-loading cell below reads `train_preproc.csv`)
- folder noisy-images containing the images

Run cells sequentially

In [1]:
# from google.colab import files
# uploaded = files.upload()
# !unzip images.zip

In [4]:
import csv
import pandas as pd
import numpy as np
from PIL import Image

import tensorflow as tf
from tensorflow import keras
from keras import Model, losses, layers

import matplotlib.pyplot as plt

import os


print(tf.__version__)

2.12.0


### Get csv data and convert text embedding into np array 

In [5]:
def _parse_embedding(serialized):
    """Turn one serialized text embedding into a float array.

    The first and last characters are dropped (presumably the surrounding
    brackets of a printed array) and the remainder is split on whitespace.
    """
    tokens = serialized[1:-1].split()
    return np.array(tokens, dtype=float)


df = pd.read_csv("train_preproc.csv", delimiter=',')

# Replace the string form of every text embedding with its numeric array
df['noisyTextDescription'] = df['noisyTextDescription'].map(_parse_embedding)

### Load images

In [6]:
def load_image(id):
    """Load ``noisy-images/<id>.jpg`` and return its pixel data divided by 255.

    Args:
        id: Identifier used to build the image file name. The parameter name
            shadows the builtin ``id`` but is kept so existing calls still work.

    Returns:
        A NumPy float array with the image data scaled by 1/255.
    """
    # The context manager releases the underlying file as soon as the pixels
    # have been read, even if decoding raises.
    with Image.open('noisy-images/{id}.jpg'.format(id=id)) as img:
        return np.asarray(img) / 255


# Only the id is needed per image, so map over that column directly instead
# of materialising a full row Series for every call.
df["noisyImage"] = df["id"].map(load_image)

### Get unique elements in categorical inputs 
- category
- gender
- baseColour
- season
- usage

Categorical features are embedded as a one-hot tensor

In [7]:
def get_categorical_embedding(categorical_input, silent=False):
    """Build a one-hot encoder for one categorical column of the global ``df``.

    Args:
        categorical_input: Name of the column in ``df`` to encode.
        silent: When False, print the number of distinct values and the
            values themselves.

    Returns:
        A callable mapping a single column value to a NumPy vector of zeros
        with a 1 at the position of that value in the column's list of
        unique values (order of first appearance). A value not present in
        the column raises ``ValueError``.
    """
    vocabulary = df[categorical_input].unique().tolist()
    width = len(vocabulary)

    if not silent:
        print("{n} possible values for {name}".format(n = width, name = categorical_input))
        print(vocabulary)

    def embed(feature):
        # A fresh vector is allocated on every call, so callers may mutate it
        one_hot = np.zeros(width)
        one_hot[vocabulary.index(feature)] = 1
        return one_hot

    return embed


# for categorical_input in ['category', 'gender', 'baseColour', 'season', 'usage']:
#     possible_values = data_points[categorical_input].unique()
#     num_possible_values = len(possible_values)

#     print("{n} possible values for {name}".format(n = num_possible_values, name = categorical_input))
#     print(possible_values)
    
# One encoder per categorical column; each call also prints the values found
# in that column, in the order listed here.
(
    category_embedding,
    gender_embedding,
    baseColour_embedding,
    season_embedding,
    usage_embedding,
) = [
    get_categorical_embedding(column_name)
    for column_name in ('category', 'gender', 'baseColour', 'season', 'usage')
]


27 possible values for category
['Sandal', 'Bottomwear', 'Shoes', 'Topwear', 'Innerwear', 'Loungewear and Nightwear', 'Watches', 'Fragrance', 'Eyewear', 'Lips', 'Bags', 'Saree', 'Wallets', 'Scarves', 'Jewellery', 'Dress', 'Ties', 'Flip Flops', 'Headwear', 'Makeup', 'Belts', 'Socks', 'Nails', 'Free Gifts', 'Apparel Set', 'Cufflinks', 'Accessories']
5 possible values for gender
['Men', 'Women', 'Girls', 'Unisex', 'Boys']
46 possible values for baseColour
['Tan', 'Blue', 'White', 'Black', 'Beige', 'Pink', 'Green', 'Red', 'Brown', 'Grey', 'Yellow', 'Magenta', 'Steel', 'Purple', 'Orange', 'Silver', 'Navy Blue', 'Maroon', 'Gold', 'Olive', 'Cream', 'Peach', 'Lavender', 'Coffee Brown', 'Grey Melange', 'Teal', 'Rust', 'Multi', 'Charcoal', 'Turquoise Blue', 'Rose', 'Off White', 'Skin', 'Khaki', 'Metallic', 'Nude', 'Mustard', 'Copper', 'Burgundy', 'Sea Green', 'Mauve', 'Mushroom Brown', 'Bronze', 'Taupe', 'Lime Green', 'Fluorescent Green']
4 possible values for season
['Summer', 'Fall', 'Winter',

### Encode categorical features

In [8]:
possible_categories = df['category'].unique()
n_categories = len(possible_categories)

print('{n} possible categories'.format(n=n_categories))

# Sanity check: show the one-hot vector produced for every category label
for category in possible_categories:
    line = '{category} : {embedding}'.format(
        category=category,
        embedding=category_embedding(category),
    )
    print(line)


27 possible categories
Sandal : [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Bottomwear : [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Shoes : [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Topwear : [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Innerwear : [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Loungewear and Nightwear : [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Watches : [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Fragrance : [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Eyewear : [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Lips : [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Bags : [0. 0. 0. 0.

In [9]:
# Replace the raw labels of each categorical column with their one-hot vectors.
# NOTE: the columns are overwritten in place, so re-running this cell would
# feed one-hot arrays into encoders that were built for the raw labels.
for column_name, embedding_fn in (
    ('category', category_embedding),
    ('gender', gender_embedding),
    ('baseColour', baseColour_embedding),
    ('season', season_embedding),
    ('usage', usage_embedding),
):
    df[column_name] = df[column_name].map(embedding_fn)


In [10]:
# Join the one-hot vectors of the input attributes into a single vector per
# row. 'category' is left out: it is used as the prediction target below.
df["categoricalData"] = df.apply(
    lambda row: np.concatenate(
        [row[column_name] for column_name in ('gender', 'baseColour', 'season', 'usage')],
        axis=0,
    ),
    axis=1,
)

### Separate into training and validation sets

In [11]:
# Separate into training and validation sets with an 80/20 ratio.
# sample(frac=1) returns every row in shuffled order; random_state makes the
# shuffle reproducible.
# Positional slicing replaces np.split(DataFrame, ...), which relies on the
# deprecated DataFrame.swapaxes and warns or fails on recent pandas versions.
shuffled = df.sample(frac=1, random_state=480)
n_train = int(0.8 * len(df))
train_dataframe = shuffled.iloc[:n_train]
validation_dataframe = shuffled.iloc[n_train:]


print("{n} items in training set".format(n = len(train_dataframe)))
print("{n} items in validation set".format(n = len(validation_dataframe)))

17301 items in training set
4326 items in validation set


In [12]:
# Pull the target and the three input columns out of each split as NumPy
# arrays (at this point still arrays of per-row arrays).
y_train, x_text_train, x_img_train, x_categorical_train = [
    train_dataframe[column_name].to_numpy()
    for column_name in ('category', 'noisyTextDescription', 'noisyImage', 'categoricalData')
]

y_validation, x_text_validation, x_img_validation, x_categorical_validation = [
    validation_dataframe[column_name].to_numpy()
    for column_name in ('category', 'noisyTextDescription', 'noisyImage', 'categoricalData')
]


In [13]:
def _to_batch_array(samples):
    """Stack the per-sample arrays in ``samples`` along a new leading axis."""
    return np.vstack([np.array([sample]) for sample in samples])


# Training inputs
x_img_train = _to_batch_array(x_img_train)
print(x_img_train.shape)

x_text_train = _to_batch_array(x_text_train)
print(x_text_train.shape)

x_categorical_train = _to_batch_array(x_categorical_train)
print(x_categorical_train.shape)

# Validation inputs
x_img_validation = _to_batch_array(x_img_validation)
x_text_validation = _to_batch_array(x_text_validation)
x_categorical_validation = _to_batch_array(x_categorical_validation)

print(x_img_validation.shape)
print(x_text_validation.shape)
print(x_categorical_validation.shape)

# Targets
y_train = _to_batch_array(y_train)
print(y_train.shape)

y_validation = _to_batch_array(y_validation)
print(y_validation.shape)


(17301, 80, 60, 3)
(17301, 768)
(17301, 62)
(4326, 80, 60, 3)
(4326, 768)
(4326, 62)
(17301, 27)
(4326, 27)


In [14]:
# Drop the notebook-level reference to the full DataFrame; nothing below this
# cell reads it. (Presumably done to reduce memory use — TODO confirm, since
# the train/validation frames above still hold the same data.)
del df

### Define NN models

In [19]:
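# Model overview
# - Image branch: CNN with 4 residual conv blocks (skip connection), each
#   followed by 2x2 max pooling and dropout, then global average pooling
# - Text branch: dense layer that reduces the precomputed text embedding
# - Categorical attributes: one-hot vector fed directly into the concatenation

# The three branches are concatenated into a single 1D feature vector
# dense layer -> dropout -> softmax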

class ResidualCNNLayer(layers.Layer):
    """Residual convolution block.

    Two convolutions with ``n_filters // 2`` filters are applied to the
    input; a 1x1 convolution of the input is added to their output as the
    skip connection, and a final convolution with ``n_filters`` filters
    produces the block output.

    Args:
        n_filters: Number of filters of the final convolution. The inner
            convolutions and the skip projection use half of it.
        kernel_size: Kernel size of the three main convolutions.
        **kwargs: Standard Keras ``Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, n_filters=64, kernel_size=3, **kwargs) -> None:
        super(ResidualCNNLayer, self).__init__(**kwargs)
        # Conv2D expects an integer filter count; `n_filters / 2` would pass
        # a float (e.g. 32.0), so use floor division.
        half_filters = n_filters // 2
        self.conv1 = layers.Conv2D(filters=half_filters, kernel_size=kernel_size, padding='same')
        self.conv2 = layers.Conv2D(filters=half_filters, kernel_size=kernel_size, padding='same')
        self.conv3 = layers.Conv2D(filters=n_filters, kernel_size=kernel_size, padding='same')
        # NOTE(review): this single BatchNormalization instance is applied at
        # two points in call(), so both share one set of parameters and
        # moving statistics. Kept as-is so existing checkpoints still load;
        # consider two separate layers when retraining from scratch.
        self.batch_normalization = layers.BatchNormalization()
        # 1x1 convolution that brings the input to half_filters channels so
        # it can be added to the output of conv2.
        self.identity_mapping = layers.Conv2D(filters=half_filters, kernel_size=1, padding='same')
        self.relu = layers.ReLU()

    def call(self, x, training=False):
        """Apply the block to ``x``; ``training`` is forwarded to batch norm."""
        x_skip = x
        # Layer 1
        x = self.conv1(x)
        x = self.batch_normalization(x, training=training)
        x = self.relu(x)
        # Layer 2
        x = self.conv2(x)

        # Add the projected input (residual connection). Plain tensor
        # addition avoids creating a new Add layer on every call.
        x = x + self.identity_mapping(x_skip)

        x = self.batch_normalization(x, training=training)
        x = self.relu(x)

        # Layer 3: expand to the full filter count
        x = self.conv3(x)
        x = self.relu(x)
        return x


class Img(layers.Layer):
    """Image branch: four residual blocks, each followed by max pooling and
    dropout, reduced to a feature vector by global average pooling."""

    def __init__(self) -> None:
        super().__init__()
        self.res1 = ResidualCNNLayer(n_filters=64, kernel_size=3)
        self.res2 = ResidualCNNLayer(n_filters=128, kernel_size=3)
        self.res3 = ResidualCNNLayer(n_filters=256, kernel_size=3)
        self.res4 = ResidualCNNLayer(n_filters=512, kernel_size=3)
        self.flattening_function = layers.GlobalAveragePooling2D()

        # padding='same' matters once a feature-map side is no longer
        # divisible by 2 (the original note mentions the last residual layer)
        self.pool = layers.MaxPool2D(padding='same')
        self.dropout = layers.Dropout(0.35)

    def call(self, x, training=False):
        """Run the image through all stages; ``training`` toggles batch norm
        and dropout behaviour."""
        features = x
        # Every stage is: residual block -> max pool -> dropout
        for residual_block in (self.res1, self.res2, self.res3, self.res4):
            features = residual_block(features, training=training)
            features = self.pool(features)
            features = self.dropout(features, training=training)

        return self.flattening_function(features)


class Txt(layers.Layer):
    """Text branch: one 256-unit dense layer followed by ReLU."""

    def __init__(self) -> None:
        super().__init__()
        self.dense1 = layers.Dense(256)

        self.relu = layers.ReLU()

    def call(self, x, training=False):
        """Project the text embedding; ``training`` is accepted but unused."""
        projected = self.dense1(x)
        return self.relu(projected)


class NN(Model):
    """Classifier combining categorical, text and image inputs.

    The text and image inputs go through their own branches, are
    concatenated with the categorical vector, and pass through a dense
    layer with dropout before a 27-way softmax.
    """

    def __init__(self) -> None:
        super().__init__()
        # Image branch
        self.img_layer = Img()

        # Text branch: the preprocessed transformer embedding comes from a
        # general-purpose model, so it is reduced to a smaller representation
        self.txt_layer = Txt()

        # Head applied after the three inputs are combined
        self.dense1 = layers.Dense(512, activation='relu')

        self.dropout = layers.Dropout(0.5)

        self.classification_layer = layers.Dense(27, activation='softmax')

        self.concat = layers.Concatenate()

    def call(self, x, training=False):
        """Return class probabilities for ``x = (categorical, text, image)``."""
        categorical, text, image = x
        text_features = self.txt_layer(text, training=training)
        image_features = self.img_layer(image, training=training)

        merged = self.concat([categorical, text_features, image_features])
        hidden = self.dense1(merged)
        hidden = self.dropout(hidden, training=training)
        return self.classification_layer(hidden)

    

In [20]:
model = NN()
# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer='Adam',
    metrics=['accuracy'],
)

In [21]:
checkpoint_path = "checkpoints/cp.ckpt"

# Save weights only, and only when the monitored metric improves. No monitor
# is given, so the callback default applies (the training log reports val_loss).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop once val_accuracy has not improved for 4 epochs; the check only starts
# from epoch 4. Note this watches a different metric than the checkpoint.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    start_from_epoch=4,
    patience=4,
)


In [21]:
# Debugging aid: turn off TensorFlow's traceback filtering so that errors
# show the full stack trace. Not needed for a normal training run.
tf.debugging.disable_traceback_filtering()

In [22]:
# Resume from the weights of a previous run when a checkpoint exists. On a
# clean directory there is nothing to load and load_weights would raise, so
# the call is guarded to keep "Restart & Run All" working.
if os.path.exists(checkpoint_path + ".index"):
    model.load_weights(checkpoint_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f9b08af2520>

In [22]:
# Input order must match the unpacking in NN.call: categorical, text, image
train_inputs = (x_categorical_train, x_text_train, x_img_train)
validation_inputs = (x_categorical_validation, x_text_validation, x_img_validation)

r = model.fit(
    train_inputs,
    y_train,
    validation_data=(validation_inputs, y_validation),
    batch_size=64,
    epochs=20,
    verbose=2,
    callbacks=[stop_early, cp_callback],
)

Epoch 1/20

Epoch 1: val_loss improved from inf to 1.38941, saving model to checkpoints/cp.ckpt
271/271 - 52s - loss: 1.4358 - accuracy: 0.6234 - val_loss: 1.3894 - val_accuracy: 0.6579 - 52s/epoch - 190ms/step
Epoch 2/20

Epoch 2: val_loss improved from 1.38941 to 1.18043, saving model to checkpoints/cp.ckpt
271/271 - 27s - loss: 0.8106 - accuracy: 0.7840 - val_loss: 1.1804 - val_accuracy: 0.6993 - 27s/epoch - 101ms/step
Epoch 3/20

Epoch 3: val_loss improved from 1.18043 to 1.06523, saving model to checkpoints/cp.ckpt
271/271 - 27s - loss: 0.6023 - accuracy: 0.8316 - val_loss: 1.0652 - val_accuracy: 0.7168 - 27s/epoch - 99ms/step
Epoch 4/20

Epoch 4: val_loss did not improve from 1.06523
271/271 - 27s - loss: 0.5093 - accuracy: 0.8587 - val_loss: 1.1317 - val_accuracy: 0.6882 - 27s/epoch - 98ms/step
Epoch 5/20

Epoch 5: val_loss improved from 1.06523 to 1.05894, saving model to checkpoints/cp.ckpt
271/271 - 28s - loss: 0.4467 - accuracy: 0.8725 - val_loss: 1.0589 - val_accuracy: 0.70

In [21]:
# Shell command: archive the checkpoints directory into checkpoints.zip
!zip -r checkpoints.zip ./checkpoints/

updating: checkpoints/ (stored 0%)
updating: checkpoints/checkpoint (deflated 38%)
updating: checkpoints/.ipynb_checkpoints/ (stored 0%)
updating: checkpoints/cp.ckpt.index (deflated 72%)
updating: checkpoints/cp.ckpt.data-00000-of-00001 (deflated 28%)


In [3]:
# NOTE(review): `files` is only defined when the commented-out
# `from google.colab import files` line in the first cell has been run;
# on a fresh kernel this cell raises NameError.
files.download('checkpoints.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# Restore the weights stored at checkpoint_path before exporting the model
model.load_weights(checkpoint_path)
# Save the model under the nn_model5 directory
model.save('nn_model5')
# Archive the saved model and download the archive.
# NOTE(review): `files` requires the commented-out
# `from google.colab import files` line in the first cell.
!zip -r nn_model5.zip ./nn_model5/
files.download('nn_model5.zip')



  adding: nn_model5/ (stored 0%)
  adding: nn_model5/variables/ (stored 0%)
  adding: nn_model5/variables/variables.index (deflated 73%)
  adding: nn_model5/variables/variables.data-00000-of-00001 (deflated 13%)
  adding: nn_model5/assets/ (stored 0%)
  adding: nn_model5/saved_model.pb (deflated 90%)
  adding: nn_model5/keras_metadata.pb (deflated 92%)
  adding: nn_model5/fingerprint.pb (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# Print the model's layers with their output shapes and parameter counts
model.summary()

Model: "nn_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 img_1 (Img)                 multiple                  788384    
                                                                 
 txt_1 (Txt)                 multiple                  196864    
                                                                 
 dense_5 (Dense)             multiple                  10649088  
                                                                 
 dense_6 (Dense)             multiple                  262656    
                                                                 
 dropout_3 (Dropout)         multiple                  0         
                                                                 
 dense_7 (Dense)             multiple                  13851     
                                                                 
 concatenate_1 (Concatenate)  multiple                 0      