# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

Data Science Festival - November 2020

A talk by Dr. Gordon Blackadder & Neha Patel

(This workbook was completed during the session by Arvindra Sehmi. All mistakes are my own. Use at your own discretion.)

### *** _BEST TO RUN THIS IN GPU MODE IN GOOGLE COLAB_ ***

# Imports

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Import training data

In [4]:
# Download the workshop data straight from the ASOS GitHub repository (needs network access).
# train / valid: DataFrames of purchases read from parquet files.
train = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet")
valid = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet")
# dummy_users / products: header-less CSV files flattened into 1-D arrays of
# user id strings and integer product ids respectively.
dummy_users = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None).values.flatten().astype(str)
products = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv", header=None).values.flatten().astype(int)

# The briefest intro to tf

Tensors

In [5]:
# x is a 1x3 float32 row vector; Y is a 3x4 float32 matrix whose rows are identical.
x = tf.constant([[1,2,3]], dtype=tf.float32)
Y = tf.constant([[1,2,3,4], [1,2,3,4], [1,2,3,4]], dtype=tf.float32)

In [6]:
# Element-wise square of x.
tf.math.square(x)

<tf.Tensor: id=2, shape=(1, 3), dtype=float32, numpy=array([[1., 4., 9.]], dtype=float32)>

In [7]:
# Echo Y so the notebook displays the 3x4 matrix.
Y

<tf.Tensor: id=1, shape=(3, 4), dtype=float32, numpy=
array([[1., 2., 3., 4.],
       [1., 2., 3., 4.],
       [1., 2., 3., 4.]], dtype=float32)>

Gradients

In [10]:
# Run the squaring inside a GradientTape context and print the result.
# NOTE(review): no gradient is requested here (tape.gradient is never called).
# To differentiate with respect to x, a tf.constant, the tape would first
# need tape.watch(x).
with tf.GradientTape() as tape:
  y = tf.math.square(x)
  print(y)

tf.Tensor([[1. 4. 9.]], shape=(1, 3), dtype=float32)


Multiply and add tensors

In [11]:
# Matrix product of the 1x3 row vector with the 3x4 matrix, giving a 1x4 result.
tf.matmul(x,Y)

<tf.Tensor: id=6, shape=(1, 4), dtype=float32, numpy=array([[ 6., 12., 18., 24.]], dtype=float32)>

In [12]:
# Four-element float32 vector, used below as the bias of the dense layers.
z = tf.constant([10, 11, 12, 13], dtype=tf.float32)

This operation is very common in deep learning, so it has been abstracted:

In [13]:
# Dense layer with 4 units whose kernel is preset to Y and bias to z,
# so dl1(x) computes x @ Y + z.
dl1 = tf.keras.layers.Dense(4, use_bias=True, weights=[Y, z])
dl1(x)

<tf.Tensor: id=36, shape=(1, 4), dtype=float32, numpy=array([[16., 23., 30., 37.]], dtype=float32)>

You can choose to apply a function to each value in the output

In [14]:
# Same kernel and bias as before, but the activation adds 1 to every output value.
dl2 = tf.keras.layers.Dense(4, use_bias=True, weights=[Y, z], activation = lambda x: x+1)
dl2(x)

<tf.Tensor: id=66, shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>

We can put different layers together in a sequence:

In [16]:
# Bias-free single-unit layer. Its fixed 4x1 kernel [0, 1, 0, 1] adds up the
# second and fourth input values.
dl3 = tf.keras.layers.Dense(
    1,
    use_bias=False,
    weights=[tf.constant([[0], [1], [0], [1]], dtype=tf.float32)],
)

In [17]:
# Keep the output of the first layer so it can be fed into dl3 next.
x_b = dl2(x)
x_b

<tf.Tensor: id=74, shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>

In [18]:
# Second layer applied to the first layer's output: 24 + 38 = 62.
dl3(x_b)

<tf.Tensor: id=90, shape=(1, 1), dtype=float32, numpy=array([[62.]], dtype=float32)>

We get more flexibility if we subclass tf.keras.Model:

In [31]:
class simple_model(tf.keras.Model):
  """Two dense layers chained inside a Keras model.

  dl2 is a 4-unit dense layer with bias, initialised from `weights` and
  followed by `activation`. dl3 is a bias-free single-unit layer whose fixed
  kernel [0, 1, 0, 1] adds up the second and fourth outputs of dl2.
  """

  # NOTE(review): the default `weights` list is created once, when the class
  # body is executed, from the module-level tensors Y and z.
  def __init__(self, weights=[Y, z], activation = lambda x: x+1):
    """Create both layers; `weights` is the [kernel, bias] pair for dl2."""
    super().__init__()
    self.dl2 = tf.keras.layers.Dense(
        4, use_bias=True, weights=weights, activation=activation)
    selector_kernel = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    self.dl3 = tf.keras.layers.Dense(
        1, use_bias=False, weights=[selector_kernel])

  def call(self, inputs):
    """Feed `inputs` through dl2 and then dl3."""
    return self.dl3(self.dl2(inputs))

# Y and z are the constant tensors defined earlier; they become dl2's kernel and bias.
model = simple_model(weights=[Y, z], activation = lambda x: x+1)

# Forward pass through both layers of the model.
print(model(x))

# Sanity check: the model must give the same value as chaining dl2 and dl3 by hand.
print(dl3(x_b))

tf.Tensor([[62.]], shape=(1, 1), dtype=float32)
tf.Tensor([[62.]], shape=(1, 1), dtype=float32)


So far we have been setting the weights of the dense layers ourselves. If we don't set them, the kernel weights are chosen randomly (the bias starts at zero, as the `get_weights()` output below shows).

In [33]:
# No weights supplied: the layer creates its own when it is first called.
dl6 = tf.keras.layers.Dense(4, use_bias=True)
dl6(x)

<tf.Tensor: id=410, shape=(1, 4), dtype=float32, numpy=
array([[-0.9553546 ,  0.27141684,  0.7869278 ,  1.8094403 ]],
      dtype=float32)>

In [34]:
# Inspect what the layer created: a 3x4 kernel followed by a 4-element bias.
dl6.get_weights()

[array([[ 0.17946231,  0.3158579 ,  0.7906002 ,  0.89545834],
        [ 0.07899165,  0.36987293,  0.84525   ,  0.2538587 ],
        [-0.4309334 , -0.26139563, -0.5647241 ,  0.13542151]],
       dtype=float32), array([0., 0., 0., 0.], dtype=float32)]

# Define a Recommender Model

The embedding layer gives a list of random numbers for each user and each product.

In [35]:
# Embedding table with 5 rows (one per id 0..4), each a vector of 8 floats.
embed1 = tf.keras.layers.Embedding(5, 8)

In [36]:
# Look up the vector stored for id 2, i.e. row 2 of the embedding table.
embed1(2)

<tf.Tensor: id=427, shape=(8,), dtype=float32, numpy=
array([-0.02888781, -0.01914313, -0.00491112,  0.03842533, -0.00172539,
       -0.03043145, -0.00501364, -0.03553481], dtype=float32)>

In [37]:
# Show the whole 5x8 embedding table.
embed1.get_weights()

[array([[ 1.5794162e-02,  2.0063892e-03,  2.0384643e-02, -1.7105162e-02,
          4.2638209e-02,  2.9360678e-02,  1.5085403e-02, -1.6151022e-02],
        [ 2.6083898e-02,  2.3039188e-02, -3.3267990e-02,  1.8169213e-02,
         -1.1945985e-02,  4.6186294e-02,  3.9192449e-02, -4.8213243e-02],
        [-2.8887808e-02, -1.9143129e-02, -4.9111247e-03,  3.8425330e-02,
         -1.7253868e-03, -3.0431449e-02, -5.0136447e-03, -3.5534810e-02],
        [ 7.2288513e-04,  2.0099174e-02,  9.5255673e-06, -3.8681187e-02,
         -3.5180010e-02, -2.0404089e-02, -5.8357492e-03,  4.7624495e-02],
        [-6.3845031e-03,  4.9663570e-02,  4.7471736e-02,  1.1598073e-02,
         -1.9805059e-03, -3.7484538e-02,  3.5688531e-02,  3.8580671e-03]],
       dtype=float32)]

Scores can be found using the dot product.

In [38]:
# One 6-dimensional embedding row per known user and per known product.
dummy_user_embedding = tf.keras.layers.Embedding(len(dummy_users), 6)
product_embedding = tf.keras.layers.Embedding(len(products), 6)

In [39]:
# Vector stored in embedding row 99 (a row index, not a product id).
product_embedding(99)

<tf.Tensor: id=443, shape=(6,), dtype=float32, numpy=
array([ 0.04675224,  0.03770312,  0.03921   , -0.04692386,  0.03347466,
        0.04358738], dtype=float32)>

We can score multiple products at the same time, which is what we need to create a ranking.

In [40]:
# Dot product of one user vector with one product vector, giving a scalar score.
tf.tensordot(dummy_user_embedding(1), product_embedding(99), axes=[[0],[0]])

<tf.Tensor: id=471, shape=(), dtype=float32, numpy=-0.0032592646>

In [41]:
# Four embedding row indices looked up in one call, giving a 4x6 matrix.
example_product =tf.constant([1, 77, 104, 2062])
product_embedding(example_product)

<tf.Tensor: id=473, shape=(4, 6), dtype=float32, numpy=
array([[ 0.02182484,  0.02987485,  0.00529094,  0.04103741, -0.04168433,
         0.02252043],
       [-0.00056878, -0.02282388, -0.02507555, -0.04795363,  0.04792894,
        -0.01607227],
       [ 0.03156673, -0.01974326, -0.00154377,  0.02860535,  0.01547731,
        -0.04691483],
       [ 0.00692496, -0.03354611, -0.04435643,  0.03910199, -0.02541283,
         0.02769155]], dtype=float32)>

In [42]:
# Contract the user vector (axis 0) with the embedding axis of the 4x6 product
# matrix (axis 1): one score per product.
tf.tensordot(dummy_user_embedding(1), product_embedding(example_product), axes=[[0],[1]])

<tf.Tensor: id=487, shape=(4,), dtype=float32, numpy=array([ 0.00067707, -0.00065943,  0.00081888,  0.00128921], dtype=float32)>

And we can score multiple users for multiple products which we will need to do if we are to train quickly.

But we need to map product ids to embedding ids.

In [48]:
# Echo the raw product ids: they are not contiguous, so they cannot index an embedding directly.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [49]:
# Immutable lookup from raw product id to its position in `products`
# (the embedding row). Ids that are not in the table map to -1.
product_table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(products, dtype=tf.int32),
        values=range(len(products)),
    ),
    default_value=-1,
)

In [50]:
# Translate one raw product id into its embedding row index.
product_table.lookup(tf.constant([12058615]))

<tf.Tensor: id=526, shape=(1,), dtype=int32, numpy=array([29694])>

Let's put those two things together

In [51]:
class SimpleRecommender(tf.keras.Model):
    """Dot-product recommender over learned user and product embeddings.

    Raw ids are translated to embedding rows through static hash tables; the
    score of a (user, product) pair is the dot product of their two vectors.
    """

    def __init__(self, dummy_users, products, length_of_embedding):
        """Build the id lookup tables, both embedding layers and the dot layer.

        Args:
            dummy_users: user id strings; the position of an id is its embedding row.
            products: integer product ids; the position of an id is its embedding row.
            length_of_embedding: size of every user and product vector.
        """
        super().__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)

        # Ids that are missing from a table resolve to -1.
        user_rows = tf.lookup.KeyValueTensorInitializer(
            self.dummy_users, range(len(dummy_users)))
        self.dummy_user_table = tf.lookup.StaticHashTable(user_rows, -1)
        product_rows = tf.lookup.KeyValueTensorInitializer(
            self.products, range(len(products)))
        self.product_table = tf.lookup.StaticHashTable(product_rows, -1)

        self.user_embedding = tf.keras.layers.Embedding(
            len(dummy_users), length_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(
            len(products), length_of_embedding)

        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score every candidate product for the matching user.

        Args:
            inputs: pair of (user ids, product ids); in this notebook users
                arrive with shape (batch, 1) and products with shape (batch, n).

        Returns:
            The dot-product scores with axis 1 squeezed away.
        """
        user_rows = self.dummy_user_table.lookup(inputs[0])
        product_rows = self.product_table.lookup(inputs[1])

        user_vectors = self.user_embedding(user_rows)
        product_vectors = self.product_embedding(product_rows)

        # The squeeze is needed to make the data shapes compatible - remove it
        # to see what happens (required only for GPU model fitting).
        scores = self.dot([user_vectors, product_vectors])
        return tf.squeeze(scores, 1)

    @tf.function
    def call_item_item(self, product):
        """Return the ids and scores of the 100 products scoring highest against `product`."""
        row = self.product_table.lookup(product)
        query = tf.expand_dims(self.product_embedding(row), 0)

        # Note: reading `.embeddings` only works once the layer has been built!
        catalogue = tf.expand_dims(self.product_embedding.embeddings, 0)
        similarity = tf.reshape(self.dot([query, catalogue]), [-1])

        best_scores, best_rows = tf.math.top_k(similarity, k=100)
        return tf.gather(self.products, best_rows), best_scores

In [52]:
# Echo the user ids to see their format (20-character strings).
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [53]:
# Echo the product ids to pick some examples for the call below.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [54]:
# Untrained recommender with 15-dimensional embeddings, called on two users
# (shape 2x1) with three candidate products each (shape 2x3).
sr1 = SimpleRecommender(dummy_users, products, 15)
sr1([
    tf.constant([['pmfkU4BNZhmtLgJQwJ7x'], ['UDRRwOlzlWVbu7H8YCCi']]),
    tf.constant([
        [8650774, 9306139, 9961521],
        [12058614, 12058615, 11927550],
    ]),
])

<tf.Tensor: id=575, shape=(2, 3), dtype=float32, numpy=
array([[-0.00024299, -0.00437703, -0.00178926],
       [-0.00742602, -0.00197724, -0.00097019]], dtype=float32)>

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [55]:
# Echo the training frame: one (dummyUserId, productId) purchase per row.
train

Unnamed: 0,dummyUserId,productId
0,b'PIXcm7Ru5KmntCy0yA1K',10524048
1,b'd0RILFB1hUzNSINMY4Ow',9137713
2,b'Ebax7lyhnKRm4xeRlWW2',5808602
3,b'vtigDw2h2vxKt0sJpEeU',10548272
4,b'r4GfiEaUGxziyjX0PyU6',10988173
...,...,...
165037,b'7Eom5Ancozj01ozGxAMK',9071435
165038,b'zi9vZETHqSIZK0TM2nZc',10413104
165039,b'fVCveec9P946asY5wqGm',9859881
165040,b'VJtfpw602SZHh2qwarK4',10809487


In [56]:
# Double brackets keep the column axis, so every dataset element has shape (1,).
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Peek at the first (user, product) pair only.
for x, y in dataset.take(1):
    print(x)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


In [57]:
# Echo the product ids again before sampling from them.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [58]:
# Draw 7 row positions uniformly from [0, len(products)).
random_negative_indexes = tf.random.uniform((7,), minval = 0, maxval = len(products), dtype=tf.int32 )
random_negative_indexes

<tf.Tensor: id=589, shape=(7,), dtype=int32, numpy=array([ 6181, 10012, 27496, 19431,  8281, 24337, 28733])>

In [59]:
# Turn the sampled row positions into the product ids stored there.
tf.gather(products, random_negative_indexes)

<tf.Tensor: id=592, shape=(7,), dtype=int32, numpy=
array([12871981, 12889804,  9297200, 11490257, 11177882, 11119126,
       12448245])>

In [60]:
# Plain NumPy indexing: the product id stored at row 26597.
products[26597]

10604200

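For each purchase let's sample a number of products at random to stand in for products that the user did not purchase (the sampling is uniform over the whole catalogue, so a sampled product can occasionally be one the user did buy). Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.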

We can do this using dataset.map

In [61]:
# Label vector of length 11 with a 1 in slot 0 and zeros elsewhere.
tf.one_hot(0, depth=11)

<tf.Tensor: id=597, shape=(11,), dtype=float32, numpy=array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [62]:
class Mapper:
    """Callable for `Dataset.map` that pads each purchase with random negatives."""

    def __init__(self, possible_products, num_negative_products):
        """Store the product catalogue and prepare the constant label vector.

        Args:
            possible_products: product ids that negatives are drawn from.
            num_negative_products: how many negatives to add per purchase.
        """
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

        self.num_negative_products = num_negative_products
        # The purchased product always sits in slot 0, so the label never changes.
        self.y = tf.one_hot(0, depth=num_negative_products + 1)

    def __call__(self, user, product):
        """Return ((user, [product, *negatives]), label) for one purchase."""
        # Uniform sampling over the whole catalogue: a negative can
        # occasionally coincide with the purchased product.
        sampled_rows = tf.random.uniform(
            (self.num_negative_products,),
            minval=0,
            maxval=self.num_possible_products,
            dtype=tf.int32,
        )
        sampled_ids = tf.gather(self.possible_products_tensor, sampled_rows)
        return (user, tf.concat([product, sampled_ids], axis=0)), self.y

In [64]:
# Every purchase is followed by 10 random negatives, giving 11 candidates.
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor)).map(Mapper(products, 10))

# Peek at the first mapped element only.
for (u, c), y in dataset.take(1):
    print(u)
    print(c)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor(
[10524048 11697587 11510443 11190050 12246662 12787701 10364265 12741203
 11427835 11679288 10272142], shape=(11,), dtype=int32)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)


Let's bring the steps together to define a function which creates a dataset.

In [65]:
def get_dataset(df, products, num_negative_products, batch_size=1024):
    """Build a batched dataset of purchases padded with random negatives.

    Args:
        df: DataFrame with "dummyUserId" and "productId" columns.
        products: product ids that negatives are sampled from.
        num_negative_products: number of negatives added to each purchase.
        batch_size: number of purchases per batch; defaults to 1024.

    Returns:
        A tf.data.Dataset yielding ((users, candidates), labels) batches, where
        the purchased product is the first candidate of every row.
    """
    # Double brackets keep the column axis, so each element has shape (1,).
    dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
    product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
    dataset = dataset.map(Mapper(products, num_negative_products))
    return dataset.batch(batch_size)

In [66]:
# Peek at the first batch: users, candidates (purchase + 4 negatives) and labels.
for (u, c), y in get_dataset(train, products, 4).take(1):
    print(u)
    print(c)
    print(y)

tf.Tensor(
[[b'PIXcm7Ru5KmntCy0yA1K']
 [b'd0RILFB1hUzNSINMY4Ow']
 [b'Ebax7lyhnKRm4xeRlWW2']
 ...
 [b'xuX9n8PHfSR0AP3UZ8ar']
 [b'iNnxsPFfOa9884fMjVPJ']
 [b'aD8Mn12im8lFPzXAY41P']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[10524048  9500597  9412799  9791769 13120534]
 [ 9137713  9377456 11822513  9179829  9702625]
 [ 5808602 12501015 12154941 11428985  9870223]
 ...
 [11541336  8899028 10171028  9669331 10614982]
 [ 7779232 10712238 10460513 11122660 12515722]
 [ 4941259  9805645  8068539 10399596 10879982]], shape=(1024, 5), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

In [67]:
# Fresh recommender with 15-dimensional user and product embeddings.
model = SimpleRecommender(dummy_users, products, 15)
# from_logits=True because the model outputs raw dot-product scores, not probabilities.
# NOTE(review): learning_rate=100. is unusually large for SGD - confirm it is intentional.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer = tf.keras.optimizers.SGD(learning_rate=100.),
              metrics = [tf.keras.metrics.CategoricalAccuracy()])
# Train for 5 epochs; each purchase is ranked against 100 random negatives,
# in both the training and the validation data.
model.fit(get_dataset(train, products, 100), validation_data= get_dataset(valid, products, 100), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x184517b4a08>

Let's do a manual check on whether the model is any good.

In [68]:
# Product id used to eyeball the item-to-item recommendations.
test_product = 11698965

In [69]:
# call_item_item takes a scalar int32 product id and returns (top_ids, top_scores).
print("Recs for item {}: {}".format(test_product, model.call_item_item(tf.constant(test_product, dtype=tf.int32))))

Recs for item 11698965: (<tf.Tensor: id=4563, shape=(100,), dtype=int32, numpy=
array([ 9565144, 10573792, 12280078, 12261521, 11890074, 10958640,
       11613063, 11187698, 11673558, 10999905, 11001547, 12280604,
        9489449,  8713815,  8678087, 10277067, 12254938, 11103317,
       12227117, 10104302, 10994983, 11227753, 10275703,  9097969,
       11001554, 11701095, 11173772, 12074695, 10294086, 10313006,
       10940885,  9544989,  8569920, 11001552, 11116710, 11051706,
        9882761,  8941836,  9585059,  7690166,  8853625, 13344830,
       11489649,  6036771, 10972726, 10102320, 12357421,  9616053,
       10534339, 12033161, 12695251, 10566525, 12297404, 10231893,
        9071435, 12273427, 11441270, 12507832, 10999903, 11778735,
       11506467, 10960540, 11865802, 13000935,  8867906, 10594200,
       11852897, 10123841, 10104809, 11633169, 10119530, 10442992,
        7908164, 11399936, 11088255, 11430300, 11230719, 10334478,
       11568490,  9414102, 11927847,  8868237, 11

try http://asos.com/prd/11698965 vs. http://asos.com/prd/10351406


# Save the model
### _This part works only in Colab_

In [None]:
# Export directory for the SavedModel.
model_path = "models/recommender/1"

In [None]:
# The exported function takes a single scalar int32 tensor.
input_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [None]:
# Trace call_item_item for that input spec and export it under the key 'call_item_item'.
signatures = { 'call_item_item': model.call_item_item.get_concrete_function(input_signature)}

In [None]:
# Write the model and the signature above to model_path in SavedModel format.
tf.saved_model.save(model, model_path, signatures=signatures)

In [None]:
# Reload the export (same path as model_path) and list the signature keys it exposes.
imported_model = tf.saved_model.load('models/recommender/1')
list(imported_model.signatures.keys())

In [None]:
# The signature was traced with TensorSpec(shape=(), dtype=tf.int32), so pass
# a scalar id rather than a one-element vector.
# NOTE(review): confirm that 14844847 is present in `products`; unknown ids
# resolve to table row -1.
imported_model.signatures['call_item_item'](tf.constant(14844847, dtype=tf.int32))

In [None]:
# Export the model without explicit signatures, then reload it.
# exist_ok=True keeps the cell re-runnable once the directory exists.
os.makedirs("dummy/0", exist_ok=True)
tf.saved_model.save(model, 'dummy/0')
imported = tf.saved_model.load("dummy/0")
# NOTE(review): the model's call() indexes inputs[0] (users) and inputs[1]
# (products); confirm that calling the reloaded object with a single
# product-id tensor is really what is intended here.
imported(tf.constant([10351406]))

In [None]:
# Export again, this time passing a single concrete function as `signatures`;
# TensorFlow stores a lone function under the default key 'serving_default'.
# exist_ok=True keeps the cell re-runnable once the directory exists.
os.makedirs("dummy/1", exist_ok=True)
tf.saved_model.save(model, 'dummy/1',
                    signatures=model.call_item_item.get_concrete_function(
                        tf.TensorSpec(shape=(), dtype=tf.int32)))
# Reload from the new directory: without this, `imported_model` would still be
# the earlier export, which has no 'serving_default' key.
imported_model = tf.saved_model.load('dummy/1')
list(imported_model.signatures.keys())

In [None]:
# The exported function was traced with TensorSpec(shape=(), dtype=tf.int32),
# so pass a scalar id rather than a one-element vector.
imported_model.signatures['serving_default'](tf.constant(10351406, dtype=tf.int32))

Zipping the saved model will make it easier to download.

In [None]:
import os
from zipfile import ZIP_DEFLATED, ZipFile

# Bundle every file under models/recommender into models.zip.
# ZIP_DEFLATED compresses the entries; ZipFile's default only stores them.
with ZipFile('models.zip', 'w', compression=ZIP_DEFLATED) as archive:
    # os.walk yields (folder, sub-folders, files) for each directory level.
    for folder_name, _subfolders, filenames in os.walk("models/recommender"):
        for filename in filenames:
            # Path relative to the working directory; it is kept as the archive name.
            archive.write(os.path.join(folder_name, filename))