# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

# Imports

In [1]:
import os
from zipfile import ZIP_DEFLATED, ZipFile

import numpy as np
import pandas as pd
import tensorflow as tf

# Import training data

In [2]:
# Training and validation interactions: one row per (dummyUserId, productId) pair.
train = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet")
valid = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet")
# Header-less CSV of user ids, flattened to a 1-D array of str.
dummy_users = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None).values.flatten().astype(str)
# Header-less CSV of product ids, flattened to a 1-D array of int.
products = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv", header=None).values.flatten().astype(int)

In [3]:
train.head()

Unnamed: 0,dummyUserId,productId
0,b'PIXcm7Ru5KmntCy0yA1K',10524048
1,b'd0RILFB1hUzNSINMY4Ow',9137713
2,b'Ebax7lyhnKRm4xeRlWW2',5808602
3,b'vtigDw2h2vxKt0sJpEeU',10548272
4,b'r4GfiEaUGxziyjX0PyU6',10988173


In [4]:
valid.head()

Unnamed: 0,dummyUserId,productId
0,b'I4Yc5Ztur3UNwY5SdvDh',10093853
1,b'nhWgcxEVY7jQ3MvvNxWL',12306408
2,b'3vriQXKwG095rvR1MSrz',11858310
3,b'MA8KmOxkGd1JQ42GXDGO',10072124
4,b'vax7VgJnswdiC8iHZSCi',10596405


In [5]:
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [7]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

You can look up any product by visiting asos.com/prd/<product number>.

# The briefest intro to tf

Tensors

In [15]:
x = tf.constant([1,2,3,4])
x

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([1, 2, 3, 4], dtype=int32)>

In [11]:
tf.math.square(x)

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 1,  4,  9, 16], dtype=int32)>

The first great contribution of TensorFlow is the ability to process information in parallel.

In [12]:
tf.constant([[1,2,3],[4,5,6]])

<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)>

In [17]:
x = tf.Variable([1,2,3,4,5], dtype=tf.float32)
x

<tf.Variable 'Variable:0' shape=(5,) dtype=float32, numpy=array([1., 2., 3., 4., 5.], dtype=float32)>

The second great contribution of TensorFlow is the ability to track gradients.

Gradients

(We won't usually use this notation.)

In [23]:
with tf.GradientTape() as tape:
  y = tf.math.square(x)

In [24]:
y

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 1.,  4.,  9., 16., 25.], dtype=float32)>

In [25]:
dy_dx = tape.gradient(y, x)
dy_dx

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 2.,  4.,  6.,  8., 10.], dtype=float32)>

Multiply and add tensors

In [26]:
x = tf.constant([[1,2,3]], dtype=tf.float32)
Y = tf.constant([[1,2,3, 4], [1,2,3,4], [1,2,3,4]], dtype=tf.float32)

In [28]:
tf.matmul(x, Y)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[ 6., 12., 18., 24.]], dtype=float32)>

In [30]:
z = tf.constant([10, 11, 12, 13], dtype=tf.float32)

This operation (multiply by a weight matrix, then add a bias) is so common in deep learning that Keras abstracts it as a `Dense` layer:

In [31]:
tf.matmul(x, Y) + z

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[16., 23., 30., 37.]], dtype=float32)>

You can also choose to apply a function (an activation) to each value in the output.

In [32]:
dl1 = tf.keras.layers.Dense(4, use_bias=True, weights = [Y, z])

In [33]:
dl1(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[16., 23., 30., 37.]], dtype=float32)>

In [34]:
dl2 = tf.keras.layers.Dense(4, use_bias=True, weights = [Y, z], activation = lambda x: x+1)

In [35]:
dl2(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>

We can put different layers together in a sequence:

In [36]:
dl3 = tf.keras.layers.Dense(1, use_bias=False, \
                             weights=[tf.constant([[0], [1], [0], [1]], \
                                                  dtype=tf.float32)])

In [37]:
x_b = dl2(x)
x_b

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>

In [39]:
dl3(x_b)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[62.]], dtype=float32)>

We can get more flexibility by subclassing tf.keras.Model:

In [43]:
class simple_model(tf.keras.Model):
  """Two stacked Dense layers with fixed weights, returning intermediate values too.

  The first layer is initialised from the notebook globals `Y` (kernel) and
  `z` (bias) and adds 1 to every output; the second layer has a fixed 4x1
  kernel and no bias.
  """

  def __init__(self):
    super().__init__()
    readout_kernel = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    self.dl2 = tf.keras.layers.Dense(
        4, use_bias=True, weights=[Y, z], activation=lambda t: t + 1)
    self.dl3 = tf.keras.layers.Dense(1, use_bias=False, weights=[readout_kernel])

  def call(self, x):
    """Return (dl3 output, dl2 output, dl2 output + 104) for input `x`."""
    hidden = self.dl2(x)
    return self.dl3(hidden), hidden, hidden + 104

In [44]:
sm = simple_model()
sm(x)

(<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[62.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[121., 128., 135., 142.]], dtype=float32)>)

So far we have been setting the weights of the dense layers ourselves, but if we don't set them, the weights are initialised randomly.

In [45]:
dl6 = tf.keras.layers.Dense(4, use_bias=True)
dl6(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[-3.416037 ,  3.3947973, -3.6639075,  2.429174 ]], dtype=float32)>

In [46]:
dl6.get_weights()

[array([[-0.8504924 ,  0.16819346, -0.09807462, -0.27803534],
        [-0.08776003,  0.58277917, -0.76496553,  0.48036933],
        [-0.7966749 ,  0.6870152 , -0.6786339 ,  0.5821569 ]],
       dtype=float32), array([0., 0., 0., 0.], dtype=float32)]

# Define a Recommender Model

The embedding layer gives a list of random numbers for each user and each product.

In [48]:
embed1 = tf.keras.layers.Embedding(5, 8)

In [51]:
embed1(2)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 0.04098612, -0.00338467, -0.01981864,  0.02555584,  0.03008887,
       -0.01583308, -0.03667592, -0.00386257], dtype=float32)>

In [52]:
embed1.get_weights()

[array([[ 0.04421851, -0.00542061, -0.03804218,  0.03482567, -0.0005324 ,
         -0.00920575,  0.00887649, -0.02722143],
        [-0.01628969, -0.03299286,  0.03481543, -0.04852385, -0.04668379,
          0.00818229, -0.01548814, -0.00963731],
        [ 0.04098612, -0.00338467, -0.01981864,  0.02555584,  0.03008887,
         -0.01583308, -0.03667592, -0.00386257],
        [ 0.00307351,  0.02731453,  0.0056811 ,  0.01556709, -0.00425903,
         -0.0099046 ,  0.00051346,  0.03168214],
        [-0.00430452,  0.01493876,  0.02956978,  0.01346907,  0.00946451,
         -0.03383627,  0.02937515,  0.02745849]], dtype=float32)]

Scores can be found using the dot product.

In [55]:
# create a embedding layer of users, with 6 (or any other arbitrary numbers) random numbers for each user
dummy_user_embedding = tf.keras.layers.Embedding(len(dummy_users), 6)

In [57]:
# create an embedding layer of products, with 6 (or any other arbitrary number of) random numbers for each product
product_embedding = tf.keras.layers.Embedding(len(products), 6)

In [63]:
# Dot product of one user vector and one product vector: both are 1-D (length 6),
# so we contract axis 0 of each and get a single scalar.
tf.tensordot(dummy_user_embedding(1), product_embedding(99), axes = [[0], [0]])

# What we have just calculated is the score for one product and one user.

<tf.Tensor: shape=(), dtype=float32, numpy=0.00032884686>

We can score multiple products at the same time, which is what we need to create a ranking.

In [64]:
example_products = tf.constant([1, 77, 104, 2062])
product_embedding(example_products)

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[-0.01204646,  0.04826254, -0.04468935,  0.0375728 , -0.00027695,
        -0.04086324],
       [-0.04465527,  0.0067886 ,  0.0405813 , -0.01867063, -0.02017661,
         0.03765037],
       [-0.01314152,  0.04632732,  0.01604011, -0.00838994, -0.00933259,
         0.0485196 ],
       [ 0.03802104,  0.01117244,  0.03411022,  0.03009326, -0.02034249,
        -0.03795927]], dtype=float32)>

In [67]:
# Score one user against several products at once: contract the user's vector (axis 0)
# with the embedding axis (axis 1) of the (4, 6) product matrix, giving one score per product.
tf.tensordot(dummy_user_embedding(1), product_embedding(example_products), axes = [[0], [1]])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([-0.00218456,  0.00432109,  0.00259641,  0.00136114], dtype=float32)>

In [None]:
#the result of the previous operation is a score for the 4 products given, what we want now is to rank them

And we can score multiple users for multiple products which we will need to do if we are to train quickly.

But we need to map product ids to embedding ids.

In [70]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [72]:
# Map each raw product id to its position in `products` (its embedding row);
# ids that are not in the table map to -1.
product_table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(products, dtype=tf.int32),
        values=range(len(products)),
    ),
    default_value=-1,
)

In [74]:
product_table.lookup(tf.constant([12058614]))

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([29693], dtype=int32)>

Let's put those two things together

In [144]:
class SimpleRecommender(tf.keras.Model):
    """Recommender that scores products by the dot product of learned embeddings.

    Raw user ids (strings) and product ids (int32) are translated into
    embedding rows through static hash tables; ids that are not in the tables
    map to -1.

    Args:
        dummy_users: sequence of user ids, one embedding row each.
        products: sequence of product ids, one embedding row each.
        length_of_embedding: size of each user and product vector.
    """

    def __init__(self, dummy_users, products, length_of_embedding):
        super().__init__()
        n_users = len(dummy_users)
        n_products = len(products)

        # Keep the raw ids as tensors; `products` is also used to turn
        # embedding rows back into product ids in `call_item_item`.
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)

        # Lookup tables: raw id -> row index (position in the input sequence).
        user_initializer = tf.lookup.KeyValueTensorInitializer(
            self.dummy_users, range(n_users))
        self.dummy_user_table = tf.lookup.StaticHashTable(user_initializer, -1)
        product_initializer = tf.lookup.KeyValueTensorInitializer(
            self.products, range(n_products))
        self.product_table = tf.lookup.StaticHashTable(product_initializer, -1)

        # One randomly initialised, trainable vector per row index.
        self.user_embedding = tf.keras.layers.Embedding(n_users, length_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(n_products, length_of_embedding)

        # Dot product over the last (embedding) axis of both inputs.
        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score candidate products for each user.

        Args:
            inputs: pair `(user ids, product ids)`; in this notebook they are
                shaped (batch, 1) and (batch, n_candidates).

        Returns:
            The dot-product scores with axis 1 squeezed out, i.e. one score
            per candidate product for each user.
        """
        user_ids = inputs[0]
        candidate_ids = inputs[1]

        # Raw ids -> embedding rows.
        user_rows = self.dummy_user_table.lookup(user_ids)
        candidate_rows = self.product_table.lookup(candidate_ids)

        # Embedding rows -> vectors.
        user_vectors = self.user_embedding(user_rows)
        candidate_vectors = self.product_embedding(candidate_rows)

        scores = self.dot([user_vectors, candidate_vectors])
        return tf.squeeze(scores, 1)

    @tf.function
    def call_item_item(self, product):
        """Return the 100 products whose embeddings score highest against `product`.

        Args:
            product: a product id tensor (the notebook passes a scalar int32).

        Returns:
            `(top_ids, top_scores)`: product ids and their dot-product scores,
            as ordered by `tf.math.top_k`.
        """
        query_row = self.product_table.lookup(product)
        query_vector = tf.expand_dims(self.product_embedding(query_row), 0)

        # Note: reading `.embeddings` only works once the layer has been built.
        catalogue_vectors = tf.expand_dims(self.product_embedding.embeddings, 0)
        similarity = tf.reshape(self.dot([query_vector, catalogue_vectors]), [-1])

        best = tf.math.top_k(similarity, k=100)
        top_ids = tf.gather(self.products, best.indices)
        return top_ids, best.values

In [145]:
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [146]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [147]:
sr1 = SimpleRecommender(dummy_users, products, 15)
# Score two users against three candidate products each:
#   - first user:  the first 3 ids in `products`
#   - second user: the last 3 ids in `products`
# Users are passed with shape (2, 1) and candidates with shape (2, 3).
sr1([tf.constant([['pmfkU4BNZhmtLgJQwJ7x'], ['UDRRwOlzlWVbu7H8YCCi']]), tf.constant([[8650774,  9306139,  9961521], [12058614, 12058615, 11927550]])])

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-0.00284287, -0.00420284, -0.00070727],
       [ 0.00227325,  0.00311452, -0.00174852]], dtype=float32)>

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [117]:
# Column vectors (shape (N, 1)) of user ids and purchased product ids.
dummy_user_tensor = tf.constant(train[["dummyUserId"]].to_numpy(), dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].to_numpy(), dtype=tf.int32)

# One dataset element per (user, product) purchase pair.
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Peek at the first element only.
for x, y in dataset.take(1):
    print(x)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


In [91]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [96]:
random_negatives_indexs = tf.random.uniform((7, ), minval = 0, maxval = len(products), dtype=tf.int32)
random_negatives_indexs

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([ 8036, 19413,  4599,  1623, 15305, 24074,  7575], dtype=int32)>

In [97]:
# we can now obtain a list of random products; they are sampled uniformly from the whole catalogue, so they are only *probably* not purchased by a given customer
tf.gather(products, random_negatives_indexs)

<tf.Tensor: shape=(7,), dtype=int64, numpy=
array([11832145,  9130879,  6966879, 10755196, 12782213, 12166478,
       10256831])>

In [None]:
# 'dataset' holds the products that users have purchased. We now want to extend it with products that each user has not purchased.

For each purchase let's sample a number of products that the user did not purchase. Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.

We can do this using dataset.map

In [136]:
class Mapper:
    """Callable for `Dataset.map` that turns one purchase into a ranking example.

    Each `(user, product)` pair becomes `((user, candidates), y)`, where
    `candidates` is the purchased product followed by randomly sampled
    products, and `y` is a one-hot vector marking position 0 (the purchase).

    Negatives are drawn uniformly from all `possible_products`, so a sampled
    negative can occasionally coincide with the purchased product.
    """

    def __init__(self, possible_products, num_negative_products):
        self.num_negative_products = num_negative_products
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

        # The label is the same for every example: the positive sits at index 0.
        self.y = tf.one_hot(0, num_negative_products + 1)

    def __call__(self, user, product):
        """Return `((user, candidates), y)` for a single purchase."""
        sampled_rows = tf.random.uniform(
            shape=(self.num_negative_products,),
            minval=0,
            maxval=self.num_possible_products,
            dtype=tf.int32,
        )
        negatives = tf.gather(self.possible_products_tensor, sampled_rows)
        return (user, tf.concat([product, negatives], axis=0)), self.y

In [137]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [138]:
dataset

<MapDataset shapes: (((1,), (29706,)), (11,)), types: ((tf.string, tf.int32), tf.float32)>

In [139]:
# Attach 10 sampled negatives to every purchase.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((dummy_user_tensor, product_tensor))
    .map(Mapper(products, 10))
)

# Peek at the first example: user, candidate products, one-hot label.
for (u, c), y in dataset.take(1):
  print(u)
  print(c)
  print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor(
[10524048 12514400 10262907  8823173 10840450  7870290 11765968 10349948
 10338995  9386044 11701198], shape=(11,), dtype=int32)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)


Let's bring the steps together to define a function which creates a dataset 

In [142]:
def get_dataset(df, products, num_negative_products, batch_size=1024):
  """Build a batched dataset of ranking examples from a purchases frame.

  Args:
    df: DataFrame with `dummyUserId` and `productId` columns, one row per purchase.
    products: all product ids that negatives may be sampled from.
    num_negative_products: number of random products added to each purchase.
    batch_size: number of examples per batch (defaults to 1024).

  Returns:
    A `tf.data.Dataset` yielding `((users, candidates), labels)` batches, where
    `candidates` has `num_negative_products + 1` columns and the purchased
    product is always in column 0.
  """
  # Double brackets keep the columns 2-D, so each element has shape (1,).
  dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
  product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

  dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
  dataset = dataset.map(Mapper(products, num_negative_products))

  # We don't want to run a learning step for every single purchase, so batch them.
  return dataset.batch(batch_size)

In [143]:
# Inspect the first batch built with 4 negatives per purchase.
first_batch = get_dataset(train, products, 4).take(1)
for (u, c), y in first_batch:
  print(u)
  print(c)
  print(y)

tf.Tensor(
[[b'PIXcm7Ru5KmntCy0yA1K']
 [b'd0RILFB1hUzNSINMY4Ow']
 [b'Ebax7lyhnKRm4xeRlWW2']
 ...
 [b'xuX9n8PHfSR0AP3UZ8ar']
 [b'iNnxsPFfOa9884fMjVPJ']
 [b'aD8Mn12im8lFPzXAY41P']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[10524048 10892423 11597767  8606883 11228540]
 [ 9137713 11981868 11678202 12020960 11001860]
 [ 5808602 10131884 12076723 11754632 11760580]
 ...
 [11541336 12210627 10694141  9679607 12346969]
 [ 7779232 11679278  9996449  8903844 10011217]
 [ 4941259 10703724  9355015 12651403  9329335]], shape=(1024, 5), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

In [148]:
# Fresh model with 15-dimensional user and product embeddings.
model = SimpleRecommender(dummy_users, products, 15)
# from_logits=True: the model returns raw dot-product scores, not probabilities.
# The label is one-hot at index 0, so accuracy measures how often the purchased
# product gets the highest score among the candidates.
# NOTE(review): learning_rate=100. is unusually large for SGD — confirm it is intentional.
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              optimizer = tf.keras.optimizers.SGD(learning_rate=100.), 
              metrics = [tf.keras.metrics.CategoricalAccuracy()])

# Every purchase is ranked against 100 sampled negatives, for both training and validation.
model.fit(get_dataset(train, products, 100), validation_data = get_dataset(valid, products, 100), epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3860fb79e8>

Let's do a manual check on whether the model is any good.

In [149]:
test_product = 11698965

In [150]:
print("Recs for item {}: {}".format(test_product, model.call_item_item(tf.constant(test_product, dtype=tf.int32))))

Recs for item 11698965: (<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 6848491, 10360535, 10555920,  9099144, 12297404,  9572596,
       11113894, 10234617, 10789497, 10689723,  8907440, 10901238,
       11264058, 10102320, 12936554, 11923040, 12115410,  9361601,
        9844845, 10653500, 10326627, 11043826, 10959740, 10677145,
       10576225, 10294086, 11954508, 10252562, 10534339, 10251977,
        9791231,  9919566, 10766415, 10597379, 10667027, 10252928,
       10434319, 11051697, 12377471, 10881816, 11713447,  8670921,
        7329586, 10327862, 11613065, 13194153,  9489449, 11209140,
       11974770, 11228544, 10374100, 11698190, 10672170, 12721595,
       11296236, 11392796, 10725626, 11710889, 10490474, 11023025,
       11440869, 11832342,  8853625, 11230891,  9668689,  9179306,
       12064580, 12184081, 10818311, 10561562, 10951701, 12083964,
       10722923,  9071435,  8177027, 10460173,  9432360, 11888643,
       11015921, 10379216,  9190937, 11993803, 12018593, 1

# Save the model

In [None]:
model_path = "models/recommender/1"

In [None]:
inpute_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [None]:
# Export `call_item_item` as a named serving signature taking a scalar int32 product id.
# The trained model is `model` (there is no `r1` in this notebook).
signatures = {'call_item_item': model.call_item_item.get_concrete_function(inpute_signature)}
# Write the SavedModel to `model_path` so it can be loaded and zipped below.
tf.saved_model.save(model, model_path, signatures=signatures)

In [None]:
# Load from `model_path` ("models/recommender/1") instead of a retyped, misspelt path.
imported_model = tf.saved_model.load(model_path)
list(imported_model.signatures.keys())

In [None]:
# The signature was traced with shape=() (a scalar product id), so pass a scalar
# rather than a length-1 vector.
# NOTE(review): assumes 14844847 is one of the ids in `products` — confirm.
imported_model.signatures['call_item_item'](tf.constant(14844847, dtype=tf.int32))

In [None]:
# exist_ok=True so re-running this cell does not raise FileExistsError.
os.makedirs("dummy/0", exist_ok=True)
# Save without explicit signatures.
tf.saved_model.save(model, 'dummy/0')
imported = tf.saved_model.load("dummy/0")
# `call` expects a (user, products) pair, so calling the restored object with a
# single product tensor does not match it. Use the item-to-item function instead;
# this relies on `call_item_item` having been traced before saving (it was called
# in the manual check above with a scalar int32 id).
# NOTE(review): assumes 14844847 is one of the ids in `products` — confirm.
imported.call_item_item(tf.constant(14844847, dtype=tf.int32))

In [None]:
# exist_ok=True so re-running this cell does not raise FileExistsError.
os.makedirs("dummy/1", exist_ok=True)
# Passing a single function as `signatures` exports it under the default
# signature key, 'serving_default'.
tf.saved_model.save(model, 'dummy/1',
                    signatures=model.call_item_item.get_concrete_function(tf.TensorSpec(shape=(), dtype=tf.int32)))
# Load the model that was just written, so the keys listed belong to it and not
# to a previously loaded model.
imported_model = tf.saved_model.load('dummy/1')
list(imported_model.signatures.keys())

In [None]:
# The signature was traced with shape=() (a scalar product id), so pass a scalar
# rather than a length-1 vector.
# NOTE(review): assumes 14844847 is one of the ids in `products` — confirm.
imported_model.signatures['serving_default'](tf.constant(14844847, dtype=tf.int32))

Zipping the saved model will make it easier to download.

In [None]:
# Bundle every file under "models" into a single archive. ZIP_DEFLATED compresses
# the contents (the ZipFile default stores files uncompressed), which keeps the
# download small.
with ZipFile('recommender.zip', 'w', compression=ZIP_DEFLATED) as archive:
    # Walk the directory tree; only files are added, under their relative paths.
    for folder, _subfolders, filenames in os.walk("models"):
        for filename in filenames:
            archive.write(os.path.join(folder, filename))