In [1]:
# imports and helper libraries
import os
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
os.environ['KMP_AFFINITY'] = 'noverbose'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Remote locations of the ASOS dsf2020 files used in this notebook.
train_url = "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet"
valid_url = "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet"
dummy_users_url = "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv"
products_url = "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv"

# Training and validation frames.
train = pd.read_parquet(train_url)
valid = pd.read_parquet(valid_url)

# The id files have no header row; flatten each to a 1-D array
# (user ids as str, product ids as int).
dummy_users = pd.read_csv(dummy_users_url, header=None).to_numpy().ravel().astype(str)
products = pd.read_csv(products_url, header=None).to_numpy().ravel().astype(int)

## Defining the Recommender Model

In [4]:
# Toy embedding table: 5 possible indices, each mapped to an 8-dimensional vector.
embed1 = tf.keras.layers.Embedding(input_dim=5, output_dim=8)

In [5]:
# Look up the 8-dimensional vector stored at row index 2 of the toy embedding.
embed1(2)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-0.03090502, -0.03142109,  0.00747465, -0.00368375,  0.04794667,
        0.03024275, -0.03664469,  0.00443583], dtype=float32)>

In [6]:
# The layer holds a single weight matrix (5 rows x 8 columns here);
# the vector returned for index 2 above is row 2 of this matrix.
embed1.get_weights()

[array([[-0.04748151,  0.00709735, -0.02149141, -0.00406612, -0.02391967,
         -0.02052337, -0.0371755 ,  0.02259222],
        [ 0.0332836 , -0.02688477,  0.01686648,  0.00957299,  0.02626557,
         -0.02232741,  0.03806034,  0.00116964],
        [-0.03090502, -0.03142109,  0.00747465, -0.00368375,  0.04794667,
          0.03024275, -0.03664469,  0.00443583],
        [-0.02845308,  0.00437478,  0.03625715,  0.03565258,  0.03072571,
          0.04779761, -0.03749662, -0.0483506 ],
        [-0.01640891, -0.0468799 ,  0.03553256, -0.04159101, -0.04083144,
          0.03385096,  0.02842072, -0.01853668]], dtype=float32)]

In [7]:
# Raw user ids: a 1-D array of 20-character strings.
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [8]:
# Raw product ids: a 1-D integer array.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [9]:
# One 6-dimensional embedding row per entry of `dummy_users` and of `products`.
dummy_user_embedding = tf.keras.layers.Embedding(input_dim=len(dummy_users), output_dim=6)
products_embedding = tf.keras.layers.Embedding(input_dim=len(products), output_dim=6)

In [10]:
# Embedding vector for the user stored at row index 1.
dummy_user_embedding(1)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([ 0.02363217, -0.01482123,  0.04005045, -0.00431831, -0.00850946,
       -0.04536475], dtype=float32)>

In [11]:
# Embedding vector for the product stored at row index 99.
products_embedding(99)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([ 0.02785682,  0.01728873, -0.0105334 , -0.04317793, -0.01579964,
        0.03429708], dtype=float32)>

In [12]:
# Score for one (user, product) pair: the dot product of their two embedding vectors.
user_vector = dummy_user_embedding(1)
product_vector = products_embedding(99)
tf.tensordot(user_vector, product_vector, axes=[[0], [0]])

<tf.Tensor: shape=(), dtype=float32, numpy=-0.0012547674>

In [13]:
# Passing a vector of row indices returns one embedding row per index: shape (4, 6) here.
example_product = tf.constant([1, 77, 104, 2062])
products_embedding(example_product)

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[-0.02743806, -0.0306223 ,  0.00016682,  0.01159703, -0.02883006,
         0.04038957],
       [-0.0175973 , -0.04700191,  0.01031879,  0.01022498, -0.03636285,
         0.04981625],
       [ 0.00380818, -0.00935956, -0.0036974 , -0.02798309, -0.00140879,
         0.04953668],
       [ 0.03741796,  0.01892288,  0.04435607, -0.005238  ,  0.04076185,
         0.02899151]], dtype=float32)>

In [14]:
# Contract the user vector (its axis 0) with the embedding axis (axis 1) of the
# (4, 6) product matrix, giving one score per example product.
user_vector = dummy_user_embedding(1)
product_matrix = products_embedding(example_product)
tf.tensordot(user_vector, product_matrix, axes=[[0], [1]])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([-0.00182489, -0.00130059, -0.00203376,  0.00074085], dtype=float32)>

### Mapping Product IDs to Embedding IDs

In [15]:
# Shown again for reference: these raw string ids must be mapped to embedding row indices.
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [16]:
# Shown again for reference: these raw integer ids must be mapped to embedding row indices.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [17]:
# Hash table from raw product id to its position in `products`;
# ids that are not in the table resolve to the default value -1.
product_index_initializer = tf.lookup.KeyValueTensorInitializer(
    tf.constant(products, dtype=tf.int32),
    range(len(products)),
)
product_table = tf.lookup.StaticHashTable(product_index_initializer, -1)

In [18]:
# Translate a raw product id into its embedding row index.
product_table.lookup(tf.constant([12058614]))

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([29693], dtype=int32)>

In [49]:
class SimpleRecommender(tf.keras.Model):
    """Scores (user, product) pairs as the dot product of learned embeddings.

    Raw user ids (strings) and raw product ids (int32) are translated into
    embedding row indices through static hash tables built from the id lists
    given to the constructor. An id that is missing from a table resolves to
    the table's default value, -1.
    """

    def __init__(self, dummy_users, products, len_embed):
        """Create the id lookup tables, the two embeddings and the dot layer.

        Args:
            dummy_users: sequence of user id strings; an id's position in the
                sequence is its embedding row.
            products: sequence of integer product ids (converted to int32);
                an id's position in the sequence is its embedding row.
            len_embed: number of dimensions of both embeddings.
        """
        super().__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)

        # Raw id -> embedding row index; unknown ids map to -1.
        self.dummy_user_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.dummy_users, range(len(dummy_users))), -1)
        self.product_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.products, range(len(products))), -1)

        self.user_embedding = tf.keras.layers.Embedding(len(dummy_users), len_embed)
        self.product_embedding = tf.keras.layers.Embedding(len(products), len_embed)

        # Dot product along the embedding axis of the user and product vectors.
        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score each candidate product for each user.

        Args:
            inputs: pair ``(user, products)`` of raw ids. As used in this
                notebook, ``user`` is a string tensor of shape (batch, 1) and
                ``products`` an int32 tensor of shape (batch, n_candidates).

        Returns:
            Tensor of dot-product scores with axis 1 of the dot output
            squeezed away, i.e. shape (batch, n_candidates) for the input
            shapes above.
        """
        user = inputs[0]
        products = inputs[1]

        user_embedding_index = self.dummy_user_table.lookup(user)
        product_embedding_index = self.product_table.lookup(products)

        user_embedding_values = self.user_embedding(user_embedding_index)
        product_embedding_values = self.product_embedding(product_embedding_index)

        return tf.squeeze(self.dot([user_embedding_values, product_embedding_values]), 1)

    @tf.function
    def call_item_item(self, product, k=100):
        """Return the products whose embeddings score highest against ``product``.

        Args:
            product: raw product id tensor. Presumably a single id (scalar or
                shape (1,)); the flattened scores are indexed against the
                catalogue, so several ids at once would not line up.
                TODO confirm against callers.
            k: Python int, maximum number of products to return. Defaults to
                100 (the previously hard-coded value) and is clamped to the
                catalogue size so that ``top_k`` cannot be asked for more
                entries than there are products.

        Returns:
            Tuple ``(top_ids, top_scores)``: the raw product ids with the
            highest dot-product scores and those scores, best first.
        """
        product_x = self.product_table.lookup(product)
        pe = tf.expand_dims(self.product_embedding(product_x), 0)

        #! `.embeddings` only exists once the embedding layer has been built
        all_pe = tf.expand_dims(self.product_embedding.embeddings, 0)
        scores = tf.reshape(self.dot([pe, all_pe]), [-1])

        # `self.products` is a constant, so its length is known statically.
        k = min(k, self.products.shape[0])
        top_scores, top_indices = tf.math.top_k(scores, k=k)
        top_ids = tf.gather(self.products, top_indices)

        return top_ids, top_scores

In [50]:
srl = SimpleRecommender(dummy_users, products, 15)

# Smoke test: two users (shape (2, 1)), each scored against three candidate
# products (shape (2, 3)).
sample_users = tf.constant([["UDRRwOlzlWVbu7H8YCCi"], ["QHGAef0TI6dhn0wTogvW"]])
sample_candidates = tf.constant([[8650774, 9306139, 9961521], [12058614, 12058615, 11927550]])
srl([sample_users, sample_candidates])

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 0.00218443, -0.0056198 , -0.00149052],
       [-0.00083884, -0.00186805,  0.00293378]], dtype=float32)>

### Creating Dataset

Creating a tf.data.Dataset from the user purchase pairs

In [51]:
# Double-bracket column selection keeps each tensor two-dimensional: (n_rows, 1).
dummy_user_tensor = tf.constant(train[["dummyUserId"]].to_numpy(), dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].to_numpy(), dtype=tf.int32)

# One dataset element per (user, product) row.
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Peek at the first element only.
for x, y in dataset.take(1):
    print(x)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


For each purchase, sample a number of products that the user did not purchase. The model can then score each of these products, and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased (use `dataset.map`).

In [52]:
# Inspect the training frame: one row per (dummyUserId, productId) pair.
train

Unnamed: 0,dummyUserId,productId
0,b'PIXcm7Ru5KmntCy0yA1K',10524048
1,b'd0RILFB1hUzNSINMY4Ow',9137713
2,b'Ebax7lyhnKRm4xeRlWW2',5808602
3,b'vtigDw2h2vxKt0sJpEeU',10548272
4,b'r4GfiEaUGxziyjX0PyU6',10988173
...,...,...
165037,b'7Eom5Ancozj01ozGxAMK',9071435
165038,b'zi9vZETHqSIZK0TM2nZc',10413104
165039,b'fVCveec9P946asY5wqGm',9859881
165040,b'VJtfpw602SZHh2qwarK4',10809487


In [53]:
# Draw 7 random positions into `products` to act as negative samples.
random_negatives_indexes = tf.random.uniform(
    shape=(7,),
    minval=0,
    maxval=len(products),
    dtype=tf.int32,
)
random_negatives_indexes

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([ 1776,  1429, 13702, 17240,  4536,  5279,  4225], dtype=int32)>

In [54]:
# Turn the sampled positions back into raw product ids.
tf.gather(params=products, indices=random_negatives_indexes)

<tf.Tensor: shape=(7,), dtype=int64, numpy=
array([ 8658687,  9312638, 11202042, 10824947, 11947395, 11295080,
       12208225])>

In [55]:
# Length-11 one-hot vector with the 1 at position 0.
tf.one_hot(0, depth=11)

<tf.Tensor: shape=(11,), dtype=float32, numpy=array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [56]:
class Mapper:
    """Callable for ``Dataset.map`` that adds random negative products to a purchase.

    Each (user, product) element becomes ``((user, candidates), y)`` where
    ``candidates`` is the purchased product followed by
    ``num_negative_products`` randomly drawn products and ``y`` is a one-hot
    label marking position 0.
    """

    def __init__(self, possible_products, num_negative_products):
        """Store the product catalogue and precompute the label vector.

        Args:
            possible_products: sequence of integer product ids to sample from
                (converted to int32).
            num_negative_products: how many products to draw per element.
        """
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)
        self.num_possible_products = len(possible_products)

        self.num_negative_products = num_negative_products
        # The purchased product always sits at index 0 of the candidates,
        # so one constant label serves every element.
        self.y = tf.one_hot(0, num_negative_products + 1)

    def __call__(self, user, product):
        """Return ``((user, candidates), y)`` for one (user, product) element."""
        # Positions are drawn uniformly over the whole catalogue; nothing here
        # excludes the purchased product from the draw.
        sampled_positions = tf.random.uniform(
            shape=(self.num_negative_products,),
            minval=0,
            maxval=self.num_possible_products,
            dtype=tf.int32,
        )
        sampled_products = tf.gather(self.possible_products_tensor, sampled_positions)

        # Purchased product first, sampled products after it.
        return (user, tf.concat([product, sampled_products], axis=0)), self.y

In [57]:
#* 10 is the number of sampled products standing in for "not purchased"
negative_sampler = Mapper(products, 10)
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor)).map(negative_sampler)

# Peek at the first mapped element: user, 11 candidates, one-hot label.
for (u, e), y in dataset.take(1):
    print(u)
    print(e)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor(
[10524048 10234260  8118994  9758797 11841380  9198959  9748536  9025892
 12549087 11140118  9531630], shape=(11,), dtype=int32)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)


In [58]:
def get_dataset(df, products, num_negative_products, batch_size=1024):
    """Build a batched dataset of ``((user, candidates), label)`` from a purchase frame.

    Args:
        df: DataFrame with ``dummyUserId`` and ``productId`` columns.
        products: sequence of all product ids that negatives are sampled from.
        num_negative_products: number of randomly sampled products added to
            each purchase.
        batch_size: number of elements per batch. Defaults to 1024, the
            previously hard-coded value.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of
        ``((user, candidates), y)`` as produced by ``Mapper``.
    """
    # Double brackets keep a trailing axis of size 1 on both tensors.
    dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
    product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
    dataset = dataset.map(Mapper(products, num_negative_products))
    dataset = dataset.batch(batch_size)

    return dataset

In [59]:
# Peek at the first batch: users, candidates (purchase + 4 sampled) and labels.
for (u, c), y in get_dataset(train, products, 4).take(1):
    print(u)
    print(c)
    print(y)

tf.Tensor(
[[b'PIXcm7Ru5KmntCy0yA1K']
 [b'd0RILFB1hUzNSINMY4Ow']
 [b'Ebax7lyhnKRm4xeRlWW2']
 ...
 [b'xuX9n8PHfSR0AP3UZ8ar']
 [b'iNnxsPFfOa9884fMjVPJ']
 [b'aD8Mn12im8lFPzXAY41P']], shape=(1024, 1), dtype=string)
tf.Tensor(
[[10524048  9906592 12314017 12792079 12166862]
 [ 9137713  9181251  9065438 10886678 10879241]
 [ 5808602 12692359 11343908 11515266  8650774]
 ...
 [11541336  9429218  9493642 11781340 11415967]
 [ 7779232 11974396 12558915 11551124 11765849]
 [ 4941259 11712573  9498607 10250639  9994955]], shape=(1024, 5), dtype=int32)
tf.Tensor(
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]], shape=(1024, 5), dtype=float32)


## Train Model

In [60]:
model = SimpleRecommender(dummy_users, products, 15)

# from_logits=True: the loss receives the model's raw dot-product scores.
# NOTE(review): learning_rate=100. is unusually large for SGD; presumably intentional, confirm.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=100.),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [61]:
# Train and validate with 100 sampled products per purchase.
train_dataset = get_dataset(train, products, 100)
valid_dataset = get_dataset(valid, products, 100)
model.fit(train_dataset, validation_data=valid_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f4728289a50>