Copyright 2021 The TensorFlow Authors.

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[K     |████████████████████████████████| 462 kB 36.2 MB/s 
[K     |████████████████████████████████| 4.2 MB 5.3 MB/s 
[?25h

In [None]:
import os
import pprint
import tempfile
import matplotlib.pyplot as plt
from typing import Dict, Text
import pickle
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [None]:

# Colab-only: mount Google Drive at /content/drive. The relative
# './drive/MyDrive/...' paths used in later cells resolve to this mount only
# while the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read articles.csv from the mounted Drive as a tf.data dataset, one CSV row
# per batch. ignore_errors=True asks the reader to skip records it cannot
# parse instead of raising.
# NOTE(review): make_csv_dataset shuffles and repeats indefinitely unless
# shuffle / num_epochs are set — confirm that is intended before iterating.
articles_ds = tf.data.experimental.make_csv_dataset(
    './drive/MyDrive/csv_data/articles.csv',
    batch_size=1, # Artificially small to make examples easier to show.
    ignore_errors=True,)

In [None]:
# Keep only the article id and squeeze away the size-1 batch axis so every
# element is a scalar.
# NOTE(review): candidate_ds is not referenced again in the visible cells; the
# retrieval candidates are built from `articles` instead.
candidate_ds = articles_ds.map(lambda x: { "article_id": tf.squeeze(x["article_id"])})

In [None]:
# Display the dataset repr to check its element_spec.
candidate_ds

<MapDataset element_spec={'article_id': TensorSpec(shape=(), dtype=tf.int32, name=None)}>

In [None]:
# Raw (still serialized) train / test examples stored as TFRecord files on the
# mounted Drive. Nothing is read until the datasets are iterated.
train_filename = "./drive/MyDrive/output_hm/train_recommenders_v1.tfrecord"
train = tf.data.TFRecordDataset(train_filename)

test_filename = "./drive/MyDrive/output_hm/test_recommenders_v1.tfrecord"
test = tf.data.TFRecordDataset(test_filename)
# Length of every fixed-size context_* sequence feature in the TFRecords.
CONTEXT_SEQUENCE_LENGTH = 5


def _sequence_feature(dtype, fill_value):
  """Spec for a feature of shape [CONTEXT_SEQUENCE_LENGTH].

  Args:
    dtype: TensorFlow dtype of the stored values.
    fill_value: Value repeated over the whole sequence when the feature is
      missing from an example.

  Returns:
    A `tf.io.FixedLenFeature` describing the sequence.
  """
  return tf.io.FixedLenFeature(
      [CONTEXT_SEQUENCE_LENGTH], dtype,
      default_value=np.repeat(fill_value, CONTEXT_SEQUENCE_LENGTH))


def _single_feature(dtype, fill_value):
  """Spec for a feature of shape [1], defaulting to `fill_value` when missing."""
  return tf.io.FixedLenFeature([1], dtype, default_value=fill_value)


# Parsing schema for one serialized example: key -> shape, dtype and default.
feature_description = {
    # Length-CONTEXT_SEQUENCE_LENGTH sequence features.
    "context_article_id": _sequence_feature(tf.int64, -1),
    "context_product_code": _sequence_feature(tf.int64, -1),
    "context_product_type_no": _sequence_feature(tf.int64, -1),
    "context_graphical_appearance_no": _sequence_feature(tf.int64, -1),
    "context_colour_group_code": _sequence_feature(tf.int64, -1),
    "context_perceived_colour_value_id": _sequence_feature(tf.int64, -1),
    "context_perceived_colour_master_id": _sequence_feature(tf.int64, -1),
    "context_department_no": _sequence_feature(tf.int64, -1),
    "context_index_code": _sequence_feature(tf.string, "Missing"),
    "context_index_group_no": _sequence_feature(tf.int64, -1),
    "context_section_no": _sequence_feature(tf.int64, -1),
    "context_garment_group_no": _sequence_feature(tf.int64, -1),
    "context_timestamp": _sequence_feature(tf.int64, -1),
    "context_price": _sequence_feature(tf.float32, 0.0),
    "context_sales_channel_id": _sequence_feature(tf.int64, -1),
    # Shape-[1] features.
    "context_fn": _single_feature(tf.int64, -1),
    "context_active": _single_feature(tf.int64, -1),
    "context_club_member_status": _single_feature(tf.string, "Missing"),
    "context_fashion_news_frequency": _single_feature(tf.string, "Missing"),
    "context_age": _single_feature(tf.int64, -1),
    "label_article_id": _single_feature(tf.int64, -1),
}

def _parse_function(example_proto):
  """Decode one serialized example according to `feature_description`.

  Args:
    example_proto: A single serialized example, as yielded by the TFRecord
      datasets.

  Returns:
    Dict mapping every key of `feature_description` to its parsed tensor.
  """
  parsed_features = tf.io.parse_single_example(
      serialized=example_proto, features=feature_description)
  return parsed_features

# Parsed context features that are passed through unchanged, only losing their
# "context_" prefix. The order matches the output dict order.
_PASSTHROUGH_FEATURES = (
    "product_code",
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
    "timestamp",
    "price",
    "sales_channel_id",
    "fn",
    "active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
)


def _to_model_features(parsed):
  """Rename parsed TFRecord fields to the names the models consume.

  Article ids (context and label) are converted from integers to strings so
  they can be fed to the string lookup layers; every other feature is
  forwarded as-is under its name without the "context_" prefix.

  Args:
    parsed: Dict returned by `_parse_function`.

  Returns:
    Dict with keys "article_id", the names in `_PASSTHROUGH_FEATURES`, and
    "label_article_id".
  """
  features = {"article_id": tf.strings.as_string(parsed["context_article_id"])}
  for name in _PASSTHROUGH_FEATURES:
    features[name] = parsed[f"context_{name}"]
  features["label_article_id"] = tf.strings.as_string(parsed["label_article_id"])
  return features


# Train and test share one transformation so the two splits cannot drift apart.
train_ds = train.map(_parse_function).map(_to_model_features)
test_ds = test.map(_parse_function).map(_to_model_features)

# Sanity check: print the first training example.
for example in train_ds.take(1).as_numpy_iterator():
  pprint.pprint(example)

{'active': array([1]),
 'age': array([19]),
 'article_id': array([b'737260001', b'793108003', b'809487001', b'0', b'0'], dtype=object),
 'club_member_status': array([b'ACTIVE'], dtype=object),
 'colour_group_code': array([ 9, 43,  9,  0,  0]),
 'department_no': array([1515, 1338, 1338,    0,    0]),
 'fashion_news_frequency': array([b'Regularly'], dtype=object),
 'fn': array([1]),
 'garment_group_no': array([1010, 1017, 1017,    0,    0]),
 'graphical_appearance_no': array([1010016, 1010016, 1010016,       0,       0]),
 'index_code': array([b'A', b'B', b'B', b'UNK', b'UNK'], dtype=object),
 'index_group_no': array([1, 1, 1, 0, 0]),
 'label_article_id': array([b'874961004'], dtype=object),
 'perceived_colour_master_id': array([ 5, 18,  5,  0,  0]),
 'perceived_colour_value_id': array([4, 4, 4, 0, 0]),
 'price': array([0.00337288, 0.0169322 , 0.03049153, 0.        , 0.        ],
      dtype=float32),
 'product_code': array([737260, 793108, 809487,      0,      0]),
 'product_type_no': a

In [None]:
# Candidate corpus used by the retrieval metric: the label article ids found in
# the training data, as scalar strings. Duplicates are dropped so an article
# that appears as a label many times is scored once instead of taking up
# several top-K slots.
articles = train_ds.map(lambda x: tf.squeeze(x["label_article_id"])).unique()
articles

<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [None]:
# Features stored with shape [1] in the TFRecords.
str_features_dense = [
    'fashion_news_frequency',
    'club_member_status',
]
int_features_dense = [
    'fn',
    'active',
]

# Features stored as fixed-length sequences in the TFRecords.
str_features_time = [
    'index_code',
]
int_features_time = [
    'section_no',
    'sales_channel_id',
    'product_type_no',
    'product_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'index_group_no',
    'graphical_appearance_no',
    'garment_group_no',
    'department_no',
    'colour_group_code',
]

In [None]:
# Load the precomputed statistics. The models read them as
# stats_dict['stats_dict'][feature_name] to obtain each feature's vocabulary.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if its origin is trusted.
with open(r"drive/MyDrive/output_hm/stats_dict.pkl", "rb") as stats_file:
  stats_dict = pickle.load(stats_file)

In [None]:
class CandidateModel(tf.keras.Model):
  """Candidate tower: maps article-id strings to learned embeddings.

  The vocabulary is read from the module-level `stats_dict` and converted to
  strings, matching the string article ids produced by the input pipeline.
  """

  def __init__(self, embedding_dimension=128):
    """Builds the lookup + embedding stack.

    Args:
      embedding_dimension: Width of the article embedding. Defaults to 128.
    """
    super().__init__()
    self.embedding_dimension = embedding_dimension

    self.embedding = {}
    vocabulary = tf.strings.as_string(
        tf.convert_to_tensor(stats_dict['stats_dict']['article_id'])).numpy()
    self.embedding['article_id'] = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
        # One extra row for the out-of-vocabulary index added by the lookup.
        tf.keras.layers.Embedding(len(vocabulary) + 1,
                                  self.embedding_dimension),
    ])

  def call(self, features):
    """Embeds article ids.

    Args:
      features: String tensor of article ids (not a feature dict).

    Returns:
      The input with a trailing axis of size `embedding_dimension` appended.
    """
    return self.embedding['article_id'](features)

In [None]:
class QueryModel(tf.keras.Model):
  """Query tower: encodes the sequence features of one example.

  Every sequence feature gets its own lookup -> embedding -> GRU encoder. The
  GRU outputs are concatenated and sent through two branches (two cross layers
  and a stack of tanh Dense layers); the branches are concatenated and reduced
  to `embedding_dimension` by the final Dense layers.

  The `*_features_dense` names are declared but no layers are built for them,
  so `call` ignores those inputs.
  """

  def __init__(self, embedding_dimension=128):
    """Builds all sub-layers.

    Args:
      embedding_dimension: Width of every feature embedding, GRU state and of
        the returned query embedding. Defaults to 128.
    """
    super().__init__()
    self.embedding_dimension = embedding_dimension

    self.str_features_time = ['index_code', 'article_id']
    self.int_features_time = ['section_no', 'sales_channel_id', 'product_type_no', 'product_code',
                              'perceived_colour_value_id', 'perceived_colour_master_id', 'index_group_no',
                              'graphical_appearance_no', 'garment_group_no', 'department_no', 'colour_group_code']
    self.str_features_dense = ['fashion_news_frequency', 'club_member_status']
    self.int_features_dense = ['fn', 'active']

    self.time_embedding = {}
    # Kept for interface compatibility; intentionally left empty (see class docstring).
    self.dense_embedding = {}

    # Encoders for string-valued sequence features.
    for feature_name in self.str_features_time:
      if feature_name == 'article_id':
        # Article ids reach the model as strings, so the stats vocabulary is
        # converted to strings as well.
        vocabulary = tf.strings.as_string(
            tf.convert_to_tensor(stats_dict['stats_dict']['article_id'])).numpy()
      else:
        vocabulary = np.array(stats_dict['stats_dict'][feature_name])
      self.time_embedding[feature_name] = self._build_sequence_encoder(
          tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
          len(vocabulary), feature_name)

    # Encoders for integer-valued sequence features.
    for feature_name in self.int_features_time:
      vocabulary = tf.convert_to_tensor(stats_dict['stats_dict'][feature_name])
      self.time_embedding[feature_name] = self._build_sequence_encoder(
          tf.keras.layers.IntegerLookup(vocabulary=vocabulary, mask_value=None),
          len(vocabulary), feature_name)

    self._cross_layer_1 = tfrs.layers.dcn.Cross(
        projection_dim=self.embedding_dimension,
        kernel_initializer="glorot_uniform")

    self._cross_layer_2 = tfrs.layers.dcn.Cross(
        projection_dim=self.embedding_dimension,
        kernel_initializer="glorot_uniform")

    # The deep branch keeps the width of the concatenated GRU outputs: one
    # `embedding_dimension`-wide vector per sequence feature.
    num_time_features = len(self.str_features_time) + len(self.int_features_time)
    deep_units = int(self.embedding_dimension * num_time_features)
    self._deep_layers = [
        tf.keras.layers.Dense(deep_units, activation="tanh") for _ in range(4)]

    self.last_layers = [
        tf.keras.layers.Dense(self.embedding_dimension, activation='relu')
        for _ in range(2)]
    self.last_layers.append(
        tf.keras.layers.Dense(self.embedding_dimension, activation='tanh'))

  def _build_sequence_encoder(self, lookup_layer, vocabulary_size, feature_name):
    """Returns lookup -> embedding -> GRU for one sequence feature.

    Args:
      lookup_layer: StringLookup / IntegerLookup built on the feature's vocabulary.
      vocabulary_size: Number of vocabulary entries, without the OOV index.
      feature_name: Used to name the GRU layer.
    """
    return tf.keras.Sequential([
        lookup_layer,
        # One extra row for the out-of-vocabulary index added by the lookup.
        tf.keras.layers.Embedding(vocabulary_size + 1, self.embedding_dimension),
        tf.keras.layers.GRU(self.embedding_dimension, name=f'gru_{feature_name}',
                            return_sequences=False),
    ])

  def call(self, features):
    """Computes the query embedding.

    Args:
      features: Dict holding at least every name in `str_features_time` and
        `int_features_time`; other keys are ignored.

    Returns:
      Tensor whose last axis has size `embedding_dimension`.
    """
    # One GRU summary per sequence feature, string features first.
    sequence_summaries = [
        self.time_embedding[feature_name](features[feature_name])
        for feature_name in [*self.str_features_time, *self.int_features_time]
    ]
    concated_time = tf.concat(sequence_summaries, axis=1)

    # Cross branch.
    # NOTE(review): the second cross layer is called with a single argument, so
    # it treats the first layer's output as its base input rather than
    # `concated_time` — confirm this is intended.
    crossed = self._cross_layer_1(concated_time)
    crossed = self._cross_layer_2(crossed)

    # Deep branch.
    deep = concated_time
    for deep_layer in self._deep_layers:
      deep = deep_layer(deep)

    # Merge both branches and reduce to the final embedding width.
    x = tf.concat([deep, crossed], axis=1)
    for last_layer in self.last_layers:
      x = last_layer(x)

    return x

In [None]:
class CombinedModel(tfrs.models.Model):
  """Two-tower retrieval model trained with `tfrs.tasks.Retrieval`.

  Each tower is wrapped with its own Dense(64, tanh) projection. The top-K
  metric scores queries against the module-level `articles` dataset, embedded
  with the (projected) candidate tower.
  """

  # Batch keys forwarded to the query tower.
  _QUERY_FEATURE_KEYS = (
      "article_id",
      "product_code",
      "product_type_no",
      "graphical_appearance_no",
      "colour_group_code",
      "perceived_colour_value_id",
      "perceived_colour_master_id",
      "department_no",
      "index_code",
      "index_group_no",
      "section_no",
      "garment_group_no",
      "sales_channel_id",
      "fashion_news_frequency",
      "club_member_status",
      "fn",
      "active",
  )

  def __init__(self, query_model, candidate_model):
    """Wraps both towers and sets up the retrieval task.

    Args:
      query_model: Model mapping a feature dict to query embeddings.
      candidate_model: Model mapping article-id strings to embeddings.
    """
    super().__init__()
    self.query_model = tf.keras.Sequential([
      query_model,
      tf.keras.layers.Dense(64, activation='tanh')
    ])
    self.candidate_model = tf.keras.Sequential([
      candidate_model,
      tf.keras.layers.Dense(64, activation='tanh')
    ])
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(2048).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Returns the retrieval loss for one batch.

    Args:
      features: Batched feature dict holding the keys in
        `_QUERY_FEATURE_KEYS` plus "label_article_id".
      training: When True the top-K metrics are skipped.

    Returns:
      The loss tensor produced by the retrieval task.
    """
    # Pass an explicit, fixed set of keys to the query tower so that its input
    # structure does not depend on whatever else the batch contains.
    query_embeddings = self.query_model(
        {key: features[key] for key in self._QUERY_FEATURE_KEYS})

    # The label is parsed with shape [1], so a batch arrives as (batch, 1).
    # Flatten it so the candidate tower returns (batch, dim), matching the
    # query embeddings and the metric candidates built from scalar ids.
    label_ids = tf.reshape(features['label_article_id'], [-1])
    candidate_embeddings = self.candidate_model(label_ids)

    return self.task(
        query_embeddings, candidate_embeddings, compute_metrics=not training)

In [None]:
# Instantiate both towers; their vocabularies are read from `stats_dict`.
query_model = QueryModel()
candidate_model = CandidateModel()



In [None]:
# Combine the towers into the retrieval model and compile it with Adagrad.
model = CombinedModel(query_model, candidate_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01))

In [None]:
# Batch both splits and cache the batched elements.
# NOTE(review): cache() comes after shuffle()/batch(), so the batches produced
# on the first pass are the ones replayed on later epochs — confirm that
# per-epoch reshuffling is not wanted here.
cached_train = train_ds.shuffle(5_000).batch(1024).cache()
cached_test = test_ds.batch(1024).cache()

In [None]:
# Train for 20 epochs and keep the object returned by fit().
history = model.fit(cached_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20

In [None]:
# Evaluate on the test batches. compute_loss enables the retrieval metrics
# whenever it is called with training=False.
model.evaluate(cached_test)