In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from typing import Dict, Text
from tensorflow import keras
from tqdm.notebook import tqdm
tqdm.pandas()
import tensorflow_recommenders as tfrs
try:
    if not tf.config.list_physical_devices('GPU'):
        assert tf.__version__ >= "2.0"
        print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
        if IS_COLAB:
            print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
        if IS_KAGGLE:
            print("Go to Settings > Accelerator and select GPU.")
except:
    if not tf.test.is_gpu_available():
        assert tf.__version__ >= "2.0"
        print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
        if IS_COLAB:
            print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
        if IS_KAGGLE:
            print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os
import datetime as dt
from pathlib import Path
from datetime import datetime
# to make this notebook's output stable across runs
np.random.seed(42)
try:
    if not tf.config.list_physical_devices('GPU'):
        tf.random.set_seed(42)
    else:
        tf.random.set_random_seed(42)
except:
    if not tf.test.is_gpu_available():
        tf.random.set_seed(42)
    else:
        tf.random.set_random_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [2]:
def zero_f(item):
    item=str(item)
    tem=len(item)
    if(len(item)<10):
        item=item.zfill(10)
    return item

In [3]:
trans_train = pd.read_csv('h-and-m-personalized-fashion-recommendations/transactions_train.csv',dtype={'customer_id': str,'article_id':str})
trans_train['quantity']=1
articles= pd.read_csv('h-and-m-personalized-fashion-recommendations/articles.csv',dtype={'article_id': str,'product_code':str})
master_df = trans_train[['customer_id','article_id']].astype(str)
master_df['dt']=pd.to_datetime(trans_train['t_dat'],format="%Y-%m-%d")
master_df['article_id']=master_df['article_id'].apply(zero_f)
master_df['dt']=master_df.dt.values.astype(np.int64)
master_df['quantity'] = trans_train['quantity'].astype(float)
masterdf = master_df

In [4]:
interactions_dict = masterdf.groupby(['customer_id', 'article_id', 'dt'])[ 'quantity'].sum().reset_index()
interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)
items_dict = articles[['article_id']].drop_duplicates()
items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)
interactions = interactions.map(lambda x: {
                                            'customer_id' : x['customer_id'], 
                                            'article_id' : x['article_id'],
                                            'quantity' : float(x['quantity']),
                                            "dt": x["dt"] })
items = items.map(lambda x: x['article_id'])

In [5]:
timestamps = np.concatenate(list(interactions.map(lambda x: x["dt"]).batch(10000)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)
### get unique item and user id's as a lookup table
unique_items = np.unique(np.concatenate(list(items.batch(1_000))))
unique_user_ids = np.unique(np.concatenate(list(interactions.batch(1_000_000).map(lambda x: x["customer_id"]))))

# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = interactions.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(60_000)
test = shuffled.skip(60_000).take(20_000)
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [6]:
class UserModel(tf.keras.Model):

    def __init__(self, use_timestamps):
        super().__init__()

        self._use_timestamps = use_timestamps

        ## embed user id from unique_user_ids
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])

        ## embed timestamp
        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential([
              tf.keras.layers.Discretization(timestamp_buckets.tolist()),
              tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
            ])
            self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)

            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
              return self.user_embedding(inputs["customer_id"])

        ## all features here
        return tf.concat([
            self.user_embedding(inputs["customer_id"]),
            self.timestamp_embedding(inputs["dt"]),
            tf.reshape(self.normalized_timestamp(inputs["dt"]), (-1, 1)),
    ], axis=1)

In [7]:
class ItemModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        max_tokens = 10_000

        ## embed title from unique_item_titles
        self.title_embedding = tf.keras.Sequential([
                      tf.keras.layers.StringLookup(
                          vocabulary=unique_items, mask_token=None),
                      tf.keras.layers.Embedding(len(unique_items) + 1, 32)])

        ## processing text features: item title vectorizer (see self.title_vectorizer)
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens)

        ## we apply title vectorizer to items
        self.title_text_embedding = tf.keras.Sequential([
                              self.title_vectorizer,
                              tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                              tf.keras.layers.GlobalAveragePooling1D(),])

        self.title_vectorizer.adapt(items)

    def call(self, titles):
        return tf.concat([
            self.title_embedding(titles),
            self.title_text_embedding(titles),], axis=1)

In [8]:
class CandidateModel(tfrs.models.Model):
    def __init__(self, use_timestamps):
        super().__init__()

        ## query model is user model
        self.query_model = tf.keras.Sequential([
                          UserModel(use_timestamps),
                          tf.keras.layers.Dense(64,activation="relu"),
                          tf.keras.layers.Dense(32)])
        
        ## candidate model is the item model
        self.candidate_model = tf.keras.Sequential([
                              ItemModel(),
                              tf.keras.layers.Dense(64,activation="relu"),
                              tf.keras.layers.Dense(32)])
        
        ## retrieval task, choose metrics
        self.task = tfrs.tasks.Retrieval(
                    metrics=tfrs.metrics.FactorizedTopK(
                        candidates=items.batch(128).map(self.candidate_model),),)

    def compute_loss(self, features, training=False):
        # We only pass the user id and timestamp features into the query model. This
        # is to ensure that the training inputs would have the same keys as the
        # query inputs. Otherwise the discrepancy in input structure would cause an
        # error when loading the query model after saving it.
        
        query_embeddings = self.query_model({ "customer_id": features["customer_id"],
                                               "dt": features["dt"],})
        
        item_embeddings = self.candidate_model(features["article_id"])

        return self.task(query_embeddings, item_embeddings)

In [9]:
model = CandidateModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=10)
model.evaluate(cached_test, return_dict=True)

Epoch 1/10
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Consider rewriting this model with the Functional API.


{'factorized_top_k/top_1_categorical_accuracy': 0.0003499999875202775,
 'factorized_top_k/top_5_categorical_accuracy': 0.0023499999660998583,
 'factorized_top_k/top_10_categorical_accuracy': 0.004149999935179949,
 'factorized_top_k/top_50_categorical_accuracy': 0.012500000186264515,
 'factorized_top_k/top_100_categorical_accuracy': 0.021900000050663948,
 'loss': 40616.44921875,
 'regularization_loss': 0,
 'total_loss': 40616.44921875}

In [10]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(items.batch(100).map(lambda items: (items,model.candidate_model(items))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x25eaabeed00>

In [11]:
_, titles = index({"customer_id": np.array(["000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210"]),
    "dt": np.array([879024327])}, k=3)
print(f"Top recommendations: {titles[0]}")

array([b'000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210'],
      dtype=object)>, 'dt': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([879024327])>}
Consider rewriting this model with the Functional API.
Top recommendations: [b'0575533011' b'0671433004' b'0399136009']


In [26]:
import datetime
start = datetime.datetime(2020, 9, 23)
dt_array = np.array([start + datetime.timedelta(days=i) for i in range(7)])
from datetime import datetime
squares = np.array([datetime.timestamp(xi) for xi in dt_array])

In [27]:
squares

array([1.6008336e+09, 1.6009200e+09, 1.6010064e+09, 1.6010928e+09,
       1.6011792e+09, 1.6012656e+09, 1.6013520e+09])

In [30]:
for item in squares:
    _, titles = index({"customer_id": np.array(["000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210"]),
    "dt": np.array([item])}, k=12)
    print(f"Top recommendations: {titles[0]}")

array([b'000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210'],
      dtype=object)>, 'dt': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.6008335e+09], dtype=float32)>}
Consider rewriting this model with the Functional API.
Top recommendations: [b'0575533011' b'0671433004' b'0399136009' b'0594834002' b'0667499009'
 b'0644873003' b'0667491011' b'0717593001' b'0683356005' b'0696628002'
 b'0598806001' b'0639091008']
array([b'000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210'],
      dtype=object)>, 'dt': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.60092e+09], dtype=float32)>}
Consider rewriting this model with the Functional API.
Top recommendations: [b'0575533011' b'0671433004' b'0399136009' b'0594834002' b'0667499009'
 b'0644873003' b'0667491011' b'0717593001' b'0683356005' b'0696628002'
 b'0598806001' b'0639091008']
array([b'000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210'],
      dtype=object)>, 'dt': <tf.Tensor: shape=(1,