In [2]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install tensorflow_recommenders

In [4]:
### Import necessary libraries

from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_recommenders as tfrs

import os
import pprint
import tempfile

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load only the first 100,000 transactions. `nrows` makes pandas stop parsing after
# that many rows instead of reading the entire CSV into memory and slicing afterwards.
trans_train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    nrows=100_000,
)
trans_train.head()

In [6]:
# master_df = trans_train[['customer_id','article_id','price']].astype(str)
# master_df['price'] = master_df['price'].astype(float)
# masterdf = master_df

In [7]:
# Every transaction row represents a single purchased unit.
trans_train['quantity'] = 1

# Keep only the columns the recommender needs: ids as strings, quantity as float.
purchase_columns = ['customer_id', 'article_id', 'quantity']
master_df = trans_train[purchase_columns].astype(
    {'customer_id': str, 'article_id': str, 'quantity': float}
)
masterdf = master_df

In [8]:
### Build the tf.data inputs: purchase interactions and the item catalogue

# Interactions: one row per (customer, article) pair with the summed quantity purchased.
pair_totals = masterdf.groupby(['customer_id', 'article_id'])['quantity'].sum().reset_index()

# from_tensor_slices is fed a mapping of column name -> array, so the frame is
# converted column by column before building the dataset.
interactions_dict = {column: np.array(pair_totals[column]) for column in pair_totals.columns}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

# Items: the distinct article ids, i.e. the candidates that can be recommended.
distinct_articles = masterdf[['article_id']].drop_duplicates()
items_dict = {column: np.array(distinct_articles[column]) for column in distinct_articles.columns}
items = tf.data.Dataset.from_tensor_slices(items_dict)


def _select_interaction_features(row):
    """Keep the features used by the model, casting quantity to float for a consistent dtype."""
    return {
        'customer_id': row['customer_id'],
        'article_id': row['article_id'],
        'quantity': float(row['quantity']),
    }


interactions = interactions.map(_select_interaction_features)

# The item dataset only needs the raw article id of each element.
items = items.map(lambda row: row['article_id'])

In [9]:
# ### define interactions data and user data

# ### interactions 
# ### here we create a reference table of the user , item, and quantity purchased
# interactions_dict = masterdf.groupby(['customer_id', 'article_id'])[ 'price'].sum().reset_index()

# ## we transform the table into a dictionary, which we then feed into tensor slices
# # this step is crucial as this will be the type of data fed into the embedding layers
# interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
# interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

# ## we do similar step for item, where this is the reference table for items to be recommended
# items_dict = masterdf[['article_id']].drop_duplicates()
# items_dict = {name: np.array(value) for name, value in items_dict.items()}
# items = tf.data.Dataset.from_tensor_slices(items_dict)

# ## map the features in interactions and items to an identifier that we will use throughout the embedding layers
# ## do it for all the items in interaction and item table
# ## you may often get itemtype error, so that is why here i am casting the quantity type as float to ensure consistency
# interactions = interactions.map(lambda x: {
#     'customer_id' : x['customer_id'], 
#     'article_id' : x['article_id'], 
#     'price' : float(x['price']),

# })

# items = items.map(lambda x: x['article_id'])

In [10]:
# Vocabularies for the lookup layers: the unique article ids and the unique customer ids,
# collected by walking each dataset in batches of 1,000 elements.
article_id_batches = [batch for batch in items.batch(1000)]
unique_item_titles = np.unique(np.concatenate(article_id_batches))

customer_ids = interactions.map(lambda row: row["customer_id"])
customer_id_batches = [batch for batch in customer_ids.batch(1_000)]
unique_user_ids = np.unique(np.concatenate(customer_id_batches))

In [11]:
### Shuffle the interactions and split them into train and test sets

# unique_item_titles and unique_user_ids are already computed by the preceding cell;
# recomputing them here only repeated two full passes over the datasets.

# Buffer covering all 100,000 loaded transaction rows, so the shuffle is over the whole dataset.
SHUFFLE_BUFFER_SIZE = 100_000
TRAIN_SIZE = 60_000
TEST_SIZE = 20_000

# Fixed seeds and reshuffle_each_iteration=False keep the shuffled order stable,
# so take/skip below always produce disjoint train and test sets.
tf.random.set_seed(42)
shuffled = interactions.shuffle(SHUFFLE_BUFFER_SIZE, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(TRAIN_SIZE)
test = shuffled.skip(TRAIN_SIZE).take(TEST_SIZE)

In [12]:
class RetailModel(tfrs.Model):
    """Two-tower retrieval model: a customer (query) tower and an article (candidate) tower.

    Args:
        user_model: Keras model that maps raw customer ids to embeddings.
        item_model: Keras model that maps raw article ids to embeddings.

    Note:
        The top-k metric candidates are taken from the notebook-level `items`
        dataset, which must be defined before the model is instantiated.
    """

    def __init__(self, user_model, item_model):
        super().__init__()

        # Use the towers supplied by the caller instead of rebuilding them here, so
        # the class does not depend on notebook globals such as the vocabularies or
        # embedding_dimension, and the arguments are not silently ignored.

        ### Candidate model (items): article id -> embedding
        self.item_model: tf.keras.Model = item_model

        ### Query model (users): customer id -> embedding
        self.user_model: tf.keras.Model = user_model

        ### For the retrieval model we take top-k accuracy as the metric,
        ### computed against the embeddings of every candidate item.
        metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(128).map(self.item_model))

        # Define the task, which is retrieval.
        task = tfrs.tasks.Retrieval(metrics=metrics)

        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the retrieval loss for one batch.

        Args:
            features: batch dictionary; the "customer_id" and "article_id" entries are used.
            training: accepted for the Keras/TFRS interface; not used in the computation.

        Returns:
            The value returned by the retrieval task for this batch.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["customer_id"])
        # And pick out the item features and pass them into the item model,
        # getting embeddings back.
        positive_item_embeddings = self.item_model(features["article_id"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_item_embeddings)

In [13]:
### Fitting and evaluating

### Dimensionality of the query and candidate representations.
embedding_dimension = 32

### Number of passes over the training data.
EPOCHS = 2


def build_id_tower(vocabulary, embedding_dim):
    """Return a Sequential model that maps raw string ids to embedding vectors.

    Args:
        vocabulary: array of the known string ids.
        embedding_dim: width of each embedding vector.

    Returns:
        A tf.keras.Sequential made of a StringLookup layer followed by an Embedding layer.
    """
    return tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
        # We add an additional embedding to account for unknown tokens.
        tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dim),
    ])


## The candidate (item) and query (user) towers share the same architecture
## and differ only in their vocabulary.
item_model = build_id_tower(unique_item_titles, embedding_dimension)
user_model = build_id_tower(unique_user_ids, embedding_dimension)

model = RetailModel(user_model, item_model)

# Adagrad with a learning rate of 0.1; a smaller rate would make the model learn more slowly.
# other optimizers, such as SGD and Adam, are listed here https://www.tensorflow.org/api_docs/python/tf/keras/optimizers
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

## fit the model for EPOCHS epochs
model_hist = model.fit(cached_train, epochs=EPOCHS)

# evaluate the model on the held-out interactions
model.evaluate(cached_test, return_dict=True)

In [14]:
# Inspect the item vocabulary (unique article ids) used by the item lookup layer.
unique_item_titles

In [15]:
# Inspect the user vocabulary (unique customer ids) used by the user lookup layer.
unique_user_ids 

In [16]:
# Brute-force top-k retrieval layer that uses the trained user tower as its query model.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index every candidate as an (article id, article embedding) pair, in batches of 100.
candidate_batches = items.batch(100).map(
    lambda article_ids: (article_ids, model.item_model(article_ids))
)
index.index_from_dataset(candidate_batches)

In [17]:
# Sanity check: retrieve the top-12 articles for a single customer id.
j = '00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657'
_, titles = index(tf.constant([j]), k=12)
# Format with the f-string alone: applying the % operator to an f-string would break
# if the already-interpolated text ever contained a '%' character.
print(f"Recommendations for user {j}: {titles[0]}")

In [18]:
# Turn the recommendations for the first (only) query into plain Python strings:
# the index returns byte strings, so each one is decoded as UTF-8.
t = np.array(titles[0])
l = list(map(lambda raw_id: raw_id.decode('UTF-8'), t))
print(l)

In [19]:
# Load only the first 10 rows of the sample submission. `nrows` avoids parsing the whole
# file, and yields a standalone frame rather than a slice that is later assigned into.
submission_file = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
    nrows=10,
)
submission_file

In [58]:
# Customer ids from the submission frame; these are the users to generate predictions for.
sub_cust = submission_file["customer_id"]
sub_cust

In [59]:
# Empty frame with the two output columns.
# NOTE(review): sub_df is not referenced by any later visible cell — confirm it is still needed.
sub_df = pd.DataFrame(columns=['Customer_Id', 'Article_Id'])
sub_df

In [97]:
# Build one space-separated prediction string (top-12 article ids) per customer.
# The strings are collected in a list and assigned to the column once after the loop:
# assigning a scalar to submission_file["prediction"] inside the loop would overwrite
# the whole column each time, leaving every row with the last customer's recommendations.
predictions = []
for customer_id in sub_cust:
    _, titles = index(tf.constant([customer_id]), k=12)
    # Decode the byte-string ids and left-pad them with zeros to 10 characters.
    article_ids = [raw_id.decode('UTF-8').zfill(10) for raw_id in np.array(titles[0])]
    prediction = " ".join(article_ids)
    print(f"Recommendations for user {customer_id}: {prediction}")
    predictions.append(prediction)

# sub_cust is the customer_id column of submission_file, so the list lines up row by row.
submission_file["prediction"] = predictions

In [98]:
# Display the submission frame after the prediction column has been filled in.
submission_file

In [66]:
# Retrieve the top-12 articles for a single customer id.
j = '00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657'
_, titles = index(tf.constant([j]), k=12)
# Format with the f-string alone: applying the % operator to an f-string would break
# if the already-interpolated text ever contained a '%' character.
print(f"Recommendations for user {j}: {titles[0]}")