In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from typing import Dict, Text
from tensorflow import keras
from tqdm.notebook import tqdm
tqdm.pandas()
import tensorflow_recommenders as tfrs
def _gpu_detected():
    """Return True when TensorFlow reports at least one usable GPU.

    Tries ``tf.config.list_physical_devices`` first and falls back to
    ``tf.test.is_gpu_available`` if that call fails (for example because the
    attribute does not exist on the installed TensorFlow build).
    """
    try:
        return bool(tf.config.list_physical_devices('GPU'))
    except Exception:
        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
        # keeps KeyboardInterrupt / SystemExit propagating to the user.
        return bool(tf.test.is_gpu_available())


if not _gpu_detected():
    assert tf.__version__ >= "2.0"
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os
from pathlib import Path
from datetime import datetime
# Make this notebook's output stable across runs.
# Seed NumPy and TensorFlow unconditionally: the seed must not depend on
# whether a GPU is present, and tf.random.set_seed is the TF2 seeding call
# (this notebook requires TensorFlow >= 2.0).
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default label sizes: 14 for axis labels, 12 for x/y tick labels.
for _rc_group, _label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(_rc_group, labelsize=_label_size)

KeyboardInterrupt: 

In [None]:
def zero_f(item):
    """Return ``item`` as a string left-padded with zeros to at least 10 characters.

    Args:
        item: Any value; it is converted with ``str()`` first.

    Returns:
        The string form of ``item``, zero-padded on the left to width 10.
        Strings that are already 10 characters or longer are returned unchanged.
    """
    # str.zfill is a no-op when the string is already wide enough, so no
    # explicit length check is needed.
    return str(item).zfill(10)

In [None]:
# --- Transactions: every row counts as one purchased unit. ---
trans_train = pd.read_csv(
    'h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'customer_id': str, 'article_id': str},
)
trans_train = trans_train.assign(quantity=1)
# t_dat is not parsed as a date here, so this is a string comparison.
trans_train = trans_train[trans_train['t_dat'] > '2020-06-15']

# --- Article and customer tables (ids kept as strings). ---
articles = pd.read_csv(
    'h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str, 'product_code': str},
)
customers = pd.read_csv(
    'h-and-m-personalized-fashion-recommendations/customers.csv',
    dtype={'customer_id': str},
)

# --- Working frame: string keys, zero-padded article ids, float quantity. ---
master_df = trans_train[['customer_id', 'article_id', 't_dat']].astype(str)
master_df = master_df.assign(
    article_id=master_df['article_id'].apply(zero_f),
    quantity=trans_train['quantity'].astype(float),
)
masterdf = master_df

In [None]:
# Step 1: total quantity per customer, article and purchase date.
daily_totals = (
    masterdf
    .groupby(['customer_id', 'article_id', 't_dat'])['quantity']
    .sum()
    .reset_index()
)
daily_totals['t_dat'] = pd.to_datetime(daily_totals['t_dat'])

# Step 2: roll the dated totals up into monthly bins per customer/article.
monthly_keys = ['customer_id', 'article_id', pd.Grouper(key="t_dat", freq="1M")]
interactions = daily_totals.groupby(monthly_keys)['quantity'].sum().reset_index()
interactions

In [None]:
# Display the interactions ordered by t_dat (the result is not stored).
interactions.sort_values(by='t_dat')

In [None]:
# Display the last 10000 rows of the interactions ordered by t_dat.
interactions.sort_values(by='t_dat').tail(10000)

In [None]:
# Time-based split of `interactions` on its t_dat column.
# NOTE(review): t_dat was binned with a monthly Grouper when `interactions`
# was built, so its values should be month-end stamps. If so, no row can fall
# in the (2020-09-18, 2020-09-25] window and `valid` is always empty — confirm
# and choose cut-offs that match the binning.
train_cutoff = pd.Timestamp('2020-09-18')
valid_cutoff = pd.Timestamp('2020-09-25')
event_dates = interactions['t_dat']

train = interactions[event_dates <= train_cutoff]
valid = interactions[(event_dates > train_cutoff) & (event_dates <= valid_cutoff)]
test = interactions[event_dates > valid_cutoff]
train

In [None]:
# Display the validation split.
valid

In [None]:
# Display the test split.
test

In [None]:
def _interactions_dataset(frame):
    """Build an unbatched tf.data.Dataset of feature dicts from an interactions frame.

    Only the customer_id, article_id and quantity columns are kept.
    """
    return tf.data.Dataset.from_tensor_slices(
        dict(frame[['customer_id', 'article_id', 'quantity']]))


# Cache BEFORE shuffling: caching after shuffle() would store the first
# shuffled order and replay identical batches on every epoch.
train_ds = _interactions_dataset(train).cache().shuffle(1_000_000).batch(256)
valid_ds = _interactions_dataset(valid).batch(256).cache()
test_ds = _interactions_dataset(test).batch(256).cache()

# Column-name -> array dicts of the distinct article / customer ids.
items_dict = {name: np.array(value)
              for name, value in articles[['article_id']].drop_duplicates().items()}
customer_dict = {name: np.array(value)
                 for name, value in customers[['customer_id']].drop_duplicates().items()}

# NOTE: `customers` is rebound from the DataFrame to a tf.data.Dataset here, so
# this cell cannot be re-run without first re-running the data-loading cell.
customers = tf.data.Dataset.from_tensor_slices(customer_dict).map(lambda x: x['customer_id'])
items = tf.data.Dataset.from_tensor_slices(items_dict).map(lambda x: x['article_id'])

In [None]:
def _distinct_values(dataset):
    """Read `dataset` in batches of 1000 and return its sorted unique values."""
    chunks = [chunk.numpy() for chunk in dataset.batch(1_000)]
    return np.unique(np.concatenate(chunks))


# Lookup vocabularies for the embedding models.
unique_items = _distinct_values(items)
unique_user_ids = _distinct_values(customers)

# Reset TensorFlow's global random seed.
tf.random.set_seed(42)

In [None]:
def my_l1_regularizer(weights):
    """Return the sum of absolute values of ``0.01 * weights`` (an L1 penalty)."""
    scaled = 0.01 * weights
    magnitudes = tf.abs(scaled)
    return tf.reduce_sum(magnitudes)
def my_positive_weights(weights):
    """Return ``weights`` with every negative entry replaced by zero."""
    is_negative = weights < 0.
    zeros = tf.zeros_like(weights)
    return tf.where(is_negative, zeros, weights)

In [None]:
class UserModel(tf.keras.Model):
    """Maps customer-id strings to 64-dimensional embedding vectors.

    The lookup vocabulary is the module-level ``unique_user_ids`` array.
    """

    def __init__(self):
        super().__init__()

        # String id -> integer index, using unique_user_ids as the vocabulary.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None)
        # One row more than the vocabulary size (presumably for the lookup's
        # out-of-vocabulary index).
        id_embedding = tf.keras.layers.Embedding(len(unique_user_ids) + 1, 64)

        self.user_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    def call(self, inputs):
        """Return the embedding for each id in ``inputs``."""
        return self.user_embedding(inputs)


In [None]:
class ItemModel(tf.keras.Model):
    """Embeds item strings two ways and concatenates both embeddings.

    * ``title_embedding``: StringLookup over ``unique_items`` followed by a
      64-dimensional Embedding.
    * ``title_text_embedding``: TextVectorization (capped at 10,000 tokens),
      a 64-dimensional Embedding with ``mask_zero=True`` and global max pooling.

    The vectorizer is adapted on the module-level ``items`` dataset at
    construction time.
    """

    def __init__(self):
        super().__init__()
        max_tokens = 10_000

        # Branch 1: whole-string lookup -> embedding.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_items, mask_token=None)
        id_embedding = tf.keras.layers.Embedding(len(unique_items) + 1, 64)
        self.title_embedding = tf.keras.Sequential([id_lookup, id_embedding])

        # Branch 2: tokenise the string, embed the tokens, pool over tokens.
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens)
        token_embedding = tf.keras.layers.Embedding(max_tokens, 64, mask_zero=True)
        token_pooling = tf.keras.layers.GlobalMaxPooling1D()
        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            token_embedding,
            token_pooling,
        ])

        # Learn the vectorizer vocabulary from the items dataset.
        self.title_vectorizer.adapt(items)

    def call(self, titles):
        """Return both embeddings of ``titles`` concatenated along axis 1."""
        lookup_vectors = self.title_embedding(titles)
        text_vectors = self.title_text_embedding(titles)
        return tf.concat([lookup_vectors, text_vectors], axis=1)

In [None]:
class CandidateModel(tfrs.models.Model):
    """Two-tower retrieval model over customers (queries) and articles (candidates).

    Both towers share the same architecture: an embedding model followed by
    three Dense(relu) -> BatchNormalization -> Dropout(0.3) stages of widths
    256, 128 and 64, and a final 32-unit Dense projection with an L1 kernel
    regularizer and a non-negativity kernel constraint.
    """

    def __init__(self):
        super().__init__()

        ## query model is user model
        self.query_model = self._build_tower(UserModel())

        ## candidate model is the item model
        self.candidate_model = self._build_tower(ItemModel())

        ## retrieval task, choose metrics
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=items.batch(128).map(self.candidate_model),
            ),
        )

    @staticmethod
    def _build_tower(embedding_model):
        """Wrap ``embedding_model`` in the shared dense stack used by both towers."""
        layers = [embedding_model]
        for units in (256, 128, 64):
            layers.extend([
                tf.keras.layers.Dense(units, activation="relu",
                                      kernel_initializer='he_normal', use_bias=False),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.3),
            ])
        layers.append(
            tf.keras.layers.Dense(32, kernel_regularizer=my_l1_regularizer,
                                  kernel_constraint=my_positive_weights))
        return tf.keras.Sequential(layers)

    def compute_loss(self, features, training=False):
        """Compute the retrieval loss for one batch.

        Args:
            features: Mapping with "customer_id" and "article_id" entries.
            training: Whether the call happens during training.

        Returns:
            The value produced by the retrieval task for this batch.
        """
        # Forward the training flag explicitly so Dropout and
        # BatchNormalization run in training mode during fit() and in
        # inference mode during evaluate().
        query_embeddings = self.query_model(features["customer_id"], training=training)
        item_embeddings = self.candidate_model(features["article_id"], training=training)

        return self.task(query_embeddings, item_embeddings)

In [None]:
model = CandidateModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
# train_ds / valid_ds are already-batched tf.data datasets, so no batch_size
# is passed to fit(): the datasets themselves define the batch size.
model.fit(train_ds, validation_data=valid_ds, epochs=3)
model.evaluate(test_ds, return_dict=True)

In [None]:
# Brute-force top-k index that embeds queries with the trained query tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# Pair each batch of 100 article ids with its candidate-tower embeddings.
candidate_pairs = items.batch(100).map(
    lambda article_ids: (article_ids, model.candidate_model(article_ids)))
index.index_from_dataset(candidate_pairs)

In [None]:
# Spot check: top-3 results for one customer id.
sample_customer = np.array(["000231cc9af9e58ab4edc66fbd61da921b144ba85bc1c00d0ae2309531e4c210"])
_, titles = index(sample_customer, k=3)
print(f"Top recommendations: {titles[0]}")

In [None]:
# Spot check: top-12 results for another customer id.
sample_customer = np.array(["000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318"])
_, titles = index(sample_customer, k=12)
print(f"Top recommendations: {titles[0]}")

In [None]:
def decoder(e):
    """Decode the bytes value ``e`` as UTF-8 and return the resulting string."""
    return str(e, 'UTF-8')
def run_f(item):
    """Return the top-12 index results for ``item`` as one space-separated string.

    ``item`` is wrapped in a one-element batch, passed to the module-level
    ``index`` with ``k=12``, and the returned identifiers of that single row
    are UTF-8 decoded and joined with spaces.
    """
    _, titles = index(tf.constant([item]), k=12)
    raw_ids = np.array(titles[0])
    return " ".join(decoder(raw_id) for raw_id in raw_ids)

In [None]:
# Load the sample submission; customer ids are kept as strings.
submission_file = pd.read_csv(
    'h-and-m-personalized-fashion-recommendations/sample_submission.csv',
    dtype={'customer_id': str},
)
sub_cust = submission_file["customer_id"]
sub_df = pd.DataFrame(columns=['Customer_Id', 'Article_Id'])

# One index lookup per customer (run_f), with a tqdm progress bar.
predicted = sub_cust.progress_apply(run_f)
submission_file["prediction"] = predicted
submission_file

In [None]:
# Write the submission frame to submission.csv without the index column.
submission_file.to_csv('submission.csv',index=False)