# Installing packages


In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

[K     |████████████████████████████████| 88 kB 2.8 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.6 kB/s 
[K     |████████████████████████████████| 438 kB 53.3 MB/s 
[K     |████████████████████████████████| 5.8 MB 39.2 MB/s 
[K     |████████████████████████████████| 1.6 MB 66.4 MB/s 
[K     |████████████████████████████████| 4.3 MB 5.1 MB/s 
[K     |████████████████████████████████| 11.2 MB 5.6 MB/s 
[?25h

### Importing required library 

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
from typing import Dict, Text
import random
from collections import Counter

In [4]:
## loading data in colab
!gdown https://drive.google.com/uc?id=1SbdFzlp7HWumwXLauwjeqrgW9M1kKLUp
!unzip hmdata.zip

Downloading...
From: https://drive.google.com/uc?id=1SbdFzlp7HWumwXLauwjeqrgW9M1kKLUp
To: /content/hmdata.zip
100% 773M/773M [00:04<00:00, 182MB/s]
Archive:  hmdata.zip
   creating: hmdata/
  inflating: hmdata/customers.csv.zip  
  inflating: __MACOSX/hmdata/._customers.csv.zip  
  inflating: hmdata/articles.csv.zip  
  inflating: __MACOSX/hmdata/._articles.csv.zip  
  inflating: hmdata/transactions_train.csv.zip  
  inflating: __MACOSX/hmdata/._transactions_train.csv.zip  
  inflating: hmdata/sample_submission.csv.zip  
  inflating: __MACOSX/hmdata/._sample_submission.csv.zip  


### reading data

In [5]:
article_df = pd.read_csv("hmdata/articles.csv.zip")
customer_df = pd.read_csv("hmdata/customers.csv.zip")
transaction_df = pd.read_csv('hmdata/transactions_train.csv.zip')

# Feature Processing

In [6]:
def create_age_interval(x):
    if x <= 25:
        return [16, 25]
    elif x <= 35:
        return [26, 35]
    elif x <= 45:
        return [36, 45]
    elif x <= 55:
        return [46, 55]
    elif x <= 65:
        return [56, 65]
    else:
        return [66, 99]

customer_df["FN"].fillna(0, inplace=True)
customer_df["Active"].fillna(0, inplace=True)

# Set unknown the club member status & news frequency
customer_df["club_member_status"].fillna("UNKNOWN", inplace=True)

customer_df["fashion_news_frequency"] = customer_df["fashion_news_frequency"].replace({"None":"NONE"})
customer_df["fashion_news_frequency"].fillna("UNKNOWN", inplace=True)

# Set missing values in age with the median
customer_df["age"].fillna(customer_df["age"].median(), inplace=True)
customer_df["age_interval"] = customer_df["age"].apply(lambda x: create_age_interval(x))

customer_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_interval
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,"[46, 55]"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,"[16, 25]"
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,"[16, 25]"
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,"[46, 55]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,"[46, 55]"


In [7]:
# Replace missing values
article_df.fillna(value="No Description", inplace=True)

In [8]:
article_df['article_id'] = article_df['article_id'].astype(str)
article_df['article_id'] = article_df['article_id'].apply(lambda x: x.zfill(10))

unique_customer_ids = customer_df.customer_id.unique()
unique_article_ids = article_df.article_id.unique()

article_ds = tf.data.Dataset.from_tensor_slices(dict(article_df[['article_id']]))
articles = article_ds.map(lambda x: x['article_id'])

In [9]:
# add 0 in article_id column (string)
transaction_df['article_id'] = transaction_df['article_id'].astype(str)
transaction_df['article_id'] = transaction_df['article_id'].apply(lambda x: x.zfill(10))

In [10]:
## merge article data to transaction data
articles_for_merge = article_df[['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'index_name']]
articles_for_merge = transaction_df[['customer_id', 'article_id', 'price', 't_dat']].merge(articles_for_merge, on='article_id', how='left')
articles_for_merge = articles_for_merge.merge(customer_df[['customer_id','age_interval']],on="customer_id", how ='left')

In [16]:
import gc
del transaction_df
del customer_df
del article_df
gc.collect()

551

In [18]:
## select 6 months data
articles_for_merge = articles_for_merge[articles_for_merge['t_dat'] <='2019-03-20']
articles_for_merge['age_interval'] = articles_for_merge['age_interval'].astype(str)
age_vec = articles_for_merge['age_interval'].unique()

train = articles_for_merge[articles_for_merge['t_dat']<='2018-12-20']
test = articles_for_merge[articles_for_merge['t_dat'] >='2018-12-20']

In [19]:
# Recall@k = (# of recommended items @k that are relevant) / (total # of relevant items)
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

def estimateRecall(cg, purchase):
    return len(intersection(cg,purchase))/len(purchase)

In [20]:
c1 = train['customer_id'].to_list()
c2 = test['customer_id'].to_list()
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

commonUsers = intersection(c1, c2)

print(len(train),len(test),len(c1),len(c2),len(commonUsers))

3976170 3703537 3976170 3703537 303704


In [21]:
#Candidate Generator 1: random products
    
def getKCandidatesRandom(u,k):
    return random.sample(train['article_id'].to_list(),k)

In [22]:
#Candidate Generator 2: top-k popular products based on historic interactions

def getTopKCandidatesPopular(u,k):
    c = Counter(train['article_id'].to_list())
    topPopular = [article for article, count in c.most_common(k)]
    return topPopular

In [23]:
# with the 2 CGs defined, we can now query these CGs for a set of users,
# and compute the recall metrics to see how well they perform.

def runCandidateGeneration(method,k):
    overallRecall = 0
    recallList = []
    i=0
    userSet = commonUsers[0:10]
    for u in userSet:
        purchaseList = test[test['customer_id']==u]['article_id'].to_list()
        cg = method(u,k)
        r = estimateRecall(cg,purchaseList)
        overallRecall+=r
        recallList.append(r)
    overallRecall = overallRecall/len(userSet)
    return (overallRecall,recallList)

In [17]:
# Running metric calculations for the CG 1: random CG, to fetch 100 and 1000 candidates.
r, rlist = runCandidateGeneration(getKCandidatesRandom,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)

r, rlist = runCandidateGeneration(getKCandidatesRandom,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.0
recall for candidate generation at k=1000 candidates fetched:  0.11754079254079255


In [18]:
# Running metric calculations for CG 2: top popular products CG, to fetch 100 and 1000 candidates.

r, rlist = runCandidateGeneration(getTopKCandidatesPopular,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)

r, rlist = runCandidateGeneration(getTopKCandidatesPopular,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.06678321678321678
recall for candidate generation at k=1000 candidates fetched:  0.20174825174825176


**Goal for this week:**
Implement two personalized non-learnt candidate generators:

1.   CG personalized to user’s interest
2.   Personalized to user’s price sensitivity

Once implemented, compare and report recall-k metrics together with the results of the 2 non-personalized CG and two-tower model. Please use k=100, and k=1000 to report the metrics.

**[optional] Adding features to two-tower model**
The two-tower we implemented just uses the customer-ID and article-ID as features. It'd be good to add additional features about customer and articles as input features and see how the recall@k performance changes.

In [24]:
##Candidate Generator 3: CG personalized to user’s interest

def getCandidatesUserinterest(u,k):
  """
  Get candidates for a user based on user's intreset
  Definition of Interest may depend on business objective 
  but for similicity I'm assuming that users Interest can be seen through purchase item category
  I'll be using index_name as group mapping for candidates/articles
  """

  ## filter train data and group by index name to get Top category user bought in past
  top_category = train[train['customer_id'] == u].groupby(
      ['index_name'])['customer_id'].count().sort_values(ascending=False).reset_index().iloc[0,0]
  
  category_index = Counter(train[train['index_name'] == top_category]['article_id'].to_list())
  topPopular = [article for article, count in category_index.most_common(k)]

  return topPopular

In [46]:
# Running metric calculations for the CG : CG based on User Interest, to fetch 100 and 1000 candidates.
r, rlist = runCandidateGeneration(getCandidatesUserinterest,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)

r, rlist = runCandidateGeneration(getCandidatesUserinterest,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.025874125874125874
recall for candidate generation at k=1000 candidates fetched:  0.26223776223776224


**Conclusion:** As per above experiment we can see CG based on user's interest performs better than popular CG



 

In [25]:
## Candidate Generator 4: CG Personalized to user’s price sensitivity

def getCandidatesPriceSensitive(u,k):
  """
  This CG tries to suggest popular candidates within the price range of past transactions
  """

  user_data = train[train['customer_id'] == u]  
  min_price = user_data['price'].min()
  max_price = user_data['price'].max()

  ## selecting popular candidates within price range of customer
  category_index = Counter(train[(min_price <= train['price']) & (train['price']<= max_price)]['article_id'].to_list())
  top_items = [article for article, count in category_index.most_common(k)]

  return top_items

In [71]:
# Running metric calculations for the CG : CG based on User Interest, to fetch 100 and 1000 candidates.
r, rlist = runCandidateGeneration(getCandidatesPriceSensitive,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)

r, rlist = runCandidateGeneration(getCandidatesPriceSensitive,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.075
recall for candidate generation at k=1000 candidates fetched:  0.2826923076923077


**Conclusion**: As per above experiment we can see CG Personalized to user’s price sensitivity performs better than previous tried CG

**[Optional CG] Adding features to two-tower model**

In [32]:
embedding_dimension = 64

class customerModel(tf.keras.Model):

  def __init__(self):
    super().__init__()


    self.user_embedding = tf.keras.Sequential([
         tf.keras.layers.StringLookup(
         vocabulary=unique_customer_ids, mask_token=None),  
         tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dimension)
    ])

    self.age_embedding = tf.keras.Sequential([
         tf.keras.layers.StringLookup(
         vocabulary=age_vec, mask_token=None),  
         tf.keras.layers.Embedding(len(age_vec) + 1, embedding_dimension)
    ])
    
  def call(self, inputs):

    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.user_embedding(inputs["customer_id"]),
        self.age_embedding(inputs["age_interval"])
    ], axis=1)

In [33]:
customer_model = tf.keras.Sequential([
      customerModel(),
      tf.keras.layers.Dense(64, activation='relu')
    ])

In [34]:
article_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_article_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dimension),
  tf.keras.layers.Dense(64, activation='relu')
])

In [35]:
class HandMModel(tfrs.Model):
    
    def __init__(self, customer_model, article_model):
        super().__init__()
        self.article_model: tf.keras.Model = article_model
        self.customer_model: tf.keras.Model = customer_model
        self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.article_model),            
            ),
        )        

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
    
        customer_embeddings = self.customer_model(features)    
        article_embeddings = self.article_model(features["article_id"])

        # The task computes the loss and the metrics.
        return self.task(customer_embeddings, article_embeddings,compute_metrics=not training)

In [36]:
model = HandMModel(customer_model, article_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [37]:
num_epochs = 4
train_ds = tf.data.Dataset.from_tensor_slices(dict(train[['customer_id','age_interval','article_id']])).shuffle(100_000).batch(256).cache()
history = model.fit(train_ds,epochs=num_epochs, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [38]:
test_ds = tf.data.Dataset.from_tensor_slices(dict(test.sample(2000)[['customer_id','age_interval','article_id']])).batch(500).cache()
model.evaluate(test_ds, return_dict=True)



{'factorized_top_k/top_100_categorical_accuracy': 0.026499999687075615,
 'factorized_top_k/top_10_categorical_accuracy': 0.006500000134110451,
 'factorized_top_k/top_1_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_50_categorical_accuracy': 0.01850000023841858,
 'factorized_top_k/top_5_categorical_accuracy': 0.003000000026077032,
 'loss': 3931.68896484375,
 'regularization_loss': 0,
 'total_loss': 3931.68896484375}

In [41]:
CustToAge = dict(zip(articles_for_merge['customer_id'], articles_for_merge['age_interval']))

def getTopKCandidates2Tower(u,k):
    # Create a model that takes in raw customer features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model,k)
    # recommends articles out of the entire dataset.
    index.index_from_dataset(
        tf.data.Dataset.zip((articles.batch(1000), articles.batch(1000).map(model.article_model)))
    )
    _, candidates = index({'customer_id':tf.constant([u]), 'age_interval':tf.constant([CustToAge[u]])})
    candidates = candidates.numpy().tolist()[0]
    cg = []
    for c in candidates:
        cg.append(c.decode("utf-8") )
    return (cg)

In [None]:
r, rlist = runCandidateGeneration(getTopKCandidates2Tower,k=100)

In [46]:
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=1000 candidates fetched:  0.017424242424242425


In [None]:
r, rlist = runCandidateGeneration(getTopKCandidates2Tower,k=1000)

In [48]:
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=1000 candidates fetched:  0.07622529644268775


**Conclusion:** for Two tower Model after inclduing age as extra feature in user feature space below were recall metrics:

*   recall for candidate generation at k=1000 candidates fetched:  0.017424242424242425

*   recall for candidate generation at k=1000 candidates fetched:  0.07622529644268775



