In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
from typing import Dict, Text
import random
from collections import Counter

In [3]:
import tensorflow_recommenders as tfrs

In [4]:
import dask as dask
import dask.dataframe as dd
import dask.multiprocessing
import dask.distributed
import dask.diagnostics
import dask.bag as db
import dask.array as da
import dask.delayed as dd
import dask.bag as db
import dask.multiprocessing
import dask.distributed
#imports
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
#import dask.array as dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
import pandas as pd
import numpy as np

import dask_ml

## Loading the Data
- Downcasting to save Memory
- Use only necessary columns 

In [89]:
article_df = pd.read_csv("/Volumes/GoogleDrive/My Drive/data/hmdata/articles.csv.zip")
customer_df = pd.read_csv("/Volumes/GoogleDrive/My Drive/data/hmdata/customers.csv.zip")
train0 = pd.read_csv('/Volumes/GoogleDrive/My Drive/data/hmdata/transactions_train.csv.zip',usecols= ['t_dat', 'customer_id', 'article_id', 'price'],
dtype={'article_id':'int32','price':'float32','customer_id':'str'},
parse_dates=['t_dat'])
transaction_df = train0

#### Checking Memory Usage

In [90]:
def mem_usage(pandas_obj):
    '''
    Takes a pandas dataframe and shows it memory usage.
    '''
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2
    return "{:03.2f} MB".format(usage_mb)



#### Downcasting
- Reduced Memory Usage from 6426.93 MB to 4153.25 MB

In [91]:
print(f'Original Memory Usage: {"6426.93 MB"}')
print(f'Memory Usage after Downcasting {mem_usage(transaction_df)}')
transaction_df=transaction_df[transaction_df['t_dat']>='2019-09-20']
print(f'Memory Usage after Using only 1 year data {mem_usage(transaction_df)}')

Original Memory Usage: 6426.93 MB
Memory Usage after Downcasting 4153.25 MB
Memory Usage after Using only 1 year data 2072.09 MB


## Feature Engineering

#### Calculating Article Freshness 

In [None]:
transaction_df['year_month'] = pd.to_datetime(transaction_df['t_dat']).dt.to_period('M')

In [243]:
transaction_df.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,year_month
16803901,2019-09-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,745014003,0.020322,2019-09
16803902,2019-09-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,812484002,0.05422,2019-09


In [244]:
article_by_mth_yr = transaction_df.groupby(['article_id','year_month']).count()['customer_id'].to_frame('article_sold_yr_month').reset_index()

In [245]:
article_by_mth_yr.shape

(376397, 3)

In [246]:
article_by_mth_yr.head()

Unnamed: 0,article_id,year_month,article_sold_yr_month
0,108775015,2019-09,7
1,108775015,2019-10,13
2,108775015,2019-11,12
3,108775015,2019-12,21
4,108775015,2020-01,9


In [247]:
article_by_mth_yr['prev_month'] = article_by_mth_yr.sort_values(
    by=['article_id','year_month'], ascending=True).groupby(['article_id','year_month'])['article_sold_yr_month'].sum().shift(1).reset_index()['article_sold_yr_month'].fillna(0)
    

In [248]:
article_by_mth_yr['pct_change'] = (article_by_mth_yr['article_sold_yr_month'] - (article_by_mth_yr['prev_month'])*100/article_by_mth_yr['article_sold_yr_month']).round(2)

In [249]:
def stale_dict(x):
    if x<-25 and x>-50:
        return 'stale'
    elif x<-50:
        return 'very stale'
    elif x>-25 and x<-10:
        return 'tending to stale'
    else:
        return 'trending'
article_by_mth_yr['current_article_freshness']=article_by_mth_yr['pct_change'].apply(stale_dict)

In [251]:
article_by_mth_yr.query("article_id==108775015")

Unnamed: 0,article_id,year_month,article_sold_yr_month,prev_month,pct_change,current_article_freshness
0,108775015,2019-09,7,0.0,7.0,trending
1,108775015,2019-10,13,7.0,-40.85,stale
2,108775015,2019-11,12,13.0,-96.33,very stale
3,108775015,2019-12,21,12.0,-36.14,stale
4,108775015,2020-01,9,21.0,-224.33,very stale
5,108775015,2020-02,19,9.0,-28.37,stale
6,108775015,2020-03,1,19.0,-1899.0,very stale
7,108775015,2020-04,16,1.0,9.75,trending
8,108775015,2020-06,2,16.0,-798.0,very stale
9,108775015,2020-07,3,2.0,-63.67,very stale


In [114]:
article_df.query("article_id==812484002")

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
83563,812484002,812484,Orion NW tapered denim,272,Trousers,Garment Lower body,1010023,Denim,72,Blue,...,Trousers & Skirt,A,Ladieswear,1,Ladieswear,18,Womens Trend,1009,Trousers,5-pocket jeans in washed cotton denim with a h...


#### Calculating article_id sold by Age Group

In [254]:
def create_age_interval(x):
    if x <= 20:
        return "Teenaage"
    elif x <= 35:
        return "Working Age"
    elif x <= 45:
        return "Middle Age"
    elif x <= 55:
        return  "Senior Age"
    elif x <= 65:
        return  "Old Age"
    else:
        return  "Very Old Age"

customer_df["FN"].fillna(0, inplace=True)
customer_df["Active"].fillna(0, inplace=True)

# Set unknown the club member status & news frequency
customer_df["club_member_status"].fillna("UNKNOWN", inplace=True)

customer_df["fashion_news_frequency"] = customer_df["fashion_news_frequency"].replace({"None":"NONE"})
customer_df["fashion_news_frequency"].fillna("UNKNOWN", inplace=True)

# Set missing values in age with the median
customer_df["age"].fillna(customer_df["age"].median(), inplace=True)
customer_df["age_interval"] = customer_df["age"].apply(lambda x: create_age_interval(x))

customer_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_interval
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,Senior Age
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,Working Age
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,Working Age
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,Senior Age
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,Senior Age


In [303]:
merged_age = transaction_df.merge(customer_df[['age_interval','customer_id']], on="customer_id", how="left")

In [None]:
merged_age.head(2)

In [297]:
article_by_age_group = merged_age.groupby(['article_id','age_interval']).agg({
    'customer_id': 'count'
}).groupby(level=0).apply(lambda x: 100*x/x.sum()).reset_index()

In [301]:
article_by_age_group.query("article_id==812484002")

Unnamed: 0,article_id,age_interval,customer_id
190707,812484002,Middle Age,4.743083
190708,812484002,Old Age,5.13834
190709,812484002,Senior Age,11.98946
190710,812484002,Teenaage,5.928854
190711,812484002,Very Old Age,1.449275
190712,812484002,Working Age,70.750988


In [None]:
article_by_age_group= article_by_age_group.pivot_table(index='article_id', columns='age_interval', values='customer_id').reset_index().fillna(0)
article_by_age_group['dominant_age_group'] = article_by_age_group[['Middle Age', 'Old Age', 'Senior Age', 'Teenaage',
       'Very Old Age', 'Working Age']].idxmax(axis=1)

#### Calculating Top-8 Category for Purchasae by Customer

In [305]:
customer_pref = merged_age.merge(article_df[['article_id','product_group_name']], on="article_id", how="left")

In [322]:
customer_pref.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,year_month,age_interval,product_group_name
0,2019-09-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,745014003,0.020322,2019-09,Middle Age,Garment Lower body
1,2019-09-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,812484002,0.05422,2019-09,Middle Age,Garment Lower body


In [324]:
customer_pref_ranked = customer_pref.groupby(['customer_id','product_group_name']).agg({
    'article_id': 'count'
}).groupby(level=0).apply(lambda x: x/x.sum()).reset_index()

In [330]:
customer_pref_ranked['rank']=customer_pref_ranked.groupby(['customer_id'])['article_id'].rank(method='dense', ascending=False)

In [332]:
customer_pref_ranked.rename(columns={'article_id':'article_prefs'}, inplace=True)

In [333]:
sample_customer_pref = customer_pref[customer_pref['customer_id'].isin(
    customer_pref['customer_id'].sample(2))]

sample_customer_pref_ranked = sample_customer_pref.groupby(['customer_id', 'product_group_name']).agg({
    'article_id': 'count'
}).groupby(level=0).apply(lambda x: x/x.sum()).reset_index()

sample_customer_pref_ranked['rank'] = sample_customer_pref_ranked.groupby(
    ['customer_id'])['article_id'].rank(method='dense', ascending=False)


In [334]:
sample_customer_pref_ranked

Unnamed: 0,customer_id,product_group_name,article_id,rank
0,14ecec84a4b4313fa68b6a43365269289143659a105505...,Garment Full body,0.230769,2.0
1,14ecec84a4b4313fa68b6a43365269289143659a105505...,Garment Upper body,0.769231,1.0
2,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Accessories,0.08,4.0
3,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Full body,0.05,5.0
4,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Lower body,0.17,2.0
5,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Upper body,0.51,1.0
6,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Nightwear,0.02,6.0
7,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Shoes,0.13,3.0
8,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Swimwear,0.02,6.0
9,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Underwear,0.02,6.0


#### Calculating Price Sensitivity By Customer ID

In [359]:
sample_customer_pref_price_sensitive = sample_customer_pref.groupby(['customer_id', 'product_group_name'])['price'].describe()[['max', 'min']].reset_index()


In [360]:
sample_customer_pref_price_sensitive

Unnamed: 0,customer_id,product_group_name,max,min
0,14ecec84a4b4313fa68b6a43365269289143659a105505...,Garment Full body,0.050831,0.028797
1,14ecec84a4b4313fa68b6a43365269289143659a105505...,Garment Upper body,0.093203,0.016932
2,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Accessories,0.022017,0.002695
3,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Full body,0.008881,0.002119
4,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Lower body,0.031763,0.006085
5,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Garment Upper body,0.030492,0.001407
6,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Nightwear,0.010153,0.010153
7,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Shoes,0.019814,0.002695
8,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Swimwear,0.001678,0.001678
9,fc7ff0f78626b3a9f558574f4ada3ee6e9445027562840...,Underwear,0.001695,0.001695


In [362]:
#USING DASK DATA FRAME TO PERFORM ANALYSIS
ddf = dd.from_pandas(customer_pref, npartitions=10)

In [363]:
customer_pref_price_sensitive = ddf.groupby(['customer_id', 'product_group_name']).agg({'price': ['max', 'min']}).compute().head()


In [366]:
customer_pref_price_sensitive = customer_pref_price_sensitive.reset_index()

In [368]:
customer_pref_price_sensitive.head(2)

Unnamed: 0_level_0,customer_id,product_group_name,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,min
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Accessories,0.018288,0.018288
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Garment Full body,0.036593,0.036593


In [364]:
#customer_pref_price_sensitive = customer_pref.groupby(['customer_id', 'product_group_name'])['price'].describe()[['max', 'min']].reset_index()


## Weeks Notebook

In [335]:
# Replace missing values
article_df.fillna(value="No Description", inplace=True)

# Adjust the article ID and product code to be string & add "0"
#article_df["product_code"] = article_df["article_id"].apply(lambda x: x[:3])

#article_df.head()

#### Creating the TF Dataset

In [369]:
article_df['article_id'] = article_df['article_id'].astype(str)
#Padding the article ID with 0s
article_df['article_id'] = article_df['article_id'].apply(lambda x: x.zfill(10))

#Extracrt the unique customer IDs and article IDs
unique_customer_ids = customer_df.customer_id.unique()
unique_article_ids = article_df.article_id.unique()

#Create a dataframe with customer IDs and article IDs
article_ds = tf.data.Dataset.from_tensor_slices(dict(article_df[['article_id']]))
articles = article_ds.map(lambda x: x['article_id'])

In [370]:
#Subeslecting for last 1 year
train0 = train0[train0['t_dat'] <='2019-09-20']

# add 0 in article_id column (string)
train0.loc[:,'article_id'] = train0.loc[:,'article_id'].astype(str)
train0.loc[:,'article_id']  = train0.loc[:,'article_id'] .apply(lambda x: x.zfill(10))
train0.head()

Unnamed: 0,t_dat,customer_id,article_id,price
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932


#### Defining the Metric for Candidate Generator 

The goal of the candidate generator is to fetch enough reevant candidates for the ranker to rank. Indeed, the ranker works on top of the output of the candidate generator -- if the candidate generator fetches relevant candidates, the ranker can do a better job at ranking these candidates. However, if the candidate generator misses out on a large set of relevant items, the ranker cannot rank them since it doesn't have access to good candidate items.

- Given such a role of the candidate generator, *recall* becomes the key metric with which we evaluate different candidate generators. Intuitively, _Recall = of all relevant items, how many made it into the selected list?_

- Recall@k is the proportion of relevant items found in the top-k selected candidates.

- Mathematically recall@k is defined as follows:
    - Recall@k = (# of recommended items @k that are relevant) / (total # of relevant items)

- Suppose we have a user who will make 5 purchases next month, and we wish to make 10 recommendations to this user on the homepage from among the set of 1 million products. 
    - Further suppose we have two candidate generators (CG-A and CG-B), each returning 100 candidates that the ranker can consumer and select the 10 recommendations. 
    - If CG-A is able to retrieve 3 of these 5 products amongst its set of 100 candidates, and CG-B is able to retrieve only 2 of the 5 products amongst its set of 100 candidates, 
    - then CG-A would have a higher recall than CG-B. Recall for CG-A = 3/5 Recall for CG-B = 2/5

In [371]:
# Recall@k = (# of recommended items @k that are relevant) / (total # of relevant items)
def estimateRecall(cg, purchase):
    return len(set.intersection(set(cg), set(purchase))) / len(purchase)

#### Types of Candidatee Generators

## **Traditional Candidate Generation**

With the metric in place, now lets look into a few obvious candiate generators.

> **1. Candidate Genertator 1: random products**
The simplest candidate generator to start with is a random candidate selector -- from amongst the list of products, randomly select few products.


> **2. Candidate Genertator 2: Top most popular products**
Another simple candidate selection strategy is to select the top-k most popular products from historic interactions and use this list as the set of candidates for the downstream ranking goal.


> **3. Candidate Genertator 3: Two-tower model CG**
One of most popular, and industry-wide used CGs is a two-tower neural network. This is a learnt deep model with one tower for the user and another tower for the product, followed by a dot product. We describe this model in detail further down the notebook.


Below we will implement each of these three candidate generators and compare their performace in terms of recall metric.

In order to have a clean evaluation set-up, we will divide the transactions data we have into train and test split. We look at one year's of purchases and use the first 9 months as training data and the remaining three months as the test data.

#### Splitting into Train and Test

In [372]:
#train0 = pd.read_csv('hmdata/transactions_train.csv.zip')
train = train0[train0['t_dat']<'2019-06-20']
test = train0[train0['t_dat'] >='2019-06-20']#[train0['t_dat'] <='2020-09-20']

- Note: Considered customers that have made a purchase in both train and test time-periods. The methods and set-up remain the same for all other customers as well.

In [373]:
c1 = train['customer_id'].to_list()
c2 = test['customer_id'].to_list()
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

commonUsers = intersection(c1, c2)

print(len(train),len(test),len(c1),len(c2),len(commonUsers))

12148516 4686893 12148516 4686893 421401


#### Helper Function To Run Candidate Generator 

In [469]:
def runCandidateGeneration(method,k):
    """
    param method: method to use for candidate generation
    param k: number of top articles to use for candidate generation
    """
    overallRecall = 0
    recallList = []
    i=0
    userSet = commonUsers[0:10] # take first 10 users
    for user in userSet: # for each user
        purchaseList = test[test['customer_id']==user]['article_id'].to_list() # get the list of articles purchased by user
        cg = method(user,k) # get the list of top articles
        r = estimateRecall(cg,purchaseList) # get the recall
        overallRecall+=r # add to overall recall
        recallList.append(r) # add to recall list
    overallRecall = overallRecall/len(userSet) # get the overall recall
    return (overallRecall,recallList) # return the overall recall and recall list

#### Candidate Generators I:  Random Products

In [396]:
def getKCandidatesRandom(user,k):
    """ 
    param user: user id 
    param k: number of top articles to use for candidate generation
    """
    return random.sample(train['article_id'].to_list(),k)

#### Candidate Generator 2: top-k popular products based on historic interactions

In [397]:

def getTopKCandidatesPopular(user,k):
    """ 
    param user: user id 
    param k: number of top articles to use for candidate generation

    """
    
    c = Counter(train['article_id'].to_list())
    topPopular = [article for article, count in c.most_common(k)]
    return topPopular


#### Evaluating Candidate Generators I and II

In [395]:
# Running metric calculations for CG 2: top popular products CG, to fetch 100 and 1000 candidates.
r, rlist = runCandidateGeneration(getTopKCandidatesPopular,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)

r, rlist = runCandidateGeneration(getTopKCandidatesPopular,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.007142857142857143
recall for candidate generation at k=1000 candidates fetched:  0.22857142857142856


### **Two Tower Model: Candidate Generator**

- A two tower model is a neural network that is made up of two separate sub-models, 
    - one that learns representations for the users, 
    - and one that learns representations for candidate objects. 

This information is then combined in a single layer to produce a prediction of whether a user will like an item. This is currently one of the best-performing models for candidate generation, as it learns information from both the user and the items. 

- This approach is useful for candidate generation across a wide variety of industries, from online retail to social media! With one tower representing the users and the other representing the items, the system can learn a lot about whether a user might like a particular item. 
- These models are trained using implicit signals – YouTube, for example, might consider how many seconds a user streamed a video, or how they interacted with it. The learned information can then be combined to produce the final prediction.

#### Embedding for the Customer

In [376]:
embedding_dimension = 64

customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_customer_ids, mask_token=None),  
  tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dimension),
  tf.keras.layers.Dense(64, activation='relu')
])


#### Embedding for the Articles

In [377]:
article_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_article_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dimension),
  tf.keras.layers.Dense(64, activation='relu')
])

#### **Metric: FactorizedTopK**
- To figure out how good our model is, 
    - we need to compare the **affinity score** that the model calculates for this pair to the scores of all the other _possible candidates_: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

- To do this, we can use the *tfrs.metrics.FactorizedTopK* metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

- The default metric is top K categorical accuracy: how often the true candidate is in the top K candidates for a given query.

#### Two Tower Model Class

TFRS exposes a base model class (tfrs.models.Model) which streamlines bulding models: 
 - In the __init__ method, and implement the compute_loss method, taking in the raw features and returning a loss value.

- The base model will then take care of creating the appropriate training loop to fit our model.

- The tfrs.Model base class is a simply convenience class: it allows us to compute both training and test losses using the same method.


In [378]:
class HandMModel(tfrs.Model):
    
    def __init__(self, customer_model, article_model):
        super().__init__()
        self.article_model: tf.keras.Model = article_model 
        self.customer_model: tf.keras.Model = customer_model 
        self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.article_model),            
            ),
        )        

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
    
        customer_embeddings = self.customer_model(features["customer_id"])    
        article_embeddings = self.article_model(features["article_id"])

        # The task computes the loss and the metrics.
        return self.task(customer_embeddings, article_embeddings,compute_metrics=not training)

In [379]:
model = HandMModel(customer_model, article_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [380]:
train_ds = tf.data.Dataset.from_tensor_slices(dict(train[['customer_id','article_id']])).shuffle(100_000).batch(256)
test_ds = tf.data.Dataset.from_tensor_slices(dict(test[['customer_id','article_id']])).batch(256)

num_epochs = 4

history = model.fit(train_ds,epochs=num_epochs, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [381]:
test_ds = tf.data.Dataset.from_tensor_slices(dict(test.sample(2000)[['customer_id','article_id']])).batch(500).cache()
model.evaluate(test_ds, return_dict=True)




{'factorized_top_k/top_1_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_5_categorical_accuracy': 0.0024999999441206455,
 'factorized_top_k/top_10_categorical_accuracy': 0.003000000026077032,
 'factorized_top_k/top_50_categorical_accuracy': 0.008999999612569809,
 'factorized_top_k/top_100_categorical_accuracy': 0.01600000075995922,
 'loss': 3722.50390625,
 'regularization_loss': 0,
 'total_loss': 3722.50390625}

#### Making Predictions

In [385]:
articles = article_ds.map(lambda x: x['article_id'])

# Create a model that takes in raw customer features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model,k=50)
# recommends articles out of the entire dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((articles.batch(1000), articles.batch(1000).map(model.article_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657"]))
print(f"Recommendations for customer: {titles[0, :100]}")

Recommendations for customer: [b'0681105001' b'0691799005' b'0652000008' b'0568601006' b'0568601016'
 b'0577512009' b'0716211003' b'0702799003' b'0683517004' b'0688464004'
 b'0731512001' b'0672625008' b'0405887036' b'0611479009' b'0690952004'
 b'0729936003' b'0399061028' b'0625653011' b'0745367003' b'0611145012'
 b'0729936002' b'0620424042' b'0507909003' b'0695909002' b'0508932027'
 b'0685167001' b'0742204001' b'0751788001' b'0568601019' b'0507909024'
 b'0548110014' b'0507909026' b'0702213001' b'0568597006' b'0698629001'
 b'0619739034' b'0651504002' b'0568597016' b'0577512008' b'0566592006'
 b'0548110010' b'0714620001' b'0690960001' b'0688464001' b'0649397004'
 b'0717132001' b'0725216003' b'0735779002' b'0627759008' b'0712913001']


#### Evaluating 
- Now that we have the ability to make predictions, we would like to use this model and fetch the top-k candidates for the set of customers we have used for the previous two candidate generators.

- Similar to the CG1 and CG2 functions, we define the two-tower CG function that takes in a customer ID and value k, and returns the list of top-k products as the list of candidate products.

In [386]:
def getTopKCandidates2Tower(u,k):
    # Create a model that takes in raw customer features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model,k)
    # recommends articles out of the entire dataset.
    index.index_from_dataset(
        tf.data.Dataset.zip((articles.batch(1000), articles.batch(1000).map(model.article_model)))
    )
    _, candidates = index(tf.constant([u]))
    candidates = candidates.numpy().tolist()[0]
    cg = []
    for c in candidates:
        cg.append(c.decode("utf-8") )
    return (cg)

In [387]:
r, rlist = runCandidateGeneration(getTopKCandidates2Tower,k=100)
print("recall for candidate generation at k=100 candidates fetched: ", r)
r, rlist = runCandidateGeneration(getTopKCandidates2Tower,k=1000)
print("recall for candidate generation at k=1000 candidates fetched: ", r)

recall for candidate generation at k=100 candidates fetched:  0.0
recall for candidate generation at k=1000 candidates fetched:  0.06642857142857143


#### Candidate Generator IV

In [470]:
def getTopKCandidatesFeatures(user,k):
    """ 
    param user: user id 
    param k: number of top articles to use for candidate generation
    1 Filter for age level
    2 Filter for Stale Products
    3 Sample From Top K Popular Products with Probability based on popularity
    4 Restrict Based on User History and Price Sensitivity

    """
    
    print("user: ", user)
    user_age_interval = customer_df[customer_df['customer_id'] == user]['age_interval'].values[0]
    print("user_age_interval: ", user_age_interval)
    filter_by_age_interval = article_by_age_group[article_by_age_group['dominant_age_group'] == user_age_interval]['article_id']
    print("filter_by_age_interval: ", len(filter_by_age_interval))
    filter_by_age_interval_list  = []
    for article in filter_by_age_interval:
        if article_by_mth_yr[article_by_mth_yr["article_id"]==article]['current_article_freshness'].values[-1] in ['very stale']:
            continue
        else:
            filter_by_age_interval_list.append(article)

    final_list = [str(x).zfill(10) for x in filter_by_age_interval_list]
    return final_list

In [466]:
sample_user = customer_df.sample(1)['customer_id'].values[0]
sample_user

'19d3189004ea728660da0d47843d7bd215989cf45227f8d8e3534ccdc5947493'

In [474]:
r, rlist = runCandidateGeneration(getTopKCandidatesFeatures,k=1000)

user:  9fdf9625de6aa5a081b0412a23db2dbe9e00d0f662881918a2d618b39888927c
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  5cf22e04b3abfcaa5c86662097f4b4e7a90d1c68064c9855308f4359dd9a3b4b
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  22563d01d9294322b2e33138c22dda80b8603b557973c3223349cd3682928080
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  7b4262b6309c77d80b599e9cde25e338387c877230a0e96a3003e41c319bcde2
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  4b675a701c1ebb552f88407683e74bd6561b66ece1c2a933f7f4f8ba41a92299
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  ae331140a60721eab55b431b7e19112141299e2815f2a803c2e0b525eb020472
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  6f4b38793b7bd54ad0e3aad30a4cc81e2773832625197e4d654ddaa231959c74
user_age_interval:  Middle Age
filter_by_age_interval:  9505
user:  3481cf616b7660aec5d697712f94f77ace1b3daebd83b141fde7b398

#### Candidate Generator IV : Recall@1000K

In [478]:
r

0.02857142857142857

#### Candidate Generator Recall@100K

In [479]:
r, rlist = runCandidateGeneration(getTopKCandidatesFeatures,k=100)

user:  9fdf9625de6aa5a081b0412a23db2dbe9e00d0f662881918a2d618b39888927c
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  5cf22e04b3abfcaa5c86662097f4b4e7a90d1c68064c9855308f4359dd9a3b4b
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  22563d01d9294322b2e33138c22dda80b8603b557973c3223349cd3682928080
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  7b4262b6309c77d80b599e9cde25e338387c877230a0e96a3003e41c319bcde2
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  4b675a701c1ebb552f88407683e74bd6561b66ece1c2a933f7f4f8ba41a92299
user_age_interval:  Senior Age
filter_by_age_interval:  7601
user:  ae331140a60721eab55b431b7e19112141299e2815f2a803c2e0b525eb020472
user_age_interval:  Working Age
filter_by_age_interval:  51553
user:  6f4b38793b7bd54ad0e3aad30a4cc81e2773832625197e4d654ddaa231959c74
user_age_interval:  Middle Age
filter_by_age_interval:  9505
user:  3481cf616b7660aec5d697712f94f77ace1b3daebd83b141fde7b398

In [480]:
r

0.02857142857142857