In [28]:
# we will be using the LibRecommender library in Python to train an NCF model for the purchase history
%pip install LibRecommender==1.5.1
%pip install numpy==1.26.4
%pip install pandas==2.2.2
%pip install scikit-learn==1.5.1
%pip install tensorflow==2.14.0
%pip install torch==2.4.0

Collecting tensorflow==2.14.0
  Obtaining dependency information for tensorflow==2.14.0 from https://files.pythonhosted.org/packages/de/ea/90267db2c02fb61f4d03b9645c7446d3cbca6d5c08522e889535c88edfcd/tensorflow-2.14.0-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow-2.14.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting tensorflow-macos==2.14.0 (from tensorflow==2.14.0)
  Obtaining dependency information for tensorflow-macos==2.14.0 from https://files.pythonhosted.org/packages/d3/4b/ae9037ea22ba94eb2cf267e991384c3444f3e6142fa49923352b4ab73e14/tensorflow_macos-2.14.0-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow_macos-2.14.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.9 kB)
Collecting ml-dtypes==0.2.0 (from tensorflow-macos==2.14.0->tensorflow==2.14.0)
  Obtaining dependency information for ml-dtypes==0.2.0 from https://files.pythonhosted.org/packages/15/da/43bee505963da0c730ee50e951c604bfdb90d4cccc9c0044c946b10e68a7/ml_dtypes-0.

In [1]:
import numpy as np
import pandas as pd
from libreco.data import random_split, DatasetPure
from libreco.algorithms import NCF  # pure data,
from libreco.evaluation import evaluate

Instructions for updating:
non-resource variables are not supported in the long term


Load purchase history data from my local

Clean the dataset by dropping NaN values and duplicates on subset ["user_id", "product_id", "category_id", "event_time"]

Create a dictionary to map the product id to the category code + brand

In [3]:
df = pd.read_csv("kz.csv")
df['category_brand'] = df[['category_code', 'brand']].astype(str).agg('-'.join, axis=1)
map_item_category = dict(zip(df.product_id, df.category_brand))
df = df[["user_id", "product_id", "category_id", "event_time"]]
df = df.drop_duplicates(subset=["user_id", "product_id", "category_id", "event_time"])
df = df.dropna()


Compute the frequency of purchase by category and user. Take the value as a rating (i.e. preference to purchase in each category)

In [4]:
rating = df[['user_id','category_id']].groupby(['user_id','category_id']).size().reset_index().rename(columns={0:'rating'})

rating['id'] = rating['user_id'].astype(str) + rating['category_id'].astype(str)
rating['id'] = rating['id'].astype(str)
rating['rating'] = rating['rating'].astype(int)
map_rating = dict(zip(rating.id, rating.rating))

df['id'] = df['user_id'].astype(str) + df['category_id'].astype(str)
df['id'] = df['id'].astype(str)
df['rating'] = df.apply(lambda x: map_rating.get(x['id'],0),axis=1)

df = df[["user_id", "product_id", "rating", "event_time"]]
df.columns = ["user_id", "product_id", "rating", "event_time"]
df = df.rename(columns={"user_id":"user", "product_id": "item", "rating": "label"})
print(df.head(2))

           user          item  label               event_time
0  1.515916e+18  1.515966e+18      1  2020-04-24 11:50:39 UTC
2  1.515916e+18  2.273948e+18      3  2020-04-24 14:37:43 UTC


Split the dataset into training, validation, and test data

In [5]:
train_data, eval_data, test_data = random_split(df, multi_ratios=[0.8, 0.1, 0.1])

Convert the pandas dataframe into a compatible datatype for LibRecommender
we are not using any other feature other than the interaction between the user and an item.
Hence, this DatasetPure function builds the datasets from a Pure Collaborative Filtering perspective

In [6]:
train_data, data_info= DatasetPure.build_trainset(train_data)
eval_data = DatasetPure.build_evalset(eval_data)
test_data = DatasetPure.build_testset(test_data)

Configure NCF

In [7]:
ncf = NCF(
    task="rating",
    data_info=data_info,
    loss_type="cross_entropy",
    embed_size=16,
    n_epochs=10,
    lr=1e-3,
    batch_size=2048,
    num_neg=1,
)

Next is to fit NCF and evaluate
monitor metrics on eval data during training

In [8]:
ncf.fit(
    train_data,
    neg_sampling=False, #for rating, this param is false else True
    verbose=2,
    eval_data=eval_data,
    metrics=["loss"],
)

Training start time: [35m2024-07-28 13:04:17[0m
Instructions for updating:
Colocations handled automatically by placer.


  net = tf.layers.batch_normalization(net, training=is_training)
Instructions for updating:
Colocations handled automatically by placer.
  net = tf.layers.batch_normalization(net, training=is_training)
2024-07-28 13:04:17.806943: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:382] MLIR V1 optimization pass is not enabled
train: 100%|██████████| 412/412 [00:02<00:00, 155.39it/s]


Epoch 1 elapsed: 2.670s
	 [32mtrain_loss: 35.268[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 99.11it/s]


	 eval rmse: 8.9692


train: 100%|██████████| 412/412 [00:02<00:00, 177.21it/s]


Epoch 2 elapsed: 2.326s
	 [32mtrain_loss: 21.1913[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 189.86it/s]


	 eval rmse: 4.9020


train: 100%|██████████| 412/412 [00:02<00:00, 177.38it/s]


Epoch 3 elapsed: 2.324s
	 [32mtrain_loss: 19.342[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 169.63it/s]


	 eval rmse: 4.7491


train: 100%|██████████| 412/412 [00:02<00:00, 173.45it/s]


Epoch 4 elapsed: 2.376s
	 [32mtrain_loss: 18.2455[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 190.33it/s]


	 eval rmse: 4.7883


train: 100%|██████████| 412/412 [00:02<00:00, 161.13it/s]


Epoch 5 elapsed: 2.558s
	 [32mtrain_loss: 17.7615[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 178.63it/s]


	 eval rmse: 4.7030


train: 100%|██████████| 412/412 [00:02<00:00, 172.57it/s]


Epoch 6 elapsed: 2.389s
	 [32mtrain_loss: 17.2019[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 196.49it/s]


	 eval rmse: 4.6785


train: 100%|██████████| 412/412 [00:02<00:00, 174.47it/s]


Epoch 7 elapsed: 2.363s
	 [32mtrain_loss: 16.9057[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 187.91it/s]


	 eval rmse: 4.6791


train: 100%|██████████| 412/412 [00:02<00:00, 175.55it/s]


Epoch 8 elapsed: 2.349s
	 [32mtrain_loss: 16.5163[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 186.31it/s]


	 eval rmse: 4.6454


train: 100%|██████████| 412/412 [00:02<00:00, 172.36it/s]


Epoch 9 elapsed: 2.391s
	 [32mtrain_loss: 16.3207[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 193.95it/s]


	 eval rmse: 4.6648


train: 100%|██████████| 412/412 [00:02<00:00, 176.60it/s]


Epoch 10 elapsed: 2.334s
	 [32mtrain_loss: 16.179[0m


eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 172.11it/s]

	 eval rmse: 4.6752





do final evaluation on test data

In [9]:
evaluate(
    model=ncf,
    data=test_data,
    neg_sampling=False,
    metrics=["loss"],
)

eval_pointwise: 100%|██████████| 6/6 [00:00<00:00, 137.67it/s]


{'loss': 4.5751715}

for implicit feedback, metrics like precision@k, recall@k, ndcg can be used
predict preference of user_id 1515915625450916989 to product_id 1515966223509089696

In [10]:
ncf.predict(user=1515915625450916989, item=1515966223509089696)

[31mDetect 1 unknown interaction(s), position: [0][0m


array([1.], dtype=float32)

recommend 10 items for user 1515915625450916989

In [18]:
results = ncf.recommend_user(user=1515915625446974274, n_rec=10)
recommended_product_ids = [l for item in results.values() for l in item.tolist()]
results2 = [map_item_category.get(l, None) for item in results.values() for l in item.tolist()]
print('Recommended 10 items for user 1515915625450916989:')
for i, (id, category_brand) in enumerate(zip(recommended_product_ids, results2)):
    print(i+1,": ","product_id=", id, ";category-brand=", category_brand)

[31mDetect unknown user: 1515915625446974274[0m
Recommended 10 items for user 1515915625450916989:
1 :  product_id= 2.3090182491671107e+18 ;category-brand= nan-nan
2 :  product_id= 2.388434452473805e+18 ;category-brand= appliances.environment.vacuum-bosch
3 :  product_id= 2.3884344524739645e+18 ;category-brand= appliances.kitchen.kettle-bosch
4 :  product_id= 2.2739482885813706e+18 ;category-brand= furniture.living_room.chair-monge
5 :  product_id= 1.5159662235109553e+18 ;category-brand= nan-bosch
6 :  product_id= 1.5159662235097172e+18 ;category-brand= computers.notebook-apple
7 :  product_id= 1.5159662235123845e+18 ;category-brand= nan-airline
8 :  product_id= 1.515966223509089e+18 ;category-brand= electronics.video.tv-lg
9 :  product_id= 1.5159662235123901e+18 ;category-brand= nan-technodom
10 :  product_id= 2.3090182636626253e+18 ;category-brand= computers.notebook-hp
