In [1]:
import os
import sys
import numpy as np
import pandas as pd

src_path = os.path.abspath(os.path.join(os.path.curdir, "src"))
sys.path.append(src_path)

from src.dataset import read_dataset
from src.helper import compute_dataset_statistics

In [2]:
df = read_dataset()

For collaborative filtering, we need to choose only 1 type of "item" to recommand to the user. For that, we choose to recommand artists to user. 

In [3]:
user_artists_df = df.groupby(['user_id', 'artist_id']).agg(
    timestamp = pd.NamedAgg(column='timestamp', aggfunc='max'),
).reset_index()
user_track_df = df.groupby(['user_id', 'track_id']).agg(
    timestamp = pd.NamedAgg(column='timestamp', aggfunc='max'),
).reset_index()

user_artists_df = user_artists_df.rename(columns={'artist_id': 'item_id'})
user_track_df = user_track_df.rename(columns={'track_id': 'item_id'})

user_item_df = pd.concat([user_artists_df, user_track_df])

user_item_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 184954 entries, 0 to 154894
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype              
---  ------     --------------   -----              
 0   user_id    184954 non-null  object             
 1   item_id    184954 non-null  object             
 2   timestamp  184954 non-null  datetime64[us, UTC]
dtypes: datetime64[us, UTC](1), object(2)
memory usage: 5.6+ MB


In [4]:
stats = compute_dataset_statistics(user_item_df)

Number of users: 50
Number of items: 126117
Number of interactions: 184954
Density: 0.029330542274237415
Sparsity: 0.9706694577257626


In [5]:
from src.cf_mf import MatrixFactorization

mf = MatrixFactorization(df=user_item_df,
                         batch_size=1024,
                         embedding_size=64,
                         learning_rate=0.001,
                         regularization=1e-5)
preds = mf.fit(epochs=5, val_epoch=1)

**************************************************************
Epoch 1/5


100%|██████████| 145/145 [01:33<00:00,  1.54it/s, BPR loss=0.00391]


Validation: Recall@20=0.0001	Precision@20=0.0045	nDCG@20=0.0082
Test: Recall@20=0.0008	Precision@20=0.0149	nDCG@20=0.0127
**************************************************************
Epoch 2/5


100%|██████████| 145/145 [01:39<00:00,  1.46it/s, BPR loss=0.00283]


Validation: Recall@20=0.0004	Precision@20=0.0106	nDCG@20=0.0111
Test: Recall@20=0.0014	Precision@20=0.0189	nDCG@20=0.0159
**************************************************************
Epoch 3/5


100%|██████████| 145/145 [01:37<00:00,  1.49it/s, BPR loss=0.00198]


Validation: Recall@20=0.0006	Precision@20=0.0121	nDCG@20=0.0132
Test: Recall@20=0.0008	Precision@20=0.0122	nDCG@20=0.0093
**************************************************************
Epoch 4/5


100%|██████████| 145/145 [01:37<00:00,  1.48it/s, BPR loss=0.00138]


Validation: Recall@20=0.0010	Precision@20=0.0136	nDCG@20=0.0128
Test: Recall@20=0.0012	Precision@20=0.0162	nDCG@20=0.0122
**************************************************************
Epoch 5/5


100%|██████████| 145/145 [01:38<00:00,  1.48it/s, BPR loss=0.00109]

Validation: Recall@20=0.0009	Precision@20=0.0121	nDCG@20=0.0124
Test: Recall@20=0.0011	Precision@20=0.0162	nDCG@20=0.0129

TRAINING COMPLETE!
Best epoch: 4
Test metrics: Recall@20=0.0012	Precision@20=0.0162	nDCG@20=0.0122



