In [1]:
from src.dataset import Dataset
from src.postprocess_data import MinInteractionsFilter, IdsEncoder, SplitTrainValTest

data = Dataset('ml-1m').get_data()

min_iterations_filter = MinInteractionsFilter()
ids_encoder = IdsEncoder()

data = min_iterations_filter.transform(data)
data = ids_encoder.fit_transform(data)

n_users = len(data['user_id'].unique())
n_items = len(data['item_id'].unique())
n_interactions = len(data)
print(f'users: {n_users}')
print(f'items: {n_items}')
print(f'interactions: {n_interactions}')
print(f'density: {(n_interactions / n_users / n_items) * 100:.2f}%')

splitter = SplitTrainValTest()
train_df, val_df, test_df = splitter.transform(data)

data

users: 6034
items: 3125
interactions: 574376
density: 3.05%


user_id,item_id,event_ts
i64,i64,i64
0,924,978300760
0,2685,978300275
0,1835,978824291
0,1015,978302039
0,2218,978300719
0,515,978302268
0,714,978301368
0,516,978824268
0,733,978301752
0,1876,978302281


In [2]:
from src.postprocess_data import create_sparse_dataset, create_positives_dataset

train_dataset = create_sparse_dataset(train_df)
train_positives = create_positives_dataset(train_df)
val_positives = create_positives_dataset(val_df)
test_positives = create_positives_dataset(test_df)

print(f'{len(train_positives.keys())} users in train')
print(f'{len(val_positives.keys())} users in val')
print(f'{len(test_positives.keys())} users in test')

4006 users in train
1579 users in val
1784 users in test


In [3]:
import faiss
import random
import numpy as np
from collections import defaultdict
from implicit.bpr import BayesianPersonalizedRanking

from src.metrics import user_recall

def linear_lr(epoch, n_epochs, lr_start, lr_end):
    p = epoch / n_epochs
    return lr_start * (1 - p) + lr_end * p

def cosine_annealing_warm_restart(epoch, n_epochs, lr_start, lr_end, t_i):
    # https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingWarmRestarts.html#torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
    t_cur = epoch % t_i
    # modification to make lr after restart closer to lr_end 
    linear_coef = (n_epochs - epoch) / n_epochs
    return lr_end + linear_coef * (lr_start - lr_end) * (1 + np.cos(np.pi * t_cur / t_i)) / 2

class SWECallback:
    def __init__(self):
        self.user_factors_swe = None
        self.item_factors_swe = None
        self.recall_history = defaultdict(list)
        
        self.n_models = 0
        
    def callback_fn(self, epoch, *_):
        if (epoch + 1) % RESTART_EPOCHS == 0:
            if self.user_factors_swe is None:
                self.user_factors_swe = model.user_factors
                self.item_factors_swe = model.item_factors
            else:
                self.n_models = epoch // RESTART_EPOCHS
                self.user_factors_swe = (self.user_factors_swe * self.n_models + model.user_factors) / (self.n_models + 1)
                self.item_factors_swe = (self.item_factors_swe * self.n_models + model.item_factors) / (self.n_models + 1)
        
        index = faiss.IndexFlatIP(DIM)
        index.add(model.item_factors)
        recs = index.search(model.user_factors, 200)[1]
        
        for k in [1, 10, 20]:
            recall_list = []
            for user_id, y_true in val_positives.items():
                y_pred = [
                    item_id for item_id in recs[user_id]
                    if item_id not in train_positives.get(user_id, set())
                ]
                recall_list.append(user_recall(y_pred, y_true, k))
            self.recall_history[k].append(np.mean(recall_list))


SEED = 42
DIM = 128
LR_START = 1e-2
LR_END = 1e-3
EPOCHS = 300
RESTART_EPOCHS = 50
REG_FACTOR = 1e-2

def set_seed():
    np.random.seed(SEED)
    random.seed(SEED)

def get_model():
    return BayesianPersonalizedRanking(
        iterations=EPOCHS, factors=(DIM - 1), random_state=SEED,
        learning_rate=LR_START, regularization=REG_FACTOR
    )

set_seed()
model = get_model()
fit_callback = SWECallback()
model.fit(train_dataset, callback=fit_callback.callback_fn)


  0%|          | 0/300 [00:00<?, ?it/s]

In [32]:
from src.metrics import user_recall, user_ap


index = faiss.IndexFlatIP(DIM)
index.add(model.item_factors)
recs = index.search(model.user_factors, 200)[1]

for k in [5, 10]:
    recall_list = []
    map_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

Recall@5 0.0241
MAP@5 0.1641
Recall@10 0.0439
MAP@10 0.1315


In [33]:
index = faiss.IndexFlatIP(DIM)
index.add(fit_callback.item_factors_swe)
recs = index.search(fit_callback.user_factors_swe, 200)[1]

for k in [5, 10]:
    recall_list = []
    map_list = []
    for user_id, y_true in test_positives.items():
        y_pred = [
            item_id for item_id in recs[user_id]
            if item_id not in train_positives.get(user_id, set())
        ]
        map_list.append(user_ap(y_pred, y_true, k))
        recall_list.append(user_recall(y_pred, y_true, k))
    print(f'Recall@{k}', round(np.mean(recall_list), 4))
    print(f'MAP@{k}', round(np.nanmean(map_list), 4))

Recall@5 0.0249
MAP@5 0.1686
Recall@10 0.0454
MAP@10 0.135
