In [None]:
import os
import sys
# Add the parent directory to the module search path before the `nbr` imports below.
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import UPCF
import torch
import random
import numpy as np
import optuna
import warnings
# Suppress every warning for the remainder of the kernel session.
warnings.filterwarnings("ignore")

# TaFeng

Fix seed:

In [None]:
seed = 10

# Apply the same seed to each RNG used here: PyTorch, the stdlib `random`
# module, and NumPy's legacy global generator.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(seed)

Read the interaction data and filter it: drop users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. Split: the training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# NOTE(review): the logged save path below shows a doubled slash
# ("./data//data_ta_feng/"), so the trailing "/" here looks redundant — confirm
# how Preprocess/save_split join paths before changing it.
corpus_path = "./data/"
dataset_name = "ta_feng"

# Filter the raw interactions and save the resulting split to disk.
# NOTE(review): per the description above, 5 is presumably the minimum number of
# transactions per user and 10 the minimum per item — confirm against the
# signature of Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 32266, #items = 23812, #clicks = 817741 (#illegal records = 0)
After preprocessing: #users = 7358, #items = 11202, #clicks = 368951
Saving dataset in ./data//data_ta_feng/...


In [None]:
# Build the corpus object for this dataset and load its data.
# Presumably this reads back the split written by save_split — confirm in nbr.preparation.Corpus.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Build the trainer on the loaded corpus. As the output below shows, construction
# prepares the train, dev and test datasets.
# NOTE(review): topk=10 is presumably the cutoff for the reported ranking metrics,
# and max_epochs/early_stop_num are presumably unused by this model — confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 7358/7358 [00:00<00:00, 35656.03it/s]
100%|██████████| 7358/7358 [00:17<00:00, 427.69it/s]


dev dataset preparing...


100%|██████████| 7358/7358 [00:00<00:00, 37073.37it/s]
100%|██████████| 7357/7357 [00:00<00:00, 173250.62it/s]


test dataset preparing...


100%|██████████| 7358/7358 [00:00<00:00, 22734.31it/s]
100%|██████████| 7357/7357 [00:00<00:00, 152849.92it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one sampled UPCF configuration on the dev split.

    Reads the notebook-level ``corpus`` and ``trainer`` objects.

    Search space:
        recency: even integers from 2 to 50.
        q: one of 1, 5, 10, 50, 100.
        alpha: 0.1 to 1.0 in steps of 0.1.
        nearest_neighbors_num: 25 to 500 in steps of 25.

    Args:
        trial: object providing ``suggest_int``, ``suggest_categorical`` and
            ``suggest_float`` (an Optuna trial).

    Returns:
        The ``"ndcg"`` entry of the metrics returned by
        ``trainer.evaluate(mode="dev")``.
    """
    # Keep the suggestion order fixed (recency, q, alpha, neighbours) so every
    # trial queries the sampler in the same sequence.
    recency = trial.suggest_int("recency", low=2, high=50, step=2)
    q = trial.suggest_categorical("q", [1, 5, 10, 50, 100])
    alpha = trial.suggest_float("alpha", 0.1, 1.0, step=0.1)
    neighbors = trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25)

    candidate = UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        recency=recency,
        q=q,
        alpha=alpha,
        nearest_neighbors_num=neighbors,
        corpus=corpus,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed, then run 25 trials that maximize
# the value returned by `objective` (NDCG on the dev split).
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[I 2023-07-04 22:25:28,746] A new study created in memory with name: no-name-4e6292cb-e3f9-467e-b17c-4d8c5ebe41b9


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 5801.48it/s]
100%|██████████| 7357/7357 [08:55<00:00, 13.74it/s]
[I 2023-07-04 22:34:31,443] Trial 0 finished with value: 0.10871439345788349 and parameters: {'recency': 40, 'q': 10, 'alpha': 0.2, 'nearest_neighbors_num': 400}. Best is trial 0 with value: 0.10871439345788349.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4719.45it/s]
100%|██████████| 7357/7357 [06:56<00:00, 17.66it/s]
[I 2023-07-04 22:41:33,525] Trial 1 finished with value: 0.1041300205858735 and parameters: {'recency': 10, 'q': 10, 'alpha': 0.9, 'nearest_neighbors_num': 325}. Best is trial 0 with value: 0.10871439345788349.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6320.31it/s]
100%|██████████| 7357/7357 [07:56<00:00, 15.44it/s]
[I 2023-07-04 22:49:34,981] Trial 2 finished with value: 0.10922377359213577 and parameters: {'recency': 38, 'q': 5, 'alpha': 0.4, 'nearest_neighbors_num': 350}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4900.80it/s]
100%|██████████| 7357/7357 [06:37<00:00, 18.53it/s]
[I 2023-07-04 22:56:17,465] Trial 3 finished with value: 0.10322487112077067 and parameters: {'recency': 24, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 275}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6602.18it/s]
100%|██████████| 7357/7357 [06:04<00:00, 20.19it/s]
[I 2023-07-04 23:02:27,876] Trial 4 finished with value: 0.10252126245799066 and parameters: {'recency': 46, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 325}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4755.34it/s]
100%|██████████| 7357/7357 [10:21<00:00, 11.84it/s]
[I 2023-07-04 23:12:55,189] Trial 5 finished with value: 0.10827243840465635 and parameters: {'recency': 28, 'q': 10, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 450}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4906.56it/s]
100%|██████████| 7357/7357 [11:31<00:00, 10.65it/s]
[I 2023-07-04 23:24:32,251] Trial 6 finished with value: 0.10325692587567967 and parameters: {'recency': 18, 'q': 50, 'alpha': 0.4, 'nearest_neighbors_num': 475}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 5040.51it/s]
100%|██████████| 7357/7357 [05:43<00:00, 21.44it/s]
[I 2023-07-04 23:30:20,465] Trial 7 finished with value: 0.1010932607556584 and parameters: {'recency': 50, 'q': 100, 'alpha': 0.6, 'nearest_neighbors_num': 300}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:00<00:00, 15192.11it/s]
100%|██████████| 7357/7357 [03:33<00:00, 34.52it/s]
[I 2023-07-04 23:33:57,009] Trial 8 finished with value: 0.0746093887391135 and parameters: {'recency': 2, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 225}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 5542.08it/s]
100%|██████████| 7357/7357 [04:32<00:00, 26.96it/s]
[I 2023-07-04 23:38:34,619] Trial 9 finished with value: 0.10258683959079182 and parameters: {'recency': 16, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 225}. Best is trial 2 with value: 0.10922377359213577.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6736.95it/s]
100%|██████████| 7357/7357 [01:37<00:00, 75.42it/s]
[I 2023-07-04 23:40:16,049] Trial 10 finished with value: 0.10933490756623297 and parameters: {'recency': 36, 'q': 5, 'alpha': 0.6, 'nearest_neighbors_num': 75}. Best is trial 10 with value: 0.10933490756623297.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6777.27it/s]
100%|██████████| 7357/7357 [01:09<00:00, 105.47it/s]
[I 2023-07-04 23:41:29,937] Trial 11 finished with value: 0.10910995985879414 and parameters: {'recency': 36, 'q': 5, 'alpha': 0.6, 'nearest_neighbors_num': 50}. Best is trial 10 with value: 0.10933490756623297.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 5496.76it/s]
100%|██████████| 7357/7357 [01:10<00:00, 104.39it/s]
[I 2023-07-04 23:42:45,049] Trial 12 finished with value: 0.10886820162424546 and parameters: {'recency': 34, 'q': 5, 'alpha': 0.5, 'nearest_neighbors_num': 50}. Best is trial 10 with value: 0.10933490756623297.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 5438.30it/s]
100%|██████████| 7357/7357 [03:01<00:00, 40.55it/s]
[I 2023-07-04 23:45:51,828] Trial 13 finished with value: 0.09743702251610588 and parameters: {'recency': 40, 'q': 1, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 150}. Best is trial 10 with value: 0.10933490756623297.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6719.41it/s]
100%|██████████| 7357/7357 [02:34<00:00, 47.62it/s]
[I 2023-07-04 23:48:30,187] Trial 14 finished with value: 0.10989778901098889 and parameters: {'recency': 30, 'q': 5, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 125}. Best is trial 14 with value: 0.10989778901098889.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6785.04it/s]
100%|██████████| 7357/7357 [02:36<00:00, 47.03it/s]
[I 2023-07-04 23:51:10,453] Trial 15 finished with value: 0.10985490512086453 and parameters: {'recency': 28, 'q': 5, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 125}. Best is trial 14 with value: 0.10989778901098889.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6779.21it/s]
100%|██████████| 7357/7357 [03:10<00:00, 38.60it/s]
[I 2023-07-04 23:54:25,004] Trial 16 finished with value: 0.11022866212411131 and parameters: {'recency': 26, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 150}. Best is trial 16 with value: 0.11022866212411131.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6869.28it/s]
100%|██████████| 7357/7357 [03:14<00:00, 37.74it/s]
[I 2023-07-04 23:57:43,574] Trial 17 finished with value: 0.09731495677073082 and parameters: {'recency': 18, 'q': 1, 'alpha': 1.0, 'nearest_neighbors_num': 150}. Best is trial 16 with value: 0.11022866212411131.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6820.00it/s]
100%|██████████| 7357/7357 [03:59<00:00, 30.78it/s]
[I 2023-07-05 00:01:46,797] Trial 18 finished with value: 0.11027248144869457 and parameters: {'recency': 24, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6759.03it/s]
100%|██████████| 7357/7357 [04:38<00:00, 26.43it/s]
[I 2023-07-05 00:06:29,373] Trial 19 finished with value: 0.11025419279759426 and parameters: {'recency': 22, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 225}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:00<00:00, 7379.96it/s]
100%|██████████| 7357/7357 [04:24<00:00, 27.77it/s]
[I 2023-07-05 00:10:57,977] Trial 20 finished with value: 0.10781085854648217 and parameters: {'recency': 10, 'q': 5, 'alpha': 1.0, 'nearest_neighbors_num': 225}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6667.44it/s]
100%|██████████| 7357/7357 [04:17<00:00, 28.56it/s]
[I 2023-07-05 00:15:20,543] Trial 21 finished with value: 0.11027248144869457 and parameters: {'recency': 24, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4977.36it/s]
100%|██████████| 7357/7357 [04:11<00:00, 29.22it/s]
[I 2023-07-05 00:19:37,659] Trial 22 finished with value: 0.11025427338934153 and parameters: {'recency': 22, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4930.46it/s]
100%|██████████| 7357/7357 [04:16<00:00, 28.70it/s]
[I 2023-07-05 00:23:58,698] Trial 23 finished with value: 0.10949797224875923 and parameters: {'recency': 14, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.11027248144869457.


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 6866.96it/s]
100%|██████████| 7357/7357 [03:53<00:00, 31.45it/s]
[I 2023-07-05 00:27:56,663] Trial 24 finished with value: 0.11015451005311834 and parameters: {'recency': 22, 'q': 5, 'alpha': 0.8, 'nearest_neighbors_num': 175}. Best is trial 18 with value: 0.11027248144869457.


Test:

In [None]:
# Re-create the trainer, then configure it with a UPCF model built from the
# best trial found by the study.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# The tuned names (recency, q, alpha, nearest_neighbors_num) are exactly the
# UPCF keyword arguments, so the best-parameter dict is unpacked directly.
params = dict(
    model=UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        corpus=corpus,
        **study.best_params,
    )
)

trainer.init_hyperparams(**params)

train dataset preparing...


100%|██████████| 7358/7358 [00:26<00:00, 275.67it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3628.40it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3159.63it/s]


UPCF fitting...


100%|██████████| 7358/7358 [00:01<00:00, 4743.39it/s]


In [None]:
# Evaluate the configured model on the test split; kept as the cell's last
# expression so the returned metrics dict is displayed.
trainer.evaluate(mode="test")

100%|██████████| 7357/7357 [04:46<00:00, 25.64it/s]


{'precision': 0.05759140954193285,
 'recall': 0.13649849839913436,
 'ndcg': 0.11541800466459054}

# TaoBao

Fix seed:

In [None]:
seed = 10

# Apply the same seed to each RNG used here: PyTorch, the stdlib `random`
# module, and NumPy's legacy global generator.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(seed)

Read the interaction data and filter it: drop users with fewer than 10 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. Split: the training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# NOTE(review): the logged save path below shows a doubled slash
# ("./data//data_taobao/"), so the trailing "/" here looks redundant — confirm
# how Preprocess/save_split join paths before changing it.
corpus_path = "./data/"
dataset_name = "taobao"

# Filter the raw interactions and save the resulting split to disk.
# NOTE(review): per the description above, the first 10 is presumably the minimum
# number of transactions per user and the second the minimum per item — confirm
# against the signature of Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(10, 10, filt=True)
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 672404, #items = 638962, #clicks = 2015807 (#illegal records = 0)
After preprocessing: #users = 10092, #items = 22286, #clicks = 67991
Saving dataset in ./data//data_taobao/...


In [None]:
# Build the corpus object for this dataset and load its data.
# Presumably this reads back the split written by save_split — confirm in nbr.preparation.Corpus.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Build the trainer on the loaded corpus. As the output below shows, construction
# prepares the train, dev and test datasets.
# NOTE(review): topk=10 is presumably the cutoff for the reported ranking metrics,
# and max_epochs/early_stop_num are presumably unused by this model — confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 10092/10092 [00:00<00:00, 184939.34it/s]
100%|██████████| 10092/10092 [00:48<00:00, 206.70it/s]


dev dataset preparing...


100%|██████████| 10092/10092 [00:00<00:00, 108321.82it/s]
100%|██████████| 9307/9307 [00:00<00:00, 199324.90it/s]


test dataset preparing...


100%|██████████| 10092/10092 [00:00<00:00, 106080.60it/s]
100%|██████████| 9307/9307 [00:00<00:00, 199846.35it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one sampled UPCF configuration on the dev split.

    Reads the notebook-level ``corpus`` and ``trainer`` objects.

    Search space:
        recency: even integers from 2 to 50.
        q: one of 1, 5, 10, 50, 100.
        alpha: 0.1 to 1.0 in steps of 0.1.
        nearest_neighbors_num: 25 to 500 in steps of 25.

    Args:
        trial: object providing ``suggest_int``, ``suggest_categorical`` and
            ``suggest_float`` (an Optuna trial).

    Returns:
        The ``"ndcg"`` entry of the metrics returned by
        ``trainer.evaluate(mode="dev")``.
    """
    # Keep the suggestion order fixed (recency, q, alpha, neighbours) so every
    # trial queries the sampler in the same sequence.
    recency = trial.suggest_int("recency", low=2, high=50, step=2)
    q = trial.suggest_categorical("q", [1, 5, 10, 50, 100])
    alpha = trial.suggest_float("alpha", 0.1, 1.0, step=0.1)
    neighbors = trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25)

    candidate = UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        recency=recency,
        q=q,
        alpha=alpha,
        nearest_neighbors_num=neighbors,
        corpus=corpus,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed, then run 25 trials that maximize
# the value returned by `objective` (NDCG on the dev split).
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[I 2023-07-05 08:44:02,760] A new study created in memory with name: no-name-0b7c4001-aaac-4ca6-80f9-627555014dac


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 8120.00it/s]
100%|██████████| 9307/9307 [01:19<00:00, 116.55it/s]
[I 2023-07-05 08:45:28,448] Trial 0 finished with value: 0.07057254405320963 and parameters: {'recency': 40, 'q': 10, 'alpha': 0.2, 'nearest_neighbors_num': 400}. Best is trial 0 with value: 0.07057254405320963.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9906.44it/s]
100%|██████████| 9307/9307 [01:13<00:00, 126.98it/s]
[I 2023-07-05 08:46:47,639] Trial 1 finished with value: 0.07059692330307843 and parameters: {'recency': 10, 'q': 10, 'alpha': 0.9, 'nearest_neighbors_num': 325}. Best is trial 1 with value: 0.07059692330307843.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13197.51it/s]
100%|██████████| 9307/9307 [01:20<00:00, 115.66it/s]
[I 2023-07-05 08:48:12,779] Trial 2 finished with value: 0.07043213846766691 and parameters: {'recency': 38, 'q': 5, 'alpha': 0.4, 'nearest_neighbors_num': 350}. Best is trial 1 with value: 0.07059692330307843.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 10830.60it/s]
100%|██████████| 9307/9307 [01:19<00:00, 117.78it/s]
[I 2023-07-05 08:49:36,948] Trial 3 finished with value: 0.07145889046028643 and parameters: {'recency': 24, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 275}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13460.59it/s]
100%|██████████| 9307/9307 [01:15<00:00, 123.27it/s]
[I 2023-07-05 08:50:57,504] Trial 4 finished with value: 0.07111800638544627 and parameters: {'recency': 46, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 325}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 12671.32it/s]
100%|██████████| 9307/9307 [01:15<00:00, 123.37it/s]
[I 2023-07-05 08:52:18,890] Trial 5 finished with value: 0.07050048520401532 and parameters: {'recency': 28, 'q': 10, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 450}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13464.54it/s]
100%|██████████| 9307/9307 [01:22<00:00, 113.07it/s]
[I 2023-07-05 08:53:45,906] Trial 6 finished with value: 0.07139799145237606 and parameters: {'recency': 18, 'q': 50, 'alpha': 0.4, 'nearest_neighbors_num': 475}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13052.81it/s]
100%|██████████| 9307/9307 [01:15<00:00, 122.79it/s]
[I 2023-07-05 08:55:06,349] Trial 7 finished with value: 0.07135180430366019 and parameters: {'recency': 50, 'q': 100, 'alpha': 0.6, 'nearest_neighbors_num': 300}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 26894.22it/s]
100%|██████████| 9307/9307 [00:36<00:00, 255.96it/s]
[I 2023-07-05 08:55:47,211] Trial 8 finished with value: 0.05986179961718508 and parameters: {'recency': 2, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 225}. Best is trial 3 with value: 0.07145889046028643.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9077.59it/s]
100%|██████████| 9307/9307 [01:14<00:00, 124.17it/s]
[I 2023-07-05 08:57:07,620] Trial 9 finished with value: 0.07167744390868701 and parameters: {'recency': 16, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 225}. Best is trial 9 with value: 0.07167744390868701.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13469.31it/s]
100%|██████████| 9307/9307 [01:12<00:00, 128.12it/s]
[I 2023-07-05 08:58:24,838] Trial 10 finished with value: 0.06969838396424953 and parameters: {'recency': 14, 'q': 1, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 75}. Best is trial 9 with value: 0.07167744390868701.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 7564.74it/s]
100%|██████████| 9307/9307 [01:13<00:00, 126.09it/s]
[I 2023-07-05 08:59:45,217] Trial 11 finished with value: 0.07206451873009347 and parameters: {'recency': 26, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 200}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 12929.29it/s]
100%|██████████| 9307/9307 [01:13<00:00, 125.90it/s]
[I 2023-07-05 09:01:03,813] Trial 12 finished with value: 0.07200334719878085 and parameters: {'recency': 28, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 175}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9264.02it/s]
100%|██████████| 9307/9307 [01:07<00:00, 136.95it/s]
[I 2023-07-05 09:02:17,286] Trial 13 finished with value: 0.07203610083555645 and parameters: {'recency': 30, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 100}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9214.63it/s]
100%|██████████| 9307/9307 [01:05<00:00, 141.52it/s]
[I 2023-07-05 09:03:28,274] Trial 14 finished with value: 0.07120076614069208 and parameters: {'recency': 36, 'q': 50, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 50}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9365.92it/s]
100%|██████████| 9307/9307 [01:10<00:00, 132.54it/s]
[I 2023-07-05 09:04:44,179] Trial 15 finished with value: 0.0712132716412333 and parameters: {'recency': 32, 'q': 5, 'alpha': 1.0, 'nearest_neighbors_num': 125}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13486.70it/s]
100%|██████████| 9307/9307 [01:13<00:00, 126.23it/s]
[I 2023-07-05 09:06:02,451] Trial 16 finished with value: 0.06998426947357358 and parameters: {'recency': 22, 'q': 1, 'alpha': 0.8, 'nearest_neighbors_num': 125}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13174.05it/s]
100%|██████████| 9307/9307 [00:57<00:00, 161.28it/s]
[I 2023-07-05 09:07:05,899] Trial 17 finished with value: 0.0716052145558884 and parameters: {'recency': 32, 'q': 50, 'alpha': 0.8, 'nearest_neighbors_num': 25}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9950.69it/s]
100%|██████████| 9307/9307 [01:08<00:00, 136.80it/s]
[I 2023-07-05 09:08:19,248] Trial 18 finished with value: 0.07127871749812019 and parameters: {'recency': 8, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 150}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 7294.89it/s]
100%|██████████| 9307/9307 [01:18<00:00, 118.23it/s]
[I 2023-07-05 09:09:44,018] Trial 19 finished with value: 0.07144039393872273 and parameters: {'recency': 42, 'q': 50, 'alpha': 0.5, 'nearest_neighbors_num': 200}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 9334.07it/s]
100%|██████████| 9307/9307 [01:10<00:00, 131.42it/s]
[I 2023-07-05 09:11:00,415] Trial 20 finished with value: 0.0700239489570455 and parameters: {'recency': 22, 'q': 1, 'alpha': 0.8, 'nearest_neighbors_num': 100}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13233.35it/s]
100%|██████████| 9307/9307 [01:14<00:00, 125.39it/s]
[I 2023-07-05 09:12:19,288] Trial 21 finished with value: 0.07200334719878085 and parameters: {'recency': 28, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 175}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13227.39it/s]
100%|██████████| 9307/9307 [01:16<00:00, 122.12it/s]
[I 2023-07-05 09:13:41,872] Trial 22 finished with value: 0.07200805588588075 and parameters: {'recency': 32, 'q': 50, 'alpha': 1.0, 'nearest_neighbors_num': 175}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 13225.81it/s]
100%|██████████| 9307/9307 [01:19<00:00, 116.80it/s]
[I 2023-07-05 09:15:06,221] Trial 23 finished with value: 0.07142394402259548 and parameters: {'recency': 34, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 250}. Best is trial 11 with value: 0.07206451873009347.


UPCF fitting...


100%|██████████| 10092/10092 [00:00<00:00, 11239.79it/s]
100%|██████████| 9307/9307 [01:10<00:00, 131.63it/s]
[I 2023-07-05 09:16:21,620] Trial 24 finished with value: 0.07079538003656198 and parameters: {'recency': 32, 'q': 5, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 75}. Best is trial 11 with value: 0.07206451873009347.


Test:

In [None]:
# Re-create the trainer, then configure it with a UPCF model built from the
# best trial found by the study.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# The tuned names (recency, q, alpha, nearest_neighbors_num) are exactly the
# UPCF keyword arguments, so the best-parameter dict is unpacked directly.
params = dict(
    model=UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        corpus=corpus,
        **study.best_params,
    )
)

trainer.init_hyperparams(**params)

train dataset preparing...


100%|██████████| 10092/10092 [00:57<00:00, 176.02it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 25735.44it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 21703.57it/s]


UPCF fitting...


100%|██████████| 10092/10092 [00:01<00:00, 7572.77it/s]


In [None]:
# Evaluate the configured model on the test split; kept as the cell's last
# expression so the returned metrics dict is displayed.
trainer.evaluate(mode="test")

100%|██████████| 9307/9307 [01:18<00:00, 118.54it/s]


{'precision': 0.008456000859568068,
 'recall': 0.08241108842806488,
 'ndcg': 0.05530549443807528}

# Dunnhumby

Fix seed:

In [None]:
seed = 10

# Apply the same seed to each RNG used here: PyTorch, the stdlib `random`
# module, and NumPy's legacy global generator.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(seed)

Read the interaction data and filter it: drop users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. Split: the training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
corpus_path = "./data/"
dataset_name = "dunnhumby"

# Filter the raw interactions and hand the result to save_split
# (presumably written under corpus_path — confirm in nbr.preparation).
# NOTE(review): per the description above, 5 is presumably the minimum number of
# transactions per user and 10 the minimum per item — confirm against the
# signature of Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
save_split(corpus_path, dataset_name, preprocessor)

In [None]:
# Build the corpus object for this dataset and load its data.
# Presumably this reads back the split written by save_split — confirm in nbr.preparation.Corpus.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Build the trainer on the loaded corpus. As the output below shows, construction
# prepares the train, dev and test datasets.
# NOTE(review): topk=10 is presumably the cutoff for the reported ranking metrics,
# and max_epochs/early_stop_num are presumably unused by this model — confirm in NBRTrainer.
trainer = NBRTrainer(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None
)

train dataset preparing...


100%|██████████| 2358/2358 [00:00<00:00, 2835.92it/s]
100%|██████████| 2358/2358 [00:14<00:00, 162.48it/s]


dev dataset preparing...


100%|██████████| 2358/2358 [00:00<00:00, 2802.65it/s]
100%|██████████| 2357/2357 [00:00<00:00, 112280.65it/s]


test dataset preparing...


100%|██████████| 2358/2358 [00:00<00:00, 2789.40it/s]
100%|██████████| 2357/2357 [00:00<00:00, 91785.81it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one sampled UPCF configuration on the dev split.

    Reads the notebook-level ``corpus`` and ``trainer`` objects.

    Search space:
        recency: even integers from 2 to 50.
        q: one of 1, 5, 10, 50, 100.
        alpha: 0.1 to 1.0 in steps of 0.1.
        nearest_neighbors_num: 25 to 500 in steps of 25.

    Args:
        trial: object providing ``suggest_int``, ``suggest_categorical`` and
            ``suggest_float`` (an Optuna trial).

    Returns:
        The ``"ndcg"`` entry of the metrics returned by
        ``trainer.evaluate(mode="dev")``.
    """
    # Keep the suggestion order fixed (recency, q, alpha, neighbours) so every
    # trial queries the sampler in the same sequence.
    recency = trial.suggest_int("recency", low=2, high=50, step=2)
    q = trial.suggest_categorical("q", [1, 5, 10, 50, 100])
    alpha = trial.suggest_float("alpha", 0.1, 1.0, step=0.1)
    neighbors = trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25)

    candidate = UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        recency=recency,
        q=q,
        alpha=alpha,
        nearest_neighbors_num=neighbors,
        corpus=corpus,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed, then run 25 trials that maximize
# the value returned by `objective` (NDCG on the dev split).
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[I 2023-07-05 08:11:27,680] A new study created in memory with name: no-name-0455df30-ae52-4e6d-b678-388ff39b1803


UPCF fitting...


100%|██████████| 2358/2358 [00:02<00:00, 1048.15it/s]
100%|██████████| 2357/2357 [01:03<00:00, 37.07it/s]
[I 2023-07-05 08:12:38,137] Trial 0 finished with value: 0.162296096705134 and parameters: {'recency': 40, 'q': 10, 'alpha': 0.2, 'nearest_neighbors_num': 400}. Best is trial 0 with value: 0.162296096705134.


UPCF fitting...


100%|██████████| 2358/2358 [00:00<00:00, 2620.80it/s]
100%|██████████| 2357/2357 [00:48<00:00, 48.18it/s]
[I 2023-07-05 08:13:34,776] Trial 1 finished with value: 0.14460461844739012 and parameters: {'recency': 10, 'q': 10, 'alpha': 0.9, 'nearest_neighbors_num': 325}. Best is trial 0 with value: 0.162296096705134.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1195.50it/s]
100%|██████████| 2357/2357 [00:58<00:00, 40.35it/s]
[I 2023-07-05 08:14:39,388] Trial 2 finished with value: 0.16384140253238402 and parameters: {'recency': 38, 'q': 5, 'alpha': 0.4, 'nearest_neighbors_num': 350}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 2170.20it/s]
100%|██████████| 2357/2357 [00:44<00:00, 53.38it/s]
[I 2023-07-05 08:15:28,091] Trial 3 finished with value: 0.16009368008381972 and parameters: {'recency': 24, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 275}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1234.71it/s]
100%|██████████| 2357/2357 [00:47<00:00, 49.77it/s]
[I 2023-07-05 08:16:21,584] Trial 4 finished with value: 0.15912054526924452 and parameters: {'recency': 46, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 325}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 2007.61it/s]
100%|██████████| 2357/2357 [01:09<00:00, 34.06it/s]
[I 2023-07-05 08:17:35,479] Trial 5 finished with value: 0.16124917961941043 and parameters: {'recency': 28, 'q': 10, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 450}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1764.19it/s]
100%|██████████| 2357/2357 [00:59<00:00, 39.87it/s]
[I 2023-07-05 08:18:40,391] Trial 6 finished with value: 0.15646028463171208 and parameters: {'recency': 18, 'q': 50, 'alpha': 0.4, 'nearest_neighbors_num': 475}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1409.74it/s]
100%|██████████| 2357/2357 [00:44<00:00, 53.04it/s]
[I 2023-07-05 08:19:30,053] Trial 7 finished with value: 0.15871600179874776 and parameters: {'recency': 50, 'q': 100, 'alpha': 0.6, 'nearest_neighbors_num': 300}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:00<00:00, 10463.90it/s]
100%|██████████| 2357/2357 [00:26<00:00, 89.57it/s]
[I 2023-07-05 08:20:00,014] Trial 8 finished with value: 0.09024262261280114 and parameters: {'recency': 2, 'q': 100, 'alpha': 0.1, 'nearest_neighbors_num': 225}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1877.62it/s]
100%|██████████| 2357/2357 [00:35<00:00, 65.65it/s]
[I 2023-07-05 08:20:41,681] Trial 9 finished with value: 0.15482654444916802 and parameters: {'recency': 16, 'q': 50, 'alpha': 0.9, 'nearest_neighbors_num': 225}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1323.30it/s]
100%|██████████| 2357/2357 [00:26<00:00, 87.63it/s]
[I 2023-07-05 08:21:16,990] Trial 10 finished with value: 0.16380072429758097 and parameters: {'recency': 36, 'q': 5, 'alpha': 0.6, 'nearest_neighbors_num': 75}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1733.19it/s]
100%|██████████| 2357/2357 [00:23<00:00, 100.68it/s]
[I 2023-07-05 08:21:44,982] Trial 11 finished with value: 0.16339277247683962 and parameters: {'recency': 36, 'q': 5, 'alpha': 0.6, 'nearest_neighbors_num': 50}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1725.33it/s]
100%|██████████| 2357/2357 [00:22<00:00, 103.83it/s]
[I 2023-07-05 08:22:12,743] Trial 12 finished with value: 0.16257880386225393 and parameters: {'recency': 34, 'q': 5, 'alpha': 0.5, 'nearest_neighbors_num': 50}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1618.84it/s]
100%|██████████| 2357/2357 [00:37<00:00, 62.30it/s]
[I 2023-07-05 08:22:56,371] Trial 13 finished with value: 0.10187947996248146 and parameters: {'recency': 40, 'q': 1, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 150}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1846.72it/s]
100%|██████████| 2357/2357 [00:31<00:00, 74.22it/s]
[I 2023-07-05 08:23:33,454] Trial 14 finished with value: 0.16320713703749326 and parameters: {'recency': 30, 'q': 5, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 125}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1518.41it/s]
100%|██████████| 2357/2357 [01:05<00:00, 35.97it/s]
[I 2023-07-05 08:24:44,885] Trial 15 finished with value: 0.16364073466601947 and parameters: {'recency': 44, 'q': 5, 'alpha': 0.4, 'nearest_neighbors_num': 400}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1799.53it/s]
100%|██████████| 2357/2357 [00:33<00:00, 69.35it/s]
[I 2023-07-05 08:25:24,216] Trial 16 finished with value: 0.16248017113218766 and parameters: {'recency': 32, 'q': 5, 'alpha': 0.4, 'nearest_neighbors_num': 150}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 2131.36it/s]
100%|██████████| 2357/2357 [00:59<00:00, 39.85it/s]
[I 2023-07-05 08:26:27,909] Trial 17 finished with value: 0.09898836744628735 and parameters: {'recency': 24, 'q': 1, 'alpha': 0.7000000000000001, 'nearest_neighbors_num': 375}. Best is trial 2 with value: 0.16384140253238402.


UPCF fitting...


100%|██████████| 2358/2358 [00:02<00:00, 1063.69it/s]
100%|██████████| 2357/2357 [00:43<00:00, 54.78it/s]
[I 2023-07-05 08:27:17,614] Trial 18 finished with value: 0.16411807590277894 and parameters: {'recency': 38, 'q': 5, 'alpha': 0.5, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.16411807590277894.


UPCF fitting...


100%|██████████| 2358/2358 [00:02<00:00, 1152.27it/s]
100%|██████████| 2357/2357 [00:44<00:00, 52.96it/s]
[I 2023-07-05 08:28:10,273] Trial 19 finished with value: 0.164084168315017 and parameters: {'recency': 42, 'q': 5, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 225}. Best is trial 18 with value: 0.16411807590277894.


UPCF fitting...


100%|██████████| 2358/2358 [00:02<00:00, 1064.99it/s]
100%|██████████| 2357/2357 [00:43<00:00, 53.92it/s]
[I 2023-07-05 08:29:00,968] Trial 20 finished with value: 0.16256406090371955 and parameters: {'recency': 48, 'q': 5, 'alpha': 0.2, 'nearest_neighbors_num': 225}. Best is trial 18 with value: 0.16411807590277894.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1598.82it/s]
100%|██████████| 2357/2357 [00:41<00:00, 56.82it/s]
[I 2023-07-05 08:29:47,293] Trial 21 finished with value: 0.16404719368836046 and parameters: {'recency': 40, 'q': 5, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 200}. Best is trial 18 with value: 0.16411807590277894.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1559.65it/s]
100%|██████████| 2357/2357 [00:40<00:00, 58.12it/s]
[I 2023-07-05 08:30:33,394] Trial 22 finished with value: 0.16422305668086676 and parameters: {'recency': 42, 'q': 5, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 200}. Best is trial 22 with value: 0.16422305668086676.


UPCF fitting...


100%|██████████| 2358/2358 [00:02<00:00, 968.90it/s]
100%|██████████| 2357/2357 [00:40<00:00, 58.59it/s]
[I 2023-07-05 08:31:20,477] Trial 23 finished with value: 0.16376822957414042 and parameters: {'recency': 44, 'q': 5, 'alpha': 0.30000000000000004, 'nearest_neighbors_num': 175}. Best is trial 22 with value: 0.16422305668086676.


UPCF fitting...


100%|██████████| 2358/2358 [00:01<00:00, 1510.41it/s]
100%|██████████| 2357/2357 [00:30<00:00, 76.30it/s]
[I 2023-07-05 08:31:57,087] Trial 24 finished with value: 0.16340397647942972 and parameters: {'recency': 44, 'q': 5, 'alpha': 0.5, 'nearest_neighbors_num': 100}. Best is trial 22 with value: 0.16422305668086676.


Test:

In [None]:
# Re-create the trainer, then configure it with a UPCF model built from the
# best trial found by the study.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# The tuned names (recency, q, alpha, nearest_neighbors_num) are exactly the
# UPCF keyword arguments, so the best-parameter dict is unpacked directly.
params = dict(
    model=UPCF(
        user_num=corpus.n_users,
        item_num=corpus.n_items,
        corpus=corpus,
        **study.best_params,
    )
)

trainer.init_hyperparams(**params)

In [None]:
# Evaluate the configured model on the test split; kept as the cell's last
# expression so the returned metrics dict is displayed.
trainer.evaluate(mode="test")

100%|██████████| 2357/2357 [00:42<00:00, 55.77it/s]


{'precision': 0.11671616461603734,
 'recall': 0.1662591039440097,
 'ndcg': 0.15995339543947557}