In [None]:
import os
import sys
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import TIFUKNN
import torch
import random
import numpy as np
import optuna
import warnings
warnings.filterwarnings("ignore")

# TaFeng

Fix seed:

In [None]:
# One seed shared by every RNG used in this notebook; the Optuna sampler in the tuning cell reuses it.
seed = 10
# PyTorch RNG.
torch.manual_seed(seed)
# Python standard-library RNG.
random.seed(seed)
# NumPy legacy global RNG. Kept last: it returns None, so the cell shows no output.
np.random.seed(seed)

Read the interaction data, filtering out users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Root data directory and the dataset key; both are passed to Preprocess, save_split and Corpus.
corpus_path = "./data/"
dataset_name = "ta_feng"

# Load and filter the raw interactions. Per the notebook text, 5 is the minimum number of
# transactions per user and 10 the minimum per item; filt=True presumably enables the
# filtering — TODO confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
# Write the processed dataset to disk (the log line reports the target directory).
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 32266, #items = 23812, #clicks = 817741 (#illegal records = 0)
After preprocessing: #users = 7358, #items = 11202, #clicks = 368951
Saving dataset in ./data//data_ta_feng/...


In [None]:
# Build the corpus object for this dataset and load its data
# (presumably the files written by save_split — confirm against Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Trainer shared by all tuning trials; constructing it prepares the train/dev/test datasets
# (see the progress bars below). Metrics are computed on the top-10 recommendations.
# NOTE(review): max_epochs and early_stop_num are None — presumably unused by TIFUKNN; confirm.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

train dataset preparing...


100%|██████████| 7358/7358 [00:12<00:00, 604.73it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 5557.89it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3054.27it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one TIFUKNN configuration on the dev split.

    Draws the five TIFUKNN hyperparameters from ``trial``, builds the model on
    the notebook-level ``corpus``, hands it to the notebook-level ``trainer``
    and evaluates it in ``"dev"`` mode.

    Search space (all on a discrete grid):
        group_num:             integers 2..10, step 1
        within_decay_rate:     0.1..1.0, step 0.1
        group_decay_rate:      0.1..1.0, step 0.1
        nearest_neighbors_num: integers 25..500, step 25
        alpha:                 0.0..1.0, step 0.1

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The ``"ndcg"`` entry of the metrics returned by ``trainer.evaluate``.
    """
    # Hyperparameters are drawn in this fixed order; the dict keys double as
    # the TIFUKNN keyword-argument names.
    sampled = {
        "group_num": trial.suggest_int("group_num", low=2, high=10, step=1),
        "within_decay_rate": trial.suggest_float("within_decay_rate", 0.1, 1.0, step=0.1),
        "group_decay_rate": trial.suggest_float("group_decay_rate", 0.1, 1.0, step=0.1),
        "nearest_neighbors_num": trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25),
        "alpha": trial.suggest_float("alpha", 0.0, 1.0, step=0.1),
    }
    candidate = TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **sampled,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=seed)
# objective returns the dev NDCG, hence direction="maximize".
study = optuna.create_study(direction="maximize", sampler=sampler)
# Run 25 trials; the log below shows TIFUKNN being refitted and evaluated for each trial.
study.optimize(objective, n_trials=25)

[32m[I 2023-04-23 21:05:19,176][0m A new study created in memory with name: no-name-a58cd8bc-cd3c-4819-8da4-03b99e67430e[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:12<00:00, 603.22it/s]
100%|██████████| 7357/7357 [08:34<00:00, 14.30it/s]
[32m[I 2023-04-23 21:14:09,924][0m Trial 0 finished with value: 0.08107959365618923 and parameters: {'group_num': 8, 'within_decay_rate': 0.1, 'group_decay_rate': 0.7000000000000001, 'nearest_neighbors_num': 375, 'alpha': 0.5}. Best is trial 0 with value: 0.08107959365618923.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 695.30it/s] 
100%|██████████| 7357/7357 [05:28<00:00, 22.40it/s]
[32m[I 2023-04-23 21:19:53,719][0m Trial 1 finished with value: 0.06082132891301551 and parameters: {'group_num': 4, 'within_decay_rate': 0.2, 'group_decay_rate': 0.8, 'nearest_neighbors_num': 100, 'alpha': 0.0}. Best is trial 0 with value: 0.08107959365618923.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 677.96it/s] 
100%|██████████| 7357/7357 [06:13<00:00, 19.67it/s]
[32m[I 2023-04-23 21:26:22,753][0m Trial 2 finished with value: 0.08430556349343167 and parameters: {'group_num': 8, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 275, 'alpha': 0.8}. Best is trial 2 with value: 0.08430556349343167.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 674.91it/s] 
100%|██████████| 7357/7357 [09:35<00:00, 12.78it/s]
[32m[I 2023-04-23 21:36:13,864][0m Trial 3 finished with value: 0.08654990976757163 and parameters: {'group_num': 7, 'within_decay_rate': 0.8, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 475, 'alpha': 0.7000000000000001}. Best is trial 3 with value: 0.08654990976757163.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 677.84it/s] 
100%|██████████| 7357/7357 [06:36<00:00, 18.56it/s]
[32m[I 2023-04-23 21:43:06,630][0m Trial 4 finished with value: 0.07926344407371379 and parameters: {'group_num': 6, 'within_decay_rate': 0.2, 'group_decay_rate': 0.4, 'nearest_neighbors_num': 350, 'alpha': 0.4}. Best is trial 3 with value: 0.08654990976757163.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 690.37it/s] 
100%|██████████| 7357/7357 [06:36<00:00, 18.57it/s]
[32m[I 2023-04-23 21:49:58,191][0m Trial 5 finished with value: 0.08945295740812033 and parameters: {'group_num': 5, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 350, 'alpha': 0.6000000000000001}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 660.69it/s]
100%|██████████| 7357/7357 [05:46<00:00, 21.21it/s]
[32m[I 2023-04-23 21:56:00,851][0m Trial 6 finished with value: 0.08372734063630134 and parameters: {'group_num': 9, 'within_decay_rate': 0.6, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 175, 'alpha': 0.0}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 730.49it/s] 
100%|██████████| 7357/7357 [05:07<00:00, 23.94it/s]
[32m[I 2023-04-23 22:01:22,474][0m Trial 7 finished with value: 0.06873019890689729 and parameters: {'group_num': 4, 'within_decay_rate': 0.2, 'group_decay_rate': 0.9, 'nearest_neighbors_num': 25, 'alpha': 0.6000000000000001}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:09<00:00, 744.54it/s] 
100%|██████████| 7357/7357 [09:17<00:00, 13.20it/s]
[32m[I 2023-04-23 22:10:55,535][0m Trial 8 finished with value: 0.08384626944597413 and parameters: {'group_num': 6, 'within_decay_rate': 0.9, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 450, 'alpha': 0.30000000000000004}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 661.17it/s] 
100%|██████████| 7357/7357 [06:07<00:00, 20.04it/s]
[32m[I 2023-04-23 22:17:18,082][0m Trial 9 finished with value: 0.07921680102315841 and parameters: {'group_num': 8, 'within_decay_rate': 0.30000000000000004, 'group_decay_rate': 0.9, 'nearest_neighbors_num': 175, 'alpha': 0.1}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 717.65it/s] 
100%|██████████| 7357/7357 [07:07<00:00, 17.20it/s]
[32m[I 2023-04-23 22:24:42,630][0m Trial 10 finished with value: 0.08790284253796835 and parameters: {'group_num': 2, 'within_decay_rate': 0.6, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 300, 'alpha': 0.9}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:09<00:00, 782.63it/s] 
100%|██████████| 7357/7357 [06:39<00:00, 18.43it/s]
[32m[I 2023-04-23 22:31:37,959][0m Trial 11 finished with value: 0.08264049804711332 and parameters: {'group_num': 2, 'within_decay_rate': 0.6, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 300, 'alpha': 1.0}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 689.14it/s] 
100%|██████████| 7357/7357 [06:23<00:00, 19.20it/s]
[32m[I 2023-04-23 22:38:16,564][0m Trial 12 finished with value: 0.08105070533496007 and parameters: {'group_num': 2, 'within_decay_rate': 0.5, 'group_decay_rate': 0.5, 'nearest_neighbors_num': 225, 'alpha': 1.0}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 672.98it/s] 
100%|██████████| 7357/7357 [09:23<00:00, 13.06it/s]
[32m[I 2023-04-23 22:47:56,044][0m Trial 13 finished with value: 0.08520707449631412 and parameters: {'group_num': 4, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 375, 'alpha': 0.8}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 646.17it/s] 
100%|██████████| 7357/7357 [10:19<00:00, 11.88it/s]
[32m[I 2023-04-23 22:58:31,716][0m Trial 14 finished with value: 0.0866371324162653 and parameters: {'group_num': 3, 'within_decay_rate': 0.4, 'group_decay_rate': 0.5, 'nearest_neighbors_num': 425, 'alpha': 0.8}. Best is trial 5 with value: 0.08945295740812033.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 613.22it/s]
100%|██████████| 7357/7357 [07:20<00:00, 16.69it/s]
[32m[I 2023-04-23 23:06:11,639][0m Trial 15 finished with value: 0.09373774487756065 and parameters: {'group_num': 5, 'within_decay_rate': 0.8, 'group_decay_rate': 0.7000000000000001, 'nearest_neighbors_num': 325, 'alpha': 0.30000000000000004}. Best is trial 15 with value: 0.09373774487756065.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 625.34it/s]
100%|██████████| 7357/7357 [11:15<00:00, 10.89it/s]
[32m[I 2023-04-23 23:17:44,472][0m Trial 16 finished with value: 0.09439287006269607 and parameters: {'group_num': 5, 'within_decay_rate': 0.8, 'group_decay_rate': 0.7000000000000001, 'nearest_neighbors_num': 500, 'alpha': 0.2}. Best is trial 16 with value: 0.09439287006269607.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:12<00:00, 595.62it/s]
100%|██████████| 7357/7357 [10:33<00:00, 11.61it/s]
[32m[I 2023-04-23 23:28:35,795][0m Trial 17 finished with value: 0.10574871547190032 and parameters: {'group_num': 5, 'within_decay_rate': 1.0, 'group_decay_rate': 0.8, 'nearest_neighbors_num': 500, 'alpha': 0.2}. Best is trial 17 with value: 0.10574871547190032.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 620.51it/s] 
100%|██████████| 7357/7357 [10:15<00:00, 11.96it/s]
[32m[I 2023-04-23 23:39:10,153][0m Trial 18 finished with value: 0.1047040898611304 and parameters: {'group_num': 10, 'within_decay_rate': 1.0, 'group_decay_rate': 0.8, 'nearest_neighbors_num': 500, 'alpha': 0.2}. Best is trial 17 with value: 0.10574871547190032.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 675.10it/s] 
100%|██████████| 7357/7357 [09:01<00:00, 13.58it/s]
[32m[I 2023-04-23 23:48:29,944][0m Trial 19 finished with value: 0.11085931477372793 and parameters: {'group_num': 9, 'within_decay_rate': 1.0, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 425, 'alpha': 0.2}. Best is trial 19 with value: 0.11085931477372793.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 656.08it/s] 
100%|██████████| 7357/7357 [08:51<00:00, 13.85it/s]
[32m[I 2023-04-23 23:57:36,757][0m Trial 20 finished with value: 0.11092596806719811 and parameters: {'group_num': 10, 'within_decay_rate': 1.0, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 400, 'alpha': 0.4}. Best is trial 20 with value: 0.11092596806719811.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 677.97it/s]
100%|██████████| 7357/7357 [08:59<00:00, 13.65it/s]
[32m[I 2023-04-24 00:06:52,476][0m Trial 21 finished with value: 0.11105743291942718 and parameters: {'group_num': 10, 'within_decay_rate': 1.0, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 425, 'alpha': 0.4}. Best is trial 21 with value: 0.11105743291942718.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 669.17it/s] 
100%|██████████| 7357/7357 [09:01<00:00, 13.59it/s]
[32m[I 2023-04-24 00:16:09,582][0m Trial 22 finished with value: 0.10950544725367108 and parameters: {'group_num': 10, 'within_decay_rate': 0.9, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 425, 'alpha': 0.4}. Best is trial 21 with value: 0.11105743291942718.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:10<00:00, 677.76it/s]
100%|██████████| 7357/7357 [08:47<00:00, 13.94it/s]
[32m[I 2023-04-24 00:25:12,944][0m Trial 23 finished with value: 0.10939951387561178 and parameters: {'group_num': 9, 'within_decay_rate': 0.9, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 400, 'alpha': 0.4}. Best is trial 21 with value: 0.11105743291942718.[0m


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:11<00:00, 668.43it/s] 
100%|██████████| 7357/7357 [09:21<00:00, 13.11it/s]
[32m[I 2023-04-24 00:34:50,604][0m Trial 24 finished with value: 0.10918993090481564 and parameters: {'group_num': 10, 'within_decay_rate': 1.0, 'group_decay_rate': 0.9, 'nearest_neighbors_num': 450, 'alpha': 0.5}. Best is trial 21 with value: 0.11105743291942718.[0m


Test:

In [None]:
# Fresh trainer for the final run (this re-prepares the train/dev/test datasets).
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# study.best_params holds exactly the five tuned values, keyed by the names used in the
# suggest_* calls of objective, which are also the TIFUKNN keyword-argument names.
params = dict(
    model=TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **study.best_params,
    )
)

# Register the tuned model with the trainer (the log below shows TIFUKNN being fitted here).
trainer.init_hyperparams(**params)

train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 652.68it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 5238.99it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4507.02it/s]


TIFUKNN fitting...


100%|██████████| 7358/7358 [00:08<00:00, 876.22it/s] 


In [None]:
# Score the tuned model on the test split; the returned metrics dict is this cell's output.
trainer.evaluate(mode="test")

100%|██████████| 7357/7357 [06:00<00:00, 20.38it/s]


{'precision': 0.057401114584749224,
 'recall': 0.135966844520265,
 'ndcg': 0.11790728194701038}

# TaoBao

Fix seed:

In [None]:
# One seed shared by every RNG used in this notebook; the Optuna sampler in the tuning cell reuses it.
seed = 10
# PyTorch RNG.
torch.manual_seed(seed)
# Python standard-library RNG.
random.seed(seed)
# NumPy legacy global RNG. Kept last: it returns None, so the cell shows no output.
np.random.seed(seed)

Read the interaction data, filtering out users with fewer than 10 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Root data directory and the dataset key; both are passed to Preprocess, save_split and Corpus.
corpus_path = "./data/"
dataset_name = "taobao"

# Load and filter the raw interactions. Per the notebook text, the first 10 is the minimum
# number of transactions per user and the second 10 the minimum per item; filt=True
# presumably enables the filtering — TODO confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(10, 10, filt=True)
# Write the processed dataset to disk (the log line reports the target directory).
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 672404, #items = 638962, #clicks = 2015807 (#illegal records = 0)
After preprocessing: #users = 10092, #items = 22286, #clicks = 67991
Saving dataset in ./data//data_taobao/...


In [None]:
# Build the corpus object for this dataset and load its data
# (presumably the files written by save_split — confirm against Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Trainer shared by all tuning trials; constructing it prepares the train/dev/test datasets
# (see the progress bars below). Metrics are computed on the top-10 recommendations.
# NOTE(review): max_epochs and early_stop_num are None — presumably unused by TIFUKNN; confirm.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

train dataset preparing...


100%|██████████| 10092/10092 [00:34<00:00, 289.52it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 24261.11it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 18817.06it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one TIFUKNN configuration on the dev split.

    Draws the five TIFUKNN hyperparameters from ``trial``, builds the model on
    the notebook-level ``corpus``, hands it to the notebook-level ``trainer``
    and evaluates it in ``"dev"`` mode.

    Search space (all on a discrete grid):
        group_num:             integers 2..10, step 1
        within_decay_rate:     0.1..1.0, step 0.1
        group_decay_rate:      0.1..1.0, step 0.1
        nearest_neighbors_num: integers 25..500, step 25
        alpha:                 0.0..1.0, step 0.1

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The ``"ndcg"`` entry of the metrics returned by ``trainer.evaluate``.
    """
    # Hyperparameters are drawn in this fixed order; the dict keys double as
    # the TIFUKNN keyword-argument names.
    sampled = {
        "group_num": trial.suggest_int("group_num", low=2, high=10, step=1),
        "within_decay_rate": trial.suggest_float("within_decay_rate", 0.1, 1.0, step=0.1),
        "group_decay_rate": trial.suggest_float("group_decay_rate", 0.1, 1.0, step=0.1),
        "nearest_neighbors_num": trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25),
        "alpha": trial.suggest_float("alpha", 0.0, 1.0, step=0.1),
    }
    candidate = TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **sampled,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=seed)
# objective returns the dev NDCG, hence direction="maximize".
study = optuna.create_study(direction="maximize", sampler=sampler)
# Run 25 trials of the objective defined in the previous cell.
study.optimize(objective, n_trials=25)

[32m[I 2023-04-24 16:10:49,847][0m A new study created in memory with name: no-name-b404063c-a6ea-445a-987c-39d8846d2c55[0m
[32m[I 2023-04-24 16:10:49,852][0m Trial 0 finished with value: 0.057439564148598785 and parameters: {'group_num': 8, 'within_decay_rate': 0.1, 'group_decay_rate': 0.7000000000000001, 'nearest_neighbors_num': 375, 'alpha': 0.5}. Best is trial 0 with value: 0.057439564148598785.[0m
[32m[I 2023-04-24 16:10:49,857][0m Trial 1 finished with value: 0.013697305773809882 and parameters: {'group_num': 4, 'within_decay_rate': 0.2, 'group_decay_rate': 0.8, 'nearest_neighbors_num': 100, 'alpha': 0.0}. Best is trial 0 with value: 0.057439564148598785.[0m
[32m[I 2023-04-24 16:10:49,862][0m Trial 2 finished with value: 0.06671495591256227 and parameters: {'group_num': 8, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 275, 'alpha': 0.8}. Best is trial 2 with value: 0.06671495591256227.[0m
[32m[I 2023-04-24 16:10:49,866][0m Trial 3 finis

TIFUKNN fitting...


100%|██████████| 10092/10092 [00:27<00:00, 369.56it/s]
100%|██████████| 9307/9307 [30:48<00:00,  5.03it/s]
[32m[I 2023-04-24 16:42:21,613][0m Trial 15 finished with value: 0.06557456152146485 and parameters: {'group_num': 3, 'within_decay_rate': 1.0, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 400, 'alpha': 0.8}. Best is trial 10 with value: 0.0702955950429197.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:25<00:00, 392.02it/s]
100%|██████████| 9307/9307 [33:45<00:00,  4.60it/s]
[32m[I 2023-04-24 17:16:51,022][0m Trial 16 finished with value: 0.07006161576267411 and parameters: {'group_num': 2, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 500, 'alpha': 1.0}. Best is trial 10 with value: 0.0702955950429197.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:26<00:00, 387.53it/s]
100%|██████████| 9307/9307 [29:22<00:00,  5.28it/s]
[32m[I 2023-04-24 17:47:00,052][0m Trial 17 finished with value: 0.0597820342039382 and parameters: {'group_num': 4, 'within_decay_rate': 0.6, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 325, 'alpha': 0.8}. Best is trial 10 with value: 0.0702955950429197.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:26<00:00, 386.90it/s]
100%|██████████| 9307/9307 [32:12<00:00,  4.82it/s]
[32m[I 2023-04-24 18:19:54,945][0m Trial 18 finished with value: 0.07048305004009849 and parameters: {'group_num': 10, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 425, 'alpha': 0.9}. Best is trial 18 with value: 0.07048305004009849.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:26<00:00, 383.62it/s]
100%|██████████| 9307/9307 [31:35<00:00,  4.91it/s]
[32m[I 2023-04-24 18:52:14,053][0m Trial 19 finished with value: 0.06560007696234454 and parameters: {'group_num': 9, 'within_decay_rate': 0.4, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 425, 'alpha': 0.7000000000000001}. Best is trial 18 with value: 0.07048305004009849.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:25<00:00, 393.35it/s]
100%|██████████| 9307/9307 [28:23<00:00,  5.46it/s]
[32m[I 2023-04-24 19:21:19,891][0m Trial 20 finished with value: 0.06512384065815259 and parameters: {'group_num': 10, 'within_decay_rate': 0.9, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 300, 'alpha': 0.30000000000000004}. Best is trial 18 with value: 0.07048305004009849.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:24<00:00, 419.18it/s]
100%|██████████| 9307/9307 [32:02<00:00,  4.84it/s]
[32m[I 2023-04-24 19:54:02,976][0m Trial 21 finished with value: 0.06599882562807997 and parameters: {'group_num': 3, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 450, 'alpha': 0.9}. Best is trial 18 with value: 0.07048305004009849.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:24<00:00, 410.01it/s]
100%|██████████| 9307/9307 [33:21<00:00,  4.65it/s]
[32m[I 2023-04-24 20:28:05,436][0m Trial 22 finished with value: 0.06155363566136018 and parameters: {'group_num': 5, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 500, 'alpha': 0.9}. Best is trial 18 with value: 0.07048305004009849.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:26<00:00, 383.45it/s]
100%|██████████| 9307/9307 [31:07<00:00,  4.98it/s]
[32m[I 2023-04-24 20:59:56,908][0m Trial 23 finished with value: 0.07366860344168423 and parameters: {'group_num': 10, 'within_decay_rate': 0.6, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 400, 'alpha': 1.0}. Best is trial 23 with value: 0.07366860344168423.[0m


TIFUKNN fitting...


100%|██████████| 10092/10092 [00:26<00:00, 383.32it/s]
100%|██████████| 9307/9307 [31:15<00:00,  4.96it/s]
[32m[I 2023-04-24 21:31:55,106][0m Trial 24 finished with value: 0.07059738999058329 and parameters: {'group_num': 10, 'within_decay_rate': 0.6, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 400, 'alpha': 0.9}. Best is trial 23 with value: 0.07366860344168423.[0m


Test:

In [None]:
# Fresh trainer for the final run (this re-prepares the train/dev/test datasets).
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# study.best_params holds exactly the five tuned values, keyed by the names used in the
# suggest_* calls of objective, which are also the TIFUKNN keyword-argument names.
params = dict(
    model=TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **study.best_params,
    )
)

# Register the tuned model with the trainer (the log below shows TIFUKNN being fitted here).
trainer.init_hyperparams(**params)

train dataset preparing...


100%|██████████| 10092/10092 [00:41<00:00, 241.42it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 30982.39it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 22587.21it/s]

TIFUKNN fitting...



100%|██████████| 10092/10092 [00:15<00:00, 638.46it/s]


In [None]:
# Score the tuned model on the test split; the returned metrics dict is this cell's output.
trainer.evaluate(mode="test")

100%|██████████| 9307/9307 [24:05<00:00,  6.44it/s]


{'precision': 0.0076823895992263885,
 'recall': 0.07491672934350488,
 'ndcg': 0.052395387393749784}

# Dunnhumby

Fix seed:

In [None]:
# One seed shared by every RNG used in this notebook; the Optuna sampler in the tuning cell reuses it.
seed = 10
# PyTorch RNG.
torch.manual_seed(seed)
# Python standard-library RNG.
random.seed(seed)
# NumPy legacy global RNG. Kept last: it returns None, so the cell shows no output.
np.random.seed(seed)

Read the interaction data, filtering out users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Root data directory and the dataset key; both are passed to Preprocess, save_split and Corpus.
corpus_path = "./data/"
dataset_name = "dunnhumby"

# Load and filter the raw interactions. Per the notebook text, 5 is the minimum number of
# transactions per user and 10 the minimum per item; filt=True presumably enables the
# filtering — TODO confirm against Preprocess.load_data.
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)
# Write the processed dataset to disk (the log line reports the target directory).
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 2500, #items = 92339, #clicks = 2595370 (#illegal records = 0)
After preprocessing: #users = 2358, #items = 26756, #clicks = 1976796
Saving dataset in ./data//data_dunnhumby/...


In [None]:
# Build the corpus object for this dataset and load its data
# (presumably the files written by save_split — confirm against Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Tune the hyperparameters on the validation dataset:

In [None]:
# Trainer shared by all tuning trials; constructing it prepares the train/dev/test datasets
# (see the progress bars below). Metrics are computed on the top-10 recommendations.
# NOTE(review): max_epochs and early_stop_num are None — presumably unused by TIFUKNN; confirm.
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 271.42it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 222.97it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:09<00:00, 235.88it/s]


In [None]:
def objective(trial):
    """Optuna objective: score one TIFUKNN configuration on the dev split.

    Draws the five TIFUKNN hyperparameters from ``trial``, builds the model on
    the notebook-level ``corpus``, hands it to the notebook-level ``trainer``
    and evaluates it in ``"dev"`` mode.

    Search space (all on a discrete grid):
        group_num:             integers 2..10, step 1
        within_decay_rate:     0.1..1.0, step 0.1
        group_decay_rate:      0.1..1.0, step 0.1
        nearest_neighbors_num: integers 25..500, step 25
        alpha:                 0.0..1.0, step 0.1

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The ``"ndcg"`` entry of the metrics returned by ``trainer.evaluate``.
    """
    # Hyperparameters are drawn in this fixed order; the dict keys double as
    # the TIFUKNN keyword-argument names.
    sampled = {
        "group_num": trial.suggest_int("group_num", low=2, high=10, step=1),
        "within_decay_rate": trial.suggest_float("within_decay_rate", 0.1, 1.0, step=0.1),
        "group_decay_rate": trial.suggest_float("group_decay_rate", 0.1, 1.0, step=0.1),
        "nearest_neighbors_num": trial.suggest_int("nearest_neighbors_num", low=25, high=500, step=25),
        "alpha": trial.suggest_float("alpha", 0.0, 1.0, step=0.1),
    }
    candidate = TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **sampled,
    )

    trainer.init_hyperparams(model=candidate)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook seed so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=seed)
# objective returns the dev NDCG, hence direction="maximize".
study = optuna.create_study(direction="maximize", sampler=sampler)
# Run 25 trials; the log below shows TIFUKNN being refitted and evaluated for each trial.
study.optimize(objective, n_trials=25)

[32m[I 2023-04-24 12:33:18,099][0m A new study created in memory with name: no-name-96ff447b-41a0-4de7-9e69-adf3145aa2f1[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:32<00:00, 15.42it/s]
100%|██████████| 2357/2357 [05:29<00:00,  7.15it/s]
[32m[I 2023-04-24 12:41:27,001][0m Trial 0 finished with value: 0.09253603345186663 and parameters: {'group_num': 8, 'within_decay_rate': 0.1, 'group_decay_rate': 0.7000000000000001, 'nearest_neighbors_num': 375, 'alpha': 0.5}. Best is trial 0 with value: 0.09253603345186663.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:56<00:00, 20.23it/s]
100%|██████████| 2357/2357 [02:59<00:00, 13.17it/s]
[32m[I 2023-04-24 12:46:28,086][0m Trial 1 finished with value: 0.07525198420938144 and parameters: {'group_num': 4, 'within_decay_rate': 0.2, 'group_decay_rate': 0.8, 'nearest_neighbors_num': 100, 'alpha': 0.0}. Best is trial 0 with value: 0.09253603345186663.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:29<00:00, 15.81it/s]
100%|██████████| 2357/2357 [04:26<00:00,  8.85it/s]
[32m[I 2023-04-24 12:53:29,415][0m Trial 2 finished with value: 0.15614767582307007 and parameters: {'group_num': 8, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 275, 'alpha': 0.8}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:24<00:00, 16.31it/s]
100%|██████████| 2357/2357 [06:26<00:00,  6.10it/s]
[32m[I 2023-04-24 13:02:26,446][0m Trial 3 finished with value: 0.13803990338157807 and parameters: {'group_num': 7, 'within_decay_rate': 0.8, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 475, 'alpha': 0.7000000000000001}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:16<00:00, 17.25it/s]
100%|██████████| 2357/2357 [05:29<00:00,  7.15it/s]
[32m[I 2023-04-24 13:10:20,360][0m Trial 4 finished with value: 0.10143512336473273 and parameters: {'group_num': 6, 'within_decay_rate': 0.2, 'group_decay_rate': 0.4, 'nearest_neighbors_num': 350, 'alpha': 0.4}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:06<00:00, 18.63it/s]
100%|██████████| 2357/2357 [05:28<00:00,  7.17it/s]
[32m[I 2023-04-24 13:18:03,066][0m Trial 5 finished with value: 0.13410804614024685 and parameters: {'group_num': 5, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 350, 'alpha': 0.6000000000000001}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:33<00:00, 15.37it/s]
100%|██████████| 2357/2357 [03:26<00:00, 11.41it/s]
[32m[I 2023-04-24 13:24:09,831][0m Trial 6 finished with value: 0.09633295542051265 and parameters: {'group_num': 9, 'within_decay_rate': 0.6, 'group_decay_rate': 1.0, 'nearest_neighbors_num': 175, 'alpha': 0.0}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:01<00:00, 19.45it/s]
100%|██████████| 2357/2357 [02:38<00:00, 14.89it/s]
[32m[I 2023-04-24 13:28:55,209][0m Trial 7 finished with value: 0.095779054181922 and parameters: {'group_num': 4, 'within_decay_rate': 0.2, 'group_decay_rate': 0.9, 'nearest_neighbors_num': 25, 'alpha': 0.6000000000000001}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:14<00:00, 17.57it/s]
100%|██████████| 2357/2357 [06:16<00:00,  6.26it/s]
[32m[I 2023-04-24 13:37:35,435][0m Trial 8 finished with value: 0.1527917644962287 and parameters: {'group_num': 6, 'within_decay_rate': 0.9, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 450, 'alpha': 0.30000000000000004}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:28<00:00, 15.87it/s]
100%|██████████| 2357/2357 [03:36<00:00, 10.88it/s]
[32m[I 2023-04-24 13:43:47,379][0m Trial 9 finished with value: 0.0879922526124206 and parameters: {'group_num': 8, 'within_decay_rate': 0.30000000000000004, 'group_decay_rate': 0.9, 'nearest_neighbors_num': 175, 'alpha': 0.1}. Best is trial 2 with value: 0.15614767582307007.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:45<00:00, 22.33it/s]
100%|██████████| 2357/2357 [04:17<00:00,  9.14it/s]
[32m[I 2023-04-24 13:49:58,997][0m Trial 10 finished with value: 0.1615120370126956 and parameters: {'group_num': 2, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 250, 'alpha': 1.0}. Best is trial 10 with value: 0.1615120370126956.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:46<00:00, 22.13it/s]
100%|██████████| 2357/2357 [03:32<00:00, 11.12it/s]
[32m[I 2023-04-24 13:55:23,091][0m Trial 11 finished with value: 0.1615120370126956 and parameters: {'group_num': 2, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 250, 'alpha': 1.0}. Best is trial 10 with value: 0.1615120370126956.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:45<00:00, 22.28it/s]
100%|██████████| 2357/2357 [03:50<00:00, 10.21it/s]
[32m[I 2023-04-24 14:01:07,402][0m Trial 12 finished with value: 0.1615120370126956 and parameters: {'group_num': 2, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 225, 'alpha': 1.0}. Best is trial 10 with value: 0.1615120370126956.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:47<00:00, 21.99it/s]
100%|██████████| 2357/2357 [04:45<00:00,  8.25it/s]
[32m[I 2023-04-24 14:07:45,877][0m Trial 13 finished with value: 0.10810371822947634 and parameters: {'group_num': 2, 'within_decay_rate': 0.5, 'group_decay_rate': 0.4, 'nearest_neighbors_num': 275, 'alpha': 1.0}. Best is trial 10 with value: 0.1615120370126956.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:53<00:00, 20.85it/s]
100%|██████████| 2357/2357 [03:30<00:00, 11.20it/s]
[32m[I 2023-04-24 14:13:14,822][0m Trial 14 finished with value: 0.14149236243295835 and parameters: {'group_num': 3, 'within_decay_rate': 0.8, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 200, 'alpha': 0.9}. Best is trial 10 with value: 0.1615120370126956.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:51<00:00, 21.09it/s]
100%|██████████| 2357/2357 [02:52<00:00, 13.67it/s]
[32m[I 2023-04-24 14:18:04,741][0m Trial 15 finished with value: 0.17068007003990598 and parameters: {'group_num': 3, 'within_decay_rate': 1.0, 'group_decay_rate': 0.1, 'nearest_neighbors_num': 100, 'alpha': 0.8}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:59<00:00, 19.72it/s]
100%|██████████| 2357/2357 [02:48<00:00, 13.97it/s]
[32m[I 2023-04-24 14:23:00,960][0m Trial 16 finished with value: 0.11353122240930302 and parameters: {'group_num': 4, 'within_decay_rate': 0.5, 'group_decay_rate': 0.5, 'nearest_neighbors_num': 75, 'alpha': 0.8}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:52<00:00, 20.98it/s]
100%|██████████| 2357/2357 [02:57<00:00, 13.27it/s]
[32m[I 2023-04-24 14:27:59,341][0m Trial 17 finished with value: 0.14566834996004863 and parameters: {'group_num': 3, 'within_decay_rate': 0.8, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 125, 'alpha': 0.8}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:49<00:00, 21.55it/s]
100%|██████████| 2357/2357 [02:38<00:00, 14.87it/s]
[32m[I 2023-04-24 14:32:33,510][0m Trial 18 finished with value: 0.16233130600796747 and parameters: {'group_num': 3, 'within_decay_rate': 0.9, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 25, 'alpha': 0.9}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:05<00:00, 18.74it/s]
100%|██████████| 2357/2357 [02:37<00:00, 14.95it/s]
[32m[I 2023-04-24 14:37:25,254][0m Trial 19 finished with value: 0.13034484311075023 and parameters: {'group_num': 5, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.30000000000000004, 'nearest_neighbors_num': 25, 'alpha': 0.7000000000000001}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:52<00:00, 21.04it/s]
100%|██████████| 2357/2357 [02:48<00:00, 13.96it/s]
[32m[I 2023-04-24 14:42:11,612][0m Trial 20 finished with value: 0.16392089723355227 and parameters: {'group_num': 3, 'within_decay_rate': 0.9, 'group_decay_rate': 0.5, 'nearest_neighbors_num': 75, 'alpha': 0.30000000000000004}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:52<00:00, 21.00it/s]
100%|██████████| 2357/2357 [02:48<00:00, 14.00it/s]
[32m[I 2023-04-24 14:46:57,747][0m Trial 21 finished with value: 0.15856816582491384 and parameters: {'group_num': 3, 'within_decay_rate': 0.9, 'group_decay_rate': 0.5, 'nearest_neighbors_num': 75, 'alpha': 0.2}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [02:02<00:00, 19.32it/s]
100%|██████████| 2357/2357 [02:58<00:00, 13.19it/s]
[32m[I 2023-04-24 14:52:06,481][0m Trial 22 finished with value: 0.1599942690657107 and parameters: {'group_num': 5, 'within_decay_rate': 0.9, 'group_decay_rate': 0.4, 'nearest_neighbors_num': 125, 'alpha': 0.30000000000000004}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:49<00:00, 21.62it/s]
100%|██████████| 2357/2357 [02:44<00:00, 14.32it/s]
[32m[I 2023-04-24 14:56:47,045][0m Trial 23 finished with value: 0.1352820656812531 and parameters: {'group_num': 3, 'within_decay_rate': 0.7000000000000001, 'group_decay_rate': 0.6, 'nearest_neighbors_num': 50, 'alpha': 0.5}. Best is trial 15 with value: 0.17068007003990598.[0m


TIFUKNN fitting...


100%|██████████| 2358/2358 [01:56<00:00, 20.24it/s]
100%|██████████| 2357/2357 [03:00<00:00, 13.05it/s]
[32m[I 2023-04-24 15:01:50,118][0m Trial 24 finished with value: 0.1575971068680349 and parameters: {'group_num': 4, 'within_decay_rate': 0.9, 'group_decay_rate': 0.2, 'nearest_neighbors_num': 150, 'alpha': 0.9}. Best is trial 15 with value: 0.17068007003990598.[0m


Test:

In [None]:
# Fresh trainer for the final run (this re-prepares the train/dev/test datasets).
trainer = NBRTrainer(corpus=corpus, max_epochs=None, topk=10, early_stop_num=None)

# study.best_params holds exactly the five tuned values, keyed by the names used in the
# suggest_* calls of objective, which are also the TIFUKNN keyword-argument names.
params = dict(
    model=TIFUKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        corpus=corpus,
        **study.best_params,
    )
)

# Register the tuned model with the trainer (the log below shows TIFUKNN being fitted here).
trainer.init_hyperparams(**params)

train dataset preparing...


100%|██████████| 2358/2358 [00:09<00:00, 243.37it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:08<00:00, 279.67it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:09<00:00, 240.19it/s]


TIFUKNN fitting...


100%|██████████| 2358/2358 [00:43<00:00, 54.73it/s]


In [None]:
# Score the tuned model on the test split; the returned metrics dict is this cell's output.
trainer.evaluate(mode="test")

100%|██████████| 2357/2357 [01:52<00:00, 20.92it/s]


{'precision': 0.11565549427238016,
 'recall': 0.16531576301767698,
 'ndcg': 0.16128563456518297}