In [None]:
import os
import sys
# Add the parent directory to the import path; presumably this is what makes
# the project-local `nbr` package below importable from this notebook.
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import RepurchaseModule, NBRKNN
import torch
import random
import numpy as np
import optuna
import warnings
# Suppress every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# TaFeng

Fix seed:

In [None]:
# Seed PyTorch, Python's `random` and NumPy (in that order) with one value.
seed = 10
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(seed)

Read the interaction data and filter it: remove users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Location and name of the raw TaFeng dataset.
corpus_path = "./data/"
dataset_name = "ta_feng"

# Thresholds passed positionally to `load_data`. Going by the notebook text
# they are presumably (min transactions per user, min transactions per item)
# -- TODO confirm against `Preprocess.load_data`.
min_user_transactions, min_item_transactions = 5, 10

preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(min_user_transactions, min_item_transactions, filt=True)
# Write the processed split out under `corpus_path`.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 32266, #items = 23812, #clicks = 817741 (#illegal records = 0)
After preprocessing: #users = 7358, #items = 11202, #clicks = 368951
Saving dataset in ./data//data_ta_feng/...


In [None]:
# Create the Corpus for this dataset path/name and call its `load_data()`.
# Later cells read `corpus.n_items`, `corpus.n_users` and
# `corpus.total_avg_interval` from this object.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train Repurchase Module with best hyperparams (see ./testing_repurchasemodule.ipynb):

In [None]:
# Trainer used to fit the Repurchase Module. The settings are collected first
# and then unpacked into the constructor; constructing the trainer also
# prepares the train/dev/test datasets (see the cell output).
repurchase_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3,
)
trainer = NBRTrainer(**repurchase_trainer_kwargs)

train dataset preparing...


100%|██████████| 7358/7358 [00:15<00:00, 461.82it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:03<00:00, 2337.01it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2753.44it/s]


In [None]:
# Best Repurchase Module hyperparameters (see ./testing_repurchasemodule.ipynb).
slrc_best_params = {
    'batch_size': 256,
    'lr': 0.00011201144001505824,
    'l2_reg_coef': 0.00011498224071460201,
}

repurchase_model = RepurchaseModule(
    item_num=corpus.n_items,
    avg_repeat_interval=corpus.total_avg_interval,
)

# The tuned dict holds exactly the three training hyperparameters
# (batch_size, lr, l2_reg_coef), so it is unpacked next to the model.
params = {"model": repurchase_model, **slrc_best_params}

trainer.init_hyperparams(**params)

In [None]:
# Fit the model installed via `init_hyperparams` in the previous cell.
# Kept as the cell's last expression so the returned object is displayed.
trainer.train()

Epoch 1:


Batch loss = 0.676425: 100%|██████████| 1112/1112 [00:25<00:00, 43.51it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [02:11<00:00, 55.96it/s]


 {'precision': 0.05195052331113225, 'recall': 0.11515703682338883, 'ndcg': 0.10148815950513894}
Epoch 2:



Batch loss = 0.676221: 100%|██████████| 1112/1112 [00:23<00:00, 48.28it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:09<00:00, 56.70it/s]


 {'precision': 0.05242626070409134, 'recall': 0.11624861635726887, 'ndcg': 0.10236469000386939}
Epoch 3:



Batch loss = 0.676014: 100%|██████████| 1112/1112 [00:24<00:00, 44.81it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:13<00:00, 55.12it/s]


 {'precision': 0.052657333152100035, 'recall': 0.11643473131931699, 'ndcg': 0.10294663089929013}
Epoch 4:



Batch loss = 0.675803: 100%|██████████| 1112/1112 [00:22<00:00, 49.18it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:11<00:00, 55.96it/s]


 {'precision': 0.0527117031398668, 'recall': 0.11636928241260278, 'ndcg': 0.10297710339558913}
Epoch 5:



Batch loss = 0.67559: 100%|██████████| 1112/1112 [00:22<00:00, 50.34it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:16<00:00, 53.81it/s]


 {'precision': 0.052820443115400295, 'recall': 0.1163978595344264, 'ndcg': 0.10288667894946825}
Epoch 6:



Batch loss = 0.675375: 100%|██████████| 1112/1112 [00:22<00:00, 48.75it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [02:12<00:00, 55.54it/s]


 {'precision': 0.05286122060622536, 'recall': 0.11661878096090342, 'ndcg': 0.10300096181938187}
Epoch 7:



Batch loss = 0.675158: 100%|██████████| 1112/1112 [00:21<00:00, 50.58it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:14<00:00, 54.87it/s]


 {'precision': 0.052752480630691864, 'recall': 0.11635567534079697, 'ndcg': 0.10284861376242276}
Epoch 8:



Batch loss = 0.674938: 100%|██████████| 1112/1112 [00:22<00:00, 48.85it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:10<00:00, 56.37it/s]


 {'precision': 0.05267092564904173, 'recall': 0.11613774389070566, 'ndcg': 0.10256797513574398}
Epoch 9:



Batch loss = 0.674717: 100%|██████████| 1112/1112 [00:23<00:00, 48.21it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [02:12<00:00, 55.63it/s]


 {'precision': 0.05253500067962485, 'recall': 0.11583893124912728, 'ndcg': 0.10218510742709311}





SLRCSimple()

Save the users' embeddings for the validation step (a user's embedding for the validation step is their vector of recommendation scores for all items at the time of validation):

In [None]:
# Predictions of the trained Repurchase Module in "dev" mode; the tuning cell
# below passes them to NBRKNN as `user_emb`.
dev_user_emb = trainer.get_predictions(mode="dev")

100%|██████████| 7357/7357 [02:09<00:00, 56.77it/s]


Tune Neighborhood Module hyperparams on validation dataset:

In [None]:
# Fresh trainer for the Neighborhood Module. `max_epochs` and
# `early_stop_num` are None; the tuning cell only calls `evaluate` on it.
knn_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None,
)
trainer = NBRTrainer(**knn_trainer_kwargs)

train dataset preparing...


100%|██████████| 7358/7358 [00:16<00:00, 456.30it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4256.65it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3648.96it/s]


In [None]:
def objective(trial):
    """Optuna objective: dev-set NDCG of an NBRKNN built from trial suggestions.

    Samples `nearest_neighbors_num` (integer in [1, 200]) and `alpha`
    (float in [0.0, 1.0], step 0.05) from `trial`, builds an `NBRKNN` on the
    notebook-level `corpus` and `dev_user_emb`, installs it on the
    notebook-level `trainer`, and returns the "ndcg" entry of the metrics
    produced by `trainer.evaluate(mode="dev")`.
    """
    neighbors_num = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    alpha_value = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn_model = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors_num,
        alpha=alpha_value,
        user_emb=dev_user_emb,
    )

    trainer.init_hyperparams(model=knn_model)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# TPE sampler seeded with the notebook-level `seed`.
sampler = optuna.samplers.TPESampler(seed=seed)
# Maximize the value returned by `objective` (dev NDCG) over 25 trials.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[32m[I 2023-05-02 16:14:51,370][0m A new study created in memory with name: no-name-78ada8d3-286c-4a92-a7e1-b75ef3db60e8[0m
100%|██████████| 7357/7357 [06:52<00:00, 17.81it/s]
[32m[I 2023-05-02 16:21:52,078][0m Trial 0 finished with value: 0.09548180430581692 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.09548180430581692.[0m
100%|██████████| 7357/7357 [06:44<00:00, 18.17it/s]
[32m[I 2023-05-02 16:28:42,677][0m Trial 1 finished with value: 0.10705625742842506 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.10705625742842506.[0m
100%|██████████| 7357/7357 [07:07<00:00, 17.19it/s]
[32m[I 2023-05-02 16:35:56,900][0m Trial 2 finished with value: 0.11094634153757461 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 2 with value: 0.11094634153757461.[0m
100%|██████████| 7357/7357 [06:41<00:00, 18.34it/s]
[32m[I 2023-05-02 16:42:45,257][0m Trial 3 finished with va

Test TAIWI (calculate scores for different seeds):

In [None]:
# One independent, initially empty list per reported metric; the test loop
# appends one value per seed.
test_metrics = {metric_name: [] for metric_name in ("precision", "recall", "ndcg")}

In [None]:
# Final evaluation over several seeds. For each seed: retrain the Repurchase
# Module, compute its dev/test predictions, then evaluate an NBRKNN (using the
# best hyperparameters found by `study`) on the test set and record the metrics.
#
# The loop variable is called `run_seed` on purpose: naming it `seed` would
# overwrite the notebook-level `seed` that seeds the Optuna sampler, so
# re-running the tuning cell afterwards would silently use a different seed.
for run_seed in range(5):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Fresh trainer and Repurchase Module for this seed.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": RepurchaseModule(
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # Predictions of the trained Repurchase Module for both evaluation splits.
    dev_user_emb = trainer.get_predictions(mode="dev")
    test_user_emb = trainer.get_predictions(mode="test")

    # Second trainer, used only to evaluate the Neighborhood Module.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=dev_user_emb
        )
    }
    # NOTE(review): the model is constructed with the dev embeddings and then
    # `set_emb` is called with the test embeddings -- confirm the intended
    # role of each against NBRKNN.
    params["model"].set_emb(test_user_emb)

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    # Record this seed's score for every tracked metric.
    for metric_name, per_seed_values in test_metrics.items():
        per_seed_values.append(metrics[metric_name])
    print(test_metrics)


___SEED___3
train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 627.64it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 5010.40it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 3886.18it/s]


Epoch 1:


Batch loss = 0.676474: 100%|██████████| 1112/1112 [00:17<00:00, 64.05it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:23<00:00, 88.63it/s]


 {'precision': 0.05195052331113225, 'recall': 0.1156230928626154, 'ndcg': 0.10197960100791124}
Epoch 2:



Batch loss = 0.67627: 100%|██████████| 1112/1112 [00:16<00:00, 65.71it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:19<00:00, 93.10it/s]


 {'precision': 0.05231752072855783, 'recall': 0.11603974059049585, 'ndcg': 0.10256831109125367}
Epoch 3:



Batch loss = 0.676062: 100%|██████████| 1112/1112 [00:16<00:00, 66.24it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.30it/s] 


 {'precision': 0.05245344569797472, 'recall': 0.11591048006986918, 'ndcg': 0.1027257458142659}
Epoch 4:



Batch loss = 0.675851: 100%|██████████| 1112/1112 [00:16<00:00, 66.28it/s]



Evaluation (dev):


100%|██████████| 7357/7357 [01:21<00:00, 90.74it/s] 


 {'precision': 0.05250781568574146, 'recall': 0.11575319397368874, 'ndcg': 0.10271832963370045}
Epoch 5:



Batch loss = 0.675638: 100%|██████████| 1112/1112 [00:16<00:00, 69.11it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.73it/s] 


 {'precision': 0.052643740655158346, 'recall': 0.11614537193572409, 'ndcg': 0.10285922667664568}
Epoch 6:



Batch loss = 0.675423: 100%|██████████| 1112/1112 [00:16<00:00, 66.86it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.51it/s] 


 {'precision': 0.05276607312763355, 'recall': 0.11655349943530233, 'ndcg': 0.10290240449312202}
Epoch 7:



Batch loss = 0.675206: 100%|██████████| 1112/1112 [00:16<00:00, 68.26it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.43it/s] 


 {'precision': 0.052779665624575235, 'recall': 0.11634911496685268, 'ndcg': 0.10273790106213881}
Epoch 8:



Batch loss = 0.674986: 100%|██████████| 1112/1112 [00:16<00:00, 67.67it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.57it/s] 


 {'precision': 0.05256218567350823, 'recall': 0.11578826871917293, 'ndcg': 0.10241205444898473}
Epoch 9:



Batch loss = 0.674764: 100%|██████████| 1112/1112 [00:16<00:00, 67.08it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:20<00:00, 91.19it/s] 


 {'precision': 0.05249422318879978, 'recall': 0.11560075292633441, 'ndcg': 0.10212504761594665}



100%|██████████| 7357/7357 [01:18<00:00, 93.63it/s]
100%|██████████| 7357/7357 [01:18<00:00, 93.41it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 636.48it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4731.95it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4102.49it/s]
100%|██████████| 7357/7357 [04:46<00:00, 25.68it/s]


{'precision': [0.06676634497757239, 0.06725567486747315, 0.06739159983689004, 0.06713334239499796], 'recall': [0.16330173977286047, 0.1645299677770667, 0.16432525138329832, 0.16452850132144814], 'ndcg': [0.12485593594413333, 0.1264572343117576, 0.1275585838242084, 0.12585491718264302]}

___SEED___4
train dataset preparing...


100%|██████████| 7358/7358 [00:09<00:00, 751.60it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2896.84it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 3387.62it/s]

Epoch 1:



Batch loss = 0.676412: 100%|██████████| 1112/1112 [00:17<00:00, 64.95it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.19it/s] 


 {'precision': 0.0516786733722985, 'recall': 0.11499818536387797, 'ndcg': 0.10150591490811248}
Epoch 2:



Batch loss = 0.676208: 100%|██████████| 1112/1112 [00:16<00:00, 66.55it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.03it/s] 


 {'precision': 0.0521272257713742, 'recall': 0.11563012852444533, 'ndcg': 0.10222992158679849}
Epoch 3:



Batch loss = 0.676: 100%|██████████| 1112/1112 [00:16<00:00, 66.03it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:22<00:00, 89.46it/s] 


 {'precision': 0.05257577817044992, 'recall': 0.11628381424230773, 'ndcg': 0.10269160061530277}
Epoch 4:



Batch loss = 0.67579: 100%|██████████| 1112/1112 [00:16<00:00, 65.86it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.45it/s] 


 {'precision': 0.05252140818268316, 'recall': 0.11570567777503674, 'ndcg': 0.10242643734778803}
Epoch 5:



Batch loss = 0.675577: 100%|██████████| 1112/1112 [00:16<00:00, 66.29it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.44it/s] 


 {'precision': 0.052752480630691864, 'recall': 0.1162870672163725, 'ndcg': 0.10264102934094418}
Epoch 6:



Batch loss = 0.675362: 100%|██████████| 1112/1112 [00:16<00:00, 67.08it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.14it/s]


 {'precision': 0.052752480630691864, 'recall': 0.11651367722218438, 'ndcg': 0.10271090534626744}
Epoch 7:



Batch loss = 0.675144: 100%|██████████| 1112/1112 [00:16<00:00, 66.73it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.02it/s]


 {'precision': 0.05271170313986679, 'recall': 0.11629664877014166, 'ndcg': 0.10254315420184261}
Epoch 8:



Batch loss = 0.674925: 100%|██████████| 1112/1112 [00:16<00:00, 65.67it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 90.06it/s]


 {'precision': 0.052602963164333286, 'recall': 0.11587463724567974, 'ndcg': 0.10225979437727453}
Epoch 9:



Batch loss = 0.674703: 100%|██████████| 1112/1112 [00:16<00:00, 65.80it/s]


Evaluation (dev):



100%|██████████| 7357/7357 [01:21<00:00, 89.91it/s] 


 {'precision': 0.05252140818268316, 'recall': 0.11592458742058198, 'ndcg': 0.10216093113759561}



100%|██████████| 7357/7357 [01:18<00:00, 93.63it/s] 
100%|██████████| 7357/7357 [01:20<00:00, 91.58it/s] 


train dataset preparing...


100%|██████████| 7358/7358 [00:11<00:00, 647.25it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4578.31it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:01<00:00, 4459.73it/s]
100%|██████████| 7357/7357 [04:49<00:00, 25.46it/s]

{'precision': [0.06676634497757239, 0.06725567486747315, 0.06739159983689004, 0.06713334239499796, 0.06705178741334783], 'recall': [0.16330173977286047, 0.1645299677770667, 0.16432525138329832, 0.16452850132144814, 0.1645606933393488], 'ndcg': [0.12485593594413333, 0.1264572343117576, 0.1275585838242084, 0.12585491718264302, 0.12593791089122025]}





In [None]:
# Mean of every recorded test metric over the evaluated seeds; left as the
# last expression so the resulting dict is displayed.
{
    metric_name: np.array(per_seed_values).mean()
    for metric_name, per_seed_values in test_metrics.items()
}

{'precision': 0.06711974989805627,
 'recall': 0.1642492307188045,
 'ndcg': 0.1261329164307925}

# TaoBao

Fix seed:

In [None]:
# Seed PyTorch, Python's `random` and NumPy (in that order) with one value.
seed = 10
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(seed)

Read the interaction data and filter it: remove users with fewer than 10 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Location and name of the raw TaoBao dataset.
corpus_path = "./data/"
dataset_name = "taobao"

# Thresholds passed positionally to `load_data`. Going by the notebook text
# they are presumably (min transactions per user, min transactions per item)
# -- TODO confirm against `Preprocess.load_data`.
min_user_transactions, min_item_transactions = 10, 10

preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(min_user_transactions, min_item_transactions, filt=True)
# Write the processed split out under `corpus_path`.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 672404, #items = 638962, #clicks = 2015807 (#illegal records = 0)
After preprocessing: #users = 10092, #items = 22286, #clicks = 67991
Saving dataset in ./data//data_taobao/...


In [None]:
# Create the Corpus for this dataset path/name and call its `load_data()`.
# Later cells read `corpus.n_items`, `corpus.n_users` and
# `corpus.total_avg_interval` from this object.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train Repurchase Module with best hyperparams (see ./testing_repurchasemodule.ipynb):

In [None]:
# Trainer used to fit the Repurchase Module. The settings are collected first
# and then unpacked into the constructor.
repurchase_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3,
)
trainer = NBRTrainer(**repurchase_trainer_kwargs)

In [None]:
# Best Repurchase Module hyperparameters (see ./testing_repurchasemodule.ipynb).
slrc_best_params = {
    'batch_size': 256,
    'lr': 0.00022155020864083442,
    'l2_reg_coef': 0.012687180197268989,
}

repurchase_model = RepurchaseModule(
    item_num=corpus.n_items,
    avg_repeat_interval=corpus.total_avg_interval,
)

# The tuned dict holds exactly the three training hyperparameters
# (batch_size, lr, l2_reg_coef), so it is unpacked next to the model.
params = {"model": repurchase_model, **slrc_best_params}

trainer.init_hyperparams(**params)

In [None]:
# Fit the model installed via `init_hyperparams` in the previous cell.
# Kept as the cell's last expression so any returned object is displayed.
trainer.train()

Save the users' embeddings for the validation step (a user's embedding for the validation step is their vector of recommendation scores for all items at the time of validation):

In [None]:
# Predictions of the trained Repurchase Module in "dev" mode; the tuning cell
# below passes them to NBRKNN as `user_emb`.
dev_user_emb = trainer.get_predictions(mode="dev")

Tune Neighborhood Module hyperparams on validation dataset:

In [None]:
# Fresh trainer for the Neighborhood Module. `max_epochs` and
# `early_stop_num` are None; the tuning cell only calls `evaluate` on it.
knn_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None,
)
trainer = NBRTrainer(**knn_trainer_kwargs)

train dataset preparing...


100%|██████████| 10092/10092 [00:49<00:00, 205.66it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 29295.58it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 18293.17it/s]


In [None]:
def objective(trial):
    """Optuna objective: dev-set NDCG of an NBRKNN built from trial suggestions.

    Samples `nearest_neighbors_num` (integer in [1, 200]) and `alpha`
    (float in [0.0, 1.0], step 0.05) from `trial`, builds an `NBRKNN` on the
    notebook-level `corpus` and `dev_user_emb`, installs it on the
    notebook-level `trainer`, evaluates in "dev" mode, calls
    `trainer.model.reset()`, and returns the "ndcg" entry of the metrics.
    """
    neighbors_num = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    alpha_value = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn_model = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors_num,
        alpha=alpha_value,
        user_emb=dev_user_emb,
    )

    trainer.init_hyperparams(model=knn_model)
    dev_metrics = trainer.evaluate(mode="dev")
    # Reset the evaluated model before handing the score back to Optuna.
    trainer.model.reset()
    return dev_metrics["ndcg"]

In [None]:
# TPE sampler seeded with the notebook-level `seed`.
sampler = optuna.samplers.TPESampler(seed=seed)
# Maximize the value returned by `objective` (dev NDCG) over 25 trials.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=25)

[32m[I 2023-05-06 18:57:16,656][0m A new study created in memory with name: no-name-c8c3fe05-679e-4cfb-b233-e5caa62b533c[0m
100%|██████████| 9307/9307 [28:33<00:00,  5.43it/s]
[32m[I 2023-05-06 19:26:11,499][0m Trial 0 finished with value: 0.02344135825603466 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.02344135825603466.[0m
100%|██████████| 9307/9307 [29:41<00:00,  5.22it/s]
[32m[I 2023-05-06 19:56:13,777][0m Trial 1 finished with value: 0.07265987971860889 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.07265987971860889.[0m
100%|██████████| 9307/9307 [29:06<00:00,  5.33it/s]
[32m[I 2023-05-06 20:25:41,039][0m Trial 2 finished with value: 0.07020255416138836 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.07265987971860889.[0m
100%|██████████| 9307/9307 [28:02<00:00,  5.53it/s]
[32m[I 2023-05-06 20:54:04,652][0m Trial 3 finished with va

Test TAIWI (calculate scores for different seeds):

In [None]:
# One independent, initially empty list per reported metric; the test loop
# appends one value per seed.
test_metrics = {metric_name: [] for metric_name in ("precision", "recall", "ndcg")}

In [None]:
# Final evaluation over several seeds. For each seed: retrain the Repurchase
# Module, compute its dev/test predictions, then evaluate an NBRKNN (using the
# best hyperparameters found by `study`) on the test set and record the metrics.
#
# The loop variable is called `run_seed` on purpose: naming it `seed` would
# overwrite the notebook-level `seed` that seeds the Optuna sampler, so
# re-running the tuning cell afterwards would silently use a different seed.
for run_seed in range(3):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Fresh trainer and Repurchase Module for this seed.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": RepurchaseModule(
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # Predictions of the trained Repurchase Module for both evaluation splits.
    dev_user_emb = trainer.get_predictions(mode="dev")
    test_user_emb = trainer.get_predictions(mode="test")

    # Second trainer, used only to evaluate the Neighborhood Module.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=dev_user_emb
        )
    }
    # NOTE(review): the model is constructed with the dev embeddings and then
    # `set_emb` is called with the test embeddings -- confirm the intended
    # role of each against NBRKNN.
    params["model"].set_emb(test_user_emb)

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    # Reset the evaluated model before the next seed.
    trainer.model.reset()

    # Record this seed's score for every tracked metric.
    for metric_name, per_seed_values in test_metrics.items():
        per_seed_values.append(metrics[metric_name])
    print(test_metrics)

In [None]:
# Mean of every recorded test metric over the evaluated seeds; left as the
# last expression so the resulting dict is displayed.
{
    metric_name: np.array(per_seed_values).mean()
    for metric_name, per_seed_values in test_metrics.items()
}

{'precision': 0.0123025679596003,
 'recall': 0.11903226961785035,
 'ndcg': 0.0809702514792717}

# Dunnhumby

Fix seed:

In [None]:
# Seed PyTorch, Python's `random` and NumPy (in that order) with one value.
seed = 10
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(seed)

Read the interaction data and filter it: remove users with fewer than 5 transactions, users with a high purchase frequency, one-day users, and items with fewer than 10 transactions. The training set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Location and name of the raw Dunnhumby dataset.
corpus_path = "./data/"
dataset_name = "dunnhumby"

# Thresholds passed positionally to `load_data`. Going by the notebook text
# they are presumably (min transactions per user, min transactions per item)
# -- TODO confirm against `Preprocess.load_data`.
min_user_transactions, min_item_transactions = 5, 10

preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(min_user_transactions, min_item_transactions, filt=True)
# Write the processed split out under `corpus_path`.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 2500, #items = 92339, #clicks = 2595370 (#illegal records = 0)
After preprocessing: #users = 2358, #items = 26756, #clicks = 1976796
Saving dataset in ./data//data_dunnhumby/...


In [None]:
# Create the Corpus for this dataset path/name and call its `load_data()`.
# Later cells read `corpus.n_items`, `corpus.n_users` and
# `corpus.total_avg_interval` from this object.
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Train Repurchase Module with best hyperparams (see ./testing_repurchasemodule.ipynb):

In [None]:
# Trainer used to fit the Repurchase Module. The settings are collected first
# and then unpacked into the constructor; constructing the trainer also
# prepares the train/dev/test datasets (see the cell output).
repurchase_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=20,
    topk=10,
    early_stop_num=3,
)
trainer = NBRTrainer(**repurchase_trainer_kwargs)

train dataset preparing...


100%|██████████| 2358/2358 [00:13<00:00, 174.26it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:13<00:00, 170.87it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:13<00:00, 169.82it/s]


In [None]:
# Best Repurchase Module hyperparameters (see ./testing_repurchasemodule.ipynb).
slrc_best_params = {
    'batch_size': 128,
    'lr': 2.0869566476632644e-05,
    'l2_reg_coef': 0.0008476179290251597,
}

repurchase_model = RepurchaseModule(
    item_num=corpus.n_items,
    avg_repeat_interval=corpus.total_avg_interval,
)

# The tuned dict holds exactly the three training hyperparameters
# (batch_size, lr, l2_reg_coef), so it is unpacked next to the model.
params = {"model": repurchase_model, **slrc_best_params}

trainer.init_hyperparams(**params)

In [None]:
# Fit the model installed via `init_hyperparams` in the previous cell.
# Kept as the cell's last expression so the returned object is displayed.
trainer.train()

Epoch 1:


Batch loss = 0.577438: 100%|██████████| 15060/15060 [04:32<00:00, 55.20it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:16<00:00, 11.97it/s]


 {'precision': 0.11565549427238016, 'recall': 0.17684104601420847, 'ndcg': 0.1648580594543285}
Epoch 2:



Batch loss = 0.576048: 100%|██████████| 15060/15060 [04:34<00:00, 54.85it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:15<00:00, 12.03it/s]


 {'precision': 0.11574034789987275, 'recall': 0.17606794780701862, 'ndcg': 0.16434954068456503}





Epoch 3:


Batch loss = 0.574895: 100%|██████████| 15060/15060 [04:15<00:00, 59.03it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:14<00:00, 12.10it/s]


 {'precision': 0.11629189647857446, 'recall': 0.17655253422588868, 'ndcg': 0.164681952386545}
Epoch 4:



Batch loss = 0.573924: 100%|██████████| 15060/15060 [04:11<00:00, 59.83it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:14<00:00, 12.11it/s]


 {'precision': 0.11624946966482817, 'recall': 0.17662851344398434, 'ndcg': 0.1649731924751921}
Epoch 5:



Batch loss = 0.573097: 100%|██████████| 15060/15060 [04:13<00:00, 59.45it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:15<00:00, 12.06it/s]


 {'precision': 0.1161646160373356, 'recall': 0.17646262212661903, 'ndcg': 0.16486376186879517}
Epoch 6:



Batch loss = 0.572386: 100%|██████████| 15060/15060 [04:12<00:00, 59.74it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [03:14<00:00, 12.09it/s]


 {'precision': 0.11599490878235044, 'recall': 0.1754830049059438, 'ndcg': 0.1643936293198781}
Epoch 7:



Batch loss = 0.571765: 100%|██████████| 15060/15060 [04:12<00:00, 59.57it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [03:15<00:00, 12.05it/s]


 {'precision': 0.11582520152736529, 'recall': 0.17394463845647845, 'ndcg': 0.16410026592749155}





SLRCSimple()

Save the users' embeddings for the validation step (a user's embedding for the validation step is their vector of recommendation scores for all items at the time of validation):

In [None]:
# Predictions of the trained Repurchase Module in "dev" mode; the tuning cell
# below passes them to NBRKNN as `user_emb`.
dev_user_emb = trainer.get_predictions(mode="dev")

100%|██████████| 2357/2357 [03:29<00:00, 11.27it/s]


Tune Neighborhood Module hyperparams on validation dataset:

In [None]:
# Fresh trainer for the Neighborhood Module. `max_epochs` and
# `early_stop_num` are None; the tuning cell only calls `evaluate` on it.
knn_trainer_kwargs = dict(
    corpus=corpus,
    max_epochs=None,
    topk=10,
    early_stop_num=None,
)
trainer = NBRTrainer(**knn_trainer_kwargs)

train dataset preparing...


100%|██████████| 2358/2358 [00:16<00:00, 145.73it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 189.84it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 184.62it/s]


In [None]:
def objective(trial):
    """Optuna objective: dev-set NDCG of an NBRKNN model built from the trial's hyperparameters.

    Args:
        trial: Optuna trial; asked for `nearest_neighbors_num` (int in [1, 200])
            and `alpha` (float in [0.0, 1.0] with step 0.05).

    Returns:
        The "ndcg" entry of `trainer.evaluate(mode="dev")`.
    """
    # Keep the sampling order fixed: neighbours first, then alpha.
    neighbors_num = trial.suggest_int("nearest_neighbors_num", low=1, high=200)
    alpha = trial.suggest_float("alpha", 0.0, 1.0, step=0.05)

    knn_model = NBRKNN(
        item_num=corpus.n_items,
        user_num=corpus.n_users,
        nearest_neighbors_num=neighbors_num,
        alpha=alpha,
        user_emb=dev_user_emb,
    )

    # The model is the only hyperparameter handed to the trainer.
    trainer.init_hyperparams(model=knn_model)
    return trainer.evaluate(mode="dev")["ndcg"]

In [None]:
# Seed the TPE sampler with the notebook-level `seed` so the search can be repeated.
sampler = optuna.samplers.TPESampler(seed=seed)
# `objective` returns the dev-set NDCG, so the study maximizes it.
study = optuna.create_study(direction="maximize", sampler=sampler)
# Evaluate 25 hyperparameter configurations.
study.optimize(objective, n_trials=25)

[32m[I 2023-05-02 20:44:53,469][0m A new study created in memory with name: no-name-7325d121-200b-4c25-880c-0966116ae882[0m
100%|██████████| 2357/2357 [03:00<00:00, 13.09it/s]
[32m[I 2023-05-02 20:47:59,098][0m Trial 0 finished with value: 0.12311387916948324 and parameters: {'nearest_neighbors_num': 155, 'alpha': 0.0}. Best is trial 0 with value: 0.12311387916948324.[0m
100%|██████████| 2357/2357 [02:57<00:00, 13.26it/s]
[32m[I 2023-05-02 20:51:02,569][0m Trial 1 finished with value: 0.1718346591905977 and parameters: {'nearest_neighbors_num': 127, 'alpha': 0.75}. Best is trial 1 with value: 0.1718346591905977.[0m
100%|██████████| 2357/2357 [02:58<00:00, 13.20it/s]
[32m[I 2023-05-02 20:54:07,413][0m Trial 2 finished with value: 0.1527396060836034 and parameters: {'nearest_neighbors_num': 100, 'alpha': 0.2}. Best is trial 1 with value: 0.1718346591905977.[0m
100%|██████████| 2357/2357 [02:37<00:00, 14.96it/s]
[32m[I 2023-05-02 20:56:50,192][0m Trial 3 finished with value:

Test TAIWI (compute the test-set metrics for several different random seeds):

In [None]:
# Per-seed test scores, one independent list per metric; filled by the evaluation loop below.
test_metrics = {metric_name: [] for metric_name in ("precision", "recall", "ndcg")}

In [None]:
# Reset the accumulators so that re-running this cell does not append duplicate runs
# to the lists left over from a previous execution.
for scores in test_metrics.values():
    scores.clear()

# The loop variable is `run_seed`, not `seed`, so the notebook-level `seed` that feeds
# the Optuna sampler is not overwritten as a side effect of this cell.
for run_seed in range(5):
    print(f"\n___SEED___{run_seed}")
    torch.manual_seed(run_seed)
    random.seed(run_seed)
    np.random.seed(run_seed)

    # Stage 1: train the repurchase module from scratch with the tuned hyperparameters.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=20,
        topk=10,
        early_stop_num=3
    )

    params = {
        "model": RepurchaseModule(
            item_num=corpus.n_items,
            avg_repeat_interval=corpus.total_avg_interval
        ),
        "batch_size": slrc_best_params["batch_size"],
        "lr": slrc_best_params["lr"],
        "l2_reg_coef": slrc_best_params["l2_reg_coef"]
    }

    trainer.init_hyperparams(**params)
    trainer.train()

    # Predictions of the freshly trained model serve as user embeddings for the KNN stage.
    dev_user_emb = trainer.get_predictions(mode="dev")
    test_user_emb = trainer.get_predictions(mode="test")

    # Stage 2: neighborhood model with the best hyperparameters found by the Optuna study.
    trainer = NBRTrainer(
        corpus=corpus,
        max_epochs=None,
        topk=10,
        early_stop_num=None
    )

    params = {
        "model": NBRKNN(
            item_num=corpus.n_items,
            user_num=corpus.n_users,
            nearest_neighbors_num=study.best_params["nearest_neighbors_num"],
            alpha=study.best_params["alpha"],
            user_emb=dev_user_emb
        )
    }
    # NOTE(review): the model is constructed with the dev embeddings and then given the
    # test embeddings via set_emb() — presumably this supplies the test-time vectors;
    # confirm against NBRKNN.
    params["model"].set_emb(test_user_emb)

    trainer.init_hyperparams(**params)

    metrics = trainer.evaluate(mode="test")

    # Record this run's scores and show the running totals.
    for metric_name in ("precision", "recall", "ndcg"):
        test_metrics[metric_name].append(metrics[metric_name])
    print(test_metrics)


___SEED___0
train dataset preparing...


100%|██████████| 2358/2358 [00:08<00:00, 279.27it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 201.10it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:12<00:00, 194.80it/s]

Epoch 1:



Batch loss = 0.577507: 100%|██████████| 15060/15060 [02:47<00:00, 89.90it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:12<00:00, 32.49it/s]


 {'precision': 0.11535850657615612, 'recall': 0.17654723061333896, 'ndcg': 0.16435737794573518}
Epoch 2:



Batch loss = 0.576102: 100%|██████████| 15060/15060 [02:45<00:00, 90.85it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 31.92it/s]


 {'precision': 0.11591005515485789, 'recall': 0.1762658233669282, 'ndcg': 0.16437461229120745}
Epoch 3:



Batch loss = 0.574934: 100%|██████████| 15060/15060 [02:47<00:00, 89.77it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:14<00:00, 31.82it/s]


 {'precision': 0.11629189647857445, 'recall': 0.1767296561883852, 'ndcg': 0.16486385888226146}
Epoch 4:



Batch loss = 0.57395: 100%|██████████| 15060/15060 [02:46<00:00, 90.54it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.20it/s]


 {'precision': 0.11633432329232074, 'recall': 0.17644111433967338, 'ndcg': 0.1648813214212546}
Epoch 5:



Batch loss = 0.573112: 100%|██████████| 15060/15060 [02:46<00:00, 90.34it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.55it/s]


 {'precision': 0.11612218922358933, 'recall': 0.1760895660491336, 'ndcg': 0.16441869769084638}
Epoch 6:



Batch loss = 0.572391: 100%|██████████| 15060/15060 [02:47<00:00, 90.01it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.60it/s]


 {'precision': 0.1158676283411116, 'recall': 0.1751983259060505, 'ndcg': 0.16430355773481312}
Epoch 7:



Batch loss = 0.571763: 100%|██████████| 15060/15060 [02:47<00:00, 90.05it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.67it/s]


 {'precision': 0.11582520152736529, 'recall': 0.17432394715513913, 'ndcg': 0.16400502673968226}



100%|██████████| 2357/2357 [01:12<00:00, 32.71it/s]
100%|██████████| 2357/2357 [01:12<00:00, 32.54it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:11<00:00, 201.09it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 204.73it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 207.01it/s]
100%|██████████| 2357/2357 [01:58<00:00, 19.82it/s]


{'precision': [0.12095884599066611], 'recall': [0.17612002745219862], 'ndcg': [0.17098355046152433]}

___SEED___1
train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 227.92it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 213.78it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 197.95it/s]

Epoch 1:



Batch loss = 0.577462: 100%|██████████| 15060/15060 [02:49<00:00, 88.85it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.35it/s]


 {'precision': 0.11506151887993214, 'recall': 0.1765820492392197, 'ndcg': 0.16449431960363373}
Epoch 2:



Batch loss = 0.576074: 100%|██████████| 15060/15060 [02:48<00:00, 89.54it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.58it/s]


 {'precision': 0.11540093338990241, 'recall': 0.17548989428823922, 'ndcg': 0.16405846192863924}
Epoch 3:



Batch loss = 0.574922: 100%|██████████| 15060/15060 [02:48<00:00, 89.13it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:13<00:00, 32.18it/s]


 {'precision': 0.11591005515485786, 'recall': 0.17585670966119427, 'ndcg': 0.1642078415204301}
Epoch 4:



Batch loss = 0.573952: 100%|██████████| 15060/15060 [02:48<00:00, 89.47it/s]



Evaluation (dev):


100%|██████████| 2357/2357 [01:13<00:00, 32.28it/s]


 {'precision': 0.11595248196860417, 'recall': 0.1756660622423153, 'ndcg': 0.16442445542687847}



100%|██████████| 2357/2357 [01:12<00:00, 32.43it/s]
100%|██████████| 2357/2357 [01:12<00:00, 32.37it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:11<00:00, 203.61it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 206.72it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 215.58it/s]
100%|██████████| 2357/2357 [01:54<00:00, 20.51it/s]


{'precision': [0.12095884599066611, 0.12134068731438269], 'recall': [0.17612002745219862, 0.17772127477198774], 'ndcg': [0.17098355046152433, 0.17145930304056856]}

___SEED___2
train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 214.85it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 210.98it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 207.74it/s]

Epoch 1:



Batch loss = 0.577384: 100%|██████████| 15060/15060 [02:48<00:00, 89.15it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.18it/s]


 {'precision': 0.11523122613491728, 'recall': 0.17673341506202614, 'ndcg': 0.16472421538662033}
Epoch 2:



Batch loss = 0.575992: 100%|██████████| 15060/15060 [02:49<00:00, 88.95it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.56it/s]


 {'precision': 0.11565549427238016, 'recall': 0.17654839324982205, 'ndcg': 0.16454621682819276}
Epoch 3:



Batch loss = 0.574835: 100%|██████████| 15060/15060 [02:50<00:00, 88.48it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.60it/s]


 {'precision': 0.11582520152736529, 'recall': 0.17622632417097026, 'ndcg': 0.16452099030358078}
Epoch 4:



Batch loss = 0.573862: 100%|██████████| 15060/15060 [02:50<00:00, 88.22it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.32it/s]


 {'precision': 0.11591005515485789, 'recall': 0.17607234421162468, 'ndcg': 0.16460485467650643}



100%|██████████| 2357/2357 [01:12<00:00, 32.36it/s]
100%|██████████| 2357/2357 [01:14<00:00, 31.47it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:09<00:00, 240.05it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 207.90it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 206.43it/s]
100%|██████████| 2357/2357 [01:59<00:00, 19.79it/s]


{'precision': [0.12095884599066611, 0.12134068731438269, 0.12112855324565125], 'recall': [0.17612002745219862, 0.17772127477198774, 0.17742432221031992], 'ndcg': [0.17098355046152433, 0.17145930304056856, 0.17137393142884408]}

___SEED___3
train dataset preparing...


100%|██████████| 2358/2358 [00:11<00:00, 213.11it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 203.28it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 214.55it/s]

Epoch 1:



Batch loss = 0.577585: 100%|██████████| 15060/15060 [02:54<00:00, 86.48it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:14<00:00, 31.83it/s]


 {'precision': 0.11557064064488756, 'recall': 0.1767706526724302, 'ndcg': 0.16486263754332411}
Epoch 2:



Batch loss = 0.576177: 100%|██████████| 15060/15060 [02:51<00:00, 87.88it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 31.94it/s]


 {'precision': 0.11565549427238016, 'recall': 0.17640132038679918, 'ndcg': 0.16459255363632255}
Epoch 3:



Batch loss = 0.575007: 100%|██████████| 15060/15060 [02:51<00:00, 87.61it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.21it/s]


 {'precision': 0.11603733559609673, 'recall': 0.17665838357019475, 'ndcg': 0.16480815545779082}
Epoch 4:



Batch loss = 0.57402: 100%|██████████| 15060/15060 [02:52<00:00, 87.41it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 32.05it/s]


 {'precision': 0.11624946966482817, 'recall': 0.17608793864796965, 'ndcg': 0.1646243433827141}



100%|██████████| 2357/2357 [01:13<00:00, 32.28it/s]
100%|██████████| 2357/2357 [01:12<00:00, 32.41it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:11<00:00, 213.97it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 213.83it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 225.32it/s]
100%|██████████| 2357/2357 [01:56<00:00, 20.18it/s]


{'precision': [0.12095884599066611, 0.12134068731438269, 0.12112855324565125, 0.12108612643190496], 'recall': [0.17612002745219862, 0.17772127477198774, 0.17742432221031992, 0.17745281735556748], 'ndcg': [0.17098355046152433, 0.17145930304056856, 0.17137393142884408, 0.17143860369949687]}

___SEED___4
train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 214.69it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 206.80it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 201.93it/s]

Epoch 1:



Batch loss = 0.577492: 100%|██████████| 15060/15060 [02:51<00:00, 87.74it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:14<00:00, 31.78it/s]


 {'precision': 0.11527365294866357, 'recall': 0.17695473105777185, 'ndcg': 0.16486642007309535}
Epoch 2:



Batch loss = 0.576083: 100%|██████████| 15060/15060 [02:52<00:00, 87.53it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:13<00:00, 31.88it/s]


 {'precision': 0.11548578701739501, 'recall': 0.1765150323151148, 'ndcg': 0.1648176459997348}
Epoch 3:



Batch loss = 0.57491: 100%|██████████| 15060/15060 [02:51<00:00, 87.66it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.33it/s]


 {'precision': 0.11578277471361902, 'recall': 0.1759971937348859, 'ndcg': 0.16475452493203535}
Epoch 4:



Batch loss = 0.573921: 100%|██████████| 15060/15060 [02:52<00:00, 87.32it/s]


Evaluation (dev):



100%|██████████| 2357/2357 [01:12<00:00, 32.60it/s]


 {'precision': 0.11595248196860414, 'recall': 0.17583848400859664, 'ndcg': 0.1647625265138274}



100%|██████████| 2357/2357 [01:13<00:00, 32.22it/s]
100%|██████████| 2357/2357 [01:13<00:00, 32.18it/s]


train dataset preparing...


100%|██████████| 2358/2358 [00:10<00:00, 230.37it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:10<00:00, 221.07it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:11<00:00, 208.46it/s]
100%|██████████| 2357/2357 [01:57<00:00, 20.04it/s]

{'precision': [0.12095884599066611, 0.12134068731438269, 0.12112855324565125, 0.12108612643190496, 0.12104369961815868], 'recall': [0.17612002745219862, 0.17772127477198774, 0.17742432221031992, 0.17745281735556748, 0.17755391459587222], 'ndcg': [0.17098355046152433, 0.17145930304056856, 0.17137393142884408, 0.17143860369949687, 0.17146823846395248]}





In [None]:
# Mean of each test metric over all seeded runs collected in `test_metrics`.
{
    metric_name: np.array(test_metrics[metric_name]).mean()
    for metric_name in ("precision", "recall", "ndcg")
}

{'precision': 0.12111158252015272,
 'recall': 0.1772544712771892,
 'ndcg': 0.17134472541887727}