In [None]:
import os
import sys
# Extend the module search path with the parent directory *before* the `nbr`
# imports below (presumably the `nbr` package lives one level above this
# notebook - verify for your checkout).
sys.path.append("..")
from nbr.preparation import Preprocess, save_split, Corpus
from nbr.trainer import NBRTrainer
from nbr.model import TopPopular, TopPersonal

# TaFeng

Read the interaction data and filter out users with fewer than 5 transactions, users with high purchase frequency, one-day users, and items with fewer than 10 transactions. The train set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Dataset selection for this section: TaFeng, located under the local data directory.
dataset_name = "ta_feng"
corpus_path = "./data/"

# Filter the raw interactions. Per the description above, 5 and 10 are
# presumably the minimum transaction counts for users and items respectively
# (positional arguments - confirm against Preprocess.load_data).
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)

# Write the resulting split to disk for the later cells.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 32266, #items = 23812, #clicks = 817741 (#illegal records = 0)
After preprocessing: #users = 7358, #items = 11202, #clicks = 368951
Saving dataset in ./data//data_ta_feng/...


In [None]:
# Build the corpus object for the currently selected dataset and load its data
# (presumably the split saved by the preprocessing cell - confirm in Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Top Popular baseline:

In [None]:
# Trainer for the Top Popular baseline on TaFeng. Construction prepares the
# train/dev/test datasets (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 7358/7358 [00:15<00:00, 461.02it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2805.39it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:03<00:00, 2042.75it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPopular(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 7357/7357 [01:26<00:00, 85.49it/s] 


{'precision': 0.030841375560690498,
 'recall': 0.08628107674119984,
 'ndcg': 0.09394825116150071}

Top Personal baseline:

In [None]:
# Fresh trainer for the Top Personal baseline on TaFeng. Construction prepares
# the train/dev/test datasets again (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 7358/7358 [00:15<00:00, 472.38it/s]


dev dataset preparing...


100%|██████████| 7357/7357 [00:02<00:00, 2456.82it/s]


test dataset preparing...


100%|██████████| 7357/7357 [00:03<00:00, 2360.94it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPersonal(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 7357/7357 [01:21<00:00, 89.91it/s] 


{'precision': 0.057224412124507275,
 'recall': 0.13077223704869326,
 'ndcg': 0.10843777780601296}

# TaoBao

Read the interaction data and filter out users with fewer than 10 transactions, users with high purchase frequency, one-day users, and items with fewer than 10 transactions. The train set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Dataset selection for this section: TaoBao, located under the local data directory.
dataset_name = "taobao"
corpus_path = "./data/"

# Filter the raw interactions. Per the description above, both thresholds are
# 10 here - presumably minimum transaction counts for users and items
# (positional arguments - confirm against Preprocess.load_data).
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(10, 10, filt=True)

# Write the resulting split to disk for the later cells.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 672404, #items = 638962, #clicks = 2015807 (#illegal records = 0)
After preprocessing: #users = 10092, #items = 22286, #clicks = 67991
Saving dataset in ./data//data_taobao/...


In [None]:
# Build the corpus object for the currently selected dataset and load its data
# (presumably the split saved by the preprocessing cell - confirm in Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Top Popular baseline:

In [None]:
# Trainer for the Top Popular baseline on TaoBao. Construction prepares the
# train/dev/test datasets (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 10092/10092 [00:50<00:00, 201.47it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 23847.99it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 17711.38it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPopular(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 9307/9307 [03:28<00:00, 44.54it/s]


{'precision': 0.0005479746427420222,
 'recall': 0.005479746427420221,
 'ndcg': 0.0028849637834630843}

Top Personal baseline:

In [None]:
# Fresh trainer for the Top Personal baseline on TaoBao. Construction prepares
# the train/dev/test datasets again (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 10092/10092 [01:01<00:00, 163.05it/s]


dev dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 25184.16it/s]


test dataset preparing...


100%|██████████| 9307/9307 [00:00<00:00, 23041.67it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPersonal(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 9307/9307 [03:20<00:00, 46.50it/s]


{'precision': 0.011539701300096702,
 'recall': 0.11161849503957594,
 'ndcg': 0.07411685538040647}

# Dunnhumby

Read the interaction data and filter out users with fewer than 5 transactions, users with high purchase frequency, one-day users, and items with fewer than 10 transactions. The train set contains all baskets except the last two, the validation set is the second-to-last basket, and the test set is the last basket:

In [None]:
# Dataset selection for this section: Dunnhumby, located under the local data directory.
dataset_name = "dunnhumby"
corpus_path = "./data/"

# Filter the raw interactions. Per the description above, 5 and 10 are
# presumably the minimum transaction counts for users and items respectively
# (positional arguments - confirm against Preprocess.load_data).
preprocessor = Preprocess(corpus_path, dataset_name)
preprocessor.load_data(5, 10, filt=True)

# Write the resulting split to disk for the later cells.
save_split(corpus_path, dataset_name, preprocessor)

Before preprocessing: #users = 2500, #items = 92339, #clicks = 2595370 (#illegal records = 0)
After preprocessing: #users = 2358, #items = 26756, #clicks = 1976796
Saving dataset in ./data//data_dunnhumby/...


In [None]:
# Build the corpus object for the currently selected dataset and load its data
# (presumably the split saved by the preprocessing cell - confirm in Corpus.load_data).
corpus = Corpus(corpus_path, dataset_name)
corpus.load_data()

Top Popular baseline:

In [None]:
# Trainer for the Top Popular baseline on Dunnhumby. Construction prepares the
# train/dev/test datasets (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 2358/2358 [00:14<00:00, 160.25it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:15<00:00, 156.75it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:14<00:00, 166.07it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPopular(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 2357/2357 [01:10<00:00, 33.55it/s]


{'precision': 0.049724225710649134,
 'recall': 0.09043003780604869,
 'ndcg': 0.0795292139930181}

Top Personal baseline:

In [None]:
# Fresh trainer for the Top Personal baseline on Dunnhumby. Construction
# prepares the train/dev/test datasets again (see the progress output below).
# topk=10 is presumably the ranking cutoff for the reported metrics; epoch and
# early-stopping limits are left unset (None).
trainer = NBRTrainer(corpus=corpus, topk=10, max_epochs=None, early_stop_num=None)

train dataset preparing...


100%|██████████| 2358/2358 [00:14<00:00, 167.38it/s]


dev dataset preparing...


100%|██████████| 2357/2357 [00:14<00:00, 168.05it/s]


test dataset preparing...


100%|██████████| 2357/2357 [00:14<00:00, 164.19it/s]


In [None]:
# The baseline model is handed to the trainer as the "model" hyperparameter.
params = dict(
    model=TopPersonal(corpus=corpus, user_num=corpus.n_users, item_num=corpus.n_items),
)

trainer.init_hyperparams(**params)
# Last expression of the cell: the returned test metrics are displayed by the notebook.
trainer.evaluate(mode="test")

100%|██████████| 2357/2357 [01:08<00:00, 34.57it/s]


{'precision': 0.10912176495545187,
 'recall': 0.15774129438087645,
 'ndcg': 0.1490099291378852}