# 3. Create dataset and train model with Recbole

For anyone need instruction document, please check this link: https://recbole.io/docs/user_guide/usage/use_modules.html

In [25]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import cl4srec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
import numpy as np
import pandas as pd

for limiting memory and time traning, we will filter for only using user who bought more than 40 items and item which is sold more than 40 times. If you want to train with more data, please change below config
* user_inter_num_interval
* item_inter_num_interval

In [33]:
parameter_dict = {
    'field_separator': "\t",
    'seq_separator': " ",
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': '[5,inf)',
    'item_inter_num_interval': "[5,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp'],
                 'item': ['item', 'genre', 'title', 'year', 'writer', 'director']},
    'selected_features': ['genre', 'title', 'year', 'writer', 'director'],
    'NEG_PREFIX': 'neg_',
    'epochs': 50,
    'training_neg_sample_num': 0,
    'eval_args': {
        'split': {'RS': [10, 0, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
config = Config(model='CL4SRec', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)

04 Apr 06:20    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/recbox_data
show_progress = True

Training Hyper Parameters:
checkpoint_dir = saved
epochs = 50
train_batch_size = 2048
learner = adam
learning_rate = 0.001
training_neg_sample_num = 0
training_neg_sample_distribution = uniform
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
draw_loss_pic = False
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_setting = TO_LS,full
group_by_user = True
split_ratio = [0.8, 0.1, 0.1]
leave_one_num = 2
real_time_process = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_len = None
LABEL_FIELD = label
threshold = None
NEG_PREFIX 

In [34]:
dataset = create_dataset(config)
logger.info(dataset)

ValueError: iid_field must be loaded if item_feat is loaded.

In [26]:
# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

04 Apr 03:47    INFO  Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
Build [SequentialDataLoader] for [train] with format [InputType.POINTWISE]
04 Apr 03:47    INFO  [train] No Negative Sampling
[train] No Negative Sampling
[train] No Negative Sampling
[train] No Negative Sampling
[train] No Negative Sampling
[train] No Negative Sampling
[train] No Negative Sampling
04 Apr 03:47    INFO  [train] batch_size = [256], shuffle = [True]

[train] batch_size = [256], shuffle = [True]

[train] batch_size = [256], shuffle = [True]

[train] batch_size = [256], shuffle = [True]

[train] batch_size = [

In [21]:
# model loading and initialization
model = cl4srec.CL4SRec(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data)

04 Apr 03:42    INFO  CL4SRec(
  (item_embedding): Embedding(6809, 64, padding_idx=0)
  (position_embedding): Embedding(50, 64)
  (trm_encoder): TransformerEncoder(
    (layer): ModuleList(
      (0): TransformerLayer(
        (multi_head_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=True)
          (key): Linear(in_features=64, out_features=64, bias=True)
          (value): Linear(in_features=64, out_features=64, bias=True)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (dense): Linear(in_features=64, out_features=64, bias=True)
          (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (out_dropout): Dropout(p=0.5, inplace=False)
        )
        (feed_forward): FeedForward(
          (dense_1): Linear(in_features=64, out_features=256, bias=True)
          (dense_2): Linear(in_features=256, out_features=64, bias=True)
          (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=T

TypeError: expected str, bytes or os.PathLike object, not NoneType

# 4. Create recommendation result from trained model

I note document here for any one want to customize it: https://recbole.io/docs/user_guide/usage/case_study.html

In [9]:
from recbole.utils.case_study import full_sort_topk
external_user_ids = dataset.id2token(
    dataset.uid_field, list(range(dataset.user_num)))[1:]#fist element in array is 'PAD'(default of Recbole) ->remove it 

In [10]:
import torch
from recbole.data.interaction import Interaction

def add_last_item(old_interaction, last_item_id, max_len=50):
    new_seq_items = old_interaction['item_id_list'][-1]
    if old_interaction['item_length'][-1].item() < max_len:
        new_seq_items[old_interaction['item_length'][-1].item()] = last_item_id
    else:
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model):
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]
        test = {
            'item_id_list': add_last_item(input_interaction, 
                                          input_interaction['item_id'][-1].item(), model.max_seq_length),
            'item_length': torch.tensor(
                [input_interaction['item_length'][-1].item() + 1
                 if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, 12)

In [11]:
predict_for_all_item('0109ad0b5a76924a1b58be677409bb601cc8bead9a87b8ce5b08a4a1f5bc71ef', 
                     dataset, model)

NameError: name 'token' is not defined

In [12]:
topk_items = []
for external_user_id in external_user_ids:
    _, topk_iid_list = predict_for_all_item(external_user_id, dataset, model)
    last_topk_iid_list = topk_iid_list[-1]
    external_item_list = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()
    topk_items.append(external_item_list)
print(len(topk_items))

NameError: name 'np' is not defined

In [13]:
external_item_str = [' '.join(x) for x in topk_items]
result = pd.DataFrame(external_user_ids, columns=['customer_id'])
result['prediction'] = external_item_str
result.head()

NameError: name 'pd' is not defined

In [28]:
del external_item_str
del topk_items
del external_user_ids
del train_data
del valid_data
del test_data
del model
del Trainer
del logger
del dataset
gc.collect()

42

# 5. Combine result from most bought items and GRU model

In [29]:
submit_df = pd.read_csv('submission.csv')
submit_df.shape

(1371980, 2)

In [30]:
submit_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0656719005 0745232001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0924243001 0739590027 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0573085028 0924243001 0751471001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0924243001 0896152002 08...


In [31]:
submit_df = pd.merge(submit_df, result, on='customer_id', how='outer')
submit_df.head()

Unnamed: 0,customer_id,prediction_x,prediction_y
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0656719005 0745232001 09...,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0924243001 0739590027 07...,
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 09...,
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0573085028 0924243001 0751471001 07...,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0924243001 0896152002 08...,


In [32]:
submit_df = submit_df.fillna(-1)
submit_df['prediction'] = submit_df.apply(
    lambda x: x['prediction_y'] if x['prediction_y'] != -1 else x['prediction_x'], axis=1)
submit_df.head()

Unnamed: 0,customer_id,prediction_x,prediction_y,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0656719005 0745232001 09...,-1,0568601043 0568601006 0656719005 0745232001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0924243001 0739590027 07...,-1,0826211002 0800436010 0924243001 0739590027 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 09...,-1,0794321007 0852643001 0852643003 0858883002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0573085028 0924243001 0751471001 07...,-1,0448509014 0573085028 0924243001 0751471001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0924243001 0896152002 08...,-1,0730683050 0791587015 0924243001 0896152002 08...


In [33]:
submit_df = submit_df.drop(columns=['prediction_y', 'prediction_x'])
submit_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0656719005 0745232001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0800436010 0924243001 0739590027 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0573085028 0924243001 0751471001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0924243001 0896152002 08...


In [34]:
submit_df.to_csv('submission.csv', index=False)