Reference : RecBole (https://recbole.io/) 
+ sequential-model-fixed-missing-last-item.ipynb

# 1. Create dataset and train model with Recbole

For anyone who needs the usage documentation, please see: https://recbole.io/docs/user_guide/usage/use_modules.html

In [31]:
import logging
from logging import getLogger

import numpy as np
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

In [32]:
# Overrides passed to RecBole's Config on top of the model/dataset defaults.
parameter_dict = {
    'data_path': '/opt/ml/RecBole/srcs/dataset',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # Keep only users with >= 30 interactions and items with >= 40 interactions.
    'user_inter_num_interval': "[30,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 100,
    'eval_args': {
        # Ratio split of 10:0:0 -- no validation/test share is reserved
        # (presumably because this run only produces a submission; confirm).
        'split': {'RS': [10, 0, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}

config = Config(model='GRU4Rec', dataset='ml_bc', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()

# Attach a console handler only when the root logger does not have one yet.
# Adding it unconditionally stacks one more handler on every re-run of this
# cell, which prints each log record several times.
# FileHandler subclasses StreamHandler, so it is excluded explicitly.
has_console_handler = any(
    isinstance(handler, logging.StreamHandler)
    and not isinstance(handler, logging.FileHandler)
    for handler in logger.handlers
)
if not has_console_handler:
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# write config info into log
logger.info(config)

13 Apr 09:22    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/RecBole/srcs/dataset/ml_bc
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 2048
learner = adam
learning_rate = 0.001
neg_sampling = None
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = True
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIE

In [33]:
# Build the RecBole dataset described by `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

13 Apr 09:22    INFO  ml_bc
The number of users: 31357
Average actions of users: 164.3724327082536
The number of items: 6799
Average actions of items: 758.1732862606649
The number of inters: 5154062
The sparsity of the dataset: 97.58247991265024%
Remain Fields: ['user_id', 'item_id', 'timestamp']
ml_bc
The number of users: 31357
Average actions of users: 164.3724327082536
The number of items: 6799
Average actions of items: 758.1732862606649
The number of inters: 5154062
The sparsity of the dataset: 97.58247991265024%
Remain Fields: ['user_id', 'item_id', 'timestamp']
ml_bc
The number of users: 31357
Average actions of users: 164.3724327082536
The number of items: 6799
Average actions of items: 758.1732862606649
The number of inters: 5154062
The sparsity of the dataset: 97.58247991265024%
Remain Fields: ['user_id', 'item_id', 'timestamp']
ml_bc
The number of users: 31357
Average actions of users: 164.3724327082536
The number of items: 6799
Average actions of items: 758.1732862606649
The

In [34]:
# dataset splitting: data_preparation returns the train / valid / test data
# objects built according to config['eval_args'].
train_data, valid_data, test_data = data_preparation(config, dataset)

13 Apr 09:23    INFO  [Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
13 Apr 09:23    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'or

In [35]:
# Instantiate GRU4Rec on the training dataset, then move it to the configured device.
model = GRU4Rec(config, train_data.dataset)
model = model.to(config['device'])
logger.info(model)

# Wrap the model in RecBole's generic trainer.
trainer = Trainer(config, model)

# Fit on the training data only; no validation data is passed to fit().
fit_result = trainer.fit(train_data)
best_valid_score, best_valid_result = fit_result

13 Apr 09:23    INFO  GRU4Rec(
  (item_embedding): Embedding(6799, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 517120
GRU4Rec(
  (item_embedding): Embedding(6799, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 517120
GRU4Rec(
  (item_embedding): Embedding(6799, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 517120
GRU4Rec(
  (item_embedding): Embedding(6799, 64, padding_idx=0)
  (emb_dropout): Dropou

# 2. Create recommendation result from trained model

For anyone who wants to customize this step, see the documentation: https://recbole.io/docs/user_guide/usage/case_study.html

In [164]:
# Map internal user ids back to their external tokens. Internal id 0 is
# RecBole's default 'PAD' entry, so the range starts at 1 to leave it out.
real_user_ids = list(range(1, dataset.user_num))
external_user_ids = dataset.id2token(dataset.uid_field, real_user_ids)

In [37]:
import torch
from recbole.data.interaction import Interaction

def add_last_item(old_interaction, last_item_id, max_len=50):
    """Return the last item sequence of ``old_interaction`` with one more item appended.

    Args:
        old_interaction: Mapping-like object exposing ``'item_id_list'`` (one
            sequence per row) and ``'item_length'`` (number of real items per
            row). Only the last row is used.
        last_item_id: Item id to append to the sequence.
        max_len: Maximum sequence length. Assumes each row of
            ``'item_id_list'`` has at least ``max_len`` positions -- TODO confirm.

    Returns:
        A tensor of shape ``(1, sequence_length)`` holding the updated sequence.
        The input tensors are left untouched.
    """
    seq_len = old_interaction['item_length'][-1].item()
    # Indexing a tensor row yields a view; clone it so that writing the new
    # item below never modifies the caller's `item_id_list` in place.
    new_seq_items = old_interaction['item_id_list'][-1].clone()
    if seq_len < max_len:
        # Still room left: write the item into the first unused slot.
        new_seq_items[seq_len] = last_item_id
    else:
        # Sequence is full: drop the oldest item and put the new one at the end.
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model, k=10):
    """Score every item for one user and return the ``k`` best.

    The user's last stored sequence is extended with its own target item
    (via ``add_last_item``) so that the prediction is conditioned on the
    complete interaction history.

    Args:
        external_user_id: User token as it appears in the raw data.
        dataset: RecBole dataset that contains the user's sequences.
        model: Trained sequential recommender exposing ``max_seq_length`` and
            ``full_sort_predict``.
        k: Number of top-scoring items to return (default 10).

    Returns:
        The result of ``torch.topk`` over the score matrix: ``(values, indices)``,
        where indices are internal item ids.

    Raises:
        ValueError: If ``dataset`` holds no row for the requested user.
    """
    model.eval()
    # Run on whatever device the model lives on instead of reading a
    # notebook-level `config` global.
    device = next(model.parameters()).device
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        if not index.any():
            raise ValueError(f"no interaction found for user {external_user_id!r}")
        input_interaction = dataset[index]

        last_len = input_interaction['item_length'][-1].item()
        last_target = input_interaction['item_id'][-1].item()
        test = {
            'item_id_list': add_last_item(input_interaction, last_target, model.max_seq_length),
            # One more item was appended, capped at the model's maximum length.
            'item_length': torch.tensor([min(last_len + 1, model.max_seq_length)]),
        }
        new_inter = Interaction(test).to(device)
        new_scores = model.full_sort_predict(new_inter)
        # Use the dataset argument's own item count rather than a `test_data`
        # global; presumably both report the same vocabulary size -- confirm.
        new_scores = new_scores.view(-1, dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, k)

In [48]:
# Added step.
# If submission_index.csv has not been generated yet, run for_submission.py first.
import pandas as pd

# Read the 'user' column of the submission index as an array of strings.
user_index_ = np.array(pd.read_csv('submission_index.csv')['user'], dtype=str)

In [155]:
from tqdm import tqdm

# Collect, for every known user, the external tokens of the recommended items.
topk_items = []
for uid in tqdm(external_user_ids):
    ranked = predict_for_all_item(uid, dataset, model)
    # ranked is (scores, item ids); keep the ids of the last row.
    best_item_ids = ranked[1][-1].cpu()
    topk_items.append(dataset.id2token(dataset.iid_field, best_item_ids).tolist())
print(len(topk_items))

100%|██████████| 31356/31356 [18:30<00:00, 28.25it/s]


31356


In [157]:
# One row per user, with the whole recommendation list stored in a single cell.
df = pd.DataFrame({'user': external_user_ids, 'item': topk_items})

print('- submission.csv -')
print(df)
print()

# Persist this intermediate (list-per-row) version of the recommendations.
df.to_csv('/opt/ml/RecBole/srcs/output/submission_GRU-tmp.csv', index=False)

- submission.csv -
         user                                               item
0          11  [48394, 5995, 48780, 3949, 5952, 55820, 30707,...
1          14  [4025, 1947, 838, 2724, 5299, 2150, 1962, 468,...
2          18  [64622, 65130, 63082, 60950, 63876, 71033, 723...
3          25  [7361, 72011, 6016, 72998, 69757, 6874, 5418, ...
4          31  [6550, 49649, 5621, 69526, 56775, 6294, 5504, ...
...       ...                                                ...
31351  138473  [6874, 32587, 47, 2571, 7438, 2329, 3147, 1704...
31352  138475  [6331, 6380, 1945, 3462, 930, 7013, 1248, 7056...
31353  138486  [6502, 7360, 40732, 4105, 6755, 1215, 53000, 5...
31354  138492  [6296, 3552, 4002, 1663, 4499, 3210, 2109, 303...
31355  138493  [72998, 59615, 60069, 68358, 1291, 59315, 1210...

[31356 rows x 2 columns]



In [158]:
# Find users that appear in the submission index but were not part of training.
user_index = np.array(
    pd.read_csv('/opt/ml/RecBole/srcs/submission_index.csv')['user'],
    dtype=str,
)
set_unknown = set(user_index).difference(external_user_ids)
list(set_unknown)

['53188', '68606', '105578', '128756']

In [175]:
# Merge RecVAE results for the users that were filtered out of the GRU4Rec run.
recvae = pd.read_csv("output_RecVAE.csv")
external_user_ids_ = external_user_ids

# Iterate in sorted order: a set of strings has no stable iteration order
# across kernel sessions, which would make the row order of the result
# non-reproducible.
unknown_user_ids = sorted(int(unknown) for unknown in set_unknown)

# Collect the per-user slices first and concatenate once, instead of growing
# the frame inside the loop.
fallback_frames = [recvae[recvae['user'] == uid] for uid in unknown_user_ids]
if fallback_frames:
    df_unknown = pd.concat(fallback_frames, axis=0, ignore_index=True)
else:
    # No filtered user: keep an empty frame with the expected columns.
    df_unknown = pd.DataFrame(columns=['user', 'item'])

df_unknown

Unnamed: 0,user,item
0,53188,3578
1,53188,79132
2,53188,2959
3,53188,527
4,53188,296
5,53188,3147
6,53188,1240
7,53188,480
8,53188,60069
9,53188,593


In [178]:

# Expand the per-user recommendation lists into one (user, item) row each.
topk_array = np.asarray(topk_items)
# Derive the list length from the data instead of hard-coding 10; this also
# keeps the cell re-runnable once `topk_items` has already been flattened.
items_per_user = topk_array.size // max(len(external_user_ids), 1)

user_index = np.repeat(external_user_ids, items_per_user)
topk_items = topk_array.reshape(-1)
data = {'user' : user_index, 'item' :topk_items}

df = pd.DataFrame(data)
# NOTE(review): the original called df.sort_values(by='user') here without
# using its return value, so it never had any effect. It is removed to keep
# the row order exactly as before; assign the result if sorting is wanted.

# Append the RecVAE fallback rows for the users missing from this model.
df = pd.concat([df, df_unknown], axis=0)

print(df)



- submission.csv -
      user   item
0       11  48394
1       11   5995
2       11  48780
3       11   3949
4       11   5952
..     ...    ...
35  128756   7700
36  128756     97
37  128756   6981
38  128756   8199
39  128756   8125

[313600 rows x 2 columns]



In [179]:
# Write the final (user, item) table without the index column.
df.to_csv('/opt/ml/RecBole/srcs/output/submission_GRU_.csv',index=False)

# Blank line, then a confirmation message ("csv file created").
print()
print('csv 파일 생성 완료')


csv 파일 생성 완료
