In [12]:
import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch
import missingno

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM
from recbole.model.sequential_recommender.s3rec import S3Rec

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

In [3]:
!ls ../../data/train
!readlink -ef ../../data/train/train_ratings.csv

Ml_item2attributes.json  genres.tsv  train_ratings.csv	years.tsv
directors.tsv		 titles.tsv  writers.tsv
/opt/ml/input/data/train/train_ratings.csv


## 데이터 로드

In [17]:
data_path = '../../data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')

## genre
# genre 공백구분자 딕셔너리
genre_dict = dict(genre_data.groupby('item').genre.apply(lambda x: " ".join(list(x))))
title_data['genre'] = title_data.item.map(genre_dict) # 딕셔너리 적용

## year
title_data = title_data.merge(year_data, on='item', how='left')
# year에서 NaN값만, title의 year 데이터에서 채우기
title_data.year = title_data.year.fillna(title_data.title.map(lambda x: x[-5:-1])).astype(int)

## title
# title에서 "(년도)" 정보 제거
title_data.title = title_data.title.str.replace(r"(\(\d+-*\d*\))","").str.strip()

In [18]:
title_data

Unnamed: 0,item,title,genre,year
0,318,"Shawshank Redemption, The",Crime Drama,1994
1,2571,"Matrix, The",Action Sci-Fi Thriller,1999
2,2959,Fight Club,Action Crime Drama Thriller,1999
3,296,Pulp Fiction,Comedy Crime Drama Thriller,1994
4,356,Forrest Gump,Comedy Drama Romance War,1994
...,...,...,...,...
6802,73106,American Pie Presents: The Book of Love (Ameri...,Comedy,2009
6803,109850,Need for Speed,Action Crime Drama,2014
6804,8605,Taxi 3,Action Comedy,2003
6805,3689,Porky's II: The Next Day,Comedy,1983


In [13]:
title_data.columns=['item:token','title:token_seq','genre:token_seq','year:token']

In [8]:
data_path = '../../data/train'
train = pd.read_csv(os.path.join(data_path, 'train_ratings.csv'))

user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

## make inter file

In [None]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [None]:
train.columns=['user_id:token','item_id:token','timestamp:float']

In [None]:
train[:2]

In [None]:
outpath = f"dataset/sequential_data"
os.makedirs(outpath, exist_ok=True)
sub_train=train.groupby("user_id:token").sample(n=2, random_state=SEED)
sub_train.shape
sub_train.to_csv(os.path.join(outpath,"sequential_data.inter"),sep='\t',index=False)

In [None]:
sub_train.shape, train.shape

## make yaml file

In [None]:
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    
train_neg_sample_args: ~
ITEM_LIST_LENGTH_FIELD: item_length
LIST_SUFFIX: _list
MAX_ITEM_LIST_LENGTH: 50
"""
with open("s3rec_data.yaml", "w") as f:
    f.write(yamldata)

In [None]:
!ls dataset/sequential_data/

## make config, logger

In [None]:
logger = getLogger()

# configurations initialization
config = Config(model='S3Rec', dataset="sequential_data", config_file_list=[f's3rec_data.yaml'])
config['epochs'] = 1
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config['valid_metric'] = "Recall@10"
config['topk']=[10]
config['item_attribute'] = 'item_id'
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

## make dataset

In [13]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

22 Dec 22:01    INFO  sequential_data
The number of users: 31361
Average actions of users: 2.0
The number of items: 5203
Average actions of items: 12.056901191849288
The number of inters: 62720
The sparsity of the dataset: 99.96156186379929%
Remain Fields: ['user_id', 'item_id', 'timestamp']
22 Dec 22:01    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
22 Dec 22:01    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'mode': 'full', 'group_by': 'user'}]


In [14]:
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35msequential_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 5203
[1;34mAverage actions of items[0m: 7.1288929302114115
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.98078093189963%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'item_id_list', 'timestamp_list', 'item_length']

[1;35msequential_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 5203
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'item_id_list', 'timestamp_list', 'item_length']

[1;35msequential_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 5203
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'item_id_list', 'timestamp_list', 'item_length']

## make model

In [16]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = S3Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

22 Dec 22:02    INFO  S3Rec(
  (item_embedding): Embedding(5204, 64, padding_idx=0)
  (position_embedding): Embedding(50, 64)
  (feature_embedding): Embedding(5202, 64, padding_idx=0)
  (trm_encoder): TransformerEncoder(
    (layer): ModuleList(
      (0): TransformerLayer(
        (multi_head_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=True)
          (key): Linear(in_features=64, out_features=64, bias=True)
          (value): Linear(in_features=64, out_features=64, bias=True)
          (softmax): Softmax(dim=-1)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (dense): Linear(in_features=64, out_features=64, bias=True)
          (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (out_dropout): Dropout(p=0.5, inplace=False)
        )
        (feed_forward): FeedForward(
          (dense_1): Linear(in_features=64, out_features=256, bias=True)
          (dense_2): Linear(in_features=256, out_fe

## train

In [24]:
config['pretrain_epochs']=1
config['save_step']=1

In [25]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=False
)

22 Dec 22:43    INFO  epoch 0 training [time: 209.07s, train loss: 277064.3521]
22 Dec 22:43    INFO  Saving current: saved/S3Rec-sequential_data-1.pth


In [27]:
config['pre_model_path'] = 'saved/S3Rec-sequential_data-1.pth'

In [28]:
config['train_stage'] = 'finetune'

In [29]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=False
)

22 Dec 22:48    INFO  epoch 0 training [time: 208.20s, train loss: 276676.5093]
22 Dec 22:48    INFO  Saving current: saved/S3Rec-sequential_data-1.pth


In [254]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

## inference

In [2]:
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch
import missingno

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM
from recbole.model.sequential_recommender.s3rec import S3Rec

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

In [3]:
model_path='saved/S3Rec-sequential_data-1.pth'
# rank K 설정
K = 10

# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'sequential_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

<All keys matched successfully>

In [None]:
# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128)

# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))

for data in tbar:
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)
    break
    # user item별 score 예측
    score = model.full_sort_predict(interaction)
    score = score.view(-1, item_len)

    rating_pred = score.cpu().data.numpy().copy()

    user_index = data.numpy()

    idx = matrix[user_index].toarray() > 0

    rating_pred[idx] = -np.inf
    rating_pred[:, 0] = -np.inf
    ind = np.argpartition(rating_pred, -K)[:, -K:] #

    arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]

    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    batch_pred_list = ind[
        np.arange(len(rating_pred))[:, None], arr_ind_argsort
    ]

    if pred_list is None:
        pred_list = batch_pred_list
        user_list = user_index
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
sub.to_csv(
    "submission.csv", index=False
)
print('inference done!')

In [11]:
# https://github.com/RUCAIBox/RecBole/blob/master/run_example/recbole-using-all-items-for-prediction.ipynb
def add_last_item(old_interaction, last_item_id, max_len=50):
    new_seq_items = old_interaction['item_id_list'][-1]
    if old_interaction['item_length'][-1].item() < max_len:
        new_seq_items[old_interaction['item_length'][-1].item()] = last_item_id
    else:
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model):
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]
        test = {
            'item_id_list': add_last_item(input_interaction, 
                                          input_interaction['item_id'][-1].item(), model.max_seq_length),
            'item_length': torch.tensor(
                [input_interaction['item_length'][-1].item() + 1
                 if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, 10),new_inter

In [64]:
dict(input_interaction.interaction)

{'user_id': tensor([1]),
 'item_id': tensor([2]),
 'timestamp': tensor([1.2308e+09]),
 'item_length': tensor([1]),
 'item_id_list': tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]]),
 'timestamp_list': tensor([[1.2308e+09, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.

In [58]:
input_interaction['item_id']

2

In [None]:
add_last_item(input_interaction, input_interaction['item_id'][-1].item(), model.max_seq_length)

In [None]:
model.eval()
with torch.no_grad():
    uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
    index = np.isin(dataset[dataset.uid_field].numpy(), uid_series) # list, element -> True index
    input_interaction = dataset[index] # one interaction 추출
    test = {
        'item_id_list': add_last_item(input_interaction, 
                                      input_interaction['item_id'][-1].item(), model.max_seq_length),
        'item_length': torch.tensor(
            [input_interaction['item_length'][-1].item() + 1
             if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
    }
    new_inter = Interaction(test)
    new_inter = new_inter.to(config['device'])
    new_scores = model.full_sort_predict(new_inter)
    new_scores = new_scores.view(-1, test_data.dataset.item_num)
    new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf

In [45]:
all_users = list(map(str,range(1,len(user_id2token)-1)))

In [13]:
b,new_inter = predict_for_all_item('1', dataset, model)

In [25]:
b[1]

tensor([[   3,  910, 2418, 1360,  988,  485, 1189,  383, 3627, 1939]],
       device='cuda:0')

In [26]:
train_data.dataset[1].interaction

{'user_id': tensor(1493),
 'item_id': tensor(88),
 'timestamp': tensor(1.1132e+09),
 'item_length': tensor(1),
 'item_id_list': tensor([361,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]),
 'timestamp_list': tensor([1.1132e+09, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,


In [17]:
new_inter.interaction

{'item_id_list': tensor([[4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]], device='cuda:0'),
 'item_length': tensor([2], device='cuda:0')}

In [56]:
result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])

tensor([[   3,  910, 2418, 1360,  988,  485, 1189,  383, 3627, 1939]],
       device='cuda:0')

In [None]:
for i in all_users:
    predict_for_all_item(i, dataset, model)