In [107]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

# from recbole.model.general_recommender.ease import EASE
# from recbole.model.context_aware_recommender.ffm import FFM
# from recbole.model.general_recommender.neumf import NeuMF

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

In [108]:
!ls ../../data/train
!readlink -ef ../../data/train/train_ratings.csv

pro_sg	train_ratings.csv
/opt/ml/input/data/train/train_ratings.csv


## 데이터 로드

In [109]:
train = pd.read_csv("/opt/ml/input/project/model/data/train.csv")

In [110]:
train[:2]

Unnamed: 0,userid,rest,user_code,rest_code
0,5b61c7658f8242cb2a1b1028,1100141000.0,0,14
1,5b61c7658f8242cb2a1b1028,304986700.0,0,6


In [111]:
user2idx = {v:k for k,v in enumerate(sorted(set(train.user_code)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.rest_code)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user_code)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.rest_code)))}

## make inter file

In [113]:
train.columns=['userid:token','restid:token','user_id:token','item_id:token']

In [114]:
train[:2]

Unnamed: 0,userid:token,restid:token,user_id:token,item_id:token
0,5b61c7658f8242cb2a1b1028,1100141000.0,0,14
1,5b61c7658f8242cb2a1b1028,304986700.0,0,6


In [115]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"train_data.inter"),sep='\t',index=False)

## make yaml file

In [116]:
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""
with open("ease.yaml", "w") as f:
    f.write(yamldata)

121

## make config, logger

In [178]:
logger = getLogger()

# configurations initialization
config = Config(model='MultiVAE', dataset="train_data", config_file_list=[f'ease.yaml'])
config['epochs'] = 10
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config['valid_metric'] = "Recall@10"
config['eval_args'] = {'split': {'RS': [1, 0, 0]},
                         'group_by': 'user',
                         'order': 'RO',
                         'mode': 'full'}
config['learning_rate'] = 0.01
config['topk']=[20]
config['mlp_hidden_size'] = [800,200,100]
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

31 Jan 05:51    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.01
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [20]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separ

## make dataset

In [140]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

31 Jan 05:10    INFO  train_data
The number of users: 382941
Average actions of users: 17.58996448529796
The number of items: 41408
Average actions of items: 162.67541719999033
The number of inters: 6735901
The sparsity of the dataset: 99.95752048263277%
Remain Fields: ['user_id', 'item_id']
31 Jan 05:10    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
31 Jan 05:10    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [141]:
train_data.dataset
valid_data.dataset
test_data.dataset

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1;35mtrain_data[0m
[1;34mThe number of users[0m: 382941
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 41408
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id']

## make model

In [142]:
from recbole.model.general_recommender.multivae import MultiVAE

In [179]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = MultiVAE(config, train_data.dataset).to(config['device'])
logger.info(model)

31 Jan 05:52    INFO  MultiVAE(
  (encoder): Sequential(
    (0): Linear(in_features=41408, out_features=800, bias=True)
    (1): Tanh()
    (2): Linear(in_features=800, out_features=200, bias=True)
    (3): Tanh()
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): Tanh()
    (6): Linear(in_features=100, out_features=128, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=64, out_features=100, bias=True)
    (1): Tanh()
    (2): Linear(in_features=100, out_features=200, bias=True)
    (3): Tanh()
    (4): Linear(in_features=200, out_features=800, bias=True)
    (5): Tanh()
    (6): Linear(in_features=800, out_features=41408, bias=True)
  )
)
Trainable parameters: 66675736


## train

In [180]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

31 Jan 05:53    INFO  epoch 0 training [time: 12.74s, train loss: 32740.8203]
31 Jan 05:53    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:53    INFO  epoch 1 training [time: 12.74s, train loss: 29053.8846]
31 Jan 05:53    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:53    INFO  epoch 2 training [time: 12.81s, train loss: 27582.7622]
31 Jan 05:54    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:54    INFO  epoch 3 training [time: 12.80s, train loss: 26584.3931]
31 Jan 05:54    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:54    INFO  epoch 4 training [time: 12.80s, train loss: 25755.2664]
31 Jan 05:54    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:54    INFO  epoch 5 training [time: 12.83s, train loss: 25183.4543]
31 Jan 05:55    INFO  Saving current: saved/MultiVAE-Jan-31-2023_05-53-14.pth
31 Jan 05:55    INFO  epoch 6 training [time: 12.81s, train loss

In [145]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

## inference

In [181]:
model_path='./saved/MultiVAE-Jan-31-2023_05-53-14.pth'
# rank K 설정
K = 20

In [182]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]


In [None]:
# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,820) # 2335 , 164

# user, item 길이
user_len = len(user_id2token) # 382941 (PAD 포함)
item_len = len(item_id2token) # 41408 (PAD 포함)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr') # (382941, 41408)

# user id, predict item id 저장 변수
pred_list = None
user_list = None

In [None]:

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink')) # 245, 128

for data in tbar: # data: 128, 
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction) # [1, 871424]
    score = score.view(-1, item_len) # 128, 6808

    rating_pred = score.cpu().data.numpy().copy() # 128, 6808

    user_index = data.numpy() # 128,

    # idx에는 128명의 영화상호작용이 True, False로 있다.
    idx = matrix[user_index].toarray() > 0 # idx shape: 128, 6808

    rating_pred[idx] = -np.inf # idx에서 True부분이 -inf로 변경
    rating_pred[:, 0] = -np.inf # 첫번째 PAD 열도 -inf로 변경
    
    # np.argpartition(배열, -K) : 배열에서 순서 상관없이 큰 값 K개를 뽑아 오른쪽에 놓겠다 -> 인덱스반환
    # rating_pred에서 각 행마다 K개의 score가 큰 인덱스를 오른쪽에 두고, 그 K개만 가져오기
    ind = np.argpartition(rating_pred, -K)[:, -K:] # rating_pred: (128, 6808) -> ind: (128, 20)

    user_row_index = np.arange(len(rating_pred)).reshape(-1,1) # [[0],[1],...,[127]]
    arr_ind = rating_pred[user_row_index, ind] # 128, 6808 -> 128, 20

    # arr_ind 내부에서 행별로, 내림차순 정렬해서 index 나오도록
    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    # ind는 item의 real index를 갖는 128,20 -> arr_ind_argsort를 통해 pred가 높은 상위 20개 read index 추출
    batch_pred_list = ind[user_row_index, arr_ind_argsort] # 128,20 -> 128,20

    if pred_list is None: # 처음에는 직접 정의
        pred_list = batch_pred_list
        user_list = user_index
    else: # pred_list가 있을 때는, append
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
sub.to_csv(
    "submission.csv", index=False
)
print('inference done!')

Inference:   0%|          | 0/467 [00:00<?, ?it/s]:   0%|          | 1/467 [00:00<05:14,  1.48it/s]:   0%|          | 2/467 [00:01<05:10,  1.50it/s]:   1%|          | 3/467 [00:02<05:13,  1.48it/s]:   1%|          | 4/467 [00:02<05:11,  1.49it/s]:   1%|          | 5/467 [00:03<05:11,  1.48it/s]:   1%|▏         | 6/467 [00:04<05:13,  1.47it/s]:   1%|▏         | 7/467 [00:04<05:12,  1.47it/s]:   2%|▏         | 8/467 [00:05<05:12,  1.47it/s]:   2%|▏         | 9/467 [00:06<05:09,  1.48it/s]:   2%|▏         | 10/467 [00:06<05:08,  1.48it/s]:   2%|▏         | 11/467 [00:07<05:09,  1.47it/s]:   3%|▎         | 12/467 [00:08<05:07,  1.48it/s]:   3%|▎         | 13/467 [00:08<05:08,  1.47it/s]:   3%|▎         | 14/467 [00:09<05:06,  1.48it/s]:   3%|▎         | 15/467 [00:10<05:05,  1.48it/s]:   3%|▎         | 16/467 [00:10<05:05,  1.48it/s]:   4%|▎         | 17/467 [00:11<05:04,  1.48it/s]:   4%|▍         | 18/467 [00:12<05:03,  1.48it/s]:   4%|▍         | 19/467 [00:12<05:02,  1.48it/s]:   4%|▍ 

inference done!


## Recall@K

In [101]:
answer = pd.read_csv('/opt/ml/input/project/model/data/S_test.csv')

In [None]:
predict_user = sub.groupby('user')['item'].apply(list) 
answer_user = answer.groupby('user')['item'].apply(list)


In [158]:
train['item_id:token'].value_counts()

5619     1795
408      1623
1959     1505
1303     1453
3996     1430
         ... 
41232       1
38818       1
41231       1
41230       1
41460       1
Name: item_id:token, Length: 41407, dtype: int64

In [None]:
predict_user

user
0         [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
1         [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
2         [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
3         [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
4         [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
                                ...                        
382935    [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
382936    [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
382937    [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
382938    [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
382939    [341, 408, 3996, 1592, 7837, 11058, 3220, 1435...
Name: item, Length: 382940, dtype: object

In [132]:
_recall = []

for i, ans in enumerate(answer_user):
    a = 0
    for j in ans:
        if j in predict_user[i]:
            a += 1 
    _recall.append(a/2)

In [133]:
recall = sum(_recall) / len(_recall)
recall

9.916699722332408e-05