In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

In [2]:
model_path='saved/EASE-Dec-21-2022_15-02-10.pth'
# rank K 설정
K = 20

# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128) # 245, 128

# user, item 길이
user_len = len(user_id2token) # 31361 (PAD 포함)
item_len = len(item_id2token) # 6808 (PAD 포함)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr') # (31361, 6808)

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink')) # 245, 128

for data in tbar: # data: 128, 
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction) # [1, 871424]
    score = score.view(-1, item_len) # 128, 6808

    rating_pred = score.cpu().data.numpy().copy() # 128, 6808

    user_index = data.numpy() # 128,

    # idx에는 128명의 영화상호작용이 True, False로 있다.
    idx = matrix[user_index].toarray() > 0 # idx shape: 128, 6808

    rating_pred[idx] = -np.inf # idx에서 True부분이 -inf로 변경
    rating_pred[:, 0] = -np.inf # 첫번째 PAD 열도 -inf로 변경
    
    # np.argpartition(배열, -K) : 배열에서 순서 상관없이 큰 값 K개를 뽑아 오른쪽에 놓겠다 -> 인덱스반환
    # rating_pred에서 각 행마다 K개의 score가 큰 인덱스를 오른쪽에 두고, 그 K개만 가져오기
    ind = np.argpartition(rating_pred, -K)[:, -K:] # rating_pred: (128, 6808) -> ind: (128, 20)

    user_row_index = np.arange(len(rating_pred)).reshape(-1,1) # [[0],[1],...,[127]]
    arr_ind = rating_pred[user_row_index, ind] # 128, 6808 -> 128, 20

    # arr_ind 내부에서 행별로, 내림차순 정렬해서 index 나오도록
    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    # ind는 item의 real index를 갖는 128,20 -> arr_ind_argsort를 통해 pred가 높은 상위 20개 read index 추출
    batch_pred_list = ind[user_row_index, arr_ind_argsort] # 128,20 -> 128,20

    if pred_list is None: # 처음에는 직접 정의
        pred_list = batch_pred_list
        user_list = user_index
    else: # pred_list가 있을 때는, append
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
print('inference done!')

<All keys matched successfully>

EASE()

[1;35mInference[0m: 100%|█████████████████████████████████████████████████| 245/245 [00:37<00:00,  6.53it/s][0m


inference done!


In [63]:
sub2 = sub.copy()

In [64]:
sub2['rating'] = sub2.groupby('user').item.cumcount()

In [65]:
recvae = pd.read_csv('Recvae_963.csv')

recvae['rating']=1

common = sub.merge(recvae,on=['user','item'])

## 기존 ease_recvase_combine.csv W=10
## ease_recvase_combine_5.csv는 W=5

In [66]:
W=5
for i in tqdm(common.itertuples(), total=common.shape[0]):
    sub2.loc[(sub2.user==i.user) & (sub2.item==i.item),'rating']-=W

100%|███████████████████████████████████████████████████████████| 414/414 [00:01<00:00, 398.15it/s]


In [67]:
i.user, i.item

(31297, 2762)

In [68]:
sub_last = sub2.sort_values(['user','rating'], ascending=True).groupby('user').head(10)

In [69]:
sub_last.rating.value_counts()

 1    31368
 8    31366
 5    31365
 4    31363
 0    31360
 7    31355
 6    31351
 3    31350
 2    31345
 9    31254
-3       32
-2       26
-1       23
-5       21
-4       21
Name: rating, dtype: int64

In [72]:
sub_last[sub_last.rating<0]

Unnamed: 0,user,item,rating
5082,254,356,-3
5883,294,1580,-2
16543,827,356,-2
23083,1154,356,-2
33144,1657,593,-1
...,...,...,...
593724,29686,920,-1
597781,29889,356,-4
600982,30049,356,-3
611340,30567,1580,-5


In [71]:
sub_last[sub_last.user==147]

Unnamed: 0,user,item,rating
2940,147,3038,0
2945,147,356,0
2941,147,1435,1
2942,147,659,2
2943,147,179,3
2944,147,43,4
2946,147,1580,6
2947,147,317,7
2948,147,644,8
2949,147,5042,9


In [73]:
sub_last.drop('rating',axis=1,inplace=True)

In [74]:
sub_last

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
627185,31359,4101
627186,31359,2812
627187,31359,2208
627188,31359,4581


In [75]:
sub_last.user = sub_last.user.map(uidx2user)
sub_last.item = sub_last.item.map(iidx2item)

In [76]:
sub_last.to_csv('ease_recvase_combine_5.csv',index=False)