## 데이터 로드

In [148]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

## make inter file

In [180]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [11]:
train.columns=['user_id:token','item_id:token','timestamp:float']

In [12]:
train[:2]

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2505,1230782529
1,0,109,1230782534


In [14]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"train_data.inter"),sep='\t',index=False)

In [15]:
train.shape

(5154471, 3)

## make yaml file

In [16]:
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""
with open("ease.yaml", "w") as f:
    f.write(yamldata)

121

## make config, logger

In [17]:
logger = getLogger()

# configurations initialization
config = Config(model='EASE', dataset="train_data", config_file_list=[f'ease.yaml'])
# config['epochs'] = 100
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config['valid_metric'] = "Recall@10"
config['eval_args'] = {'split': {'RS': [1, 0, 0]},
                         'group_by': 'user',
                         'order': 'RO',
                         'mode': 'full'}
config['topk']=[20]
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

23 Dec 15:53    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [20]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_sep

## make dataset

In [18]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

23 Dec 15:54    INFO  train_data
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp']
23 Dec 15:54    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
23 Dec 15:54    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [19]:
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 164.36450892857144
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 757.2309387395328
[1;34mThe number of inters[0m: 5154471
[1;34mThe sparsity of the dataset[0m: 97.58579218741939%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

## make model

In [20]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = EASE(config, train_data.dataset).to(config['device'])
logger.info(model)

23 Dec 15:54    INFO  EASE()
Trainable parameters: 1


## train

In [21]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

23 Dec 15:55    INFO  epoch 0 training [time: 12.03s, train loss: 0.0000]
23 Dec 15:55    INFO  Saving current: saved/EASE-Dec-23-2022_15-54-55.pth


In [253]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

21 Dec 16:29    INFO  epoch 0 training [time: 14.01s, train loss: 0.0000]
21 Dec 16:29    INFO  Saving current: saved/EASE-Dec-21-2022_16-28-47.pth


In [22]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

## inference

In [8]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

In [149]:
model_path='saved/EASE-Dec-21-2022_15-02-10.pth'
# rank K 설정
K = 20

# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128) # 245, 128

# user, item 길이
user_len = len(user_id2token) # 31361 (PAD 포함)
item_len = len(item_id2token) # 6808 (PAD 포함)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr') # (31361, 6808)

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink')) # 245, 128

for data in tbar: # data: 128, 
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction) # [1, 871424]
    score = score.view(-1, item_len) # 128, 6808

    rating_pred = score.cpu().data.numpy().copy() # 128, 6808

    user_index = data.numpy() # 128,

    # idx에는 128명의 영화상호작용이 True, False로 있다.
    idx = matrix[user_index].toarray() > 0 # idx shape: 128, 6808

    rating_pred[idx] = -np.inf # idx에서 True부분이 -inf로 변경
    rating_pred[:, 0] = -np.inf # 첫번째 PAD 열도 -inf로 변경
    
    # np.argpartition(배열, -K) : 배열에서 순서 상관없이 큰 값 K개를 뽑아 오른쪽에 놓겠다 -> 인덱스반환
    # rating_pred에서 각 행마다 K개의 score가 큰 인덱스를 오른쪽에 두고, 그 K개만 가져오기
    ind = np.argpartition(rating_pred, -K)[:, -K:] # rating_pred: (128, 6808) -> ind: (128, 20)

    user_row_index = np.arange(len(rating_pred)).reshape(-1,1) # [[0],[1],...,[127]]
    arr_ind = rating_pred[user_row_index, ind] # 128, 6808 -> 128, 20

    # arr_ind 내부에서 행별로, 내림차순 정렬해서 index 나오도록
    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    # ind는 item의 real index를 갖는 128,20 -> arr_ind_argsort를 통해 pred가 높은 상위 20개 read index 추출
    batch_pred_list = ind[user_row_index, arr_ind_argsort] # 128,20 -> 128,20

    if pred_list is None: # 처음에는 직접 정의
        pred_list = batch_pred_list
        user_list = user_index
    else: # pred_list가 있을 때는, append
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
print('inference done!')

In [152]:
sub

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
627195,31359,2541
627196,31359,178
627197,31359,1206
627198,31359,2603


In [97]:
arr_ind_argsort

array([[13,  2, 16, ..., 11, 10,  0],
       [13, 17, 11, ...,  2,  1,  0],
       [13, 12, 14, ...,  1,  2,  0],
       ...,
       [19, 18, 16, ...,  1,  2,  0],
       [19, 12, 10, ...,  2,  1,  0],
       [11, 14,  7, ...,  2,  1,  0]])

In [100]:
ind

array([[ 960,  202,  290, ...,  391,  274,  281],
       [ 554,  743,  573, ...,   83,  158,  377],
       [1329, 1162,  274, ...,  814, 3478,  287],
       ...,
       [ 265,  339,  626, ...,  962,  298,  139],
       [  73,  199,  629, ...,  377,  505,   85],
       [1328,  965,  622, ...,  158,  377,   73]])

In [133]:
train[train.user==0].shape

(376, 3)

In [130]:
sum(matrix.toarray()[1])

376.0

In [236]:
matrix = dataset.inter_matrix(form='csr') # (31361, 6808)

In [238]:
coo = matrix.tocoo(copy=False)

In [241]:
len(coo.data)

5154471

In [242]:
df = pd.DataFrame({'index': coo.row, 'col': coo.col, 'data': coo.data}).set_index(["index", "col"])

In [244]:
n_rows, n_cols = coo.shape
rows, cols = map(np.ndarray.flatten, np.mgrid[:n_rows, :n_cols])
filling = pd.DataFrame({"index": rows, "col": cols, "data": np.repeat(0, n_rows * n_cols)}) \
    .set_index(["index", "col"])

res = df.combine_first(filling).reset_index()

In [None]:
res.data.value_counts()

In [225]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [None]:
['user_id:token','item_id:token','timestamp:float']

## test dataset 만들기

In [24]:
!mkdir -p dataset/test_data20

In [94]:
sub = pd.DataFrame(result, columns=["user", "item"])
sub2 = sub.copy()

In [95]:
sub

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
627195,31359,2541
627196,31359,178
627197,31359,1206
627198,31359,2603


In [96]:
exist_item = sorted(set(sub2.item))

In [97]:
n_exist_item = [i for i in range(6807) if i not in exist_item]

In [80]:
len(n_exist_item)

4090

In [98]:
for i in tqdm(n_exist_item):
    sub2.loc[len(sub2),]=[31359,int(i)]

100%|█████████████████████████████████████████████████████████| 4090/4090 [00:10<00:00, 388.71it/s]


In [103]:
sub2 = sub2.astype(int)

In [105]:
sub2['time']=np.nan

sub2.columns = ['user_id:token','item_id:token','timestamp:float']

sub2.to_csv("dataset/test_data20/train_data.inter",sep='\t',index=False)

In [53]:
!ls dataset/test_data20

test_data20.inter  train_data.inter


In [48]:
config['data_path']="dataset/test_data20"

In [49]:
config['dataset_save_path']=""

In [55]:
config['eval_args']['split']['RS']=[1,0,99]

In [56]:
config['eval_args']

{'split': {'RS': [1, 0, 99]},
 'group_by': 'user',
 'order': 'RO',
 'mode': 'full'}

In [106]:
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

In [107]:
train_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0013073979591838
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 20.430058555627845
[1;34mThe number of inters[0m: 31401
[1;34mThe sparsity of the dataset[0m: 99.98529266349101%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [162]:
valid_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [161]:
test_data.dataset

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

## sub 파일 train_dataset.inter로 쓰기

In [143]:
sub = pd.DataFrame(result, columns=["user", "item"])

In [145]:
sub.user = sub.user.map(uidx2user)

In [146]:
sub.item = sub.item.map(iidx2item)

In [147]:
sub

Unnamed: 0,user,item
0,11,4370
1,11,4886
2,11,47
3,11,32587
4,11,40815
...,...,...
627195,138493,4720
627196,138493,293
627197,138493,2174
627198,138493,4848


In [141]:
sub

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2381,
1,0,2619,
2,0,41,
3,0,4581,
4,0,4790,
...,...,...,...
627195,31359,2541,
627196,31359,178,
627197,31359,1206,
627198,31359,2603,


In [None]:
sub['time']=np.nan

sub.columns = ['user_id:token','item_id:token','timestamp:float']
!mkdir dataset/ease_data20
sub.to_csv("dataset/ease_data20/ease_data20.inter",sep='\t',index=False)

## full_sort_topk

In [181]:
train

Unnamed: 0,user,item,time
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539
3,0,368,1230782542
4,0,1183,1230782563
...,...,...,...
5154466,31359,4882,1260209449
5154467,31359,2652,1260209482
5154468,31359,5768,1260209720
5154469,31359,4791,1260209726


In [167]:
config['eval_args']['split']['RS']=[8,1,1]

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

In [170]:
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.68288738606292
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [171]:
valid_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.63958272112842
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [172]:
full_sort_topk(data, model, test_data, 10, device=device)

  uid_series = torch.tensor(uid_series)


torch.return_types.topk(
values=tensor([[1.2123, 1.0779, 1.0422,  ..., 0.8778, 0.8621, 0.8151],
        [0.7155, 0.6466, 0.6000,  ..., 0.5113, 0.4922, 0.4918],
        [0.6584, 0.6403, 0.6158,  ..., 0.4063, 0.3620, 0.3482],
        ...,
        [0.5309, 0.4816, 0.4163,  ..., 0.3179, 0.2975, 0.2974],
        [1.0423, 1.0132, 0.9649,  ..., 0.6417, 0.6149, 0.6103],
        [0.8704, 0.7527, 0.6028,  ..., 0.4454, 0.4451, 0.4445]],
       dtype=torch.float64),
indices=tensor([[ 127,  118,  375,  ...,  113,  310,  485],
        [ 272,   74,  723,  ..., 1328, 2481,  470],
        [ 529,  605,  265,  ..., 1284,   83,  629],
        ...,
        [ 225,  219, 2278,  ...,  314,  342,   47],
        [ 342,  632,  149,  ..., 1319,  159,  199],
        [ 377,  210,  565,  ..., 1110,  313,  569]]))

In [173]:
full_sort_topk(data, model, valid_data, 10, device=device)

  uid_series = torch.tensor(uid_series)


torch.return_types.topk(
values=tensor([[1.2131, 1.2123, 1.0963,  ..., 1.0033, 0.9646, 0.9450],
        [0.8679, 0.8007, 0.7155,  ..., 0.5643, 0.5376, 0.5303],
        [0.6584, 0.6403, 0.6158,  ..., 0.4063, 0.3620, 0.3482],
        ...,
        [0.5309, 0.4816, 0.4163,  ..., 0.3179, 0.2975, 0.2974],
        [1.0423, 1.0132, 1.0013,  ..., 0.7582, 0.7536, 0.6992],
        [0.8704, 0.7527, 0.6503,  ..., 0.4972, 0.4615, 0.4563]],
       dtype=torch.float64),
indices=tensor([[  41,  127,  126,  ...,   42,  271,   65],
        [  62,  388,  272,  ...,  614, 1732,  204],
        [ 529,  605,  265,  ..., 1284,   83,  629],
        ...,
        [ 225,  219, 2278,  ...,  314,  342,   47],
        [ 342,  632,  669,  ...,  227,  141,  717],
        [ 377,  210, 1099,  ...,  792, 1085,  207]]))

### full_sort_topk에서 test_dataset의 역할 확인

In [222]:
pred_list2 = None
user_list2 = []
from recbole.utils.case_study import full_sort_topk
for data in tbar:
    batch_pred_list2 = full_sort_topk(data, model, test_data, 47, device=device)[1]
    batch_pred_list2 = batch_pred_list2.clone().detach().cpu().numpy()
    if pred_list2 is None:
        pred_list2 = batch_pred_list2
        user_list2 = data.numpy()
    else:
        pred_list2 = np.append(pred_list2, batch_pred_list2, axis=0)
        user_list2 = np.append(
            user_list2, data.numpy(), axis=0
        )
tbar.close()

  uid_series = torch.tensor(uid_series)


In [223]:
# user별 item 추천 결과 하나로 합쳐주기
result2 = []
for user, pred in zip(user_list2, pred_list2):
    for item in pred:
        result2.append((int(user_id2token[user]), int(item_id2token[item])))

In [237]:
sub2 = pd.DataFrame(result2, columns=["user", "item"])

In [None]:
train_sub2 = train[(train.user==0) & (train.item.isin(sub2[sub2.user==0].item))].sort_values('item')
print(train_sub2.shape)
train_sub2

In [None]:
sub_sub2 = sub[(sub.user==0) & (sub.item.isin(sub2[sub2.user==0].item))].sort_values('item')
print(sub_sub2.shape)
sub_sub2

In [240]:
set(train_sub2.item).intersection(set(sub_sub2.item))

set()

In [None]:
sub2[sub2.user==0].sort_values('item')

### test dataset 전체로 수정

In [400]:
config['eval_args']['split']['RS']=[1,0,999999999]

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

In [401]:
train_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 7.452471482889734
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.98531186672648%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [402]:
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 163.36450892857144
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 752.6239165564859
[1;34mThe number of inters[0m: 5123111
[1;34mThe sparsity of the dataset[0m: 97.60048032069291%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [403]:
data[:1]

tensor([1])

In [419]:
pred_list3 = full_sort_topk(data[:1], model, test_data, 200, device=device)[1]

result3 = []
for item in pred_list3[0]:
    result3.append((0, int(item_id2token[item])))

sub3 = pd.DataFrame(result3, columns=["user", "item"])

  uid_series = torch.tensor(uid_series)


In [420]:
sub3

Unnamed: 0,user,item
0,0,424
1,0,287
2,0,717
3,0,663
4,0,4659
...,...,...
195,0,5768
196,0,908
197,0,4717
198,0,5218


In [421]:
train[train.user==0].shape

(376, 3)

In [422]:
## 기존 EASE 결과에 포함되는 정도
sub_sub3 = sub[(sub.user==0) & (sub.item.isin(sub3[sub3.user==0].item))].sort_values('item')
print(sub_sub3.shape)
sub_sub3

(12, 2)


Unnamed: 0,user,item
10,0,1
2,0,41
9,0,1284
0,0,2381
1,0,2619
5,0,3663
7,0,3679
8,0,4101
3,0,4581
6,0,4597


In [423]:
# train 결과에 포함되는 정도
train_sub3 = train[(train.user==0) & (train.item.isin(sub3[sub3.user==0].item))].sort_values('item')
print(train_sub3.shape)
train_sub3

(188, 3)


Unnamed: 0,user,item,time
267,0,0,1230858821
33,0,18,1230783704
17,0,31,1230783095
205,0,76,1230853748
61,0,92,1230785343
...,...,...,...
86,0,5522,1230787565
343,0,5547,1251170254
346,0,5665,1251170364
350,0,5741,1251170472


### test dataset을 매우적게 한 유저 예측

In [276]:
config['eval_args']['split']['RS']=[999999,0,1]

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

In [277]:
train_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 163.36450892857144
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 752.6239165564859
[1;34mThe number of inters[0m: 5123111
[1;34mThe sparsity of the dataset[0m: 97.60048032069291%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [278]:
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 7.5675675675675675
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.98531186672648%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [387]:
n=0 # n번 유저
k=20 # top k 추천


data=all_user_list[0]
print(data[n:n+1])
pred_list3 = full_sort_topk(data[n:n+1], model, test_data, k, device=device)[1]

result3 = []
for item in pred_list3[0]:
    result3.append((n, int(item_id2token[item])))

sub3 = pd.DataFrame(result3, columns=["user", "item"])
sub3

tensor([1])


  uid_series = torch.tensor(uid_series)


Unnamed: 0,user,item
0,0,2381
1,0,916
2,0,2619
3,0,41
4,0,4581
5,0,4790
6,0,3663
7,0,4597
8,0,3679
9,0,4101


In [388]:
sub_sub3 = sub[(sub.user==n) & (sub.item.isin(sub3[sub3.user==n].item))].sort_values('item')
print(sub_sub3.shape)
sub_sub3

(19, 2)


Unnamed: 0,user,item
10,0,1
2,0,41
17,0,177
16,0,1206
9,0,1284
12,0,1658
13,0,2036
14,0,2187
15,0,2366
0,0,2381


In [389]:
train_sub3 = train[(train.user==n) & (train.item.isin(sub3[sub3.user==n].item))].sort_values('item')
print(train_sub3.shape)
train_sub3

(1, 3)


Unnamed: 0,user,item,time
192,0,916,1230789404


In [334]:
train[train.user==n].shape

(172, 3)

### test_dataset 매우 적게 전체 예측 - full_sort_topk 확인

In [276]:
config['eval_args']['split']['RS']=[999999,0,1]

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

In [277]:
train_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 163.36450892857144
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 752.6239165564859
[1;34mThe number of inters[0m: 5123111
[1;34mThe sparsity of the dataset[0m: 97.60048032069291%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [278]:
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 7.5675675675675675
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.98531186672648%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

In [340]:
pred_list4 = None
user_list4 = []
from recbole.utils.case_study import full_sort_topk

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128) # 245, 128

tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink')) # 245, 128

for data in tbar:
    batch_pred_list4 = full_sort_topk(data, model, test_data, 30, device=device)[1]
    batch_pred_list4 = batch_pred_list4.clone().detach().cpu().numpy()
    if pred_list4 is None:
        pred_list4 = batch_pred_list4
        user_list4 = data.numpy()
    else:
        pred_list4 = np.append(pred_list4, batch_pred_list4, axis=0)
        user_list4 = np.append(
            user_list4, data.numpy(), axis=0
        )
tbar.close()
# user별 item 추천 결과 하나로 합쳐주기
result4 = []
for user, pred in zip(user_list4, pred_list4):
    for item in pred:
        result4.append((int(user_id2token[user]), int(item_id2token[item])))

sub4 = pd.DataFrame(result4, columns=["user", "item"])

[1;35mInference[0m: 100%|█████████████████████████████████████████████████| 245/245 [00:34<00:00,  7.10it/s][0m


In [381]:
sub4.shape

(940800, 2)

In [343]:
sub_top10 = sub.groupby('user').head(10).reset_index(drop=True)
sub_top10

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
313595,31359,4101
313596,31359,2812
313597,31359,2208
313598,31359,4581


In [349]:
sub_top10.merge(sub4, on=['user','item'])

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
313595,31359,4101
313596,31359,2812
313597,31359,2208
313598,31359,4581


In [352]:
sub4_top10 = sub4.groupby('user').head(10).reset_index(drop=True)
sub4_top10

Unnamed: 0,user,item
0,0,2381
1,0,916
2,0,2619
3,0,41
4,0,4581
...,...,...
313595,31359,4101
313596,31359,2812
313597,31359,2208
313598,31359,4581


In [353]:
sub_top10.merge(sub4_top10, on=['user','item'])

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
303185,31359,4101
303186,31359,2812
303187,31359,2208
303188,31359,4581


In [367]:
sub_top10['sub']=1
sub4_top10['sub4']=1

In [368]:
sub_sub4 = sub_top10.merge(sub4_top10, on=['user','item'],how='right')
sub_sub4

Unnamed: 0,user,item,sub,sub4
0,0,2381,1.0,1
1,0,916,,1
2,0,2619,1.0,1
3,0,41,1.0,1
4,0,4581,1.0,1
...,...,...,...,...
313595,31359,4101,1.0,1
313596,31359,2812,1.0,1
313597,31359,2208,1.0,1
313598,31359,4581,1.0,1


In [374]:
only_sub4 = sub_sub4[(sub_sub4['sub'].isna()) & (sub_sub4.sub4.notna())]
only_sub4

Unnamed: 0,user,item,sub,sub4
1,0,916,,1
23,2,3010,,1
32,3,1642,,1
42,4,5936,,1
71,7,1580,,1
...,...,...,...,...
313511,31351,647,,1
313534,31353,1089,,1
313540,31354,1435,,1
313554,31355,4522,,1


In [382]:
train.shape

(5154471, 3)

In [378]:
only_sub4_train = only_sub4.merge(train, on=['user','item'], how='left')
only_sub4_train

Unnamed: 0,user,item,sub,sub4,time
0,0,916,,1,1230789404
1,2,3010,,1,1195573950
2,3,1642,,1,1277963517
3,4,5936,,1,1424734588
4,7,1580,,1,1182349131
...,...,...,...,...,...
10405,31351,647,,1,1180760864
10406,31353,1089,,1,1355632199
10407,31354,1435,,1,1194038965
10408,31355,4522,,1,1274233896


In [380]:
sum(only_sub4_train['sub'])

nan

In [379]:
sum(only_sub4_train.sub4)

10410

In [124]:
sub3[sub3.user==0].sort_values('item')

Unnamed: 0,user,item
6,0,0
7,0,229
1,0,287
0,0,424
3,0,663
2,0,717
9,0,2812
8,0,3240
5,0,3994
4,0,4659


In [126]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [129]:
train[(train.user==0) & (train.item.isin(sub3[sub3.user==0].item))].sort_values('item')

Unnamed: 0,user,item,time
267,0,0,1230858821
279,0,229,1230858954
149,0,287,1230788713
116,0,424,1230788340
40,0,663,1230784258
72,0,717,1230785922
157,0,2812,1230788806
22,0,3240,1230783392
124,0,3994,1230788473
126,0,4659,1230788489


In [118]:
sub3[sub3.user==0].sort_valuesalues('item')

Unnamed: 0,user,item
6,0,0
7,0,229
1,0,287
0,0,424
3,0,663
2,0,717
9,0,2812
8,0,3240
5,0,3994
4,0,4659


In [218]:
sub2.user = sub2.user.map(uidx2user)
sub2.item = sub2.item.map(iidx2item)

In [219]:
sub2

Unnamed: 0,user,item
0,11,780
1,11,480
2,11,1270
3,11,1214
4,11,34048
...,...,...
313595,138493,4306
313596,138493,5218
313597,138493,7361
313598,138493,4973


In [220]:
sub2.item[0] in train[train.user==11].item

False

In [223]:
for uid in sorted(set(sub2.user)):
    sub2[(sub2.user==uid) & sub2.item.isin(train[train.user==uid].item)]
    break

Unnamed: 0,user,item
0,11,780
1,11,480
2,11,1270
3,11,1214
4,11,34048
5,11,8644
6,11,1
7,11,367
8,11,6502
9,11,5349


## 제출파일 생성

In [None]:
sub

In [None]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [132]:
sub.to_csv('EASE_1_0.csv',index=False)

In [133]:
sub

Unnamed: 0,user,item
0,11,4370
1,11,4886
2,11,47
3,11,32587
4,11,40815
...,...,...
313595,138493,8961
313596,138493,5349
313597,138493,4022
313598,138493,32587


In [137]:
sub.user = sub.user.map(user2idx)
sub.item = sub.item.map(item2idx)

In [144]:
train.columns=['user','item','time']
train[:3]

Unnamed: 0,user,item,time
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539


In [138]:
afterdf = sub.merge(train[['user','item','time']], on=['user','item'], how='left')

In [139]:
afterdf[afterdf['timestamp:float'].notna()]

Unnamed: 0,user,item,timestamp:float


## 시간 순서 잘 지켜졌는지 확인해보기

In [446]:
train.user = train.user.map(uidx2user)
train.item = train.item.map(iidx2item)

In [441]:
import time
from time import localtime

In [442]:
data_path = '../../data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

In [469]:
train['review_year'] = train['time'].apply(lambda x : localtime(x).tm_year)
train.sort_values(['user','time'],inplace=True)
user2lastyear = dict(train.groupby('user').review_year.max())

### train에 review_year, month, day, year_month 추가

In [153]:
train['review_year'] = train['time'].apply(lambda x : localtime(x).tm_year)

In [155]:
train['month'] = train['time'].apply(lambda x : localtime(x).tm_mon)

In [156]:
train['day'] = train['time'].apply(lambda x : localtime(x).tm_mday)

In [166]:
train['year_month'] = train['time'].apply(lambda x : time.strftime('%Y-%m',localtime(x)))

In [168]:
train.sort_values(['user','time'],inplace=True)

In [172]:
user2lastyear = dict(train.groupby('user').review_year.max())

In [173]:
user2lastyearmonth = dict(train.groupby('user').year_month.max())

In [176]:
train

Unnamed: 0,user,item,time,year,month,day,year_month,lastyear,last_yearmonth
0,11,4643,1230782529,2009,1,1,2009-01,2011,2011-01
1,11,170,1230782534,2009,1,1,2009-01,2011,2011-01
2,11,531,1230782539,2009,1,1,2009-01,2011,2011-01
3,11,616,1230782542,2009,1,1,2009-01,2011,2011-01
4,11,2140,1230782563,2009,1,1,2009-01,2011,2011-01
...,...,...,...,...,...,...,...,...,...
5154466,138493,44022,1260209449,2009,12,7,2009-12,2009,2009-12
5154467,138493,4958,1260209482,2009,12,7,2009-12,2009,2009-12
5154468,138493,68319,1260209720,2009,12,7,2009-12,2009,2009-12
5154469,138493,40819,1260209726,2009,12,7,2009-12,2009,2009-12


### year 채워넣기

In [448]:
merge_df = train.copy()
side_info = [title_data,year_data]
for side in side_info:
    merge_df = merge_df.merge(side,how = 'left',on='item')
merge_df.head()

Unnamed: 0,user,item,time,review_year,title,year
0,11,4643,1230782529,2009,Planet of the Apes (2001),2001.0
1,11,170,1230782534,2009,Hackers (1995),1995.0
2,11,531,1230782539,2009,"Secret Garden, The (1993)",1993.0
3,11,616,1230782542,2009,"Aristocats, The (1970)",1970.0
4,11,2140,1230782563,2009,"Dark Crystal, The (1982)",1982.0


In [449]:
merge_df['year_from_title'] = merge_df['title'].apply(lambda x : (x[-5:-1]))

In [450]:
merge_df.loc[merge_df['year_from_title']=='007-','year_from_title'] = 2007

In [451]:
merge_df.year_from_title=merge_df.year_from_title.astype(int)

In [452]:
cond = merge_df.year_from_title == merge_df.year

In [453]:
merge_df[~cond & merge_df.year.notna()].title.value_counts()

Fawlty Towers (1975-1979)    163
Name: title, dtype: int64

In [454]:
merge_df.year = merge_df.year.fillna(merge_df.year_from_title)

In [455]:
merge_df.year = merge_df.year.astype(int)

In [None]:
merge_df[merge_df.lastyear < merge_df.year]

In [460]:
## item별 year이 모두 동일한지 첫번째값과 평균값이 같은 아이템들 확인 -> 모두 일치
sum(merge_df.groupby('item').year.first() == merge_df.groupby('item').year.mean())

6807

In [458]:
item2year = dict(merge_df.groupby('item').year.first())

In [467]:
!ls baseline/index

iidx2item.pickle  item2year.pickle  user2idx.pickle
item2idx.pickle   uidx2user.pickle


In [464]:
import pickle

with open('baseline/index/item2year.pickle','wb') as f:
    pickle.dump(item2year,f)

In [472]:
import pickle

with open('baseline/index/userid2lastyear.pickle','wb') as f:
    pickle.dump(user2lastyear,f)

## sub 보기

In [474]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [475]:
# sub['last_yearmonth']=sub.user.map(user2lastyearmonth)
sub['lastyear']=sub.user.map(user2lastyear)
sub['m_year'] = sub.item.map(item2year)

In [476]:
sub

Unnamed: 0,user,item,lastyear,m_year
0,11,4370,2011,2001
1,11,4886,2011,2001
2,11,47,2011,1995
3,11,32587,2011,2005
4,11,40815,2011,2005
...,...,...,...,...
627195,138493,4720,2009,2001
627196,138493,293,2009,1994
627197,138493,2174,2009,1988
627198,138493,4848,2009,2001


In [477]:
sub2 = sub[sub.lastyear >= sub.m_year]

In [478]:
sub2

Unnamed: 0,user,item,lastyear,m_year
0,11,4370,2011,2001
1,11,4886,2011,2001
2,11,47,2011,1995
3,11,32587,2011,2005
4,11,40815,2011,2005
...,...,...,...,...
627195,138493,4720,2009,2001
627196,138493,293,2009,1994
627197,138493,2174,2009,1988
627198,138493,4848,2009,2001


In [485]:
users = sub2.groupby('user').user.head(10).reset_index(drop=True)

items = sub2.groupby('user').item.head(10).reset_index(drop=True)

sub = pd.concat([users,items],axis=1)

In [276]:
df = pd.read_csv('EASE_1_0.csv')

In [282]:
df['item2']= sub2.groupby('user').item.head(10).reset_index(drop=True)

### 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기

In [324]:
## 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기
idx_list=[]
user_item2 = dict(df.groupby('user').item2.apply(list))
for i in tqdm(df.itertuples(), total=df.shape[0]):
    if i.item not in user_item2[i.user]:
        idx_list.append(i.Index)

100%|██████████████████████████████████████████████████| 313600/313600 [00:00<00:00, 562116.84it/s]


In [329]:
df['lastyear'] = df.user.map(user2lastyear)
df['last_yearmonth'] = df.user.map(user2lastyearmonth)
df['m_year'] = df.item.map(item2year)

In [330]:
df.iloc[idx_list,:]

Unnamed: 0,user,item,item2,lastyear,last_yearmonth,m_year
109,61,58559,48780,2007,2007-12,2008
1559,664,63436,8528,2007,2007-11,2008
2144,915,54286,4027,2006,2006-08,2007
3504,1539,79132,3949,2009,2009-07,2010
5019,2160,58559,111,2007,2007-01,2008
...,...,...,...,...,...,...
300758,132448,69844,2542,2007,2007-10,2009
307115,135535,58559,318,2005,2005-12,2008
307702,135798,88125,6539,2010,2010-12,2011
311225,137460,58559,745,2007,2007-09,2008


## 최종 제출 -> 1595

In [288]:
df[['user','item2']].rename(columns={'item2':'item'}).to_csv("EASE_1_0_Top20_remove_review_after_movie.csv",index=False)