In [1]:

import os 
import pandas as pd 
import numpy as np 
import torch 

from recbole.model.general_recommender.bpr import BPR
from recbole.model.general_recommender.ease import EASE

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color
from recbole.quick_start import load_data_and_model, run_recbole
from recbole.utils.case_study import full_sort_topk

from logging import getLogger


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.getcwd()

'/opt/ml/input/level2_movierecommendation-recsys-01/notebooks/sangwu/recbole_baseline'

### mapping 이후 tsv 파일 생성

In [17]:
train = pd.read_csv("../../../data/train/train_ratings.csv")

In [18]:
user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

In [19]:
train

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [20]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [21]:
train

Unnamed: 0,user,item,time
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539
3,0,368,1230782542
4,0,1183,1230782563
...,...,...,...
5154466,31359,4882,1260209449
5154467,31359,2652,1260209482
5154468,31359,5768,1260209720
5154469,31359,4791,1260209726


In [15]:
train.columns=['user_id:token','item_id:token','timestamp:float']
train[:2]


Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2505,1230782529
1,0,109,1230782534


In [16]:
outpath = f"dataset/recbole_train"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"recbole_train.inter"),sep='\t',index=False)

### yaml 파일 생성

In [18]:
os.getcwd()

'/opt/ml/input/level2_movierecommendation-recsys-01/notebooks/sangwu/recbole_baseline'

In [19]:
yaml_data = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""

with open("bpr.yaml","w") as f:
    f.write(yaml_data)

In [89]:
yaml_data = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""

with open("ease.yaml","w") as f:
    f.write(yaml_data)

In [97]:
logger = getLogger()

In [7]:
model_name = 'ease'

In [9]:
config = Config(model=model_name.upper(), dataset='recbole_train', config_file_list=['ease.yaml'])

In [99]:
config['epochs'] = 500
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [100]:
init_seed(config['seed'], config['reproducibility'])
init_logger(config)

logger.info(config)

08 Jun 09:18    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/recbole_train
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 500
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
s

In [94]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

08 Jun 09:15    INFO  recbole_train
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp']
08 Jun 09:15    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
08 Jun 09:15    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [101]:
init_seed(config['seed'], config['reproducibility'])
model = EASE(config, train_data.dataset).to(config['device'])
logger.info(model)

08 Jun 09:18    INFO  EASE()
Trainable parameters: 1


In [102]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

08 Jun 09:18    INFO  epoch 0 training [time: 9.46s, train loss: 0.0000]
08 Jun 09:19    INFO  epoch 0 evaluating [time: 61.32s, valid_score: 0.470900]
08 Jun 09:19    INFO  valid result: 
recall@10 : 0.1644    mrr@10 : 0.4709    ndcg@10 : 0.2437    hit@10 : 0.8396    precision@10 : 0.2025
08 Jun 09:19    INFO  Saving current: saved/EASE-Jun-08-2023_09-18-37.pth


In [103]:
test_result = trainer.evaluate(test_data,load_best_model=True,show_progress=config['show_progress'])


08 Jun 09:19    INFO  Loading model structure and parameters from saved/EASE-Jun-08-2023_09-18-37.pth


In [104]:
test_result

OrderedDict([('recall@10', 0.1886),
             ('mrr@10', 0.5723),
             ('ndcg@10', 0.3056),
             ('hit@10', 0.8628),
             ('precision@10', 0.2459)])

In [105]:
submission = pd.read_csv("../../../data/eval/sample_submission.csv")

In [106]:
submission['user'] = submission['user'].map(user2idx)

In [107]:
sub_user_idx = submission['user'].unique()

In [108]:
sub_user_idx = np.array(sub_user_idx,dtype=str)

In [109]:
sub_user_idx

array(['0', '1', '2', ..., '31357', '31358', '31359'], dtype='<U21')

In [110]:
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)

In [48]:
uid_series

array([    1,     2,     3, ..., 31358, 31359, 31360])

In [111]:
total_topk_score, total_topk_iid_list = torch.zeros_like(torch.Tensor(31360, 10)), torch.zeros_like(torch.Tensor(31360, 10))

In [112]:
from tqdm import tqdm
for idx in tqdm(range(0,len(uid_series))):
    topk_score, topk_iid_list = full_sort_topk(np.array([uid_series[idx]]),model,test_data,10,config['device'])
    total_topk_score[idx] = topk_score
    total_topk_iid_list[idx] = topk_iid_list

    

100%|██████████| 31360/31360 [00:53<00:00, 584.43it/s]


In [113]:
int_iid = total_topk_iid_list.to(torch.int64)

In [70]:
int_iid

tensor([[ 934,  152,  214,  ...,  735,  197,  761],
        [ 222,  614, 1438,  ...,  298, 1288,  668],
        [1971, 1284, 1882,  ..., 1695, 2698, 1660],
        ...,
        [ 267,   41,  962,  ...,  265,  375,  197],
        [ 625,  265,  620,  ...,  261,  593,  973],
        [  33,   74,  284,  ...,  225,  731,  126]])

In [114]:
external_item_list = dataset.id2token(dataset.iid_field, int_iid.cpu())
external_item_list = external_item_list.flatten()

In [115]:
df = pd.DataFrame({'user': np.repeat(sub_user_idx, 10), 'item': external_item_list})

In [116]:
df['user'] = df['user'].astype(int)
df['item'] = df['item'].astype(int)

In [117]:
df['user'] = df['user'].map(uidx2user)
df['item'] = df['item'].map(iidx2item)
df

Unnamed: 0,user,item
0,11,1214
1,11,1580
2,11,4886
3,11,4370
4,11,8961
...,...,...
313595,138493,1270
313596,138493,110
313597,138493,593
313598,138493,4848


In [118]:
df.to_csv("submission.csv",index=False)