## FFM RecBole 구현

### Recbole 라이브러리 로딩

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
# Several inspection cells below rely on this to show more than one result, and it
# is also why return values such as the character count of f.write() appear as output.
InteractiveShell.ast_node_interactivity="all"

from logging import getLogger
import os
import json
import numpy as np
import pandas as pd
import time, datetime
from tqdm import tqdm

from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.utils import init_logger, get_trainer, init_seed, set_color, get_model
from recbole.quick_start.quick_start import load_data_and_model

from recbole.config import Config
from recbole.data import create_dataset

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import ndcg_score, recall_score

import torch

### 데이터 파일 변환

기존 데이터 파일을 Recbole 데이터 파일로 변환시키는 과정

In [4]:
# Raw interaction log; only the `user` and `item` columns are used in this cell.
train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

# Distinct raw ids in ascending order: the position in each list is the contiguous index.
sorted_users = sorted(set(train.user))
sorted_items = sorted(set(train.item))

# Forward maps (raw id -> index) and reverse maps (index -> raw id).
user2idx = {raw_id: idx for idx, raw_id in enumerate(sorted_users)}
item2idx = {raw_id: idx for idx, raw_id in enumerate(sorted_items)}
uidx2user = dict(enumerate(sorted_users))
iidx2item = dict(enumerate(sorted_items))

In [5]:
# RecBole configuration as YAML text; a later cell writes it to ease_data20.yaml.
# This is a regular (non-raw) Python string, so the "\t" escape below is stored in the
# file as a literal tab character inside the YAML double quotes.
yamldata = """
field_separator: "\t"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    user: [user_id]
    item: [item_id, title, genre, year, writer, director]

train_neg_sample_args:
    uniform: 1
    
eval_args:
    split: {'RS': [8, 1, 1]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""

In [29]:
# Copy the item-feature file into the dataset folder under the <dataset>.item name.
# NOTE(review): dataset/train_data.item and the dataset/ease_data20 folder (with its
# .inter file) are not created anywhere in the visible notebook — they presumably come
# from an earlier preprocessing step; confirm before running on a fresh machine.
!cp dataset/train_data.item dataset/ease_data20/ease_data20.item

In [30]:
# List the dataset folder to check that both the .inter and .item files are in place.
!tree dataset/ease*

dataset/ease_data20
├── ease_data20.inter
└── ease_data20.item

0 directories, 2 files


In [23]:
outpath = "dataset/ease_data20"
yamlfile = "ease_data20.yaml"

# Make sure the dataset folder exists (no-op when it is already there).
os.makedirs(outpath, exist_ok=True)

print("Dump Start")
# Save the data/evaluation settings file that Config loads by name later on.
with open(yamlfile, "w") as config_fh:
    config_fh.write(yamldata)
print("Dump Complete")

Dump Start


447

Dump Complete


### 로거 생성

In [24]:
# Root logger (getLogger() with no name); the cells below log through it.
logger = getLogger()

### 설정 인스턴스 생성

In [27]:
# Build the run configuration for FFM on the ease_data20 dataset from the YAML file.
config = Config(
    model='FFM',
    dataset="ease_data20",
    config_file_list=['ease_data20.yaml'],
)

# Notebook-level overrides applied on top of the loaded configuration.
config['show_progress'] = False
config['epochs'] = 10
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the run using the seed / reproducibility flags held by the config.
init_seed(config['seed'], config['reproducibility'])

# Initialise logging from the config, then record the full resolved configuration.
init_logger(config)
logger.info(config)

25 Dec 15:05    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/ease_data20
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_

### 데이터 로드

In [31]:
# Build the dataset object from the configuration and log its summary statistics
# (user / item / interaction counts, sparsity, remaining fields).
dataset = create_dataset(config)
logger.info(dataset)

# Split into train / validation / test data according to the config's eval_args
# (the log shows an RS 8:1:1 split grouped by user).
train_data, valid_data, test_data = data_preparation(config, dataset)

25 Dec 15:06    INFO  ease_data20
The number of users: 31361
Average actions of users: 20.0
The number of items: 6808
Average actions of items: 230.84284136915716
The number of inters: 627200
The sparsity of the dataset: 99.70623733452946%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'title', 'genre', 'year', 'writer', 'director']
25 Dec 15:07    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
25 Dec 15:07    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [32]:
# Show the summary of each split's dataset. All three are displayed because
# ast_node_interactivity is set to "all" in the first cell.
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mease_data20[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 16.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 191.95103289977047
[1;34mThe number of inters[0m: 501760
[1;34mThe sparsity of the dataset[0m: 99.76498986762358%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'title', 'genre', 'year', 'writer', 'director', 'label']

[1;35mease_data20[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 2.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 35.295441755768145
[1;34mThe number of inters[0m: 62720
[1;34mThe sparsity of the dataset[0m: 99.97062373345294%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'title', 'genre', 'year', 'writer', 'director', 'label']

[1;35mease_data20[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 2.0
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 35.216170690623244
[1;34mThe number of inters[0m: 62720
[1;34mThe sparsity of the dataset[0m: 99.97062373345294%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'title', 'genre', 'year', 'writer', 'director', 'label']

### 모델 인스턴스 생성

In [33]:
# Re-seed with the config's seed settings right before the model is constructed,
# then build FFM on the training split's dataset and move it to the configured device.
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
# Log the module structure and the trainable-parameter count.
logger.info(model)

25 Dec 15:07    INFO  FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(38271, 10)
  )
  (token_seq_embedding_table): ModuleList(
    (0): Embedding(9062, 10)
    (1): Embedding(19, 10)
    (2): Embedding(2990, 10)
    (3): Embedding(1341, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(38271, 1)
    )
    (token_seq_embedding_table): ModuleList(
      (0): Embedding(9062, 1)
      (1): Embedding(19, 1)
      (2): Embedding(2990, 1)
      (3): Embedding(1341, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(38271, 10)
      (1): Embedding(38271, 10)
      (2): Embedding(38271, 10)
      (3): Embedding(38271, 10)
      (4): Embedding(38271, 10)
      (5): Embedding(38271, 10)
      (6): Embedding(38271, 10)
    )
    (token_seq_embeddings): ModuleList(
      (0): ModuleList(
        (0): Embedding(9062, 10)


)
Trainable parameters: 4186324


### 모델 학습

In [34]:
# Look up the trainer class for this model type / model name and instantiate it
# with the config and the model.
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# Train on train_data with validation on valid_data. With saved=True the log below
# shows a checkpoint being written under saved/ after each evaluated epoch.
# Returns the best validation score and the corresponding metric results.
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

25 Dec 15:07    INFO  epoch 0 training [time: 23.82s, train loss: 150.3774]
25 Dec 15:17    INFO  epoch 0 evaluating [time: 557.60s, valid_score: 0.089200]
25 Dec 15:17    INFO  valid result: 
recall@10 : 0.0892    mrr@10 : 0.0532    ndcg@10 : 0.0509    hit@10 : 0.1688    precision@10 : 0.0178    map@10 : 0.0282
25 Dec 15:17    INFO  Saving current: saved/FFM-Dec-25-2022_15-07-28.pth
25 Dec 15:17    INFO  epoch 1 training [time: 27.47s, train loss: 94.1129]
25 Dec 15:26    INFO  epoch 1 evaluating [time: 558.32s, valid_score: 0.093700]
25 Dec 15:26    INFO  valid result: 
recall@10 : 0.0937    mrr@10 : 0.0558    ndcg@10 : 0.0535    hit@10 : 0.1771    precision@10 : 0.0187    map@10 : 0.0296
25 Dec 15:26    INFO  Saving current: saved/FFM-Dec-25-2022_15-07-28.pth
25 Dec 15:27    INFO  epoch 2 training [time: 23.05s, train loss: 85.5196]
25 Dec 15:36    INFO  epoch 2 evaluating [time: 550.52s, valid_score: 0.094100]
25 Dec 15:36    INFO  valid result: 
recall@10 : 0.0941    mrr@10 : 0.05

### 학습 결과 출력

In [None]:
# Evaluate on the held-out test split using the best checkpoint saved during training.
# load_best_model is a boolean flag: the string "True" only worked because any
# non-empty string is truthy (the string "False" would have enabled it as well).
test_result = trainer.evaluate(test_data, load_best_model=True, show_progress=config['show_progress'])

# Log the best validation metrics next to the test metrics for comparison.
logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
logger.info(set_color('test result', 'yellow') + f': {test_result}')

# Collect everything into one summary dict and pretty-print it as JSON.
result = {
    'best_valid_score': best_valid_score,
    'valid_score_bigger': config['valid_metric_bigger'],
    'best_valid_result': best_valid_result,
    'test_result': test_result
}

print(json.dumps(result, indent=4))

In [36]:
# Load a saved model: restores the config, model, dataset and the three data splits
# from a checkpoint file, rebinding the notebook globals of the same names.
# NOTE(review): the file name is hard-coded to one specific training run (it matches
# the "Saving current" line in the training log); update it after retraining.
model_path='saved/FFM-Dec-25-2022_15-07-28.pth'
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_path)

25 Dec 18:25    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/ease_data20
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_

## Inference

In [75]:
# Lookup arrays that convert the dataset's internal user / item ids back to the
# tokens found in the data files.
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

In [None]:
# id mapping chain: 11 (raw user_id) -> 0 (token = idx written to the .inter file) -> 1 (RecBole internal id) -> 0 (field2id_token lookup) -> 11 (uidx2user lookup)

In [88]:
# Display the raw interaction log (pandas truncates the rendering of large frames).
train

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [97]:
# Item count reported by the test split's dataset.
test_data.dataset.item_num

6808

In [96]:
# Ad-hoc check: top-10 item indices (element [1] of the result) for one batch of users.
# NOTE(review): `full_sort_topk`, `data` and `device` are only defined by the inference
# cell that appears below this one, so this cell fails on a fresh Restart & Run All.
full_sort_topk(data, model, test_data, 10, device=device)[1]

  uid_series = torch.tensor(uid_series)


tensor([[184, 120,  76,  ..., 168,  89,  98],
        [567, 977,  27,  ..., 805, 161, 452],
        [184,  99,   4,  ..., 110, 154, 319],
        ...,
        [ 42,   9, 123,  ..., 243, 103,  96],
        [120,  80,  83,  ..., 247, 233, 101],
        [184,  99,   8,  ...,  74, 105, 119]], device='cuda:0')

In [81]:
from recbole.utils.case_study import full_sort_topk

# Tunables for inference: users scored per forward pass and recommendations per user.
INFER_BATCH_SIZE = 128
TOPK = 10

user_id = config['USER_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]

# Internal user ids start at 1; index 0 is skipped (presumably RecBole's padding
# entry — confirm). split() yields batches of at most INFER_BATCH_SIZE ids and allows
# a shorter final batch, whereas view(-1, 128) raises unless the user count happens
# to be an exact multiple of 128.
all_user_list = torch.arange(1, len(user_id2token)).split(INFER_BATCH_SIZE)

device = config.final_config_dict['device']

tbar = tqdm(all_user_list, desc=set_color("Inference", 'pink'), leave=True, mininterval=1)

# Collect per-batch arrays and concatenate once at the end; np.append inside the loop
# re-copies the whole accumulated array on every iteration.
pred_batches = []
user_batches = []
for data in tbar:
    # Element [1] of the result holds the top-k item indices, one row per user in the batch.
    batch_pred_list = full_sort_topk(data, model, test_data, TOPK, device=device)[1]
    pred_batches.append(batch_pred_list.detach().cpu().numpy())
    user_batches.append(data.numpy())
tbar.close()

# pred_list: one row of TOPK item ids per user; user_list: the matching internal user ids.
pred_list = np.concatenate(pred_batches, axis=0)
user_list = np.concatenate(user_batches, axis=0)


  uid_series = torch.tensor(uid_series)
Inference:   2%|█▏                                                 | 6/245 [00:01<00:44,  5.40it/s]:   5%|██▍                                               | 12/245 [00:02<00:42,  5.42it/s]:   7%|███▋                                              | 18/245 [00:03<00:41,  5.43it/s]:  10%|████▉                                             | 24/245 [00:04<00:40,  5.43it/s]:  12%|██████                                            | 30/245 [00:05<00:39,  5.43it/s]:  15%|███████▎                                          | 36/245 [00:06<00:38,  5.43it/s]:  17%|████████▌                                         | 42/245 [00:07<00:37,  5.43it/s]:  20%|█████████▊                                        | 48/245 [00:08<00:36,  5.43it/s]:  22%|███████████                                       | 54/245 [00:09<00:35,  5.43it/s]:  24%|████████████▏                                     | 60/245 [00:11<00:34,  5.43it/s]:  27%|█████████████▍                              

In [82]:
# Flatten the per-user recommendations into one list of (user token, item token)
# pairs, one entry per recommended item.
# NOTE(review): these tokens are in the id space of the .inter file. The inspection
# cells below show user token 0 there while the raw training log starts at user 11,
# so they may still need mapping back through uidx2user / iidx2item — confirm.
result = [
    (user_id2token[uid], item_id2token[iid])
    for uid, top_items in zip(user_list, pred_list)
    for iid in top_items
]

In [83]:
# Build the submission table: one row per (user, recommended item) pair.
sub = pd.DataFrame(result, columns=["user", "item"])

In [89]:
# Display the submission table (pandas truncates the rendering of large frames).
sub

Unnamed: 0,user,item
0,0,3654
1,0,5461
2,0,3200
3,0,2659
4,0,4646
...,...,...
313595,31359,2659
313596,31359,179
313597,31359,2304
313598,31359,931


In [91]:
# Recommendations for the user whose token is the string '0'.
sub[sub.user=='0']

Unnamed: 0,user,item
0,0,3654
1,0,5461
2,0,3200
3,0,2659
4,0,4646
5,0,3909
6,0,3255
7,0,3989
8,0,3994
9,0,4886


In [85]:
# Load the tab-separated interaction file that the model was trained on, for comparison.
df = pd.read_csv('dataset/ease_data20/ease_data20.inter',sep='\t')

In [92]:
# Training interactions of user token 0, to compare against the recommendations above.
df[df['user_id:token']==0]

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2381,
1,0,2619,
2,0,41,
3,0,4581,
4,0,4790,
5,0,3663,
6,0,4597,
7,0,3679,
8,0,4101,
9,0,1284,


In [58]:
# Recommendations for user 11.
# NOTE(review): this compares against the integer 11 while an earlier cell matches the
# string '0', and the recorded execution count (In [58]) is older than the cell that
# builds `sub` — the output shown may be stale; re-run to confirm.
sub[sub.user==11]

Unnamed: 0,user,item
0,11,303
1,11,186
2,11,107
3,11,170
4,11,158
5,11,111
6,11,43
7,11,272
8,11,144
9,11,157


In [97]:
# Write the submission table to CSV without the DataFrame index column.
sub.to_csv("submission_ffm.csv", index=False)
print('inference done!')

inference done!
