# FMM Recbole 구현

### Recbole 라이브러리 로딩

In [135]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

from logging import getLogger
import os
import json
import numpy as np
import pandas as pd
import time, datetime
from tqdm import tqdm

from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.utils import init_logger, get_trainer, init_seed, set_color, get_model
from recbole.quick_start.quick_start import load_data_and_model

from recbole.config import Config
from recbole.data import create_dataset

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import ndcg_score, recall_score

import torch

### 데이터 로드

In [136]:
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [172]:
user_grp = dict(train_df.groupby('user').item.apply(list))

In [137]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 118.0 MB


In [138]:
data_path = '/opt/ml/input/data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

In [139]:
df_merge = pd.merge(train_df, year_data.drop_duplicates(subset=['item']), on='item', how='inner')
df_merge = pd.merge(df_merge, writer_data.drop_duplicates(subset=['item']), on='item', how='inner')
df_merge = pd.merge(df_merge, title_data.drop_duplicates(subset=['item']), on='item', how='inner')
df_merge = pd.merge(df_merge, genre_data.drop_duplicates(subset=['item']), on='item', how='inner')
df_merge = pd.merge(df_merge, director_data.drop_duplicates(subset=['item']), on='item', how='inner')

In [140]:
train = df_merge[['user', 'item', 'time']]

In [141]:
user_data = df_merge[['user']]

In [142]:
item_data = df_merge[['item', 'year', 'writer', 'title', 'genre', 'director']].drop_duplicates(subset=['item'])

In [143]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658299 entries, 0 to 4658298
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 142.2 MB


In [144]:
userid, itemid = list(set(train.user)), list(set(train.item))
n_user, n_item = len(userid), len(itemid)
print(f'n_user : {n_user}')
print(f'n_item : {n_item}')

n_user : 31360
n_item : 4967


### 데이터 파일 변환

기존 데이터 파일을 Recbole 데이터 파일로 변환시키는 과정

In [145]:
userid, itemid = sorted(userid), sorted(itemid)
n_user, n_item = len(userid), len(itemid)

userid_2_index = {v:i for i,v in enumerate(userid)}
itemid_2_index = {v:i for i,v in enumerate(itemid)}
index_2_userid = {i:v for i,v in enumerate(userid)}
index_2_itemid = {i:v for i,v in enumerate(itemid)}

In [146]:
train.user = train.user.map(userid_2_index)
train.item = train.item.map(itemid_2_index)

user_data.user = user_data.user.map(userid_2_index)
item_data.item = item_data.item.map(itemid_2_index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.user = train.user.map(userid_2_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.item = train.item.map(itemid_2_index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data.user = user_data.user.map(userid_2_index)


In [147]:
yamldata = """
field_separator: "\t"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]

train_neg_sample_args:
    uniform: 1
    
eval_args:
    split: {'RS': [8, 2, 0]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""

In [148]:
train.columns=['user_id:token', 'item_id:token', 'timestamp:float']
user_data.columns=['user_id:token']
item_data.columns=['item_id:token', 'year:float', 'writer:token', 'title:token', 'genre:token', 'director:token']


In [149]:
outpath = f"dataset/context_data"
# outfile = f"dataset/context_data/context_data.inter"
yamlfile = f"context_data.yaml"

os.makedirs(outpath, exist_ok=True)
SEED=13
sub_train=train.groupby("user_id:token").sample(n=2, random_state=SEED)
sub_train.shape
# 데이터 설정 파일 저장
with open(yamlfile, "w") as f:
    f.write(yamldata) 

(62720, 3)

369

In [58]:
# 데이터 파일 저장
sub_train.to_csv(os.path.join(outpath,"context_data.inter"),sep='\t',index=False)
# user_data.to_csv(os.path.join(outpath,"context_data.user"),sep='\t',index=False)
# item_data.to_csv(os.path.join(outpath,"context_data.item"),sep='\t',index=False)
print("Dump Complete")

(62720, 3)

447

Dump Complete


In [152]:
# !rm dataset/context_data/ context_data.

context_data.inter  context_data.item  context_data.user


### 로거 생성

In [153]:
logger = getLogger()

### 설정 인스턴스 생성

In [154]:
# configurations initialization
config = Config(model='FFM', dataset="context_data", config_file_list=[f'context_data.yaml'])
config['epochs'] = 1
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

23 Dec 02:16    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/context_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 1
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 2, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_

### 데이터 로드

In [155]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

23 Dec 02:16    INFO  context_data
The number of users: 31361
Average actions of users: 2.0
The number of items: 4036
Average actions of items: 15.543990086741015
The number of inters: 62720
The sparsity of the dataset: 99.95044756624075%
Remain Fields: ['user_id', 'item_id', 'timestamp']
23 Dec 02:16    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
23 Dec 02:16    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 2, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [156]:
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mcontext_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 4036
[1;34mAverage actions of items[0m: 9.145523476232137
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.97522378312037%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

[1;35mcontext_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 4036
[1;34mAverage actions of items[0m: 9.311163895486937
[1;34mThe number of inters[0m: 31360
[1;34mThe sparsity of the dataset[0m: 99.97522378312037%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1;35mcontext_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 4036
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

### 모델 인스턴스 생성

In [157]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
logger.info(model)

23 Dec 02:16    INFO  FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(35397, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(35397, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(35397, 10)
      (1): Embedding(35397, 10)
    )
  )
  (loss): BCEWithLogitsLoss()
)
Trainable parameters: 1097308


### 모델 학습

In [158]:
config['epochs'] = 10
config['eval_step'] = 10

In [159]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

23 Dec 02:17    INFO  epoch 0 training [time: 0.16s, train loss: 21.4237]
23 Dec 02:17    INFO  epoch 1 training [time: 0.16s, train loss: 21.2819]
23 Dec 02:17    INFO  epoch 2 training [time: 0.15s, train loss: 21.1360]
23 Dec 02:17    INFO  epoch 3 training [time: 0.16s, train loss: 20.9768]
23 Dec 02:17    INFO  epoch 4 training [time: 0.15s, train loss: 20.7927]
23 Dec 02:17    INFO  epoch 5 training [time: 0.17s, train loss: 20.5800]
23 Dec 02:17    INFO  epoch 6 training [time: 0.17s, train loss: 20.3369]
23 Dec 02:17    INFO  epoch 7 training [time: 0.18s, train loss: 20.0529]
23 Dec 02:17    INFO  epoch 8 training [time: 0.17s, train loss: 19.7512]
23 Dec 02:17    INFO  epoch 9 training [time: 0.17s, train loss: 19.4186]
23 Dec 02:18    INFO  epoch 9 evaluating [time: 56.99s, valid_score: 0.032800]
23 Dec 02:18    INFO  valid result: 
recall@10 : 0.0328    mrr@10 : 0.0106    ndcg@10 : 0.0157    hit@10 : 0.0328    precision@10 : 0.0033    map@10 : 0.0106
23 Dec 02:18    INFO  S

### 학습 결과 출력

In [160]:
model_path='saved/FFM-Dec-23-2022_02-17-01.pth'
# rank K 설정
K = 10

In [161]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'context_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128)

# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

23 Dec 02:18    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
23 Dec 02:18    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 2, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


<All keys matched successfully>

FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(35397, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(35397, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(35397, 10)
      (1): Embedding(35397, 10)
    )
  )
  (loss): BCEWithLogitsLoss()
)

In [162]:
all_user_list = torch.arange(1, len(user_id2token))[:2]
all_item_list = torch.arange(1, len(item_id2token))[:2]

In [163]:
interaction = dict()
interaction = Interaction(interaction)
interaction[user_id] = all_user_list
interaction[item_id] = all_item_list
interaction = interaction.to(device)

In [164]:
interaction.interaction

{'user_id': tensor([1, 2], device='cuda:0'),
 'item_id': tensor([1, 2], device='cuda:0')}

In [166]:
model.predict(interaction)

tensor([0.5018, 0.4682], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [None]:
# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))

for data in tbar:
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction)
    score = score.view(-1, item_len)

    rating_pred = score.cpu().data.numpy().copy()

    user_index = data.numpy()

    idx = matrix[user_index].toarray() > 0

    rating_pred[idx] = -np.inf
    rating_pred[:, 0] = -np.inf
    ind = np.argpartition(rating_pred, -K)[:, -K:] #

    arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]

    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    batch_pred_list = ind[
        np.arange(len(rating_pred))[:, None], arr_ind_argsort
    ]

    if pred_list is None:
        pred_list = batch_pred_list
        user_list = user_index
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
sub.to_csv(
    "submission.csv", index=False
)
print('inference done!')

### Inference

In [31]:
model_path='saved/FFM-Dec-22-2022_08-26-29.pth'