# FFM Recbole 구현

### Recbole 라이브러리 로딩

In [84]:
# !pip install recbole

In [85]:
# !pip install ray

In [58]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

from logging import getLogger
import os
import json
import numpy as np
import pandas as pd
import time, datetime
from tqdm import tqdm

from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.utils import init_logger, get_trainer, init_seed, set_color, get_model
from recbole.quick_start.quick_start import load_data_and_model

from recbole.config import Config
from recbole.data import create_dataset

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import ndcg_score, recall_score

import torch

### 데이터 로드

In [59]:
# Load the raw user-item interaction log from train_ratings.csv.
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [60]:
# Show column dtypes, row count and memory footprint of the interaction table.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 118.0 MB


In [4]:
# Read the five item-metadata tables (tab-separated) that live next to the ratings.
data_path = '/opt/ml/input/data/train'
year_data, writer_data, title_data, genre_data, director_data = (
    pd.read_csv(os.path.join(data_path, tsv_name), sep='\t')
    for tsv_name in ('years.tsv', 'writers.tsv', 'titles.tsv', 'genres.tsv', 'directors.tsv')
)

In [6]:
# Attach one row of metadata per item from each side table, in the order
# year -> writer -> title -> genre -> director. Inner joins drop interactions
# whose item is missing from any of the tables.
df_merge = train_df
for side_table in (year_data, writer_data, title_data, genre_data, director_data):
    df_merge = pd.merge(
        df_merge,
        side_table.drop_duplicates(subset=['item']),
        on='item',
        how='inner',
    )

In [8]:
# df_merge = pd.read_csv('./uitgyt.csv')
# director_data = pd.read_csv('./director_fe.csv')
# writer_data = pd.read_csv('./writer_fe.csv')
# df_merge = pd.merge(df_merge, director_data, on='item', how='left')
# df_merge = pd.merge(df_merge, writer_data, on='item', how='left')

In [89]:
# Reload the raw interactions and read the pre-computed user/item feature tables.
# NOTE(review): "FE" in the path presumably means feature engineering — confirm.
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')
user_data = pd.read_csv('/opt/ml/input/fighting/FE/user_director3_fe.csv')
item_data = pd.read_csv('/opt/ml/input/fighting/FE/item_direct_str.csv')

In [62]:
# Order the interactions by raw user id.
train_df = train_df.sort_values('user')

In [8]:
# df_merge.head()

In [63]:
# Work on an explicit copy: the id columns of train_data are overwritten later,
# and assigning into a bare slice of train_df raises SettingWithCopyWarning.
# train_df itself must keep the raw ids for the inference step.
train_data = train_df[['user', 'item', 'time']].copy()

In [64]:
# Preview the first three rows of the user-side features.
user_data.head(3)

Unnamed: 0,user,director1,director2,director3
0,11,nm0000229,nm0000318,nm0000709
1,14,nm0000229,nm0000709,nm0414144
2,18,nm0000264,nm0001466,nm0600546


In [65]:
# user_data = train_df[['user']]

In [66]:
# Display the item-side features.
item_data

Unnamed: 0,item,director_str
0,1237,nm0000005
1,5147,nm0000005
2,7327,nm0000005
3,2068,nm0000005
4,7396,nm0000005
...,...,...
6802,73681,nm0000000
6803,32728,nm0000000
6804,32743,nm0000000
6805,106491,nm0000000


In [13]:
# item_data = train_df[['item']]#.drop_duplicates(subset=['item'])

In [67]:
# Show column dtypes, row count and memory footprint of the interaction frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 157.3 MB


In [68]:
# Collect the distinct raw user and item ids and report how many there are.
userid = list(set(train_data.user))
itemid = list(set(train_data.item))
n_user = len(userid)
n_item = len(itemid)
print(f'n_user : {n_user}')
print(f'n_item : {n_item}')

n_user : 31360
n_item : 6807


### 데이터 전처리

### 데이터 파일 변환

기존 데이터 파일을 Recbole 데이터 파일로 변환시키는 과정

In [69]:
# Sort the raw ids so that the dense index assigned to each id is deterministic.
userid = sorted(userid)
itemid = sorted(itemid)
n_user = len(userid)
n_item = len(itemid)

# Forward (raw id -> index) and reverse (index -> raw id) lookup tables.
userid_2_index = {raw_id: idx for idx, raw_id in enumerate(userid)}
itemid_2_index = {raw_id: idx for idx, raw_id in enumerate(itemid)}
index_2_userid = dict(enumerate(userid))
index_2_itemid = dict(enumerate(itemid))

In [70]:
# RecBole settings; this text is written to train_data.yaml further down and
# handed to Config through config_file_list.
# NOTE(review): the string is not raw, so the \t escape below reaches the file
# as a literal tab character between the YAML quotes.
yamldata = """
field_separator: "\t"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    user: [user_id, director1, director2, director3]
    item: [item_id, director_str]

train_neg_sample_args:
    uniform: 1
    
eval_args:
    split: {'RS': [8, 1, 1]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""

In [71]:
# Re-encode raw ids as the dense indices built above, in the interaction frame ...
train_data['user'] = train_data['user'].map(userid_2_index)
train_data['item'] = train_data['item'].map(itemid_2_index)

# ... and in the user/item feature frames, so all three files share one id space.
user_data['user'] = user_data['user'].map(userid_2_index)
item_data['item'] = item_data['item'].map(itemid_2_index)

In [72]:
# Rename the columns to "<field>:<type>" headers before the frames are dumped
# as .inter/.user/.item files.
# NOTE(review): presumably RecBole's atomic-file header convention; director_str
# is declared token_seq, so its values are expected to be separator-delimited
# sequences — confirm against the RecBole data format docs.
train_data.columns=['user_id:token', 'item_id:token', 'timestamp:float']
user_data.columns=['user_id:token', 'director1:token', 'director2:token', 'director3:token']
item_data.columns=['item_id:token', 'director_str:token_seq']


In [73]:
# Output locations: dataset directory for the data files, YAML config beside the notebook.
outpath = "dataset/train_data"
yamlfile = "train_data.yaml"

os.makedirs(outpath, exist_ok=True)

print("Dump Start")
# Save the dataset configuration file.
with open(yamlfile, "w") as f:
    f.write(yamldata)

# Save the interaction, user and item frames as tab-separated train_data.* files.
for frame, suffix in ((train_data, "inter"), (user_data, "user"), (item_data, "item")):
    frame.to_csv(os.path.join(outpath, f"train_data.{suffix}"), sep='\t', index=False)

print("Dump Complete")

Dump Start


456

Dump Complete


### 로거 생성

In [74]:
# getLogger() without a name returns the root logger.
logger = getLogger()

### 설정 인스턴스 생성

In [75]:
# Build the run configuration from the YAML file, then apply notebook-level overrides.
config = Config(
    model='FFM',
    dataset="train_data",
    config_file_list=['train_data.yaml'],
)
config['epochs'] = 6
config['show_progress'] = False
config['device'] = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed the RNGs, then set up logging and record the effective configuration.
init_seed(config['seed'], config['reproducibility'])
init_logger(config)

logger.info(config)

27 Dec 13:36    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 6
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_se

### 데이터 로드

In [76]:
# Build the dataset object from the dumped files according to `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

# Split the dataset into train/valid/test parts.
# NOTE: this rebinds `train_data`, which until now held the pandas interaction frame.
train_data, valid_data, test_data = data_preparation(config, dataset)

27 Dec 13:37    INFO  train_data
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'director1', 'director2', 'director3', 'director_str']
27 Dec 13:37    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
27 Dec 13:37    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [77]:
# Display the dataset summary behind each of the three splits.
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 132.40079719387754
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 609.9734097252827
[1;34mThe number of inters[0m: 4152089
[1;34mThe sparsity of the dataset[0m: 98.05527944529516%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'director1', 'director2', 'director3', 'director_str', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.68288738606292
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'director1', 'director2', 'director3', 'director_str', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.69372151154242
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'director1', 'director2', 'director3', 'director_str', 'label']

### 모델 인스턴스 생성

In [78]:
# Re-seed before creating the model, instantiate FFM on the training split's
# dataset, move it to the configured device and log its structure.
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
logger.info(model)

27 Dec 13:37    INFO  FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(39745, 10)
  )
  (token_seq_embedding_table): ModuleList(
    (0): Embedding(1342, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(39745, 1)
    )
    (token_seq_embedding_table): ModuleList(
      (0): Embedding(1342, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(39745, 10)
      (1): Embedding(39745, 10)
      (2): Embedding(39745, 10)
      (3): Embedding(39745, 10)
      (4): Embedding(39745, 10)
      (5): Embedding(39745, 10)
    )
    (token_seq_embeddings): ModuleList(
      (0): ModuleList(
        (0): Embedding(1342, 10)
        (1): Embedding(1342, 10)
        (2): Embedding(1342, 10)
        (3): Embedding(1342, 10)
        (4): Embedding(1342, 10)
        (5): Embedding(1342, 10)
      )
      (1): ModuleList(
        (

### 모델 학습

In [79]:
# Look up the trainer class for this model type/name and bind it to the model.
trainer_cls = get_trainer(config['MODEL_TYPE'], config['model'])
trainer = trainer_cls(config, model)

# Train with per-epoch validation; saved=True keeps the best checkpoint on disk.
best_valid_score, best_valid_result = trainer.fit(
    train_data,
    valid_data,
    saved=True,
    show_progress=config['show_progress'],
)

27 Dec 13:38    INFO  epoch 0 training [time: 69.30s, train loss: 1696.1397]
27 Dec 13:43    INFO  epoch 0 evaluating [time: 274.13s, valid_score: 0.097400]
27 Dec 13:43    INFO  valid result: 
recall@10 : 0.0974    mrr@10 : 0.3248    ndcg@10 : 0.1512    hit@10 : 0.6573    precision@10 : 0.1269    map@10 : 0.07
27 Dec 13:43    INFO  Saving current: saved/FFM-Dec-27-2022_13-37-27.pth
27 Dec 13:44    INFO  epoch 1 training [time: 75.69s, train loss: 1456.6774]
27 Dec 13:49    INFO  epoch 1 evaluating [time: 275.00s, valid_score: 0.108900]
27 Dec 13:49    INFO  valid result: 
recall@10 : 0.1089    mrr@10 : 0.3546    ndcg@10 : 0.1681    hit@10 : 0.6939    precision@10 : 0.1408    map@10 : 0.0797
27 Dec 13:49    INFO  Saving current: saved/FFM-Dec-27-2022_13-37-27.pth
27 Dec 13:50    INFO  epoch 2 training [time: 73.82s, train loss: 1365.6169]
27 Dec 13:54    INFO  epoch 2 evaluating [time: 283.51s, valid_score: 0.114000]
27 Dec 13:54    INFO  valid result: 
recall@10 : 0.114    mrr@10 : 0.

### 학습 결과 출력

In [80]:
# Evaluate on the held-out test split using the best checkpoint saved by fit().
# load_best_model is a boolean flag: the former string "True" only worked
# because any non-empty string (even "False") is truthy.
test_result = trainer.evaluate(test_data, load_best_model=True, show_progress=config['show_progress'])

logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
logger.info(set_color('test result', 'yellow') + f': {test_result}')

# Collect the headline numbers in one JSON-serialisable summary.
result = {
    'best_valid_score': best_valid_score,
    'valid_score_bigger': config['valid_metric_bigger'],
    'best_valid_result': best_valid_result,
    'test_result': test_result
}

print(json.dumps(result, indent=4))

27 Dec 14:12    INFO  Loading model structure and parameters from saved/FFM-Dec-27-2022_13-37-27.pth
27 Dec 14:16    INFO  best valid : OrderedDict([('recall@10', 0.1202), ('mrr@10', 0.3767), ('ndcg@10', 0.183), ('hit@10', 0.7321), ('precision@10', 0.1538), ('map@10', 0.0878)])
27 Dec 14:16    INFO  test result: OrderedDict([('recall@10', 0.1361), ('mrr@10', 0.441), ('ndcg@10', 0.2223), ('hit@10', 0.7583), ('precision@10', 0.1827), ('map@10', 0.1184)])


{
    "best_valid_score": 0.1202,
    "valid_score_bigger": true,
    "best_valid_result": {
        "recall@10": 0.1202,
        "mrr@10": 0.3767,
        "ndcg@10": 0.183,
        "hit@10": 0.7321,
        "precision@10": 0.1538,
        "map@10": 0.0878
    },
    "test_result": {
        "recall@10": 0.1361,
        "mrr@10": 0.441,
        "ndcg@10": 0.2223,
        "hit@10": 0.7583,
        "precision@10": 0.1827,
        "map@10": 0.1184
    }
}


In [81]:
# Load the saved model from its checkpoint.
# NOTE: this rebinds config, model, dataset and the three data splits to the
# objects restored from the checkpoint file.
model_path='saved/FFM-Dec-27-2022_13-37-27.pth'
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_path)

27 Dec 14:30    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 6
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_se

## Inference

In [134]:
from recbole.utils.case_study import full_sort_topk

# Arrays that translate RecBole's internal ids back to the tokens stored in the data files.
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# Internal id 0 is the padding entry, so real users start at 1.
# split() yields batches of 128 and tolerates a final short batch, whereas
# view(-1, 128) raises unless the user count is an exact multiple of 128.
all_user_list = torch.arange(1, len(user_id2token)).split(128)

device = config.final_config_dict['device']

# Make sure the model is in inference mode before scoring.
model.eval()

tbar = tqdm(all_user_list, desc=set_color("Inference", 'pink'), leave=True, mininterval=1)

# Collect per-batch arrays and concatenate once at the end; np.append inside
# the loop re-copies the whole accumulated array on every batch.
pred_batches = []
user_batches = []
for data in tbar:
    # full_sort_topk returns (scores, item indices); keep the top-30 item indices.
    batch_pred_list = full_sort_topk(data, model, test_data, 30, device=device)[1]
    pred_batches.append(batch_pred_list.detach().cpu().numpy())
    user_batches.append(data.numpy())
tbar.close()

pred_list = np.concatenate(pred_batches, axis=0)
user_list = np.concatenate(user_batches, axis=0)

Inference:   0%|          | 0/245 [00:00<?, ?it/s]:   8%|▊         | 20/245 [00:01<00:11, 19.33it/s]:  16%|█▋        | 40/245 [00:02<00:10, 19.36it/s]:  24%|██▍       | 60/245 [00:03<00:09, 19.38it/s]:  33%|███▎      | 80/245 [00:04<00:08, 19.37it/s]:  41%|████      | 100/245 [00:05<00:07, 19.36it/s]:  49%|████▉     | 120/245 [00:06<00:06, 19.34it/s]:  57%|█████▋    | 140/245 [00:07<00:05, 19.32it/s]:  65%|██████▌   | 160/245 [00:08<00:04, 19.29it/s]:  73%|███████▎  | 180/245 [00:09<00:03, 19.27it/s]:  82%|████████▏ | 200/245 [00:10<00:02, 19.23it/s]:  90%|████████▉ | 220/245 [00:11<00:01, 19.20it/s]:  98%|█████████▊| 240/245 [00:12<00:00, 19.17it/s]: 100%|██████████| 245/245 [00:12<00:00, 19.26it/s]


In [86]:
# pred_list

In [90]:
# Spot check: distinct items that raw user id 11 has interacted with.
train_df[train_df['user'] == 11]['item'].unique()

array([ 4643,   170,   531,   616,  2140,  2722,  2313,  2688,  2428,
        3113,  1591,  2600,  8169,  2572, 58293,  7541,  1367,    32,
        4792,  7444, 53953, 56949,  6502, 53000, 51662,  5151, 35836,
        7293, 33585,  8810, 56801,  5377,   344,    19,   410,  2124,
         828,  1274,  8977,  1032,  1214,  1200,  1320,  3897,  7173,
        1225,  2858, 59418, 45361,  2706,  1321,  2793, 33085,  4235,
        3892,  4340, 27660, 43556, 47124,  2294, 48304,   150, 31184,
       34338,  1917, 50162,  2827, 27368,  4366,  2153, 30812,  3525,
        1270,  2011,  2012,  8973,  1255,  2018,   541,  4878,  7361,
       31658,  2571,  7099,   260,  1196, 60069,   160,  1882, 60037,
         880, 36509,   405,  3826,  4133,   673,  6541,   611,   172,
        4638,  5171,   208,  4887,  5459, 60760,  8361, 60514,  1544,
        1876,   442, 32213,  5219,  1690,  2717, 27608, 52722,   780,
        6934, 52287,  3745, 45499, 37830, 60040, 34319,  8644,  6365,
       34048,   316,

In [121]:
# One row per user, one column per candidate item index.
pred_list.shape

(31360, 30)

In [131]:
# Merge the per-user recommendations into a single list of (user, item) pairs.

# Build each user's interaction history once as a set: filtering train_df inside
# the loop costs a full table scan per user, and set membership is O(1).
seen_items = train_df.groupby('user')['item'].apply(set).to_dict()

result = []
for user, pred in zip(user_list, pred_list):
    # Decode the internal id through RecBole's token table, exactly as is done
    # for items below, instead of assuming internal id == file order + 1.
    user = int(index_2_userid[int(user_id2token[user])])
    user_before = seen_items.get(user, set())
    item_cnt = 0  # stop once 10 items have been kept for this user
    for item in pred:
        if item_cnt >= 10:
            break
        item = index_2_itemid[int(item_id2token[item])]
        # Skip items the user has already interacted with.
        if item not in user_before:
            result.append((user, item))
            item_cnt += 1

In [132]:
# Build the submission file: one (user, item) row per recommendation.
sub = pd.DataFrame(data=result, columns=["user", "item"])
sub.to_csv("submission_ffm.csv", index=False)
print('inference done!')

inference done!


In [129]:
# Count submission rows whose item is missing.
sub['item'].isna().sum()

0

In [None]:
# Requires config, model, dataset and test_data to be loaded, and
# full_sort_topk to be imported, before this cell runs.

# Device to run inference on.
device = config.final_config_dict['device']

# Arrays that translate internal user/item ids back to tokens.
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# Batches of internal user ids, skipping id 0 (PAD). split() keeps working when
# the user count is not a multiple of 128, unlike view(-1, 128).
all_user_list = torch.arange(1, len(user_id2token)).split(128) # 245 batches of 128

# Number of users / items.
user_len = len(user_id2token) # 31361 (including PAD)
item_len = len(item_id2token) # 6808 (including PAD)


# Accumulators for the user ids and their predicted item ids.
pred_list = None
user_list = None

# Switch the model to evaluation mode.
model.eval()

# Progress bar over the user batches.
tbar = tqdm(all_user_list, desc=set_color("Inference", 'pink')) # 245 batches

for data in tbar: # data: one batch of user ids
    # Keep the top-30 item indices (second element of the returned pair).
    batch_pred_list = full_sort_topk(data, model, test_data, 30, device=device)[1]
    batch_pred_list = batch_pred_list.clone().detach().cpu().numpy()
    if pred_list is None: # first batch: initialise the accumulators
        pred_list = batch_pred_list
        user_list = data.numpy()
    else: # later batches: append to the accumulators
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        # The undefined name `user_index` used here raised NameError on the
        # second batch; the ids of the current batch are in `data`.
        user_list = np.append(
            user_list, data.numpy(), axis=0
        )