# FMM Recbole 구현

### Recbole 라이브러리 로딩

In [84]:
# !pip install recbole

In [85]:
# !pip install ray

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

from logging import getLogger
import os
import json
import numpy as np
import pandas as pd
import time, datetime
from tqdm import tqdm

from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.utils import init_logger, get_trainer, init_seed, set_color, get_model
from recbole.quick_start.quick_start import load_data_and_model

from recbole.config import Config
from recbole.data import create_dataset

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import ndcg_score, recall_score

import torch

### 데이터 로드

In [2]:
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5154471 entries, 0 to 5154470
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 118.0 MB


In [4]:
data_path = '/opt/ml/input/data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

In [5]:
df_merge = pd.merge(train_df, year_data, on='item', how='left')
df_merge = pd.merge(df_merge, writer_data, on='item', how='left')
df_merge = pd.merge(df_merge, title_data, on='item', how='left')
df_merge = pd.merge(df_merge, genre_data, on='item', how='left')
df_merge = pd.merge(df_merge, director_data, on='item', how='left')

In [6]:
train_df['item'].nunique()
df_merge['item'].nunique() 

6807

6807

In [7]:
df_merge = df_merge.sort_values('user')

In [8]:
df_merge.head()

Unnamed: 0,user,item,time,year,writer,title,genre,director
0,11,4643,1230782529,2001.0,nm0099541,Planet of the Apes (2001),Action,nm0000318
3077,11,8907,1230856729,2004.0,nm0769840,Shark Tale (2004),Children,nm1224299
3076,11,8907,1230856729,2004.0,nm0769840,Shark Tale (2004),Children,nm0421776
3075,11,8907,1230856729,2004.0,nm0769840,Shark Tale (2004),Children,nm0074426
3073,11,8907,1230856729,2004.0,nm0769840,Shark Tale (2004),Animation,nm0421776


In [14]:
train_data = df_merge[['user', 'item', 'time']].reset_index(drop=True)

In [15]:
user_data = df_merge[['user']].reset_index(drop=True)

In [16]:
item_data = df_merge[['item', 'year', 'writer', 'title', 'genre', 'director']].drop_duplicates(subset=['item']).reset_index(drop=True)

In [17]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48264331 entries, 0 to 48264330
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   user    int64
 1   item    int64
 2   time    int64
dtypes: int64(3)
memory usage: 1.1 GB


In [20]:
userid, itemid = list(set(train_data.user)), list(set(train_data.item))
n_user, n_item = len(userid), len(itemid)
print(f'n_user : {n_user}')
print(f'n_item : {n_item}')

n_user : 31360
n_item : 6807


In [21]:
max(itemid)

119145

### 데이터 전처리

### 데이터 파일 변환

기존 데이터 파일을 Recbole 데이터 파일로 변환시키는 과정

In [22]:
userid, itemid = sorted(userid), sorted(itemid)
n_user, n_item = len(userid), len(itemid)

userid_2_index = {v:i for i,v in enumerate(userid)}
itemid_2_index = {v:i for i,v in enumerate(itemid)}
index_2_userid = {i:v for i,v in enumerate(userid)}
index_2_itemid = {i:v for i,v in enumerate(itemid)}

In [23]:
yamldata = """
field_separator: "\t"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    user: [user_id]
    item: [item_id, year, writer, title, genre, director]

train_neg_sample_args:
    uniform: 1
    
eval_args:
    split: {'RS': [0.98, 0.01, 0.01]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""

In [24]:
train_data.user = train_data.user.map(userid_2_index)
train_data.item = train_data.item.map(itemid_2_index)

user_data.user = user_data.user.map(userid_2_index)
item_data.item = item_data.item.map(itemid_2_index)

In [25]:
train_data.columns=['user_id:token', 'item_id:token', 'timestamp:float']
user_data.columns=['user_id:token']
item_data.columns=['item_id:token', 'year:token', 'writer:token', 'title:token_seq', 'genre:token', 'director:token']


In [26]:
outpath = f"dataset/train_data"
# outfile = f"dataset/train_data/train_data.inter"
yamlfile = f"train_data.yaml"

os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape

# print("Processing Start")
# inter_table = []
# for user, item, time in zip(train_data.user, train_data.item, train_data.time):
#     uid, iid = userid_2_index[user], itemid_2_index[item]
#     # tval = int(time.mktime(datetime.datetime.strptime(time, "%Y-%m-%d %H:%M:%S").timetuple()))
#     inter_table.append( [uid, iid, time] )

# print("Processing Complete")

print("Dump Start")
# 데이터 설정 파일 저장
with open(yamlfile, "w") as f:
    f.write(yamldata) 

# 데이터 파일 저장
train_data.to_csv(os.path.join(outpath,"train_data.inter"),sep='\t',index=False)
user_data.to_csv(os.path.join(outpath,"train_data.user"),sep='\t',index=False)
item_data.to_csv(os.path.join(outpath,"train_data.item"),sep='\t',index=False)
# with open(outfile, "w") as f:
#     # write header
#     f.write("user_id:token\titem_id:token\ttimestamp:float\n")
#     for row in inter_table:
#         f.write("\t".join([str(x) for x in row])+"\n")

print("Dump Complete")

Dump Start


456

Dump Complete


### 로거 생성

In [27]:
logger = getLogger()

### 설정 인스턴스 생성

In [28]:
# configurations initialization
config = Config(model='FFM', dataset="train_data", config_file_list=[f'train_data.yaml'])
config['epochs'] = 1
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)

logger.info(config)

25 Dec 20:11    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 1
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.98, 0.01, 0.01]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:

### 데이터 로드

In [29]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

25 Dec 20:16    INFO  train_data
The number of users: 31361
Average actions of users: 1539.0411670918368
The number of items: 6808
Average actions of items: 7090.396797414426
The number of inters: 48264331
The sparsity of the dataset: 77.3943582243111%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'title', 'genre', 'director']
25 Dec 20:18    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
25 Dec 20:18    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.98, 0.01, 0.01]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [30]:
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 1509.2573660714286
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 6953.182165417952
[1;34mThe number of inters[0m: 47330311
[1;34mThe sparsity of the dataset[0m: 77.83182666309105%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'title', 'genre', 'director', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 14.891900510204081
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 75.47026502908855
[1;34mThe number of inters[0m: 467010
[1;34mThe sparsity of the dataset[0m: 99.78126578061001%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'title', 'genre', 'director', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 14.891900510204081
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 75.05785920925747
[1;34mThe number of inters[0m: 467010
[1;34mThe sparsity of the dataset[0m: 99.78126578061001%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'title', 'genre', 'director', 'label']

### 모델 인스턴스 생성

In [31]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
logger.info(model)

25 Dec 20:18    INFO  FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(42034, 10)
  )
  (token_seq_embedding_table): ModuleList(
    (0): Embedding(9166, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(42034, 1)
    )
    (token_seq_embedding_table): ModuleList(
      (0): Embedding(9166, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(42034, 10)
      (1): Embedding(42034, 10)
      (2): Embedding(42034, 10)
      (3): Embedding(42034, 10)
      (4): Embedding(42034, 10)
      (5): Embedding(42034, 10)
      (6): Embedding(42034, 10)
    )
    (token_seq_embeddings): ModuleList(
      (0): ModuleList(
        (0): Embedding(9166, 10)
        (1): Embedding(9166, 10)
        (2): Embedding(9166, 10)
        (3): Embedding(9166, 10)
        (4): Embedding(9166, 10)
        (5): Embedding(9166, 10)
        

### 모델 학습

In [32]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

25 Dec 20:34    INFO  epoch 0 training [time: 933.53s, train loss: 9319.8292]
25 Dec 20:38    INFO  epoch 0 evaluating [time: 238.06s, valid_score: 0.264200]
25 Dec 20:38    INFO  valid result: 
recall@10 : 0.2642    mrr@10 : 0.5806    ndcg@10 : 0.3612    hit@10 : 0.8577    precision@10 : 0.2857    map@10 : 0.2321
25 Dec 20:38    INFO  Saving current: saved/FFM-Dec-25-2022_20-18-51.pth


### 학습 결과 출력

In [33]:
# model evaluation
test_result = trainer.evaluate(test_data, load_best_model="True", show_progress=config['show_progress'])

logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}')
logger.info(set_color('test result', 'yellow') + f': {test_result}')

result = {
    'best_valid_score': best_valid_score,
    'valid_score_bigger': config['valid_metric_bigger'],
    'best_valid_result': best_valid_result,
    'test_result': test_result
}

print(json.dumps(result, indent=4))

25 Dec 20:38    INFO  Loading model structure and parameters from saved/FFM-Dec-25-2022_20-18-51.pth
25 Dec 20:42    INFO  best valid : OrderedDict([('recall@10', 0.2642), ('mrr@10', 0.5806), ('ndcg@10', 0.3612), ('hit@10', 0.8577), ('precision@10', 0.2857), ('map@10', 0.2321)])
25 Dec 20:42    INFO  test result: OrderedDict([('recall@10', 0.2653), ('mrr@10', 0.5785), ('ndcg@10', 0.3611), ('hit@10', 0.8579), ('precision@10', 0.2857), ('map@10', 0.2318)])


{
    "best_valid_score": 0.2642,
    "valid_score_bigger": true,
    "best_valid_result": {
        "recall@10": 0.2642,
        "mrr@10": 0.5806,
        "ndcg@10": 0.3612,
        "hit@10": 0.8577,
        "precision@10": 0.2857,
        "map@10": 0.2321
    },
    "test_result": {
        "recall@10": 0.2653,
        "mrr@10": 0.5785,
        "ndcg@10": 0.3611,
        "hit@10": 0.8579,
        "precision@10": 0.2857,
        "map@10": 0.2318
    }
}


In [34]:
# 모델 불러오기
model_path='saved/FFM-Dec-25-2022_20-18-51.pth'
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_path)

25 Dec 20:42    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 1
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.98, 0.01, 0.01]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:

## Inference

#### 전체 item에서 각 사용자가 이미 본 item과 FFM으로 추천하는 100개 item을 제외한 item 중
#### random으로 negative sample 50개 추출!

-> 그럼 사용자 별로 50개의 negative sample, 전체로는 31360 * 50 개의 negative sample이 만들어진다!

In [35]:
before = df_merge.groupby('user')['item'].unique()
before 

user
11        [4643, 8907, 36401, 41571, 56757, 761, 2004, 8...
14        [1022, 4921, 40629, 4886, 31658, 364, 4027, 60...
18        [1209, 1211, 7034, 7361, 1247, 7234, 1305, 120...
25        [919, 2706, 4306, 1291, 1193, 5952, 2683, 288,...
31        [8907, 56171, 102125, 32031, 45431, 42738, 472...
                                ...                        
138473    [1921, 31658, 65261, 3000, 1840, 1, 8507, 8533...
138475    [7132, 2936, 3730, 2010, 5177, 923, 3629, 7234...
138486    [919, 1917, 2617, 6863, 35836, 1136, 6333, 235...
138492    [2571, 3000, 6874, 2859, 1923, 69, 1649, 3677,...
138493    [364, 2087, 2078, 1029, 4306, 596, 2018, 588, ...
Name: item, Length: 31360, dtype: object

In [36]:
from recbole.utils.case_study import full_sort_topk

user_id = config['USER_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128)

device = config.final_config_dict['device']

tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'), leave=True, mininterval=1)

pred_list = None
user_list = []
for data in tbar:
    batch_pred_list = full_sort_topk(data, model, test_data, 100, device=device)[1]
    batch_pred_list = batch_pred_list.clone().detach().cpu().numpy()
    if pred_list is None:
        pred_list = batch_pred_list
        user_list = data.numpy()
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, data.numpy(), axis=0
        )
tbar.close()

  uid_series = torch.tensor(uid_series)
Inference:   5%|▌         | 13/245 [00:01<00:18, 12.37it/s]:  11%|█         | 26/245 [00:02<00:17, 12.42it/s]:  16%|█▌        | 39/245 [00:03<00:16, 12.42it/s]:  21%|██        | 52/245 [00:04<00:15, 12.40it/s]:  27%|██▋       | 65/245 [00:05<00:14, 12.37it/s]:  32%|███▏      | 78/245 [00:06<00:13, 12.35it/s]:  37%|███▋      | 91/245 [00:07<00:12, 12.33it/s]:  42%|████▏     | 104/245 [00:08<00:11, 12.30it/s]:  48%|████▊     | 117/245 [00:09<00:10, 12.27it/s]:  53%|█████▎    | 130/245 [00:10<00:09, 12.22it/s]:  58%|█████▊    | 143/245 [00:11<00:08, 12.20it/s]:  64%|██████▎   | 156/245 [00:12<00:07, 12.18it/s]:  69%|██████▉   | 169/245 [00:13<00:06, 12.15it/s]:  74%|███████▍  | 182/245 [00:14<00:05, 12.10it/s]:  80%|███████▉  | 195/245 [00:15<00:04, 12.06it/s]:  85%|████████▍ | 208/245 [00:17<00:03, 12.02it/s]:  90%|█████████ | 221/245 [00:18<00:02, 11.93it/s]:  95%|█████████▌| 233/245 [00:19<00:01, 11.92it/s]: 100%|██████████| 245/245 [00:20<00:00,

In [45]:
# user별 이미 본 item과 item 추천 결과 하나로 합쳐주기
result = []
for user, pred in zip(user_list, pred_list):
    user = int(index_2_userid[user-1])
    user_before = df_merge[df_merge['user'] == user]['item'].unique()
    list(pred).append(list(user_before))
    pred = list(set(pred))
    for item in pred:
        item = int(index_2_itemid[item-1])
        result.append((user, item))

In [None]:
print("Create Nagetive instances")
num_negative = 50
user_group_dfs = list(pd.DataFrame(result).groupby('user')['item'])
first_row = True
user_neg_dfs = pd.DataFrame() # 총 유저의 negative sample df

# 각 user의 negative sample df를 구하고 총 유저의 negative sample df인 user_neg_dfs 에 concat하는 과정
for u, u_items in tqdm(user_group_dfs): # 한 유저와 해당 유저의 item들이 for문을 통해 (모든유저가 똑같이) 반복됨.
    u_items = set(u_items) # 해당 유저의 item 중복 제거 <- 위에서 해줬기 때문에 안 해도 되긴 함
    i_user_neg_item = np.random.choice(list(set(itemid) - u_items), num_negative, replace=False) # negative sample 생성
    # np.random.choice() : Generates a random sample from a given 1-D array

    i_user_neg_df = pd.DataFrame({'user': [u]*num_negative, 'item': i_user_neg_item, 'label': [0]*num_negative}) # negative sample df 생성
    if first_row == True: # 첫번째 유저일 때만 실행
        user_neg_dfs = i_user_neg_df
        first_row = False
    else:
        user_neg_dfs = pd.concat([user_neg_dfs, i_user_neg_df], axis = 0, sort=False)

In [None]:
# 원래 df에 label 추가 후 negative sample df와 합쳐주기
origin_train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')
origin_train_df["label"] = 1
p_n_train_df = pd.concat([origin_train_df, user_neg_dfs], axis = 0, sort=False)
p_n_train_dff = p_n_train_df.sort_values(by=['user']) # 유저를 기준으로 df 정렬
p_n_train_df.reset_index(drop=True, inplace=True) # 인덱스 리셋

In [None]:
# negative sample file 제작하기
sub = p_n_train_df
sub.to_csv(
    "p_n_train_ratings.csv", index=False
)
print('done!')

inference done!
