In [1]:
import os
from datetime import datetime
import time

import numpy as np
import pandas as pd
from recbole.quick_start import run_recbole
from tqdm import tqdm
#!pip install recbole

In [2]:
dataset_path = "/opt/ml/input/data/train"
output_path = "/opt/ml/input/data/recbole"

In [42]:
movie = pd.read_csv(os.path.join(dataset_path, 'train_ratings.csv'))
movie['rating'] = [1] * len(movie)

In [43]:
print(movie)

           user   item        time  rating
0            11   4643  1230782529       1
1            11    170  1230782534       1
2            11    531  1230782539       1
3            11    616  1230782542       1
4            11   2140  1230782563       1
...         ...    ...         ...     ...
5154466  138493  44022  1260209449       1
5154467  138493   4958  1260209482       1
5154468  138493  68319  1260209720       1
5154469  138493  40819  1260209726       1
5154470  138493  27311  1260209807       1

[5154471 rows x 4 columns]


In [44]:
movie.item.nunique()

6807

In [45]:
movie.columns = ['user', 'item', 'timestamp', 'rating']

In [46]:
movie

Unnamed: 0,user,item,timestamp,rating
0,11,4643,1230782529,1
1,11,170,1230782534,1
2,11,531,1230782539,1
3,11,616,1230782542,1
4,11,2140,1230782563,1
...,...,...,...,...
5154466,138493,44022,1260209449,1
5154467,138493,4958,1260209482,1
5154468,138493,68319,1260209720,1
5154469,138493,40819,1260209726,1


In [47]:
movie = movie[['user', 'item', 'rating', 'timestamp']]

In [48]:
movie

Unnamed: 0,user,item,rating,timestamp
0,11,4643,1,1230782529
1,11,170,1,1230782534
2,11,531,1,1230782539
3,11,616,1,1230782542
4,11,2140,1,1230782563
...,...,...,...,...
5154466,138493,44022,1,1260209449
5154467,138493,4958,1,1260209482
5154468,138493,68319,1,1260209720
5154469,138493,40819,1,1260209726


In [49]:
movie.to_csv(os.path.join(dataset_path, 'rating.csv'), header=True, sep=',', index=False, encoding='utf-8')

In [50]:
movie['user'].to_csv(os.path.join(dataset_path, 'user.csv'), header=True, sep=',', index=False, encoding='utf-8')

In [51]:
movie['item'].to_csv(os.path.join(dataset_path, 'item.csv'), header=True, sep=',', index=False, encoding='utf-8')

In [52]:
class BaseDataset(object):
    def __init__(self, input_path, output_path):
        super(BaseDataset, self).__init__()

        self.dataset_name = ''
        self.input_path = input_path
        self.output_path = output_path
        self.check_output_path()

        # input file
        self.inter_file = os.path.join(self.input_path, "rating.csv")
        self.user_file = os.path.join(self.input_path, "user.csv")
        self.item_file = os.path.join(self.input_path, "item.csv")
        self.sep = '\t'

        # output file
        self.output_inter_file, self.output_item_file, self.output_user_file = self.get_output_files()

        # selected feature fields
        self.inter_fields = {}
        self.item_fields = {}
        self.user_fields = {}

    def check_output_path(self):
        if not os.path.isdir(self.output_path):
            os.makedirs(self.output_path)

    def get_output_files(self):
        output_inter_file = os.path.join(self.output_path, self.dataset_name + '.inter')
        output_item_file = os.path.join(self.output_path, self.dataset_name + '.item')
        output_user_file = os.path.join(self.output_path, self.dataset_name + '.user')
        return output_inter_file, output_item_file, output_user_file

    def load_inter_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def load_item_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def load_user_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def convert_inter(self):
        try:
            input_inter_data = self.load_inter_data()
            self.convert(input_inter_data, self.inter_fields, self.output_inter_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to inter file\n')

    def convert_item(self):
        try:
            input_item_data = self.load_item_data()
            self.convert(input_item_data, self.item_fields, self.output_item_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to item file\n')

    def convert_user(self):
        try:
            input_user_data = self.load_user_data()
            self.convert(input_user_data, self.user_fields, self.output_user_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to user file\n')

    @staticmethod
    def convert(input_data, selected_fields, output_file):
        output_data = pd.DataFrame()
        for column in selected_fields:
            output_data[column] = input_data.iloc[:, column]
        with open(output_file, 'w') as fp:
            fp.write('\t'.join([selected_fields[column] for column in output_data.columns]) + '\n')
            for i in tqdm(range(output_data.shape[0])):
                fp.write('\t'.join([str(output_data.iloc[i, j])
                                    for j in range(output_data.shape[1])]) + '\n')

    def parse_json(self, data_path):
        with open(data_path, 'rb') as g:
            for l in g:
                yield eval(l)

    def getDF(self, data_path):
        i = 0
        df = {}
        for d in self.parse_json(data_path):
            df[i] = d
            i += 1
        data = pd.DataFrame.from_dict(df, orient='index')

        return data

In [53]:
class MovieDataset(BaseDataset):
    def __init__(self, input_path, output_path):
        super(MovieDataset, self).__init__(input_path, output_path)
        self.dataset_name = "recbole"

        self.inter_file = os.path.join(self.input_path, "rating.csv")
        self.item_file = os.path.join(self.input_path, "user.csv")
        self.user_file = os.path.join(self.input_path, "item.csv")

        self.sep = ","

        # output_path
        output_files = self.get_output_files()
        self.output_inter_file = output_files[0]
        self.output_item_file = output_files[1]
        self.output_user_file = output_files[2]

        # selected feature fields
        self.inter_fields = {
            0: "user:token",
            1: "item:token",
            2: "rating:float",
            3: "timestamp:float",
        }

        self.item_fields = {
            0: "user:token",
        }

        self.user_fields = {
            0: "item:token",
        }

    def load_inter_data(self):
        df = pd.read_csv(self.inter_file,
            dtype={"user": int, "item": int, "rating": float, "timestamp":float}
           )

        return df

    def load_item_data(self):
        return pd.read_csv(self.item_file, delimiter=self.sep, engine="python")

    def load_user_data(self):
        return pd.read_csv(self.user_file, delimiter=self.sep, engine="python")

In [54]:
movieDataset = MovieDataset(dataset_path, output_path)

In [55]:
movieDataset.convert_inter()
movieDataset.convert_user()
movieDataset.convert_item()
del movieDataset

100%|██████████| 5154471/5154471 [08:38<00:00, 9940.64it/s] 
100%|██████████| 5154471/5154471 [02:24<00:00, 35599.28it/s]
100%|██████████| 5154471/5154471 [02:24<00:00, 35731.42it/s]


In [12]:
cfg_str = """
data_path: /opt/ml/input/data/
dataset: recbole
field_separator: "\\t"
USER_ID_FIELD: user
ITEM_ID_FIELD: item
RATING_FIELD: rating
TIME_FIELD: timestamp
show_progress: false

load_col:
    inter: [user, item, rating, timestamp]
    user: [user]
    item: [item]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
neg_sampling:
    uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: user
    order: TO
    mode: uni50
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""


with open(os.path.join(output_path, "config.yaml"), "w") as f:
    f.write(cfg_str)

In [13]:
def run(model_name):
    if model_name in [
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "SASRecF",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    ]:
        parameter_dict = {
            "neg_sampling": None,
        }
        return run_recbole(
            model=model_name,
            dataset='recbole',
            config_file_list=['/opt/ml/input/data/recbole/config.yaml'],
            config_dict=parameter_dict,
        )
    else:
        return run_recbole(
            model=model_name,
            dataset='recbole',
            config_file_list=['/opt/ml/input/data/recbole/config.yaml'],
        )

In [None]:
#추가
#%%time
model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list += ["FFM", "DeepFM"] # Context-aware
model_list += ["GRU4Rec", "SHAN"] # Sequential

model_name = "NeuMF"
print(f"running {model_name}...")
start = time.time()
result = run(model_name)
t = time.time() - start
print(f"It took {t/60:.2f} mins")
print(result)

In [60]:
%%time
model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list += ["FFM", "DeepFM"] # Context-aware
model_list += ["GRU4Rec", "SHAN"] # Sequential
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running Pop...


14 Apr 04:53    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = {'uniform': 1}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIM

It took 2.44 mins
{'best_valid_score': 0.1928, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.1928), ('mrr@10', 0.4927), ('ndcg@10', 0.3164), ('hit@10', 0.8403), ('precision@10', 0.3002), ('map@10', 0.1927)]), 'test_result': OrderedDict([('recall@10', 0.173), ('mrr@10', 0.4649), ('ndcg@10', 0.2851), ('hit@10', 0.8192), ('precision@10', 0.269), ('map@10', 0.1672)])}
running ItemKNN...


14 Apr 04:55    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = {'uniform': 1}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIM

It took 10.85 mins
{'best_valid_score': 0.2321, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.2321), ('mrr@10', 0.5739), ('ndcg@10', 0.3821), ('hit@10', 0.9045), ('precision@10', 0.3599), ('map@10', 0.2442)]), 'test_result': OrderedDict([('recall@10', 0.1982), ('mrr@10', 0.5249), ('ndcg@10', 0.3272), ('hit@10', 0.8689), ('precision@10', 0.3059), ('map@10', 0.1977)])}
running BPR...


14 Apr 05:06    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = {'uniform': 1}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIM

It took 6.50 mins
{'best_valid_score': 0.1976, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.1976), ('mrr@10', 0.5463), ('ndcg@10', 0.3342), ('hit@10', 0.9076), ('precision@10', 0.3132), ('map@10', 0.1961)]), 'test_result': OrderedDict([('recall@10', 0.1743), ('mrr@10', 0.5056), ('ndcg@10', 0.2937), ('hit@10', 0.8748), ('precision@10', 0.2734), ('map@10', 0.1648)])}
running NeuMF...


14 Apr 05:12    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = {'uniform': 1}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIM

It took 7.80 mins
{'best_valid_score': 0.2286, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.2286), ('mrr@10', 0.5937), ('ndcg@10', 0.3851), ('hit@10', 0.9246), ('precision@10', 0.3609), ('map@10', 0.2409)]), 'test_result': OrderedDict([('recall@10', 0.2007), ('mrr@10', 0.5466), ('ndcg@10', 0.3366), ('hit@10', 0.8938), ('precision@10', 0.3138), ('map@10', 0.2008)])}
running RecVAE...


14 Apr 05:20    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = None
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIME_FIELD = 

It took 16.07 mins
{'best_valid_score': 0.1977, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.1977), ('mrr@10', 0.5095), ('ndcg@10', 0.3265), ('hit@10', 0.8555), ('precision@10', 0.3083), ('map@10', 0.1998)]), 'test_result': OrderedDict([('recall@10', 0.1762), ('mrr@10', 0.4808), ('ndcg@10', 0.2938), ('hit@10', 0.8335), ('precision@10', 0.2758), ('map@10', 0.1729)])}
running LightGCN...


14 Apr 05:36    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/input/data/recbole
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.01
neg_sampling = {'uniform': 1}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [4, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'uni50'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user
ITEM_ID_FIELD = item
RATING_FIELD = rating
TIM

KeyboardInterrupt: 