In [1]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn 

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from typing import Tuple

from tqdm import tqdm
from collections import OrderedDict
from box import Box

# 1. Dataset Preprocessing

In [6]:
def encode(df: pd.DataFrame) -> Tuple[pd.DataFrame, LabelEncoder, LabelEncoder]:
    userId_label_encoder = LabelEncoder()
    movieId_label_encoder = LabelEncoder()

    df['userId'] = userId_label_encoder.fit_transform(df['userId'].values)
    df['movieId'] = movieId_label_encoder.fit_transform(df['movieId'].values)

    # encoder.inverse_transform() 으로 decode
    return df, userId_label_encoder, movieId_label_encoder

In [8]:
def trainTestSplit(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits our original data into one test and one
    training set. 
    The test set is made up of one item for each user. This is
    our holdout item used to compute Top@K later.
    The training set is the same as our original data but
    without any of the holdout items.
    Args:
        df (dataframe): Our original data
    Returns:
        df_train (dataframe): All of our data except holdout items
        df_test (dataframe): Only our holdout items.
    """

    # Group by userId and select only the first item for
    # each user (our holdout).
    df_test = df.groupby(['userId']).first()
    df_test['userId'] = df_test.index
    df_test = df_test[['userId', 'movieId', 'rating', 'timestamp']]
    df_test.index.name = None

    # Remove the same items as we for our test set in our training set.
    mask = df.groupby(['userId'])['userId'].transform(maskFirst).astype(bool)
    df_train = df.loc[mask]

    return df_train, df_test


def maskFirst(x):
    """
    Return a list of 0 for the first item and 1 for all others
    """
    result = np.ones_like(x)
    result[0] = 0

    return result    

In [10]:
def getNegatives(df_train: pd.DataFrame, df_test: pd.DataFrame, set_all_movies: set, num_negatives: int) -> pd.DataFrame:
    list_negative = []

    test_user = df_test['userId'].values.tolist()
    test_movie = df_test['movieId'].values.tolist()

    for user, movie in zip(test_user, test_movie):
        list_train_user_movies = df_train[df_train['userId']==user]['movieId'].tolist()
        set_pos_user_movies = set(list_train_user_movies + [movie])
        list_user_neg_movies = list(set_all_movies - set_pos_user_movies)
        
        negatives = [(user, movie)] + np.random.choice(list_user_neg_movies, num_negatives, replace=False).tolist()
        list_negative.append(negatives)

    df_neg = pd.DataFrame(list_negative)

    return df_neg

In [11]:
class NeuMFDataset(Dataset):
    def __init__(self, df: pd.DataFrame, num_movies: int, num_negatives: int):
        self.df = df
        self.num_movies = num_movies
        self.num_negatives = num_negatives
        self.df_with_neg = self._getNegativeInstances()
    
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        user = self.df_with_neg.iloc[idx]['userId']
        item = self.df_with_neg.iloc[idx]['movieId']
        label = self.df_with_neg.iloc[idx]['label']

        return user, item, label
    
    def _getNegativeInstances(self) -> pd.DataFrame:

        df_users = self.df['userId'].values.tolist()

        all_movies = set([i for i in range(self.num_movies)])
        list_users, list_movies, list_labels = [], [], []

        for user in tqdm(df_users):
            user_pos_movies = set(self.df[self.df['userId']==user]['movieId'].tolist())
            candi_user_neg_movies = list(all_movies - user_pos_movies)

            user_neg_movies = np.random.choice(candi_user_neg_movies, self.num_negatives, replace=False).tolist()

            self._appendListUserItemLabel(list_users, list_movies, list_labels, user, user_pos_movies, 1)
            self._appendListUserItemLabel(list_users, list_movies, list_labels, user, user_neg_movies, 0)

        print('make list done!')

        df_with_neg = pd.DataFrame([x for x in zip(list_users, list_movies, list_labels)], columns=['userId', 'movieId', 'label'])

        print('make pd.DataFrame doen!')

        return df_with_neg

    
    def _appendListUserItemLabel(self,
                                users: list, 
                                movies: list, 
                                labels: list, 
                                candi_user: int, 
                                candi_movies: list, 
                                candi_label: int,
                                ) -> None:
        for movie in candi_movies:
            users.append(candi_user)
            movies.append(movie)
            labels.append(candi_label)

# 2. Model

In [34]:
class GMF(nn.Module):
    def __init__(self, num_users: int, num_items: int, latent_dim: int):
        super(GMF, self).__init__()

        self.embedding_user = nn.Embedding(num_users, latent_dim)
        self.embedding_item = nn.Embedding(num_items, latent_dim)

        self.prediction = nn.Sequential(
            nn.Linear(latent_dim, 1, bias=False),
            nn.Sigmoid()
        )

        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            nn.init.normal_(module.weight.data, 0, 0.01)
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    
    def forward(self, user_input, item_input):
        user_latent = self.embedding_user(user_input)
        item_latent = self.embedding_item(item_input)

        product = user_latent * item_latent

        output = self.prediction(product)

        return output.squeeze()

In [50]:
class MLP(nn.Module):
    def __init__(self, num_uesrs, num_items, latent_dim, dropout, layers=[20, 10]):
        super(MLP, self).__init__()

        self.embedding_user = nn.Embedding(num_uesrs, latent_dim)
        self.embedding_item = nn.Embedding(num_items, latent_dim)

        layers.insert(0, latent_dim * 2)
        modules = []
        for i in range(len(layers) - 1):
            modules.append(nn.Dropout(p=dropout))
            modules.append(nn.Linear(layers[i], layers[i+1]))
            modules.append(nn.ReLU())
        
        self.dense_layers = nn.Sequential(*modules)

        self.prediction = nn.Sequential(
            nn.Linear(layers[-1], 1, bias=False),
            nn.Sigmoid()
        )

        self.apply(self._init_weights)
    
    # initialize weights
    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            nn.init.normal_(module.weight.data, 0, 0.01)
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    def forward(self, user_input, item_input):
        user_latent = self.embedding_user(user_input)
        item_latent = self.embedding_item(item_input)

        vector = torch.cat((user_latent, item_latent), dim=-1)

        output = self.prediction(self.dense_layers(vector))

        return output.squeeze()

In [51]:
class NeuMF(nn.Module):
    def __init__(self, GMF, MLP, latent_dim, layers):
        super(NeuMF, self).__init__()

        self.GMF_embedding_user = GMF.embedding_user
        self.GMF_embedding_item = GMF.embedding.item

        self.MLP_embedding_user = MLP.embedding_user
        self.MLP_embedding_item = MLP.embedding.item

        self.MLP_dense_layers = MLP.dense_layers

        self.prediction = nn.Sequential(OrderedDict([
            ('prediction', nn.Linear(latent_dim + layers[-1], 1, bias=False)),
            ('prediction', nn.Sigmoid())

        ]))
        
        self.apply(self._init_weights)
    
    # initialize weights
    def _init_weights(self, module):
        for name, layer in module.named_modules():
            if isinstance(layer, nn.Linear) and name == 'prediction':
                nn.init.normal_(layer.weight.data, mean=0.0, std=0.01)
                if layer.bias is not None:
                    layer.bias.data.fill_(0.0)

    def forward(self, user_input, item_input):
        GMF_user_latent = self.GMF_embedding_user(user_input)
        GMF_item_latent = self.GMF_embedding_item(item_input)
        GMF_output = GMF_user_latent * GMF_item_latent

        MLP_user_latent = self.MLP_embedding_user(user_input)
        MLP_item_latent = self.MLP_embedding_item(item_input)
        vector = torch.cat((MLP_user_latent, MLP_item_latent), dim=-1)
        MLP_output = self.MLP_dense_layers(vector)

        concat_output = torch.cat((GMF_output, MLP_output), dim=-1)

        output = self.prediction(concat_output)

        return output.squeeze()


# 3. Metric: HR, NDCG

# 4. Train 함수

In [48]:
def train(model, optimizer, train_loader, epochs, criterion, device):
    model.train()

    size = len(train_loader)

    # 훈련 시간 측정
    epoch_start = torch.cuda.Event(enable_timing=True)
    epoch_end = torch.cuda.Event(enable_timing=True)

    for epoch in range(epochs):
        epoch_loss = 0

        # 시작 시간 기록
        epoch_start.record()

        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device).float()

            optimizer.zero_grad()

            output = model(user, item)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()


        epoch_end.record()
        torch.cuda.synchronize()

        avg_loss = epoch_loss / size

        print(
            f'Epoch[{epoch+1}/{epochs}]\ttrain_loss: {avg_loss:.4f}' +
            f'\t훈련시간: {epoch_start.elapsed_time(epoch_end)/1000:.2f} sec'
        )


In [43]:
class DirFilePath:
    dir_base = os.path.join(os.path.join('/opt','ml','paper','RecSys'))
    dir_data = os.path.join(dir_base, 'Data', 'ml-latest-small')
    path_rating = os.path.join(dir_data, 'ratings.csv')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Use {device}')

# 해당 논문의 github에서 사용한 값 참고
config = {
    'seed': 42,
    'batch_size': 256,
    'epochs': 20,
    'latent_dim': 8,
    'num_neg': 4,
    'lr': 0.001,
    'optimizer': 'adam',
    'criterion': 'BCE',
    'device': device,
    'layers': [64, 32, 16, 8],
    'dropout': 0,
    'topK': 10,
    }

config = Box(config)

def setSeed(seed):
    random.seed(seed)
    np.random.seed(seed)

setSeed(config.seed)

Use cuda


In [None]:
df_rating = pd.read_csv(DirFilePath.path_rating)
df_rating, user_encoder, movie_encoder = encode(df_rating)

df_train, df_test = trainTestSplit(df_rating)

# # Create lists of all movies
# set_all_movies = set(df_rating['movieId'].unique())

# df_test_neg = getNegatives(df_train, df_test, set_all_movies, 99)

num_movies = len(df_rating['movieId'].unique())
num_users = len(df_rating['userId'].unique())

dataset_train = NeuMFDataset(df_train, num_movies, 4)
dataset_test = NeuMFDataset(df_test, num_movies, 99)

criterion = nn.BCELoss()

train_loader = DataLoader(dataset_train, batch_size=config.batch_size, shuffle=True, drop_last=False)
test_loader = DataLoader(dataset_test, batch_size=config.batch_size, shuffle=False, drop_last=False)

In [42]:
GMF_model = GMF(num_users, num_movies, config.latent_dim)
GMF_model.to(config.device)
GMF_optimizer = torch.optim.Adam(GMF_model.parameters(), lr=config.lr)

train(GMF_model, GMF_optimizer, train_loader, config.epochs, criterion, config.device)

Epoch[1/20]	train_loss: 0.5499	훈련시간: 23.22 sec
Epoch[2/20]	train_loss: 0.0927	훈련시간: 23.40 sec
Epoch[3/20]	train_loss: 0.0249	훈련시간: 23.29 sec
Epoch[4/20]	train_loss: 0.0143	훈련시간: 23.39 sec
Epoch[5/20]	train_loss: 0.0100	훈련시간: 23.53 sec
Epoch[6/20]	train_loss: 0.0074	훈련시간: 23.39 sec
Epoch[7/20]	train_loss: 0.0056	훈련시간: 23.58 sec
Epoch[8/20]	train_loss: 0.0042	훈련시간: 23.47 sec
Epoch[9/20]	train_loss: 0.0032	훈련시간: 23.60 sec
Epoch[10/20]	train_loss: 0.0024	훈련시간: 23.38 sec
Epoch[11/20]	train_loss: 0.0018	훈련시간: 23.32 sec
Epoch[12/20]	train_loss: 0.0013	훈련시간: 23.16 sec
Epoch[13/20]	train_loss: 0.0010	훈련시간: 23.23 sec
Epoch[14/20]	train_loss: 0.0007	훈련시간: 23.35 sec
Epoch[15/20]	train_loss: 0.0005	훈련시간: 23.39 sec
Epoch[16/20]	train_loss: 0.0004	훈련시간: 23.48 sec
Epoch[17/20]	train_loss: 0.0003	훈련시간: 23.41 sec
Epoch[18/20]	train_loss: 0.0002	훈련시간: 23.38 sec
Epoch[19/20]	train_loss: 0.0002	훈련시간: 23.53 sec
Epoch[20/20]	train_loss: 0.0001	훈련시간: 23.42 sec


In [52]:
MLP_model = MLP(num_users, num_movies, config.latent_dim, config.dropout, config.layers)
MLP_model.to(config.device)
MLP_optimizer = torch.optim.Adam(MLP_model.parameters(), lr=config.lr)

train(MLP_model, MLP_optimizer, train_loader, config.epochs, criterion, config.device)

Epoch[1/20]	train_loss: 0.1938	훈련시간: 24.79 sec
Epoch[2/20]	train_loss: 0.0173	훈련시간: 24.47 sec
Epoch[3/20]	train_loss: 0.0077	훈련시간: 24.60 sec
Epoch[4/20]	train_loss: 0.0050	훈련시간: 24.43 sec
Epoch[5/20]	train_loss: 0.0046	훈련시간: 24.47 sec
Epoch[6/20]	train_loss: 0.0045	훈련시간: 24.51 sec
Epoch[7/20]	train_loss: 0.0044	훈련시간: 24.64 sec
Epoch[8/20]	train_loss: 0.0044	훈련시간: 24.51 sec
Epoch[9/20]	train_loss: 0.0044	훈련시간: 24.48 sec
Epoch[10/20]	train_loss: 0.0043	훈련시간: 24.79 sec
Epoch[11/20]	train_loss: 0.0040	훈련시간: 24.72 sec
Epoch[12/20]	train_loss: 0.0029	훈련시간: 24.85 sec
Epoch[13/20]	train_loss: 0.0027	훈련시간: 24.47 sec
Epoch[14/20]	train_loss: 0.0027	훈련시간: 24.53 sec
Epoch[15/20]	train_loss: 0.0026	훈련시간: 24.63 sec
Epoch[16/20]	train_loss: 0.0025	훈련시간: 24.53 sec
Epoch[17/20]	train_loss: 0.0025	훈련시간: 24.83 sec
Epoch[18/20]	train_loss: 0.0025	훈련시간: 24.57 sec
Epoch[19/20]	train_loss: 0.0024	훈련시간: 24.51 sec
Epoch[20/20]	train_loss: 0.0024	훈련시간: 24.44 sec
