In [159]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn 

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from typing import Tuple

In [137]:
# Seed every random number generator this notebook relies on so that
# sampling (NumPy / random) and model initialisation (PyTorch) are
# repeatable after a kernel restart.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [115]:
# Locations of the dataset on disk: <dir_base>/Data/ml-latest-small/ratings.csv
dir_base = os.path.join('/opt', 'ml', 'paper', 'RecSys')
dir_data = os.path.join(dir_base, 'Data', 'ml-latest-small')
path_rating = os.path.join(dir_data, 'ratings.csv')

In [116]:
# Load the ratings CSV located at path_rating into a DataFrame.
df_rating = pd.read_csv(path_rating)

In [117]:
def encode(df: pd.DataFrame) -> Tuple[pd.DataFrame, LabelEncoder, LabelEncoder]:
    """
    Label-encode the 'userId' and 'movieId' columns.

    The input frame is left untouched; the encoding is applied to a copy so
    that re-running the calling cell never changes the caller's data behind
    its back.

    Args:
        df (dataframe): Frame containing 'userId' and 'movieId' columns.
    Returns:
        df_encoded (dataframe): Copy of ``df`` with both id columns replaced
            by the integer labels produced by the encoders.
        userId_label_encoder (LabelEncoder): Encoder fitted on 'userId'.
        movieId_label_encoder (LabelEncoder): Encoder fitted on 'movieId'.
    """
    userId_label_encoder = LabelEncoder()
    movieId_label_encoder = LabelEncoder()

    # Work on a copy: the original overwrote the caller's columns in place.
    df_encoded = df.copy()
    df_encoded['userId'] = userId_label_encoder.fit_transform(df_encoded['userId'].values)
    df_encoded['movieId'] = movieId_label_encoder.fit_transform(df_encoded['movieId'].values)

    # Decode back to the original ids with encoder.inverse_transform()
    return df_encoded, userId_label_encoder, movieId_label_encoder


In [118]:
# Swap the raw ids for their label-encoded versions and keep both encoders
# so encoded ids can later be mapped back with inverse_transform().
# NOTE: this rebinds df_rating, so the un-encoded frame is no longer
# reachable under that name after this cell runs.
df_rating, user_encoder, movie_encoder = encode(df_rating)

In [125]:
def trainTestSplit(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits our original data into one test and one
    training set.
    The test set is made up of one item for each user: the first row
    that appears for that user in ``df``. This is our holdout item used
    to compute Top@K later.
    The training set is the same as our original data but
    without any of the holdout items.
    Args:
        df (dataframe): Our original data. Must contain the columns
            'userId', 'movieId', 'rating' and 'timestamp'.
    Returns:
        df_train (dataframe): All of our data except holdout items
            (original row order and index preserved).
        df_test (dataframe): Only our holdout items, one row per user,
            indexed by and sorted on userId.
    """
    # cumcount() numbers each user's rows 0, 1, 2, ... in their original
    # order, so position 0 marks exactly one holdout row per user. Using a
    # single mask for both halves guarantees the row placed in the test set
    # is the very row removed from the training set.
    is_holdout = df.groupby('userId').cumcount() == 0

    df_test = df.loc[is_holdout, ['userId', 'movieId', 'rating', 'timestamp']].copy()
    # Index the holdout rows by user id (unnamed index), ordered by user.
    df_test.index = df_test['userId'].to_numpy()
    df_test = df_test.sort_index()

    # Return a copy so later column assignments on the training frame do
    # not operate on a view of ``df``.
    df_train = df.loc[~is_holdout].copy()

    return df_train, df_test
    
    

In [126]:
def maskFirst(x):
    """
    Return an array shaped like ``x`` holding 0 at the first position and 1
    everywhere else. An empty input yields an empty array.
    """
    result = np.ones_like(x)
    # Guard the assignment: indexing position 0 of an empty array raises.
    if result.size > 0:
        result[0] = 0

    return result

In [127]:
# Hold out one interaction per user as df_test; everything else is df_train.
df_train, df_test = trainTestSplit(df_rating)

In [155]:
def getNegatives(df_train: pd.DataFrame, df_test: pd.DataFrame, set_all_movies: set,
                 num_negatives: int = 99) -> pd.DataFrame:
    """
    Sample negative movies for every held-out (user, movie) pair.

    For each row of ``df_test`` the candidates are all movies in
    ``set_all_movies`` that the user has neither in ``df_train`` nor as the
    held-out movie itself; ``num_negatives`` of them are drawn without
    replacement using NumPy's global random state.

    Args:
        df_train (dataframe): Training interactions with 'userId' and
            'movieId' columns.
        df_test (dataframe): Held-out interactions with 'userId' and
            'movieId' columns.
        set_all_movies (set): Every movie id that may be sampled.
        num_negatives (int): Number of negatives drawn per test row.
    Returns:
        df_neg (dataframe): One row per test row laid out as
            [user, held-out movie, negative_1, ..., negative_k], with
            default integer column labels.
    Raises:
        ValueError: Raised by ``np.random.choice`` when a user has fewer
            than ``num_negatives`` candidate movies.
    """
    # Group the training interactions once instead of filtering the whole
    # frame again for every test user.
    seen_by_user = {
        user: set(movies)
        for user, movies in df_train.groupby('userId')['movieId']
    }

    list_negative = []
    for user, movie in zip(df_test['userId'].tolist(), df_test['movieId'].tolist()):
        # Users absent from the training data simply have no seen movies.
        set_pos_user_movies = seen_by_user.get(user, set()) | {movie}
        list_user_neg_movies = list(set_all_movies - set_pos_user_movies)

        sampled = np.random.choice(list_user_neg_movies, num_negatives, replace=False).tolist()
        list_negative.append([user, movie] + sampled)

    return pd.DataFrame(list_negative)

In [156]:
# Build the set of every distinct movieId present in the ratings table
set_all_movies = set(df_rating['movieId'].unique())

# For each held-out (user, movie) pair, sample movies that are neither in
# that user's training rows nor the held-out movie itself.
df_test_neg = getNegatives(df_train, df_test, set_all_movies)

In [None]:
class NeuMFDataset(Dataset):
    """
    Map-style dataset over a ratings DataFrame.

    Each sample is read from one row of the wrapped frame, which must
    contain 'userId', 'movieId' and 'rating' columns.

    NOTE(review): the (user, item, rating) sample layout is an assumption;
    confirm it matches what the training loop expects.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return (user, item, rating) tensors for the row at position ``idx``."""
        row = self.df.iloc[idx]

        # Ids are cast to int so they can index embedding tables (long
        # tensors); a mixed-dtype row would otherwise hand them back as floats.
        user = torch.tensor(int(row['userId']), dtype=torch.long)
        item = torch.tensor(int(row['movieId']), dtype=torch.long)
        rating = torch.tensor(float(row['rating']), dtype=torch.float32)

        return user, item, rating


In [None]:
class GMF(nn.Module):
    """
    Generalized Matrix Factorization (GMF) scorer.

    Looks up one embedding per user and one per item, multiplies them
    element-wise and passes the product through a bias-free linear layer
    followed by a sigmoid.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Width of both embedding tables.
        reg: Kept in the signature for callers; not used by this module.
    """

    def __init__(self, num_users, num_items, latent_dim, reg=[0, 0]):
        super().__init__()

        self.MF_embedding_user = nn.Embedding(num_users, latent_dim)
        # Must be named MF_embedding_item: forward() looks it up under
        # that name.
        self.MF_embedding_item = nn.Embedding(num_items, latent_dim)

        self.prediction = nn.Sequential(
            nn.Linear(latent_dim, 1, bias=False),
            nn.Sigmoid()
        )

    def forward(self, user_input, item_input):
        """
        Score (user, item) index pairs.

        Args:
            user_input: Integer tensor of user indices.
            item_input: Integer tensor of item indices, broadcastable
                against ``user_input``.
        Returns:
            Tensor of sigmoid outputs with a trailing dimension of size 1.
        """
        user_latent = self.MF_embedding_user(user_input)
        item_latent = self.MF_embedding_item(item_input)

        # Element-wise interaction of the two latent vectors
        product = user_latent * item_latent

        output = self.prediction(product)

        return output
