In [3]:
import pandas as pd
import numpy as np
np.random.seed(123)

In [4]:
# Read the ratings table from disk; the 'timestamp' column is handed to
# pandas' date parser.
ratings = pd.read_csv(
    'ratings.csv',
    parse_dates=['timestamp'],
)


In [None]:
# To keep memory usage manageable, use data from only 30% of the users in the dataset:
# randomly select 30% of the users and keep only the ratings from those users

In [5]:
# Draw 30% of the distinct user ids at random, without replacement.
unique_user_ids = ratings['userId'].unique()
n_selected_users = int(len(unique_user_ids) * 0.3)
rand_userIds = np.random.choice(
    unique_user_ids,
    size=n_selected_users,
    replace=False,
)

In [6]:
# Keep only the rows that belong to the sampled users.
# The explicit .copy() makes the result an independent frame, so adding a
# column to it later does not trigger pandas' SettingWithCopyWarning.
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
560,6,2,4.0,845553522
561,6,3,5.0,845554296
562,6,4,3.0,845554349
563,6,5,5.0,845553938
564,6,6,4.0,845553757


In [10]:
# Train-test split
# The timestamp column shows the date and time the review was submitted.
# Implement the train-test split using the leave-one-out methodology:
# for each user, the most recent review is used as the test set (i.e. leave one out);
# the rest is used as training data.

In [None]:
# Split ratings dataset into a train and test set using leave-one-out methodology

In [11]:
# Rank every rating within its user by timestamp, newest first, so that
# rank 1 marks each user's most recent rating. method='first' breaks ties
# by order of appearance, giving exactly one rank-1 row per user.
latest_first_rank = (
    ratings
    .groupby(['userId'])['timestamp']
    .rank(method='first', ascending=False)
)
ratings['rank_latest'] = latest_first_rank

In [12]:
# Training rows: everything except each user's most recent rating.
is_not_latest = ratings['rank_latest'] != 1
train_ratings = ratings[is_not_latest]

In [13]:
# Test rows: the single most recent rating of each user (rank 1).
is_latest = ratings['rank_latest'] == 1
test_ratings = ratings[is_latest]

In [None]:
# drop columns that we no longer need

In [14]:
# Keep only the identifier and rating columns of the training rows.
train_columns = ['userId', 'movieId', 'rating']
train_ratings = train_ratings[train_columns]

In [15]:
# Keep only the identifier and rating columns of the test rows.
test_columns = ['userId', 'movieId', 'rating']
test_ratings = test_ratings[test_columns]

In [None]:
# Converting the dataset into an implicit feedback dataset
# The MovieLens dataset is based on explicit feedback.
# To convert it into an implicit feedback dataset,
# simply binarize the ratings by converting them all to 1 (i.e. the positive class),
# with 1 representing that the user has interacted with the item.

In [16]:
# Binarize: every remaining row is an observed interaction, so its label is 1.
# assign() builds a new frame instead of writing into a frame derived from a
# slice of `ratings`, which avoids SettingWithCopyWarning, keeps the cell
# idempotent, and stores the label as the integer 1.
train_ratings = train_ratings.assign(rating=1)

In [17]:
train_ratings.head()

Unnamed: 0,userId,movieId,rating
560,6,2,1
561,6,3,1
562,6,4,1
563,6,5,1
564,6,6,1


In [None]:
# After binarizing the dataset, every sample belongs to the positive class,
# so we also need negative samples to train our models

In [None]:
# The code below generates 4 negative samples for each row of data,
# i.e. the ratio of negative to positive samples is 4:1.
# This ratio is chosen arbitrarily.

In [29]:
# Collect the distinct movie IDs present in `ratings`; negative samples are
# drawn from this pool.
movie_id_column = ratings['movieId']
all_movieIds = movie_id_column.unique()

In [30]:
# Empty accumulators for the training samples: one entry per sample in each
# of the three parallel lists (user id, item id, label).
users = []
items = []
labels = []

In [31]:
# Set of (userId, movieId) pairs found in the training data, i.e. the items
# each user has interacted with. Used for fast membership tests.
train_user_ids = train_ratings['userId']
train_movie_ids = train_ratings['movieId']
user_item_set = {
    (user_id, movie_id)
    for user_id, movie_id in zip(train_user_ids, train_movie_ids)
}

In [32]:
# 4:1 ratio of negative to positive samples
num_negatives = 4

# Start from empty lists so that re-running this cell does not append a
# second copy of the samples onto the existing ones.
users, items, labels = [], [], []

for (u, i) in user_item_set:

    # items that the user has interacted with are positive
    users.append(u)
    items.append(i)
    labels.append(1)

    for _ in range(num_negatives):

        # randomly select an item, re-drawing until it is one the user has
        # not interacted with
        negative_item = np.random.choice(all_movieIds)
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)

        # Record the negative once per draw. These appends must sit outside
        # the `while` loop: inside it they only ran when a re-draw happened,
        # so almost no negative samples were ever stored.
        users.append(u)
        items.append(negative_item)
        labels.append(0)  # items not interacted with are negative

In [None]:
# Done up to this point: we now have the data in the format required by our model

In [None]:
# define a PyTorch Dataset to facilitate training

In [35]:
import torch
from torch.utils.data import Dataset

In [36]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for training on implicit feedback.

    Every distinct (user, movie) pair in ``ratings`` becomes a positive
    sample (label 1). For each positive, ``num_negatives`` movies that the
    same user has not interacted with are drawn at random from
    ``all_movieIds`` and stored as negative samples (label 0).

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings; only
            the ``userId`` and ``movieId`` columns are read.
        all_movieIds (list): List containing all movieIds; the pool that
            negative samples are drawn from.
        num_negatives (int): Number of negative samples generated per
            positive sample. Defaults to 4.

    Raises:
        ValueError: If negatives are requested for a user who has already
            interacted with every movie in ``all_movieIds``.
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_movieIds, num_negatives
        )

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds, num_negatives=4):
        """Build the positive and negative training samples.

        Args:
            ratings (pd.DataFrame): Frame with ``userId`` and ``movieId``
                columns describing the observed interactions.
            all_movieIds (list): Movie ids that negatives are sampled from.
            num_negatives (int): Negatives to draw per positive sample.

        Returns:
            tuple: Three tensors of equal length ``(users, items, labels)``.

        Raises:
            ValueError: If ``num_negatives`` is positive and some user has
                interacted with every movie in ``all_movieIds``.
        """
        users, items, labels = [], [], []

        # Group the interactions per user. Using sets removes duplicate
        # (user, movie) rows and gives O(1) "has this user seen it?" checks.
        seen_by_user = {}
        for u, i in zip(ratings['userId'], ratings['movieId']):
            seen_by_user.setdefault(u, set()).add(i)

        # Only needed to detect users for whom no negative can exist.
        candidates = set(all_movieIds) if num_negatives > 0 else set()

        for u, seen in seen_by_user.items():

            # Without this guard the rejection loop below would never end
            # for a user who has interacted with every candidate movie.
            if num_negatives > 0 and candidates <= seen:
                raise ValueError(
                    f"user {u!r} has interacted with every movie in "
                    "all_movieIds; cannot sample negatives"
                )

            for i in seen:

                users.append(u)
                items.append(i)
                labels.append(1)

                for _ in range(num_negatives):

                    # Re-draw until the movie is one the user has not seen,
                    # so that no positive pair is labelled as negative.
                    negative_item = np.random.choice(all_movieIds)
                    while negative_item in seen:
                        negative_item = np.random.choice(all_movieIds)

                    # Appended inside the loop: one negative per draw.
                    users.append(u)
                    items.append(negative_item)
                    labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)
    