In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import copy
import sys
from sklearn.model_selection import train_test_split
import os

file_path = os.path.dirname(os.getcwd())

import pandas as pd
import numpy as np
import pickle

# ========================================
# Load MovieLens 1M datasets
# ========================================
data_movies = pd.read_csv(file_path+'./ml-1m/movies.dat', sep = "::", names = ['movieid','title','genres'], encoding_errors='ignore')
data_ratings = pd.read_csv(file_path+'./ml-1m/ratings.dat', sep = "::", names = ['userid','movieid','ratings','timestamp'], encoding_errors='ignore')
data_users = pd.read_csv(file_path+'./ml-1m/users.dat', sep = "::", names = ['userid','gender','age','occupation','zipcode'], encoding_errors='ignore')

  data_movies = pd.read_csv(file_path+'./ml-1m/movies.dat', sep = "::", names = ['movieid','title','genres'], encoding_errors='ignore')
  data_ratings = pd.read_csv(file_path+'./ml-1m/ratings.dat', sep = "::", names = ['userid','movieid','ratings','timestamp'], encoding_errors='ignore')
  data_users = pd.read_csv(file_path+'./ml-1m/users.dat', sep = "::", names = ['userid','gender','age','occupation','zipcode'], encoding_errors='ignore')


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from box import Box

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
config = {

    'p_dims': [200, 600],
    'dropout_rate' : 0.5,
    'weight_decay' : 0.01,
    'valid_samples' : 10,
    'seed' : 22,
    'anneal_cap' : 0.2,
    'total_anneal_steps' : 200000,

    'lr' : 0.005,
    'batch_size' : 500,
    'num_epochs' : 50,
    'num_workers' : 0,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

In [6]:
class MakeMatrixDataSet():
    def __init__(self, ratings):
        
        self.df = ratings
        
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('movieid')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('userid')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['movieid'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['userid'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:

        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['timestamp']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(42)

            user_total = users[user]
            valid = np.random.choice(user_total, size = 10, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid 

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_matrix based on input user_list
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1

        return mat
    
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

In [7]:
class MultiVAE(nn.Module):

    def __init__(self, p_dims, dropout_rate = 0.5):
        super(MultiVAE, self).__init__()
        self.p_dims = p_dims
        self.q_dims = p_dims[::-1]

        temp_q_dims = self.q_dims[:-1] + [self.q_dims[-1] * 2]

        self.q_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:])])

        self.p_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.p_dims[:-1], self.p_dims[1:])])

        self.drop = nn.Dropout(dropout_rate)
        self.init_weights()
    
    def forward(self, input, loss = False):
        mu, logvar = self.encode(input)
        z = self.reparameterize(mu, logvar)
        h = self.decode(z)
        if loss:
            return h, mu, logvar
        else:
            return h
    
    def encode(self, input):
        h = F.normalize(input)
        h = self.drop(h)

        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != len(self.q_layers) - 1:
                h = F.tanh(h)
            else:
                mu = h[:, :self.q_dims[-1]]
                logvar = h[:, self.q_dims[-1]:]
        return mu, logvar

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu
    
    def decode(self, z):
        h = z
        for i, layer in enumerate(self.p_layers):
            h = layer(h)
            if i != len(self.p_layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        for layer in self.q_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)
        
        for layer in self.p_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)
     

In [8]:
class LossFunc(nn.Module):

    def __init__(self, loss_type = 'Multinomial', model_type = None):
        super(LossFunc, self).__init__()
        self.loss_type = loss_type
        self.model_type = model_type

    def forward(self, recon_x = None, x = None, mu = None, logvar = None, anneal = None):
        if self.loss_type == 'Gaussian':
            loss = self.Gaussian(recon_x, x)
        elif self.loss_type == 'Logistic':
            loss = self.Logistic(recon_x, x)
        elif self.loss_type == 'Multinomial':
            loss = self.Multinomial(recon_x, x)
        
        if self.model_type == 'VAE':
            KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))
            loss = loss + anneal * KLD
        
        return loss

    def Gaussian(self, recon_x, x):
        gaussian = F.mse_loss(recon_x, x)
        return gaussian

    def Logistic(self, recon_x, x):
        logistic = F.binary_cross_entropy(recon_x.sigmoid(), x, reduction='none').sum(1).mean()
        return logistic

    def Multinomial(self, recon_x, x):
        # putting log in pmf and omit the factorial part, only calculates the sum of x * p_i
        multinomial = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
        return multinomial

In [9]:
def get_ndcg(pred_list, true_list):
    idcg = sum((1 / np.log2(rank + 2) for rank in range(1, len(pred_list))))
    dcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            dcg += 1 / np.log2(rank + 2)
    ndcg = dcg / idcg
    return ndcg

def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit


def train(model, criterion, optimizer, data_loader, make_matrix_data_set, config):
    global update_count
    model.train()
    loss_val = 0
    for users in data_loader:
        mat = make_matrix_data_set.make_matrix(users)
        mat = mat.to(device)

        if criterion.model_type == 'VAE':
            anneal = min(config.anneal_cap, 1. * update_count / config.total_anneal_steps)
            update_count += 1
            recon_mat, mu, logvar = model(mat, loss = True)
            
            optimizer.zero_grad()
            loss = criterion(recon_x = recon_mat, x = mat, mu = mu, logvar = logvar, anneal = anneal)

        else:
            recon_mat = model(mat)
            optimizer.zero_grad()
            loss = criterion(recon_x = recon_mat, x = mat)

        loss_val += loss.item()

        loss.backward()
        optimizer.step()

    loss_val /= len(data_loader)

    return loss_val

def evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set):
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)

            recon_mat = model(mat)
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)

            for user, rec in zip(users, rec_list):
                uv = user_valid[user.item()]
                up = rec[-10:].cpu().numpy().tolist()
                NDCG += get_ndcg(pred_list = up, true_list = uv)
                HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(data_loader.dataset)
    HIT /= len(data_loader.dataset)

    return NDCG, HIT

In [10]:
loss_dict = {}
ndcg_dict = {}
hit_dict = {}

In [11]:
make_matrix_data_set = MakeMatrixDataSet(data_ratings)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()
ae_dataset = AEDataSet(
    num_user = make_matrix_data_set.num_user,
    )

# 500 users batch
data_loader = DataLoader(
    ae_dataset,
    batch_size = 500, 
    shuffle = True, 
    pin_memory = True,
    num_workers = 0,
    )

VAE with Logistic Loss Function

In [14]:
model = MultiVAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item], 
    dropout_rate = config.dropout_rate).to(device)

criterion = LossFunc(loss_type = 'Logistic', model_type = 'VAE')
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

In [15]:
best_hit = 0
update_count = 1
loss_list = []
ndcg_list = []
hit_list = []
for epoch in range(1, config.num_epochs + 1):
    
    train_loss = train(
        model = model, 
        criterion = criterion, 
        optimizer = optimizer, 
        data_loader = data_loader,
        make_matrix_data_set = make_matrix_data_set,
        config = config,
        )
    
    ndcg, hit = evaluate(
        model = model, 
        data_loader = data_loader,
        user_train = user_train,
        user_valid = user_valid,
        make_matrix_data_set = make_matrix_data_set,
        )

    loss_list.append(train_loss)
    ndcg_list.append(ndcg)
    hit_list.append(hit)

    print(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 1046.69032| NDCG@10: 0.05430| HIT@10: 0.04553
Epoch:   2| Train loss: 562.90932| NDCG@10: 0.07126| HIT@10: 0.06000
Epoch:   3| Train loss: 544.27723| NDCG@10: 0.07817| HIT@10: 0.06666
Epoch:   4| Train loss: 517.95200| NDCG@10: 0.08403| HIT@10: 0.06970
Epoch:   5| Train loss: 496.85051| NDCG@10: 0.08579| HIT@10: 0.07219
Epoch:   6| Train loss: 479.96840| NDCG@10: 0.07854| HIT@10: 0.06589
Epoch:   7| Train loss: 470.85446| NDCG@10: 0.08613| HIT@10: 0.07139
Epoch:   8| Train loss: 460.70295| NDCG@10: 0.08206| HIT@10: 0.06899
Epoch:   9| Train loss: 451.70306| NDCG@10: 0.07891| HIT@10: 0.06634
Epoch:  10| Train loss: 446.84652| NDCG@10: 0.08449| HIT@10: 0.07190
Epoch:  11| Train loss: 443.02700| NDCG@10: 0.08173| HIT@10: 0.06667
Epoch:  12| Train loss: 445.67227| NDCG@10: 0.08458| HIT@10: 0.06993
Epoch:  13| Train loss: 437.99890| NDCG@10: 0.08361| HIT@10: 0.07017
Epoch:  14| Train loss: 446.54483| NDCG@10: 0.08357| HIT@10: 0.07086
Epoch:  15| Train loss: 443.04455

Multi-VAE

In [32]:
model = MultiVAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item], 
    dropout_rate = config.dropout_rate).to(device)

criterion = LossFunc(loss_type = 'Multinomial', model_type = 'VAE')
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

In [33]:
best_hit = 0
update_count = 1
loss_list = []
ndcg_list = []
hit_list = []
for epoch in range(1, config.num_epochs + 1):
    
    train_loss = train(
        model = model, 
        criterion = criterion, 
        optimizer = optimizer, 
        data_loader = data_loader,
        make_matrix_data_set = make_matrix_data_set,
        config = config,
        )
    
    ndcg, hit = evaluate(
        model = model, 
        data_loader = data_loader,
        user_train = user_train,
        user_valid = user_valid,
        make_matrix_data_set = make_matrix_data_set,
        )

    loss_list.append(train_loss)
    ndcg_list.append(ndcg)
    hit_list.append(hit)

    print(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')



Epoch:   1| Train loss: 1180.00846| NDCG@10: 0.08561| HIT@10: 0.07157
Epoch:   2| Train loss: 1158.18674| NDCG@10: 0.12077| HIT@10: 0.10230
Epoch:   3| Train loss: 1141.33620| NDCG@10: 0.14885| HIT@10: 0.12773
Epoch:   4| Train loss: 1108.19178| NDCG@10: 0.16510| HIT@10: 0.14179
Epoch:   5| Train loss: 1097.91410| NDCG@10: 0.17916| HIT@10: 0.15232
Epoch:   6| Train loss: 1066.42540| NDCG@10: 0.18532| HIT@10: 0.15828
Epoch:   7| Train loss: 1093.40770| NDCG@10: 0.18975| HIT@10: 0.16131
Epoch:   8| Train loss: 1074.27530| NDCG@10: 0.19722| HIT@10: 0.16849
Epoch:   9| Train loss: 1090.46537| NDCG@10: 0.19813| HIT@10: 0.16940
Epoch:  10| Train loss: 1070.13530| NDCG@10: 0.20431| HIT@10: 0.17432
Epoch:  11| Train loss: 1072.67084| NDCG@10: 0.20402| HIT@10: 0.17364
Epoch:  12| Train loss: 1048.56078| NDCG@10: 0.21197| HIT@10: 0.18179
Epoch:  13| Train loss: 1072.10419| NDCG@10: 0.21245| HIT@10: 0.17977
Epoch:  14| Train loss: 1072.47389| NDCG@10: 0.20758| HIT@10: 0.17599
Epoch:  15| Train lo