In [None]:
import pandas as pd
from datetime import datetime
import numpy as np

def RMSE(ratings_pred, ratings):
    ratings_mask = ratings > 0
    return torch.sum((ratings_pred * ratings_mask - ratings) ** 2)

# Load data into memory

df_train = pd.read_csv('movielens1m_test.csv')
df_test = pd.read_csv('movielens1m_train.csv')
df_train['UserID'] = df_train['UserID']
df_test['UserID'] = df_test['UserID']
USERS = max(df_train['UserID'].max(), df_test['UserID'].max()) + 1
df_train['MovieID'] = df_train['MovieID']
df_test['MovieID'] = df_test['MovieID']
MOVIES = max(df_train['MovieID'].max(), df_test['MovieID'].max()) + 1

df = pd.concat([df_train, df_test])
ROWS = df['UserID']
COLS = df['MovieID']

print('Finished feature engineering...')
print('df shape', len(df))
print('df_train.head()', len(df_train))
print(df_train.head())
print('df_test.head()', len(df_test))
print(df_test.head())

Finished feature engineering...
df shape 1000209
df_train.head() 199828
   Unnamed: 0  UserID  MovieID  Rating  Timestamp
0           3       0        3       4  978300275
1          22       0       22       5  978300055
2          30       0       30       4  978824291
3          34       0       34       4  978824330
4          42       0       42       4  978301753
df_test.head() 800381
   Unnamed: 0  UserID  MovieID  Rating  Timestamp
0           0       0        0       5  978300760
1           1       0        1       3  978302109
2           2       0        2       3  978301968
3           4       0        4       5  978824291
4           5       0        5       3  978302268


In [None]:
from scipy.sparse import csr_matrix
import torch
import random
import numpy as np

seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device('cpu')
print('Using device', device)
torch.set_default_tensor_type(torch.FloatTensor)

# Construct the train and test rating matrices.

train_data = csr_matrix((df_train['Rating'], (df_train['UserID'], df_train['MovieID'])), shape=(USERS, MOVIES), dtype=np.float32)
RMSE_train = torch.Tensor(train_data.copy().toarray()).to(device)
rows, cols = train_data.nonzero()
train_mn = train_data.min()
train_mx = train_data.max()
for i, j in zip(rows, cols):
  train_data[i, j] = (train_data[i, j] - train_mn) / (train_mx - train_mn)


test_data = csr_matrix((df_test['Rating'], (df_test['UserID'], df_test['MovieID'])), shape=(USERS, MOVIES), dtype=np.float32)
RMSE_test = torch.Tensor(test_data.copy().toarray()).to(device)
rows, cols = test_data.nonzero()
test_mn = test_data.min()
test_mx = test_data.max()
for i, j in zip(rows, cols):
  test_data[i, j] = (test_data[i, j] - test_mn) / (test_mx - test_mn)


Using device cuda:0


In [None]:
print(train_data.shape, test_data.shape)

(6040, 3706) (6040, 3706)


In [None]:
# Precalculate batches for faster computing.
R_train = train_data
R_train_tensor = []
for i in range(USERS):
    batch = R_train[i].nonzero()[1]
    ts = torch.from_numpy(R_train[i, batch].todense().transpose().astype(np.float32)).to(device)
    R_train_tensor.append(ts)
print(len(R_train_tensor), R_train_tensor[0].shape)
print(R_train.shape)

R_train_T = train_data.transpose()
R_train_tensor_T = []
for j in range(MOVIES):
    batch = R_train_T[j].nonzero()[1]
    ts = torch.from_numpy(R_train_T[j, batch].todense().transpose().astype(np.float32)).to(device)
    R_train_tensor_T.append(ts)
print(len(R_train_tensor_T), R_train_tensor_T[0].shape)

6040 torch.Size([5, 1])
(6040, 3706)
3706 torch.Size([342, 1])


In [None]:
import numpy as np
from copy import deepcopy

import torch
from torch import nn
from torch.nn import functional as F
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.autograd import Variable as V


def swish(x):
    return x.mul(torch.sigmoid(x))


def log_norm_pdf(x, mu, logvar):
    return -0.5 * (logvar + np.log(2 * np.pi) + (x - mu).pow(2) / logvar.exp())


def reparameterize(mu, logvar):
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return eps.mul(std).add_(mu)


def weights_init(m):
    pass
    # if isinstance(m, nn.Linear):
    #     torch.nn.init.normal_(m.weight, 0, 0.01)
    #     torch.nn.init.normal(m.bias, 0, 0.01)


class Encoder(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, eps=1e-1):
        super(Encoder, self).__init__()

        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(input_dim, 40)
        self.ln1 = nn.LayerNorm(40, eps=eps)

        self.fc_mu = nn.Linear(40, latent_dim)
        self.fc_logvar = nn.Linear(40, latent_dim)

        self.fc1.apply(weights_init)
        self.fc_mu.apply(weights_init)
        self.fc_logvar.apply(weights_init)

    def forward(self, x, dropout_rate, calculate_loss=True):
        x = F.dropout(x, p=dropout_rate, training=self.training)

        h1 = self.ln1(self.act(self.fc1(x)))

        mu, logvar = self.fc_mu(h1), self.fc_logvar(h1)

        return mu, logvar


class Decoder(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, eps=1e-1):
        super(Decoder, self).__init__()

        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(latent_dim, input_dim)

        self.fc1.apply(weights_init)

    def forward(self, z, calculate_loss=True):
        x = torch.sigmoid(self.fc1(z))

        return x


class VAE(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, device):
        super(VAE, self).__init__()

        self.encoder = Encoder(hidden_dim, latent_dim, input_dim)
        self.decoder = Decoder(hidden_dim, latent_dim, input_dim)
        self.zr = torch.zeros(1, latent_dim).to(device=device)
        self.mones = torch.ones(1, latent_dim).to(device) * -1

    def forward(self, user_ratings, alpha=0.5, beta=None, gamma=1, dropout_rate=0.5, calculate_loss=True, n_epoch=1):
        mu, logvar = self.encoder(user_ratings, dropout_rate=dropout_rate, calculate_loss=calculate_loss)
        z = reparameterize(mu, logvar)
        x_pred = self.decoder(z, calculate_loss=calculate_loss)

        if calculate_loss:
            if gamma:
                kl_weight = gamma * 10
            elif beta:
                kl_weight = beta

            # Choose between MSE and cross-entropy

            # user_mask = user_ratings > 0
            # mll = torch.pow(x_pred * user_mask - user_ratings, 2).sum(dim=-1).mul(kl_weight).mean()
            mll = (-torch.log(x_pred) * user_ratings).sum(dim=-1).mul(kl_weight).mean()

            # Compute divergence
            prior = log_norm_pdf(z, self.zr, self.zr)
            kld = (log_norm_pdf(z, mu, logvar) - prior).sum(dim=-1).mul(kl_weight).mean()

            negative_elbo = mll + kld

            return mu, z, negative_elbo
        else:
            return mu

    def update_prior(self):
        self.prior.encoder_old.load_state_dict(deepcopy(self.encoder.state_dict()))


class DualVAE(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim_users, input_dim_movies, device, eps=1e-1):
        super(DualVAE, self).__init__()
        self.device = device
        self.movies_VAE = VAE(hidden_dim, latent_dim, input_dim_users, device)
        self.users_VAE = VAE(hidden_dim, latent_dim, input_dim_movies, device)
        self.input_dim_users = input_dim_users
        self.input_dim_movies = input_dim_movies
        self.f = latent_dim
        self.f_eye = torch.eye(self.f).to(device)
        self.lambda_u = 10
        self.lambda_v = 10
        mU = MultivariateNormal(torch.zeros(latent_dim), 1 / self.lambda_u * torch.eye(latent_dim))
        mV = MultivariateNormal(torch.zeros(latent_dim), 1 / self.lambda_v * torch.eye(latent_dim))
        self.U = torch.stack([mU.sample() for _ in range(input_dim_users)]).to(device)
        self.V = torch.stack([mV.sample() for _ in range(input_dim_movies)]).to(device)

    def pmf_init(self, R_train, R_train_tensor, R_train_T, R_train_tensor_T):
        # Compute initial U vector
        self.v_conv_all = torch.stack([torch.einsum('n,m->nm', vj, vj.T) for vj in self.V])

        self.u_dev = torch.zeros((self.input_dim_users, self.f, self.f)).to(self.device)
        for i in range(self.input_dim_users):
            batch = torch.from_numpy(R_train[i].nonzero()[1]).to(dtype=torch.int64)
            v_sum = torch.sum(self.v_conv_all[batch], axis=0)
            self.u_dev[i] = torch.inverse(v_sum + self.lambda_u * self.f_eye)

            v_conv = torch.sum(R_train_tensor[i] * self.V[batch], axis=0)
            self.U[i] = self.u_dev[i] @ v_conv

        # Compute initial V vector
        self.u_conv_all = torch.stack([torch.einsum('n,m->nm', ui, ui.T) for ui in self.U])

        self.v_dev = torch.zeros((self.input_dim_movies, self.f, self.f)).to(self.device)
        for j in range(self.input_dim_movies):
            batch = torch.from_numpy(R_train_T[j].nonzero()[1]).to(dtype=torch.int64)
            u_sum = torch.sum(self.u_conv_all[batch], axis=0)
            self.v_dev[j] = torch.inverse(u_sum + self.lambda_v * self.f_eye)

            u_conv = torch.sum(R_train_tensor_T[j] * self.U[batch], axis=0)
            self.V[j] = self.v_dev[j] @ u_conv

    def forward(self, ratings, R_train, R_train_tensor, R_train_T, R_train_tensor_T, userFeatures=None, beta=None, gamma=1, dropout_rate=0.5, calculate_loss=True, n_epoch=1):
        if calculate_loss:
            mu_users, z_i, user_loss = self.users_VAE(ratings, beta=beta, gamma=gamma, dropout_rate=dropout_rate,
                                                      calculate_loss=calculate_loss, n_epoch=n_epoch)
            mu_movies, z_j, movie_loss = self.movies_VAE(ratings.T, beta=beta, gamma=gamma, dropout_rate=dropout_rate,
                                                         calculate_loss=calculate_loss, n_epoch=n_epoch)
            self.pmf_estimate(mu_users, mu_movies, R_train, R_train_tensor, R_train_T, R_train_tensor_T)
            # y_hat = self.U @ self.V.T

            normalizer = 0.5 * self.lambda_u * torch.sum(
                self.u_dev * torch.pow(self.U - mu_users, 2).unsqueeze(2)) + 0.5 * self.lambda_v * torch.sum(
                self.v_dev * torch.pow(self.V - mu_movies, 2).unsqueeze(2))
            total_loss = user_loss + movie_loss + normalizer
            return total_loss, user_loss, movie_loss, normalizer

        if userFeatures is not None:
            mu_users = self.users_VAE(userFeatures, beta=beta, gamma=gamma, dropout_rate=dropout_rate,
                                      calculate_loss=calculate_loss, n_epoch=n_epoch)
            y_hat = mu_users @ self.V.T
        else:
            y_hat = self.U @ self.V.T

        return y_hat

    def pmf_estimate(self, mu_users, mu_movies, R_train, R_train_tensor, R_train_T, R_train_tensor_T):
        with torch.no_grad():
            # Update U vector
            self.v_conv_all = torch.stack([torch.einsum('n,m->nm', vj, vj.T) for vj in self.V])
            for i in range(self.input_dim_users):
                batch = torch.from_numpy(R_train[i].nonzero()[1]).to(dtype=torch.int64)
                v_sum = torch.sum(self.v_conv_all[batch], axis=0)
                v_dev_sm = torch.sum(self.v_dev[batch], axis=0)
                self.u_dev.data[i] = torch.inverse(v_sum + v_dev_sm + self.lambda_u * self.f_eye)

                v_conv = torch.sum(R_train_tensor[i] * self.V[batch], axis=0)
                self.U.data[i] = self.u_dev[i] @ (v_conv + self.lambda_u * mu_users[i])

            # Update V vector
            self.u_conv_all = torch.stack([torch.einsum('n,m->nm', ui, ui.T) for ui in self.U])
            for j in range(self.input_dim_movies):
                batch = torch.from_numpy(R_train_T[j].nonzero()[1]).to(dtype=torch.int64)
                u_sum = torch.sum(self.u_conv_all[batch], axis=0)
                u_dev_sm = torch.sum(self.u_dev[batch], axis=0)
                self.v_dev.data[j] = torch.inverse(u_sum + u_dev_sm + self.lambda_v * self.f_eye)

                u_conv = torch.sum(R_train_tensor_T[j] * self.U[batch], axis=0)
                self.V.data[j] = self.v_dev[j] @ (u_conv + self.lambda_v * mu_movies[j])

    def update_prior(self):
        self.users_VAE.update_prior()
        self.movies_VAE.update_prior()

In [None]:
import numpy as np

import torch
from torch import optim

import random
from copy import deepcopy
from tqdm import trange

args = {
    'dataset': '',
    'hidden_dim': 40,
    'latent_dim': 5,
    'batch_size': train_data.count_nonzero(),
    'beta': None,
    'gamma': 1,
    'lr': 1e-3,
    'n_epochs': 150,
    'dropout_rate': 0.8,
    'print_step': 1,
    'n_enc_epochs': 3,
    'n_dec_epochs': 1,
    'not_alternating': True,
}

user_losses = []
movie_losses = []
mse_losses = []

def generate(batch_size, device, data_in, data_out=None, shuffle=False, samples_perc_per_epoch=1):
    yield Batch(device, [], data_in, data_out)


class Batch:
    def __init__(self, device, idx, data_in, data_out=None):
        self._device = device
        self._idx = idx
        self._data_in = data_in
        self._data_out = data_out
        
    def get_ratings(self, is_out=False):
        data = self._data_in
        return data
    
    def get_ratings_to_dev(self, is_out=False):
        ratings = self.get_ratings(is_out)
        return torch.Tensor(ratings.toarray()).to(self._device)


def evaluate(model, data_in, data_out, RMSE_data, metrics, data_mn, data_mx, samples_perc_per_epoch=1, batch_size=500):
    metrics = deepcopy(metrics)
    model.eval()
    
    for m in metrics:
        m['score'] = []
    
    for batch in generate(batch_size=batch_size,
                          device=device,
                          data_in=data_in,
                          data_out=data_out,
                          samples_perc_per_epoch=samples_perc_per_epoch
                         ):
        
        ratings = batch.get_ratings_to_dev()
        ratings_pred = model(ratings, R_train, R_train_tensor, R_train_T, R_train_tensor_T, calculate_loss=False) * (data_mx - data_mn) + data_mn
        for m in metrics:
            x = m['metric'](ratings_pred, RMSE_data)
            m['score'].append(x.cpu().detach())
    for m in metrics:
        m['score'] = np.sqrt(np.sum(m['score']) / data_in.count_nonzero())
        
    return [x['score'] for x in metrics]


def run(model, opts, train_data, batch_size, n_epoch, n_epochs, beta, gamma, dropout_rate):
    model.train()
    for epoch in range(n_epochs):
        for batch in generate(batch_size=batch_size, device=device, data_in=train_data, shuffle=False):
            ratings = batch.get_ratings_to_dev()

            for optimizer in opts:
                optimizer.zero_grad()
              
            loss, user_loss, movie_loss, mse = model(ratings, R_train, R_train_tensor, R_train_T, R_train_tensor_T, beta=beta, gamma=gamma, dropout_rate=dropout_rate, n_epoch=n_epoch)
            loss.backward()

            user_losses.append(user_loss)
            movie_losses.append(movie_loss)
            mse_losses.append(mse)
            # print('Epoch:', n_epoch, 'user_loss=', f'{user_loss:.4f}', 'movie_loss', f'{movie_loss:.4f}', 'mse_loss', f'{mse:.4f}')


            for optimizer in opts:
                optimizer.step()


model_kwargs = {
    'hidden_dim': args['hidden_dim'],
    'latent_dim': args['latent_dim'],
    'input_dim_users': train_data.shape[0],
    'input_dim_movies': train_data.shape[1],
    'device': device
}
metrics = [{'metric': RMSE}]

best_valid = np.inf
train_scores, valid_scores = [], []

model = DualVAE(**model_kwargs).to(device)
batch = Batch(device, [], train_data, train_data)
ratings = batch.get_ratings_to_dev()
model.pmf_init(R_train, R_train_tensor, R_train_T, R_train_tensor_T)
model_best = DualVAE(**model_kwargs).to(device)

learning_kwargs = {
    'model': model,
    'train_data': train_data,
    'batch_size': args['batch_size'],
    'beta': args['beta'],
    'gamma': args['gamma']
}


optimizer = optim.Adam(set(model.parameters()), lr=args['lr'], weight_decay=1)
import time
for epoch in range(args['n_epochs']):
    t = time.time()

    run(opts=[optimizer], n_epoch=epoch, n_epochs=1, dropout_rate=args['dropout_rate'], **learning_kwargs)

    train_scores.append(
        evaluate(model, train_data, train_data, RMSE_train, metrics, train_mn, train_mx, 1, batch_size=args['batch_size'])[0]
    )
    valid_scores.append(
        evaluate(model, test_data, test_data, RMSE_test, metrics, test_mn, test_mx, 1, batch_size=args['batch_size'])[0]
    )
    if valid_scores[-1] < best_valid:
      best_valid = valid_scores[-1]
      model_best.load_state_dict(deepcopy(model.state_dict()))
    if epoch % args['print_step'] == 0:
      print('Time:', time.time() - t)
      print('Epoch:', epoch, 'train_RMSE=', f'{train_scores[-1]:.4f}', 'test_RMSE', f'{valid_scores[-1]:.4f}', 'min_test_RMSE', f'{min(valid_scores):.4f}')

Time: 12.8509361743927
Epoch: 0 train_RMSE= 2.1900 test_RMSE 2.2555 min_test_RMSE 2.2555
Time: 12.794533252716064
Epoch: 1 train_RMSE= 1.3666 test_RMSE 1.4309 min_test_RMSE 1.4309
Time: 12.831523656845093
Epoch: 2 train_RMSE= 1.2706 test_RMSE 1.3349 min_test_RMSE 1.3349
Time: 12.85643482208252
Epoch: 3 train_RMSE= 1.2053 test_RMSE 1.2710 min_test_RMSE 1.2710
Time: 12.741624593734741
Epoch: 4 train_RMSE= 1.1595 test_RMSE 1.2244 min_test_RMSE 1.2244
Time: 12.617982625961304
Epoch: 5 train_RMSE= 1.1223 test_RMSE 1.1855 min_test_RMSE 1.1855
Time: 12.678699970245361
Epoch: 6 train_RMSE= 1.0943 test_RMSE 1.1561 min_test_RMSE 1.1561
Time: 12.656322956085205
Epoch: 7 train_RMSE= 1.0757 test_RMSE 1.1376 min_test_RMSE 1.1376
Time: 12.782132863998413
Epoch: 8 train_RMSE= 1.0600 test_RMSE 1.1193 min_test_RMSE 1.1193


KeyboardInterrupt: ignored