In [1]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import pandas as pd
import bottleneck as bn

In [2]:
data_dir = '../data/'
tr_data = pd.read_csv(data_dir+'train_ratings.csv')
te_data = pd.read_csv(data_dir+'test_ratings.csv')

In [3]:
tr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306795 entries, 0 to 306794
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  306795 non-null  int64 
 1   isbn     306795 non-null  object
 2   rating   306795 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 7.0+ MB


## Preprocessing

In [4]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

In [5]:
def filter_triplets(tp, min_uc=1, min_sc=0):
    # Only keep the triplets for items which were clicked on by at least min_sc users. 
    if min_sc > 0:
        itemcount = get_count(tp, 'isbn')
        tp = tp[tp['isbn'].isin(itemcount.index[itemcount['size'] >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc items
    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion
    if min_uc > 0:
        usercount = get_count(tp, 'user_id')
        tp = tp[tp['user_id'].isin(usercount.index[usercount['size'] >= min_uc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'user_id'), get_count(tp, 'isbn') 
    return tp, usercount, itemcount

Only keep items that are clicked on by at least 2 users

In [6]:
tr_data, tr_user_activity, tr_item_popularity = filter_triplets(tr_data)

In [7]:
sparsity = 1. * tr_data.shape[0] / (tr_user_activity.shape[0] * tr_item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (tr_data.shape[0], tr_user_activity.shape[0], tr_item_popularity.shape[0], sparsity * 100))

After filtering, there are 69182 watching events from 12961 users and 43564 movies (sparsity: 0.012%)


In [8]:
def numerize(tp):
    uid = map(lambda x: profile2id[x], tp['user_id'])
    sid = map(lambda x: show2id[x], tp['isbn'])
    return pd.DataFrame(data={'uid': list(uid), 'sid': list(sid)}, columns=['uid', 'sid'])

In [9]:
unique_uid = tr_user_activity.index

np.random.seed(42)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [10]:
n_users = unique_uid.size
print(n_users*.8)
tr_users = unique_uid[:10368]
vd_users = unique_uid[10368:]

10368.800000000001


In [11]:
tr_plays = tr_data.loc[tr_data['user_id'].isin(tr_users)]

In [12]:
unique_sid = pd.unique(tr_plays['isbn'])

In [13]:
show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

In [14]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('user_id')
    tr_list, te_list = list(), list()

    np.random.seed(42)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 2:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [15]:
vd_plays = tr_data.loc[tr_data['user_id'].isin(vd_users)]
vd_plays = vd_plays.loc[vd_plays['isbn'].isin(unique_sid)]

In [16]:
vd_plays_tr, vd_plays_te = split_train_test_proportion(vd_plays)

0 users sampled


In [17]:
output_dir = '../data'
tr_data = numerize(tr_plays)
tr_data.to_csv(os.path.join(output_dir, 'train_vae.csv'), index=False)

vd_data_tr = numerize(vd_plays_tr)
vd_data_tr.to_csv(os.path.join(output_dir, 'validation_tr_vae.csv'), index=False)

vd_data_te = numerize(vd_plays_te)
vd_data_te.to_csv(os.path.join(output_dir, 'validation_te_vae.csv'), index=False)

## Model training

In [18]:
import random
import torch
from torch import optim
from src.models.VAE.RecVAE_model import VAE
from copy import deepcopy
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
device = torch.device("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
def load_data(csv_file):
    tp = pd.read_csv(csv_file)
    n_users = tp['uid'].max() + 1

    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float64',
                             shape=(n_users, n_items))
    return data

In [20]:
def load_te_data(data):
    tp = data
    n_users = tp['uid'].max() + 1

    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float64',
                             shape=(n_users, n_items))
    return data

In [21]:
def load_tr_te_data(csv_file_tr, csv_file_te):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                             (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te

In [22]:
n_items = len(unique_sid)
train_data = load_data(os.path.join(output_dir, 'train_vae.csv'))
valid_in_data, valid_out_data = load_tr_te_data(os.path.join(output_dir, 'validation_tr_vae.csv'), os.path.join(output_dir, 'validation_te_vae.csv'))

In [23]:
def generate(batch_size, device, data_in, data_out=None, shuffle=False, samples_perc_per_epoch=1):
    assert 0 < samples_perc_per_epoch <= 1
    
    total_samples = data_in.shape[0]
    samples_per_epoch = int(total_samples * samples_perc_per_epoch)
    
    if shuffle:
        idxlist = np.arange(total_samples)
        np.random.shuffle(idxlist)
        idxlist = idxlist[:samples_per_epoch]
    else:
        idxlist = np.arange(samples_per_epoch)
    
    for st_idx in range(0, samples_per_epoch, batch_size):
        end_idx = min(st_idx + batch_size, samples_per_epoch)
        idx = idxlist[st_idx:end_idx]

        yield Batch(device, idx, data_in, data_out)

In [24]:
class Batch:
    def __init__(self, device, idx, data_in, data_out=None):
        self._device = device
        self._idx = idx
        self._data_in = data_in
        self._data_out = data_out
    
    def get_idx(self):
        return self._idx
    
    def get_idx_to_dev(self):
        return torch.LongTensor(self.get_idx()).to(self._device)
        
    def get_ratings(self, is_out=False):
        data = self._data_out if is_out else self._data_in
        return data[self._idx]
    
    def get_ratings_to_dev(self, is_out=False):
        return torch.Tensor(
            self.get_ratings(is_out).toarray()
        ).to(self._device)

In [25]:
def evaluate(model, data_in, data_out, metrics, samples_perc_per_epoch=1, batch_size=500):
    metrics = deepcopy(metrics)
    model.eval()
    
    for m in metrics:
        m['score'] = []
        
    for batch in generate(batch_size=batch_size,
                          device=device,
                          data_in=data_in,
                          data_out=data_out,
                          samples_perc_per_epoch=samples_perc_per_epoch
                         ):
        
        ratings_in = batch.get_ratings_to_dev()
        ratings_out = batch.get_ratings(is_out=True)
    
        ratings_pred = model(ratings_in, calculate_loss=False).cpu().detach().numpy()
        
        if not (data_in is data_out):
            ratings_pred[batch.get_ratings().nonzero()] = -np.inf
            
        for m in metrics:
            m['score'].append(m['metric'](ratings_pred, ratings_out, k=m['k']))

    for m in metrics:
        m['score'] = np.concatenate(m['score']).mean()
        
    return [x['score'] for x in metrics]

In [26]:
def run(model, opts, train_data, batch_size, n_epochs, beta, gamma, dropout_rate):
    model.train()
    for epoch in range(n_epochs):
        for batch in generate(batch_size=batch_size, device=device, data_in=train_data, shuffle=True):
            ratings = batch.get_ratings_to_dev()

            for optimizer in opts:
                optimizer.zero_grad()
                
            _, loss = model(ratings, beta=beta, gamma=gamma, dropout_rate=dropout_rate)
            loss.backward()
            
            for optimizer in opts:
                optimizer.step()

In [27]:
def ndcg(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG

In [28]:
def recall(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return 

In [29]:
model_kwargs = {
    'hidden_dim': 400,
    'latent_dim': 100,
    'input_dim': train_data.shape[1]
}
metrics = [{'metric': ndcg, 'k': 20}]

best_ndcg = -np.inf
train_scores, valid_scores = [], []

In [30]:
model = VAE(**model_kwargs).to(device)
model_best = VAE(**model_kwargs).to(device)

In [31]:
model

VAE(
  (encoder): Encoder(
    (fc1): Linear(in_features=12559, out_features=400, bias=True)
    (ln1): LayerNorm((400,), eps=0.1, elementwise_affine=True)
    (fc2): Linear(in_features=400, out_features=400, bias=True)
    (ln2): LayerNorm((400,), eps=0.1, elementwise_affine=True)
    (fc3): Linear(in_features=400, out_features=400, bias=True)
    (ln3): LayerNorm((400,), eps=0.1, elementwise_affine=True)
    (fc4): Linear(in_features=400, out_features=400, bias=True)
    (ln4): LayerNorm((400,), eps=0.1, elementwise_affine=True)
    (fc5): Linear(in_features=400, out_features=400, bias=True)
    (ln5): LayerNorm((400,), eps=0.1, elementwise_affine=True)
    (fc_mu): Linear(in_features=400, out_features=100, bias=True)
    (fc_logvar): Linear(in_features=400, out_features=100, bias=True)
  )
  (prior): CompositePrior(
    (encoder_old): Encoder(
      (fc1): Linear(in_features=12559, out_features=400, bias=True)
      (ln1): LayerNorm((400,), eps=0.1, elementwise_affine=True)
      (f

In [32]:
learning_kwargs = {
    'model': model,
    'train_data': train_data,
    'batch_size': 256,
    'beta': None,
    'gamma': 0.005
}

In [33]:
decoder_params = set(model.decoder.parameters())
encoder_params = set(model.encoder.parameters())

In [34]:
optimizer_encoder = optim.Adam(encoder_params, lr=5e-4)
optimizer_decoder = optim.Adam(decoder_params, lr=5e-4)

In [35]:
for epoch in range(50):

    if False:
        run(opts=[optimizer_encoder, optimizer_decoder], n_epochs=1, dropout_rate=0.5, **learning_kwargs)
    else:
        run(opts=[optimizer_encoder], n_epochs=3, dropout_rate=0.5, **learning_kwargs)
        model.update_prior()
        run(opts=[optimizer_decoder], n_epochs=1, dropout_rate=0, **learning_kwargs)

    train_scores.append(
        evaluate(model, train_data, train_data, metrics, 0.01)[0]
    )
    valid_scores.append(
        evaluate(model, valid_in_data, valid_out_data, metrics, 1)[0]
    )
    
    if valid_scores[-1] > best_ndcg:
        best_ndcg = valid_scores[-1]
        model_best.load_state_dict(deepcopy(model.state_dict()))
        

    print(f'epoch {epoch} | valid ndcg@100: {valid_scores[-1]:.4f} | ' +
          f'best valid: {best_ndcg:.4f} | train ndcg@100: {train_scores[-1]:.4f}')


  return DCG / IDCG


epoch 0 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 1 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 2 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 3 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 4 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 5 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 6 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 7 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 8 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 9 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 10 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 11 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 12 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
epoch 13 | valid ndcg@100: nan | best valid: -inf | train ndcg@100: nan
ep

In [36]:
test_metrics = [{'metric': ndcg, 'k': 100}, {'metric': recall, 'k': 20}, {'metric': recall, 'k': 50}]

final_scores = evaluate(model_best, test_data, test_data, test_metrics)

for metric, score in zip(test_metrics, final_scores):
    print(f"{metric['metric'].__name__}@{metric['k']}:\t{score:.4f}")

NameError: name 'test_data' is not defined