In [1]:
import os
import time
import numpy as np
import scipy.sparse as sp
from datetime import timedelta

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


### 1. Data

In [2]:
def data_load(dir_path):
    """Load the train/valid/test interaction lists and build sparse user-item matrices.

    Args:
        dir_path: Directory containing ``train_list.npy``, ``valid_list.npy`` and
            ``test_list.npy``. Each array is indexed as ``[:, 0]`` (user id) and
            ``[:, 1]`` (item id).

    Returns:
        Tuple ``(train_data, valid_y_data, test_y_data, n_user, n_item)`` where the
        first three are ``float64`` CSR matrices of shape ``(n_user, n_item)`` holding
        1 for every listed (user, item) pair (duplicate pairs are summed by scipy),
        and ``n_user`` / ``n_item`` are ``max id + 1``.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
    # can execute code. Only load files from a trusted source.
    train_list, valid_list, test_list = (
        np.load(os.path.join(dir_path, f'{split}_list.npy'), allow_pickle=True)
        for split in ('train', 'valid', 'test')
    )

    # The matrix dimensions must cover every split: an id that only occurs in
    # valid/test would otherwise make csr_matrix raise "index exceeds matrix dimensions".
    uid_max = 0
    iid_max = 0
    for pairs in (train_list, valid_list, test_list):
        if len(pairs) == 0:
            continue
        uid_max = max(uid_max, int(pairs[:, 0].max()))
        iid_max = max(iid_max, int(pairs[:, 1].max()))

    n_user = uid_max + 1
    n_item = iid_max + 1
    print(f'user num: {n_user}')
    print(f'item num: {n_item}')

    def _to_csr(pairs):
        # Build a (n_user, n_item) indicator matrix from an array of (uid, iid) rows.
        if len(pairs) == 0:
            return sp.csr_matrix((n_user, n_item), dtype='float64')
        rows = pairs[:, 0].astype(np.int64)
        cols = pairs[:, 1].astype(np.int64)
        values = np.ones(len(pairs), dtype='float64')
        return sp.csr_matrix((values, (rows, cols)), dtype='float64',
                             shape=(n_user, n_item))

    train_data = _to_csr(train_list)
    valid_y_data = _to_csr(valid_list)  # validation ground truth
    test_y_data = _to_csr(test_list)    # test ground truth

    return train_data, valid_y_data, test_y_data, n_user, n_item


In [3]:
class DataMacridVAE(Dataset):
    """Minimal ``Dataset`` that serves ``dataset[index]`` for each index."""

    def __init__(self, dataset):
        # Stored as-is; it only needs to support len() and integer indexing.
        self.data = dataset

    def __len__(self):
        """Number of entries in the wrapped container."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the entry stored at ``index``."""
        return self.data[index]

### 2. Model

In [4]:
class MacridVAE(nn.Module):
    """Macro-disentangled VAE over a user's item-interaction vector.

    Items are softly assigned to ``args.num_concepts`` prototypes; for every user and
    every concept an encoder produces a Gaussian latent, and the decoder mixes the
    per-concept item scores back into one log-probability vector over items.

    Args:
        args: Object exposing ``num_concepts`` and ``dim_item`` and, optionally,
            ``dropout``. When ``args.dropout`` is present and not ``None`` it
            overrides the ``dropout`` argument (an explicit ``0.0`` is honoured).
        num_items: Number of items (width of an input batch).
        dropout: Default input dropout probability.
    """

    def __init__(self, args, num_items, dropout=0.5):
        super(MacridVAE, self).__init__()
        self.args = args
        # Compare against None rather than truthiness so that dropout=0.0 is not
        # silently replaced by the default; getattr keeps plain namespaces working.
        configured_dropout = getattr(args, 'dropout', None)
        if configured_dropout is not None:
            dropout = configured_dropout
        self.num_items = num_items

        # Concept prototypes. They are compared with item embeddings by cosine
        # similarity, so both share the embedding dimension ``dim_item``.
        self.proto_type = nn.Parameter(torch.Tensor(args.num_concepts, args.dim_item))
        self.emb_items = nn.Parameter(torch.Tensor(num_items, args.dim_item))

        # Layer 1 maps the (masked) interaction vector to a ``dim_item`` embedding,
        # layer 2 outputs mean and log-variance (hence ``dim_item * 2``). The latent
        # must have the item-embedding dimension because the decoder compares it
        # with ``emb_items``.
        dims_encoder = [num_items, args.dim_item, args.dim_item * 2]
        self.dropout = nn.Dropout(dropout)
        layers = []
        for d_in, d_out in zip(dims_encoder[:-1], dims_encoder[1:]):
            layers.append(nn.Linear(d_in, d_out))
            layers.append(nn.Tanh())
        # Drop the trailing activation: the last layer's output is used linearly.
        self.encoder = nn.Sequential(*layers[:-1])

        self.init_weights()

    def init_weights(self):
        """Xavier-initialise linear weights and embeddings; small-normal biases."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight.data)
                if m.bias is not None:
                    nn.init.normal_(m.bias.data, mean=0.0, std=0.001)

        nn.init.xavier_normal_(self.proto_type)
        nn.init.xavier_normal_(self.emb_items)

    def compute_cosin_sim(self, mat1, mat2, tau=0.1, eps=1e-8):
        """Row-wise cosine similarity between two 2-D matrices, divided by ``tau``.

        Args:
            mat1: Tensor of shape ``(n, d)``.
            mat2: Tensor of shape ``(m, d)``.
            tau: Temperature the similarities are divided by.
            eps: Lower bound applied to row norms to avoid division by zero.

        Returns:
            Tensor of shape ``(n, m)``.
        """
        norm_mat1 = mat1.norm(dim=1)[:, None]
        norm_mat2 = mat2.norm(dim=1)[:, None]
        mat1_normed = mat1 / norm_mat1.clamp(min=eps)
        mat2_normed = mat2 / norm_mat2.clamp(min=eps)
        cos_sim = torch.einsum('ix, jx -> ij', mat1_normed, mat2_normed)

        return cos_sim / tau  # shape: (mat1.shape[0], mat2.shape[0])

    def Encode(self, batch):
        """Encode each user's interactions into one latent vector per concept.

        Args:
            batch: Tensor of shape ``(batch_size, num_items)``.

        Returns:
            Tuple ``(latent, concept_mask, mu, log_var)`` with shapes
            ``(batch_size * num_concepts, dim_item)``,
            ``(batch_size, num_concepts, num_items)``,
            ``(batch_size * num_concepts, dim_item)`` and
            ``(batch_size * num_concepts, dim_item)``.
        """
        batch_size = batch.shape[0]

        # Item-to-concept logits, shape: (num_items, num_concepts).
        probs = self.compute_cosin_sim(self.emb_items, self.proto_type)
        if self.training:
            # Soft Gumbel-softmax sample. Original author's note: the paper uses
            # hard (one-hot) samples whereas this implementation uses soft ones.
            concept_mask = F.gumbel_softmax(logits=probs, hard=False)
        else:
            # At evaluation time no sampling is done; the plain softmax is used.
            concept_mask = F.softmax(probs, dim=-1)

        # Repeat the (num_concepts, num_items) assignment for every user in the batch.
        concept_mask = concept_mask.t().expand(batch_size, self.args.num_concepts, self.num_items)
        # shape: (batch_size, num_concepts, num_items)

        batch = self.dropout(batch)
        # Weight each user's interaction vector by every concept's item assignment,
        # splitting the interactions into num_concepts (soft) groups.
        batch = batch.reshape(batch_size, 1, self.num_items) * concept_mask
        batch = F.normalize(batch, dim=-1)  # L2-normalise each per-concept vector
        # shape: (batch_size, num_concepts, num_items)

        # Flatten users and concepts so the encoder, the cosine decoder and the
        # losses can treat every (user, concept) pair as one row.
        batch = batch.reshape(batch_size * self.args.num_concepts, self.num_items)

        h = self.encoder(batch)  # shape: (batch_size * num_concepts, dim_item * 2)

        # First half of the encoder output is the mean, second half the log-variance.
        mu, log_var = h[:, :self.args.dim_item], h[:, self.args.dim_item:]
        latent = self.gaussian_sampling(mu, log_var)
        return latent, concept_mask, mu, log_var

    def Decode(self, latent, concept_mask):
        """Turn per-concept latents into log-probabilities over items.

        Args:
            latent: Tensor of shape ``(batch_size * num_concepts, dim_item)``.
            concept_mask: Tensor of shape ``(batch_size, num_concepts, num_items)``.

        Returns:
            Tensor of shape ``(batch_size, num_items)`` whose rows are
            log-softmax normalised.
        """
        batch_size = latent.shape[0] // self.args.num_concepts
        logits = self.compute_cosin_sim(latent, self.emb_items)  # (batch_size * num_concepts, num_items)
        # Unnormalised per-concept item scores for every user.
        probs_concept = torch.exp(logits).reshape(batch_size, self.args.num_concepts, self.num_items)
        # Weight each concept's scores by the item-to-concept assignment and mix.
        batch_hat_concept = probs_concept * concept_mask  # (batch_size, num_concepts, num_items)
        batch_hat = torch.sum(batch_hat_concept, dim=1)   # (batch_size, num_items)
        # The first log turns the mixed (unnormalised) scores back into logits;
        # log_softmax then normalises them over items.
        batch_hat = torch.log(batch_hat)
        batch_hat = F.log_softmax(batch_hat, dim=-1)
        return batch_hat

    def gaussian_sampling(self, mu, log_var):
        """Reparameterised sample in training mode; the mean in evaluation mode."""
        if self.training:
            std = torch.exp(0.5 * log_var)
            noise = torch.randn_like(std)
            return mu + std * noise
        else:
            return mu

    def forward(self, batch):
        """Run encoder and decoder and compute both loss terms.

        Args:
            batch: Tensor of shape ``(batch_size, num_items)``.

        Returns:
            Tuple ``(batch_hat, loss_recon, loss_kl)``.
        """
        latent, concept_mask, mu, log_var = self.Encode(batch)
        batch_hat = self.Decode(latent, concept_mask)
        loss_recon, loss_kl = self.compute_loss(batch, batch_hat, mu, log_var)
        return batch_hat, loss_recon, loss_kl

    def compute_loss(self, batch, batch_hat, mu, log_var):
        """Return ``(loss_recon, loss_kl)`` as scalar tensors.

        ``loss_recon`` is the negative log-likelihood of the interactions
        (``-batch_hat * batch`` summed over items, averaged over rows of ``batch``).
        ``loss_kl`` is the KL divergence of the diagonal Gaussian ``(mu, log_var)``
        from a standard normal, summed over latent dimensions and averaged over the
        rows of ``mu`` (one row per user-concept pair).
        """
        loss_recon = torch.mean(torch.sum(-batch_hat * batch, dim=1))
        loss_kl = torch.mean(0.5 * torch.sum(torch.exp(log_var) + mu ** 2 - 1 - log_var, dim=1))
        return loss_recon, loss_kl

### 3. Train

#### 3.1. utils

In [5]:

def train_one_epoch(args, model, optimizer, dataloader, update_count_vae):
    """Run one optimisation pass over ``dataloader``.

    Args:
        args: Provides ``device``, ``beta`` (upper bound of the KL weight) and
            ``batch_size`` (used in the annealing schedule).
        model: Callable returning ``(prediction, loss_recon, loss_kl)`` for a batch.
        optimizer: Optimizer stepping the model parameters.
        dataloader: Sized iterable of input batches.
        update_count_vae: Number of optimisation steps taken before this epoch.

    Returns:
        Tuple ``(mean loss per batch, updated step count)``.
    """
    model.train()
    running_loss = 0.0

    for step_batch in dataloader:
        step_batch = step_batch.to(args.device)
        optimizer.zero_grad()

        # The model computes both loss terms itself; the prediction is unused here.
        _, recon_term, kl_term = model(step_batch)

        # KL weight grows linearly with the step count and is capped at args.beta.
        kl_weight = min(args.beta, update_count_vae / (500 * args.batch_size))
        step_loss = recon_term + kl_weight * kl_term

        step_loss.backward()
        optimizer.step()

        running_loss += step_loss.item()
        update_count_vae += 1

    mean_loss = running_loss / len(dataloader)
    return mean_loss, update_count_vae


def recall_kth(preds, labels, k=100):
    """Per-user Recall@k from already-selected top-k item indices.

    Args:
        preds: Integer index tensor of shape ``(num_users, k)`` holding the
            recommended item ids for every user.
        labels: Tensor of shape ``(num_users, num_items)`` with 1 at relevant items.
        k: Cut-off used to cap the denominator.

    Returns:
        1-D tensor of length ``num_users``: hits divided by ``min(k, #relevant)``;
        users without any relevant item get 0.
    """
    preds = torch.as_tensor(preds, device=labels.device)
    # One row index per *user* (labels.shape[0]); it broadcasts against preds so
    # that labels[rows, preds][u, j] == labels[u, preds[u, j]].
    rows = torch.arange(labels.shape[0], device=labels.device).view(-1, 1)

    hits = torch.sum(labels[rows, preds], dim=1)
    denom = torch.sum(labels, dim=1).clamp(max=k)
    recall = hits / denom
    recall[torch.isnan(recall)] = 0  # 0 / 0 for users with no relevant items
    return recall


def compute_metric(target_items, predict_items, topK):
    """Compute Precision, Recall, nDCG and MRR at every cut-off in ``topK``.

    Args:
        target_items: Per-user lists of relevant item ids.
        predict_items: Per-user ranked lists of recommended item ids (best first).
        topK: Iterable of cut-offs ``k``.

    Returns:
        Tuple ``(precisions, recalls, ndcgs, mrrs)``; each is a list with one value
        per ``k``, rounded to 4 decimals. Users without relevant items contribute 0
        to every sum but are still counted in the denominator
        (``len(predict_items)``).
    """
    precisions, recalls, ndcgs, mrrs = [], [], [], []
    num_users = len(predict_items)
    denom = max(num_users, 1)  # avoid ZeroDivisionError when there are no users

    # Set membership is O(1); testing against the raw list was O(len(targets))
    # for every inspected rank.
    target_sets = [set(target_items[user_id]) for user_id in range(num_users)]

    for k in topK:
        sum_precision = sum_recall = sum_ndcg = sum_mrr = 0.0
        for user_id in range(num_users):
            num_targets = len(target_items[user_id])
            if num_targets == 0:
                continue
            relevant = target_sets[user_id]

            num_hit = 0
            dcg = 0.0
            user_mrr = 0.0
            # Slicing tolerates ranked lists shorter than k.
            for rank_idx, item in enumerate(predict_items[user_id][:k]):
                if item in relevant:
                    num_hit += 1
                    dcg += 1.0 / np.log2(rank_idx + 2)
                    if user_mrr == 0.0:
                        # Reciprocal rank of the first hit only.
                        user_mrr = 1.0 / (rank_idx + 1.0)

            # Ideal DCG@k: the best achievable list puts min(k, num_targets) relevant
            # items first. Summing over *all* targets would cap nDCG@k below 1
            # whenever a user has more than k relevant items.
            idcg = sum(1.0 / np.log2(rank_idx + 2)
                       for rank_idx in range(min(k, num_targets)))

            sum_precision += num_hit / k
            sum_recall += num_hit / num_targets
            sum_ndcg += dcg / idcg
            sum_mrr += user_mrr

        precisions.append(round(float(sum_precision / denom), 4))
        recalls.append(round(float(sum_recall / denom), 4))
        ndcgs.append(round(float(sum_ndcg / denom), 4))
        mrrs.append(round(float(sum_mrr / denom), 4))

    return precisions, recalls, ndcgs, mrrs


def evaluate(args, model, loader, label_items: sp.csr_matrix, consumed_items: sp.csr_matrix):
    """Rank items for every user and score the ranking against ``label_items``.

    Args:
        args: Hyper-parameters; ``device``, ``batch_size`` and ``topK`` are read.
        model: Trained model; called as ``model(x)`` and expected to return a
            3-tuple whose first element is the ``(batch, num_items)`` score tensor.
        loader: Iterable of input batches. Row offsets into ``consumed_items`` are
            derived as ``batch_idx * args.batch_size``, so batches must come in
            user-id order (no shuffling) with that batch size.
        label_items: Ground truth of shape ``(num_users, num_items)``; non-zero
            entries mark target items.
        consumed_items: Interactions already used for training; these items are
            excluded from the ranking.

    Returns:
        ``(precisions, recalls, ndcgs, mrrs)`` as returned by ``compute_metric``,
        one value per cut-off in ``args.topK``.
    """
    model.eval()
    num_user = label_items.shape[0]

    # nonzero() on a sparse row returns (row idx, col idx); the column indices are
    # the target item ids of that user.
    target_items = [label_items[user_id, :].nonzero()[1].tolist()
                    for user_id in range(num_user)]

    # Rank as deep as the largest cut-off, without assuming args.topK is sorted.
    max_k = max(args.topK)
    predict_items = []

    with torch.no_grad():
        for batch_idx, x_0 in enumerate(loader):
            start_batch_user_id = batch_idx * args.batch_size
            end_batch_user_id = start_batch_user_id + len(x_0)
            batch_consumed_items = consumed_items[start_batch_user_id:end_batch_user_id]

            x_0 = x_0.to(args.device)
            prediction, _, _ = model(x_0)

            # Mask already-consumed items so they can never enter the top-k.
            # scipy returns int32 numpy arrays; convert them to long tensors on the
            # prediction's device before using them as an index.
            rows, cols = batch_consumed_items.nonzero()
            row_idx = torch.as_tensor(rows, dtype=torch.long, device=prediction.device)
            col_idx = torch.as_tensor(cols, dtype=torch.long, device=prediction.device)
            prediction[row_idx, col_idx] = -np.inf

            _, indices = torch.topk(prediction, max_k)  # shape: (len(x_0), max_k)
            predict_items.extend(indices.detach().cpu().numpy().tolist())

    precisions, recalls, ndcgs, mrrs = compute_metric(target_items, predict_items, args.topK)

    return precisions, recalls, ndcgs, mrrs


def print_metric_results(topK, results):
    """Print one line per metric with its value at every cut-off in ``topK``.

    ``results`` is indexed as ``results[metric_index][k_index]`` with the metrics
    ordered Precision, Recall, nDCG, MRR.
    """
    metric_names = ('Precision', 'Recall', 'nDCG', 'MRR')
    for row, metric in enumerate(metric_names):
        cells = []
        for col, k in enumerate(topK):
            label = '{}@{}'.format(metric, k)
            cells.append('    {:14s}: {:.4f}'.format(label, results[row][col]))
        print(''.join(cells))


class dotdict(dict):
    """dot.notation access to dictionary attributes"""

    def __getattr__(self, name):
        # Missing keys yield None instead of raising, exactly like dict.get.
        return self.get(name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value as a dictionary item.
        self[name] = value

    def __delattr__(self, name):
        # Deleting an unknown attribute raises KeyError, like deleting a dict key.
        del self[name]

#### 3.2. main

In [6]:
dict_args = {}
args = dotdict(dict_args)

# Training hyper-parameters
args.dataset_name = 'ml-1m_clean'
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
args.batch_size = 500
args.lr = 1e-3
args.weight_decay = 0.
args.epochs = 500
args.topK = [10, 20, 50, 100]
args.beta = 0.2  # upper bound of the KL annealing weight (value taken from MultVAE)
args.patience = 200
args.freq_metric = 20
args.seed = 42

# VAE hyper-parameters
args.num_concepts = 7
args.dim_item = 100

# Seed the RNGs so weight init, batch shuffling and latent sampling are repeatable.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

dir_path = os.path.join(os.getcwd(), args.dataset_name)
sp_train, sp_valid, sp_test, num_users, num_items = data_load(dir_path)
# Author's note: following the paper's own setting reproduces its reported
# performance; this notebook reuses the DiffuRec setting instead.
train_dataset = DataMacridVAE(torch.FloatTensor(sp_train.toarray()))
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, pin_memory=True, shuffle=True)
# Evaluation feeds the *training* interactions in user-id order (no shuffling) and
# scores the resulting ranking against the validation / test ground truth.
test_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False)

# Build MacridVAE
model = MacridVAE(args, num_items).to(args.device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

update_count_vae = 0
best_metric, best_epoch = -100, 0
best_test_result = None
print("Start training")
for epoch in range(args.epochs):
    if epoch - best_epoch >= args.patience:  # early stopping
        print('-'*18)
        print('Exiting from training early')
        break
    start = time.time()
    avg_loss, update_count_vae = train_one_epoch(args, model, optimizer, train_loader, update_count_vae)
    print(f'Epoch {epoch+1:>3} -  train loss: {avg_loss: >10.4f},  time: {str(timedelta(seconds=int(time.time() - start)))}')

    if (epoch+1) % args.freq_metric == 0:
        val_results = evaluate(args, model, test_loader, sp_valid, sp_train)
        test_results = evaluate(args, model, test_loader, sp_test, sp_train)

        val_recalls = val_results[1]
        if val_recalls[1] > best_metric:  # model selection metric: Recall@20
            best_metric, best_epoch, best_test_result = val_recalls[1], epoch, test_results
            print('  Update Best')

        print('  Validation data')
        print_metric_results(args.topK, val_results)
        print('  Test data')
        print_metric_results(args.topK, test_results)

print('#'*106)
print("Test Metirc At Best Valid Metric")
if best_test_result is None:
    # No evaluation epoch was reached (args.epochs < args.freq_metric), so there
    # is no result to print; indexing None would raise a TypeError.
    print('  No evaluation was run.')
else:
    print_metric_results(args.topK, best_test_result)
print('#'*106)

user num: 5949
item num: 2810
Start training
Epoch   1 -  train loss:   552.2536,  time: 0:00:01
Epoch   2 -  train loss:   535.7915,  time: 0:00:00
Epoch   3 -  train loss:   501.6090,  time: 0:00:00
Epoch   4 -  train loss:   495.8006,  time: 0:00:00
Epoch   5 -  train loss:   491.9161,  time: 0:00:00
Epoch   6 -  train loss:   489.7462,  time: 0:00:00
Epoch   7 -  train loss:   486.7854,  time: 0:00:00
Epoch   8 -  train loss:   484.2441,  time: 0:00:00
Epoch   9 -  train loss:   481.8403,  time: 0:00:00
Epoch  10 -  train loss:   478.8764,  time: 0:00:00
Epoch  11 -  train loss:   474.9220,  time: 0:00:00
Epoch  12 -  train loss:   470.5658,  time: 0:00:00
Epoch  13 -  train loss:   467.7573,  time: 0:00:00
Epoch  14 -  train loss:   464.3559,  time: 0:00:00
Epoch  15 -  train loss:   461.4891,  time: 0:00:00
Epoch  16 -  train loss:   458.6869,  time: 0:00:00
Epoch  17 -  train loss:   456.2356,  time: 0:00:00
Epoch  18 -  train loss:   454.3573,  time: 0:00:00
Epoch  19 -  train 