In [28]:
import argparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
import pandas as pd
from scipy import sparse
import torch.nn.functional as F
import bottleneck as bn

In [29]:
def naive_sparse2tensor(data):
    return torch.FloatTensor(data.toarray())

In [30]:

class DataLoader():
    '''
    Load Movielens dataset
    '''
    def __init__(self, path):
        
        self.pro_dir = os.path.join(path, 'pro_sg')
        #error 설정
        assert os.path.exists(self.pro_dir), "Preprocessed files do not exist. Run data.py"
        # load_n_items 를 통해 이전에 저장해뒀던 아이템의 랜덤 순서 불러옴
        self.n_items = self.load_n_items()
    
    def load_data(self, datatype='train'):
        if datatype == 'train':
            return self._load_train_data()
        elif datatype == 'validation':
            return self._load_tr_te_data(datatype)
        elif datatype == 'test':
            return self._load_tr_te_data(datatype)
        elif datatype == 'sub':
            return self._load_train_data(datatype)
        else:
            raise ValueError("datatype should be in [train, validation, test, submission]")
    
    # self.n_items 에 이전에 저장해뒀던 아이템의 랜덤순서 가져다주는 함수
    def load_n_items(self):
        unique_sid = list()
        with open(os.path.join(self.pro_dir, 'unique_sub_sid.txt'), 'r') as f:
            for line in f:
                unique_sid.append(line.strip())
        n_items = len(unique_sid)
        return n_items
    
    def _load_train_data(self, datatype = 'train'):
        path = os.path.join(self.pro_dir, f'{datatype}.csv')
        
        tp = pd.read_csv(path)
        n_users = tp['uid'].max() + 1

        rows, cols = tp['uid'], tp['sid']
        #compressed sparse row matrix로 변환하기 (희소행렬을 다른식으로 변환하여 저장하는 방법)
        data = sparse.csr_matrix((np.ones_like(rows),
                                 (rows, cols)), dtype='float64',
                                 shape=(n_users, self.n_items))
        return data
    
    def _load_tr_te_data(self, datatype='test'):
        tr_path = os.path.join(self.pro_dir, '{}_tr.csv'.format(datatype))
        te_path = os.path.join(self.pro_dir, '{}_te.csv'.format(datatype))

        tp_tr = pd.read_csv(tr_path)
        tp_te = pd.read_csv(te_path)

        start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
        end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

        rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
        rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

        data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                                    (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, self.n_items))
        data_te = sparse.csr_matrix((np.ones_like(rows_te),
                                    (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, self.n_items))
        return data_tr, data_te

In [31]:
def predict(model, data_tr, is_VAE=False):
    model.eval()
    global update_count
    items = []
    with torch.no_grad():
        for start_idx in range(data_tr.shape[0]):
            data = data_tr[start_idx]
            data_tensor = naive_sparse2tensor(data).to(device)            
            # Multi-VAE
            if is_VAE :        
                recon_batch, mu, logvar = model(data_tensor)
            # Multi-DAE
            else:
                recon_batch = model(data_tensor)
            recon_batch = recon_batch.cpu().numpy()
            recon_batch[data.nonzero()] = -np.inf
            
            for rec in recon_batch:
                up = np.argpartition(rec, -10)[-10:].tolist()
                items.extend(up)
    return items

In [32]:

class DataLoader():
    '''
    Load Movielens dataset
    '''
    def __init__(self, path):
        
        self.pro_dir = os.path.join(path, 'pro_sg')
        #error 설정
        assert os.path.exists(self.pro_dir), "Preprocessed files do not exist. Run data.py"
        # load_n_items 를 통해 이전에 저장해뒀던 아이템의 랜덤 순서 불러옴
        self.n_items = self.load_n_items()
    
    def load_data(self, datatype='train'):
        if datatype == 'train':
            return self._load_train_data()
        elif datatype == 'validation':
            return self._load_tr_te_data(datatype)
        elif datatype == 'test':
            return self._load_tr_te_data(datatype)
        elif datatype == 'sub':
            return self._load_train_data(datatype)
        else:
            raise ValueError("datatype should be in [train, validation, test, submission]")
    
    # self.n_items 에 이전에 저장해뒀던 아이템의 랜덤순서 가져다주는 함수
    def load_n_items(self):
        unique_sid = list()
        with open(os.path.join(self.pro_dir, 'unique_sub_sid.txt'), 'r') as f:
            for line in f:
                unique_sid.append(line.strip())
        n_items = len(unique_sid)
        return n_items
    
    def _load_train_data(self, datatype = 'train'):
        path = os.path.join(self.pro_dir, f'{datatype}.csv')
        
        tp = pd.read_csv(path)
        n_users = tp['uid'].max() + 1

        rows, cols = tp['uid'], tp['sid']
        #compressed sparse row matrix로 변환하기 (희소행렬을 다른식으로 변환하여 저장하는 방법)
        data = sparse.csr_matrix((np.ones_like(rows),
                                 (rows, cols)), dtype='float64',
                                 shape=(n_users, self.n_items))
        return data
    
    def _load_tr_te_data(self, datatype='test'):
        tr_path = os.path.join(self.pro_dir, '{}_tr.csv'.format(datatype))
        te_path = os.path.join(self.pro_dir, '{}_te.csv'.format(datatype))

        tp_tr = pd.read_csv(tr_path)
        tp_te = pd.read_csv(te_path)

        start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
        end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

        rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
        rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

        data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                                    (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, self.n_items))
        data_te = sparse.csr_matrix((np.ones_like(rows_te),
                                    (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, self.n_items))
        return data_tr, data_te

In [33]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np


#이미 완성된 MultiDAE(denoising auto encoder)의 코드를 참고하여 그 아래 MultiVAE의 코드를 완성해보세요!
class MultiDAE(nn.Module):
    """
    Container module for Multi-DAE.

    Multi-DAE : Denoising Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiDAE, self).__init__()
        #p_dims, q_dims 는 input, output dimension 리스트
        #p_dims = [200, 600, 6807]
        self.p_dims = p_dims
        #q_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        # q_dims 없으면 p_dims 순서 뒤집어서 사용
        else:
            self.q_dims = p_dims[::-1]
        # 항목이 5개가 되게 함
        self.dims = self.q_dims + self.p_dims[1:]
        # nn.Sequential 과 비슷한함수로, Module 여러개 담아놓는 역할
        # nn.Linear(6807, 600), nn.Linear(600,200),nn.Linear(200, 600),nn.Linear(600,6807)
        self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.dims[:-1], self.dims[1:])])
        self.drop = nn.Dropout(dropout)
        
        self.init_weights()
    
    def forward(self, input):
        #input 정규화 (이유 : 학습 속도 높이고, Local optimum에 빠지게 하지 않기 위해)
        #input dropout 으로 몇가닥 끊기(과적합 방지)
        h = F.normalize(input)
        h = self.drop(h)
        #nn.Module에 저장해 뒀던 Linear 함수 적용
        for i, layer in enumerate(self.layers):
            h = layer(h)
            # 마지막 항에서는 tanh로 activation function 적용
            if i != len(self.layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        # 가중치 초기화 하는 함수 
        for layer in self.layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            # 가중치 함수 초기화 (평균=0 , 표준편차 = std)
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)


def loss_function_dae(recon_x, x):
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    return BCE



# TODO
# 다양한 VAE의 코드를 다음 코드를 확인한 뒤에, 아래코드에 맞춰서 직접 작성해보는 연습을 해보세요!
# https://github.com/AntixK/PyTorch-VAE
class MultiVAE(nn.Module):
    """
    Container module for Multi-VAE.

    Multi-VAE : Variational Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiVAE, self).__init__()
        # init 부분은 Multi DAE 와 동일
        self.p_dims = p_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Last dimension of q- network is for mean and variance
        temp_q_dims = self.q_dims[:-1] + [self.q_dims[-1] * 2]
        # encoder 용 : q_layers는 p_dims 뒤집고 마지막항 한번더 연산추가한 Linear layer 들의 결합
        self.q_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:])])
        # decoder 용 : p_layer는 p_dims 그대로 사용한 Linear layer 들의 결합으로
        self.p_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.p_dims[:-1], self.p_dims[1:])])
        
        self.drop = nn.Dropout(dropout)
        self.init_weights()
    
    # 인풋 -> 인코더 ->  파라미터 재정비 -> 디코더 -> 아웃풋 + 인코더 결과물(mu, logvar)
    def forward(self, input):
        mu, logvar = self.encode(input)
        z = self.reparameterize(mu, logvar)
        h = self.decode(z)
        return h, mu, logvar
    
    def encode(self, input):
        h = F.normalize(input)
        h = self.drop(h)
        #인코더에서는 MultiDAE 처럼 linear layer 돌림
        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != len(self.q_layers) - 1:
                h = F.tanh(h)
            else:
                # mu : 평균
                # logvar : log 분산 (표준편차가 음수가 되지 않기 위한 연산)
                # 처음 값들은 평균(mu)로 보내고 나머지 값들은 분산으로 보냄
                # h 는 [항목수, linear 로 변환된 dim]
                mu = h[:, :self.q_dims[-1]]
                logvar = h[:, self.q_dims[-1]:]
                # 이후 reparameterize 에서 연산처리함
        return mu, logvar

    # training 과정에서 역전파를 수행할 수 있도록 재매개변수화 함수를 따로 생성했다고 함.
    def reparameterize(self, mu, logvar):
        # 학습중일 때는 평균 중심으로 분산 흩뿌려서 제출
        if self.training:
            #logvar로 표준편차 계산
            std = torch.exp(0.5 * logvar)
            # std를 정규분포 값으로 초기화한 eps
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        # 아닐 때는 그냥 평균값 배출
        else:
            return mu

    def decode(self, z):
        h = z
        for i, layer in enumerate(self.p_layers):
            h = layer(h)
            if i != len(self.p_layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        for layer in self.q_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)
        
        for layer in self.p_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)



def loss_function_vae(recon_x, x, mu, logvar, anneal=1.0):
    # Loss function은 BCE와 KLD 사용
    # KL annealing 을 통해 Regularization 부여
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))
    # anneal 값을 0에서부터 특정 값까지 선형적으로 증가시켜 
    # 학습 초기에 reconstruction term을 강조하여 보다 효율적인 학습 도모함.
    return BCE + anneal * KLD




In [34]:

model = torch.load('MultiDAE.pt') 

In [35]:
## 각종 파라미터 세팅
parser = argparse.ArgumentParser(description='PyTorch Variational Autoencoders for Collaborative Filtering')


parser.add_argument('--data', type=str, default='',
                    help='Movielens dataset location')

parser.add_argument('--lr', type=float, default=1e-4,
                    help='initial learning rate')
parser.add_argument('--wd', type=float, default=0.00,
                    help='weight decay coefficient')
parser.add_argument('--batch_size', type=int, default=500,
                    help='batch size')
parser.add_argument('--epochs', type=int, default=20,
                    help='upper epoch limit')
parser.add_argument('--total_anneal_steps', type=int, default=200000,
                    help='the total number of gradient updates for annealing')
parser.add_argument('--anneal_cap', type=float, default=0.2,
                    help='largest annealing parameter')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log_interval', type=int, default=100, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args([])

# Set the random seed manually for reproductibility.
torch.manual_seed(args.seed)

#만약 GPU가 사용가능한 환경이라면 GPU를 사용
if torch.cuda.is_available():
    args.cuda = True

device = torch.device("cuda" if args.cuda else "cpu")
device

device(type='cuda')

In [36]:
loader = DataLoader(args.data)

In [37]:
sub_data = loader.load_data('sub')

In [81]:
user2rec_list = predict(model, sub_data[user2id["5c944530817bf510af65740e"]], is_VAE=False)



In [76]:
id2item = pd.read_csv("id2item.csv")
id2user = pd.read_csv("id2user.csv")

In [77]:
user2id = id2user.set_index('2').to_dict()['1']
id2user = id2user.set_index('1').to_dict()['2']
id2item = id2item.set_index('1').to_dict()['2']

In [None]:
user2rec_list = [id2item[x] for x in user2rec_list]

In [88]:
from multivae import multivae_model
aa = multivae_model
aa.pred()

TypeError: pred() missing 1 required positional argument: 'self'