# DATA

In [10]:
import argparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from scipy import sparse

# Hyper-parameters and runtime options.
# They are declared through argparse but parsed from an empty argv below, so
# inside the notebook every option takes its default value.
parser = argparse.ArgumentParser(
    description='PyTorch Variational Autoencoders for Collaborative Filtering')

# (flag, keyword arguments for add_argument), registered in this order.
_OPTION_SPECS = [
    ('--data', dict(type=str, default='data/train/',
                    help='Movielens dataset location')),
    ('--lr', dict(type=float, default=1e-4,
                  help='initial learning rate')),
    ('--wd', dict(type=float, default=0.00,
                  help='weight decay coefficient')),
    ('--batch_size', dict(type=int, default=500,
                          help='batch size')),
    ('--epochs', dict(type=int, default=20,
                      help='upper epoch limit')),
    ('--total_anneal_steps', dict(type=int, default=200000,
                                  help='the total number of gradient updates for annealing')),
    ('--anneal_cap', dict(type=float, default=0.2,
                          help='largest annealing parameter')),
    ('--seed', dict(type=int, default=1111,
                    help='random seed')),
    ('--cuda', dict(action='store_true',
                    help='use CUDA')),
    ('--log_interval', dict(type=int, default=100, metavar='N',
                            help='report interval')),
    ('--save', dict(type=str, default='model.pt',
                    help='path to save the final model')),
]
for _flag, _spec in _OPTION_SPECS:
    parser.add_argument(_flag, **_spec)

args = parser.parse_args([])

# Seed PyTorch's RNG manually for reproducibility.
torch.manual_seed(args.seed)

# Use the GPU whenever one is available, regardless of the --cuda flag.
args.cuda = args.cuda or torch.cuda.is_available()

device = torch.device("cuda" if args.cuda else "cpu")
device

device(type='cuda')

In [11]:
import os
import pandas as pd
from scipy import sparse
import numpy as np

def get_count(tp, id):
    """Count how many rows of ``tp`` carry each distinct value of column ``id``.

    Args:
        tp: DataFrame of interactions.
        id: Name of the column to count by (e.g. 'user' or 'item').

    Returns:
        A pandas Series indexed by the distinct values of ``tp[id]`` whose
        values are the number of rows for that value.
    """
    # Group with the default as_index=True. Since pandas 1.1,
    # groupby(..., as_index=False).size() returns a DataFrame rather than a
    # Series, which breaks callers that index the result by id
    # (e.g. ``count.index[count >= k]`` in filter_triplets).
    return tp.groupby(id).size()

# Keeps only the interactions of sufficiently active users / popular items
# (at least min_uc interactions per user, at least min_sc per item).
# With the current dataset this ends up returning the data unchanged.
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rare items and inactive users, then recount what is left.

    Args:
        tp: DataFrame with at least the columns 'user' and 'item'.
        min_uc: Minimum number of interactions a user must have to be kept;
            values <= 0 disable the user filter.
        min_sc: Minimum number of interactions an item must have to be kept;
            values <= 0 disable the item filter.

    Returns:
        (filtered frame, per-user counts, per-item counts), the counts being
        whatever get_count returns for the filtered frame.
    """
    # Items are filtered first, so user counts are taken after rare items
    # have already been removed.
    if min_sc > 0:
        item_counts = get_count(tp, 'item')
        kept_items = item_counts.index[item_counts >= min_sc]
        tp = tp[tp['item'].isin(kept_items)]

    if min_uc > 0:
        user_counts = get_count(tp, 'user')
        kept_users = user_counts.index[user_counts >= min_uc]
        tp = tp[tp['user'].isin(kept_users)]

    return tp, get_count(tp, 'user'), get_count(tp, 'item')

# Splits off the interactions used to validate a trained model: for a user
# with e.g. 100 actions, a test_prop fraction is held out so we can check
# whether the model is able to predict them.
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's interactions into a kept part and a held-out part.

    Users with at least 5 interactions have ``int(test_prop * n)`` randomly
    chosen rows moved to the held-out frame; users with fewer interactions
    stay entirely in the training frame.

    Args:
        data: DataFrame with a 'user' column, one row per interaction.
        test_prop: Fraction of each eligible user's rows to hold out.

    Returns:
        (data_tr, data_te). Either frame is empty (but keeps the columns of
        ``data``) when no rows fall into it.

    Note:
        Reseeds NumPy's global RNG with 98765 so the split is reproducible.
    """
    tr_list, te_list = [], []

    np.random.seed(98765)

    for _, group in data.groupby('user'):
        n_items_u = len(group)

        if n_items_u >= 5:
            held_out = np.zeros(n_items_u, dtype='bool')
            n_test = int(test_prop * n_items_u)
            held_out[np.random.choice(n_items_u, size=n_test, replace=False)] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

    # pd.concat raises on an empty list (no eligible user, or empty input),
    # so fall back to an empty slice that keeps the input's columns.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

def numerize(tp, profile2id, show2id):
    """Translate the 'user' / 'item' columns of ``tp`` through lookup tables.

    Args:
        tp: DataFrame with the columns 'user' and 'item'.
        profile2id: Mapping applied to every value of ``tp['user']``.
        show2id: Mapping applied to every value of ``tp['item']``.

    Returns:
        DataFrame with the columns 'uid' and 'sid', sharing the index of ``tp``.

    Raises:
        KeyError: if a user or item is missing from its mapping.
    """
    def to_uid(raw_user):
        return profile2id[raw_user]

    def to_sid(raw_item):
        return show2id[raw_item]

    encoded = {
        'uid': tp['user'].apply(to_uid),
        'sid': tp['item'].apply(to_sid),
    }
    return pd.DataFrame(data=encoded, columns=['uid', 'sid'])

In [12]:
print("Load and Preprocess Movielens dataset")

# Read the raw interaction log; the first CSV line is the header.
DATA_DIR = args.data
ratings_csv = os.path.join(DATA_DIR, 'train_ratings.csv')
raw_data = pd.read_csv(ratings_csv, header=0)
print("원본 데이터\n", raw_data)

# Keep only users with at least 5 interactions; items are not filtered.
# Every user in the provided training data already has 5 or more
# interactions, so the frame comes back unchanged.
raw_data, user_activity, item_popularity = filter_triplets(
    raw_data, min_uc=5, min_sc=0)
print("5번 이상의 리뷰가 있는 유저들로만 구성된 데이터\n", raw_data)

print("유저별 리뷰수\n", user_activity)
print("아이템별 리뷰수\n", item_popularity)

Load and Preprocess Movielens dataset
원본 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
5번 이상의 리뷰가 있는 유저들로만 구성된 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
유저별 리뷰수
 user
11        376
1

In [13]:
# Shuffle the user ids with a fixed seed so the split below is reproducible.
unique_uid = user_activity.index
print("(BEFORE) unique_uid:", unique_uid)
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:", unique_uid)

n_users = unique_uid.size  # 31360 in the recorded run
n_heldout_users = 0

# Train / validation split over users. With n_heldout_users = 0 every user is
# a training user and the validation slice is empty; the separate test slice
# is not used in this notebook.
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2):]

# Note: these are numbers of users, not numbers of interactions.
print("훈련 데이터에 사용될 사용자 수:", len(tr_users))
print("검증 데이터에 사용될 사용자 수:", len(vd_users))




(BEFORE) unique_uid: Int64Index([    11,     14,     18,     25,     31,     35,     43,     50,
                58,     60,
            ...
            138459, 138461, 138470, 138471, 138472, 138473, 138475, 138486,
            138492, 138493],
           dtype='int64', name='user', length=31360)
(AFTER) unique_uid: Int64Index([ 27968,  67764,   2581,  82969, 137831,  48639,  97870,  40424,
             46835,  79570,
            ...
            114284,   9009,  21165,  33920,  22054, 135379, 125855,  41891,
             15720,  17029],
           dtype='int64', name='user', length=31360)
훈련 데이터에 사용될 사용자 수: 31360
검증 데이터에 사용될 사용자 수: 0


In [15]:
class DataLoader():
    '''
    Load the preprocessed Movielens dataset.

    Reads the files found under ``<path>/pro_sg``: ``unique_sid.txt`` (one item
    id per line) and the ``train.csv`` / ``<split>_tr.csv`` / ``<split>_te.csv``
    interaction files, each with the columns ``uid`` and ``sid``.
    '''
    def __init__(self, path):
        self.pro_dir = os.path.join(path, 'pro_sg')
        assert os.path.exists(self.pro_dir), "Preprocessed files do not exist. Run data.py"

        self.n_items = self.load_n_items()

    def load_data(self, datatype='train'):
        '''
        Return the interaction matrix (or matrices) of one split.

        'train' yields a single CSR matrix; 'validation' and 'test' yield a
        (fold-in, held-out) pair of CSR matrices. Any other value raises
        ValueError.
        '''
        if datatype == 'train':
            return self._load_train_data()
        if datatype in ('validation', 'test'):
            return self._load_tr_te_data(datatype)
        raise ValueError("datatype should be in [train, validation, test]")

    def load_n_items(self):
        '''Return the number of lines in ``unique_sid.txt``.'''
        sid_file = os.path.join(self.pro_dir, 'unique_sid.txt')
        with open(sid_file, 'r') as f:
            return sum(1 for _ in f)

    def _load_train_data(self):
        '''Build the (max uid + 1) x n_items training matrix with a 1 per (uid, sid) row.'''
        interactions = pd.read_csv(os.path.join(self.pro_dir, 'train.csv'))
        user_idx, item_idx = interactions['uid'], interactions['sid']
        n_users = user_idx.max() + 1

        return sparse.csr_matrix((np.ones_like(user_idx),
                                 (user_idx, item_idx)), dtype='float64',
                                 shape=(n_users, self.n_items))

    def _load_tr_te_data(self, datatype='test'):
        '''
        Build the fold-in and held-out matrices of a validation/test split.

        Row indices are the uids shifted so that the smallest uid appearing in
        either file becomes row 0; both matrices share the same shape.
        '''
        tp_tr = pd.read_csv(os.path.join(self.pro_dir, '{}_tr.csv'.format(datatype)))
        tp_te = pd.read_csv(os.path.join(self.pro_dir, '{}_te.csv'.format(datatype)))

        first_uid = min(tp_tr['uid'].min(), tp_te['uid'].min())
        last_uid = max(tp_tr['uid'].max(), tp_te['uid'].max())
        shape = (last_uid - first_uid + 1, self.n_items)

        def to_matrix(frame):
            rows = frame['uid'] - first_uid
            cols = frame['sid']
            return sparse.csr_matrix((np.ones_like(rows),
                                     (rows, cols)), dtype='float64', shape=shape)

        return to_matrix(tp_tr), to_matrix(tp_te)

# model

In [16]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np



class MultiDAE(nn.Module):
    """
    Container module for Multi-DAE.

    Multi-DAE : Denoising Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814

    Args:
        p_dims: Layer sizes of the decoder, from the latent size to the
            number of items (a list).
        q_dims: Layer sizes of the encoder, from the number of items to the
            latent size. Defaults to ``p_dims`` reversed.
        dropout: Dropout probability applied to the normalized input.
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiDAE, self).__init__()
        self.p_dims = p_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Encoder and decoder form one stack; the latent size appears once.
        self.dims = self.q_dims + self.p_dims[1:]
        self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.dims[:-1], self.dims[1:])])
        self.drop = nn.Dropout(dropout)

        self.init_weights()

    def forward(self, input):
        """Return unnormalized item scores (logits) for a batch of user rows."""
        h = F.normalize(input)
        h = self.drop(h)

        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            h = layer(h)
            if i != last:
                # torch.tanh instead of F.tanh, which warns as deprecated on
                # PyTorch 1.x; the output layer stays linear.
                h = torch.tanh(h)
        return h

    def init_weights(self):
        """Re-initialize every linear layer (Xavier-style weights, tiny biases)."""
        for layer in self.layers:
            self._init_linear(layer)

    @staticmethod
    def _init_linear(layer):
        """Draw weights from N(0, 2/(fan_in+fan_out)) and biases from N(0, 0.001^2)."""
        fan_out, fan_in = layer.weight.size()
        std = np.sqrt(2.0 / (fan_in + fan_out))
        layer.weight.data.normal_(0.0, std)
        layer.bias.data.normal_(0.0, 0.001)



class MultiVAE(nn.Module):
    """
    Container module for Multi-VAE.

    Multi-VAE : Variational Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814

    Args:
        p_dims: Layer sizes of the decoder (p-network), from the latent size
            to the number of items (a list).
        q_dims: Layer sizes of the encoder (q-network), from the number of
            items to the latent size. Defaults to ``p_dims`` reversed.
        dropout: Dropout probability applied to the normalized input.
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiVAE, self).__init__()
        self.p_dims = p_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Last dimension of q- network is for mean and variance
        temp_q_dims = self.q_dims[:-1] + [self.q_dims[-1] * 2]
        self.q_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:])])
        self.p_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.p_dims[:-1], self.p_dims[1:])])

        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def forward(self, input):
        """Return ``(item logits, mu, logvar)`` for a batch of user rows."""
        mu, logvar = self.encode(input)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def encode(self, input):
        """Map user rows to the mean and log-variance of the latent Gaussian."""
        h = F.normalize(input)
        h = self.drop(h)

        last = len(self.q_layers) - 1
        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != last:
                # torch.tanh instead of F.tanh (deprecated on PyTorch 1.x).
                h = torch.tanh(h)

        # The final encoder layer is twice the latent size: first half is the
        # mean, second half the log-variance.
        latent_dim = self.q_dims[-1]
        return h[:, :latent_dim], h[:, latent_dim:]

    def reparameterize(self, mu, logvar):
        """Sample z = mu + eps * std while training; return mu in eval mode."""
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def decode(self, z):
        """Map latent codes to unnormalized item scores (logits)."""
        h = z
        last = len(self.p_layers) - 1
        for i, layer in enumerate(self.p_layers):
            h = layer(h)
            if i != last:
                h = torch.tanh(h)
        return h

    def init_weights(self):
        """Re-initialize encoder layers, then decoder layers."""
        for layer in self.q_layers:
            self._init_linear(layer)
        for layer in self.p_layers:
            self._init_linear(layer)

    @staticmethod
    def _init_linear(layer):
        """Draw weights from N(0, 2/(fan_in+fan_out)) and biases from N(0, 0.001^2)."""
        fan_out, fan_in = layer.weight.size()
        std = np.sqrt(2.0 / (fan_in + fan_out))
        layer.weight.data.normal_(0.0, std)
        layer.bias.data.normal_(0.0, 0.001)




def loss_function_vae(recon_x, x, mu, logvar, anneal=1.0):
    """Multi-VAE objective: reconstruction term plus annealed KL term.

    Args:
        recon_x: Item logits, one row per user.
        x: Interaction matrix with the same shape as ``recon_x``.
        mu: Latent means.
        logvar: Latent log-variances.
        anneal: Weight of the KL term.

    Returns:
        Scalar tensor ``BCE + anneal * KLD``, both terms averaged over the batch.
    """
    # Reconstruction term: negative log-softmax of the logits, weighted by x
    # (kept under the original name BCE).
    log_probs = F.log_softmax(recon_x, 1)
    per_user_ll = (log_probs * x).sum(-1)
    BCE = -per_user_ll.mean()

    # KL divergence between N(mu, exp(logvar)) and the standard normal.
    per_user_kl = (1 + logvar - mu.pow(2) - logvar.exp()).sum(dim=1)
    KLD = -0.5 * per_user_kl.mean()

    return BCE + anneal * KLD

def loss_function_dae(recon_x, x):
    """Multi-DAE objective: batch mean of the x-weighted negative log-softmax.

    Args:
        recon_x: Item logits, one row per user.
        x: Interaction matrix with the same shape as ``recon_x``.

    Returns:
        Scalar tensor holding the reconstruction loss.
    """
    log_probs = F.log_softmax(recon_x, 1)
    per_user_ll = (log_probs * x).sum(-1)
    return -per_user_ll.mean()





In [17]:
def sparse2torch_sparse(data):
    """
    Convert scipy sparse matrix to torch sparse tensor with L2 Normalization
    This is much faster than naive use of torch.FloatTensor(data.toarray())
    https://discuss.pytorch.org/t/sparse-tensor-use-cases/22047/2

    Every stored entry of row ``r`` gets the value ``1 / sqrt(sum of row r)``;
    the stored values themselves are not used, so this is an L2 normalization
    only for a binary (0/1) matrix.

    Args:
        data: scipy sparse matrix of shape (samples, features).

    Returns:
        float32 sparse COO tensor of the same shape.
    """
    samples, features = data.shape
    coo_data = data.tocoo()

    # One 2 x nnz index array; building a tensor from a Python list of
    # ndarrays is slow.
    indices = torch.from_numpy(np.vstack((coo_data.row, coo_data.col))).long()

    # Per-row scale, looked up for every stored entry by fancy indexing rather
    # than through a per-element Python dict. Empty rows produce inf here but
    # are never looked up, so the divide-by-zero warning is silenced.
    row_sums = np.asarray(data.sum(1)).ravel()
    with np.errstate(divide='ignore'):
        row_norms_inv = 1 / np.sqrt(row_sums)
    values = torch.from_numpy(row_norms_inv[coo_data.row]).float()

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor.
    return torch.sparse_coo_tensor(indices, values, size=(samples, features))

def naive_sparse2tensor(data):
    """Densify a scipy sparse matrix and return it as a float32 torch tensor."""
    dense = data.toarray()
    return torch.tensor(dense, dtype=torch.float32)

In [18]:
# Read the preprocessed interaction matrices from <args.data>/pro_sg.
loader = DataLoader(args.data)
n_items = loader.load_n_items()

train_data = loader.load_data('train')
vad_data_tr, vad_data_te = loader.load_data('validation')

# N is the number of rows (users) of the training matrix; idxlist holds the
# row indices 0 .. N-1.
N, _ = train_data.shape
idxlist = [*range(N)]


# Infer

In [10]:
# Multi VAE

In [33]:
# Load the ensemble members. Each file is unpickled straight into a callable
# model, so the model classes (presumably MultiVAE / MultiDAE - confirm) must
# already be defined in this kernel.
# SECURITY: torch.load unpickles arbitrary objects - only load trusted files.
# map_location=device lets checkpoints saved on another device load here.

# Multi-VAE
model1 = torch.load('vae_adamW1000.pt', map_location=device)
model2 = torch.load('vae_1000.pt', map_location=device)
model3 = torch.load('vae_adamW1000drop1.pt', map_location=device)
model4 = torch.load('vae_adamW1000drop3.pt', map_location=device)
model5 = torch.load('vae_adamW1000drop7.pt', map_location=device)
model6 = torch.load('vae_adamW1000drop9.pt', map_location=device)

# NOTE(review): this cell used to load 'vae_recall.pt' into model7 as well and
# then immediately overwrote it with 'dae_200.pt', so that checkpoint never
# took part in the ensemble. The dead load is dropped here; if 'vae_recall.pt'
# was meant to be included, it needs its own model / result name downstream.

# Multi-DAE
model7 = torch.load('dae_200.pt', map_location=device)
model8 = torch.load('dae_200drop1.pt', map_location=device)
model9 = torch.load('dae_200drop3.pt', map_location=device)
model10 = torch.load('dae_200drop7.pt', map_location=device)
model11 = torch.load('dae_200drop9.pt', map_location=device)
   

In [25]:
# Dense float32 copy of the full training matrix, moved to the compute device.
x = naive_sparse2tensor(train_data)
x = x.to(device)

In [35]:
def _item_scores(model, batch):
    """Put ``model`` in eval mode and return only its item-score tensor.

    MultiVAE.forward returns ``(scores, mu, logvar)`` while MultiDAE.forward
    returns the scores directly. Indexing a DAE output with ``[0]`` would
    silently pick the first user's row, so a tuple output is unpacked and a
    tensor output is passed through unchanged.
    """
    model.eval()
    with torch.no_grad():
        output = model(batch)
    return output[0] if isinstance(output, tuple) else output


result1 = _item_scores(model1, x)
result2 = _item_scores(model2, x)
result3 = _item_scores(model3, x)
result4 = _item_scores(model4, x)
result5 = _item_scores(model5, x)
result6 = _item_scores(model6, x)
result7 = _item_scores(model7, x)
result8 = _item_scores(model8, x)
result9 = _item_scores(model9, x)
result10 = _item_scores(model10, x)
result11 = _item_scores(model11, x)

In [36]:
# Sanity check: print the shape of every model's score tensor, in model order.
for scores in (result1, result2, result3, result4, result5, result6,
               result7, result8, result9, result10, result11):
    print(scores.shape)

torch.Size([31360, 6807])
torch.Size([31360, 6807])
torch.Size([31360, 6807])
torch.Size([31360, 6807])
torch.Size([31360, 6807])
torch.Size([31360, 6807])
torch.Size([31360, 6807])


# output 정규화

In [37]:
# Min-max rescale the scores of model 1 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result1.min()
v_max = result1.max()
result1 = (result1 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result1

tensor([[0.7910, 0.7551, 0.7706,  ..., 0.7046, 0.5837, 0.5908],
        [0.8467, 0.8224, 0.8079,  ..., 0.7061, 0.7035, 0.6650],
        [0.8427, 0.8135, 0.8207,  ..., 0.7237, 0.7062, 0.6394],
        ...,
        [0.8078, 0.8179, 0.7771,  ..., 0.6717, 0.5504, 0.6610],
        [0.7908, 0.7800, 0.8185,  ..., 0.6929, 0.5745, 0.6440],
        [0.7696, 0.8106, 0.8512,  ..., 0.7201, 0.5696, 0.5377]],
       device='cuda:0')

In [38]:
# Min-max rescale the scores of model 2 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result2.min()
v_max = result2.max()
result2 = (result2 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result2

tensor([[0.6798, 0.6535, 0.6778,  ..., 0.5371, 0.2648, 0.3229],
        [0.8053, 0.7918, 0.7567,  ..., 0.5725, 0.5999, 0.5601],
        [0.7701, 0.7755, 0.7608,  ..., 0.6799, 0.5367, 0.3841],
        ...,
        [0.6537, 0.6936, 0.6717,  ..., 0.3450, 0.3216, 0.6149],
        [0.6609, 0.6460, 0.7198,  ..., 0.3853, 0.2826, 0.4845],
        [0.6189, 0.6902, 0.8163,  ..., 0.4951, 0.3547, 0.3612]],
       device='cuda:0')

In [39]:
# Min-max rescale the scores of model 3 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result3.min()
v_max = result3.max()
result3 = (result3 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result3

tensor([[0.7782, 0.7516, 0.7483,  ..., 0.7594, 0.5533, 0.5695],
        [0.8060, 0.7887, 0.7649,  ..., 0.7152, 0.7201, 0.6253],
        [0.7648, 0.8112, 0.7748,  ..., 0.7366, 0.6736, 0.5640],
        ...,
        [0.7873, 0.8049, 0.7671,  ..., 0.6996, 0.5901, 0.8081],
        [0.8076, 0.8245, 0.8416,  ..., 0.7532, 0.6884, 0.7444],
        [0.7606, 0.7442, 0.8321,  ..., 0.7015, 0.6090, 0.5995]],
       device='cuda:0')

In [40]:
# Min-max rescale the scores of model 4 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result4.min()
v_max = result4.max()
result4 = (result4 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result4

tensor([[0.7308, 0.7021, 0.7305,  ..., 0.6473, 0.5230, 0.5266],
        [0.8360, 0.8451, 0.7881,  ..., 0.6751, 0.7219, 0.5953],
        [0.8138, 0.8162, 0.8129,  ..., 0.7134, 0.6735, 0.5586],
        ...,
        [0.7784, 0.7871, 0.7693,  ..., 0.6528, 0.5909, 0.7060],
        [0.7790, 0.7457, 0.8139,  ..., 0.7289, 0.6071, 0.6409],
        [0.7406, 0.7608, 0.8540,  ..., 0.6894, 0.5778, 0.5290]],
       device='cuda:0')

In [41]:
# Min-max rescale the scores of model 5 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result5.min()
v_max = result5.max()
result5 = (result5 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result5

tensor([[0.7358, 0.7258, 0.7312,  ..., 0.6614, 0.5040, 0.4604],
        [0.8269, 0.8038, 0.7962,  ..., 0.6633, 0.7242, 0.6226],
        [0.7836, 0.7989, 0.7900,  ..., 0.6897, 0.6792, 0.5043],
        ...,
        [0.7401, 0.7412, 0.7376,  ..., 0.5724, 0.5102, 0.6985],
        [0.7284, 0.7292, 0.7758,  ..., 0.5283, 0.4907, 0.6115],
        [0.7198, 0.7670, 0.8314,  ..., 0.6188, 0.5854, 0.4937]],
       device='cuda:0')

In [42]:
# Min-max rescale the scores of model 6 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result6.min()
v_max = result6.max()
result6 = (result6 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result6

tensor([[0.6791, 0.6888, 0.7233,  ..., 0.4972, 0.2939, 0.3071],
        [0.7846, 0.8123, 0.7337,  ..., 0.6083, 0.6679, 0.5012],
        [0.7541, 0.8050, 0.7790,  ..., 0.6160, 0.5581, 0.4460],
        ...,
        [0.6726, 0.7423, 0.6645,  ..., 0.5035, 0.4056, 0.6071],
        [0.6723, 0.6883, 0.7396,  ..., 0.4781, 0.4146, 0.5234],
        [0.6757, 0.7140, 0.8499,  ..., 0.5322, 0.5213, 0.5190]],
       device='cuda:0')

In [43]:
# Min-max rescale the scores of model 7 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result7.min()
v_max = result7.max()
result7 = (result7 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result7

tensor([[0.7426, 0.7408, 0.7405,  ..., 0.6508, 0.4493, 0.5023],
        [0.8357, 0.8178, 0.7856,  ..., 0.7047, 0.7072, 0.6083],
        [0.7881, 0.7897, 0.7764,  ..., 0.7325, 0.6273, 0.5341],
        ...,
        [0.7194, 0.7621, 0.7782,  ..., 0.6867, 0.5889, 0.7020],
        [0.7421, 0.7395, 0.8040,  ..., 0.6485, 0.5620, 0.5680],
        [0.7266, 0.7603, 0.8437,  ..., 0.7287, 0.5075, 0.5316]],
       device='cuda:0')

In [22]:
# Min-max rescale the scores of model 8 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result8.min()
v_max = result8.max()
result8 = (result8 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result8

tensor([[0.5174, 0.5233, 0.5277,  ..., 0.4878, 0.2163, 0.3043],
        [0.7271, 0.6403, 0.6408,  ..., 0.5180, 0.5048, 0.3221],
        [0.6214, 0.6303, 0.5800,  ..., 0.4588, 0.4804, 0.2725],
        ...,
        [0.5285, 0.5674, 0.6016,  ..., 0.3661, 0.2225, 0.4797],
        [0.4962, 0.4425, 0.4812,  ..., 0.3456, 0.2679, 0.3384],
        [0.4916, 0.5581, 0.6838,  ..., 0.3574, 0.2765, 0.2702]],
       device='cuda:0')

In [23]:
# Min-max rescale the scores of model 9 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result9.min()
v_max = result9.max()
result9 = (result9 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result9

tensor([[0.5804, 0.5328, 0.5659,  ..., 0.4702, 0.3303, 0.2837],
        [0.7012, 0.6692, 0.6575,  ..., 0.4621, 0.4729, 0.4200],
        [0.6473, 0.7465, 0.7072,  ..., 0.5101, 0.5630, 0.3991],
        ...,
        [0.5895, 0.5926, 0.5979,  ..., 0.5561, 0.2957, 0.5078],
        [0.5660, 0.4532, 0.6307,  ..., 0.3474, 0.2875, 0.4228],
        [0.5445, 0.6152, 0.7248,  ..., 0.4918, 0.3212, 0.3037]],
       device='cuda:0')

In [24]:
# Min-max rescale the scores of model 10 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result10.min()
v_max = result10.max()
result10 = (result10 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result10

tensor([[0.5967, 0.5726, 0.5712,  ..., 0.4994, 0.2789, 0.2163],
        [0.7272, 0.7175, 0.6604,  ..., 0.5550, 0.5762, 0.4719],
        [0.6609, 0.6638, 0.6423,  ..., 0.5675, 0.4998, 0.2913],
        ...,
        [0.5112, 0.6484, 0.5624,  ..., 0.4025, 0.3289, 0.4473],
        [0.5577, 0.5479, 0.6242,  ..., 0.4080, 0.2303, 0.3395],
        [0.5230, 0.5768, 0.7345,  ..., 0.4962, 0.3706, 0.2604]],
       device='cuda:0')

In [25]:
# Min-max rescale the scores of model 11 to [new_min, new_max] = [0, 1], using
# the minimum / maximum over the whole score tensor.
new_min, new_max = 0, 1
v_min = result11.min()
v_max = result11.max()
result11 = (result11 - v_min) / (v_max - v_min) * (new_max - new_min) + new_min
result11

tensor([[0.6076, 0.5683, 0.5926,  ..., 0.4253, 0.2755, 0.2345],
        [0.7380, 0.7303, 0.6682,  ..., 0.5544, 0.6255, 0.4841],
        [0.6859, 0.6823, 0.6608,  ..., 0.4962, 0.4859, 0.3407],
        ...,
        [0.6187, 0.6543, 0.6092,  ..., 0.2284, 0.3425, 0.5767],
        [0.5329, 0.5672, 0.6721,  ..., 0.3893, 0.2466, 0.3429],
        [0.5335, 0.6228, 0.7472,  ..., 0.4779, 0.3031, 0.3229]],
       device='cuda:0')

In [26]:
# Ensemble by summing the rescaled score matrices with equal weight,
# accumulating left to right in model order.
result = result1 + result2
for member_scores in (result3, result4, result5, result6, result7,
                      result8, result9, result10, result11):
    result = result + member_scores

In [45]:
# Display the summed ensemble scores (one row per user, one column per item).
result

tensor([[4.3591, 4.2660, 4.3738,  ..., 3.6984, 2.6188, 2.7101],
        [4.9352, 4.8932, 4.6682,  ..., 3.9300, 4.1247, 3.5525],
        [4.7524, 4.7987, 4.7398,  ..., 4.1552, 3.7809, 3.0665],
        ...,
        [4.3719, 4.5442, 4.3983,  ..., 3.4321, 2.9676, 3.9895],
        [4.3735, 4.3289, 4.6716,  ..., 3.4621, 2.9315, 3.4723],
        [4.2512, 4.5028, 5.0465,  ..., 3.7843, 3.1164, 2.9722]],
       device='cuda:0')

In [51]:
# Save the summed ensemble score matrix to disk.
torch.save(result, 'result.pt')

In [19]:
# Load the saved user-item score matrices.
# NOTE(review): this rebinds result1 / result2 (previously per-model score
# tensors) to the contents of two saved files, while the recommendation cell
# below reads `result`, not these names - confirm which tensor is meant to be
# used and how 'pby_result.pt' should enter the ensemble.
import torch
result1 = torch.load('result.pt')
result2 = torch.load('pby_result.pt')

In [27]:
# Despite their names, both tables map a position (matrix column / row index)
# to the id stored at that position in unique_sid / unique_uid.
show2id = dict(enumerate(unique_sid))
profile2id = dict(enumerate(unique_uid))

In [28]:
from tqdm.notebook import tqdm

# Push already-seen items out of the ranking: every entry set in x is lowered
# by 200 per unit, far more than the summed [0, 1]-scaled scores can reach.
j = result - x * 200

# Column indices of the 10 highest-scoring items per user (ascending score).
# Moved to the CPU once so the loop below does not convert device tensor
# elements one by one.
n = j.sort()[1][:, -10:].cpu()

# Flatten to parallel user / item lists, covering every row of n instead of a
# hard-coded user count.
sub_u, sub_i = [], []
for target_u, target_i in enumerate(tqdm(n.tolist())):
    for target in target_i:
        sub_u.append(profile2id[target_u])
        sub_i.append(show2id[target])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=31360.0), HTML(value='')))




In [29]:
from tqdm.notebook import tqdm

# Flatten the per-user top-item index matrix `n` into parallel user / item
# lists. NOTE(review): this repeats the loop at the end of the previous cell;
# one of the two can be removed.
# n.tolist() converts all indices to Python ints in one go, and enumerate
# covers every row of n instead of a hard-coded user count.
sub_u, sub_i = [], []
for target_u, target_i in enumerate(tqdm(n.tolist())):
    for target in target_i:
        sub_u.append(profile2id[target_u])
        sub_i.append(show2id[target])

In [30]:
# Column name -> values for the submission table (one row per recommendation).
submission = dict(user=sub_u, item=sub_i)

In [31]:
# Build the submission table and order its rows by user id.
submission_df = pd.DataFrame(submission).sort_values('user')

In [33]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission_last.csv', index=False)