In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box
from copy import deepcopy

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

# 1. 학습 설정

In [2]:
config = {
    'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
    
    'submission_path' : "../submission",
    'submission_name' : 'RecVAE_v1_submission.csv', 

    'model_path' : "../model", # 모델 저장 경로
    'model_name' : 'ex-RecVAE_v10.pt',

    'weight_decay' : 0.01,
    'hidden_dim': 600,
    'latent_dim' : 200,
    'dropout_rate' : 0.6,
    'gamma' : 0.0005,
    'beta' : None,
    'not_alternating' : True,
    'e_num_epochs' : 2,
    'd_num_epochs' : 1,

    'lr' : 0.0005,
    'batch_size' : 500,
    'num_epochs' : 200,
    'num_workers' : 2,

    'valid_samples' : 10,
    'seed' : 22,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

In [3]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

seed_everything(config.seed)

In [4]:
if not os.path.isdir(config.model_path):
    os.mkdir(config.model_path)

In [5]:
if not os.path.isdir(config.submission_path):
    os.mkdir(config.submission_path)

# 2. 데이터 전처리

In [6]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat


In [7]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

# 3. 모델

In [8]:
def swish(x):
    return x.mul(torch.sigmoid(x))

def log_norm_pdf(x, mu, logvar):
    return -0.5*(logvar + np.log(2 * np.pi) + (x - mu).pow(2) / logvar.exp())

class CompositePrior(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, mixture_weights=[3/20, 3/4, 1/10]):
        super(CompositePrior, self).__init__()
        
        self.mixture_weights = mixture_weights
        
        self.mu_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.mu_prior.data.fill_(0)
        
        self.logvar_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.logvar_prior.data.fill_(0)
        
        self.logvar_uniform_prior = nn.Parameter(torch.Tensor(1, latent_dim), requires_grad=False)
        self.logvar_uniform_prior.data.fill_(10)
        
        self.encoder_old = Encoder(hidden_dim, latent_dim, input_dim)
        self.encoder_old.requires_grad_(False)
        
    def forward(self, x, z):

        post_mu, post_logvar = self.encoder_old(x, dropout_rate = 0)

        stnd_prior = log_norm_pdf(z, self.mu_prior, self.logvar_prior)
        post_prior = log_norm_pdf(z, post_mu, post_logvar)
        unif_prior = log_norm_pdf(z, self.mu_prior, self.logvar_uniform_prior)
        
        gaussians = [stnd_prior, post_prior, unif_prior]
        gaussians = [g.add(np.log(w)) for g, w in zip(gaussians, self.mixture_weights)]

        density_per_gaussian = torch.stack(gaussians, dim=-1)

        return torch.logsumexp(density_per_gaussian, dim=-1)

    
class Encoder(nn.Module):
    def __init__(self, hidden_dim, latent_dim, input_dim, eps=1e-1):
        super(Encoder, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.ln3 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.ln4 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc5 = nn.Linear(hidden_dim, hidden_dim)
        self.ln5 = nn.LayerNorm(hidden_dim, eps=eps)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        
    def forward(self, x, dropout_rate):
        norm = x.pow(2).sum(dim=-1).sqrt()
        x = x / norm[:, None]
    
        x = F.dropout(x, p=dropout_rate, training=self.training)
        
        h1 = self.ln1(swish(self.fc1(x)))
        h2 = self.ln2(swish(self.fc2(h1) + h1))
        h3 = self.ln3(swish(self.fc3(h2) + h1 + h2))
        h4 = self.ln4(swish(self.fc4(h3) + h1 + h2 + h3))
        h5 = self.ln5(swish(self.fc5(h4) + h1 + h2 + h3 + h4))
        return self.fc_mu(h5), self.fc_logvar(h5)


class RecVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim = 600, latent_dim = 200):
        super(RecVAE, self).__init__()

        self.encoder = Encoder(hidden_dim, latent_dim, input_dim)
        self.prior = CompositePrior(hidden_dim, latent_dim, input_dim)
        self.decoder = nn.Linear(latent_dim, input_dim)
        
    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5*logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def forward(self, user_ratings, beta=None, gamma=0.005, dropout_rate=0.5, calculate_loss=True):
        mu, logvar = self.encoder(user_ratings, dropout_rate=dropout_rate)    
        z = self.reparameterize(mu, logvar)
        x_pred = self.decoder(z)

        if calculate_loss:
            if gamma:
                norm = user_ratings.sum(dim=-1)
                kl_weight = gamma * norm
            elif beta:
                kl_weight = beta

            mll = (F.log_softmax(x_pred, dim=-1) * user_ratings).sum(dim=-1).mean()
            kld = (log_norm_pdf(z, mu, logvar) - self.prior(user_ratings, z)).sum(dim=-1).mul(kl_weight).mean()
            negative_elbo = -(mll - kld)
            
            return (mll, kld), negative_elbo
            
        else:
            return x_pred

    def update_prior(self):
        self.prior.encoder_old.load_state_dict(deepcopy(self.encoder.state_dict()))

# 4. 학습 함수

In [9]:
def train(model, optimizer, data_loader, make_matrix_data_set, beta, gamma, dropout_rate):
    model.train()
    loss_val = 0
    for users in data_loader:
        mat = make_matrix_data_set.make_matrix(users)
        mat = mat.to(device)
        _, loss = model(user_ratings = mat, beta = beta, gamma = gamma, dropout_rate = dropout_rate)

        optimizer.zero_grad()
        loss_val += loss.item()
        loss.backward()
        optimizer.step()
    
    loss_val /= len(data_loader)

    return loss_val

def get_ndcg(pred_list, true_list):
    idcg = sum((1 / np.log2(rank + 2) for rank in range(1, len(pred_list))))
    dcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            dcg += 1 / np.log2(rank + 2)
    ndcg = dcg / idcg
    return ndcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set, user_list):
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)

            recon_mat = model(mat, calculate_loss = False)
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)

            for user, rec in zip(users, rec_list):
                if user.item() in user_list:
                    uv = user_valid[user.item()]
                    up = rec[-10:].cpu().numpy().tolist()
                    NDCG += get_ndcg(pred_list = up, true_list = uv)
                    HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(user_list)
    HIT /= len(user_list)

    return NDCG, HIT

def predict(model, data_loader, user_train, user_valid, make_matrix_data_set):
    model.eval()
    
    user2rec_list = {}
    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users, train = False)
            mat = mat.to(device)

            recon_mat = model(mat, calculate_loss = False)
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)

            for user, rec in zip(users, rec_list):
                up = rec[-10:].cpu().numpy().tolist()[::-1]
                user2rec_list[user.item()] = up
    
    return user2rec_list

# 5. 학습

In [10]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()

In [11]:
ae_dataset = AEDataSet(
    num_user = make_matrix_data_set.num_user,
    )

In [12]:
data_loader = DataLoader(
    ae_dataset,
    batch_size = config.batch_size, 
    shuffle = True, 
    pin_memory = True,
    num_workers = config.num_workers,
    )

In [13]:
model = RecVAE(
    input_dim = make_matrix_data_set.num_item,
    hidden_dim = config.hidden_dim,
    latent_dim = config.latent_dim).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay = config.weight_decay) # 0.19213

# optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, amsgrad = True) # 0.19194

# optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr) # 0.19188

# optimizer = torch.optim.RAdam(model.parameters(), lr=config.lr) # 0.19211

# optimizer_encoder = torch.optim.Adam(model.encoder.parameters(), lr=config.lr)
# optimizer_decoder = torch.optim.Adam(model.decoder.parameters(), lr=config.lr)

# model.load_state_dict(torch.load(os.path.join(config.model_path, 'RecVAE_v9.pt')))

In [15]:
zero_df = pd.read_csv('zero_df.csv')
user_list = zero_df['user'].tolist()

In [14]:
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    tbar = tqdm(range(1))
    for _ in tbar:
        
        if config.not_alternating:
            train_loss = train(
                    model = model,
                    optimizer = optimizer, 
                    data_loader = data_loader,
                    make_matrix_data_set = make_matrix_data_set,
                    beta = config.beta,
                    gamma = config.gamma, 
                    dropout_rate = config.dropout_rate,
                    )
        
        else:
            for _ in range(config.e_num_epochs):
                train_loss = train(
                        model = model,
                        optimizer = optimizer_encoder, 
                        data_loader = data_loader,
                        make_matrix_data_set = make_matrix_data_set,
                        beta = config.beta,
                        gamma = config.gamma, 
                        dropout_rate = config.dropout_rate,
                        )

            model.update_prior()
            
            for _ in range(config.d_num_epochs):
                train_loss = train(
                        model = model,
                        optimizer = optimizer_decoder, 
                        data_loader = data_loader,
                        make_matrix_data_set = make_matrix_data_set,
                        beta = config.beta,
                        gamma = config.gamma, 
                        dropout_rate = 0.0,
                        )

        ndcg, hit = evaluate(
            model = model,
            data_loader = data_loader,
            user_train = user_train,
            user_valid = user_valid,
            make_matrix_data_set = make_matrix_data_set,
            user_list = user_list,
            )

        if best_hit < hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 1335.84590| NDCG@10: 0.08204| HIT@10: 0.06911: 100%|██████████| 1/1 [00:03<00:00,  3.90s/it]
Epoch:   2| Train loss: 1251.05373| NDCG@10: 0.10178| HIT@10: 0.08690: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]
Epoch:   3| Train loss: 1232.65335| NDCG@10: 0.11217| HIT@10: 0.09521: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]
Epoch:   4| Train loss: 1215.79339| NDCG@10: 0.11662| HIT@10: 0.09984: 100%|██████████| 1/1 [00:04<00:00,  4.01s/it]
Epoch:   5| Train loss: 1198.47177| NDCG@10: 0.11841| HIT@10: 0.10066: 100%|██████████| 1/1 [00:04<00:00,  4.05s/it]
Epoch:   6| Train loss: 1201.99280| NDCG@10: 0.12067| HIT@10: 0.10338: 100%|██████████| 1/1 [00:03<00:00,  3.99s/it]
Epoch:   7| Train loss: 1190.51726| NDCG@10: 0.11961| HIT@10: 0.10181: 100%|██████████| 1/1 [00:03<00:00,  3.82s/it]
Epoch:   8| Train loss: 1197.55256| NDCG@10: 0.12430| HIT@10: 0.10680: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it]
Epoch:   9| Train loss: 1186.50349| NDCG@10: 0.12593| HIT@10: 0.

# 6. 예측

In [20]:
model.load_state_dict(torch.load(os.path.join(config.model_path, config.model_name)))

submission_data_loader = DataLoader(
    ae_dataset,
    batch_size = config.batch_size, 
    shuffle = False, 
    pin_memory = True,
    num_workers = config.num_workers,
    )

user2rec_list = predict(
    model = model, 
    data_loader = submission_data_loader,
    user_train = user_train, 
    user_valid = user_valid, 
    make_matrix_data_set = make_matrix_data_set
    )

submision = []
users = [i for i in range(0, make_matrix_data_set.num_user)]
for user in users:
    rec_item_list = user2rec_list[user]
    for item in rec_item_list:
        submision.append(
            {   
                'user' : make_matrix_data_set.user_decoder[user],
                'item' : make_matrix_data_set.item_decoder[item],
            }
        )

submision = pd.DataFrame(submision)

In [21]:
submision.to_csv(os.path.join(config.submission_path, config.submission_name), index=False)

In [22]:
submision

Unnamed: 0,user,item
0,11,31696
1,11,5218
2,11,32587
3,11,2617
4,11,3156
...,...,...
313595,138493,551
313596,138493,33615
313597,138493,2012
313598,138493,1270
