In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')

# 1. 학습 설정

In [2]:
config = {
    'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
    
    'submission_path' : "../submission",
    'submission_name' : 'multi-DAE_submission.csv', 

    'model_path' : "../model", # 모델 저장 경로
    'model_name' : 'Multi-DAE_v1.pt',

    'p_dims': [250, 500, 1000], 
    'dropout_rate' : 0.5,
    'weight_decay' : 0.00,
    'valid_samples' : 10, # 검증에 사용할 sample 수
    'seed' : 22,

    'lr' : 0.001,
    'batch_size' : 128,
    'num_epochs' : 200,
    'num_workers' : 2,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

In [None]:
if not os.path.isdir(config.model_path):
    os.mkdir(config.model_path)

In [None]:
if not os.path.isdir(config.submission_path):
    os.mkdir(config.submission_path)

# 2. 데이터 전처리

In [3]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))

        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat


In [4]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

# 3. 모델

In [5]:
class MultiDAE(nn.Module):
    """
    Container module for Multi-DAE.

    Multi-DAE : Denoising Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814
    """

    def __init__(self, p_dims, dropout_rate = 0.5):
        super(MultiDAE, self).__init__()
        self.p_dims = p_dims
        self.q_dims = p_dims[::-1]

        self.dims = self.q_dims + self.p_dims[1:]
        self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.dims[:-1], self.dims[1:])])
        self.drop = nn.Dropout(dropout_rate)
        
        self.init_weights()
    
    def forward(self, input):
        h = F.normalize(input)
        h = self.drop(h)

        for i, layer in enumerate(self.layers):
            h = layer(h)
            if i != len(self.layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        for layer in self.layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

# 4. 학습 함수

In [6]:
def train(model, criterion, optimizer, data_loader, make_matrix_data_set):
    model.train()
    loss_val = 0
    for users in data_loader:
        mat = make_matrix_data_set.make_matrix(users)
        mat = mat.to(device)
        recon_mat = model(mat)

        optimizer.zero_grad()
        loss = criterion(recon_mat, mat)

        loss_val += loss.item()

        loss.backward()
        optimizer.step()
    
    loss_val /= len(data_loader)

    return loss_val

def get_ndcg(pred_list, true_list):
    ndcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            ndcg += 1 / np.log2(rank + 2)
    return ndcg

# 대회 메트릭인 recall과 동일 
def get_hit(pred_list, true_list):
    hit = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            hit += 1
    hit /= len(true_list)
    return hit

def evaluate(model, user_train, user_valid, make_matrix_data_set):
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    num_user_sample = 1000
    users = np.random.randint(0, make_matrix_data_set.num_user, num_user_sample) # 1000개만 sampling 하여 evaluation
    
    for user in users:
        ut = user_train[user]
        uv = user_valid[user]
        mat = make_matrix_data_set.make_matrix(torch.tensor([user]))
        mat = mat.to(device)
        
        with torch.no_grad():
            recon_mat = model(mat)
            recon_mat = recon_mat[0]
            recon_mat[ut] = torch.min(recon_mat) - 987654321
        
        rec_list = recon_mat.argsort()[-10:].cpu().numpy().tolist()

        NDCG += get_ndcg(pred_list = rec_list, true_list = uv)
        HIT += get_hit(pred_list = rec_list, true_list = uv)

    NDCG /= num_user_sample
    HIT /= num_user_sample

    return NDCG, HIT

def predict(model, user_train, user_valid, make_matrix_data_set):
    model.eval()

    user2rec_list = {}
    users = [i for i in range(0, make_matrix_data_set.num_user)]

    for user in users:
        ut = user_train[user]
        uv = user_valid[user]

        mat = make_matrix_data_set.make_matrix(torch.tensor([user]), train = False)
        mat = mat.to(device)

        with torch.no_grad():
            recon_mat = model(mat)
            recon_mat = recon_mat[0]
            recon_mat[ut + uv] = torch.min(recon_mat) - 987654321
        
        rec_list = recon_mat.argsort()[-10:].cpu().numpy().tolist()

        user2rec_list[user] = rec_list
    
    return user2rec_list

def loss_function_dae(recon_x, x):
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    return BCE

# 5. 학습

In [7]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()

In [8]:
ae_dataset = AEDataSet(
    num_user = make_matrix_data_set.num_user,
    )

In [9]:
data_loader = DataLoader(
    ae_dataset,
    batch_size = config.batch_size, 
    shuffle = True, 
    pin_memory = True,
    num_workers = config.num_workers,
    )

In [10]:
model = MultiDAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item], 
    dropout_rate = config.dropout_rate).to(device)

criterion = loss_function_dae
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay = config.weight_decay)

In [11]:
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(
            model = model, 
            criterion = criterion, 
            optimizer = optimizer, 
            data_loader = data_loader,
            make_matrix_data_set = make_matrix_data_set
            )
        
        ndcg, hit = evaluate(
            model = model, 
            user_train = user_train,
            user_valid = user_valid,
            make_matrix_data_set = make_matrix_data_set,
            )

        if best_hit < hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 1159.40209| NDCG@10: 0.54092| HIT@10: 0.13130: 100%|██████████| 1/1 [00:03<00:00,  3.51s/it]
Epoch:   2| Train loss: 1108.23961| NDCG@10: 0.60969| HIT@10: 0.14600: 100%|██████████| 1/1 [00:03<00:00,  3.55s/it]
Epoch:   3| Train loss: 1093.80492| NDCG@10: 0.60090| HIT@10: 0.14610: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]
Epoch:   4| Train loss: 1085.24679| NDCG@10: 0.67897| HIT@10: 0.16540: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
Epoch:   5| Train loss: 1078.74146| NDCG@10: 0.66173| HIT@10: 0.16130: 100%|██████████| 1/1 [00:03<00:00,  3.22s/it]
Epoch:   6| Train loss: 1073.29819| NDCG@10: 0.68738| HIT@10: 0.16960: 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
Epoch:   7| Train loss: 1068.93453| NDCG@10: 0.66086| HIT@10: 0.15800: 100%|██████████| 1/1 [00:03<00:00,  3.29s/it]
Epoch:   8| Train loss: 1065.27909| NDCG@10: 0.71685| HIT@10: 0.17140: 100%|██████████| 1/1 [00:03<00:00,  3.52s/it]
Epoch:   9| Train loss: 1061.09031| NDCG@10: 0.68261| HIT@10: 0.

# 7. 예측

In [12]:
model.load_state_dict(torch.load(os.path.join(config.model_path, config.model_name)))

user2rec_list = predict(
    model = model, 
    user_train = user_train, 
    user_valid = user_valid, 
    make_matrix_data_set = make_matrix_data_set
    )

submision = []
users = [i for i in range(0, make_matrix_data_set.num_user)]
for user in users:
    rec_item_list = user2rec_list[user]
    for item in rec_item_list:
        submision.append(
            {   
                'user' : make_matrix_data_set.user_decoder[user],
                'item' : make_matrix_data_set.item_decoder[item],
            }
        )

submision = pd.DataFrame(submision)

In [18]:
submision.to_csv(os.path.join(config.submission_path, config.submission_name), index=False)