<a href="https://colab.research.google.com/github/SeongBeomLEE/RecsysTutorial/blob/main/CDAE/CDAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 해당 모델에 bpr loss 적용될 수 없을지 생각해보기 -> objective function으로 사용 가능

In [1]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings
import time
# from sklearn.model_selection import train_test_split

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

In [2]:
def set_seeds(seed: int = 42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True


In [3]:
set_seeds()

# 1. 학습 설정

In [4]:
config = {
'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
'save_dir': './checkpoint/',
'saved_file_name':'best_model.pt',
'p_dims': [50, 700],
'dropout_rate' : 0.8,
'weight_decay' : 0.01,
'valid_samples' : 10, # 검증에 사용할 sample 수
'patience' : 30,
'lr' : 0.001,
'batch_size' : 500,
'num_epochs' : 30, # Recommendation : 200
'num_workers' : 2,
}
config = Box(config)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
device

'cuda'

# 2. 데이터 전처리

In [6]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))
        
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat

In [7]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

# 3. 모델

In [8]:
class CDAE(nn.Module):

    def __init__(self, p_dims, user_num, item_num, dropout_rate = 0.5):
        super(CDAE, self).__init__()
        self.p_dims = p_dims
        self.q_dims = p_dims[::-1]

        self.dims = self.q_dims + self.p_dims[1:]
        self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.dims[:-1], self.dims[1:])])
        self.drop = nn.Dropout(dropout_rate)
        self.user_embedding = nn.Embedding(user_num, item_num)
        self.init_weights()
        
    
    def forward(self, _input, user):
        h = F.normalize(_input)
        h = self.drop(h)
        h += self.user_embedding(user)

        for i, layer in enumerate(self.layers):
            h = layer(h)
            if i != len(self.layers) - 1:
                h = F.sigmoid(h)
        return h

    def init_weights(self):
        for layer in self.layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

In [9]:
class LossFunc(nn.Module):

    def __init__(self, loss_type = 'Multinomial', model_type = None):
        super(LossFunc, self).__init__()
        self.loss_type = loss_type
        self.model_type = model_type

    def forward(self, recon_x = None, x = None, mu = None, logvar = None, anneal = None):
        if self.loss_type == 'Gaussian':
            loss = self.Gaussian(recon_x, x)
        elif self.loss_type == 'Logistic':
            loss = self.Logistic(recon_x, x)
        elif self.loss_type == 'Multinomial':
            loss = self.Multinomial(recon_x, x)

        
        if self.model_type == 'VAE':
            KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))
            loss = loss + anneal * KLD
        
        return loss

    def Gaussian(self, recon_x, x):
        gaussian = F.mse_loss(recon_x, x)
        return gaussian

    def Logistic(self, recon_x, x):
        logistic = F.binary_cross_entropy(recon_x.sigmoid(), x, reduction='none').sum(1).mean()
        return logistic

    def Multinomial(self, recon_x, x):
        multinomial = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
        return multinomial

# 4. 학습 함수

In [10]:
def train(model, criterion, optimizer, data_loader, make_matrix_data_set):
    model.train()
    loss_val = 0
    for users in data_loader:
        mat = make_matrix_data_set.make_matrix(users)
        mat = mat.to(device)
        recon_mat = model(mat, users.view(-1).to(device))

        optimizer.zero_grad()
        loss = criterion(recon_x = recon_mat, x = mat)

        loss_val += loss.item()

        loss.backward()
        optimizer.step()
    
    loss_val /= len(data_loader)

    return loss_val

def get_ndcg(pred_list, true_list):
    idcg = sum((1 / np.log2(rank + 2) for rank in range(1, len(pred_list))))
    dcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            dcg += 1 / np.log2(rank + 2)
    ndcg = dcg / idcg
    return ndcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set):
    
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)

            recon_mat = model(mat, users.view(-1).to(device))
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)

            for user, rec in zip(users, rec_list):
                uv = user_valid[user.item()]
                up = rec[-10:].cpu().numpy().tolist()[::-1]
                NDCG += get_ndcg(pred_list = up, true_list = uv)
                HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(data_loader.dataset)
    HIT /= len(data_loader.dataset)

    return NDCG, HIT

In [80]:
def inference(model, data_loader, decoder, make_matrix_data_set):  
    
    model.eval()
    
    user_list,item_list,prob_list = [],[],[]

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)

            recon_mat = model(mat, users.view(-1).to(device))
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)
            rec_prob = recon_mat.sort(dim = 1)

            for user, rec in zip(users, rec_list):
                rec_items = rec[-10:].cpu().numpy().tolist()[::-1]
                rec_items = [decoder['item'][int(item)]for item in rec_items]
                rec_users = decoder['user'][int(user)].repeat(10)
                
                user_list = np.concatenate([user_list,rec_users])
                item_list = np.concatenate([item_list,rec_items])
                

    submit = pd.DataFrame(zip(user_list,item_list), columns=['user','item']).astype({'user':'int','item':'int'})
    submit = submit.sort_values(by=['user'])
    return submit

# 5. 학습

In [55]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()
decoder = {}
_,decoder['user'] = make_matrix_data_set.generate_encoder_decoder('user')
_,decoder['item'] = make_matrix_data_set.generate_encoder_decoder('item')

In [56]:
ae_dataset = AEDataSet(
    num_user = make_matrix_data_set.num_user,
    )

In [14]:
data_loader = DataLoader(
    ae_dataset,
    batch_size = config.batch_size, 
    shuffle = True, 
    pin_memory = True,
    num_workers = config.num_workers,
    )

In [15]:
model = CDAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item],
    user_num = make_matrix_data_set.num_user, 
    item_num = make_matrix_data_set.num_item,
    dropout_rate = config.dropout_rate).to(device)
criterion = LossFunc(loss_type = 'Logistic')
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

In [16]:
def main(config):
    best_hit,best_ndcg = -1,-1
    early_stopping_counter = 0

    for epoch in range(1, config.num_epochs + 1):
        train_loss = train(
            model = model, 
            criterion = criterion, 
            optimizer = optimizer, 
            data_loader = data_loader,
            make_matrix_data_set = make_matrix_data_set,
            )

        ndcg, hit = evaluate(
            model = model, 
            data_loader = data_loader,
            user_train = user_train,
            user_valid = user_valid,
            make_matrix_data_set = make_matrix_data_set,
            )
        # print('time:',end_time-start_time)
        # if epoch % 10 == 0:
        print(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')
            
        if hit > best_hit:
            best_hit = hit
            torch.save(model.state_dict(), config.save_dir + config.saved_file_name)
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= config.patience:
                break
        if ndcg > best_ndcg:
            best_ndcg = ndcg
    print(f'Best NDCG@10: {best_ndcg:.5f}| Best HIT@10: {best_hit:.5f}')
    
main(config)

Epoch:   1| Train loss: 815.81133| NDCG@10: 0.13146| HIT@10: 0.08847
Epoch:   2| Train loss: 561.60996| NDCG@10: 0.13164| HIT@10: 0.08863
Epoch:   3| Train loss: 558.46746| NDCG@10: 0.13096| HIT@10: 0.08859
Epoch:   4| Train loss: 537.65672| NDCG@10: 0.13032| HIT@10: 0.08868
Epoch:   5| Train loss: 521.38920| NDCG@10: 0.13020| HIT@10: 0.08855
Epoch:   6| Train loss: 514.09129| NDCG@10: 0.13027| HIT@10: 0.08822
Epoch:   7| Train loss: 508.22806| NDCG@10: 0.13073| HIT@10: 0.08822
Epoch:   8| Train loss: 504.78173| NDCG@10: 0.13036| HIT@10: 0.08880
Epoch:   9| Train loss: 502.18185| NDCG@10: 0.13107| HIT@10: 0.08845
Epoch:  10| Train loss: 500.81223| NDCG@10: 0.12973| HIT@10: 0.08750
Epoch:  11| Train loss: 499.81687| NDCG@10: 0.13104| HIT@10: 0.08835
Epoch:  12| Train loss: 498.95233| NDCG@10: 0.13106| HIT@10: 0.08841
Epoch:  13| Train loss: 497.93820| NDCG@10: 0.12975| HIT@10: 0.08871
Epoch:  14| Train loss: 495.45230| NDCG@10: 0.13178| HIT@10: 0.08930
Epoch:  15| Train loss: 492.06213|

In [81]:
model.load_state_dict(torch.load(config.save_dir + config.saved_file_name))
submission = inference(
        model = model, 
        data_loader = data_loader,
        decoder = decoder,
        make_matrix_data_set = make_matrix_data_set
        )

In [82]:
submission.head()

Unnamed: 0,user,item
253796,11,733
253797,11,4886
253794,11,8961
253793,11,2115
253792,11,5418


In [None]:
submission.to_csv("output/submission_cdae.csv", index=False)

## 6. Optuna - Hyperparameter Tuning

In [None]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
config = {
'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
'save_dir': './checkpoint/',
'saved_file_name':'best_model.pt',
'p_dims': [50, 500],
'dropout_rate' : 0.7,
'weight_decay' : 0.01,
'valid_samples' : 10, # 검증에 사용할 sample 수
'patience' : 30,
'lr' : 0.001,
'batch_size' : 500,
'num_epochs' : 100,
'num_workers' : 2,
}
config = Box(config)
def objective(trial):
    config.batch_size = trial.suggest_categorical('batch_size',[256, 512, 1024, 2048])
    config.lr = trial.suggest_loguniform('lr',0.001,0.01)
    config.weight_decay = trial.suggest_loguniform('weight_decay',1e-07,1e-06)
    config.dropout_rate = trial.suggest_categorical("dropout_rate",[0.2,0.3,0.4,0.5,0.6,0.7,0.8])
    config.num_layers = trial.suggest_int('num_layers',1 , 4)
    config.p_dims = [trial.suggest_int('hidden_dims',50,800)] * config.num_layers
    
    model = CDAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item],
    user_num = make_matrix_data_set.num_user, 
    item_num = make_matrix_data_set.num_item,
    dropout_rate = config.dropout_rate).to(device)
    criterion = LossFunc(loss_type = 'Logistic')
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    best_hit = -1
    for epoch in range(1, config.num_epochs + 1):
        # start_time = time.time() # 측정 시작 -> 1 epoch 당 5~6초 정도 소요됨
        train_loss = train(
            model = model, 
            criterion = criterion, 
            optimizer = optimizer, 
            data_loader = data_loader,
            make_matrix_data_set = make_matrix_data_set,
            )

        ndcg, hit = evaluate(
            model = model, 
            data_loader = data_loader,
            user_train = user_train,
            user_valid = user_valid,
            make_matrix_data_set = make_matrix_data_set,
            )
        if hit > best_hit:
            best_hit = hit
    return best_hit

In [None]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cdae_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective, n_trials=30)

print('best params {} :'.format(fold+1), study.best_value)
print(study.best_params)

In [None]:
print('best params {} :',study.best_value)
print(study.best_params)

In [None]:
config = {
'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
'save_dir': './checkpoint/',
'saved_file_name':'best_model_tanh.pt',
'p_dims': [50, 800, 500],
'dropout_rate' : 0.7,
'weight_decay' : 0.01,
'valid_samples' : 10, # 검증에 사용할 sample 수
'patience' : 30,
'lr' : 0.001,
'batch_size' : 500,
'num_epochs' : 200,
'num_workers' : 2,
}
for key, value in study.best_params.items():
    if key == 'hidden_dims':
        config['p_dims'] = [value] * study.best_params['num_layers']
    else:
        config[key] = value
# print(config)

config = Box(config)

model = CDAE(
    p_dims = config.p_dims + [make_matrix_data_set.num_item],
    user_num = make_matrix_data_set.num_user, 
    item_num = make_matrix_data_set.num_item,
    dropout_rate = config.dropout_rate).to(device)
criterion = LossFunc(loss_type = 'Logistic')
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

main(config)


In [None]:
model.load_state_dict(torch.load(config.save_dir + config.saved_file_name))
submission = inference(
        model = model, 
        data_loader = data_loader,
        decoder = decoder,
        make_matrix_data_set = make_matrix_data_set
        )

In [None]:
submission.head(10)

In [None]:
submission.to_csv("output/submission_cdae_optuna_tanh.csv", index=False)