In [1]:
import math
import numpy as np
import scipy.sparse as sp
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

# 1. 학습 설정

In [2]:
config = {
    'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
    
    'submission_path' : "../submission",
    'submission_name' : 'Multi-EASE_submission.csv', 

    'model_path' : "../model", # 모델 저장 경로
    'model_name' : 'Multi-EASE_v1.pt',
    
    'weight_decay' : 0.0,
    'valid_samples' : 10, # 검증에 사용할 sample 수
    'seed' : 22,


    'lr' : 0.0005,
    'batch_size' : 1000,
    'num_epochs' : 200,
    'num_workers' : 2,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

# 2. 데이터 전처리

In [3]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))
        
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat

    def make_sparse_matrix(self, test = False):
        X = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        
        for user in self.user_train.keys():
            item_list = self.user_train[user]
            X[user, item_list] = 1.0
        
        if test:
            for user in self.user_valid.keys():
                item_list = self.user_valid[user]
                X[user, item_list] = 1.0

        return X.tocsr()

In [4]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user

    def __len__(self):
        return self.num_user

    def __getitem__(self, user):
        return torch.LongTensor([user])

# 3. 모델

In [5]:
class MultiEASE(nn.Module):
    def __init__(self, X):
        super(MultiEASE, self).__init__()
        self.user_emb = nn.Embedding.from_pretrained(X)
        self.user_emb.requires_grad_(True)

    
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res
    
    def set_B(self, X, reg):
        X = self._convert_sp_mat_to_sp_tensor(X)
        G = X.to_dense().t() @ X.to_dense()
        diagIndices = torch.eye(G.shape[0]) == 1
        G[diagIndices] += reg
        
        P = G.inverse()
        B = P / (-1 * P.diag())
        B[diagIndices] = 0

        self.B = B

    def forward(self, user):
        user_emb = self.user_emb(user)
        output = user_emb @ self.B

        return output

In [6]:
class MultiLoss(nn.Module):
    def __init__(self):
        super(MultiLoss, self).__init__()
        
    def forward(self, x_pred, user_ratings):
        mll = (F.log_softmax(x_pred, dim=-1) * user_ratings).sum(dim=-1).mean()
        return -mll

# 4. 학습 함수

In [7]:
def train(model, criterion, optimizer, data_loader, make_matrix_data_set):
    model.train()
    loss_val = 0

    for users in data_loader:
        mat = make_matrix_data_set.make_matrix(users)
        mat = mat.to(device)
        
        recon_mat = model(users.view(-1).to(device))
        
        optimizer.zero_grad()
        loss = criterion(recon_mat, mat)

        loss_val += loss.item()

        loss.backward()
        optimizer.step()

    loss_val /= len(data_loader)

    return loss_val

def get_ndcg(pred_list, true_list):
    idcg = sum((1 / np.log2(rank + 2) for rank in range(1, len(pred_list))))
    dcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            dcg += 1 / np.log2(rank + 2)
    ndcg = dcg / idcg
    return ndcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, data_loader, user_train, user_valid, make_matrix_data_set):
    model.eval()

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    with torch.no_grad():
        for users in data_loader:
            mat = make_matrix_data_set.make_matrix(users)
            mat = mat.to(device)
            
            recon_mat = model(users.view(-1).to(device))
            recon_mat[mat == 1] = -np.inf
            rec_list = recon_mat.argsort(dim = 1)

            for user, rec in zip(users, rec_list):
                uv = user_valid[user.item()]
                up = rec[-10:].cpu().numpy().tolist()[::-1]
                NDCG += get_ndcg(pred_list = up, true_list = uv)
                HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(data_loader.dataset)
    HIT /= len(data_loader.dataset)

    return NDCG, HIT

In [8]:
class MultiLoss(nn.Module):
    def __init__(self):
        super(MultiLoss, self).__init__()
        
    def forward(self, x_pred, user_ratings):
        mll = (F.log_softmax(x_pred, dim=-1) * user_ratings).sum(dim=-1).mean()
        return -mll

# 5. 학습

In [9]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()
X = make_matrix_data_set.make_sparse_matrix()

In [10]:
ae_dataset = AEDataSet(
    num_user = make_matrix_data_set.num_user,
    )

In [11]:
data_loader = DataLoader(
    ae_dataset,
    batch_size = config.batch_size, 
    shuffle = True,
    pin_memory = True,
    num_workers = config.num_workers,
    )

In [12]:
model = MultiEASE(
    X = torch.from_numpy(X.toarray()).to(device)
).to(device)

model.set_B(X = X, reg = 680)

criterion = MultiLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay = config.weight_decay)

In [13]:
best_hit = 0
for epoch in range(1, config.num_epochs + 1):
    tbar = tqdm(range(1))
    for _ in tbar:
        train_loss = train(
            model = model, 
            criterion = criterion, 
            optimizer = optimizer, 
            data_loader = data_loader,
            make_matrix_data_set = make_matrix_data_set,
            )
        
        ndcg, hit = evaluate(
            model = model, 
            data_loader = data_loader,
            user_train = user_train,
            user_valid = user_valid,
            make_matrix_data_set = make_matrix_data_set,
            )

        if best_hit < hit:
            best_hit = hit
            torch.save(model.state_dict(), os.path.join(config.model_path, config.model_name))

        tbar.set_description(f'Epoch: {epoch:3d}| Train loss: {train_loss:.5f}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

Epoch:   1| Train loss: 1304.61062| NDCG@10: 0.31086| HIT@10: 0.20414: 100%|██████████| 1/1 [00:14<00:00, 14.96s/it]
Epoch:   2| Train loss: 1305.82963| NDCG@10: 0.31102| HIT@10: 0.20419: 100%|██████████| 1/1 [00:15<00:00, 15.30s/it]
Epoch:   3| Train loss: 1303.69287| NDCG@10: 0.31136| HIT@10: 0.20446: 100%|██████████| 1/1 [00:15<00:00, 15.08s/it]
Epoch:   4| Train loss: 1300.14899| NDCG@10: 0.31162| HIT@10: 0.20466: 100%|██████████| 1/1 [00:15<00:00, 15.06s/it]
Epoch:   5| Train loss: 1301.27822| NDCG@10: 0.31185| HIT@10: 0.20482: 100%|██████████| 1/1 [00:15<00:00, 15.01s/it]
Epoch:   6| Train loss: 1302.25953| NDCG@10: 0.31202| HIT@10: 0.20496: 100%|██████████| 1/1 [00:14<00:00, 14.47s/it]
Epoch:   7| Train loss: 1299.61404| NDCG@10: 0.31228| HIT@10: 0.20513: 100%|██████████| 1/1 [00:15<00:00, 15.01s/it]
Epoch:   8| Train loss: 1299.41044| NDCG@10: 0.31240| HIT@10: 0.20524: 100%|██████████| 1/1 [00:15<00:00, 15.03s/it]
Epoch:   9| Train loss: 1297.84320| NDCG@10: 0.31256| HIT@10: 0.

KeyboardInterrupt: 

In [24]:
NDCG = 0
HIT = 0

user_data = model.user_emb.weight.data
mat = torch.from_numpy(X.toarray()).to(device)
user_data[mat == 1] = -np.inf
rec_list = user_data.argsort(dim = 1)

for user, rec in enumerate(rec_list):
    uv = user_valid[user]
    up = rec[-10:].cpu().numpy().tolist()[::-1]
    NDCG += get_ndcg(pred_list = up, true_list = uv)
    HIT += get_hit(pred_list = up, true_list = uv)

NDCG /= len(rec_list)
HIT /= len(rec_list)

In [48]:
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
for user in range(31360):
    output = cos(mat[user], mat)

KeyboardInterrupt: 