In [1]:
import math
import numpy as np
import scipy.sparse as sp
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

# 1. 학습 설정

In [15]:
config = {
    'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
    'model_path' : "../model",
    
    'submission_path' : "../submission",
    'submission_name' : 'oof_EASE_v2_submission.csv',

    'candidate_item_num' : 5,
    'valid_samples' : 10, # 검증에 사용할 sample 수
    'seed' : 22,
    'reg' : 750,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

# 2. 데이터 전처리

In [9]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))
        
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat

    def make_sparse_matrix(self, test = False):
        X = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        
        for user in self.user_train.keys():
            item_list = self.user_train[user]
            X[user, item_list] = 1.0
        
        if test:
            for user in self.user_valid.keys():
                item_list = self.user_valid[user]
                X[user, item_list] = 1.0

        return X.tocsr()

    def oof_make_sparse_matrix(self, seed):
        train_X = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        user_valid = {}

        users = defaultdict(list)
        group_df = self.df.groupby('user_idx')
        for user, items in group_df:
            users[user].extend(items['item_idx'].tolist())
        
        for user in users:
            np.random.seed(seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            train_X[user, train] = 1.0
            user_valid[user] = valid

        return train_X.tocsr(), user_valid

In [10]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

# 3. 모델

In [11]:
class EASE():
    def __init__(self, X, reg):
        self.X = self._convert_sp_mat_to_sp_tensor(X)
        self.reg = reg
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res
    
    def fit(self):
        '''

        진짜 정말 간단한 식으로 모델을 만듬

        '''
        G = self.X.to_dense().t() @ self.X.to_dense()
        diagIndices = torch.eye(G.shape[0]) == 1
        G[diagIndices] += self.reg

        P = G.inverse()
        B = P / (-1 * P.diag())
        B[diagIndices] = 0
    
        self.B = B
    
    def predict(self, X):
        X = self._convert_sp_mat_to_sp_tensor(X)
        self.pred = X.to_dense() @ self.B

# 4. 학습 함수

In [12]:
def get_ndcg(pred_list, true_list):
    idcg = sum((1 / np.log2(rank + 2) for rank in range(1, len(pred_list))))
    dcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            dcg += 1 / np.log2(rank + 2)
    ndcg = dcg / idcg
    return ndcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, X, user_valid):

    mat = torch.from_numpy(X)

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    recon_mat = model.pred.cpu()
    recon_mat[mat == 1] = -np.inf
    rec_list = recon_mat.argsort(dim = 1)

    for user, rec in enumerate(rec_list):
        uv = user_valid[user]
        up = rec[-10:].cpu().numpy().tolist()[::-1]
        NDCG += get_ndcg(pred_list = up, true_list = uv)
        HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(rec_list)
    HIT /= len(rec_list)

    return NDCG, HIT


def predict(model, X):
    user2rec = {}

    mat = torch.from_numpy(X)

    recon_mat = model.pred.cpu()
    recon_mat[mat == 1] = -np.inf
    rec_list = recon_mat.argsort(dim = 1)

    for user, rec in enumerate(rec_list):
        up = rec[-10:].cpu().numpy().tolist()[::-1]
        user2rec[user] = up
    
    return user2rec

def ensemble(oof2user2rec, users):
    ensemble_user2rec = {}

    score_li = np.array([1/np.log2(rank + 2) for rank in range(0, 10)])

    for user in users:

        user2rec_list = []
        all_user_rec = []
        
        for oof in oof2user2rec.keys():
            user2rec = oof2user2rec[oof]
            user2rec_list.append(user2rec[user])
            all_user_rec += user2rec[user]
        
        all_user_rec = list(set(all_user_rec))

        rec_df = pd.DataFrame(index = all_user_rec)

        for oof in oof2user2rec.keys():
            rec_df.loc[user2rec_list[oof - 1], f'oof{oof}_rec_score'] = score_li
        
        rec_df = rec_df.fillna(min(score_li))
        rec_df['total_rec_score'] = rec_df.sum(axis = 1)

        rec_df = rec_df.sort_values('total_rec_score', ascending = False)
        up = rec_df.index.tolist()[:10]

        ensemble_user2rec[user] = up

    return ensemble_user2rec

# 5. 학습

In [13]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
X_test = make_matrix_data_set.make_sparse_matrix(test = True)

In [14]:
users = [user for user in range(make_matrix_data_set.num_user)]
seed = config.seed
oof = 1
oof2user2rec = {}

for _ in range(5):

    train_X, user_valid = make_matrix_data_set.oof_make_sparse_matrix(seed = seed)
    model = EASE(X = train_X, reg = 750)
    model.fit()
    model.predict(X = train_X)

    ndcg, hit = evaluate(model = model, X = train_X.todense(), user_valid = user_valid)
    print(f'oof-{oof} NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')
    
    model.predict(X = X_test)
    user2rec = predict(
        model = model, 
        X = X_test.todense(),
    )

    oof2user2rec[oof] = user2rec

    oof += 1
    seed += 1

oof-1 NDCG@10: 0.31055| HIT@10: 0.20384
oof-2 NDCG@10: 0.32204| HIT@10: 0.21099
oof-3 NDCG@10: 0.33080| HIT@10: 0.21594
oof-4 NDCG@10: 0.30952| HIT@10: 0.20276
oof-5 NDCG@10: 0.30010| HIT@10: 0.19581


```
아이템 스플릿

oof-1 NDCG@10: 0.31055| HIT@10: 0.20384
oof-2 NDCG@10: 0.32204| HIT@10: 0.21099
oof-3 NDCG@10: 0.33080| HIT@10: 0.21594
oof-4 NDCG@10: 0.30952| HIT@10: 0.20276
oof-5 NDCG@10: 0.30010| HIT@10: 0.19581
```

```
유저 스플릿

NDCG@10: 0.23292| HIT@10: 0.20569
NDCG@10: 0.23085| HIT@10: 0.20391
NDCG@10: 0.22740| HIT@10: 0.20234
NDCG@10: 0.22795| HIT@10: 0.20214
NDCG@10: 0.23052| HIT@10: 0.20450
```

# 6. 예측

In [16]:
user2rec_list = ensemble(oof2user2rec = oof2user2rec, users = users)

submision = []
users = [i for i in range(0, make_matrix_data_set.num_user)]
for user in users:
    rec_item_list = user2rec_list[user]
    for item in rec_item_list:
        submision.append(
            {   
                'user' : make_matrix_data_set.user_decoder[user],
                'item' : make_matrix_data_set.item_decoder[item],
            }
        )

submision = pd.DataFrame(submision)

In [17]:
submision.to_csv(os.path.join(config.submission_path, config.submission_name), index=False)

In [18]:
submision

Unnamed: 0,user,item
0,11,4370
1,11,4886
2,11,8961
3,11,40815
4,11,47
...,...,...
313595,138493,1270
313596,138493,32587
313597,138493,2762
313598,138493,8970
