In [1]:
import math
import numpy as np
import scipy.sparse as sp
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from box import Box

import warnings

warnings.filterwarnings(action='ignore')
torch.set_printoptions(sci_mode=True)

In [2]:
class GMF(nn.Module):
    def __init__(self, num_user, num_item, num_factor):
        super(GMF, self).__init__()
        self.user_emb = nn.Embedding(num_user, num_factor)
        self.item_emb = nn.Embedding(num_item, num_factor)
        
        self.predict_layer = nn.Sequential(
            nn.Linear(num_factor, 1, bias = False)
        )

        self._init_weight_()
    
    def _init_weight_(self):
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        for m in self.predict_layer:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, a=1)
    
    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)

        output = self.predict_layer(user_emb * item_emb)

        return output.view(-1)

# 1. 학습 설정

In [23]:
config = {
    'data_path' : "/opt/ml/input/data/train" , # 데이터 경로
    'model_path' : "../model",
    
    'submission_path' : "../submission",
    'submission_name' : 'EASE_v3_submission.csv',

    'candidate_item_num' : 5,
    'valid_samples' : 10, # 검증에 사용할 sample 수
    'seed' : 22,
    'reg' : 1000,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

config = Box(config)

# 2. 데이터 전처리

In [4]:
class MakeMatrixDataSet():
    """
    MatrixDataSet 생성
    """
    def __init__(self, config):
        self.config = config
        self.df = pd.read_csv(os.path.join(self.config.data_path, 'train_ratings.csv'))
        
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder('item')
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder('user')
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        self.df['item_idx'] = self.df['item'].apply(lambda x : self.item_encoder[x])
        self.df['user_idx'] = self.df['user'].apply(lambda x : self.user_encoder[x])

        self.user_train, self.user_valid = self.generate_sequence_data()

    def generate_encoder_decoder(self, col : str) -> dict:
        """
        encoder, decoder 생성

        Args:
            col (str): 생성할 columns 명
        Returns:
            dict: 생성된 user encoder, decoder
        """

        encoder = {}
        decoder = {}
        ids = self.df[col].unique()

        for idx, _id in enumerate(ids):
            encoder[_id] = idx
            decoder[idx] = _id

        return encoder, decoder
    
    def generate_sequence_data(self) -> dict:
        """
        sequence_data 생성

        Returns:
            dict: train user sequence / valid user sequence
        """
        users = defaultdict(list)
        user_train = {}
        user_valid = {}
        for user, item, time in zip(self.df['user_idx'], self.df['item_idx'], self.df['time']):
            users[user].append(item)
        
        for user in users:
            np.random.seed(self.config.seed)

            user_total = users[user]
            valid = np.random.choice(user_total, size = self.config.valid_samples, replace = False).tolist()
            train = list(set(user_total) - set(valid))

            user_train[user] = train
            user_valid[user] = valid # valid_samples 개수 만큼 검증에 활용 (현재 Task와 가장 유사하게)

        return user_train, user_valid
    
    def get_train_valid_data(self):
        return self.user_train, self.user_valid

    def make_matrix(self, user_list, train = True):
        """
        user_item_dict를 바탕으로 행렬 생성
        """
        mat = torch.zeros(size = (user_list.size(0), self.num_item))
        for idx, user in enumerate(user_list):
            if train:
                mat[idx, self.user_train[user.item()]] = 1
            else:
                mat[idx, self.user_train[user.item()] + self.user_valid[user.item()]] = 1
        return mat

    def make_sparse_matrix(self, test = False):
        X = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        
        for user in self.user_train.keys():
            item_list = self.user_train[user]
            X[user, item_list] = 1.0
        
        if test:
            for user in self.user_valid.keys():
                item_list = self.user_valid[user]
                X[user, item_list] = 1.0

        return X.tocsr()
    

    def make_year_candidate_item(self, train = True):
        user_pro_df = pd.read_csv(os.path.join(self.config.data_path, 'user_pro.csv'))
        item_pro_df = pd.read_csv(os.path.join(self.config.data_path, 'item_pro.csv'))

        item_pro_df['item'] = item_pro_df['item'].apply(lambda x : self.item_encoder[x])
        item_pro_df['year'] = item_pro_df['year'].astype(int)

        user_pro_df['user'] = user_pro_df['user'].apply(lambda x : self.user_encoder[x])
        user_pro_df['max_year'] = user_pro_df['max_year'].astype(int)
        
        year2item_list = {}
        year_list = user_pro_df['max_year'].unique().tolist()
        for year in year_list:
            item_list = item_pro_df[item_pro_df['year'] <= year + 1]['item'].tolist()
            year2item_list[year + 1] = item_list

        all_item_list = [i for i in range(self.num_item)]
        group_df = user_pro_df.groupby('user')
        candidate = {}
        if train:
            for user, df in group_df:
                max_year = df['max_year'].values[0]
                candidate_item_list = year2item_list[max_year + 1]
                candidate_item_list = set(all_item_list) - set(candidate_item_list)
                candidate_item_list = list(candidate_item_list | set(self.user_train[user]))
                candidate[user] = candidate_item_list
        else:
            for user, df in group_df:
                max_year = df['max_year'].values[0]
                candidate_item_list = year2item_list[max_year + 1]
                candidate_item_list = set(all_item_list) - set(candidate_item_list)
                candidate_item_list = candidate_item_list | set(self.user_train[user])
                candidate_item_list = list(candidate_item_list | set(self.user_valid[user]))
                candidate[user] = candidate_item_list

        return candidate

    def make_cos_candidate_item(self, candidate_item_num, train = True):
        gmf = GMF(
            num_user = self.num_user, 
            num_item = self.num_item, 
            num_factor = 512).to(device)

        gmf.load_state_dict(torch.load(os.path.join(self.config.model_path, 'GMF_v1.pt')))
        movie_emb = gmf.item_emb.weight.data.cpu()
        
        cos_mm = torch.nn.CosineSimilarity(dim=1)
        cos_sim_list = []
        for target_item in range(len(movie_emb)):
            cos_sim_score = cos_mm(movie_emb[target_item], movie_emb)
            cos_sim_index = cos_sim_score.argsort()
            cos_sim_list.append(cos_sim_index.numpy()[::-1][:candidate_item_num + 1].tolist())
        
        cos_sim_list = np.array(cos_sim_list)

        candidate = {}
        if train:
            for user in self.user_train.keys():
                candidate_item_list = set(cos_sim_list[self.user_train[user], :].reshape(-1).tolist())
                candidate_item_list = list(candidate_item_list - set(self.user_train[user]))
                candidate[user] = candidate_item_list
        else:
            for user in self.user_train.keys():
                candidate_item_list = set(cos_sim_list[self.user_train[user] + self.user_valid[user], :].reshape(-1).tolist())
                candidate_item_list = candidate_item_list - set(self.user_train[user])
                candidate_item_list = list(candidate_item_list - set(self.user_valid[user]))
                candidate[user] = candidate_item_list
        
        return candidate

    def m_s_m(self, candidate):
        X = sp.dok_matrix((self.num_user, self.num_item), dtype=np.float32)
        for user in candidate.keys():
            item_list = candidate[user]
            X[user, item_list] = 1.0

        return X.tocsr()

In [5]:
class AEDataSet(Dataset):
    def __init__(self, num_user):
        self.num_user = num_user
        self.users = [i for i in range(num_user)]

    def __len__(self):
        return self.num_user

    def __getitem__(self, idx): 
        user = self.users[idx]
        return torch.LongTensor([user])

# 3. 모델

In [6]:
class EASE():
    def __init__(self, X, reg):
        self.X = self._convert_sp_mat_to_sp_tensor(X)
        self.reg = reg
    
    def _convert_sp_mat_to_sp_tensor(self, X):
        """
        Convert scipy sparse matrix to PyTorch sparse matrix

        Arguments:
        ----------
        X = Adjacency matrix, scipy sparse matrix
        """
        coo = X.tocoo().astype(np.float32)
        i = torch.LongTensor(np.mat([coo.row, coo.col]))
        v = torch.FloatTensor(coo.data)
        res = torch.sparse.FloatTensor(i, v, coo.shape).to(device)
        return res
    
    def fit(self):
        '''

        진짜 정말 간단한 식으로 모델을 만듬

        '''
        G = self.X.to_dense().t() @ self.X.to_dense()
        diagIndices = torch.eye(G.shape[0]) == 1
        G[diagIndices] += self.reg

        P = G.inverse()
        B = P / (-1 * P.diag())
        B[diagIndices] = 0

        self.pred = self.X.to_dense() @ B

# 4. 학습 함수

In [7]:
def get_ndcg(pred_list, true_list):
    ndcg = 0
    for rank, pred in enumerate(pred_list):
        if pred in true_list:
            ndcg += 1 / np.log2(rank + 2)
    return ndcg

# hit == recall == precision
def get_hit(pred_list, true_list):
    hit_list = set(true_list) & set(pred_list)
    hit = len(hit_list) / len(true_list)
    return hit

def evaluate(model, X, user_train, user_valid):

    NDCG = 0.0 # NDCG@10
    HIT = 0.0 # HIT@10

    recon_mat = model.pred.cpu()
    score = recon_mat * torch.from_numpy(1 - X)
    rec_list = score.argsort(dim = 1)

    for user, rec in enumerate(rec_list):
        uv = user_valid[user]
        up = rec[-10:].cpu().numpy().tolist()
        NDCG += get_ndcg(pred_list = up, true_list = uv)
        HIT += get_hit(pred_list = up, true_list = uv)

    NDCG /= len(rec_list)
    HIT /= len(rec_list)

    return NDCG, HIT


def predict(model, X):
    user2rec = {}

    recon_mat = model.pred.cpu()
    score = recon_mat * torch.from_numpy(1 - X)
    rec_list = score.argsort(dim = 1)

    for user, rec in enumerate(rec_list):
        up = rec[-10:].cpu().numpy().tolist()
        user2rec[user] = up
    
    return user2rec

# 5. 학습

In [8]:
make_matrix_data_set = MakeMatrixDataSet(config = config)
user_train, user_valid = make_matrix_data_set.get_train_valid_data()
X = make_matrix_data_set.make_sparse_matrix()

In [9]:
model = EASE(X = X, reg = config.reg)
model.fit()

In [11]:
candidate = make_matrix_data_set.make_cos_candidate_item(candidate_item_num = config.candidate_item_num, train = True)

In [12]:
new_X = make_matrix_data_set.m_s_m(candidate)
new_X = 1 - new_X.todense()

In [13]:
ndcg, hit = evaluate(model = model, X = new_X, user_train = user_train, user_valid = user_valid)
print(f'NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

NDCG@10: 0.70355| HIT@10: 0.17417


In [30]:
ndcg, hit = evaluate(model = model, X = X.todense(), user_train = user_train, user_valid = user_valid)
print(f'NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

NDCG@10: 0.69426| HIT@10: 0.17225


In [None]:
for candidate_item_num in range(1, 20):
    candidate = make_matrix_data_set.make_cos_candidate_item(candidate_item_num = candidate_item_num, train = True)
    new_X = make_matrix_data_set.m_s_m(candidate)
    new_X = 1 - new_X.todense()

    ndcg, hit = evaluate(model = model, X = new_X, user_train = user_train, user_valid = user_valid)
    print(f'candidate_item_num: {candidate_item_num}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

```
candidate_item_num: 1| NDCG@10: 0.58045| HIT@10: 0.14915
candidate_item_num: 2| NDCG@10: 0.67349| HIT@10: 0.16913
candidate_item_num: 3| NDCG@10: 0.69687| HIT@10: 0.17347
candidate_item_num: 4| NDCG@10: 0.70453| HIT@10: 0.17469
candidate_item_num: 5| NDCG@10: 0.70501| HIT@10: 0.17461
candidate_item_num: 6| NDCG@10: 0.70404| HIT@10: 0.17441
candidate_item_num: 7| NDCG@10: 0.70468| HIT@10: 0.17448
candidate_item_num: 8| NDCG@10: 0.70447| HIT@10: 0.17434

```

In [None]:
candidate = make_matrix_data_set.make_cos_candidate_item(candidate_item_num = 4, train = True)
new_X = make_matrix_data_set.m_s_m(candidate)
new_X = 1 - new_X.todense()

In [None]:
for reg in [1000000, 100000, 10000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:
    model = EASE(X = X, reg = reg)
    model.fit()
    ndcg, hit = evaluate(model = model, X = new_X, user_train = user_train, user_valid = user_valid)
    print(f'reg: {reg}| NDCG@10: {ndcg:.5f}| HIT@10: {hit:.5f}')

```
reg: 1000000| NDCG@10: 0.46015| HIT@10: 0.11131
reg: 100000| NDCG@10: 0.57234| HIT@10: 0.14224
reg: 10000| NDCG@10: 0.72272| HIT@10: 0.18252
reg: 1000| NDCG@10: 0.79034| HIT@10: 0.19934
reg: 100| NDCG@10: 0.77785| HIT@10: 0.19445
reg: 10| NDCG@10: 0.73045| HIT@10: 0.18189
reg: 1| NDCG@10: 0.70640| HIT@10: 0.17564
reg: 0.1| NDCG@10: 0.70441| HIT@10: 0.17476
reg: 0.01| NDCG@10: 0.70451| HIT@10: 0.17470
reg: 0.001| NDCG@10: 0.70453| HIT@10: 0.17469
reg: 0.0001| NDCG@10: 0.70453| HIT@10: 0.17469
reg: 1e-05| NDCG@10: 0.70453| HIT@10: 0.17469

```

# 6. 예측

In [19]:
X_test = make_matrix_data_set.make_sparse_matrix(test = True)

In [20]:
model = EASE(X = X_test, reg = 1000)
model.fit()

In [21]:
candidate = make_matrix_data_set.make_cos_candidate_item(candidate_item_num = 5, train = False)
new_X = make_matrix_data_set.m_s_m(candidate)
new_X = 1 - new_X.todense()

In [22]:
user2rec_list = predict(
    model = model, 
    X = new_X,
    )

submision = []
users = [i for i in range(0, make_matrix_data_set.num_user)]
for user in users:
    rec_item_list = user2rec_list[user]
    for item in rec_item_list:
        submision.append(
            {   
                'user' : make_matrix_data_set.user_decoder[user],
                'item' : make_matrix_data_set.item_decoder[item],
            }
        )

submision = pd.DataFrame(submision)

In [24]:
submision.to_csv(os.path.join(config.submission_path, config.submission_name), index=False)

In [25]:
submision

Unnamed: 0,user,item
0,11,2987
1,11,2174
2,11,7438
3,11,2
4,11,7373
...,...,...
313595,138493,1270
313596,138493,5349
313597,138493,8961
313598,138493,2628
