In [1]:
import time

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

from sagemaker import evaluation
from torch.utils import data

from model.neu_mf import NeuMF
from validation_dataset import ValidationDataset

#### Hyper-parameter와 그 외 설정

In [2]:
# Experiment configuration: every tunable hyper-parameter of the notebook lives in this dict.
args = {
    # dataset
    "negative_sample_ratio": 4,  # multiplier handed to the dataset's negative sampler

    # model
    "predictive_factor_num": 8,
    "mlp_layer_num": 3,

    # learning
    "epochs": 60,
    "batch_size": 256,
    "lr": 0.001,

    # evaluation
    "eval_k": 10,  # cut-off K passed to evaluation.evaluate
}

# Seed the RNGs used by negative sampling (numpy / torch) and by DataLoader shuffling (torch)
# so that "Restart Kernel -> Run All" reproduces the same datasets and batch order.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer Apple-Silicon MPS (the original setting), then CUDA, and fall back to CPU so the
# notebook still runs on machines without an MPS device.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

#### 사용자와 아이템 수 정의

In [3]:
# Number of distinct users (M) and items (N) used to size the dataset and the model embeddings.
# Presumably the totals of the ml-100k dataset loaded below — verify against the CSV files.
user_number = 943
item_number = 1682

#### 데이터 준비

In [4]:
class ImplicitSparseDataset(data.Dataset):
    """Implicit-feedback dataset yielding ``(user, item, label)`` triples.

    Every observed (user, item) pair is a positive with label 1.0. When
    ``include_negative_samples`` is set, ``negative_sample_ratio`` unobserved
    items are drawn for each of the ``M`` users and appended with label 0.0.
    """

    @staticmethod
    def __create_negative_samples(M, N, sparse_Y_u, sparse_Y_i, sample_ratio):
        """Draw ``sample_ratio`` unobserved items for every user id in ``range(M)``.

        Args:
            M: number of users; user ids are taken to be ``0 .. M-1``.
            N: number of items; candidates are drawn uniformly from ``0 .. N-1``.
            sparse_Y_u: user ids of the observed interactions.
            sparse_Y_i: item ids of the observed interactions (aligned with ``sparse_Y_u``).
            sample_ratio: number of negatives to draw per user.

        Returns:
            Two 1-D int64 tensors ``(users, items)`` of length ``M * sample_ratio``.
            The same negative item may be drawn more than once for a user.

        Raises:
            ValueError: if a user has already interacted with every item, because
                rejection sampling could never terminate for that user.
        """
        # A set gives O(1) membership tests; scanning a list of all positives for every
        # candidate draw made the sampler quadratic in the number of interactions.
        observed = set(zip(sparse_Y_u.tolist(), sparse_Y_i.tolist()))

        # Count the in-range observed items per user so a saturated user is reported
        # instead of spinning forever in the rejection loop below.
        observed_per_user = {}
        for u, i in observed:
            if 0 <= i < N:
                observed_per_user[u] = observed_per_user.get(u, 0) + 1

        negative_samples = []
        for u in range(M):
            if sample_ratio > 0 and observed_per_user.get(u, 0) >= N:
                raise ValueError(
                    "User {} has interacted with all {} items; no negative sample exists.".format(u, N)
                )
            for _ in range(sample_ratio):
                # Rejection sampling: redraw until the pair is not an observed positive.
                while True:
                    i = np.random.randint(N)
                    if (u, i) not in observed:
                        break
                negative_samples.append((u, i))

        # reshape keeps the result two-dimensional even when no negatives were drawn,
        # so the column slicing below also works for M == 0 or sample_ratio == 0.
        negative_samples = torch.tensor(negative_samples, dtype=torch.int64).reshape(-1, 2)
        return negative_samples[:, 0], negative_samples[:, 1]

    def __init__(self, M, N, sparse_Y_u, sparse_Y_i, include_negative_samples=False, negative_sample_ratio=1):
        """Store the positives and optionally append sampled negatives.

        Args:
            M: number of users.
            N: number of items.
            sparse_Y_u: 1-D tensor with the user id of each observed interaction.
            sparse_Y_i: 1-D tensor with the item id of each observed interaction.
            include_negative_samples: append label-0 samples when True.
            negative_sample_ratio: number of negatives drawn per user.
        """
        super(ImplicitSparseDataset, self).__init__()

        assert len(sparse_Y_u) == len(sparse_Y_i), "Length of `sparse_Y_u` and `sparse_Y_i` must be equal."

        self.len = len(sparse_Y_u)
        # Sparsity of the observed interaction matrix; computed before negatives are added.
        self.sparsity = 1 - (self.len / (M * N))

        self.Y_u = sparse_Y_u
        self.Y_i = sparse_Y_i
        self.Y_value = torch.ones(self.len)

        if include_negative_samples:
            n_Y_u, n_Y_i = ImplicitSparseDataset.__create_negative_samples(M, N, self.Y_u, self.Y_i,
                                                                           negative_sample_ratio)
            negative_sample_size = n_Y_u.size(0)

            self.Y_u = torch.cat((self.Y_u, n_Y_u), dim=0)
            self.Y_i = torch.cat((self.Y_i, n_Y_i), dim=0)
            self.Y_value = torch.cat((self.Y_value, torch.zeros(negative_sample_size)), dim=0)
            self.len = self.len + negative_sample_size

    def __len__(self):
        """Return the number of samples (positives plus any negatives)."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at position ``idx``."""
        return self.Y_u[idx], self.Y_i[idx], self.Y_value[idx]

    def get_sparsity(self):
        """Return ``1 - positives / (M * N)`` as computed at construction time."""
        return self.sparsity

In [5]:
def create_dataset(train_csv, val_csv, _user_number, _item_number, negative_sample_ratio):
    """Build the training dataset (with negative samples) and the validation dataset.

    Args:
        train_csv: path of a header-less CSV whose first two columns are user id and item id.
        val_csv: path handed unchanged to ``ValidationDataset``.
        _user_number: number of users.
        _item_number: number of items.
        negative_sample_ratio: forwarded to ``ImplicitSparseDataset``.

    Returns:
        ``(train_dataset, val_dataset)``.
    """
    # Columns 0 and 1 are parsed as int32 before the frame is converted to a tensor.
    interactions = torch.from_numpy(
        pd.read_csv(train_csv, dtype={0: np.int32, 1: np.int32}, header=None).values
    )
    users, items = interactions[:, 0], interactions[:, 1]

    training_set = ImplicitSparseDataset(
        _user_number,
        _item_number,
        users,
        items,
        include_negative_samples=True,
        negative_sample_ratio=negative_sample_ratio,
    )
    validation_set = ValidationDataset(val_csv)

    return training_set, validation_set

In [6]:
# Build both datasets from the ml-100k CSV splits.
train_dataset, val_dataset = create_dataset(
    '../ml-100k.train.csv',
    '../ml-100k.val.csv',
    user_number,
    item_number,
    args['negative_sample_ratio'],
)

# Training batches are reshuffled on every pass; validation is iterated in dataset order.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=args["batch_size"], shuffle=True)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=args["batch_size"], shuffle=False)

#### 모델 생성

In [7]:
# Instantiate NeuMF with the configured sizes, then move it to the selected device.
model = NeuMF(
    "neumf-model_validation",
    user_number,
    item_number,
    predictive_factor_num=args["predictive_factor_num"],
    mlp_layer_num=args["mlp_layer_num"],
    dropout_prob=0.3,
)
model = model.to(DEVICE)
model

NeuMF(
  (mlp_P): Embedding(943, 16)
  (mlp_Q): Embedding(1682, 16)
  (mlp_layer_X): Sequential(
    (0): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=32, out_features=16, bias=True)
      (2): ReLU()
    )
    (1): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=16, out_features=8, bias=True)
      (2): ReLU()
    )
    (2): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=8, out_features=4, bias=True)
      (2): ReLU()
    )
  )
  (gmf_P): Embedding(943, 4)
  (gmf_Q): Embedding(1682, 4)
  (neu_mf): Linear(in_features=8, out_features=1, bias=True)
)


#### 학습 진행

In [8]:
def train(run_name):
    """Train the global ``model`` for ``args["epochs"]`` epochs and log to TensorBoard.

    Reads the notebook-level globals ``model``, ``args``, ``DEVICE``, ``train_loader``
    and ``val_loader``. After each epoch the mean training loss and the validation
    metrics returned by ``evaluation.evaluate`` are written under ``runs/<run_name>``.

    Args:
        run_name: sub-directory of ``runs/`` that receives the TensorBoard events.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"])

    writer = SummaryWriter(log_dir="runs/" + run_name)
    try:
        for epoch in range(args["epochs"]):
            start_time = time.time()

            batch = 0
            loss_sum = 0
            model.train()
            for user, item, label in train_loader:
                batch += 1

                user = user.to(DEVICE)
                item = item.to(DEVICE)
                # Labels become a column vector; presumably this matches the model's
                # (batch, 1) output — confirm against NeuMF.forward.
                label = label.reshape(-1, 1).to(DEVICE)

                optimizer.zero_grad()
                outputs = model(user, item)

                loss = criterion(outputs, label)
                loss_sum += loss.item()

                loss.backward()
                optimizer.step()

            # An empty loader yields no batches; skip the mean instead of dividing by zero.
            if batch:
                writer.add_scalar("train/loss", loss_sum / batch, epoch + 1)

            model.eval()
            with torch.no_grad():
                hr, mean_ap, norm_dcg = evaluation.evaluate(model, val_loader, args["eval_k"], device=DEVICE)
                writer.add_scalar("val/hr", hr, epoch + 1)
                writer.add_scalar("val/mAP", mean_ap, epoch + 1)
                writer.add_scalar("val/nDCG", norm_dcg, epoch + 1)

            elapsed_time = time.time() - start_time
            print("The time elapse of epoch {:03d} is: ".format(epoch + 1) + str(elapsed_time))
    finally:
        # Flush pending events and release the event file even if training is interrupted.
        writer.close()

<b>Run 1</b>

In [None]:
# Run 1: negatives sampled once, a fixed number per user.
train(run_name="model_validation-run1")

![](img/model_validation-run1-train.png)<br/>
![](img/model_validation-run1-val.png)<br/>


기이하게도 학습을 거듭할수록 성능지표가 떨어지는 현상을 보였다. 비록 validation 성과가 반락하는 통상적인 overfitting의 모습은 아니었지만, train-loss가 수렴치를 갖지 않고 0에 가까워지도록 계속 하향했다는 점에서 과적합을 의심해 보았다.

우선 학습 데이터셋의 negative sample이 positive instance에 비해 현저히 적었다는 점을 문제 삼아 보았다. 논문에서는 positive instance 대 negative instance의 비율을 hyper-parameter로 정의하고 조절하였으나, Run1에서는 user 당 negative instance의 개수를 조절치로 두었다. 그 결과 positive 80,000건에 비해 negative는 943명 × 4 = 3,772건에 불과했다(아래 셀의 전체 크기 83,772건).

In [9]:
# Train dataset size: observed positives plus the negatives sampled when the dataset was built.
len(train_loader.dataset)

83772

Update `ImplicitSparseDataset.__create_negative_samples`

In [10]:
class ImplicitSparseDataset(data.Dataset):
    """Implicit-feedback dataset yielding ``(user, item, label)`` triples.

    Every observed (user, item) pair is a positive with label 1.0. When
    ``include_negative_samples`` is set, each user additionally receives up to
    ``negative_sample_ratio`` negatives (label 0.0) per positive.
    """

    @staticmethod
    def __create_negative_samples(N, sparse_Y_u, sparse_Y_i, sample_ratio):
        """Sample unobserved items for every user that appears in ``sparse_Y_u``.

        For a user with ``p`` positives, ``p * sample_ratio`` distinct items are drawn
        without replacement from the items the user has not interacted with. If fewer
        candidates exist than requested, the user silently receives only the available
        candidates, so the total can be smaller than ``len(sparse_Y_u) * sample_ratio``.

        Args:
            N: number of items; candidates are ``0 .. N-1``.
            sparse_Y_u: 1-D tensor with the user id of each observed interaction.
            sparse_Y_i: 1-D tensor with the item id of each observed interaction.
            sample_ratio: negatives requested per positive.

        Returns:
            Two 1-D int64 tensors ``(users, items)`` describing the negative pairs.
        """
        all_items = torch.arange(N, dtype=torch.int32)

        # Collect one (k, 2) chunk per user and concatenate once at the end; concatenating
        # onto a growing tensor inside the loop re-copies all earlier rows every iteration.
        per_user_chunks = []

        for user_id in sparse_Y_u.unique():
            positive_items = sparse_Y_i[sparse_Y_u == user_id]
            negative_sample_size = len(positive_items) * sample_ratio

            possible_negative_items = all_items[torch.isin(all_items, positive_items, invert=True)]

            # A random permutation truncated to the requested size = sampling without
            # replacement; the slice caps the count at the number of candidates.
            random_indices = torch.randperm(len(possible_negative_items))[:negative_sample_size]

            negative_items = possible_negative_items[random_indices].unsqueeze(dim=1)
            # The integer fill value makes this column int64, so the concatenated
            # chunk is promoted to int64 as well.
            user_column = torch.full((negative_items.size(0), 1), user_id.item())

            per_user_chunks.append(torch.cat((user_column, negative_items), dim=1))

        if not per_user_chunks:
            # No users at all: return empty columns instead of failing on 2-D indexing.
            return torch.empty(0, dtype=torch.int64), torch.empty(0, dtype=torch.int64)

        negative_samples = torch.cat(per_user_chunks, dim=0)
        return negative_samples[:, 0], negative_samples[:, 1]

    def __init__(self, M, N, sparse_Y_u, sparse_Y_i, include_negative_samples=False, negative_sample_ratio=1):
        """Store the positives and optionally append sampled negatives.

        Args:
            M: number of users (used only for the sparsity figure).
            N: number of items.
            sparse_Y_u: 1-D tensor with the user id of each observed interaction.
            sparse_Y_i: 1-D tensor with the item id of each observed interaction.
            include_negative_samples: append label-0 samples when True.
            negative_sample_ratio: negatives requested per positive.
        """
        super(ImplicitSparseDataset, self).__init__()

        assert len(sparse_Y_u) == len(sparse_Y_i), "Length of `sparse_Y_u` and `sparse_Y_i` must be equal."

        self.len = len(sparse_Y_u)
        # Sparsity of the observed interaction matrix; computed before negatives are added.
        self.sparsity = 1 - (self.len / (M * N))

        self.Y_u = sparse_Y_u
        self.Y_i = sparse_Y_i
        self.Y_value = torch.ones(self.len)

        if include_negative_samples:
            n_Y_u, n_Y_i = ImplicitSparseDataset.__create_negative_samples(N, self.Y_u, self.Y_i, negative_sample_ratio)
            # Use the actual count: it can be lower than requested (see the sampler).
            negative_sample_size = n_Y_u.size(0)

            self.Y_u = torch.cat((self.Y_u, n_Y_u), dim=0)
            self.Y_i = torch.cat((self.Y_i, n_Y_i), dim=0)
            self.Y_value = torch.cat((self.Y_value, torch.zeros(negative_sample_size)), dim=0)
            self.len = self.len + negative_sample_size

    def __len__(self):
        """Return the number of samples (positives plus any negatives)."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at position ``idx``."""
        return self.Y_u[idx], self.Y_i[idx], self.Y_value[idx]

    def get_sparsity(self):
        """Return ``1 - positives / (M * N)`` as computed at construction time."""
        return self.sparsity

In [11]:
# Rebuild both datasets so the training set uses the redefined ImplicitSparseDataset.
train_dataset, val_dataset = create_dataset(
    '../ml-100k.train.csv',
    '../ml-100k.val.csv',
    user_number,
    item_number,
    args['negative_sample_ratio'],
)

# Training batches are reshuffled on every pass; validation is iterated in dataset order.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=args["batch_size"], shuffle=True)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=args["batch_size"], shuffle=False)

In [12]:
# Train dataset size after switching to per-positive negative sampling (positives + negatives).
len(train_loader.dataset)

394518

<b>Run 2</b>

In [195]:
# Run 2: negatives sampled once, proportional to each user's positives.
train(run_name="model_validation-run2")

The time elapse of epoch 001 is: 43.537853956222534
The time elapse of epoch 002 is: 36.81234622001648
The time elapse of epoch 003 is: 38.75852084159851
The time elapse of epoch 004 is: 36.98269605636597
The time elapse of epoch 005 is: 37.29785394668579
The time elapse of epoch 006 is: 36.2294819355011
The time elapse of epoch 007 is: 37.140844106674194
The time elapse of epoch 008 is: 36.70030212402344
The time elapse of epoch 009 is: 36.600587129592896
The time elapse of epoch 010 is: 35.73583912849426
The time elapse of epoch 011 is: 36.8011257648468
The time elapse of epoch 012 is: 36.28961396217346
The time elapse of epoch 013 is: 36.05220985412598
The time elapse of epoch 014 is: 37.043025970458984
The time elapse of epoch 015 is: 37.14964032173157
The time elapse of epoch 016 is: 36.62901711463928
The time elapse of epoch 017 is: 35.4528911113739
The time elapse of epoch 018 is: 34.6025128364563
The time elapse of epoch 019 is: 36.737536907196045
The time elapse of epoch 020 i

![](img/model_validation-run2-train.png)<br/>
![](img/model_validation-run2-val.png)<br/>

이번에는 정상적으로 학습이 되는 것을 확인하였지만 HR@10, mAP@10, nDCG@10의 최고점이 각각 `0.6182`, `0.3011`, `0.3758`로 논문의 결과보다 떨어진다.<br />
따라서 이번에는 매 epoch마다 negative sample을 새로 추출(re-sampling)하도록 변화를 주어 학습을 시도해본다.<br />

In [13]:
class ImplicitSparseDataset(data.Dataset):
    """Implicit-feedback dataset whose negatives can be re-drawn on demand.

    After construction the dataset holds only the observed positives (label 1.0).
    Each call to ``regenerate_negative_samples`` replaces the current negatives
    (label 0.0) with a fresh draw, so it can be invoked at the start of every epoch.
    """

    @staticmethod
    def __create_negative_samples(N, sparse_Y_u, sparse_Y_i, sample_ratio):
        """Sample unobserved items for every user that appears in ``sparse_Y_u``.

        For a user with ``p`` positives, ``p * sample_ratio`` distinct items are drawn
        without replacement from the items the user has not interacted with. If fewer
        candidates exist than requested, the user silently receives only the available
        candidates, so the total can be smaller than ``len(sparse_Y_u) * sample_ratio``.

        Args:
            N: number of items; candidates are ``0 .. N-1``.
            sparse_Y_u: 1-D tensor with the user id of each observed interaction.
            sparse_Y_i: 1-D tensor with the item id of each observed interaction.
            sample_ratio: negatives requested per positive.

        Returns:
            Two 1-D int64 tensors ``(users, items)`` describing the negative pairs.
        """
        all_items = torch.arange(N, dtype=torch.int32)

        # Collect one (k, 2) chunk per user and concatenate once at the end; concatenating
        # onto a growing tensor inside the loop re-copies all earlier rows every iteration.
        per_user_chunks = []

        for user_id in sparse_Y_u.unique():
            positive_items = sparse_Y_i[sparse_Y_u == user_id]
            negative_sample_size = len(positive_items) * sample_ratio

            possible_negative_items = all_items[torch.isin(all_items, positive_items, invert=True)]

            # A random permutation truncated to the requested size = sampling without
            # replacement; the slice caps the count at the number of candidates.
            random_indices = torch.randperm(len(possible_negative_items))[:negative_sample_size]

            negative_items = possible_negative_items[random_indices].unsqueeze(dim=1)
            # The integer fill value makes this column int64, so the concatenated
            # chunk is promoted to int64 as well.
            user_column = torch.full((negative_items.size(0), 1), user_id.item())

            per_user_chunks.append(torch.cat((user_column, negative_items), dim=1))

        if not per_user_chunks:
            # No users at all: return empty columns instead of failing on 2-D indexing.
            return torch.empty(0, dtype=torch.int64), torch.empty(0, dtype=torch.int64)

        negative_samples = torch.cat(per_user_chunks, dim=0)
        return negative_samples[:, 0], negative_samples[:, 1]

    def __init__(self, M, N, sparse_Y_u, sparse_Y_i, negative_sample_ratio=1):
        """Store the positives; negatives are added by ``regenerate_negative_samples``.

        Args:
            M: number of users (used only for the sparsity figure).
            N: number of items.
            sparse_Y_u: 1-D tensor with the user id of each observed interaction.
            sparse_Y_i: 1-D tensor with the item id of each observed interaction.
            negative_sample_ratio: negatives requested per positive on regeneration.
        """
        super(ImplicitSparseDataset, self).__init__()

        assert len(sparse_Y_u) == len(sparse_Y_i), "Length of `sparse_Y_u` and `sparse_Y_i` must be equal."

        self.M = M
        self.N = N
        self.negative_sample_ratio = negative_sample_ratio

        # p_* attributes hold the immutable positives.
        self.p_Y_u = sparse_Y_u
        self.p_Y_i = sparse_Y_i
        self.p_len = len(self.p_Y_u)
        self.p_Y_value = torch.ones(self.p_len)

        # The served view starts as positives only, until negatives are generated.
        self.Y_u = self.p_Y_u
        self.Y_i = self.p_Y_i
        self.len = self.p_len
        self.Y_value = self.p_Y_value

    def __len__(self):
        """Return the number of samples currently served (positives plus current negatives)."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at position ``idx``."""
        return self.Y_u[idx], self.Y_i[idx], self.Y_value[idx]

    def regenerate_negative_samples(self):
        """Discard the current negatives and draw a fresh set next to the positives."""
        n_Y_u, n_Y_i = ImplicitSparseDataset.__create_negative_samples(self.N, self.p_Y_u, self.p_Y_i, self.negative_sample_ratio)
        n_len = n_Y_u.size(0)
        n_Y_value = torch.zeros(n_len)

        # Always rebuild from the stored positives so repeated calls do not accumulate.
        self.Y_u = torch.cat((self.p_Y_u, n_Y_u), dim=0)
        self.Y_i = torch.cat((self.p_Y_i, n_Y_i), dim=0)
        self.len = self.p_len + n_len
        self.Y_value = torch.cat((self.p_Y_value, n_Y_value), dim=0)

    def get_sparsity(self):
        """Return the sparsity of the observed interaction matrix, ``1 - positives / (M * N)``.

        Only positives count: sampled negatives are not interactions, so the value does
        not change when ``regenerate_negative_samples`` grows ``len(self)``.
        """
        return 1 - (self.p_len / (self.M * self.N))

In [14]:
def create_dataset(train_csv, val_csv, _user_number, _item_number, negative_sample_ratio):
    """Build the training dataset (positives only at this point) and the validation dataset.

    Args:
        train_csv: path of a header-less CSV whose first two columns are user id and item id.
        val_csv: path handed unchanged to ``ValidationDataset``.
        _user_number: number of users.
        _item_number: number of items.
        negative_sample_ratio: forwarded to ``ImplicitSparseDataset``.

    Returns:
        ``(train_dataset, val_dataset)``.
    """
    # Columns 0 and 1 are parsed as int32 before the frame is converted to a tensor.
    interactions = torch.from_numpy(
        pd.read_csv(train_csv, dtype={0: np.int32, 1: np.int32}, header=None).values
    )
    users, items = interactions[:, 0], interactions[:, 1]

    training_set = ImplicitSparseDataset(
        _user_number,
        _item_number,
        users,
        items,
        negative_sample_ratio=negative_sample_ratio,
    )
    validation_set = ValidationDataset(val_csv)

    return training_set, validation_set

In [15]:
# Rebuild both datasets so the training set uses the re-sampling ImplicitSparseDataset.
train_dataset, val_dataset = create_dataset(
    '../ml-100k.train.csv',
    '../ml-100k.val.csv',
    user_number,
    item_number,
    args['negative_sample_ratio'],
)

# Training batches are reshuffled on every pass; validation is iterated in dataset order.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=args["batch_size"], shuffle=True)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=args["batch_size"], shuffle=False)

In [16]:
# Train dataset size before any negatives are generated: only the observed positives.
len(train_loader.dataset)

80000

In [17]:
# Train dataset size after one round of negative sampling (positives + freshly drawn negatives).
# regenerate_negative_samples() mutates the dataset in place, replacing any earlier negatives.
train_loader.dataset.regenerate_negative_samples()
len(train_loader.dataset)

394518

In [18]:
def train(run_name):
    """Train the global ``model`` with fresh negative samples at the start of every epoch.

    Reads the notebook-level globals ``model``, ``args``, ``DEVICE``, ``train_loader``
    and ``val_loader``. After each epoch the mean training loss and the validation
    metrics returned by ``evaluation.evaluate`` are written under ``runs/<run_name>``.

    Args:
        run_name: sub-directory of ``runs/`` that receives the TensorBoard events.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"])

    writer = SummaryWriter(log_dir="runs/" + run_name)
    try:
        for epoch in range(args["epochs"]):
            start_time = time.time()

            batch = 0
            loss_sum = 0
            model.train()
            # Re-draw the negatives before iterating, so this epoch's batches are built
            # from the new sample set.
            train_loader.dataset.regenerate_negative_samples()
            for user, item, label in train_loader:
                batch += 1

                user = user.to(DEVICE)
                item = item.to(DEVICE)
                # Labels become a column vector; presumably this matches the model's
                # (batch, 1) output — confirm against NeuMF.forward.
                label = label.reshape(-1, 1).to(DEVICE)

                optimizer.zero_grad()
                outputs = model(user, item)

                loss = criterion(outputs, label)
                loss_sum += loss.item()

                loss.backward()
                optimizer.step()

            # An empty loader yields no batches; skip the mean instead of dividing by zero.
            if batch:
                writer.add_scalar("train/loss", loss_sum / batch, epoch + 1)

            model.eval()
            with torch.no_grad():
                hr, mean_ap, norm_dcg = evaluation.evaluate(model, val_loader, args["eval_k"], device=DEVICE)
                writer.add_scalar("val/hr", hr, epoch + 1)
                writer.add_scalar("val/mAP", mean_ap, epoch + 1)
                writer.add_scalar("val/nDCG", norm_dcg, epoch + 1)

            elapsed_time = time.time() - start_time
            print("The time elapse of epoch {:03d} is: ".format(epoch + 1) + str(elapsed_time))
    finally:
        # Flush pending events and release the event file even if training is interrupted.
        writer.close()

<b>Run 3</b>

In [64]:
# Run 3: negatives re-sampled at the start of every epoch.
train(run_name="model_validation-run3")

The time elapse of epoch 001 is: 37.58870720863342
The time elapse of epoch 002 is: 37.18972325325012
The time elapse of epoch 003 is: 37.77529287338257
The time elapse of epoch 004 is: 38.025829792022705
The time elapse of epoch 005 is: 39.96527576446533
The time elapse of epoch 006 is: 40.833390951156616
The time elapse of epoch 007 is: 39.278671979904175
The time elapse of epoch 008 is: 37.38867998123169
The time elapse of epoch 009 is: 37.829949378967285
The time elapse of epoch 010 is: 37.66372084617615
The time elapse of epoch 011 is: 36.48789596557617
The time elapse of epoch 012 is: 36.897764921188354
The time elapse of epoch 013 is: 38.004019021987915
The time elapse of epoch 014 is: 38.76742506027222
The time elapse of epoch 015 is: 38.22804021835327
The time elapse of epoch 016 is: 37.607645988464355
The time elapse of epoch 017 is: 38.61990189552307
The time elapse of epoch 018 is: 37.97487688064575
The time elapse of epoch 019 is: 37.43840193748474
The time elapse of epoch

![](img/model_validation-run3-train.png)<br/>
![](img/model_validation-run3-val.png)<br/>

HR@10: `0.6336`<br/>
mAP@10: `0.316`<br/>
nDCG@10: `0.3905`<br/>

여전히 논문의 결과(HR@10=`0.671`, nDCG@10=`0.399`)에 살짝 못 미치지만, 데이터셋이 논문에서 사용한 1M이 아닌 100K 데이터셋이라는 점, hyper-parameter tuning이 되기 전 최초 학습이라는 점, 그리고 마지막 layer의 factor를 최소로 잡고 학습을 시켰다는 점 등을 고려하면 개선 작업을 통하여 서비스에 활용해볼 가치는 충분하다고 생각된다.