- Neural Collaborative Filtering [https://arxiv.org/pdf/1708.05031]
- Neural Collaborative Filtering 리뷰 [https://leehyejin91.github.io/post-ncf/]
- 최대우도법 (MLE) [https://angeloyeo.github.io/2020/07/17/MLE.html]
- PyTorch Embedding [https://wikidocs.net/64779]


In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn

# Pick the best available backend instead of assuming Apple-silicon "mps",
# so the notebook also runs on CUDA machines and on CPU-only machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 2. Preliminaries

- $U$: Set of Users
- $I$: Set of Items
- $M = |U|$; 즉, # of Users
- $N = |I|$; 즉, # of Items
- $Y$: user$\times$item 행렬 (shape=($M$, $N$)); $Y_{u,i}=1$은 user $u$와 item $i$ 간에 상호작용이 존재했음을 의미

> 상호작용이란 user가 item을 열람했거나, 구매했거나 등의 암시적인(implicit) 정보를 의미하며, 주의할 점은 이것이 명시적인(explicit) 선호를 뜻하진 않는다는 것이다.<br/>
> 따라서 $Y_{u,i}=0$ 은 상호작용이 없는 것이지, 해당 item을 비선호 한다는 의미는 아니다.<br/>

In [2]:
# Toy problem size: M users, N items, and k latent factors per embedding.
demo_M, demo_N = 4, 5
demo_k = 2

# Observed (user, item) interactions, one row per pair, stored column-wise.
demo_Y_sparse_indices_df = pd.DataFrame({
    'user_id': [0, 0, 0, 1, 1, 2, 2, 2, 3, 3],
    'item_id': [0, 2, 3, 1, 4, 0, 1, 2, 3, 4],
})

def build_Y(sparse_indices_df, _M, _N):
    """Build the binary user-item interaction matrix Y as a SciPy COO matrix.

    Args:
        sparse_indices_df: DataFrame with integer 'user_id' and 'item_id'
            columns; each row marks one observed interaction.
        _M: Number of users (rows of Y).
        _N: Number of items (columns of Y).

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(_M, _N)`` holding 1.0 at
        every observed (user, item) pair.
    """
    # COO matrices sum duplicate coordinates on conversion, so a repeated
    # (user, item) row would yield 2.0, 3.0, ... and break the binary labels.
    # De-duplicate first so that Y stays a 0/1 indicator matrix.
    pairs = sparse_indices_df[['user_id', 'item_id']].drop_duplicates()
    rows = pairs['user_id'].to_numpy()
    cols = pairs['item_id'].to_numpy()
    values = np.ones(len(pairs), dtype=np.float64)
    return sp.coo_matrix((values, (rows, cols)), shape=(_M, _N))

# Materialize the demo interaction matrix Y with shape (demo_M, demo_N).
demo_Y = build_Y(demo_Y_sparse_indices_df, demo_M, demo_N)
# Shown densely as the cell result so the 0/1 pattern can be inspected.
demo_Y.todense()

matrix([[1., 0., 1., 1., 0.],
        [0., 1., 0., 0., 1.],
        [1., 1., 1., 0., 0.],
        [0., 0., 0., 1., 1.]])

In [3]:
# Enumerate every (user, item) cell of Y in row-major order so the model can be
# trained on all M*N pairs, with the cell value of Y as the label.
demo_Y_dense = demo_Y.todense()
num_rows, num_cols = demo_Y.shape

demo_u, demo_i, demo_y = [], [], []
for u in range(num_rows):
    demo_u.extend([u] * num_cols)
    demo_i.extend(range(num_cols))
    # Wrap each label in its own list so the label tensor has shape (M*N, 1).
    demo_y.extend([demo_Y_dense[u, i]] for i in range(num_cols))

# Move the samples to the compute device; labels are cast to float.
demo_u = torch.tensor(demo_u).to(device)
demo_i = torch.tensor(demo_i).to(device)
demo_y = torch.tensor(demo_y).to(device, dtype=torch.float)

# 3. Neural Collaborative Filtering

## 3.1 General Framework Design

![](./img/mlp.png)

### 3.1.1. Embedding Layer

- $P$: shape=($M$, $k$)
- $Q$: shape=($N$, $k$)

In [4]:
# User lookup table P: one trainable k-dimensional latent vector per user.
demo_P = nn.Embedding(num_embeddings=demo_M, embedding_dim=demo_k) # weight shape=(M, k)
# Shown as the cell result to confirm the table shape.
demo_P.weight.shape

torch.Size([4, 2])

In [5]:
# Item lookup table Q: one trainable k-dimensional latent vector per item.
demo_Q = nn.Embedding(num_embeddings=demo_N, embedding_dim=demo_k) # weight shape=(N, k)
# Shown as the cell result to confirm the table shape.
demo_Q.weight.shape

torch.Size([5, 2])

- $v^U_u$: user $u$를 나타내는 one-hot 벡터; shape=($n$, $M$)
- $v^I_i$: item $i$를 나타내는 one-hot 벡터; shape=($n$, $N$)
<br/>
<br/>
- $p_u=v^U_uP$: shape=($n$, $k$): user latent vector
- $q_i=v^I_iQ$: shape=($n$, $k$): item latent vector

In [6]:
# Batch size n: one sample per observed (user, item) interaction row.
demo_n = len(demo_Y_sparse_indices_df)
demo_n

10

In [7]:
# User latent vectors: look up the row of P for each interaction's user_id.
demo_p_u = demo_P(torch.tensor(demo_Y_sparse_indices_df['user_id'])) # shape=(n, k)
demo_p_u.shape

torch.Size([10, 2])

In [8]:
# Item latent vectors: look up the row of Q for each interaction's item_id.
# NOTE(review): despite the "_u" suffix, this variable holds item vectors (q_i).
demo_q_u = demo_Q(torch.tensor(demo_Y_sparse_indices_df['item_id'])) # shape=(n, k)
demo_q_u.shape

torch.Size([10, 2])

### 3.1.2. Neural CF Layers

$$
\Phi_1(p_u, q_i) = [p_u, q_i]
$$

In [9]:
# First NCF layer: concatenate user and item latent vectors along the last axis.
demo_phi_1 = torch.cat((demo_p_u, demo_q_u), -1) # shape=(n, k+k)
demo_phi_1.shape

torch.Size([10, 4])

논문에 따르면
- Bottom Layer가 가장 넓고, 층을 올라갈수록 Neuron 수를 절반씩 줄여나가는 Tower Pattern으로 구현
- 활성화 함수로 ReLU를 사용하는 것이 결과적으로 조금 더 나은 성능을 보임

추가적으로 Hidden Layer에 성능을 높이기 위해 Dropout Layer를 추가함

In [10]:
def create_layer(_input_size, dropout_prob=0.5):
    """Create one tower stage that halves the feature width.

    Args:
        _input_size: Number of input features for the stage.
        dropout_prob: Drop probability of the leading Dropout layer.

    Returns:
        A tuple ``(stage, output_size)`` where ``stage`` is a Dropout ->
        Linear -> ReLU ``nn.Sequential`` and ``output_size`` is
        ``_input_size // 2``.
    """
    halved_size = _input_size // 2
    stage_modules = [
        nn.Dropout(p=dropout_prob),
        nn.Linear(_input_size, halved_size),
        nn.ReLU(),
    ]
    return nn.Sequential(*stage_modules), halved_size

# Number of hidden stages in the demo tower.
num_layers = 1

# The tower input is the concatenated (user, item) vector, hence k + k features.
input_size = demo_k * 2
layers = []
for i in range(num_layers):
    # Each stage halves the width; input_size is updated to the new width,
    # so after the loop it holds the tower's final output size.
    layer, input_size = create_layer(input_size)
    layers.append(layer)
# Chain the stages and run the concatenated latent vectors through them.
demo_phi_X = nn.Sequential(*layers)(demo_phi_1)
demo_phi_X.shape



torch.Size([10, 2])

### 3.1.3. Output Layer

In [11]:
# Output layer: project the tower output (input_size features) to one raw score per sample.
phi_out = nn.Linear(input_size, 1)(demo_phi_X)
phi_out

tensor([[-0.4513],
        [-0.4513],
        [-0.5988],
        [-0.4513],
        [-0.5849],
        [ 0.1692],
        [-0.5125],
        [-0.5742],
        [-0.3045],
        [-0.4513]], grad_fn=<AddmmBackward0>)

### 3.1.4. General Framework Implementation

In [12]:
class NCFFramework(nn.Module):
    """General NCF framework: user/item embeddings -> MLP tower -> one logit.

    The tower has ``ncfl_num_layers`` stages, each halving the feature width,
    and ends at ``ncfl_out_size`` features. The input width, and therefore the
    embedding size, is derived from those two values. ``forward`` returns the
    raw output of the final linear layer; no sigmoid is applied.
    """

    @staticmethod
    def __create_layer(in_size, dropout_prob=0.5):
        """Return one Dropout -> Linear -> ReLU stage and its output width."""
        halved = in_size // 2
        stage = nn.Sequential(
            nn.Dropout(p=dropout_prob),
            nn.Linear(in_size, halved),
            nn.ReLU(),
        )
        return stage, halved

    def __init__(self, M, N, ncfl_num_layers, ncfl_out_size, dropout_prob=0.5):
        """Build the embeddings, the tower and the output layer.

        Args:
            M: Number of users (rows of the user embedding table).
            N: Number of items (rows of the item embedding table).
            ncfl_num_layers: Number of tower stages; must be at least 1.
            ncfl_out_size: Feature width produced by the last tower stage.
            dropout_prob: Drop probability used in every tower stage.
        """
        super().__init__()

        assert ncfl_num_layers >= 1, "Neural CF layers should have at least one layer."

        self.M = M
        self.N = N
        self.ncfl_num_layers = ncfl_num_layers
        self.ncfl_out_size = ncfl_out_size

        # Every stage halves the width, so the input must be 2**layers times wider
        # than the output; the two embeddings each contribute half of that input.
        self.ncfl_in_size = ncfl_out_size * 2 ** ncfl_num_layers
        self.embedding_dim = self.ncfl_in_size // 2

        self.P = nn.Embedding(num_embeddings=M, embedding_dim=self.embedding_dim)
        self.Q = nn.Embedding(num_embeddings=N, embedding_dim=self.embedding_dim)

        tower = []
        width = self.ncfl_in_size
        for _ in range(ncfl_num_layers):
            stage, width = NCFFramework.__create_layer(width, dropout_prob)
            tower.append(stage)
        self.phi_X = nn.Sequential(*tower)

        self.phi_out = nn.Linear(width, 1)

        self.__init_weights()

    def __init_weights(self):
        """Re-initialize embeddings, tower layers and the output layer."""
        for table in (self.P, self.Q):
            nn.init.normal_(table.weight, std=0.01)

        # modules() walks the nested Sequentials in registration order.
        for module in self.phi_X.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

        nn.init.kaiming_uniform_(self.phi_out.weight, nonlinearity="sigmoid")
        nn.init.zeros_(self.phi_out.bias)

    def forward(self, user_id, item_id):
        """Return one raw score per (user_id, item_id) pair."""
        latent = torch.cat((self.P(user_id), self.Q(item_id)), dim=-1)
        return self.phi_out(self.phi_X(latent))

In [13]:
# Three halving stages ending at 32 features imply a 256-wide tower input,
# i.e. 128-dimensional user and item embeddings (see the printed structure).
demo_model = NCFFramework(demo_M, demo_N, ncfl_num_layers=3, ncfl_out_size=32, dropout_prob=0.3).to(device)
demo_model

NCFFramework(
  (P): Embedding(4, 128)
  (Q): Embedding(5, 128)
  (phi_X): Sequential(
    (0): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=256, out_features=128, bias=True)
      (2): ReLU()
    )
    (1): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=128, out_features=64, bias=True)
      (2): ReLU()
    )
    (2): Sequential(
      (0): Dropout(p=0.3, inplace=False)
      (1): Linear(in_features=64, out_features=32, bias=True)
      (2): ReLU()
    )
  )
  (phi_out): Linear(in_features=32, out_features=1, bias=True)
)

In [14]:
# Call the module itself rather than .forward(): nn.Module.__call__ wraps
# forward() and also runs any registered hooks, which a direct call skips.
demo_y_hat = demo_model(demo_u, demo_i)
demo_y_hat

tensor([[ 0.0040],
        [-0.0008],
        [ 0.0277],
        [ 0.0113],
        [ 0.0110],
        [ 0.0032],
        [ 0.0080],
        [ 0.0104],
        [ 0.0105],
        [ 0.0169],
        [ 0.0109],
        [ 0.0137],
        [ 0.0146],
        [ 0.0135],
        [ 0.0019],
        [ 0.0080],
        [ 0.0071],
        [ 0.0109],
        [ 0.0113],
        [ 0.0237]], device='mps:0', grad_fn=<LinearBackward0>)

### 3.1.5. Learning


상호작용 여부를 예측하는 Binary Classification 문제이기 때문에 대표적으로 사용되는 Binary Cross Entropy Loss를 사용한다.<br/>
모델의 output이 Sigmoid 활성화 함수를 거치지 않은 logits이기 때문에 Sigmoid 레이어가 내재화된 BCEWithLogitsLoss 손실함수를 사용한다.<br/>

In [15]:
# BCEWithLogitsLoss applies the sigmoid internally, so it takes the model's raw scores.
demo_loss_function = nn.BCEWithLogitsLoss()
# Loss of the model on all (user, item) pairs, as a Python float.
demo_loss_function(demo_y_hat, demo_y).item()

0.6915115118026733

In [16]:
# Hyperparameters for the full-batch demo run.
learning_rate = 0.01
epochs = 100

demo_optimizer = torch.optim.Adam(demo_model.parameters(), lr=learning_rate)

# Put the model in training mode before optimizing.
demo_model.train()

for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    demo_model.zero_grad()

    # Forward pass over every (user, item) pair, then the loss on raw scores.
    demo_y_hat = demo_model(demo_u, demo_i)
    loss = demo_loss_function(demo_y_hat, demo_y)

    # Log this epoch's loss; the value is fixed once the forward pass is done.
    print(loss.item())

    loss.backward()
    demo_optimizer.step()

0.6929309964179993
0.6916479468345642
0.6865730285644531
0.6881486773490906
0.6858047842979431
0.6703415513038635
0.6391724944114685
0.664279580116272
0.6496123671531677
0.5650702714920044
0.548984944820404
0.5117141008377075
0.4072394371032715
0.43329548835754395
0.31386232376098633
0.28013715147972107
0.27188488841056824
0.22473230957984924
0.24421150982379913
0.10248155891895294
0.06292670220136642
0.05329655483365059
0.088319793343544
0.1314503401517868
0.013704395852982998
0.0022017862647771835
0.01306657213717699
0.00787320639938116
0.3810669481754303
0.0012017246335744858
0.0023655544500797987
0.0002806294651236385
0.09312061220407486
0.2558099925518036
0.008946609683334827
0.002869856311008334
0.017996544018387794
0.18198277056217194
0.06985878199338913
0.006510592997074127
0.30160093307495117
0.06476017832756042
0.0037618314381688833
0.1319386214017868
0.00021823789575137198
0.020836306735873222
0.06986329704523087
0.03938419744372368
0.005239080172032118
0.14293740689754486
0

In [17]:
# Switch to evaluation mode before scoring every (user, item) pair.
demo_model.eval()
# Call the module itself rather than .forward(): nn.Module.__call__ wraps
# forward() and also runs any registered hooks, which a direct call skips.
demo_prediction = demo_model(demo_u, demo_i)
# The model returns raw scores; sigmoid maps them into (0, 1).
torch.sigmoid(demo_prediction)

tensor([[1.0000e+00],
        [2.8731e-05],
        [1.0000e+00],
        [9.9998e-01],
        [6.0938e-05],
        [7.9006e-06],
        [1.0000e+00],
        [5.5600e-05],
        [7.3863e-04],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [9.2660e-04],
        [5.3489e-03],
        [1.0757e-08],
        [6.5150e-04],
        [5.1420e-09],
        [9.9933e-01],
        [9.9991e-01]], device='mps:0', grad_fn=<SigmoidBackward0>)

## 3.2. Neural Matrix Factorization (Generalized Matrix Factorization, GMF + Multi-Layer Perceptron, MLP)

![](./img/neumf.png)

In [25]:
class NeuMF(nn.Module):
    """Neural Matrix Factorization: a GMF branch fused with an MLP branch.

    The GMF branch multiplies user and item embeddings element-wise. The MLP
    branch concatenates separate user and item embeddings and passes them
    through a tower of halving stages. Each branch yields
    ``predictive_factor_num // 2`` features; the two are concatenated and
    projected to a single raw score by a final linear layer (no sigmoid).
    """

    @staticmethod
    def create_layer(in_size, dropout_prob=0.5):
        """Return one Dropout -> Linear -> ReLU stage and its output width."""
        halved = in_size // 2
        stage = nn.Sequential(
            nn.Dropout(p=dropout_prob),
            nn.Linear(in_size, halved),
            nn.ReLU(),
        )
        return stage, halved

    def __init__(self, M, N, predictive_factor_num, mlp_num_layers, dropout_prob=0.5):
        """Build both branches and the fusion layer.

        Args:
            M: Number of users (rows of each user embedding table).
            N: Number of items (rows of each item embedding table).
            predictive_factor_num: Input width of the fusion layer; must be
                even because it is split equally between GMF and MLP.
            mlp_num_layers: Number of MLP tower stages; must be at least 1.
            dropout_prob: Drop probability used in every MLP stage.
        """
        super().__init__()

        assert predictive_factor_num % 2 == 0, "Number of predictive factor should be divisible by 2."
        assert mlp_num_layers >= 1, "MLP should have at least one layer."

        self.M = M
        self.N = N

        # Each branch contributes half of the predictive factors.
        branch_width = predictive_factor_num // 2

        # --- MLP branch ---
        # Every stage halves the width, so the tower input is 2**layers times
        # wider than its output; each embedding supplies half of that input.
        tower_in = branch_width * 2 ** mlp_num_layers
        self.mlp_embedding_dim = tower_in // 2
        self.mlp_P = nn.Embedding(num_embeddings=M, embedding_dim=self.mlp_embedding_dim)
        self.mlp_Q = nn.Embedding(num_embeddings=N, embedding_dim=self.mlp_embedding_dim)

        tower = []
        width = tower_in
        for _ in range(mlp_num_layers):
            stage, width = NeuMF.create_layer(width, dropout_prob)
            tower.append(stage)
        self.mlp_layer_X = nn.Sequential(*tower)

        # --- GMF branch ---
        self.gmf_embedding_dim = branch_width
        self.gmf_P = nn.Embedding(num_embeddings=M, embedding_dim=self.gmf_embedding_dim)
        self.gmf_Q = nn.Embedding(num_embeddings=N, embedding_dim=self.gmf_embedding_dim)

        # Fusion layer over the concatenated [GMF, MLP] features.
        self.neu_mf = nn.Linear(predictive_factor_num, 1)

        self.__init_weights()

    def __init_weights(self):
        """Re-initialize all embeddings, the MLP tower and the fusion layer."""
        for table in (self.gmf_P, self.gmf_Q, self.mlp_P, self.mlp_Q):
            nn.init.normal_(table.weight, std=0.01)

        # modules() walks the nested Sequentials in registration order.
        for module in self.mlp_layer_X.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

        nn.init.kaiming_uniform_(self.neu_mf.weight, nonlinearity="sigmoid")
        nn.init.zeros_(self.neu_mf.bias)

    def forward(self, user_id, item_id):
        """Return one raw score per (user_id, item_id) pair."""
        # GMF: element-wise product of the user and item embeddings.
        gmf_out = self.gmf_P(user_id) * self.gmf_Q(item_id)

        # MLP: concatenated embeddings passed through the tower.
        mlp_in = torch.cat((self.mlp_P(user_id), self.mlp_Q(item_id)), dim=-1)
        mlp_out = self.mlp_layer_X(mlp_in)

        fused = torch.cat((gmf_out, mlp_out), dim=-1)
        return self.neu_mf(fused)

In [26]:
# Hyperparameters for the full-batch NeuMF demo run.
learning_rate = 0.01
epochs = 100

# 8 predictive factors are split evenly between the GMF and MLP branches.
demo_ncf_model = NeuMF(demo_M, demo_N, predictive_factor_num=8, mlp_num_layers=3).to(device)

# The model outputs raw scores, so use the loss with the sigmoid built in.
demo_ncf_loss_function = nn.BCEWithLogitsLoss()
demo_ncf_optimizer = torch.optim.Adam(demo_ncf_model.parameters(), lr=learning_rate)

# Put the model in training mode before optimizing.
demo_ncf_model.train()

for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    demo_ncf_model.zero_grad()

    # Forward pass over every (user, item) pair, then the loss on raw scores.
    demo_y_hat = demo_ncf_model(demo_u, demo_i)
    loss = demo_ncf_loss_function(demo_y_hat, demo_y)

    # Log this epoch's loss; the value is fixed once the forward pass is done.
    print(loss.item())

    loss.backward()
    demo_ncf_optimizer.step()

0.6937944889068604
0.6931107640266418
0.6945368647575378
0.6925746202468872
0.6922988295555115
0.6909939646720886
0.6915990114212036
0.6902710199356079
0.6903945207595825
0.6832013130187988
0.6844273805618286
0.6832268834114075
0.6834337115287781
0.6690720915794373
0.6843172907829285
0.6848973631858826
0.6755701303482056
0.6706559062004089
0.7047848701477051
0.6637685298919678
0.6622054576873779
0.6623641848564148
0.6597537994384766
0.672028660774231
0.6179949641227722
0.6390998363494873
0.6626035571098328
0.6291914582252502
0.6434594988822937
0.6338084936141968
0.598013162612915
0.6148225665092468
0.6113141775131226
0.5853621959686279
0.5767350792884827
0.5816375613212585
0.5522779822349548
0.5281620621681213
0.5018259882926941
0.5415157675743103
0.5321045517921448
0.5060431361198425
0.49864616990089417
0.45321089029312134
0.4529767632484436
0.42777082324028015
0.43425512313842773
0.4173538386821747
0.4217115342617035
0.4546630382537842
0.3612145483493805
0.3771856725215912
0.36658149