In [223]:
import os
import pandas as pd
import numpy as np
import torch
from typing import Tuple
from tqdm import tqdm
from torch.utils.data import Dataset
import torch.nn as nn

In [3]:
def load_data(data_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_df = pd.read_csv(os.path.join(data_dir, 'train/train_ratings.csv')).drop('time', axis=1)
    sub_df = pd.read_csv(os.path.join(data_dir, 'eval/sample_submission.csv'))
    
    return train_df, sub_df

In [4]:
train_df, sub_df = load_data("../../data/")
train_df

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [102]:
train_df['user'].value_counts()

user
1916     2912
12416    1980
5563     1842
7417     1830
20832    1795
         ... 
2528       32
29208      28
24060      22
15643      19
12135      16
Name: count, Length: 31360, dtype: int64

In [300]:
def process_data(train_df: pd.DataFrame, max_len: int, k: int, n_neg_samples: int) -> Tuple[dict, int, int, dict]:
    item_idx = train_df['item'].unique()
    user_idx = train_df['user'].unique()
    
    user2idx = {user:idx for idx,user in enumerate(user_idx)}
    item2idx = {item:idx+1 for idx,item in enumerate(item_idx)}
    idx2item = {idx+1:item for idx,item in enumerate(item_idx)}
    n_items = len(item2idx)
    n_users = len(user2idx)
    
    train_df['user'] = train_df['user'].map(user2idx)
    train_df['item'] = train_df['item'].map(item2idx)
    
    total = train_df.groupby('user')['item'].apply(np.array)
    train_seq = list()
    valid_seq = list()
    valid_target = list()
    infer_seq = list()
    for user_idx, user_total in enumerate(tqdm(total)):
        # user_valid_target: 맨 뒤에서 5개, 중간에서 5개 추출
        user_valid_target = np.random.choice(user_total[:-(k//2)], (k//2), replace=False)
        user_valid_target = np.append(user_valid_target, user_total[-(k-k//2):])
        valid_target.append(user_valid_target)
        
        # user_total_train: user_valid_target 제외
        user_total_train = user_total[~np.isin(user_total, user_valid_target)]
        # max_len만큼 여러번 샘플링해서 train_seq에 추가
        for _ in range(1):
            user_train_seq_idx = np.random.choice(
                np.arange(0, user_total_train.size), min(user_total_train.size, max_len), replace=False)
            user_train_seq_idx = np.sort(user_train_seq_idx)
            train_sample = user_total_train[user_train_seq_idx]
            train_seq.append(train_sample)
        
        temp_valid_seq_idx = np.sort(np.random.choice(
            np.arange(0, user_total_train.size), min(user_total_train.size, max_len-k), replace=False))
        temp_valid_seq = user_total_train[temp_valid_seq_idx]
        user_valid_seq = np.zeros(temp_valid_seq.size+k, dtype=int)
        user_valid_seq[-k//2:] = n_items+1
        idx = np.sort(
            np.random.choice(np.arange(0, temp_valid_seq.size+(k//2)), k//2, replace=False))
        user_valid_seq[idx] = n_items+1
        user_valid_seq[user_valid_seq == 0] = temp_valid_seq
        if user_valid_seq.size < max_len :
            pad_len = max_len - user_valid_seq.size
            user_valid_seq = np.append([0]*pad_len, user_valid_seq)
        valid_seq.append(user_valid_seq)
        
        temp_infer_seq_idx = np.sort(np.random.choice(
            np.arange(0, user_total.size), min(user_total.size, max_len-k), replace=False))
        temp_infer_seq = user_total[temp_infer_seq_idx]
        user_infer_seq = np.zeros(temp_infer_seq.size+k, dtype=int)
        user_infer_seq[-k//2:] = n_items+1
        idx = np.sort(
            np.random.choice(np.arange(0, temp_infer_seq.size+(k//2)), k//2, replace=False))
        user_infer_seq[idx] = n_items+1
        user_infer_seq[user_infer_seq == 0] = temp_infer_seq
        if user_infer_seq.size < max_len :
            pad_len = max_len - user_infer_seq.size
            user_infer_seq = np.append([0]*pad_len, user_infer_seq)
        infer_seq.append(user_infer_seq)
        
    data = {'train': train_seq,
            'valid': valid_seq,
            'valid_target': valid_target,
            'infer': infer_seq}
    
    return data, n_users, n_items, idx2item

In [301]:
data, n_users, n_items, idx2item = process_data(train_df, 50, 10, 100)

100%|██████████| 31360/31360 [00:04<00:00, 6408.46it/s]


In [302]:
len(data['train'])

31360

In [310]:
for idx, seq in enumerate(data['train']):
    if seq.size < 20:
        print(idx)
        print(seq)

12135
[ 83 375 206 160 314 293]
15643
[ 286  272   73 1908 6408   74  771  765 6440]
24060
[1629  629 5073 5103 3246 3456 5092 2687 5072 6685 4178 4311]
29208
[6007 3557 1793 1187 4264 5407 5211 2713 1765 3239 1646   47 4175 5014
 5715 3982 5444 4160]


In [311]:
print(data['train'][12135].size)
print(data['train'][12135])

6
[ 83 375 206 160 314 293]


In [312]:
print(data['valid'][12135].size)
print(data['valid'][12135])

50
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   83 6808  375  206 6808  160 6808 6808
  314 6808  293 6808 6808 6808 6808 6808]


In [313]:
print(data['valid_target'][12135])

[ 310  201 1203  199  264  266  631 1420  601  444]


In [314]:
print(data['infer'][12135].size)
print(data['infer'][12135])

50
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 6808  199 6808   83
  375 1203  206 6808  264  160  310  314  293  201  266  631 1420 6808
 6808  601  444 6808 6808 6808 6808 6808]


In [315]:
class BERT4RecDataset(Dataset):
    def __init__(self,
                 train_data: pd.Series,
                 n_users: int,
                 n_items: int,
                 max_len: int,
                 k:int,
                 mask_prob: float):
        self.train_data = train_data
        self.n_users = n_users
        self.n_items = n_items
        self.max_len = max_len
        self.k = k
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, user_idx: int) -> Tuple[torch.tensor, torch.tensor]: 
        seq = self.train_data[user_idx]
        masked_seq = seq.copy()
        labels = np.zeros_like(seq)
        # for item_idx in seq[:-(self.k//2)]:
        #     prob = np.random.random()
        #     if prob < self.mask_prob:
        #         labels.append(item_idx)  # 학습에 사용
        #         masked_seq.append(self.n_items+1)
        #     else:
        #         labels.append(0)  # 학습에 사용 X
        #         masked_seq.append(item_idx)
        # labels.extend(seq[-(self.k//2):])
        # masked_seq.extend([self.n_items+1]*(self.k//2))
        
        mask_idx = np.random.choice(np.arange(0, seq.size-(self.k//2)), int(seq.size*self.mask_prob))
        # 중간 랜덤 5개
        masked_seq[mask_idx] = self.n_items+1
        # 마지막 5개
        masked_seq[-(self.k//2):] = self.n_items+1
        labels[mask_idx] = seq[mask_idx]
        labels[-(self.k//2):] = seq[-(self.k//2):]
                
        # zero padding
        if seq.size < self.max_len:
            pad_len = self.max_len - seq.size
            masked_seq = np.append([0] * pad_len, masked_seq)
            labels = np.append([0] * pad_len, labels)
        
        masked_seq = torch.LongTensor(masked_seq)
        labels = torch.LongTensor(labels)
        
        return masked_seq, labels

In [316]:
dataset = BERT4RecDataset(data['train'], n_users, n_items, 50, 10, 0.1)

In [317]:
len(dataset)

31360

In [319]:
dataset[12135]

(tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,   83, 6808, 6808, 6808,
         6808, 6808]),
 tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0, 375, 206, 160, 314, 293]))

In [320]:
label = dataset[12135][1]
label

tensor([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 375, 206, 160, 314, 293])

In [169]:
label[label != 0]

tensor([ 14,  92, 158, 270, 284, 344, 351, 357, 364, 371])

In [156]:
data['valid'][20]

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          356, 1421, 1423, 1415,  666,  255, 1404, 1400,  743, 1424,  261,   81,
          859,  266,  267,  199,   85,  268, 1420,  741,  735,  314,  667,   80,
         1143,  732,  600, 1087,  602,   47,  554,  265,  203,  625,  739, 1427,
         1069, 6808]])

In [172]:
x = torch.tensor([5, 4, 2, 3, 6, 1])
x.topk(3, largest=True)

torch.return_types.topk(
values=tensor([6, 5, 4]),
indices=tensor([4, 0, 1]))

In [321]:
import torch
import torch.nn as nn


class EncoderLayer(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 n_heads: int,
                 pffn_hidden_dim: int,
                 dropout_rate: float):
        super(EncoderLayer, self).__init__()        
        self.mha = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=n_heads, dropout=dropout_rate)
        self.pffn = nn.Sequential(
                    nn.Linear(embed_dim, pffn_hidden_dim),
                    nn.GELU(),
                    nn.Linear(pffn_hidden_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        
    def forward(self, embed_seq, padding_mask):
        """
        attention -> residual connection -> pointwise feed forward network -> residual connection
        input: size(batch_size, max_len, embed_dim)
        output: size(batch_size, max_len, embed_dim)
        """
        embed_seq = embed_seq.transpose(0, 1)
        mha_out, _ = self.mha(embed_seq, embed_seq, embed_seq, key_padding_mask=padding_mask)
        mha_out = mha_out.transpose(0, 1)
        mha_out = self.layer_norm1(self.dropout(mha_out) + embed_seq.transpose(0, 1))
        
        pffn_out = self.pffn(mha_out)
        out = self.layer_norm2(self.dropout(pffn_out) + mha_out)
        
        return out
    
    
class BERT4Rec(nn.Module):
    def __init__(self,
                 n_items: int,
                 embed_dim: int,
                 max_len: int,
                 n_layers: int,
                 n_heads: int,
                 pffn_hidden_dim: int,
                 dropout_rate: float,
                 device: torch.device):
        super(BERT4Rec, self).__init__()
        self.max_len = max_len
        self.n_layers = n_layers
        self.device = device
        
        self.item_embed = nn.Embedding(n_items+2, embed_dim, padding_idx=0)
        self.pos_embed = nn.Embedding(max_len, embed_dim)
        
        self.encoder_layer = nn.ModuleList(
            [EncoderLayer(embed_dim, n_heads, pffn_hidden_dim, dropout_rate)
             for _ in range(self.n_layers)]
            )
        
        self.out_layer = nn.Linear(embed_dim, n_items+1)
        
        
    def embedding_layer(self, seq: torch.tensor) -> torch.tensor:
        """
        input: shape(batch_size, max_len)
        output: shape(batch_size, max_len, embed_dim)
        """
        item_embed = self.item_embed(seq)
        pos = torch.arange(self.max_len, device=self.device).unsqueeze(0)
        pos_embed = self.pos_embed(pos).repeat(item_embed.size(0), 1, 1)
        embed_seq = item_embed + pos_embed
        
        return embed_seq
    
    def forward(self, seq: torch.tensor) -> torch.tensor:
        """
        embedding -> encoder -> output
        input: shape(batch_size, max_len)
        output: shape(batch_size, max_len, n_items+1)
        """
        embed_seq = self.embedding_layer(seq)
        
        padding_mask = (seq==0).bool().to(self.device)
        out = embed_seq
        for block in self.encoder_layer:
            out = block(out, padding_mask)
            
        out = self.out_layer(out)
        
        return out

In [235]:
def metric(output, target):
    """
    Computes Recall@k for the specified values of k

    Args:
    output (torch.Tensor): model's output probabilities, size (batch_size, max_len, num_classes)
    target (torch.Tensor): ground truth, size (batch_size, max_len)

    Returns:
    float, recall value
    """
    output_reshaped = output.view(-1, output.size(-1))  # Reshape output to (batch_size * max_len, num_classes)
    target_reshaped = (target - 1).view(-1)  # Reshape target to (batch_size * max_len,)
    
    criterion = nn.CrossEntropyLoss(ignore_index=-1)  # ignore padding index
    loss = criterion(output_reshaped, target_reshaped)

    mask_idx = (target != 0)  # get indices of non-zero target elements
    print(mask_idx)

    output = output[mask_idx]  # filter output by these indices
    target = target[mask_idx] - 1  # filter target by these indices
    print(output)
    print(f"target: {target}")
    _, pred = output.topk(1, dim=-1)  # get top k predictions
    pred = pred.squeeze(-1)
    print(f"pred: {pred}")
    correct = pred.eq(target)  # compare predictions to target

    recall = correct.sum().item() / correct.size(0)  # calculate recall

    return recall, loss


In [236]:
# logits for each item
output = torch.tensor([
    [[0.1, 0.2, 0.7], [0.3, 0.2, 0.5], [0.1, 0.3, 0.6], [0.2, 0.1, 0.7]],  # batch 1
    [[0.5, 0.3, 0.2], [0.2, 0.1, 0.7], [0.4, 0.3, 0.3], [0.6, 0.4, 0.0]],  # batch 2
    [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5], [0.3, 0.1, 0.6], [0.4, 0.7, 0.1]],  # batch 3
    [[0.6, 0.3, 0.1], [0.2, 0.1, 0.7], [0.1, 0.6, 0.3], [0.7, 0.3, 0.0]],  # batch 4
    [[0.3, 0.3, 0.4], [0.2, 0.2, 0.6], [0.5, 0.2, 0.3], [0.4, 0.6, 0.0]]   # batch 5
])

# true classes for each item, with some items unmasked (represented by 0)
target = torch.tensor([
    [1, 0, 3, 0],  # batch 1
    [2, 3, 0, 0],  # batch 2
    [0, 1, 0, 3],  # batch 3
    [1, 0, 3, 0],  # batch 4
    [0, 3, 0, 2]   # batch 5
])

recall, loss = metric(output, target)
print("Loss", loss)
print("Recall:", recall)


tensor([[0.1000, 0.2000, 0.7000],
        [0.3000, 0.2000, 0.5000],
        [0.1000, 0.3000, 0.6000],
        [0.2000, 0.1000, 0.7000],
        [0.5000, 0.3000, 0.2000],
        [0.2000, 0.1000, 0.7000],
        [0.4000, 0.3000, 0.3000],
        [0.6000, 0.4000, 0.0000],
        [0.1000, 0.2000, 0.7000],
        [0.2000, 0.3000, 0.5000],
        [0.3000, 0.1000, 0.6000],
        [0.4000, 0.7000, 0.1000],
        [0.6000, 0.3000, 0.1000],
        [0.2000, 0.1000, 0.7000],
        [0.1000, 0.6000, 0.3000],
        [0.7000, 0.3000, 0.0000],
        [0.3000, 0.3000, 0.4000],
        [0.2000, 0.2000, 0.6000],
        [0.5000, 0.2000, 0.3000],
        [0.4000, 0.6000, 0.0000]])
tensor([ 0, -1,  2, -1,  1,  2, -1, -1, -1,  0, -1,  2,  0, -1,  2, -1, -1,  2,
        -1,  1])
tensor([[ True, False,  True, False],
        [ True,  True, False, False],
        [False,  True, False,  True],
        [ True, False,  True, False],
        [False,  True, False,  True]])
tensor([[0.1000, 0.2000, 0.7000