# BERT4REC with genres embed

#### github 원본 출처
* BERT4Rec
* github 저장소 : constantfear/bert4rec
* github link : https://github.com/constantfear/bert4rec?tab=readme-ov-file
* paper link : https://arxiv.org/pdf/1904.06690

### 1. Import Libraries

In [0]:
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pyspark.sql.functions import split, array_contains, col
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import pickle

import warnings

#### 1.1 Make configuration

In [0]:
# Hyper-parameters shared by the dataset, the model and the training loop.
config = {
    'max_len' : 80,  # sequence length fed to the model (shorter histories are left-padded with 0)
    'hidden_units' : 256,  # embedding / transformer hidden size (passed as bert_hidden_units)
    'num_heads' : 2,  # attention heads per transformer block
    'num_layers': 2,  # number of stacked transformer blocks
    'dropout_rate' : 0.1,  # dropout probability used throughout the model
    'lr' : 0.001,  # learning rate handed to the Adam optimizer
    'batch_size' : 64,  # DataLoader batch size
    'num_epochs' : 10,  # number of passes over the training DataLoader
    'num_workers' : 0,  # DataLoader worker processes (0 = load in the main process)
    'mask_prob' : 0.15,  # probability that a token is selected for masking/noise in BERTRecDataSet
}

In [0]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

### 2. Data Preprocessing

#### 2.1 Make Train Data

* train, validation 데이터 로드 및 통합
* 사용자별 시청 영화 개수 필터링
* 영화 장르 정보 처리
* 사용자/아이템 인코더 및 디코더 생성
* 사용자별 시청 시퀀스 데이터 생성 (학습/검증용 분리)

In [0]:
class MakeSequenceDataSet():
    """
    Build per-user, chronologically ordered item sequences for training.

    The train and validation rating tables are read through Spark, restricted
    to ratings >= 4, merged, and converted into:

    * ``user_train``  - {user_idx: [item_idx, ...]} (every item except the last)
    * ``genres_seq``  - {user_idx: [[0/1 genre vector], ...]} aligned with ``user_train``
    * ``user_valid``  - {user_idx: item_idx} (the last item of each user)

    Item indices start at 1; index 0 is reserved for padding (and is also what
    a movieId missing from the movies table is mapped to).

    NOTE: the train and test dataset classes must encode genres identically,
    otherwise the genre features seen at evaluation time differ from training.
    """

    # Genre vocabulary; the order defines the column order of every genre vector.
    GENRES = (
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    )

    # Alternative spellings accepted for a genre column. Recent MovieLens data
    # files tag children's movies as "Children", so matching only "Children's"
    # would leave that column permanently zero.
    GENRE_ALIASES = {"Children's": ("Children's", "Children")}

    # Users with fewer positive interactions than this are dropped.
    MIN_INTERACTIONS = 4

    def __init__(self):

        print('Reading data...')

        # Read through Spark, keep only ratings >= 4, then convert to pandas.
        columns = ['userId', 'movieId', 'rating', 'timestamp']
        train_df = spark.table("`1dt_team8_databricks`.final.train_data")[
            columns
        ].filter("rating >= 4").toPandas()
        valid_df = spark.table("`1dt_team8_databricks`.final.validation_data")[
            columns
        ].filter("rating >= 4").toPandas()

        self.df = pd.concat([train_df, valid_df], ignore_index=True)

        # Keep only users with at least MIN_INTERACTIONS positive ratings.
        user_counts = self.df['userId'].value_counts()
        valid_users = user_counts[user_counts >= self.MIN_INTERACTIONS].index
        self.df = self.df[self.df['userId'].isin(valid_users)].copy()

        self.movies = spark.table("1dt_team8_databricks.`movielens-32m`.movies").toPandas()

        print('Applying genres...')

        self.genres = list(self.GENRES)
        self._movie_genres = self._build_genre_matrix()

        print('Generate encoder and decoder...')
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder(self.movies['movieId'])
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder(self.df['userId'])
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        # Vectorised dictionary lookup. Unknown ids become -1 before the shift,
        # so an unknown movie ends up as item_idx 0 (the padding index).
        self.df['item_idx'] = self.df['movieId'].map(self.item_encoder).fillna(-1).astype(int) + 1
        self.df['user_idx'] = self.df['userId'].map(self.user_encoder).fillna(-1).astype(int)

        # Chronological order inside each user is what makes the lists sequences.
        self.df = self.df.sort_values(['user_idx', 'timestamp'])

        print('Generate sequence data...')
        self.user_train, self.genres_seq, self.user_valid = self.generate_sequence_data()

        print('Save tokenizer')

        # Directory where the id <-> index mappings are persisted.
        tokenizer_path = '/Volumes/1dt_team8_databricks/bert_model/models'
        os.makedirs(tokenizer_path, exist_ok=True)

        # Persist every mapping so inference code can translate ids later.
        artifacts = {
            'item_encoder': self.item_encoder,
            'item_decoder': self.item_decoder,
            'user_encoder': self.user_encoder,
            'user_decoder': self.user_decoder,
        }
        for name, mapping in artifacts.items():
            file_name = f'bert4rec_model_0609_2_{name}.pkl'
            with open(os.path.join(tokenizer_path, file_name), 'wb') as f:
                pickle.dump(mapping, f)

        print('Finish!!!')

    def _build_genre_matrix(self):
        """
        One-hot encode ``self.movies['genres']`` (a '|'-separated string).

        Adds one 0/1 column per entry of ``self.genres`` to ``self.movies`` and
        returns those columns as a (num_movies, num_genres) numpy array whose
        row order follows ``self.movies``.
        """
        tag_sets = self.movies["genres"].apply(lambda values: set(values.split("|")))
        for genre in self.genres:
            accepted = set(self.GENRE_ALIASES.get(genre, (genre,)))
            # `accepted` is bound as a default to avoid late-binding surprises.
            self.movies[genre] = tag_sets.apply(
                lambda tags, accepted=accepted: int(not accepted.isdisjoint(tags))
            )
        return self.movies[self.genres].to_numpy()

    def generate_encoder_decoder(self, col) -> tuple:
        """
        Build id -> index and index -> id dictionaries.

        Args:
            col (pandas.Series): column holding the raw ids
        Returns:
            tuple: (encoder, decoder); indices start at 0 and follow the order
            of first appearance in ``col``
        """
        ids = col.unique()
        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = dict(enumerate(ids))
        return encoder, decoder

    def movie_genres(self, idx):
        """
        Return the genre vector (list of 0/1) of the 1-based item index ``idx``.

        Index 0 (padding / unknown movie) yields an all-zero vector instead of
        silently wrapping around to the last movie through negative indexing.
        """
        if idx <= 0:
            return [0] * self._movie_genres.shape[1]
        return self._movie_genres[idx - 1].tolist()

    def generate_sequence_data(self) -> tuple:
        """
        Split every user's ordered item list into a training part and a target.

        Returns:
            tuple: (user_train, genres_seq, user_valid) where the last item of
            each user goes to ``user_valid`` and everything before it to
            ``user_train``; ``genres_seq`` holds the genre vectors of the
            training items.
        """
        user_train = {}
        genres_seq = {}
        user_valid = {}

        # self.df is already sorted by (user_idx, timestamp).
        for user, rows in self.df.groupby('user_idx'):
            items = rows['item_idx'].tolist()
            user_train[user] = items[:-1]
            genres_seq[user] = [self.movie_genres(i) for i in user_train[user]]
            user_valid[user] = items[-1]

        return user_train, genres_seq, user_valid

    def get_train_valid_data(self):
        """Return (user_train, genres_seq, user_valid) built in ``__init__``."""
        return self.user_train, self.genres_seq, self.user_valid

#### 2.2 Make Test Data

* test 데이터 로드
* 사용자별 시청 영화 개수 필터링
* 입력 시퀀스(Input)와 정답(Ground Truth) 분리

In [0]:
class MakeTestSequenceDataSet():
    """
    Build per-user input sequences and ground truth from the test table.

    - Reads the test rating table and keeps ratings >= 4.
    - Keeps only users with at least ``MIN_INPUT_LEN + NUM_GROUNDTRUTH`` (13)
      positive ratings.
    - For every user the last ``NUM_GROUNDTRUTH`` (10) items become the ground
      truth and the remaining (at least 3) items become the input sequence.

    Item indices start at 1; index 0 is reserved for padding.

    NOTE(review): the item encoder is rebuilt here from the movies table. It
    only agrees with the encoder used for training if both reads of that table
    return the movies in the same order - confirm, or reuse the saved encoder.
    NOTE: the train and test dataset classes must encode genres identically.
    """

    # Genre vocabulary; the order defines the column order of every genre vector.
    GENRES = (
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    )

    # Alternative spellings accepted for a genre column. Recent MovieLens data
    # files tag children's movies as "Children", so matching only "Children's"
    # would leave that column permanently zero.
    GENRE_ALIASES = {"Children's": ("Children's", "Children")}

    # Number of trailing items held out as ground truth per user.
    NUM_GROUNDTRUTH = 10
    # Minimum number of items that must remain as model input.
    MIN_INPUT_LEN = 3

    def __init__(self):
        print('Reading test data...')
        self.df = spark.table("`1dt_team8_databricks`.final.test_data")[
            ['userId', 'movieId', 'rating', 'timestamp']
        ].filter("rating >= 4").toPandas()

        # Users need enough items for both the input and the ground truth.
        min_items = self.MIN_INPUT_LEN + self.NUM_GROUNDTRUTH
        user_counts = self.df['userId'].value_counts()
        valid_users = user_counts[user_counts >= min_items].index
        self.df = self.df[self.df['userId'].isin(valid_users)].copy()

        self.movies = spark.table("1dt_team8_databricks.`movielens-32m`.movies").toPandas()

        print('Applying genres for test data...')
        self.genres = list(self.GENRES)
        self._movie_genres = self._build_genre_matrix()

        print('Generate encoder and decoder for test data...')
        self.item_encoder, self.item_decoder = self.generate_encoder_decoder(self.movies['movieId'])
        self.user_encoder, self.user_decoder = self.generate_encoder_decoder(self.df['userId'])
        self.num_item, self.num_user = len(self.item_encoder), len(self.user_encoder)

        # Vectorised dictionary lookup. Unknown ids become -1 before the shift,
        # so an unknown movie ends up as item_idx 0 (the padding index).
        self.df['item_idx'] = self.df['movieId'].map(self.item_encoder).fillna(-1).astype(int) + 1
        self.df['user_idx'] = self.df['userId'].map(self.user_encoder).fillna(-1).astype(int)
        self.df = self.df.sort_values(['user_idx', 'timestamp'])

        print('Generate test sequence data (split input & ground truth)...')
        self.user_input, self.genres_input, self.user_groundtruth = self.generate_sequence_data()
        print('Finish test data preparation!')

    def _build_genre_matrix(self):
        """
        One-hot encode ``self.movies['genres']`` (a '|'-separated string).

        Adds one 0/1 column per entry of ``self.genres`` to ``self.movies`` and
        returns those columns as a (num_movies, num_genres) numpy array whose
        row order follows ``self.movies``.
        """
        tag_sets = self.movies["genres"].apply(lambda values: set(values.split("|")))
        for genre in self.genres:
            accepted = set(self.GENRE_ALIASES.get(genre, (genre,)))
            # `accepted` is bound as a default to avoid late-binding surprises.
            self.movies[genre] = tag_sets.apply(
                lambda tags, accepted=accepted: int(not accepted.isdisjoint(tags))
            )
        return self.movies[self.genres].to_numpy()

    def generate_encoder_decoder(self, col) -> tuple:
        """
        Build (id -> index, index -> id) dictionaries from a pandas Series.

        Indices start at 0 and follow the order of first appearance in ``col``.
        """
        ids = col.unique()
        encoder = {_id: idx for idx, _id in enumerate(ids)}
        decoder = dict(enumerate(ids))
        return encoder, decoder

    def movie_genres(self, idx):
        """
        Return the genre vector (list of 0/1) of the 1-based item index ``idx``.

        Index 0 (padding / unknown movie) yields an all-zero vector instead of
        silently wrapping around to the last movie through negative indexing.
        """
        if idx <= 0:
            return [0] * self._movie_genres.shape[1]
        return self._movie_genres[idx - 1].tolist()

    def generate_sequence_data(self) -> tuple:
        """
        Split every user's ordered item list into input and ground truth.

        Returns:
            tuple: (user_input, genres_input, user_groundtruth) where
            - user_input: all items except the last ``NUM_GROUNDTRUTH``
            - genres_input: genre vectors of the input items
            - user_groundtruth: the last ``NUM_GROUNDTRUTH`` items, used as
              the answers when computing the evaluation metrics
        """
        holdout = self.NUM_GROUNDTRUTH
        user_input = {}
        genres_input = {}
        user_groundtruth = {}

        # self.df is already sorted by (user_idx, timestamp) and only contains
        # users with enough items for both parts.
        for user, rows in self.df.groupby('user_idx'):
            seq = rows['item_idx'].tolist()
            user_input[user] = seq[:-holdout]
            user_groundtruth[user] = seq[-holdout:]
            genres_input[user] = [self.movie_genres(i) for i in user_input[user]]
        return user_input, genres_input, user_groundtruth

    def get_test_data(self):
        """Return (user_input, genres_input, user_groundtruth) built in ``__init__``."""
        return self.user_input, self.genres_input, self.user_groundtruth

#### 2.3. Define BERT Train Data

Masking, Random Replacement 등 BERT 학습을 위한 토큰 변환 로직 구현<br>
이제 학습을 위한 Dataset 클래스를 설명하겠습니다. 해당 데이터셋은 **사용자들과 그들의 시청 순서(시퀀스)**로 구성됩니다. 모델 학습 시, 입력되는 시퀀스는 다음과 같은 방식으로 수정됩니다:

* 시퀀스에서 무작위로 15%의 토큰을 선택합니다.
* 선택된 토큰들에 대해 다음과 같이 처리합니다:
    * 80%는 마스크 토큰으로 대체됩니다.
    * 10%는 무작위 토큰으로 대체됩니다.
    * 나머지 10%는 변경되지 않습니다.

시청한 영화 시퀀스 외에도, 각 영화에 해당하는 장르 시퀀스가 있으며, 이 역시 영화 시퀀스와 같은 방식으로 수정됩니다.<br>
따라서, 학습 데이터셋의 하나의 요소는 다음과 같은 구조를 가집니다:

Dataset[i] = item_sequence, genres_sequence, labels

* item_sequence — 변경된 영화 시퀀스 [1 x n] (여기서 n은 시퀀스 길이)
* genres_sequence — 영화에 해당하는 장르의 변경된 시퀀스 [n x m] (m은 장르 수, 이 경우 m = 18)
* labels — 실제 시청한 영화들의 원래 시퀀스로, 모델이 예측해야 할 정답 [1 x n]
    

In [0]:
class BERTRecDataSet(Dataset):
    """
    Training dataset that applies BERT-style corruption to user sequences.

    ``dataset[user]`` returns ``(tokens, genres, labels)``:

    * tokens - LongTensor (max_len,): corrupted item indices, left-padded with 0
    * genres - FloatTensor (max_len, genre_dim): genre vectors aligned with tokens
    * labels - LongTensor (max_len,): the original item indices (0 on padding)

    Each token is selected with probability ``mask_prob``; a selected token is
    replaced by the mask token (``num_item + 1``) 80% of the time, by a random
    unseen item 10% of the time, and left unchanged otherwise.
    """

    # Fallback genre vector width, used only when a user has no genre vectors.
    GENRE_DIM = 18

    def __init__(self, sequence_dataset, user_train, genres_seq, max_len, num_user, num_item, mask_prob):
        self.sequence_dataset = sequence_dataset  # provides movie_genres(item_idx)
        self.user_train = user_train
        self.genres_seq = genres_seq
        self.max_len = max_len
        self.num_user = num_user
        self.num_item = num_item
        self.mask_prob = mask_prob
        self._all_items = set(range(1, self.num_item + 1))

    def __len__(self):
        return self.num_user

    def __getitem__(self, user):
        user_seq = self.user_train[user]
        item_window = user_seq[-self.max_len:]
        genre_window = self.genres_seq[user][-self.max_len:]
        genre_dim = len(genre_window[0]) if genre_window else self.GENRE_DIM

        tokens = []
        genres_out = []
        labels = []

        for s, g in zip(item_window, genre_window):
            prob = np.random.random()
            new_token, new_genres = s, g
            if prob < self.mask_prob:
                # Rescale to [0, 1) to pick one of the three corruption modes.
                prob /= self.mask_prob
                if prob < 0.8:
                    # masking: mask token with an all-ones genre vector
                    new_token, new_genres = self.num_item + 1, [1] * genre_dim
                elif prob < 0.9:
                    # noise: swap in a random item the user has not seen.
                    # If the user has seen every item there is nothing to
                    # sample, so the original token is kept.
                    sampled = self.random_neg_sampling(rated_item=user_seq, num_item_sample=1)
                    if sampled:
                        new_token = sampled[0]
                        new_genres = self.sequence_dataset.movie_genres(new_token)
            tokens.append(new_token)
            genres_out.append(new_genres)
            # The model is trained to reconstruct the whole original sequence.
            labels.append(s)

        # Left-pad everything to max_len with the padding index / zero vectors.
        pad_len = self.max_len - len(tokens)
        tokens = [0] * pad_len + tokens
        genres_out = [[0] * genre_dim for _ in range(pad_len)] + genres_out
        labels = [0] * pad_len + labels

        return torch.LongTensor(tokens), torch.Tensor(genres_out), torch.LongTensor(labels)

    def random_neg_sampling(self, rated_item: list, num_item_sample: int):
        """
        Sample up to ``num_item_sample`` distinct items in ``1..num_item`` that
        are not in ``rated_item``.

        Returns a list; it is shorter than requested (possibly empty) when
        fewer unseen items exist.
        """
        rated = set(rated_item)

        # Fast path: when at least half of the catalogue is still free after
        # accounting for the request, rejection sampling accepts each draw
        # with probability >= 1/2 and avoids materialising a catalogue-sized
        # candidate list on every call.
        if len(rated) + num_item_sample <= self.num_item // 2:
            picked = []
            seen = set()
            while len(picked) < num_item_sample:
                candidate = random.randint(1, self.num_item)
                if candidate in rated or candidate in seen:
                    continue
                seen.add(candidate)
                picked.append(candidate)
            return picked

        # Slow path: enumerate the remaining candidates explicitly and return
        # as many as are available.
        candidates = list(self._all_items - rated)
        return random.sample(candidates, min(num_item_sample, len(candidates)))

In [0]:
# Build the training sequences with MakeSequenceDataSet as-is.
# Note: the name `movie_genres` here holds the per-user genre sequences
# (the second element returned by get_train_valid_data()).
make_sequence_dataset = MakeSequenceDataSet()
user_train, movie_genres, user_valid = make_sequence_dataset.get_train_valid_data()

In [0]:
# Test data is loaded from its own table (MakeTestSequenceDataSet).
make_test_dataset = MakeTestSequenceDataSet()
user_input, genres_input, user_groundtruth = make_test_dataset.get_test_data()

#### 2.4. Create Data Loader

* 학습을 위한 배치(Batch) 데이터 생성

In [0]:
# Wrap the training sequences in the masking dataset used by the DataLoader.
bert4rec_dataset = BERTRecDataSet(
    sequence_dataset = make_sequence_dataset, # gives access to movie_genres() for noise tokens
    user_train = user_train,
    genres_seq = movie_genres, # per-user genre sequences returned by get_train_valid_data()
    max_len = config['max_len'], 
    num_user = make_sequence_dataset.num_user, 
    num_item = make_sequence_dataset.num_item,
    mask_prob = config['mask_prob'],
)

In [0]:
# Batches of (tokens, genres, labels); users are reshuffled every epoch.
data_loader = DataLoader(
    bert4rec_dataset, 
    batch_size = config['batch_size'], 
    shuffle = True, 
    pin_memory = True,  # page-locked host memory for the batches
    num_workers = config['num_workers'],
)

* data example

In [0]:
#%%time
# Pull a single batch to inspect what the DataLoader yields.
d = next(iter(data_loader))

In [0]:
# Display the sample batch: tokens, genre vectors and labels.
d

### 3. BERT Model

#### 3.1. Define Model Components

* 임베딩 (Embedding)
  * TokenEmbedding (영화 아이템 임베딩)
  * PositionalEmbedding (위치 임베딩)
  * GenresEmbedding (장르 임베딩)
  * BERTEmbedding (위 세 가지 임베딩 결합)
<br>
<br>
* 트랜스포머 블록 (Transformer Block)
  * Attention 및 MultiHeadedAttention
  * PositionwiseFeedForward
  * SublayerConnection, LayerNorm 등

In [0]:
class PositionalEmbedding(nn.Module):
    """Learned absolute position embedding: one trainable vector per position."""

    def __init__(self, max_len, d_model):
        super().__init__()

        # Trainable lookup table of shape (max_len, d_model).
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x):
        # Only the batch size of x is used: the whole table is returned once
        # per batch element, giving (batch, max_len, d_model).
        table = self.pe.weight.unsqueeze(0)
        return table.repeat(x.size(0), 1, 1)

class TokenEmbedding(nn.Embedding):
    """Item embedding table whose index 0 is the padding entry."""

    def __init__(self, vocab_size, embed_size=512):
        super().__init__(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=0,
        )
        
class GenresEmbedding(nn.Module):
    """Two-layer MLP that projects a genre vector to the embedding size."""

    def __init__(self, genres_size, embed_size=512):
        super().__init__()
        # genres_size -> 2 * genres_size -> embed_size, with a ReLU in between
        self.linear_1 = nn.Linear(genres_size, genres_size*2)
        self.act = nn.ReLU()
        self.linear_2 = nn.Linear(genres_size*2, embed_size)

    def forward(self, genres_vec):
        hidden = self.linear_1(genres_vec)
        hidden = self.act(hidden)
        return self.linear_2(hidden)

In [0]:
class BERTEmbedding(nn.Module):
    """
    Input embedding of the model: the sum of three parts followed by dropout.

        1. TokenEmbedding      : lookup table over item indices
        2. PositionalEmbedding : learned vector per sequence position
        3. GenresEmbedding     : projection of each item's genre vector
    """

    def __init__(self, vocab_size, genres_size, embed_size, max_len, dropout=0.1):
        """
        :param vocab_size: number of rows in the item embedding table
        :param genres_size: length of a genre vector
        :param embed_size: size of every embedding
        :param max_len: number of positions in the positional table
        :param dropout: dropout rate applied to the summed embedding
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)
        self.genres_emb = GenresEmbedding(genres_size=genres_size, embed_size=embed_size)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, genres):
        item_part = self.token(sequence)
        position_part = self.position(sequence)
        genre_part = self.genres_emb(genres)
        return self.dropout(item_part + position_part + genre_part)

In [0]:
class Attention(nn.Module):
    """
    Scaled dot-product attention.

    Returns both the attended values and the attention weights.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        # Similarity of every query with every key, scaled by sqrt(d_k).
        scale = math.sqrt(query.size(-1))
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale

        # Positions where mask == 0 get a large negative score so that the
        # softmax assigns them (almost) zero weight.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        attended = torch.matmul(weights, value)
        return attended, weights

In [0]:
class MultiHeadedAttention(nn.Module):
    """
    Multi-head attention over ``h`` heads for a model of width ``d_model``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # Every head works on d_model / h features (d_v is taken equal to d_k).
        self.d_k = d_model // h
        self.h = h

        # One projection each for query, key and value, plus the output one.
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Project the inputs and split them into heads.
        q_proj, k_proj, v_proj = self.linear_layers
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # 2) Attend within every head; the attention weights are not needed.
        context, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Merge the heads back to (batch, seq, d_model) and project.
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.output_linear(merged)

In [0]:
class GELU(nn.Module):
    """
    Gaussian Error Linear Unit, computed with the tanh approximation.
    """

    def forward(self, x):
        cubic = x + 0.044715 * torch.pow(x, 3)
        inner = math.sqrt(2 / math.pi) * cubic
        return 0.5 * x * (1 + torch.tanh(inner))

In [0]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        # Expand to d_ff, apply the non-linearity, then project back to d_model.
        hidden = self.w_1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [0]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learned scale and shift."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # scale
        self.b_2 = nn.Parameter(torch.zeros(features))  # shift
        self.eps = eps

    def forward(self, x):
        # eps is added to the standard deviation (torch.Tensor.std default)
        # to avoid division by zero.
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [0]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sub-layer.

    The input is normalised first, passed through the sub-layer and dropout,
    and the result is added back to the un-normalised input.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``; sublayer must keep the size."""
        transformed = sublayer(self.norm(x))
        return x + self.dropout(transformed)

In [0]:
class TransformerBlock(nn.Module):
    """
    One encoder block: self-attention followed by a position-wise feed-forward
    network, each wrapped in a residual sub-layer connection.
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of the block
        :param attn_heads: number of attention heads
        :param feed_forward_hidden: inner size of the feed-forward network
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        def self_attention(normed):
            # Query, key and value are all the same (normalised) input.
            return self.attention.forward(normed, normed, normed, mask=mask)

        x = self.input_sublayer(x, self_attention)
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

#### 3.2. Assembling BERT model

In [0]:
class BERT(nn.Module):
    """
    Sequence model: item + position + genre embedding, a stack of transformer
    blocks, and a linear layer producing one logit per vocabulary entry at
    every position.
    """

    def __init__(self, bert_max_len, num_items, genres_size, bert_num_blocks, bert_num_heads,
                 bert_hidden_units, bert_dropout):
        super().__init__()

        # Two entries beyond the items themselves: the dataset uses index 0
        # for padding and num_items + 1 as the mask token.
        self.vocab_size = num_items + 2
        self.genres_size = genres_size
        self.hidden = bert_hidden_units

        # Sum of token, positional and genre embeddings.
        self.embedding = BERTEmbedding(
            vocab_size=self.vocab_size,
            genres_size=self.genres_size,
            embed_size=self.hidden,
            max_len=bert_max_len,
            dropout=bert_dropout,
        )

        # Stack of identical encoder blocks; feed-forward width is 4 * hidden.
        blocks = [
            TransformerBlock(self.hidden, bert_num_heads, self.hidden * 4, bert_dropout)
            for _ in range(bert_num_blocks)
        ]
        self.transformer_blocks = nn.ModuleList(blocks)
        self.out = nn.Linear(self.hidden, self.vocab_size)

    def forward(self, x, genres):
        # Attention mask of shape (batch, 1, seq, seq): a key position is
        # visible only when its token is not padding (index 0).
        seq_len = x.size(1)
        mask = (x > 0).unsqueeze(1).repeat(1, seq_len, 1).unsqueeze(1)

        # Turn the index sequence into a sequence of vectors.
        hidden_states = self.embedding(x, genres)

        # Run through the encoder stack.
        for block in self.transformer_blocks:
            hidden_states = block.forward(hidden_states, mask)

        return self.out(hidden_states)

    def init_weights(self):
        # Placeholder: the PyTorch default initialisation is used.
        pass

모델의 작동 원리:

1) 모델의 입력으로는 사용자가 시청한 영화들의 시퀀스와, 해당 영화들의 장르 시퀀스가 주어집니다. 모델의 출력은 시청한 영화들에 대한 로짓(logits) 시퀀스입니다.

2) 모델 학습 시, 전체 시퀀스를 예측하도록 모델을 학습시킵니다. 즉, 모델이 마스킹된 토큰들을 구별하고, 그 자리에 올바른 토큰을 출력할 수 있도록 학습합니다.

3) 모델을 실제 사용할 때는, 사용자의 시청 영화 시퀀스 끝에 마스크 토큰을 추가하고, 모델은 전체 시퀀스를 출력합니다. 이 출력 시퀀스의 마지막 요소가 모델이 추천하는 다음 영화가 됩니다.

In [0]:
# Instantiate the model from the shared config and move it to the device.
model = BERT(
    num_items = make_sequence_dataset.num_item,
    genres_size = 18,  # length of the genre vectors produced by the dataset classes
    bert_hidden_units = config['hidden_units'], 
    bert_num_heads = config['num_heads'], 
    bert_num_blocks = config['num_layers'], 
    bert_max_len = config['max_len'], 
    bert_dropout = config['dropout_rate'], 
).to(device)
# Global history of (step, loss) pairs; train() appends to it and plots it.
metrics = {'train_loss': []}

* 모델 구조 확인

In [0]:
# Print the module tree to check the assembled architecture.
print(model)

* 출력 예시

In [0]:
# Smoke test: run the sample batch through the untrained model.
# The result holds one logit per vocabulary entry for every sequence position.
seq, genres, _ = d
output = model(seq.to(device), genres.to(device))
print(output)
print(output.shape)

### 4. Training and Evaluation Model

In [0]:
from IPython.display import clear_output
from tqdm import tqdm, trange

In [0]:
# Show which device was selected.
device

#### 4.1. Define Functions for Training and Evaluation

In [0]:
def train(model, criterion, optimizer, data_loader):
    """
    Train ``model`` for one pass over ``data_loader``.

    Every batch loss is appended to the global ``metrics['train_loss']`` as a
    ``(step, loss)`` pair, and the loss curve is redrawn whenever the global
    step is a multiple of 100. Tensors are moved to the global ``device``.

    Returns:
        float: mean batch loss of this pass
    """
    model.train()
    running_loss = 0
    # Continue the step counter where the previous call stopped.
    step = len(metrics['train_loss'])

    for seq, genres, labels in data_loader:
        seq = seq.to(device)
        genres = genres.to(device)
        labels = labels.to(device)

        logits = model(seq, genres)                      # (bs, t, vocab)
        flat_logits = logits.view(-1, logits.size(-1))   # (bs * t, vocab)
        flat_labels = labels.view(-1)                    # (bs * t)

        optimizer.zero_grad()
        loss = criterion(flat_logits, flat_labels)

        batch_loss = loss.item()
        running_loss += batch_loss
        metrics['train_loss'].append((step, batch_loss))

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            # Replace the previous plot with an up-to-date loss curve.
            clear_output(True)
            plt.figure(figsize=(5, 4))
            for name, history in sorted(metrics.items()):
                plt.title(name)
                plt.plot(*zip(*history))
                plt.grid()
            plt.show()
        step += 1

    return running_loss / len(data_loader)

In [0]:
def evaluate_test(model, user_input, user_groundtruth, max_len, bert4rec_dataset, test_dataset):
    """
    Evaluate next-item ranking quality on the test users.

    For every user the input history is extended with the mask token
    (``num_item + 1``, the same token used during training), truncated or
    left-padded to ``max_len`` and fed to the model. The logits at the final,
    masked position rank the user's ground-truth items against 100 randomly
    sampled items the user has not interacted with, and the top 10 are scored.

    Args:
        model: trained model, called as ``model(tokens, genres)``
        user_input: {user: [item_idx, ...]} input sequences
        user_groundtruth: {user: [item_idx, ...]} held-out items per user
        max_len: sequence length expected by the model
        bert4rec_dataset: supplies ``num_item`` and ``random_neg_sampling``
        test_dataset: supplies ``movie_genres(item_idx)``

    Returns:
        tuple: (NDCG@10, HIT@10, Precision@10, Recall@10) averaged over users;
        all zeros when ``user_input`` is empty.

    NOTE(review): the item indices in ``user_input`` / ``user_groundtruth``
    must come from the same item encoder the model was trained with. The test
    dataset builds its own encoder - confirm that both mappings agree.
    """
    model.eval()
    k = 10
    num_item_sample = 100
    genre_dim = 18  # must match the genres_size the model was built with
    mask_token = bert4rec_dataset.num_item + 1
    mask_genres = [1] * genre_dim  # same genre vector the training mask uses

    ndcg_sum = 0.0
    hit_sum = 0.0
    precision_sum = 0.0
    recall_sum = 0.0
    users = list(user_input.keys())

    with torch.no_grad():
        for user in tqdm(users):
            input_seq = user_input[user]
            groundtruth = set(user_groundtruth[user])

            # Append the mask token so the last position asks "what comes
            # next?", then keep the most recent max_len tokens.
            seq = (input_seq + [mask_token])[-max_len:]
            # Genres are only computed for the tokens that are actually kept.
            genre_seq = [test_dataset.movie_genres(i) for i in seq[:-1]] + [mask_genres]

            # Left-pad to max_len.
            padding_len = max_len - len(seq)
            seq = [0] * padding_len + seq
            genre_seq = [[0] * genre_dim for _ in range(padding_len)] + genre_seq

            # Candidates: the ground truth plus sampled unseen items.
            rated_items = set(input_seq) | groundtruth
            neg_items = bert4rec_dataset.random_neg_sampling(
                rated_item=list(rated_items), num_item_sample=num_item_sample
            )
            items = list(groundtruth) + neg_items

            seq_tensor = torch.LongTensor([seq]).to(device)
            genre_tensor = torch.FloatTensor([genre_seq]).to(device)
            logits = model(seq_tensor, genre_tensor)

            # Scores of the candidates at the masked (last) position.
            scores = logits[0, -1, items]
            top_idx = torch.topk(scores, k=min(k, len(items))).indices.cpu().tolist()
            ranked = [items[i] for i in top_idx]

            hits = [item in groundtruth for item in ranked]
            num_hits = sum(hits)
            precision_sum += num_hits / float(k)
            if groundtruth:
                recall_sum += num_hits / len(groundtruth)
            hit_sum += 1 if num_hits > 0 else 0

            # The ideal ranking places every relevant item (at most k) first.
            dcg = sum(1 / np.log2(rank + 2) for rank, is_hit in enumerate(hits) if is_hit)
            ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(k, len(groundtruth))))
            ndcg_sum += dcg / ideal_dcg if ideal_dcg > 0 else 0.0

    # Avoid dividing by zero when there are no users to evaluate.
    total_users = max(len(users), 1)
    ndcg = ndcg_sum / total_users
    hit = hit_sum / total_users
    prec = precision_sum / total_users
    rec = recall_sum / total_users

    print(f'[Test] NDCG@10: {ndcg:.4f} | HIT@10: {hit:.4f}')
    print(f'[Test] Precision@10: {prec:.4f} | Recall@10: {rec:.4f}')

    return ndcg, hit, prec, rec

* 손실 함수와 옵티마이저 정의

In [0]:
# Label 0 marks padding positions, which are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

#### 4.2. Training Model

In [0]:
import gc

# Train for num_epochs passes and keep the mean loss of each one.
loss_list = []
for epoch in tqdm(range(1, config['num_epochs'] + 1)):
    train_loss = train(model, criterion, optimizer, data_loader)
    loss_list.append(train_loss)

    # Report on the first epoch and then every fifth one.
    if epoch % 5 == 0 or epoch == 1:
        print(f'Epoch: {epoch:3d} | Train loss: {train_loss:.5f}')
    
    # Free Python garbage and cached GPU memory between epochs.
    gc.collect()
    torch.cuda.empty_cache()

In [0]:
# Mean training loss per epoch.
plt.plot(loss_list)

#### 4.3. Evaluation Model

In [0]:
# Score the trained model on the held-out test users.
ndcg, hit, precision, recall = evaluate_test(
    model=model,
    user_input=user_input,
    user_groundtruth=user_groundtruth,
    max_len=config['max_len'],
    bert4rec_dataset=bert4rec_dataset,  # used for num_item and negative sampling
    test_dataset=make_test_dataset  # used for genre lookups
)

* 평가지표 결과 출력

In [0]:
# Print the metrics at full precision.
print(f'NDCG@10: {ndcg}| HIT@10: {hit}')
print(f'precision@10: {precision}| recall@10: {recall}')

* 모델 저장

In [0]:
# Save the trained weights (state_dict only) next to the pickled encoders.
torch.save(model.state_dict(), "/Volumes/1dt_team8_databricks/bert_model/models/bert4rec_model_0609_2")