In [0]:
import mlflow
import pickle
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import cloudpickle
from mlflow.models import infer_signature
import math

In [0]:
# Hyperparameters for the BERT4Rec model and its MLflow wrapper.
config = dict(
    max_len=80,         # sequence length: positional table size and pad/truncate target
    hidden_units=256,   # embedding / transformer hidden width
    num_heads=2,        # attention heads per transformer block
    num_layers=2,       # number of stacked transformer blocks
    dropout_rate=0.1,   # dropout probability passed to the model
    # The keys below are not read by any code visible in this notebook;
    # presumably they belong to the training loop -- TODO confirm.
    lr=0.001,
    batch_size=64,
    num_epochs=10,
    num_workers=0,
    mask_prob=0.15,
)

In [0]:
# ==============================================================================
# 0. 모델 클래스 정의
# MLflow가 BERT 모델을 로드할 때 참고할 수 있는 해당 클래스들의 구조
# ==============================================================================
class PositionalEmbedding(nn.Module):
    """Learned positional embedding: one trainable vector per sequence position."""

    def __init__(self, max_len, d_model):
        """Create a ``max_len x d_model`` table of position vectors."""
        super().__init__()
        # Only the weight matrix is used; the embedding is never indexed by id.
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Return the whole position table, tiled once per batch row of ``x``.

        Only ``x.size(0)`` is read, so the result always has shape
        ``(batch, max_len, d_model)`` regardless of the values in ``x``.
        """
        n_rows = x.size(0)
        table = self.pe.weight.unsqueeze(0)  # (1, max_len, d_model)
        return table.repeat(n_rows, 1, 1)

class TokenEmbedding(nn.Embedding):
    """Item-id embedding table in which index 0 is the padding entry."""

    def __init__(self, vocab_size, embed_size=512):
        """Build a ``vocab_size x embed_size`` table with ``padding_idx=0``."""
        super().__init__(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=0,
        )

class GenresEmbedding(nn.Module):
    """Two-layer MLP that projects a genre vector into the embedding space."""

    def __init__(self, genres_size, embed_size=512):
        """Widen to ``2 * genres_size`` hidden units, then project to ``embed_size``."""
        super().__init__()
        hidden_width = genres_size * 2
        self.linear_1 = nn.Linear(genres_size, hidden_width)
        self.act = nn.ReLU()
        self.linear_2 = nn.Linear(hidden_width, embed_size)

    def forward(self, genres_vec):
        """Apply linear -> ReLU -> linear along the last dimension of ``genres_vec``."""
        hidden = self.linear_1(genres_vec)
        activated = self.act(hidden)
        return self.linear_2(activated)

class BERTEmbedding(nn.Module):
    """Input embedding: token + positional + genre embeddings, followed by dropout."""

    def __init__(self, vocab_size, genres_size, embed_size, max_len, dropout=0.1):
        """Create the three embedding sub-modules and the dropout layer."""
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(max_len=max_len, d_model=embed_size)
        self.genres_emb = GenresEmbedding(genres_size=genres_size, embed_size=embed_size)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, genres):
        """Embed ``sequence`` (token ids) and ``genres`` (per-token genre vectors).

        The positional term always spans ``max_len`` positions, so the sum only
        lines up when the sequence length matches ``max_len`` (or broadcasts to it).
        """
        token_part = self.token(sequence)
        position_part = self.position(sequence)
        genre_part = self.genres_emb(genres)
        return self.dropout(token_part + position_part + genre_part)

class Attention(nn.Module):
    """Scaled dot-product attention."""

    def forward(self, query, key, value, mask=None, dropout=None):
        """Return ``(attended_values, attention_weights)``.

        Scores are ``query @ key^T`` scaled by ``sqrt`` of the last dimension
        of ``query``. Positions where ``mask == 0`` are filled with ``-1e9``
        before the softmax. ``dropout``, if given, is a callable applied to
        the attention weights.
        """
        scale = math.sqrt(query.size(-1))
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        attended = torch.matmul(weights, value)
        return attended, weights

class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads of width ``d_model // h``."""

    def __init__(self, h, d_model, dropout=0.1):
        """Create the query/key/value projections, output projection and dropout.

        ``d_model`` must be divisible by ``h``.
        """
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        # Three projections, used in order for query, key and value.
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge heads, apply the output projection."""
        n_batch = query.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            return projection(tensor).view(n_batch, -1, self.h, self.d_k).transpose(1, 2)

        q_proj, k_proj, v_proj = self.linear_layers
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # The attention weights are returned by the helper but not used here.
        attended, _weights = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        merged = attended.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
        return self.output_linear(merged)

class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        cubic_term = 0.044715 * torch.pow(x, 3)
        inner = math.sqrt(2 / math.pi) * (x + cubic_term)
        return 0.5 * x * (1 + torch.tanh(inner))

class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: linear -> GELU -> dropout -> linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """Expand from ``d_model`` to ``d_ff`` and project back to ``d_model``."""
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        """Apply the feed-forward transformation along the last dimension of ``x``."""
        expanded = self.w_1(x)
        activated = self.activation(expanded)
        return self.w_2(self.dropout(activated))

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learned scale and shift.

    ``eps`` is added to the standard deviation (not the variance), and the
    standard deviation comes from ``Tensor.std`` with its default arguments.
    """

    def __init__(self, features, eps=1e-6):
        """Initialise the scale ``a_2`` to ones and the shift ``b_2`` to zeros."""
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Same operation order as a_2 * (x - mean) / (std + eps) + b_2.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """Residual connection around a sub-layer, with normalisation applied first."""

    def __init__(self, size, dropout):
        """Create the layer norm of width ``size`` and the dropout layer."""
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``; ``sublayer`` is any callable."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

class TransformerBlock(nn.Module):
    """One encoder block: self-attention and feed-forward, each in a residual sub-layer."""

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """Build the attention, feed-forward, two residual wrappers and a final dropout."""
        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        """Run self-attention (with ``mask``) and feed-forward, then apply dropout."""

        def self_attend(normed):
            # Query, key and value are all the same normalised input.
            return self.attention.forward(normed, normed, normed, mask=mask)

        attended = self.input_sublayer(x, self_attend)
        transformed = self.output_sublayer(attended, self.feed_forward)
        return self.dropout(transformed)

class BERT(nn.Module):
    """BERT4Rec-style model: embeddings, stacked transformer blocks, item-logit head."""

    def __init__(self, bert_max_len, num_items, genres_size, bert_num_blocks, bert_num_heads,
                 bert_hidden_units, bert_dropout):
        """Assemble the model.

        The vocabulary has ``num_items + 2`` entries. Index 0 is the padding
        entry of the token embedding; the MLflow wrapper in this notebook uses
        ``num_items + 1`` as the mask token.
        """
        super().__init__()
        self.vocab_size = num_items + 2
        self.genres_size = genres_size
        self.hidden = bert_hidden_units

        self.embedding = BERTEmbedding(
            vocab_size=self.vocab_size,
            genres_size=self.genres_size,
            embed_size=self.hidden,
            max_len=bert_max_len,
            dropout=bert_dropout,
        )
        # The feed-forward width is four times the hidden width.
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(self.hidden, bert_num_heads, self.hidden * 4, bert_dropout)
            for _ in range(bert_num_blocks)
        ])
        self.out = nn.Linear(self.hidden, self.vocab_size)

    def forward(self, x, genres):
        """Return per-position logits over the vocabulary.

        ``x`` is a 2-D tensor of token ids (batch, seq). Positions with id 0
        are excluded as attention keys through the mask built below.
        """
        not_padding = x > 0
        # (batch, seq) -> (batch, 1, seq, seq): every query row shares the same key mask.
        mask = not_padding.unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        hidden_states = self.embedding(x, genres)
        for block in self.transformer_blocks:
            hidden_states = block.forward(hidden_states, mask)
        return self.out(hidden_states)

In [0]:
# ==============================================================================
# 1. MLflow `pyfunc.PythonModel` 래퍼 클래스 작성
# ==============================================================================

class MLflowBert4Rec(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that serves top-k recommendations from a BERT4Rec model.

    Expected artifacts (keys of ``context.artifacts``):
        - ``item_encoder``: pickled dict mapping movieId -> 0-based item index
        - ``item_decoder``: pickled dict mapping 0-based item index -> movieId
        - ``movies_data``: CSV with at least ``movieId`` and ``genres`` columns
        - ``pytorch_model``: file loaded with ``torch.load`` and passed to ``load_state_dict``

    Token ids used by the model are ``item index + 1``; 0 is padding and
    ``num_item + 1`` is the mask token.
    """

    def __init__(self, model_class, model_config):
        """Store the model class and its hyperparameter dict.

        Args:
            model_class: class instantiated in ``load_context`` (falls back to
                ``BERT`` when ``None``).
            model_config: dict with ``hidden_units``, ``num_heads``,
                ``num_layers``, ``max_len`` and ``dropout_rate``.
        """
        self.model_class = model_class
        self.model_config = model_config

    def load_context(self, context):
        """Load the artifacts needed for serving and build the model."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # 1. Encoder / decoder.
        # SECURITY: pickle.load can execute arbitrary code; only load artifacts
        # that were produced by a trusted training run.
        with open(context.artifacts["item_encoder"], 'rb') as f:
            self.item_encoder = pickle.load(f)
        with open(context.artifacts["item_decoder"], 'rb') as f:
            self.item_decoder = pickle.load(f)

        self.num_item = len(self.item_encoder)
        self.mask_token = self.num_item + 1

        # 2. Genre information.
        # To keep the model reusable, the movies data is packaged as an
        # artifact instead of being read directly from a Spark table.
        self.genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
        movies_df = pd.read_csv(context.artifacts["movies_data"])
        self._movie_genres_map = self._build_genre_map(movies_df)

        # 3. Build the PyTorch model and load its weights.
        # Use the class handed to the constructor rather than a hard-coded one.
        model_cls = self.model_class if self.model_class is not None else BERT
        self.model = model_cls(
            num_items=self.num_item,
            genres_size=len(self.genres),
            bert_hidden_units=self.model_config['hidden_units'],
            bert_num_heads=self.model_config['num_heads'],
            bert_num_blocks=self.model_config['num_layers'],
            bert_max_len=self.model_config['max_len'],
            bert_dropout=self.model_config['dropout_rate'],
        ).to(self.device)

        self.model.load_state_dict(torch.load(context.artifacts["pytorch_model"], map_location=self.device))
        self.model.eval()

    def _build_genre_map(self, movies_df):
        """Return ``{token id: multi-hot genre list}`` for every encodable movie.

        Padding (0) and the mask token map to all-zero vectors.
        """
        known = movies_df[movies_df['movieId'].isin(list(self.item_encoder))].copy()
        known['item_idx'] = known['movieId'].map(self.item_encoder) + 1

        # NOTE(review): matching is a plain substring test, so "Children's"
        # only matches if the CSV spells the genre with the apostrophe --
        # confirm against the data the model was trained on.
        genre_text = known["genres"].astype(str)
        for g in self.genres:
            known[g] = genre_text.str.contains(g, regex=False).astype(int)

        table = known.set_index('item_idx')[self.genres]
        # Vectorised conversion; equivalent to iterating the rows one by one.
        genre_map = dict(zip(table.index.tolist(), table.to_numpy().tolist()))
        genre_map[0] = [0] * len(self.genres)
        genre_map[self.mask_token] = [0] * len(self.genres)
        return genre_map

    def _get_movie_genres(self, item_idx):
        """Return the genre vector for a token id (all zeros when unknown)."""
        return self._movie_genres_map.get(item_idx, [0] * len(self.genres))

    def _predict_single(self, history, k=10):
        """Predict the top-``k`` next movies for a single user's watch history.

        Args:
            history: iterable of movieIds in watch order; ids missing from the
                encoder are skipped. ``None`` is treated as an empty history.
            k: maximum number of recommendations.

        Returns:
            list of movieIds, best first. Already-watched items, the padding id
            and the mask token are never returned, so the list may be shorter
            than ``k`` when fewer candidates exist.
        """
        if history is None:
            history = []
        max_len = self.model_config['max_len']

        seq = [self.item_encoder.get(movie_id, -1) + 1 for movie_id in history]
        seq = [s for s in seq if s > 0]
        seq.append(self.mask_token)

        # Keep the most recent max_len tokens and left-pad with 0.
        tokens = seq[-max_len:]
        tokens = [0] * (max_len - len(tokens)) + tokens

        genres_seq = [self._get_movie_genres(token) for token in tokens]

        tokens_tensor = torch.LongTensor([tokens]).to(self.device)
        genres_tensor = torch.FloatTensor([genres_seq]).to(self.device)

        with torch.no_grad():
            predictions = self.model(tokens_tensor, genres_tensor)

        # The mask token sits at the last position; its logits score the next item.
        logits = predictions[0, -1, :].clone()

        # Exclude watched items AND the special ids (0 = padding, mask token).
        # Special ids have no decoder entry and would decode to None.
        blocked = sorted({0, self.mask_token, *tokens})
        blocked_idx = torch.tensor(blocked, dtype=torch.long, device=logits.device)
        logits[blocked_idx] = float("-inf")

        # Never ask topk for more entries than there are valid candidates.
        k = min(k, logits.numel() - len(blocked))
        if k <= 0:
            return []

        _, top_k_indices = torch.topk(logits, k=k, largest=True)
        return [self.item_decoder.get(idx - 1) for idx in top_k_indices.tolist()]

    def predict(self, context, model_input, params=None):
        """Standard MLflow ``predict`` entry point.

        Args:
            context: MLflow context (unused here).
            model_input: DataFrame with a ``movie_history`` column, one list of
                movieIds per row (e.g. ``[[1, 2, 3]]``).
            params: optional dict; ``params["k"]`` overrides the default of 10
                recommendations per row.

        Returns:
            DataFrame with a single ``recommendations`` column.
        """
        k = int(params["k"]) if params and "k" in params else 10
        results = model_input['movie_history'].apply(lambda history: self._predict_single(history, k=k))
        return pd.DataFrame({'recommendations': results})

In [0]:
import os
import mlflow
from pyspark.dbutils import DBUtils

# Obtain the Databricks utilities handle.
# NOTE(review): Databricks notebooks normally predefine `dbutils`; confirm that
# constructing DBUtils from spark.sparkContext is supported on the target cluster.
dbutils = DBUtils(spark.sparkContext)

# ==============================================================================
# 2. MLflow run and model logging
# ==============================================================================

# --- Source file paths ---
# Files stored in a Unity Catalog Volume.
SOURCE_BASE_PATH = "/Volumes/1dt_team8_databricks/bert_model/models"
SOURCE_MODEL_WEIGHTS_PATH = os.path.join(SOURCE_BASE_PATH, "bert4rec_model_0609_2")
SOURCE_ITEM_ENCODER_PATH = os.path.join(SOURCE_BASE_PATH, 'bert4rec_model_0609_2_item_encoder.pkl')
SOURCE_ITEM_DECODER_PATH = os.path.join(SOURCE_BASE_PATH, 'bert4rec_model_0609_2_item_decoder.pkl')

# Movies table that carries the genre information.
MOVIES_TABLE_NAME = "`1dt_team8_databricks`.`movielens-32m`.movies"
movies_df = spark.table(MOVIES_TABLE_NAME).toPandas()


# --- Local temporary paths and file copies ---
# The files to log are copied to the driver's local /tmp/ folder.
LOCAL_TMP_PATH = "/tmp/bert4rec_artifacts"
# exist_ok makes the cell safely re-runnable without a check-then-create race.
os.makedirs(LOCAL_TMP_PATH, exist_ok=True)

# Destination paths for the copies.
LOCAL_MODEL_WEIGHTS_PATH = os.path.join(LOCAL_TMP_PATH, "bert4rec_model_0609_2")
LOCAL_ITEM_ENCODER_PATH = os.path.join(LOCAL_TMP_PATH, 'bert4rec_model_0609_2_item_encoder.pkl')
LOCAL_ITEM_DECODER_PATH = os.path.join(LOCAL_TMP_PATH, 'bert4rec_model_0609_2_item_decoder.pkl')
LOCAL_MOVIES_DATA_PATH = os.path.join(LOCAL_TMP_PATH, "movies.csv")

# Copy each file from the Volume to the local driver with dbutils.fs.cp.
# The "file:" scheme marks the destination as the local file system.
for source_path, local_path in (
    (SOURCE_MODEL_WEIGHTS_PATH, LOCAL_MODEL_WEIGHTS_PATH),
    (SOURCE_ITEM_ENCODER_PATH, LOCAL_ITEM_ENCODER_PATH),
    (SOURCE_ITEM_DECODER_PATH, LOCAL_ITEM_DECODER_PATH),
):
    dbutils.fs.cp(source_path.replace("/Volumes", "dbfs:/Volumes"), "file:" + local_path)

# The pandas DataFrame is written straight to local disk.
movies_df.to_csv(LOCAL_MOVIES_DATA_PATH, index=False)

print("Artifacts copied to local temporary directory:", LOCAL_TMP_PATH)


# --- MLflow logging ---
with mlflow.start_run() as run:
    # 1. Artifacts dictionary built from the local temporary paths; the keys
    #    are the names MLflowBert4Rec.load_context reads from context.artifacts.
    artifacts = {
        "pytorch_model": LOCAL_MODEL_WEIGHTS_PATH,
        "item_encoder": LOCAL_ITEM_ENCODER_PATH,
        "item_decoder": LOCAL_ITEM_DECODER_PATH,
        "movies_data": LOCAL_MOVIES_DATA_PATH
    }

    # 2. Conda environment needed to run the model.
    conda_env = {
        'channels': ['defaults', 'pytorch'],
        'dependencies': [
            'python=3.11',  # match the Python version of the environment that saved the model
            'pip',
            # Conda dependencies
            'pytorch==2.3.1',
            'cpuonly',      # request the CPU-only PyTorch build
            {'pip': [
                'mlflow',
                'pandas',
                'cloudpickle==2.2.1',  # match the version of the saving environment
                'numpy'
            ]}
        ],
        'name': 'bert4rec_cpu_env'
    }

    # 3. Input / output schema (signature) of the model.
    # Input example: DataFrame with a 'movie_history' column.
    input_example = pd.DataFrame({
        "movie_history": [[1, 89745, 4993, 5952, 7153]]
    })

    # Output example: DataFrame with a 'recommendations' column.
    output_example = pd.DataFrame({
        "recommendations": [[318, 296, 2571, 1196, 1210, 480, 589, 110, 260, 1198]]
    })
    signature = infer_signature(input_example, output_example)

    registered_model_name = "bert4rec_v4"

    # 4. Log (and register) the MLflow pyfunc model.
    mlflow.pyfunc.log_model(
        artifact_path="bert4rec_model",
        python_model=MLflowBert4Rec(model_class=BERT, model_config=config),
        artifacts=artifacts,  # dictionary of local paths built above
        conda_env=conda_env,
        signature=signature,
        registered_model_name=registered_model_name
    )

    print(f"모델이 MLflow에 등록되었습니다. Run ID: {run.info.run_id}")
    print(f"등록된 모델 이름: {registered_model_name}")

In [0]:
# ==============================================================================
# 3. 등록된 모델 로딩 및 추론 (Databricks 최종 버전)
# ==============================================================================
# Registry version to load; adjust to the version you want to serve.
model_version = 6
model_uri = f"models:/{registered_model_name}/{model_version}"
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [0]:
# Inference input as a pandas DataFrame: one row per user, with that user's
# watch history (a list of movieIds) in the 'movie_history' column.
# Shorter alternative history for quick checks: [1, 89745, 4993, 5952, 7153]
watch_history = [1, 89745, 4993, 5952, 7153, 4896, 5816, 8368, 109487, 33794, 58559, 91529]
inference_data = pd.DataFrame({"movie_history": [watch_history]})

# Run the prediction.
recommendations = loaded_model.predict(inference_data)
print("\n--- MLflow 모델 추론 결과 ---")
print(recommendations)

In [0]:
# Extract the MLflow inference result as a plain list: take the first row of
# the prediction frame, then the value of its first column.
first_row = recommendations.iloc[0]
result_movieids = first_row.tolist()[0]

In [0]:
# 리스트로 반환된 모델 추론 결과를 movie 테이블을 이용해서 영화 이름으로 변환

def get_movie_titles(movie_ids, movies_table_name):
    """
    Return the titles for the given movie ids, in the order the ids were given.

    Args:
        movie_ids (list): movie ids; ``None`` entries are skipped and
            duplicates are collapsed (first occurrence wins).
        movies_table_name (str): name of the Spark table holding movie
            metadata (must expose ``movieId`` and ``title`` columns). This is
            interpolated into the SQL as an identifier, so it must come from
            trusted code, not user input.

    Returns:
        list: movie titles, ordered like ``movie_ids``. Ids not present in the
        table are omitted. An empty or ``None`` input returns ``[]`` without
        querying Spark.

    Raises:
        TypeError, ValueError: if an id cannot be converted to ``int``.
    """
    if movie_ids is None:
        return []

    # Coerce every id to int so only numeric literals reach the SQL text, and
    # de-duplicate while keeping the ranking order.
    ordered_ids = []
    seen = set()
    for raw_id in movie_ids:
        if raw_id is None:
            continue
        movie_id = int(raw_id)
        if movie_id not in seen:
            seen.add(movie_id)
            ordered_ids.append(movie_id)

    # `IN ()` is not valid SQL, so short-circuit when nothing is left to look up.
    if not ordered_ids:
        return []

    id_list = ','.join(map(str, ordered_ids))
    query = f"""
    SELECT movieId, title
    FROM {movies_table_name}
    WHERE movieId IN ({id_list})
    """
    # collect() on the DataFrame avoids the RDD API.
    rows = spark.sql(query).collect()
    title_by_id = {int(row["movieId"]): row["title"] for row in rows}

    # SQL gives no ordering guarantee; restore the caller's ranking order.
    return [title_by_id[movie_id] for movie_id in ordered_ids if movie_id in title_by_id]

In [0]:
# Use the title lookup helper to turn the recommended ids into titles.

movie_ids = result_movieids
movies_table_name = "1dt_team8_databricks.`movielens-32m`.movies"
movie_titles = get_movie_titles(movie_ids, movies_table_name)
# One {'title': ...} record per movie so display() renders a one-column table.
titles = [{'title': movie_title} for movie_title in movie_titles]
display(titles)