In [0]:
%pip install optuna
%pip install graphviz


from graphviz import Digraph
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType, DoubleType, LongType, FloatType
from mlflow.models.signature import infer_signature
from xgboost.spark import SparkXGBClassifier
import mlflow
import mlflow.pyfunc
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import math
from builtins import sum as py_sum
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tempfile
import shap
import json
import optuna

In [0]:
# Create (or attach to) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("sparkXGBoost_final")
    .getOrCreate()
)

# Autologging is switched off; runs are logged explicitly further down.
mlflow.autolog(disable=True)
mlflow.spark.autolog(disable=True)

# 데이터 로드

In [0]:
# Fully qualified location (catalog.schema) of the pre-split rating tables.
catalog = "1dt_team8_databricks"
schema = "`final`"
path = f"{catalog}.{schema}"

try:
    train = spark.read.table(f"{path}.train_temp")
    validation = spark.read.table(f"{path}.validation_temp")
    test = spark.read.table(f"{path}.test_temp")
except Exception as e:
    print(f"Error loading data from Unity Catalog Volume: {e}")
    # Re-raise: every later cell needs these frames, so swallowing the error
    # here would only resurface as a confusing NameError on `train` below.
    raise

# Binary target: a rating of 4 or more counts as a positive interaction.
train = train.withColumn("label", when(train["rating"] >= 4, 1).otherwise(0))
validation = validation.withColumn("label", when(validation["rating"] >= 4, 1).otherwise(0))
test = test.withColumn("label", when(test["rating"] >= 4, 1).otherwise(0))

# `year` is cast to int so VectorAssembler receives a numeric column.
train = train.withColumn("year", col("year").cast("int"))
validation = validation.withColumn("year", col("year").cast("int"))
test = test.withColumn("year", col("year").cast("int"))

# 파이프라인

In [0]:
# Genre string -> token list (split on "|") -> bag-of-genres count vector.
tokenizer = RegexTokenizer(inputCol="genres", outputCol="genres_tokens", pattern="\\|")
vectorizer = CountVectorizer(inputCol="genres_tokens", outputCol="genres_vec")

# Id columns -> numeric indices; handleInvalid="keep" is set so ids absent
# from the fitted data are still transformed instead of raising.
user_indexer = StringIndexer(inputCol="userId", outputCol="userIndex", handleInvalid="keep")
movie_indexer = StringIndexer(inputCol="movieId", outputCol="movieIndex", handleInvalid="keep")

# Final feature vector consumed by the classifier.
feature_columns = ["genres_vec", "userIndex", "movieIndex", "year"]
assembler_all = VectorAssembler(inputCols=feature_columns, outputCol="features")

pipeline_stages = [tokenizer, vectorizer, user_indexer, movie_indexer, assembler_all]
pipeline = Pipeline(stages=pipeline_stages)

# A stand-alone fitted user indexer is kept for the cold-start cell, which
# needs to map a single userId to its index.
user_indexer_model = user_indexer.fit(train)

# Fit on train only, then apply the same fitted pipeline to every split.
pipeline_model = pipeline.fit(train)
train_transformed = pipeline_model.transform(train)
validation_transformed = pipeline_model.transform(validation)
test_transformed = pipeline_model.transform(test)

# The transformed splits are reused by many later cells, so mark them cached.
train_transformed.cache()
validation_transformed.cache()
test_transformed.cache()

In [0]:
# Render one HTML line per fitted pipeline stage (position, class, repr).
for stage_no, fitted_stage in enumerate(pipeline_model.stages):
    stage_class = type(fitted_stage).__name__
    displayHTML(f"<b>Stage {stage_no}</b>: {stage_class}<br>Description: {fitted_stage}")

# MLFlow + Optuna

In [0]:
# Cut-off used for Precision@k during hyper-parameter search.
k = 10
mlflow.set_experiment("/1dt003MLflow")


def objective(trial):
    """Optuna objective: train on the train split, score Precision@k on validation.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``eta`` and ``num_round``.

    Returns:
        The negated mean per-user Precision@k on the validation split
        (negated because the study minimizes), or ``-0.0`` when no value
        could be computed.
    """
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "num_round": trial.suggest_int("num_round", 50, 100),
        "eval_metric": "logloss",
    }

    # xgboost.spark follows the scikit-learn parameter names. The camelCase
    # names used previously (maxDepth / numRound / evalMetric) belong to the
    # Scala XGBoost4J-Spark API and are not recognised here, so the sampled
    # depth and number of rounds never reached the booster.
    model = SparkXGBClassifier(
        features_col="features",
        label_col="label",
        prediction_col="prediction",
        probability_col="probability",
        seed=0,
        max_depth=param["max_depth"],
        learning_rate=param["eta"],
        n_estimators=param["num_round"],
        eval_metric=param["eval_metric"],
    )

    model_fitted = model.fit(train_transformed)

    # Score the validation split and keep the positive-class probability.
    pred_val = model_fitted.transform(validation_transformed)
    pred_val = pred_val.withColumn("y_pred_prob", vector_to_array(col("probability"))[1])

    # Rank each user's validation items by predicted probability, keep top-k.
    window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())
    top_k = pred_val.withColumn("rank", row_number().over(window_spec)) \
                    .filter(col("rank") <= k)

    # Per-user precision = share of positives among the kept rows.
    precision_df = top_k.groupBy("userIndex").agg(avg("label").alias("user_precision"))
    result = precision_df.select(avg("user_precision").alias("precision_at_k")).collect()

    avg_precision_at_k = result[0]["precision_at_k"] if result and result[0]["precision_at_k"] is not None else 0.0

    return -avg_precision_at_k


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Best sampled values, re-keyed for the cells that build the final model.
best_params = study.best_params
spark_params = {
    "max_depth": best_params["max_depth"],
    "eta": best_params["eta"],
    "num_round": best_params["num_round"],
    "eval_metric": "logloss",
    "features_col": "features",
    "label_col": "label",
    "prediction_col": "prediction",
    "probability_col": "probability",
    "missing": 0.0,
    "seed": 0
}

In [0]:
# Retrain on train + validation with the best hyper-parameters found above.
trainval_df = train_transformed.union(validation_transformed)

# xgboost.spark uses scikit-learn style parameter names; the camelCase names
# used previously (maxDepth / numRound / evalMetric) are XGBoost4J-Spark names
# and are not recognised by SparkXGBClassifier, so the tuned depth and number
# of rounds were not applied to the final model.
final_model = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    prediction_col="prediction",
    probability_col="probability",
    seed=0,
    max_depth=spark_params["max_depth"],
    learning_rate=spark_params["eta"],
    n_estimators=spark_params["num_round"],
    eval_metric="logloss",
)

final_model_fitted = final_model.fit(trainval_df)

# Score the held-out test split and expose the positive-class probability.
test_pred = final_model_fitted.transform(test_transformed) \
    .withColumn("y_pred_proba", vector_to_array(col("probability"))[1])

# 사용자 별 추천

In [0]:
# Users that appear in the test split.
test_users = test_transformed.select("userIndex").distinct()

# (user, movie) pairs already present in train or validation, restricted to
# test users and folded into one set of movie indices per user.
history_pairs = train_transformed.select("userIndex", "movieIndex").union(
    validation_transformed.select("userIndex", "movieIndex")
)
seen_movies = (
    history_pairs
    .join(test_users, on="userIndex", how="inner")
    .distinct()
    .groupBy("userIndex")
    .agg(collect_set("movieIndex").alias("seen_movies"))
)

# Drop scored rows whose movie is already in the user's history; users with
# no history (null set after the left join) keep all of their rows.
has_no_history = col("seen_movies").isNull()
is_new_movie = ~array_contains(col("seen_movies"), col("movieIndex"))
pred_filtered = (
    test_pred
    .join(seen_movies, on="userIndex", how="left_outer")
    .filter(has_no_history | is_new_movie)
)

# Keep the K highest-probability rows per user.
K = 10
window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_proba").desc())
ranked_predictions = pred_filtered.withColumn("rank", row_number().over(window_spec))
top_k = ranked_predictions.filter(col("rank") <= K)

# Lookup tables: movieIndex -> movieId, and movieId -> title / genres.
movie_index_map = (
    train_transformed
    .select("movieIndex", col("movieId").alias("movieId_map"))
    .dropDuplicates()
)

movies_meta = (
    train.select("movieId", "title", "genres")
    .dropDuplicates()
    .withColumnRenamed("movieId", "movieId_meta")
    .withColumnRenamed("title", "title_meta")
    .withColumnRenamed("genres", "genres_meta")
)

# Attach the original movie id, then the title, to every recommended row.
top_k_with_movieId = top_k.join(movie_index_map, on="movieIndex", how="left")

meta_join_condition = top_k_with_movieId["movieId_map"] == movies_meta["movieId_meta"]
top_k_with_meta = top_k_with_movieId.join(movies_meta, meta_join_condition, how="left")

# Final table; note "predicted_rating" holds the positive-class probability.
recommendations = (
    top_k_with_meta
    .select(
        "userIndex",
        col("title_meta").alias("title"),
        col("y_pred_proba").alias("predicted_rating"),
    )
    .orderBy("userIndex", col("predicted_rating").desc())
)

display(recommendations)

# 평가지표

In [0]:
K = 10

# 1. Per-user Top-K rows come from the `top_k` frame built in the previous cell.

# 2. Precision@K: positives among the recommended rows / recommended rows.
precision_df = top_k.groupBy("userIndex") \
    .agg(
        _sum("label").alias("true_positives"),
        count("label").alias("recommended_count")
    ) \
    .withColumn("precision_at_k", col("true_positives") / col("recommended_count"))

mean_precision_at_k = precision_df.select(avg("precision_at_k").alias("mean_precision_at_k")) \
    .first()["mean_precision_at_k"]

print(f"[Test Set] Precision@{K}: {mean_precision_at_k:.4f}")

# 3. Total number of positive items per user (over all scored test rows).
total_relevant_df = test_pred.groupBy("userIndex") \
    .agg(_sum("label").alias("total_relevant"))

# 4. Number of positives inside each user's Top-K.
true_positives_df = top_k.groupBy("userIndex") \
    .agg(_sum("label").alias("true_positives"))

# 5. Recall@K. Users without any positive item get a null recall and are
#    skipped by avg(); the explicit guard keeps that behaviour but avoids a
#    division-by-zero error when ANSI SQL mode is enabled.
recall_df = true_positives_df.join(total_relevant_df, on="userIndex") \
    .withColumn(
        "recall_at_k",
        when(col("total_relevant") > 0, col("true_positives") / col("total_relevant"))
    )

mean_recall_at_k = recall_df.select(avg("recall_at_k").alias("mean_recall_at_k")) \
    .first()["mean_recall_at_k"]

print(f"[Test Set] Recall@{K}: {mean_recall_at_k:.4f}")

# 6. Per-user label list in rank order, used for nDCG@K.
#    collect_list() gives no ordering guarantee after a groupBy shuffle, even
#    when the input was sorted, so (rank, label) pairs are collected and
#    sorted inside the array before the labels are extracted.
user_labels_test = top_k.groupBy("userIndex") \
    .agg(sort_array(collect_list(struct("rank", "label"))).alias("ranked_labels")) \
    .select("userIndex", col("ranked_labels.label").alias("label_list"))

# Discounted cumulative gain helper
def dcg_at_k(labels, k):
    """Return the DCG of the first ``k`` entries of ``labels``.

    The entry at 1-based position ``p`` contributes ``label / log2(p + 1)``.
    An empty or ``None`` list yields ``0.0``.
    """
    if not labels:
        return 0.0
    head = labels[:k]
    return py_sum(gain / math.log2(position + 1) for position, gain in enumerate(head, start=1))

# Ideal DCG helper
def idcg_at_k(labels, k):
    """Return the best DCG reachable by reordering ``labels``.

    The labels are sorted in descending order and the first ``k`` are
    discounted by ``log2(position + 1)``. An empty or ``None`` list yields
    ``0.0``. The ideal ordering is built only from the labels passed in.
    """
    if not labels:
        return 0.0
    ideal_order = list(labels)
    ideal_order.sort(reverse=True)
    return py_sum(
        gain / math.log2(position + 1)
        for position, gain in enumerate(ideal_order[:k], start=1)
    )

# Wrap the pure-Python helpers as Spark UDFs returning doubles.
dcg_udf = udf(lambda x: dcg_at_k(x, K), DoubleType())
idcg_udf = udf(lambda x: idcg_at_k(x, K), DoubleType())

# nDCG@K per user. A user whose Top-K holds no positive label has idcg == 0;
# the guard leaves that user's nDCG null (so avg() skips it, as before)
# instead of dividing by zero, which raises when ANSI SQL mode is enabled.
ndcg_test_df = user_labels_test \
    .withColumn("dcg", dcg_udf("label_list")) \
    .withColumn("idcg", idcg_udf("label_list")) \
    .withColumn("ndcg_at_k", when(col("idcg") > 0, col("dcg") / col("idcg")))

mean_ndcg_at_k = ndcg_test_df.select(avg("ndcg_at_k").alias("mean_ndcg_at_k")) \
    .first()["mean_ndcg_at_k"]

print(f"[Test Set] nDCG@{K}: {mean_ndcg_at_k:.4f}")

# 7. Visualisation: one bar per metric, annotated with its value.
metrics = ['Precision@K', 'Recall@K', 'nDCG@K']
scores = [mean_precision_at_k, mean_recall_at_k, mean_ndcg_at_k]

plt.figure(figsize=(6, 4))
bars = plt.bar(metrics, scores, color=['skyblue', 'lightgreen', 'salmon'])

for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2.0, height + 0.01, f"{height:.4f}", ha='center', va='bottom')

plt.title(f'Evaluation Metrics @K={K}')
plt.ylim(0, 1.1)
plt.ylabel("Score")
plt.tight_layout()
plt.show()

# 모델 저장

In [0]:
# Hyper-parameters recorded with the run (keys kept as previously logged).
best_params = {
    "maxDepth": spark_params["max_depth"],
    "eta": spark_params["eta"],
    "numRound": spark_params["num_round"],
    "evalMetric": "logloss"
}

# Evaluation metrics computed on the test split.
metrics = {
    "precision_at_k": mean_precision_at_k,
    "recall_at_k": mean_recall_at_k,
    "ndcg_at_k": mean_ndcg_at_k,
    "K": K
}

# Select the MLflow experiment for this run.
mlflow.set_experiment("/1dt003MLflow")

with mlflow.start_run(run_name="XGBoost_Final_Model") as run:
    run_id = run.info.run_id

    # 1. Hyper-parameters.
    mlflow.log_params(best_params)

    # 2. Preprocessing pipeline (fitted on the train split).
    mlflow.spark.log_model(pipeline_model, artifact_path="preprocessing_pipeline")

    # 3. Final model (refitted on train + validation).
    mlflow.spark.log_model(final_model_fitted, artifact_path="xgb_model")

    # JSON artifacts are staged in a temporary directory that is removed when
    # the block exits. Previously NamedTemporaryFile(delete=False) left the
    # files on disk and gave each artifact a random, run-specific file name.
    with tempfile.TemporaryDirectory() as staging_dir:
        # 4. Recommendations (Spark DataFrame -> pandas -> JSON records).
        recommendations_pd = recommendations.toPandas()
        recommendations_json = recommendations_pd.to_json(orient="records")
        temp_recommendations_path = os.path.join(staging_dir, "recommendations.json")
        with open(temp_recommendations_path, "w", encoding="utf-8") as f:
            f.write(recommendations_json)
        mlflow.log_artifact(temp_recommendations_path, artifact_path="recommendations")

        # 5. Evaluation metrics.
        for key, value in metrics.items():
            mlflow.log_metric(key, value)

        # 6. Hyper-parameters again, as a stand-alone JSON artifact (optional).
        temp_params_path = os.path.join(staging_dir, "best_params.json")
        with open(temp_params_path, "w", encoding="utf-8") as f:
            json.dump(best_params, f)
        mlflow.log_artifact(temp_params_path, artifact_path="params")

print(f"MLflow run completed. Run ID: {run_id}")

# 신규 유저

In [0]:
from pyspark.sql.functions import col, collect_set, lit, desc, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.functions import vector_to_array

# Step 1. Movies the new (cold-start) user has already watched.
# NOTE(review): the ids are ints; this presumably matches the dtype of
# train.movieId — confirm, otherwise the set overlap below is always empty.
new_user_seen_movies = [1, 10, 50, 300]
new_user_set = set(new_user_seen_movies)

# Step 2. One row per existing user holding the set of movies they rated.
user_movie_sets = (
    train
    .groupBy("userId")
    .agg(collect_set("movieId").alias("movie_set"))
)

# Step 3. Jaccard similarity between two collections of movie ids.
def jaccard_similarity(set1, set2):
    """Return |A ∩ B| / |A ∪ B| for the two iterables, or 0.0 if both are empty."""
    left = set(set1)
    right = set(set2)
    union_size = len(left | right)
    if union_size == 0:
        return 0.0
    shared = len(left & right)
    return float(shared) / union_size

jaccard_udf = udf(lambda x: jaccard_similarity(new_user_set, x), DoubleType())

# Step 4. Pick the single most similar existing user.
# userId is a secondary sort key so ties are broken deterministically.
similar_users = user_movie_sets.withColumn("jaccard_sim", jaccard_udf(col("movie_set"))) \
                               .orderBy(desc("jaccard_sim"), "userId") \
                               .limit(1)

# Fetch the best match once; calling count() and then first() ran the whole
# similarity ranking twice.
best_match = similar_users.select("userId", "jaccard_sim").first()

# A best similarity of 0 means no existing user shares a single movie with
# the new user, so there is no meaningful "similar user" to borrow from.
if best_match is None or not best_match["jaccard_sim"]:
    print("❌ 유사한 유저를 찾을 수 없습니다.")
else:
    # Step 5. userId and userIndex of the similar user.
    top_user_id = best_match["userId"]
    top_user_index = user_indexer_model.transform(
        spark.createDataFrame([(top_user_id,)], ["userId"])
    ).select("userIndex").first()["userIndex"]

    # Step 6. Candidate movies = every movie in train or test that the new
    # user has not seen.
    all_movies = train.select("movieId").union(test.select("movieId")).dropDuplicates()
    unseen_movies = all_movies.filter(~col("movieId").isin(new_user_seen_movies))

    # Step 7. Attach title and genres.
    unseen_movies = unseen_movies.join(
        movies_meta,
        unseen_movies.movieId == movies_meta.movieId_meta,
        how="left"
    ).select(
        unseen_movies.movieId,
        movies_meta["title_meta"].alias("title"),
        movies_meta["genres_meta"].alias("genres")
    )

    # Step 8. Attach the release year (one row per movie).
    movie_year_df = train.select("movieId", "year").dropna().dropDuplicates(["movieId"])
    unseen_movies = unseen_movies.join(movie_year_df, on="movieId", how="left")

    # Step 9. Score the candidates as if the similar user were asking.
    unseen_movies = unseen_movies.withColumn("userId", lit(top_user_id))

    # Step 10. Build the feature vector with the fitted pipeline.
    features_df = pipeline_model.transform(unseen_movies)

    # Step 11. Overwrite userIndex with the similar user's index.
    features_df = features_df.withColumn("userIndex", lit(top_user_index))

    # Step 12. Keep one row per movie.
    features_unique = features_df.dropDuplicates(["movieId"])

    # Step 13. Predict.
    predicted_df = final_model_fitted.transform(features_unique)

    # Step 14. Positive-class probability.
    predicted_df = predicted_df.withColumn("prediction_proba", vector_to_array(col("probability"))[1])

    # Step 15. Top-K by probability.
    K = 10
    top_k_df = predicted_df.orderBy(col("prediction_proba").desc()).limit(K)

    # Step 16. Show the result.
    top_k_df.select("movieId", "title", "prediction_proba").show(truncate=False)


# 모델 서비스(배포 실패)

In [0]:
class MovieRecommender(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper that returns the top-10 unseen movies for one user.

    Expects a pandas DataFrame with a ``userId`` column and a
    ``seen_movie_ids`` column (a list of movie ids); only the first row is
    used. Returns a pandas DataFrame with ``movieId`` and ``title``.

    NOTE(review): the model needs a live Spark session and read access to the
    ``train_temp`` table at load time — confirm the serving environment
    provides both.
    """

    def load_context(self, context):
        """Load the Spark models and cache the lookup frames used by predict()."""
        # Imported locally: the name `F` was never imported at notebook level,
        # so the previous `F.collect_set` call raised NameError.
        from pyspark.sql import functions as F

        self.spark = SparkSession.builder.getOrCreate()

        # Preprocessing pipeline and classifier, both stored as Spark models.
        pipeline_uri = context.artifacts["pipeline_model_path"]
        self.pipeline_model = mlflow.spark.load_model(pipeline_uri)

        model_uri = context.artifacts["xgboost_model_path"]
        self.final_model = mlflow.spark.load_model(model_uri)

        # Training interactions are the source of movie metadata and of each
        # user's watch history.
        catalog = "1dt_team8_databricks"
        schema = "final"
        path = f"{catalog}.{schema}"

        self.train_df = self.spark.read.table(f"{path}.train_temp").cache()
        self.movie_meta = self.train_df.select("movieId", "title", "genres", "year").distinct().cache()
        self.all_movies = self.movie_meta.select("movieId").distinct().cache()
        self.user_movies = (
            self.train_df.groupBy("userId")
            .agg(F.collect_set("movieId").alias("movie_set"))
            .cache()
        )

        # Built once here; extracts the positive-class probability.
        self.extract_prob_udf = F.udf(lambda v: float(v[1]), FloatType())

    def jaccard_similarity(self, set1, set2):
        """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union != 0 else 0

    def find_most_similar_user(self, new_user_movies):
        """Return the userId with the highest non-zero Jaccard overlap, else None."""
        max_sim = 0
        best_user = None
        new_user_set = set(new_user_movies)
        # All user rows are collected to the driver; may be slow for many
        # users, so a production service should cache this separately.
        for row in self.user_movies.collect():
            sim = self.jaccard_similarity(new_user_set, set(row["movie_set"]))
            if sim > max_sim:
                max_sim = sim
                best_user = row["userId"]
        return best_user

    def predict(self, context, model_input):
        """Score every unseen movie for the requested user and return the top 10.

        Raises:
            ValueError: if ``userId`` or ``seen_movie_ids`` is missing.
        """
        from pyspark.sql import functions as F

        # Input validation.
        if "userId" not in model_input or "seen_movie_ids" not in model_input:
            raise ValueError("Input must contain 'userId' and 'seen_movie_ids' columns")

        user_id = int(model_input["userId"].iloc[0])
        seen_movies = set(model_input["seen_movie_ids"].iloc[0])

        # One lookup tells us both whether the user is known and their history.
        history_rows = (
            self.user_movies.filter(F.col("userId") == user_id)
            .select("movie_set")
            .take(1)
        )

        if history_rows:
            # Known user: score as themselves, skip everything they rated.
            scoring_user_id = user_id
            exclude_movies = set(history_rows[0][0]).union(seen_movies)
        else:
            # Cold start: score as the most similar known user (same approach
            # as the notebook's new-user cell) and skip only what the new user
            # has seen. Previously the unknown id was scored and the similar
            # user's movies were excluded, so the look-alike's preferences
            # never influenced the ranking.
            similar_user = self.find_most_similar_user(seen_movies)
            scoring_user_id = similar_user if similar_user is not None else user_id
            exclude_movies = seen_movies

        # Candidate movies the user has not seen.
        unseen_movies = self.all_movies.filter(~self.all_movies.movieId.isin(list(exclude_movies)))

        # movie_meta is distinct over four columns, so a movie can appear more
        # than once; keep a single row per movie to avoid duplicate results.
        user_unseen_df = unseen_movies.join(self.movie_meta, on="movieId", how="left") \
                                      .dropDuplicates(["movieId"]) \
                                      .withColumn("userId", F.lit(scoring_user_id))

        # Feature extraction.
        features_df = self.pipeline_model.transform(user_unseen_df)

        # Scoring.
        predictions = self.final_model.transform(features_df)
        predictions = predictions.withColumn("score", self.extract_prob_udf(F.col("probability")))

        # Top 10 by score.
        top_k = predictions.orderBy(F.col("score").desc()).limit(10)
        top_k_with_title = top_k.select("movieId", "title")

        return top_k_with_title.toPandas()

# Input / output examples used to infer the model signature.
input_example = pd.DataFrame({
    "userId": [1],
    "seen_movie_ids": [[1, 2, 3]]
})
output_example = pd.DataFrame({
    "movieId": [10],
    "title": ["Movie A"]
})

signature = infer_signature(input_example, output_example)

with mlflow.start_run(run_name="Xgboost_Service") as run:
    # 1. Preprocessing pipeline.
    mlflow.spark.log_model(pipeline_model, artifact_path="pipeline_model")

    # 2. Classifier, logged with the Spark flavor. MovieRecommender.load_context
    #    reads this artifact with mlflow.spark.load_model and calls .transform()
    #    on it; the previous native-booster artifact (mlflow.xgboost.log_model)
    #    cannot be loaded that way.
    mlflow.spark.log_model(final_model_fitted, artifact_path="xgboost_model")

    # 3. pyfunc wrapper that references the two artifacts logged above.
    mlflow.pyfunc.log_model(
        artifact_path="movie_recommender_Xgb",
        python_model=MovieRecommender(),
        artifacts={
            "pipeline_model_path": f"runs:/{run.info.run_id}/pipeline_model",
            "xgboost_model_path": f"runs:/{run.info.run_id}/xgboost_model"
        },
        input_example=input_example,
        signature=signature
    )

    # 4. Register the pyfunc model.
    model_uri = f"runs:/{run.info.run_id}/movie_recommender_Xgb"
    mlflow.register_model(model_uri, "movie_recommender_Xgb")

In [0]:
class MovieRecommender(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper that returns the top-10 unseen movies for one user.

    Expects a pandas DataFrame with a ``userId`` column and a
    ``seen_movie_ids`` column (a list of movie ids); only the first row is
    used. Returns a pandas DataFrame with ``movieId`` and ``title``.

    NOTE(review): the model needs a live Spark session and read access to the
    ``train_temp`` table at load time — confirm the serving environment
    provides both.
    """

    def load_context(self, context):
        """Load the Spark models and cache the lookup frames used by predict()."""
        # Imported locally: the name `F` was never imported at notebook level,
        # so the previous `F.collect_set` call raised NameError.
        from pyspark.sql import functions as F

        self.spark = SparkSession.builder.getOrCreate()

        # Preprocessing pipeline and classifier, both stored as Spark models.
        pipeline_uri = context.artifacts["pipeline_model_path"]
        self.pipeline_model = mlflow.spark.load_model(pipeline_uri)

        model_uri = context.artifacts["xgboost_model_path"]
        self.final_model = mlflow.spark.load_model(model_uri)

        # Training interactions are the source of movie metadata and of each
        # user's watch history.
        catalog = "1dt_team8_databricks"
        schema = "final"
        path = f"{catalog}.{schema}"

        self.train_df = self.spark.read.table(f"{path}.train_temp").cache()
        self.movie_meta = self.train_df.select("movieId", "title", "genres", "year").distinct().cache()
        self.all_movies = self.movie_meta.select("movieId").distinct().cache()
        self.user_movies = (
            self.train_df.groupBy("userId")
            .agg(F.collect_set("movieId").alias("movie_set"))
            .cache()
        )

        # Built once here; extracts the positive-class probability.
        self.extract_prob_udf = F.udf(lambda v: float(v[1]), FloatType())

    def jaccard_similarity(self, set1, set2):
        """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union != 0 else 0

    def find_most_similar_user(self, new_user_movies):
        """Return the userId with the highest non-zero Jaccard overlap, else None."""
        max_sim = 0
        best_user = None
        new_user_set = set(new_user_movies)
        # All user rows are collected to the driver; may be slow for many
        # users, so a production service should cache this separately.
        for row in self.user_movies.collect():
            sim = self.jaccard_similarity(new_user_set, set(row["movie_set"]))
            if sim > max_sim:
                max_sim = sim
                best_user = row["userId"]
        return best_user

    def predict(self, context, model_input):
        """Score every unseen movie for the requested user and return the top 10.

        Raises:
            ValueError: if ``userId`` or ``seen_movie_ids`` is missing.
        """
        from pyspark.sql import functions as F

        # Input validation.
        if "userId" not in model_input or "seen_movie_ids" not in model_input:
            raise ValueError("Input must contain 'userId' and 'seen_movie_ids' columns")

        user_id = int(model_input["userId"].iloc[0])
        seen_movies = set(model_input["seen_movie_ids"].iloc[0])

        # One lookup tells us both whether the user is known and their history.
        history_rows = (
            self.user_movies.filter(F.col("userId") == user_id)
            .select("movie_set")
            .take(1)
        )

        if history_rows:
            # Known user: score as themselves, skip everything they rated.
            scoring_user_id = user_id
            exclude_movies = set(history_rows[0][0]).union(seen_movies)
        else:
            # Cold start: score as the most similar known user (same approach
            # as the notebook's new-user cell) and skip only what the new user
            # has seen. Previously the unknown id was scored and the similar
            # user's movies were excluded, so the look-alike's preferences
            # never influenced the ranking.
            similar_user = self.find_most_similar_user(seen_movies)
            scoring_user_id = similar_user if similar_user is not None else user_id
            exclude_movies = seen_movies

        # Candidate movies the user has not seen.
        unseen_movies = self.all_movies.filter(~self.all_movies.movieId.isin(list(exclude_movies)))

        # movie_meta is distinct over four columns, so a movie can appear more
        # than once; keep a single row per movie to avoid duplicate results.
        user_unseen_df = unseen_movies.join(self.movie_meta, on="movieId", how="left") \
                                      .dropDuplicates(["movieId"]) \
                                      .withColumn("userId", F.lit(scoring_user_id))

        # Feature extraction.
        features_df = self.pipeline_model.transform(user_unseen_df)

        # Scoring.
        predictions = self.final_model.transform(features_df)
        predictions = predictions.withColumn("score", self.extract_prob_udf(F.col("probability")))

        # Top 10 by score.
        top_k = predictions.orderBy(F.col("score").desc()).limit(10)
        top_k_with_title = top_k.select("movieId", "title")

        return top_k_with_title.toPandas()

# Input / output examples used to infer the model signature.
input_example = pd.DataFrame({
    "userId": [1],
    "seen_movie_ids": [[1, 2, 3]]
})
output_example = pd.DataFrame({
    "movieId": [10],
    "title": ["Movie A"]
})

signature = infer_signature(input_example, output_example)

# Local copies are staged in a fresh temporary directory: mlflow.spark.save_model
# refuses to write into an existing path, so the fixed /tmp paths used before
# failed on every re-run. The directory is removed once the run block exits.
with mlflow.start_run(run_name="Xgboost_Service") as run, \
        tempfile.TemporaryDirectory(prefix="movie_recommender_") as staging_root:
    # 1. Preprocessing pipeline: local copy for the pyfunc artifacts + run artifact.
    pipeline_local_path = os.path.join(staging_root, "pipeline_model")
    mlflow.spark.save_model(pipeline_model, pipeline_local_path)
    mlflow.spark.log_model(pipeline_model, artifact_path="pipeline_model")

    # 2. Classifier, saved with the Spark flavor. MovieRecommender.load_context
    #    reads this path with mlflow.spark.load_model; the raw `model.bst`
    #    booster file written before is not an MLflow Spark model and could
    #    not be loaded that way.
    xgb_local_path = os.path.join(staging_root, "xgb_model")
    mlflow.spark.save_model(final_model_fitted, xgb_local_path)
    mlflow.spark.log_model(final_model_fitted, artifact_path="xgboost_model")

    # 3. pyfunc wrapper; the artifacts point at the local copies made above.
    mlflow.pyfunc.log_model(
        artifact_path="movie_recommender_Xgboost",
        python_model=MovieRecommender(),
        artifacts={
            "pipeline_model_path": pipeline_local_path,
            "xgboost_model_path": xgb_local_path
        },
        input_example=input_example,
        signature=signature
    )

    # 4. Register the pyfunc model.
    model_uri = f"runs:/{run.info.run_id}/movie_recommender_Xgboost"
    mlflow.register_model(model_uri, "movie_recommender_Xgboost")

# 전체 유저 featureImportance

In [0]:
# Locate the fitted CountVectorizer stage to recover the genre vocabulary.
vectorizer_model = next(
    (stage for stage in pipeline_model.stages if stage.uid.startswith("CountVectorizer")),
    None
)
if vectorizer_model is None:
    # Without this guard the `.vocabulary` access below fails with an
    # uninformative AttributeError on None.
    raise ValueError("pipeline_model does not contain a CountVectorizer stage.")

# Feature names in the order the VectorAssembler concatenates its inputs.
genre_vocab = vectorizer_model.vocabulary
genre_feature_names = ["genre_" + genre for genre in genre_vocab]
feature_names = genre_feature_names + ["userIndex", "movieIndex", "year"]

# Explain a small sample (up to 500 rows) of the test features on the driver.
sampled_features = test_transformed.select("features").limit(500).collect()
X = np.vstack([row["features"].toArray() for row in sampled_features])

if X.shape[1] != len(feature_names):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match feature vector width ({X.shape[1]})."
    )

booster = final_model_fitted.get_booster()

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X)

# Mean absolute SHAP value per feature, largest first.
summary_df = pd.DataFrame({
    'Feature': feature_names,
    'Mean_Abs_SHAP': np.abs(shap_values).mean(axis=0)
}).sort_values(by='Mean_Abs_SHAP', ascending=False)

display(summary_df)

Databricks visualization. Run in Databricks to view.

# 특정 유저에게 왜 해당 영화 추천했는지

In [0]:
from pyspark.sql.functions import col

# 1. Recover the genre feature names from the fitted CountVectorizer stage.
vectorizer_model = next(
    (stage for stage in pipeline_model.stages if stage.uid.startswith("CountVectorizer")),
    None
)
if vectorizer_model is None:
    raise ValueError("CountVectorizer가 pipeline_model에 존재하지 않습니다.")

genre_vocab = vectorizer_model.vocabulary
genre_feature_names = ["genre_" + genre for genre in genre_vocab]
feature_names = genre_feature_names + ["userIndex", "movieIndex", "year"]

# 2. User (by userIndex) whose recommendation is explained.
target_user = 1

# 3. Take one recommended movie for that user.
user_recs = recommendations.filter(col("userIndex") == target_user).limit(1).collect()
if not user_recs:
    print(f"User {target_user} has no recommendations.")
else:
    rec_row = user_recs[0]
    title = rec_row["title"]

    # Resolve the movieId from the title.
    movie_row = movies_meta.filter(col("title_meta") == title).select("movieId_meta").collect()
    if not movie_row:
        print(f"title에 해당하는 movieId를 찾을 수 없습니다: {title}")
    else:
        movie_id = movie_row[0]["movieId_meta"]

        # Feature vector of this (user, movie) pair from the test split.
        feature_row = test_transformed.filter(
            (col("userIndex") == target_user) & (col("movieId") == movie_id)
        ).select("features").collect()

        if not feature_row:
            print(f"feature vector를 찾을 수 없습니다: userIndex={target_user}, movieId={movie_id}")
        else:
            feature_vector = feature_row[0]["features"].toArray().reshape(1, -1)

            # Native booster extracted from the fitted Spark model.
            booster = final_model_fitted.get_booster()

            explainer = shap.TreeExplainer(booster)
            shap_values = explainer.shap_values(feature_vector)

            print(f"\n🎬 Movie Title: {title}")
            print(f"🎥 MovieId: {movie_id}")
            genres_row = movies_meta.filter(col("movieId_meta") == movie_id).select("genres_meta").collect()
            genres = genres_row[0]["genres_meta"] if genres_row else "Unknown"
            print(f"📚 Genres: {genres}")

            # Per-feature contribution table, largest absolute impact first.
            # `shap_df` was displayed but never defined, so this cell always
            # ended in a NameError.
            shap_df = pd.DataFrame({
                "Feature": feature_names,
                "Feature_Value": feature_vector[0],
                "SHAP_Value": np.ravel(shap_values[0]),
            })
            shap_df["Abs_SHAP_Value"] = shap_df["SHAP_Value"].abs()
            shap_df = shap_df.sort_values(by="Abs_SHAP_Value", ascending=False)

            display(shap_df)

In [0]:
# Uses `shap_values`, `feature_vector` and `feature_names` from the previous
# cell; that cell only defines them when a feature vector was found.
shap_vals = shap_values[0]        # SHAP values of the single explained sample
feature_vals = feature_vector[0]  # raw feature values of that sample
names = feature_names

# Indices of the top-N features by absolute SHAP value, most influential first.
top_n = 10
indices = np.argsort(np.abs(shap_vals))[::-1][:top_n]

top_names = []
top_shap_vals = []
top_feature_vals = []
for feature_idx in indices:
    top_names.append(names[feature_idx])
    top_shap_vals.append(shap_vals[feature_idx])
    top_feature_vals.append(feature_vals[feature_idx])

# Horizontal bars: one colour for negative contributions, another otherwise.
bar_colors = ["#88d8b0" if not v < 0 else "#ff6f69" for v in top_shap_vals]

plt.figure(figsize=(10, 6))
bars = plt.barh(top_names, top_shap_vals, color=bar_colors)
plt.xlabel("SHAP Value (Feature Impact)")
plt.title(f"Top {top_n} SHAP Feature Importances")

# Annotate every bar with the underlying feature value.
for bar, val in zip(bars, top_feature_vals):
    label_y = bar.get_y() + bar.get_height()/2
    plt.text(bar.get_width(), label_y, f" = {val:.2f}", va='center')

plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# 모델 로드

In [0]:
# Run that holds the logged pipeline, model and recommendations.
run_id = "67ff5e58e5dc47d2a114e83c36d61f06"
model_uri = f"runs:/{run_id}/xgb_model"

preprocessing_uri = f"runs:/{run_id}/preprocessing_pipeline"
pipeline_model = mlflow.spark.load_model(preprocessing_uri)

final_model_fitted = mlflow.spark.load_model(model_uri)

test_transformed = pipeline_model.transform(test)
test_df = final_model_fitted.transform(test_transformed)
# Stage order in the pipeline is tokenizer, vectorizer, user indexer,
# movie indexer, assembler, so index 2 is the fitted user StringIndexer.
user_indexer_model = pipeline_model.stages[2]

# Reuse the frame scored above. The model was previously applied a second
# time to its own output, which re-scored every row for no benefit.
test_pred = test_df \
    .withColumn("y_pred_proba", vector_to_array(col("probability"))[1])

# Artifact URI of the stored recommendations (the file name is specific to
# the run above).
artifact_uri = f"runs:/{run_id}/recommendations/tmpq44uxkn1.json"

# Download the artifact to a local path.
local_path = mlflow.artifacts.download_artifacts(artifact_uri)

# Read the JSON records.
with open(local_path, "r", encoding="utf-8") as f:
    recommendations = json.load(f)

# Flatten every record to [userIndex, title, predicted_rating].
recommendations = [
    [int(item["userIndex"]), item["title"], item["predicted_rating"]]
    for item in recommendations
]
movies_meta = (
    train.select("movieId", "title", "genres")
    .dropDuplicates()
    .withColumnRenamed("movieId", "movieId_meta")
    .withColumnRenamed("title", "title_meta")
    .withColumnRenamed("genres", "genres_meta")
)