# 0. 환경

In [0]:
%pip install optuna

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType, IntegerType
from xgboost.spark import SparkXGBClassifier
import mlflow
import math
from builtins import sum as py_sum
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tempfile
import shap
# import optuna

In [0]:
# Create (or reuse) the notebook's Spark session.
spark = (
    SparkSession.builder
    .appName("sparkXGBoost")
    .getOrCreate()
)

# Turn MLflow autologging off; runs are logged explicitly further down.
mlflow.autolog(disable=True)
mlflow.spark.autolog(disable=True)

# 1. 데이터로드

### 데이터 로드

In [0]:
# Unity Catalog location of the pre-split rating tables.
catalog = "1dt_team8_databricks"
schema = "`final`"
path = f"{catalog}.{schema}"

try:
    train = spark.read.table(f"{path}.train_df")
    validation = spark.read.table(f"{path}.validation_df")
    test = spark.read.table(f"{path}.test_df")
except Exception as e:
    print(f"Error loading data from Unity Catalog Volume: {e}")
    # Every later cell needs all three tables. Re-raise so the failure is
    # reported here rather than as a NameError on `train` a few lines below.
    raise
# display(train)
# display(validation)
# display(test)

# Binary target: a rating of 4 or higher is the positive class.
train = train.withColumn("label", when(train["rating"] >= 4, 1).otherwise(0))
validation = validation.withColumn("label", when(validation["rating"] >= 4, 1).otherwise(0))
test = test.withColumn("label", when(test["rating"] >= 4, 1).otherwise(0))


# Distinct movieIds of each split.
movies_train = train.select("movieId").distinct()
movies_validation = validation.select("movieId").distinct()
movies_test = test.select("movieId").distinct()

# Union of the three splits, de-duplicated again after the union.
all_movies = movies_train.union(movies_validation).union(movies_test).distinct()

### 임시코드

In [0]:
# Unity Catalog location of the IMDb tables.
catalog = "1dt_team8_databricks"
schema = "`imdb`"
imdb_path = f"{catalog}.{schema}"

try:
    title_basics = spark.read.table(f"{imdb_path}.title_basics")
    print("Data loaded successfully from Unity Catalog Volume.")
except Exception as e:
    print(f"Error loading data from Unity Catalog Volume: {e}")
    print(f"Please ensure CSV files (imdb_ratings.csv) exist in {imdb_path}")
    # `title_basics` is required by the year back-fill below; re-raise instead
    # of continuing with the name undefined.
    raise

In [0]:
# MovieLens links table (movieId -> imdbId).
catalog = "1dt_team8_databricks"
schema = "`movielens-32m`"
volume_path = f"{catalog}.{schema}"

links = spark.read.table(f"{volume_path}.links")

catalog = "1dt_team8_databricks"
schema = "`imdb`"
imdb_path = f"{catalog}.{schema}"

# 1. Convert imdbId into IMDb's tconst form ("tt" + at least 7 digits).
#    lpad() shortens any string longer than the target width, so it is applied
#    only to ids with fewer than 7 digits; 8-digit ids are kept whole instead
#    of being cut to their first 7 characters. A null imdbId still yields a
#    null tconst because concat() propagates nulls.
imdb_digits = col("imdbId").cast("string")
links_with_tconst = links.withColumn(
    "tconst",
    concat(
        lit("tt"),
        when(length(imdb_digits) < 7, lpad(imdb_digits, 7, "0")).otherwise(imdb_digits),
    ),
)

# 2. Keep only the columns needed for the year lookup.
title_years = title_basics.select("tconst", "startYear")

# 3. Helper that back-fills a null `year` from IMDb's startYear.
def fill_year_from_title_basics(df, links_df, title_basics_df):
    """Return `df` with null `year` values replaced by IMDb `startYear`.

    `df` is left-joined to `links_df` on `movieId` to obtain `tconst`, then
    left-joined to `title_basics_df` on `tconst` to obtain `startYear`.
    Rows that already have a `year` keep it. The helper columns `startYear`
    and `tconst` are dropped from the result.
    """
    movie_to_tconst = links_df.select("movieId", "tconst")
    tconst_to_year = title_basics_df.select("tconst", "startYear")

    enriched = (
        df.join(movie_to_tconst, on="movieId", how="left")
          .join(tconst_to_year, on="tconst", how="left")
    )

    # Existing year wins; startYear is used only where year is null.
    patched = enriched.withColumn("year", coalesce(col("year"), col("startYear")))

    return patched.drop("startYear", "tconst")

# 4. Back-fill the year on every split.
train, validation, test = [
    fill_year_from_title_basics(split_df, links_with_tconst, title_years)
    for split_df in (train, validation, test)
]

# 5. Report how many rows still have no year after the back-fill.
for split_label, split_df in (("Train", train), ("Val", validation), ("Test", test)):
    print(f"{split_label} 결측치 수:", split_df.filter(col("year").isNull()).count())

# Cast year to integer on all three splits.
year_as_int = col("year").cast(IntegerType())
train = train.withColumn("year", year_as_int)
validation = validation.withColumn("year", year_as_int)
test = test.withColumn("year", year_as_int)

# 2. 데이터분리

## 파이프라인

In [0]:
# Index userId / movieId; labels unseen during fit go to an extra bucket
# (handleInvalid="keep") instead of raising at transform time.
user_indexer = StringIndexer(inputCol="userId", outputCol="userIndex", handleInvalid="keep")
movie_indexer = StringIndexer(inputCol="movieId", outputCol="movieIndex", handleInvalid="keep")

# Split the pipe-separated genres string and count-encode the tokens.
tokenizer = RegexTokenizer(inputCol="genres", outputCol="genres_tokens", pattern="\\|")
vectorizer = CountVectorizer(inputCol="genres_tokens", outputCol="genres_vec")

# Feature order matters: the SHAP cells rebuild feature names in this order.
assembler_all = VectorAssembler(
    inputCols=["genres_vec", "userIndex", "movieIndex", "year"],
    outputCol="features"
)

pipeline = Pipeline(stages=[
    tokenizer, vectorizer,
    user_indexer, movie_indexer, assembler_all
])

pipeline_model = pipeline.fit(train)

# Reuse the indexer models fitted inside the pipeline rather than fitting each
# StringIndexer a second time on `train`; fitted stages keep the stage order.
_, _, user_indexer_model, movie_indexer_model, _ = pipeline_model.stages

train_transformed = pipeline_model.transform(train)
validation_transformed = pipeline_model.transform(validation)
test_transformed = pipeline_model.transform(test)

# Cached because each split is read by several model and metric cells.
train_transformed.cache()
validation_transformed.cache()
test_transformed.cache()

# 3. 모델 설계 및 평가

## XgbClassifier

In [0]:
# Baseline classifier. xgboost.spark takes the scikit-learn style parameter
# names: the number of boosting rounds is `n_estimators` and the step size is
# `learning_rate`. `num_round` is not an estimator parameter.
xgb_model = SparkXGBClassifier(
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='logloss',
    missing=0.0,  # feature value that XGBoost treats as missing
    features_col="features",
    label_col="label",
    prediction_col="prediction",
    probability_col="probability",
    seed=0
)

# 2. Train on the transformed training split.
xgb_model_fitted = xgb_model.fit(train_transformed)

In [0]:
# 1. Score the validation split with the baseline model.
raw_predictions = xgb_model_fitted.transform(validation_transformed)

# 2. Probability of the positive class (index 1 of the probability vector).
positive_class_proba = vector_to_array(col("probability"))[1]

# 3. Keep only the columns the ranking metrics need.
predictions = raw_predictions.select(
    "userIndex",
    "movieIndex",
    col("label"),
    "prediction",
    positive_class_proba.alias("y_pred_proba"),
)

# 4. Preview the result.
predictions.show(truncate=False)

### 평가지표

In [0]:
# 1. Per-user top-k rows, ranked by predicted positive probability.
k = 10
window_spec = Window.partitionBy("userIndex").orderBy(desc("y_pred_proba"))

ranked_predictions = predictions.withColumn("rank", row_number().over(window_spec))
topk_df = ranked_predictions.where(col("rank") <= k)

In [0]:
# 2. Precision@K: positives inside each user's top-k, divided by k.
hits_per_user = topk_df.groupBy("userIndex").agg(
    _sum(col("label")).alias("true_positives")
)
precision_df = hits_per_user.withColumn(
    "precision_at_k", col("true_positives") / k
)

mean_precision = precision_df.agg(avg("precision_at_k")).first()[0]
print(f"[Validation Set] precision@{k}: {mean_precision:.4f}")

In [0]:
# 3. Recall@K: positives inside the top-k over all positives of the user.
total_relevant_df = predictions.groupBy("userIndex").agg(
    _sum(col("label")).alias("total_relevant")
)

recall_ratio = col("true_positives") / col("total_relevant")
recall_df = precision_df.join(total_relevant_df, on="userIndex").withColumn(
    "recall_at_k", recall_ratio
)

mean_recall = recall_df.agg(avg("recall_at_k")).first()[0]
print(f"[Validation Set] Recall@{k}: {mean_recall:.4f}")

In [0]:
# 4. nDCG@K
# Per-user label list in rank order (validation). A sort placed before
# groupBy().agg(collect_list(...)) does not guarantee the order inside the
# collected list, so (rank, label) pairs are collected and sorted by rank
# inside the array, and the labels are then extracted.
user_labels_val = (
    topk_df.groupBy("userIndex")
    .agg(sort_array(collect_list(struct("rank", "label"))).alias("ranked"))
    .select("userIndex", col("ranked.label").alias("label_list"))
)

# DCG@K
def dcg_at_k(labels, k):
    """Return the discounted cumulative gain of the first `k` labels.

    `labels` are relevance values in ranked order. The item at 1-based
    position p contributes label / log2(p + 1). An empty or None `labels`
    gives 0.0.
    """
    if not labels:
        return 0.0
    return py_sum(
        gain / math.log2(position + 1)
        for position, gain in enumerate(labels[:k], start=1)
    )

# IDCG@K
def idcg_at_k(labels, k):
    """Return the DCG of `labels` after sorting them in descending order.

    Only the first `k` sorted labels contribute. An empty or None `labels`
    gives 0.0.
    """
    if not labels:
        return 0.0
    ideal_order = sorted(labels, reverse=True)[:k]
    return py_sum(
        gain / math.log2(position + 1)
        for position, gain in enumerate(ideal_order, start=1)
    )

# Wrap the helpers as UDFs. The validation cut-off is the lowercase `k` set in
# the Top-K cell; the uppercase `K` is only assigned in the later test cells.
dcg_udf = udf(lambda x: dcg_at_k(x, k), DoubleType())
idcg_udf = udf(lambda x: idcg_at_k(x, k), DoubleType())

# Per-user nDCG (validation). A user with no positive label in the list has
# idcg == 0; score that as 0.0 instead of dividing by zero, which would
# otherwise produce a null that avg() skips.
ndcg_val_df = (
    user_labels_val.withColumn("dcg", dcg_udf("label_list"))
    .withColumn("idcg", idcg_udf("label_list"))
    .withColumn(
        "ndcg_at_k",
        when(col("idcg") > 0, col("dcg") / col("idcg")).otherwise(lit(0.0)),
    )
)

# Mean nDCG@K (validation)
mean_ndcg_val = ndcg_val_df.select(avg("ndcg_at_k").alias("mean_ndcg_at_k")).first()["mean_ndcg_at_k"]

print(f"[Validation Set] nDCG@{k}: {mean_ndcg_val:.4f}")

In [0]:
# Recap of the three validation metrics.
for metric_label, metric_value in (
    ("Precision", mean_precision),
    ("Recall", mean_recall),
    ("[Validation Set] nDCG", mean_ndcg_val),
):
    print(f"{metric_label}@{k}: {metric_value:.4f}")

### MLFlow + Optuna

In [0]:
mlflow.set_experiment("/MLflow")

def objective(trial):
    """Optuna objective: train one XGBoost model and score precision@k.

    Samples max_depth, eta and num_round from `trial`, fits a
    SparkXGBClassifier on `train_transformed`, ranks `validation_transformed`
    per user by predicted positive probability and averages the per-user
    share of positive labels in the top `k` rows. `k` is the module-level
    cut-off set in the validation Top-K cell.

    Parameters, the metric and the fitted model are logged to an MLflow run.

    Returns:
        The negated mean precision@k, so that a study created with
        direction="minimize" maximizes precision.
    """
    # Sampled hyperparameters. The names are kept because the later cell reads
    # them back from study.best_params.
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "num_round": trial.suggest_int("num_round", 50, 100),
        "eval_metric": "logloss",
    }

    with mlflow.start_run(nested=True):
        mlflow.log_params(param)

        # xgboost.spark uses the scikit-learn style names. The camelCase
        # maxDepth / numRound / evalMetric spelling belongs to the JVM API and
        # is not an estimator parameter here, so the sampled depth and round
        # count would not be applied. `missing` matches the final model so
        # trials are evaluated under the same settings.
        model = SparkXGBClassifier(
            features_col="features",
            label_col="label",
            prediction_col="prediction",
            probability_col="probability",
            seed=0,
            missing=0.0,
            max_depth=param["max_depth"],
            learning_rate=param["eta"],
            n_estimators=param["num_round"],
            eval_metric=param["eval_metric"],
        )
        # Fit on the training split.
        model_fitted = model.fit(train_transformed)

        # Score the validation split.
        pred_test = model_fitted.transform(validation_transformed)

        # Positive-class probability.
        pred_test = pred_test.withColumn("y_pred_prob", vector_to_array(col("probability"))[1])

        # Top-k rows per userIndex.
        window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())
        top_k = pred_test.withColumn("rank", row_number().over(window_spec)) \
                         .filter(col("rank") <= k)

        # Mean label inside each user's top-k, then the mean over users.
        precision_df = top_k.groupBy("userIndex").agg(avg("label").alias("user_precision"))
        result = precision_df.select(avg("user_precision").alias("precision_at_k")).collect()

        # An empty validation set yields a null average; treat it as 0.0.
        if not result or result[0]["precision_at_k"] is None:
            avg_precision_at_k = 0.0
        else:
            avg_precision_at_k = result[0]["precision_at_k"]

        # Record the trial in MLflow.
        mlflow.log_metric("precision_at_k", avg_precision_at_k)
        mlflow.spark.log_model(model_fitted, "model")

        # The study minimizes, so return the negated precision.
        return -avg_precision_at_k

In [0]:
# Run the Optuna study. The file-level `import optuna` is commented out, so
# the module is imported here to keep this cell runnable.
import optuna

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

In [0]:
# Map the best trial onto SparkXGBClassifier keyword arguments. The study
# stores the round count as "num_round" and the step size as "eta"; the
# estimator expects them as `n_estimators` and `learning_rate`.
best_params = study.best_params
spark_params = {
    "max_depth": best_params["max_depth"],
    "learning_rate": best_params["eta"],
    "n_estimators": best_params["num_round"],
    "eval_metric": "logloss",
    "features_col": "features",
    "label_col": "label",
    "prediction_col": "prediction",
    "probability_col": "probability",
    "missing": 0.0,
    "seed": 0
}

In [0]:
# 3. Train the final model with the best parameters and store it in MLflow.
with mlflow.start_run(run_name="SparkXGBoost"):
    mlflow.log_params(best_params)

    final_model = SparkXGBClassifier(**spark_params)
    fitted_model = final_model.fit(train_transformed)
    mlflow.spark.log_model(fitted_model, "final_model")

    # Preview the validation predictions of the final model.
    final_predictions = fitted_model.transform(validation_transformed)
    preview_columns = ["userIndex", "movieIndex", "prediction"]
    final_predictions.select(*preview_columns).show()

# 4. 결과

## 모델 로드

In [0]:
K = 10

# 1. Load the tuned model from its MLflow run (the run id is hard-coded).
model_uri = "runs:/d425b9e818124a1dbcc3fb2a6e2741f6/final_model"
final_model = mlflow.spark.load_model(model_uri)

# 2-3. Score the test split and add the positive-class probability
#      (index 1 of the probability vector).
test_scored = final_model.transform(test_transformed)
pred_test = test_scored.withColumn(
    "y_pred_proba", vector_to_array(col("probability"))[1]
)

## 평가지표

In [0]:
K = 10

# Per-user top-K test rows, ranked by predicted positive probability.
window_spec = Window.partitionBy("userIndex").orderBy(desc("y_pred_proba"))
ranked_test = pred_test.withColumn("rank", row_number().over(window_spec))
top_k = ranked_test.where(col("rank") <= K)

In [0]:
# Per-user Precision@K: mean label over the user's top-K rows.
precision_df = top_k.groupBy("userIndex").agg(
    avg("label").alias("precision_at_k")
)

mean_precision_row = precision_df.select(
    avg("precision_at_k").alias("mean_precision_at_k")
).first()
mean_precision_at_k = mean_precision_row["mean_precision_at_k"]

print(f"[Test Set] Precision@{K}: {mean_precision_at_k:.4f}")

In [0]:
# Total number of positive items per user in the test split.
total_relevant_df = pred_test.groupBy("userIndex").agg(
    _sum("label").alias("total_relevant")
)

# Positive items that made it into each user's top-K.
true_positives_df = top_k.groupBy("userIndex").agg(
    _sum("label").alias("true_positives")
)

# Recall@K = true positives / total relevant.
recall_ratio = col("true_positives") / col("total_relevant")
recall_df = true_positives_df.join(total_relevant_df, on="userIndex").withColumn(
    "recall_at_k", recall_ratio
)

mean_recall_row = recall_df.select(
    avg("recall_at_k").alias("mean_recall_at_k")
).first()
mean_recall_at_k = mean_recall_row["mean_recall_at_k"]

print(f"[Test Set] Recall@{K}: {mean_recall_at_k:.4f}")

In [0]:
# Per-user label list in rank order (test). A sort placed before
# groupBy().agg(collect_list(...)) does not guarantee the order inside the
# collected list, so (rank, label) pairs are collected and sorted by rank
# inside the array, and the labels are then extracted.
user_labels = (
    top_k.groupBy("userIndex")
    .agg(sort_array(collect_list(struct("rank", "label"))).alias("ranked"))
    .select("userIndex", col("ranked.label").alias("label_list"))
)

# DCG@K
def dcg_at_k(labels, k):
    """Return the DCG of the first `k` ranked labels (0.0 when empty/None).

    The label at 1-based rank r is divided by log2(r + 1).
    """
    import math
    if not labels:
        return 0.0
    discounted_gains = [
        rel / math.log2(rank + 1) for rank, rel in enumerate(labels[:k], 1)
    ]
    return py_sum(discounted_gains)

# IDCG@K
def idcg_at_k(labels, k):
    """Return the DCG of the `k` largest labels (0.0 when empty/None)."""
    if not labels:
        return 0.0
    best_first = sorted(labels, reverse=True)
    discounted_gains = [
        rel / math.log2(rank + 1) for rank, rel in enumerate(best_first[:k], 1)
    ]
    return py_sum(discounted_gains)

# Wrap the helpers as UDFs with the test cut-off K.
dcg_udf = udf(lambda x: dcg_at_k(x, K), DoubleType())
idcg_udf = udf(lambda x: idcg_at_k(x, K), DoubleType())

# Per-user nDCG. A user with no positive label in the list has idcg == 0;
# score that as 0.0 instead of dividing by zero, which would otherwise
# produce a null that avg() skips.
ndcg_df = (
    user_labels.withColumn("dcg", dcg_udf("label_list"))
    .withColumn("idcg", idcg_udf("label_list"))
    .withColumn(
        "ndcg_at_k",
        when(col("idcg") > 0, col("dcg") / col("idcg")).otherwise(lit(0.0)),
    )
)

# Mean nDCG@K
mean_ndcg = ndcg_df.select(avg("ndcg_at_k").alias("mean_ndcg_at_k")).first()["mean_ndcg_at_k"]

print(f"[Test Set] nDCG@{K}: {mean_ndcg:.4f}")

In [0]:
mlflow.set_experiment("/MLflow")

# Store the three test metrics in a dedicated MLflow run.
# NOTE(review): some MLflow tracking backends restrict the characters allowed
# in metric names - confirm that "@" is accepted by the one in use.
with mlflow.start_run(run_name="SparkXgboostMetric"):
    for metric_key, metric_value in (
        ("Precision@K", mean_precision_at_k),
        ("Recall@K", mean_recall_at_k),
        ("nDCG@K", mean_ndcg),
    ):
        mlflow.log_metric(metric_key, metric_value)

print(f"[Test Set] Precision@{K}: {mean_precision_at_k:.4f}")
print(f"[Test Set] Recall@{K}: {mean_recall_at_k:.4f}")
print(f"[Test Set] nDCG@{K}: {mean_ndcg:.4f}")

In [0]:
metrics = ['Precision@K', 'Recall@K', 'nDCG@K']
scores = [mean_precision_at_k, mean_recall_at_k, mean_ndcg]
bar_colors = ['skyblue', 'lightgreen', 'salmon']

# Bar chart of the three test metrics.
plt.figure(figsize=(6, 4))
bars = plt.bar(metrics, scores, color=bar_colors)

# Print each score just above its bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.0
    plt.text(label_x, height + 0.01, f"{height:.4f}", ha='center', va='bottom')

plt.title(f'Evaluation Metrics @K={K}')
plt.ylim(0, 1.1)
plt.ylabel("Score")
plt.tight_layout()
plt.show()

## 사용자별 추천 영화(시청 영화 제외)

In [0]:
# 1. Movies each user has already interacted with (train + validation).
seen_pairs = train_transformed.select("userIndex", "movieIndex").union(
    validation_transformed.select("userIndex", "movieIndex")
)
seen_movies_df = (
    seen_pairs.distinct()
    .groupBy("userIndex")
    .agg(collect_set("movieIndex").alias("seen_movies"))
)

# 2. Score the test split with the final model.
# test_pred = xgb_model_fitted.transform(test_transformed)
test_pred = final_model.transform(test_transformed)

# 3. Positive-class probability (second entry of the probability vector).
test_pred = test_pred.withColumn("y_pred_proba", vector_to_array(col("probability"))[1])

# 4. Attach each user's seen-movie set; users without one keep a null.
pred_with_seen = test_pred.join(seen_movies_df, on="userIndex", how="left_outer")

# 5. Keep rows whose movie is not in the seen set (or that have no seen set).
has_no_seen_set = col("seen_movies").isNull()
movie_not_seen = ~array_contains(col("seen_movies"), col("movieIndex"))
pred_filtered = pred_with_seen.filter(has_no_seen_set | movie_not_seen)

# 6. Top-K rows per user by predicted probability.
K = 10
window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_proba").desc())

ranked_candidates = pred_filtered.withColumn("rank", row_number().over(window_spec))
top_k = ranked_candidates.filter(col("rank") <= K)

# movieIndex -> movieId lookup, taken from the transformed training split.
movie_index_mapping = train_transformed.select(
    "movieIndex",
    col("movieId").alias("movieId_map")
).dropDuplicates()

# movieId -> title / genres lookup, taken from the training split.
movies_meta = train.select(
    col("movieId").alias("movieId_meta"),
    col("title").alias("title_meta"),
    col("genres").alias("genres_meta")
).dropDuplicates()

# 7. Attach the original movieId to each top-K row.
top_k_with_id = top_k.join(movie_index_mapping, on="movieIndex", how="left")

# 8. Attach the title by movieId.
top_k_with_title = top_k_with_id.join(
    movies_meta,
    top_k_with_id["movieId_map"] == movies_meta["movieId_meta"],
    how="left"
)

# 9. Pack each recommendation into a struct column.
recommendation_struct = struct(
    col("movieId_map").alias("movieId"),
    col("title_meta").alias("title"),
    col("y_pred_proba").alias("pred_rating")
)
top_k_with_title = top_k_with_title.withColumn("recommendation", recommendation_struct)

# 10. One row per user holding the recommendations in rank order.
#     collect_list() alone does not preserve the window ranking, and the
#     per-user explanation cell reads element 0 as the first recommendation,
#     so (rank, recommendation) pairs are collected, sorted by rank inside the
#     array, and only the recommendation structs are kept.
recommendations = (
    top_k_with_title.groupBy("userIndex")
    .agg(sort_array(collect_list(struct("rank", "recommendation"))).alias("ranked"))
    .select("userIndex", col("ranked.recommendation").alias("recommendations"))
    .orderBy("userIndex")
)

# 11. Show the result.
display(recommendations)

In [0]:
# NOTE(review): tempfile.mkdtemp() returns a driver-local path. On a cluster,
# confirm Spark writes and mlflow.log_artifacts() resolve it to the same place.
tmp_dir = tempfile.mkdtemp()

# CSV output. The CSV writer cannot store an array-of-struct column, so the
# recommendation list is serialised to a JSON string first.
csv_path = f"{tmp_dir}/csv_result"
recommendations_csv = recommendations.withColumn(
    "recommendations", to_json(col("recommendations"))
)
recommendations_csv.coalesce(1).write.mode("overwrite").option("header", "true").csv(csv_path)

# Parquet output keeps the nested structure as is.
parquet_path = f"{tmp_dir}/parquet_result"
recommendations.write.mode("overwrite").parquet(parquet_path)

mlflow.set_experiment("/MLflow")

with mlflow.start_run(run_name="SparkXgboostresult"):
    mlflow.log_artifacts(csv_path, artifact_path="result_csv")

    # Store the Parquet files under the artifact path result_parquet.
    mlflow.log_artifacts(parquet_path, artifact_path="result_parquet")

## 시각화

배열을 flatten

In [0]:
# 1. One row per (user, recommendation).
recommendations_exploded = recommendations.select(
    "userIndex",
    explode(col("recommendations")).alias("rec")
)

# 2. Promote the struct fields to top-level columns.
recommendations_flat = recommendations_exploded.select(
    "userIndex",
    col("rec.movieId").alias("movieId"),
    col("rec.title").alias("title"),
    col("rec.pred_rating").alias("pred_rating")
)

# Genres lookup, one row per movie. `title` is left out because
# recommendations_flat already has one; selecting it again would give the
# joined frame two columns named "title".
movie_genres = test.select("movieId", "genres").dropDuplicates(["movieId"])

# 3. Attach genres by movieId.
rec_with_genres = recommendations_flat.join(
    movie_genres,
    on="movieId",
    how="left"
)

# 4. Split genres on '|' and emit one row per genre.
recs_vis = rec_with_genres.withColumn("genre", explode(split(col("genres"), "\\|")))
display(recs_vis)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

# 5. 추가사항

## 5-1. 추가 User 처리, 어떤 영화 추천

1. 신규 유저가 본 영화와 기존 유저들의 시청 목록 간 Jaccard 유사도를 계산하여 가장 유사한 기존 유저를 식별함.
2. 신규 유저를 가장 유사한 기존 유저의 userIndex로 대체한 뒤, 신규 유저가 보지 않은 영화들에 대해 추천 모델로 예측을 수행함.
3. 예측 결과 중 상위 K개의 영화를 추출하여 신규 유저에게 추천함.



In [0]:
# Step 1. movieIds the new user has already watched.
new_user_seen_movies = [1, 10, 50, 300]
new_user_set = {*new_user_seen_movies}

# Step 2. Set of watched movieIds for every existing user in train.
watched_set = collect_set("movieId").alias("movie_set")
user_movie_sets = train.groupBy("userId").agg(watched_set)

# Step 3. Jaccard similarity
def jaccard_similarity(set1, set2):
    """Return |A intersect B| / |A union B| for two iterables, as a float.

    Both arguments are converted to sets first. When both are empty the
    union is empty and 0.0 is returned.
    """
    left, right = set(set1), set(set2)
    combined = left | right
    if not combined:
        return 0.0
    return float(len(left & right)) / len(combined)

# Similarity of every existing user's movie set to the new user's set.
# NOTE(review): the new user's ids are Python ints - confirm the movieId
# column has a matching type, otherwise no overlap will be found.
jaccard_udf = udf(lambda x: jaccard_similarity(new_user_set, x), DoubleType())

scored_users = user_movie_sets.withColumn("jaccard_sim", jaccard_udf(col("movie_set")))
similar_users = scored_users.orderBy(desc("jaccard_sim")).limit(1)

# Step 4. userId of the most similar existing user, and its userIndex.
top_user_id = similar_users.select("userId").first()["userId"]
top_user_frame = spark.createDataFrame([(top_user_id,)], ["userId"])
top_user_index = (
    user_indexer_model.transform(top_user_frame)
    .select("userIndex")
    .first()["userIndex"]
)

# Step 5. Candidate movies: every movieId in train or test that the new user
# has not watched.
all_movies = train.select("movieId").union(test.select("movieId")).dropDuplicates()
unseen_movies = all_movies.filter(~col("movieId").isin(new_user_seen_movies))

# Attach title and genres from movies_meta (built from train).
unseen_movies = unseen_movies.join(
    movies_meta,
    unseen_movies.movieId == movies_meta.movieId_meta,
    how="left"
).select(
    unseen_movies.movieId,
    movies_meta.title_meta.alias("title"),
    movies_meta.genres_meta.alias("genres")
)

# One non-null year per movieId, taken from train.
movie_year_df = train.select("movieId", "year").dropna().dropDuplicates(["movieId"])

# Attach the year to the candidate movies.
unseen_movies = unseen_movies.join(movie_year_df, on="movieId", how="left")

In [0]:
# Step 6. Score every candidate as if rated by the most similar existing user.
unseen_movies = unseen_movies.withColumn("userId", lit(top_user_id))

# Candidates missing from train carry null genres / year after the left joins.
# VectorAssembler (default handleInvalid="error") rejects null inputs and
# RegexTokenizer cannot tokenize a null string, so those rows are dropped
# before the pipeline runs instead of failing the whole job.
scorable_movies = unseen_movies.dropna(subset=["genres", "year"])

# Step 7. Build the feature vector with the fitted pipeline.
unseen_with_features = pipeline_model.transform(scorable_movies)

# Step 8. Overwrite userIndex on every row with the similar user's index.
unseen_with_features = unseen_with_features.withColumn("userIndex", lit(top_user_index))

# Step 9. Keep a single row per movie.
unseen_unique = unseen_with_features.dropDuplicates(["movieId"])


# Step 10. Predict with the final model.
predicted_df = final_model.transform(unseen_unique)

# Step 11. Positive-class (1) probability from the probability vector.
predicted_df = predicted_df.withColumn("prediction_proba", vector_to_array(col("probability"))[1])

# Step 12. Keep the K highest-scoring movies.
K = 10
top_k_df = predicted_df.orderBy(col("prediction_proba").desc()).limit(K)

# Step 13. Show the result.
top_k_df.select("movieId", "title", "prediction_proba").show()

## 5-2. feature importance

### 전체 유저

In [0]:
# 1. Feature names: the first pipeline stage whose uid starts with
#    "CountVectorizer" supplies the genre vocabulary; the remaining names
#    follow the assembler's input order.
vectorizer_model = next(
    (stage for stage in pipeline_model.stages if stage.uid.startswith("CountVectorizer")),
    None,
)

genre_vocab = vectorizer_model.vocabulary
genre_feature_names = [f"genre_{genre}" for genre in genre_vocab]
feature_names = [*genre_feature_names, "userIndex", "movieIndex", "year"]

# 2. Dense feature matrix from up to 500 test rows.
sampled_features = test_transformed.select("features").limit(500).collect()
dense_rows = [row["features"].toArray() for row in sampled_features]
X = np.vstack(dense_rows)  # one row per sampled feature vector

# 3. Take the booster from the last stage of the loaded model and compute SHAP.
xgb_stage = final_model.stages[-1]
booster = xgb_stage.get_booster()

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X)

# Mean absolute SHAP value per feature, largest first.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
summary_df = pd.DataFrame({
    'Feature': feature_names,
    'Mean_Abs_SHAP': mean_abs_shap
}).sort_values(by='Mean_Abs_SHAP', ascending=False)

display(summary_df)  # a bar chart view works well in Databricks

Databricks visualization. Run in Databricks to view.

### 특정 유저에게 왜 해당 영화 추천했는지

In [0]:
# Explain one recommendation for a single user: print the SHAP contribution of
# every feature to the model output for that (user, movie) pair.

# 1. Find the pipeline stage whose uid starts with "CountVectorizer" and read
#    its vocabulary.
vectorizer_model = None
for stage in pipeline_model.stages:
    if stage.uid.startswith("CountVectorizer"):
        vectorizer_model = stage
        break

genre_vocab = vectorizer_model.vocabulary  

# 2. One feature name per vocabulary entry, in vocabulary order.
genre_feature_names = ["genre_" + genre for genre in genre_vocab]

# 4. Full feature-name list; must follow the assembler input order
#    (genres_vec + userIndex + movieIndex + year).
feature_names = genre_feature_names + ["userIndex", "movieIndex", "year"]


# --- SHAP computation and printed example ---
target_user = 0
user_recs = recommendations.filter(col("userIndex") == target_user).select("recommendations").collect()

if not user_recs:
    print(f"User {target_user} has no recommendations.")
else:
    rec_list = user_recs[0]["recommendations"]
    rec = rec_list[0]  # explain only the first entry of the list

    # Map the recommended movieId back to its movieIndex.
    movie_id = rec['movieId']
    movie_idx_row = movie_index_mapping.filter(col("movieId_map") == movie_id).select("movieIndex").collect()
    if not movie_idx_row:
        print(f"movieIndex를 찾을 수 없음: movieId={movie_id}")
    else:
        movie_index_val = movie_idx_row[0]["movieIndex"]

        # Feature vector of this (user, movie) pair in the transformed test split.
        feature_row = test_transformed.filter(
            (col("userIndex") == target_user) & (col("movieIndex") == movie_index_val)
        ).select("features").collect()

        if not feature_row:
            print(f"feature vector를 찾을 수 없음: userIndex={target_user}, movieIndex={movie_index_val}")
        else:
            # Single dense row, shaped (1, n_features) for the explainer.
            feature_vector = feature_row[0]["features"].toArray().reshape(1, -1)

            # Build the SHAP explainer from the booster in the last model stage.
            xgb_stage = final_model.stages[-1]
            booster = xgb_stage.get_booster()
            explainer = shap.TreeExplainer(booster)

            shap_values = explainer.shap_values(feature_vector)

            print(f"Movie: {rec['title']} (movieId: {movie_id})")
            # Look up the movie's genres in the train-derived metadata.
            genres_row = movies_meta.filter(movies_meta.movieId_meta == movie_id).select("genres_meta").collect()
            if genres_row:
                genres = genres_row[0]["genres_meta"]
                print(f"Genres: {genres}")
            else:
                print("Genres 정보가 없습니다.")


            # One line per feature with its SHAP value.
            for name, val in zip(feature_names, shap_values[0]):
                print(f"  {name}: {val:.4f}")

In [0]:
# Uses shap_values, feature_vector and feature_names from the previous cell.
shap_vals = shap_values[0]  # SHAP values of the single explained sample
feature_vals = feature_vector[0]  # the sample's actual feature values
names = feature_names

# Indices of the top-N features by absolute SHAP value, largest first.
top_n = 10
indices = np.argsort(np.abs(shap_vals))[::-1][:top_n]

top_names = []
top_shap_vals = []
top_feature_vals = []
for feature_idx in indices:
    top_names.append(names[feature_idx])
    top_shap_vals.append(shap_vals[feature_idx])
    top_feature_vals.append(feature_vals[feature_idx])

# Horizontal bars: one colour for negative impact, another for the rest.
bar_colors = ["#ff6f69" if v < 0 else "#88d8b0" for v in top_shap_vals]
plt.figure(figsize=(10, 6))
bars = plt.barh(top_names, top_shap_vals, color=bar_colors)
plt.xlabel("SHAP Value (Feature Impact)")
plt.title(f"Top {top_n} SHAP Feature Importances")

# Write the actual feature value at the end of each bar.
for bar, val in zip(bars, top_feature_vals):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    plt.text(bar.get_width(), bar_mid_y, f" = {val:.2f}", va='center')

plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 한계

movieIndex 피처가 예측에 가장 큰 영향을 미침(즉, 인기 영화를 추천하는 경향이 있음) -> 상대적으로 genre 피처의 영향은 크지 않음