# 0. 환경

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col, rand, row_number, floor, when, count, collect_list, struct, concat, lit, lpad, avg, desc
from pyspark.sql.window import Window
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, VectorAssembler, MinMaxScaler, StringIndexer
from pyspark.ml import Pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBClassifier
import optuna
import mlflow
import mlflow.sklearn
import joblib
import os
from pyspark.sql.functions import col, regexp_extract, regexp_replace, count

In [0]:
# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-active session when there is one).
spark = (
    SparkSession.builder
    .appName("IMDB_small")
    .getOrCreate()
)

# 1. 데이터로드

## Bronze

### MovieLens 로드

In [0]:
# Pre-split train/validation/test tables stored in the `final` schema.
catalog, schema = "1dt_team8_databricks", "`final`"
path = ".".join([catalog, schema])

try:
    train = spark.read.table(path + ".train_df")
    validation = spark.read.table(path + ".validation_df")
    test = spark.read.table(path + ".test_df")
except Exception as load_error:
    # Best-effort load: report the failure and let the notebook continue.
    print(f"Error loading data from Unity Catalog Volume: {load_error}")
# display(train)
# display(validation)
# display(test)

In [0]:
from pyspark.sql.functions import format_string

# Bronze: raw MovieLens tables from the team catalog.
catalog = "1dt_team8_databricks"
schema = "`movielens-small`"
# schema = "`movielens-32m`"
bronze_path = f"{catalog}.{schema}"

try:
    # Only the columns needed downstream are kept from ratings.
    ratings = spark.read.table(f"{bronze_path}.ratings").select("userId", "movieId", "rating")
    movies = spark.read.table(f"{bronze_path}.movies")
    tags = spark.read.table(f"{bronze_path}.tags")
    links = spark.read.table(f"{bronze_path}.links")

    # Build the IMDb key ("tt" + id zero-padded to AT LEAST 7 digits).
    # lpad(..., 7, "0") also truncates anything longer than 7 characters, which
    # would corrupt 8-digit IMDb ids; "%07d" pads without truncating.
    # Rows without a numeric imdbId keep a null key, so they never match.
    imdb_numeric = col("imdbId").cast("long")
    links = links.withColumn(
        "imdb_id",
        when(imdb_numeric.isNotNull(), format_string("tt%07d", imdb_numeric)),
    )
    print("Data loaded successfully from Unity Catalog Volume.")
except Exception as e:
    print(f"Error loading data from Unity Catalog Volume: {e}")
    # The handler previously referenced an undefined `volume_path`, which raised
    # a NameError and hid the real error; report the schema actually read.
    print(f"Please ensure the tables (movies, ratings, links, tags) exist in {bronze_path}")

In [0]:
# Sanity check: number of rows in the movies table loaded above.
movies.count()

In [0]:
# Extract the release year from a trailing "(YYYY)" in the title and cast it
# to int.
release_year = regexp_extract("title", r"\((\d{4})\)\s*$", 1).cast("int")
movies_df = movies.withColumn("year", release_year)

### IMDB 로드

In [0]:
# Bronze: IMDb aggregate ratings (title_ratings) from the team catalog.
catalog = "1dt_team8_databricks"
schema = "`imdb`"
bronze_path = f"{catalog}.{schema}"

try:
    imdb_ratings = spark.read.table(f"{bronze_path}.title_ratings")
    print("Data loaded successfully from Unity Catalog Volume.")
except Exception as e:
    print(f"Error loading data from Unity Catalog Volume: {e}")
    # The handler previously referenced an undefined `volume_path`, which raised
    # a NameError and hid the real error; report the schema actually read.
    print(f"Please ensure the table (title_ratings) exists in {bronze_path}")

In [0]:
# 1. Count movies per release year, ordered by year.
yearly_movie_count_filter_df = (
    movies_df.groupBy("year")
    .agg(count("*").alias("movie_count"))
    .orderBy("year")
)

# 2-3. Bring the small aggregate to pandas and drop the rows without a year.
yearly_movie_count_filter_pd = (
    yearly_movie_count_filter_df.toPandas().dropna(subset=["year"])
)

# 4. Bar chart of the yearly counts.
plt.figure(figsize=(15, 6))
plt.bar(
    yearly_movie_count_filter_pd["year"],
    yearly_movie_count_filter_pd["movie_count"],
    color="skyblue",
)

plt.title("Number of Movies per Year")
plt.xlabel("Year")
plt.ylabel("Movie Count")
plt.xticks(rotation=90)  # rotate the year labels so they stay readable
plt.tight_layout()
plt.show()

## Silver

In [0]:
# Silver: drop incomplete rows, then keep only values inside the valid ranges.
ratings = ratings.dropna().where(col("rating").between(0, 5))

imdb_ratings = imdb_ratings.dropna().where(
    (col("averageRating") >= 0) & (col("numVotes") > 0)
)

## Gold

In [0]:
# Gold: one row per (user, movie) rating, enriched with IMDb aggregates.
#
# The previous version used left-outer joins and then filtered out rows whose
# right-side columns were null, which is exactly an inner join. It also dropped
# a list of columns (one of which, "r_movieId", never existed) right before a
# select() that already restricts the output columns. Both are simplified here;
# the resulting rows and columns are unchanged.
movies_with_ratings = (
    movies.join(ratings, on="movieId", how="inner")
    .filter(col("rating").isNotNull())
)

merged = movies_with_ratings.join(links, on="movieId", how="inner")

df_merged = merged.join(
    imdb_ratings, merged.imdb_id == imdb_ratings.tconst, how="inner"
)

df_cleaned = (
    df_merged
    .filter(col("averageRating").isNotNull() & col("numVotes").isNotNull())
    .select("movieId", "userId", "title", "averageRating", "numVotes", "genres", "rating")
    # Binary target for the classifier: 1 when the user rated the movie >= 4.
    .withColumn("label", (col("rating") >= 4).cast("int"))
)

In [0]:
# Sanity check: number of rating rows that survived the Gold joins and filters.
(df_cleaned.count())

# 2. 데이터분석

In [0]:
# df = df_cleaned.toPandas()

In [0]:
# if 'df' in locals() and df is not None:
#     plt.figure(figsize=(10,8))
#     sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
#     #plt.title('변수 간 상관계수 히트맵')
#     plt.title('Correlation Heatmap of Variables')
#     plt.show()
# else:
#     print("오류: df (Pandas DataFrame)가 생성되지 않았습니다. 2단계를 다시 확인해주세요.")

# 3. 데이터분리

## 파이프라인

In [0]:
# Feature pipeline: genres -> token counts, numVotes -> min-max scaled,
# userId / movieId -> numeric indices, all assembled into one "features" vector.

# Split the "genres" string on the literal "|" separator.
tokenizer = RegexTokenizer(inputCol="genres", outputCol="genres_tokens", pattern="\\|")
vectorizer = CountVectorizer(inputCol="genres_tokens", outputCol="genres_vec")
# MinMaxScaler works on vector columns, so numVotes is wrapped in a 1-element vector first.
assembler_numvotes = VectorAssembler(inputCols=["numVotes"], outputCol="numVotes_vec")
scaler = MinMaxScaler(inputCol="numVotes_vec", outputCol="numVotes_scaled")
user_indexer = StringIndexer(inputCol="userId", outputCol="userIndex")
# Fitted separately (in addition to the fit inside the pipeline) so a single
# userId can be mapped to its userIndex later in the notebook.
user_indexer_model = user_indexer.fit(df_cleaned)

movie_indexer = StringIndexer(inputCol="movieId", outputCol="movieIndex")
# NOTE(review): userIndex and movieIndex enter the feature vector as plain
# numeric columns, so the model sees them as ordered quantities - confirm this
# is intended.
assembler_all = VectorAssembler(
    inputCols=["genres_vec", "averageRating", "numVotes_scaled", "userIndex", "movieIndex"],
    outputCol="features"
)

pipeline = Pipeline(stages=[
    tokenizer, vectorizer, assembler_numvotes, scaler,
    user_indexer, movie_indexer, assembler_all
])

# NOTE(review): the pipeline is fitted on the full dataset before the
# train/valid/test split below, so the scaler and vocabulary also see
# validation and test rows.
pipeline_model = pipeline.fit(df_cleaned)
df_transformed = pipeline_model.transform(df_cleaned)

## 사용자 기준 데이터분할

In [0]:
# Per-user 60/20/20 split: shuffle each user's rows, number them, and assign
# the first 60% to train, the next 20% to valid and the remainder to test.
# NOTE(review): the row order fed to rand(seed=0) is presumably not guaranteed
# to be stable across Spark actions, so the split may differ each time
# split_df is re-evaluated - confirm, and consider materialising split_df
# (e.g. writing it to a table).
window_spec = Window.partitionBy("userId").orderBy(rand(seed=0))
ranked = df_transformed.withColumn("row_num", row_number().over(window_spec))
# Number of ratings per user, needed to turn row numbers into fractions.
user_counts = ranked.groupBy("userId").agg(count("*").alias("total"))

split_df = ranked.join(user_counts, on="userId").withColumn(
    "set",
    when(col("row_num") <= floor(col("total") * 0.6), "train")
    .when(col("row_num") <= floor(col("total") * 0.8), "valid")
    .otherwise("test")
)

In [0]:
def df_to_xy(df, label_col="rating"):
    """Collect a DataFrame into a dense feature matrix and a label array.

    Args:
        df: DataFrame with a ``features`` column (values exposing
            ``toArray()``) and the column named by ``label_col``; it is
            collected to the driver with ``toPandas()``.
        label_col: Name of the target column to return as ``y``.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks one dense feature row per input row,
        ``y`` holds the matching label values.
    """
    collected = df.select("features", label_col).toPandas()
    dense_rows = [vector.toArray() for vector in collected["features"]]
    labels = collected[label_col].values
    return np.vstack(dense_rows), labels

# Materialise the three subsets from the "set" tag assigned during the split.
train_df = split_df.where(split_df["set"] == "train")
valid_df = split_df.where(split_df["set"] == "valid")
test_df = split_df.where(split_df["set"] == "test")

In [0]:
# Dense matrices for the regression task (target = raw rating).
X_train, y_train = df_to_xy(train_df, label_col="rating")
X_valid, y_valid = df_to_xy(valid_df, label_col="rating")
X_test, y_test = df_to_xy(test_df, label_col="rating")

# 4. 모델 설계 및 평가

## 평가지표 정의

In [0]:
# Precision@K: per user, the share of the top-K predicted items whose true
# rating reaches the threshold, averaged over users.
def precision_at_k(df: pd.DataFrame, k: int = 5, threshold: float = 4.0) -> float:
    """Return the mean per-user Precision@K.

    Args:
        df: One row per (user, item) with columns ``userIndex`` (user key),
            ``y_pred_prob`` (predicted score, higher = better) and ``y_true``
            (actual rating). The frame is not modified.
        k: Number of top-scored items to keep per user; must be positive.
            Users with fewer than ``k`` rows are scored on the rows they have.
        threshold: Minimum true rating for a recommendation to count as a hit.

    Returns:
        Average over users of (hits in top-k / items in top-k); NaN when
        ``df`` has no rows.

    Raises:
        ValueError: If ``k`` is not positive (``head(0)`` would silently give
            NaN and a negative ``k`` would select "all but the last k" rows).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Highest-scored rows first within each user, then keep k rows per user.
    top_k = (
        df.sort_values(["userIndex", "y_pred_prob"], ascending=[True, False])
        .groupby("userIndex")
        .head(k)
    )

    # assign() builds a new frame instead of writing into the groupby slice,
    # which avoids pandas' SettingWithCopy hazard.
    scored = top_k.assign(hit=(top_k["y_true"] >= threshold).astype(int))

    # Precision per user, then the unweighted mean across users.
    user_precision = scored.groupby("userIndex")["hit"].mean()
    return float(user_precision.mean())

## XgboostRegressor

In [0]:
# model = XGBRegressor(
#     objective="reg:squarederror",
#     max_depth=6,
#     n_estimators=100,
#     learning_rate=0.1,
#     early_stopping_rounds=50,
#     seed=0,
#     n_jobs=-1
# )

# model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [0]:
# test_pdf = test_df.select("userIndex", "movieIndex", "rating").toPandas()
# test_pdf["y_pred_prob"] = model.predict(X_test)
# test_pdf.rename(columns={"rating": "y_true"}, inplace=True)
# test_pred = spark.createDataFrame(test_pdf)

### 평가지표

In [0]:
# p_at_5 = precision_at_k(test_pdf, k=10)
# print(f"Precision@5: {p_at_5:.4f}")

### 사용자별 추천 영화

In [0]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import row_number, col

# K = 10
# # 1. 사용자별 예측 상위 K개 영화 선택
# window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())

# ranked = test_pred.withColumn("rank", row_number().over(window_spec)) \
#                   .filter(col("rank") <= K)

# # 2. 추천 영화 정보 묶기 (movieId와 예측 평점)
# ranked = ranked.withColumn("recommendation", struct(col("movieIndex").alias("movieId"), col("y_pred_prob").alias("pred_rating")))

# # 3. 사용자별로 리스트로 그룹화
# recommendations = ranked.groupBy("userIndex") \
#     .agg(collect_list("recommendation").alias("recommendations")) \
#     .orderBy("userIndex")

# display(recommendations)

### 성능 개선(optuna)

In [0]:
# # 레이블 생성 (평점 4 이상은 1, 아니면 0)
# # Optuna 목적 함수 정의
# def objective(trial):
#     # 하이퍼파라미터 설정
#     param = {
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "n_estimators": trial.suggest_int("n_estimators", 50, 300),
#         "objective": "reg:squarederror",
#         "use_label_encoder": False,
#         "eval_metric": "logloss",
#         "early_stopping_rounds": 20
#     }

#     # XGBoost 모델 학습
#     model = XGBRegressor(**param)
#     model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

#     y_pred = model.predict(X_test)
#     test_pdf = test_df.select("userIndex", "movieIndex", "rating").toPandas()
#     test_pdf["y_pred_prob"] = y_pred
#     test_pdf.rename(columns={"rating": "y_true"}, inplace=True)

#     pred = spark.createDataFrame(test_pdf)

#     window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())
#     top_k = pred.withColumn("rank", row_number().over(window_spec)).filter(col("rank") <= K)

#     # Precision@K 계산
#     top_k = top_k.withColumn("y_true", (col("y_true") >= 4).cast("int"))
#     precision_df = top_k.groupBy("userIndex").agg({"y_true": "avg"})
#     avg_precision_at_k = precision_df.selectExpr("avg(`avg(y_true)`) as precision_at_k").collect()[0]["precision_at_k"]

#     return avg_precision_at_k  # Optuna는 최소화를 수행하므로 음수 반환

In [0]:
# (Disabled) Optuna study for the regressor objective.
# A stray "Op" token in front of the first comment used to raise a NameError
# when this cell was run; it has been removed.
# study = optuna.create_study(direction="maximize")  # maximise Precision@K
# study.optimize(objective, n_trials=30)  # 30 tuning trials

# print("Best params:", study.best_params)
# print("Best Precision@K:", study.best_value)

In [0]:
# best_params = study.best_params

# best_model = XGBRegressor(**best_params)
# best_model.fit(X_train, y_train)

# best_pred = best_model.predict(X_test)

# joblib.dump(best_model, "best_xgb_reg_model.pkl")

# # loaded_model = joblib.load("best_xgb_model.pkl")
# # y_pred = loaded_model.predict(X_test)

### Optuna + mlflow

In [0]:
# Optuna objective for the XGBoost regressor; every trial is logged to MLflow.
def objective(trial):
    """Train one XGBRegressor trial and score it with Precision@K.

    Reads the module-level ``X_train``/``y_train``, ``X_valid``/``y_valid``,
    ``test_df`` and ``K``, and logs parameters, metric and model to a nested
    MLflow run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The NEGATED mean per-user Precision@K, so this objective must be used
        with a minimising study (Optuna's default direction).
    """
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "objective": "reg:squarederror",
        # Early stopping monitors this metric. "logloss" is a classification
        # metric and is not meaningful for rating targets, so use RMSE.
        # The classifier-only "use_label_encoder" option was dropped as well.
        "eval_metric": "rmse",
        "early_stopping_rounds": 20,
    }

    with mlflow.start_run(nested=True):
        mlflow.log_params(param)

        model = XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

        # Collect features and identifiers in ONE Spark action. Predicting on
        # the global X_test and attaching the result to a separately collected
        # frame relies on both actions returning rows in the same order, which
        # Spark does not guarantee.
        eval_pdf = test_df.select("features", "userIndex", "movieIndex", "rating").toPandas()
        X_eval = np.vstack([vec.toArray() for vec in eval_pdf["features"]])
        test_pdf = eval_pdf.drop(columns="features").rename(columns={"rating": "y_true"})
        test_pdf["y_pred_prob"] = model.predict(X_eval)

        # Reuse the pandas helper instead of a Spark round trip per trial.
        # NOTE(review): hyperparameters are selected on the test split, so the
        # reported score is optimistic - consider scoring on valid_df instead.
        avg_precision_at_k = precision_at_k(test_pdf, k=K, threshold=4.0)

        mlflow.log_metric("precision_at_k", avg_precision_at_k)
        mlflow.sklearn.log_model(model, "model")

        return -avg_precision_at_k

In [0]:
# best_model = XGBRegressor(**study.best_params)
# best_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

# with mlflow.start_run(run_name="final_best_model"):
#     mlflow.log_params(study.best_params)
#     mlflow.sklearn.log_model(best_model, "final_model")

## XgbClassifier

### 데이터 준비

In [0]:
# Dense matrices for the classification task (target = binary "label").
# These overwrite the regression matrices of the same names.
X_train, y_train = df_to_xy(train_df, label_col="label")
X_valid, y_valid = df_to_xy(valid_df, label_col="label")
X_test, y_test = df_to_xy(test_df, label_col="label")

### 모델 설계 및 실험

In [0]:
# Baseline classifier with fixed hyperparameters; the validation split drives
# early stopping on log-loss.
model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    early_stopping_rounds=50,
    use_label_encoder=False,
    n_jobs=-1,
    seed=0,
)

model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

In [0]:
# Collect features and identifiers in ONE Spark action so each prediction is
# attached to the row it was computed from. Predicting on the separately
# collected X_test relies on two Spark actions returning rows in the same
# order, which Spark does not guarantee.
test_pdf = test_df.select("features", "userIndex", "movieIndex", "rating").toPandas()
X_test_aligned = np.vstack([vec.toArray() for vec in test_pdf["features"]])
test_pdf = test_pdf.drop(columns="features").rename(columns={"rating": "y_true"})
# Probability of the positive class (rating >= 4).
test_pdf["y_pred_prob"] = model.predict_proba(X_test_aligned)[:, 1]
test_pred = spark.createDataFrame(test_pdf)

### 평가지표

In [0]:
# The metric is computed with k=10; the message used to say "Precision@5".
eval_k = 10
p_at_k = precision_at_k(test_pdf, k=eval_k)
p_at_5 = p_at_k  # legacy name kept for any later cell that still reads it
print(f"Precision@{eval_k}: {p_at_k:.4f}")

### 사용자별 추천 영화

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

K = 10

# 1-2. Keep each user's K highest-scored test rows and pack every row into a
#      (movieId, pred_rating) struct.
# NOTE(review): the struct field named "movieId" is filled from movieIndex
# (the StringIndexer output), not the original MovieLens movieId.
window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())

ranked = (
    test_pred.withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") <= K)
    .withColumn(
        "recommendation",
        struct(
            col("movieIndex").alias("movieId"),
            col("y_pred_prob").alias("pred_rating"),
        ),
    )
)

# 3. Collect the structs into one list per user.
recommendations = (
    ranked.groupBy("userIndex")
    .agg(collect_list("recommendation").alias("recommendations"))
    .orderBy("userIndex")
)

display(recommendations)

### Optuna

In [0]:
# Optuna objective for the XGBoost classifier (label = 1 when rating >= 4).
def objective(trial):
    """Train one XGBClassifier trial and return its Precision@K.

    Reads the module-level ``X_train``/``y_train``, ``X_valid``/``y_valid``,
    ``test_df`` and ``K``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean per-user Precision@K as a positive value; use it with a study
        created with ``direction="maximize"``.
    """
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "early_stopping_rounds": 20,
    }

    model = XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    # Collect features and identifiers in ONE Spark action. Predicting on the
    # global X_test and attaching the result to a separately collected frame
    # relies on both actions returning rows in the same order, which Spark does
    # not guarantee.
    eval_pdf = test_df.select("features", "userIndex", "movieIndex", "rating").toPandas()
    X_eval = np.vstack([vec.toArray() for vec in eval_pdf["features"]])
    test_pdf = eval_pdf.drop(columns="features").rename(columns={"rating": "y_true"})
    test_pdf["y_pred_prob"] = model.predict_proba(X_eval)[:, 1]

    # Reuse the pandas helper instead of a Spark round trip per trial.
    # NOTE(review): hyperparameters are selected on the test split, so the
    # reported score is optimistic - consider scoring on valid_df instead.
    return precision_at_k(test_pdf, k=K, threshold=4.0)

In [0]:
# Search for the hyperparameters that maximise Precision@K (30 trials).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print(f"Best params: {study.best_params}")
print(f"Best Precision@K: {study.best_value}")

In [0]:
# Refit a classifier with the best hyperparameters found by the study and
# persist it with joblib.
best_params = study.best_params

# best_params holds only the values sampled through trial.suggest_*, so this
# fit runs without eval_metric / early stopping.
best_model = XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Positive-class probabilities on the test matrix (not used further in this cell).
best_pred = best_model.predict_proba(X_test)[:,1]

# 1. Create the target directory if it does not exist yet.
os.makedirs("/dbfs/FileStore/models", exist_ok=True)

joblib.dump(best_model, "/dbfs/FileStore/models/best_xgb_clf_model.pkl")

# NOTE(review): the example below loads "best_xgb_model.pkl", which is not the
# path written above.
# loaded_model = joblib.load("best_xgb_model.pkl")
# y_pred = loaded_model.predict(X_test)

### Optuna + mlflow

In [0]:
# Optuna objective for the XGBoost classifier (label = 1 when rating >= 4);
# every trial is logged to MLflow.
def objective(trial):
    """Train one XGBClassifier trial, log it to MLflow, and score Precision@K.

    Reads the module-level ``X_train``/``y_train``, ``X_valid``/``y_valid``,
    ``test_df`` and ``K``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The NEGATED mean per-user Precision@K, so this objective must be used
        with a minimising study (Optuna's default direction), not with a study
        created with ``direction="maximize"``.
    """
    param = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        # This cell previously built an XGBRegressor with "reg:squarederror"
        # and then called predict_proba(), which regressors do not provide.
        # The section tunes the classifier, so use the binary objective.
        "objective": "binary:logistic",
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "early_stopping_rounds": 20,
    }

    with mlflow.start_run(nested=True):
        mlflow.log_params(param)

        model = XGBClassifier(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

        # Collect features and identifiers in ONE Spark action. Predicting on
        # the global X_test and attaching the result to a separately collected
        # frame relies on both actions returning rows in the same order, which
        # Spark does not guarantee.
        eval_pdf = test_df.select("features", "userIndex", "movieIndex", "rating").toPandas()
        X_eval = np.vstack([vec.toArray() for vec in eval_pdf["features"]])
        test_pdf = eval_pdf.drop(columns="features").rename(columns={"rating": "y_true"})
        # Probability of the positive class (rating >= 4).
        test_pdf["y_pred_prob"] = model.predict_proba(X_eval)[:, 1]

        # Reuse the pandas helper instead of a Spark round trip per trial.
        # NOTE(review): hyperparameters are selected on the test split, so the
        # reported score is optimistic - consider scoring on valid_df instead.
        avg_precision_at_k = precision_at_k(test_pdf, k=K, threshold=4.0)

        mlflow.log_metric("precision_at_k", avg_precision_at_k)
        mlflow.sklearn.log_model(model, "model")

        return -avg_precision_at_k

In [0]:
# Refit a classifier with the study's best hyperparameters and log it to
# MLflow under its own run.
best_model = XGBClassifier(**study.best_params)
# best_params carries no early_stopping_rounds, so eval_set here only provides
# evaluation data to the fit.
best_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

with mlflow.start_run(run_name="final_best_model"):
    mlflow.log_params(study.best_params)
    mlflow.sklearn.log_model(best_model, "final_model")

# 5. 한계

## 5-1. 안 본 영화 중에서 추천(rating 있다면 본 영화로 판단)

In [0]:
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import array_contains

# 1. Movies each user has already seen: every (userIndex, movieIndex) pair in
#    the train and validation splits, collected into one set per user.
seen_pairs = train_df.select("userIndex", "movieIndex").union(
    valid_df.select("userIndex", "movieIndex")
)
seen_movies_df = (
    seen_pairs.distinct()
    .groupBy("userIndex")
    .agg(collect_set("movieIndex").alias("seen_movies"))
)

# 2. Attach each user's seen set to the scored test rows.
test_pred_filtered = test_pred.join(seen_movies_df, on="userIndex", how="left_outer")

# 3. Keep rows whose movie is not in the seen set; users without any seen set
#    (null after the left join) keep all their rows.
has_no_history = col("seen_movies").isNull()
already_seen = array_contains(col("seen_movies"), col("movieIndex"))
test_pred_filtered = test_pred_filtered.filter(has_no_history | (~already_seen))

# 4. Top-K per user, packed into (movieId, pred_rating) structs and collected
#    into one list per user.
window_spec = Window.partitionBy("userIndex").orderBy(col("y_pred_prob").desc())

ranked = (
    test_pred_filtered.withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") <= K)
)

ranked = ranked.withColumn(
    "recommendation",
    struct(
        col("movieIndex").alias("movieId"),
        col("y_pred_prob").alias("pred_rating"),
    ),
)

recommendations = (
    ranked.groupBy("userIndex")
    .agg(collect_list("recommendation").alias("recommendations"))
    .orderBy("userIndex")
)

display(recommendations)

## 5-2. 추가 User가 들어왔을때 어떤 영화를 추천할지

1. 신규유저에 영화정보를 바탕으로 기존유저와 매칭

2. 새로운 user_index 추천하지 않음(새로 학습해야함) - 현재 모델에서 user_index가 중요한 역할/ 처음 보는 User_index보고 엉뚱한 예측할 확률 높음

1. Cold Start 유저 처리
신규 유저:

A안: userIndex 제거한 일반 모델로 추천 제공 (장르, 평균 평점 기반)

B안: 기존 유저와 유사도 기반으로 매칭 → 그 유저의 추천을 가져옴 (collaborative filtering 유사)

2. 데이터 업데이트 및 재학습
신규 rating이 수집되면 데이터에 추가

일정 주기 (일간/주간 등)에 따라 모델 재학습

재학습된 모델은 최신화된 추천 반영 가능



1. 시청 영화 바탕으로 유사한 기존 유저 index를 찾고,
2. 그 index와 시청 영화 바탕으로 추천 영화 예측

In [0]:
# Step 1. Movies the new user has already seen (example movieId values).
new_user_seen_movies = [1, 10, 50, 300]
new_user_set = set(new_user_seen_movies)

# Step 2. Set of rated movieIds for every existing user.
# NOTE(review): built from `ratings`, while the user indexer was fitted on
# df_cleaned; a user present only in `ratings` would be unknown to the indexer
# - confirm the two user populations match.
user_movie_sets = ratings.groupBy("userId").agg(collect_set("movieId").alias("movie_set"))

# Step 3. Jaccard similarity between two collections of movie ids.
def jaccard_similarity(set1, set2):
    """Return |A ∩ B| / |A ∪ B| for two iterables, or 0.0 when both are empty.

    Args:
        set1: First iterable of hashable items (duplicates are ignored).
        set2: Second iterable of hashable items (duplicates are ignored).

    Returns:
        Similarity in [0.0, 1.0] as a float.
    """
    left, right = set(set1), set(set2)
    combined_size = len(left | right)
    if combined_size == 0:
        return 0.0
    return float(len(left & right)) / combined_size

# `udf` is not among the file-level imports, so import it here; without it this
# cell fails with a NameError.
from pyspark.sql.functions import udf

# Wrap the pure-Python similarity so Spark can score every existing user's
# movie set against the new user's set.
jaccard_udf = udf(lambda x: jaccard_similarity(new_user_set, x), DoubleType())

# Keep only the single most similar existing user.
similar_users = (
    user_movie_sets.withColumn("jaccard_sim", jaccard_udf(col("movie_set")))
    .orderBy(desc("jaccard_sim"))
    .limit(1)
)

# Step 4. Resolve that user's userId and the userIndex the model was trained on.
best_match = similar_users.select("userId").first()
if best_match is None:
    # first() returns None on an empty result; fail with a clear message
    # instead of a "'NoneType' object is not subscriptable" TypeError.
    raise ValueError("No existing users found in `ratings`; cannot match the new user.")
top_user_id = best_match["userId"]
top_user_index = user_indexer_model.transform(
    spark.createDataFrame([(top_user_id,)], ["userId"])
).select("userIndex").first()["userIndex"]

In [0]:
# Step 5. Candidate movies: everything the new user has not seen yet.
unseen_movies = df_cleaned.filter(~col("movieId").isin(new_user_seen_movies))

# Step 6. One row per candidate movie, re-attributed to the matched existing
# user BEFORE the feature pipeline runs.
# The previous version overwrote the userIndex column after
# pipeline_model.transform(); by then the "features" vector had already been
# assembled from each row's original user, so the matched user never
# influenced the predictions.
user_id_type = df_cleaned.schema["userId"].dataType
unseen_as_similar_user = (
    unseen_movies.dropDuplicates(["movieId"])
    .withColumn("userId", lit(top_user_id).cast(user_id_type))
)

# Step 7. Build movieIndex, userIndex and features for the candidates; the
# pipeline's indexer now maps every row to the matched user's index.
unseen_with_features = pipeline_model.transform(unseen_as_similar_user)
unseen_unique = unseen_with_features  # already one row per movieId

# Step 8. Convert the features column to a dense numpy matrix.
pdf_unseen = unseen_unique.select("features", "movieId", "title").toPandas()
X_unseen = np.vstack([vec.toArray() for vec in pdf_unseen["features"]])

# Step 9. Score with the classifier held in `model`: probability of the
# positive class (rating >= 4), hence predict_proba rather than predict.
y_pred = model.predict_proba(X_unseen)[:, 1]

# Step 10. Attach the scores to the candidate frame.
pdf_unseen["prediction"] = y_pred

# Step 11. Keep the 10 highest-scored candidates.
top_k = pdf_unseen.sort_values(by="prediction", ascending=False).head(10)

# Show the recommendations.
print(top_k[["movieId", "title", "prediction"]])