In [0]:
        [사용자]
           │
     ┌─────▼─────┐
     │  설문 응답 │  ← 사용자의 취향(장르 등)
     └─────┬─────┘
           │
     ┌─────▼────────────┐
     │사용자 성향 벡터 생성│
     └─────┬────────────┘
           │
           ▼
[ 사용자 기반 추천 모델 ]  ← Cosine Similarity 등

           ▲
           │
     ┌─────▼─────┐
     │ 평점 데이터 │  ← ratings.csv
     └─────┬─────┘
           ▼
   [ ALS 추천 모델 ]  ← Spark ML ALS

           ▼
  ┌────────┴────────┐
  │ Hybrid 추천 조합│ ← 설문 기반 + ALS 기반 결합 (가중 평균 등)
  └────────┬────────┘
           ▼
     [추천 영화 리스트]
           ▼
       [시각화/출력]


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the datasets from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

# Rating splits (validation and test are loaded here but not used in this cell)
train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")

# Movie metadata; movieId is cast to integer
df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. Define the ALS model
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,  # fixed seed so repeated runs start from the same initial factors
)

# 3. Train the model
model = als.fit(train)

# 4. Recommend for one specific user (example: userId 123)
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

# 5. Flatten the recommendations array into one row per (userId, movieId, rating)
userRecsExploded = (
    userRecs.select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# 6. Assign a 0-based rank by descending predicted rating.
#    The window is partitioned by userId so the rank is computed per user and Spark
#    does not have to move every row into a single partition to order them.
windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
indexedRecs = (
    userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1)
    .select("index", "movieId")
)

# 7. Show the result
display(indexedRecs)

## MLflow 사용

In [0]:
%python
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# 0. Select the MLflow experiment that the run below is recorded under
mlflow.set_experiment('/Users/1dt011@msacademy.msai.kr/1dt011')

# 1. Load the data (Unity Catalog)
catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")
df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. Train, evaluate and log everything inside a single MLflow run
with mlflow.start_run(run_name="ALS-Recommender-UnityCatalog"):

    # Hyperparameters
    rank = 10
    maxIter = 10
    regParam = 0.1

    # Define and fit the ALS model, then score the test split
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        rank=rank,
        maxIter=maxIter,
        regParam=regParam,
        coldStartStrategy="drop",
        nonnegative=True,
    )
    model = als.fit(train)
    predictions = model.transform(test)

    # Evaluation metric: RMSE between the true rating and the prediction
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction",
    )
    rmse = evaluator.evaluate(predictions)

    # Log the hyperparameters and the metric to MLflow
    for param_name, param_value in (("rank", rank), ("maxIter", maxIter), ("regParam", regParam)):
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("rmse", rmse)

    # Store the fitted model as a run artifact
    mlflow.spark.log_model(model, "als_model")

    print(f"✅ MLflow Run Completed - RMSE: {rmse:.4f}")


## 평가 지표: Precision@10, Recall@10

In [0]:
from pyspark.sql.functions import expr, collect_set, size, array_intersect
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the rating splits from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")

# 2. Define the ALS model
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    coldStartStrategy="drop", nonnegative=True,
    seed=42,  # fixed seed so repeated runs start from the same initial factors
)

# 3. Train the model
model = als.fit(train)

# 4. Recommend for one specific user (example: userId 123)
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

# 5. Flatten the recommendations array into one row per (userId, movieId, rating)
userRecsExploded = (
    userRecs.select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# Assign a 0-based rank by descending predicted rating.
# Partitioned by userId so ranks are per user and the ordering is not forced
# through a single partition.
windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
indexedRecs = (
    userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1)
    .select("index", "movieId")
)

# 6. Output of the single-user result: nothing is displayed in this cell.

# -------------------------------------- #

# 7. Ground truth: per user, the set of test-split movies rated 4.0 or higher
positive_test = (
    test.filter(col("rating") >= 4.0)
    .groupBy("userId")
    .agg(collect_set("movieId").alias("true_items"))
)

# 8. Generate 10 recommendations for every user.
# NOTE(review): nothing here removes movies the user already rated in `train` from the
# recommendations — confirm whether they should be excluded before scoring.
userRecsAll = model.recommendForAllUsers(10)

# 9. Keep only the movieId of each recommendation
predicted_items = userRecsAll.select(
    "userId",
    expr("transform(recommendations, x -> x.movieId)").alias("pred_items"),
)

# 10. Join predictions with the ground truth.
#     This is an inner join, so users without any positive test rating are not scored.
joined = predicted_items.join(positive_test, on="userId")

# 11. Per-user Precision@10 and Recall@10:
#     hits / number of recommended items, and hits / number of relevant items
metrics = (
    joined.withColumn(
        "num_relevant_and_recommended",
        size(array_intersect("pred_items", "true_items")),
    )
    .withColumn(
        "precision_at_10",
        col("num_relevant_and_recommended") / expr("size(pred_items)"),
    )
    .withColumn(
        "recall_at_10",
        col("num_relevant_and_recommended") / expr("size(true_items)"),
    )
)

# 12. Average both metrics over the scored users
precision_recall = metrics.selectExpr(
    "avg(precision_at_10) as avg_precision_at_10",
    "avg(recall_at_10) as avg_recall_at_10",
)

# Show the result
precision_recall.show()


## **개선**
✅ 개선 포인트 요약

| 항목 | 변경 전 | 변경 후 제안 |
| --- | --- | --- |
| ALS 파라미터 | 기본값 사용 | rank, maxIter, regParam 조정 |
| 추천 수 | Top 10 | Top 50으로 늘리고 그중 상위 N개 평가 |
| 추천 유저 | 고정 userId | 다수 유저에 대해 평가 자동화 가능하게 |


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the datasets from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

# Rating splits (validation and test are loaded here but not used in this cell)
train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")

# Movie metadata; movieId is cast to integer
df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. Define the ALS model (tuned hyperparameters)
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    rank=20,
    maxIter=15,
    regParam=0.1,
    seed=42,  # fixed seed so repeated runs start from the same initial factors
)

# 3. Train the model
model = als.fit(train)

# 4. Users to recommend for (a single user here; add more ids to the list as needed)
user_ids = [123]
user_df = spark.createDataFrame([(uid,) for uid in user_ids], ["userId"])

# 5. Request 50 candidates per user; only the best topN are kept below
userRecs = model.recommendForUserSubset(user_df, 50)

# 6. Flatten the recommendations array into one row per (userId, movieId, rating)
userRecsExploded = (
    userRecs.select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# 7. Rank each user's candidates (0-based, by descending predicted rating) and keep the top N.
#    userId stays in the output: ranks restart for every user, so without it the rows
#    cannot be attributed once user_ids holds more than one id.
windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
topN = 10

indexedRecs = (
    userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1)
    .filter(col("index") < topN)
    .select("userId", "index", "movieId")
)

# 8. Show the result
display(indexedRecs)


In [0]:
from pyspark.sql.functions import col, countDistinct, collect_set, size
from pyspark.sql import functions as F

# 1. Build the per-user Top-10 recommendation table with columns (userId, movieId).
#    `topNRecs` is used below but is not defined in any visible cell, so a fresh
#    Restart & Run All would fail with a NameError. It is rebuilt here from the ALS
#    `model` fitted in an earlier cell.
topNRecs = (
    model.recommendForAllUsers(10)
    .select("userId", F.explode("recommendations").alias("rec"))
    .select("userId", F.col("rec.movieId").alias("movieId"))
)

# 2. Ground truth: test-split movies rated 4.0 or higher count as relevant.
#    collect_set already de-duplicates, so no separate distinct() pass is needed.
actual_relevant = (
    test.filter(col("rating") >= 4.0)
    .groupBy("userId")
    .agg(collect_set("movieId").alias("actual_movies"))
)

# 3. Collapse the recommendations into one movieId set per user
predicted_recs = (
    topNRecs.groupBy("userId")
    .agg(collect_set("movieId").alias("predicted_movies"))
)

# 4. Join predictions with the ground truth.
#    Inner join: only users present on both sides are scored.
joined = predicted_recs.join(actual_relevant, on="userId", how="inner")

# 5. Precision@10 / Recall@10 are computed per user by the function defined next

# 5. Precision@10, Recall@10 계산
def precision_recall_udf(predicted, actual):
    """Compute precision and recall for one user's recommendation list.

    Duplicates in either input are ignored because both are converted to sets.

    Args:
        predicted: Iterable of recommended item ids, or None.
        actual: Iterable of relevant item ids, or None.

    Returns:
        A ``(precision, recall)`` tuple of floats. Precision is the share of
        distinct predicted items that are relevant; recall is the share of
        distinct relevant items that were predicted. Each value is 0.0 when its
        denominator would be zero (empty or None input).
    """
    # A null array column reaches a Python UDF as None; treat it like an empty list
    # instead of letting set(None) raise TypeError.
    predicted_set = set(predicted) if predicted is not None else set()
    actual_set = set(actual) if actual is not None else set()

    hits = len(predicted_set & actual_set)
    precision = hits / len(predicted_set) if predicted_set else 0.0
    recall = hits / len(actual_set) if actual_set else 0.0
    return (precision, recall)

from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.sql.functions import udf

# Return type of the UDF: a struct holding the two per-user scores.
# Named `metrics_schema` rather than `schema` so that the Unity Catalog schema-name
# string `schema` assigned in the data-loading cells is not overwritten by a StructType.
metrics_schema = StructType([
    StructField("precision", DoubleType(), True),
    StructField("recall", DoubleType(), True),
])

# Wrap the plain Python function as a Spark UDF returning that struct
precision_recall_udf_spark = udf(precision_recall_udf, metrics_schema)

# 6. Score every user, then expand the struct into `precision` and `recall` columns
scored = (
    joined.withColumn(
        "metrics",
        precision_recall_udf_spark(col("predicted_movies"), col("actual_movies")),
    )
    .select("userId", "metrics.*")
)

# 7. Average both scores over all scored users
avg_scores = scored.select(
    F.avg("precision").alias("avg_precision_at_10"),
    F.avg("recall").alias("avg_recall_at_10"),
)

# 8. Show the result
display(avg_scores)


In [0]:
import matplotlib.pyplot as plt
import numpy as np

# Metric names shown on the x-axis.
# Named `metric_names` (not `metrics`) so the `metrics` DataFrame built in the
# Precision@10 / Recall@10 evaluation cell is not replaced by a plain list.
metric_names = ['Precision@10', 'Recall@10']

# Scores before and after the tuning, in the same order as metric_names.
# NOTE(review): hard-coded literals, presumably transcribed from earlier evaluation
# output — verify against the current run.
before = [0.00387, 0.00422]
after = [0.01356, 0.01423]

x = np.arange(len(metric_names))  # x-axis position of each metric group
width = 0.35                       # bar width

# Two side-by-side bars per metric: "Before" left of the tick, "After" right of it
fig, ax = plt.subplots(figsize=(6, 4))
bars1 = ax.bar(x - width/2, before, width, label='Before', color='lightcoral')
bars2 = ax.bar(x + width/2, after, width, label='After', color='skyblue')

# Axis label, title, tick labels and legend
ax.set_ylabel('Score')
ax.set_title('Avg Precision@10 and Recall@10 (Before vs After)')
ax.set_xticks(x)
ax.set_xticklabels(metric_names)
ax.legend()

# 값 표시 (소수점 3자리)
def autolabel(bars, ax=None, decimals=3):
    """Write each bar's height as a text label centred just above the bar.

    Args:
        bars: Iterable of bar objects exposing ``get_height()``, ``get_x()`` and
            ``get_width()``.
        ax: Axes-like object whose ``annotate`` method draws the labels. When
            omitted, the current pyplot axes (``plt.gca()``) is used, so existing
            ``autolabel(bars)`` calls keep working without relying on a global ``ax``.
        decimals: Number of decimal places in the label (default 3).
    """
    target_ax = plt.gca() if ax is None else ax
    for bar in bars:
        height = bar.get_height()
        target_ax.annotate(
            f'{height:.{decimals}f}',
            xy=(bar.get_x() + bar.get_width() / 2, height),  # top-centre of the bar
            xytext=(0, 3),  # shift the text 3 points upwards
            textcoords="offset points",
            ha='center', va='bottom',
        )

# Label both bar series with their values
autolabel(bars1)
autolabel(bars2)

# Leave headroom above the tallest bar of either series, so neither a bar nor its
# label is clipped even if a "before" value exceeds every "after" value.
plt.ylim(0, max(before + after) + 0.005)
plt.tight_layout()
plt.show()
