## 1. 군집화 기반 사용자 영화추천

In [0]:

┌──────────────────────┐
│ 1. 데이터 로딩        │
│ - movies.csv         │
│ - ratings.csv        │
└─────────┬────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 2. 장르 전처리                        │
│ - 장르 explode 및 원-핫 인코딩         │
└─────────┬────────────────────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 3. 사용자 장르 선호도 벡터화           │
│ - 장르 x 평점 → 사용자 프로파일 생성    │
└─────────┬────────────────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 4. 사용자 벡터 → features   │
│ - VectorAssembler 사용     │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 5. KMeans 군집화            │
│ - 사용자 군집 분류           │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────────────┐
│ 6. 클러스터 내 영화 평균 평점 계산    │
│ - 각 군집 내 인기 영화 파악          │
└─────────┬──────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 7. 특정 사용자에게 추천 수행          │
│ - 같은 클러스터 + 안 본 영화          │
│ - 평균 평점 기반 상위 N개 추천        │
└─────────┬───────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 8. 추천 결과 출력                    │
│ - movieId, title, genres, avg_rating│
└─────────────────────────────────────┘


In [0]:
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window

def recommend_movies_by_user(userId: int):
    catalog = "1dt_team8_databricks"
    schema = "final"
    path = f"{catalog}.{schema}"
    
    df1 = spark.read.table(f"{path}.movies")
    df2 = spark.read.table(f"{path}.links")      
    df3 = spark.read.table(f"{path}.ratings")
    df4 = spark.read.table(f"{path}.tags")   

    movies_with_genres = df1.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = movies_with_genres.select("genre").distinct().rdd.flatMap(lambda x: x).collect()

    for genre in distinct_genres:
        movies_with_genres = movies_with_genres.withColumn(
            f"genre_{genre}",
            when(col("genre") == genre, 1).otherwise(0)
        )

    genre_features = movies_with_genres.groupBy("movieId").agg(
        *[max(f"genre_{genre}").alias(f"genre_{genre}") for genre in distinct_genres]
    )

    ratings_with_genres = df3.join(genre_features, on="movieId", how="inner")

    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )

    feature_cols = [c for c in user_profile.columns if c.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile)

    kmeans = KMeans(k=5, seed=42)
    model = kmeans.fit(user_features)

    user_clusters = model.transform(user_features).select("userId", "prediction")
    ratings_with_cluster = df3.join(user_clusters, on="userId")
    movie_avg_by_cluster = ratings_with_cluster.groupBy("prediction", "movieId") \
        .agg(avg("rating").alias("avg_rating"))

    movie_avg_with_titles = movie_avg_by_cluster.join(df1.select("movieId", "title", "genres"), on="movieId")

    user_seen_movies = df3.filter(col("userId") == userId).select("movieId").distinct()
    user_cluster = user_clusters.filter(col("userId") == userId).select("prediction").collect()[0][0]

    recommend_pool = movie_avg_with_titles.filter(col("prediction") == user_cluster)
    recommend_pool_unseen = recommend_pool.join(user_seen_movies, on="movieId", how="left_anti")

    top_recommendations = recommend_pool_unseen \
        .filter(col("genres") != "(no genres listed)") \
        .orderBy(col("avg_rating").desc()) \
        .limit(10) \
        .select("movieId")

    indexed = top_recommendations.withColumn(
        "index", row_number().over(Window.orderBy(col("movieId"))) - 1
    ).select("index", "movieId")

    display(indexed)


In [0]:
recommend_movies_by_user(2)

## ML Flow 사용

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

def run_kmeans_with_mlflow(user_profile_df, k=5, seed=42):
    # ✅ MLflow 실험 위치 설정
    mlflow.set_experiment("/Users/1dt011@msacademy.msai.kr/1dt011")

    feature_cols = [col for col in user_profile_df.columns if col.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile_df)

    with mlflow.start_run(run_name="KMeans_User_Clustering"):
        mlflow.log_param("k", k)
        mlflow.log_param("seed", seed)

        kmeans = KMeans(k=k, seed=seed)
        model = kmeans.fit(user_features)

        # 클러스터링 비용(Within Set Sum of Squared Errors)
        mlflow.log_metric("wsse", model.summary.trainingCost)

        # 모델 저장
        mlflow.spark.log_model(model, "kmeans_model")

        # userId와 클러스터 예측 결과만 반환
        return model.transform(user_features).select("userId", "prediction")


In [0]:
from pyspark.sql.functions import col, split, explode, avg

# 1. 데이터 불러오기
movies_df = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
ratings_df = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

# 2. 타입 변환
ratings_df = ratings_df.withColumn("movieId", col("movieId").cast("int")) \
                       .withColumn("rating", col("rating").cast("float")) \
                       .withColumn("userId", col("userId").cast("int"))
movies_df = movies_df.withColumn("movieId", col("movieId").cast("int"))

# 3. 장르 explode
movie_genres = movies_df.withColumn("genre", explode(split(col("genres"), "\\|"))).select("movieId", "genre")

# 4. 평점과 장르 조인
ratings_with_genre = ratings_df.join(movie_genres, on="movieId", how="inner")

# 5. 사용자별 장르별 평균 평점 계산
user_genre_pref = ratings_with_genre.groupBy("userId", "genre").agg(avg("rating").alias("avg_rating"))

# 6. Pivot 해서 wide format으로 변환
user_profile_df = user_genre_pref.groupBy("userId").pivot("genre").agg(avg("avg_rating"))

# 7. Null 값을 0으로 대체 (평점이 없는 장르는 0으로 간주)
user_profile_df = user_profile_df.fillna(0)

# 8. 컬럼명에 'pref_' 접두어 붙이기 (userId 제외)
for col_name in user_profile_df.columns:
    if col_name != "userId":
        user_profile_df = user_profile_df.withColumnRenamed(col_name, f"pref_{col_name}")

# 9. 앞서 정의한 MLflow 로깅이 포함된 KMeans 함수 호출
result_df = run_kmeans_with_mlflow(user_profile_df, k=5, seed=42)

# 10. 결과 확인 (예시)
display(result_df)


##평가지표_ Precision@10 , Recall@10

In [0]:
%python
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window

def recommend_movies_by_user(userId: int):
    df1 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    df2 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.links")
    df3 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")
    df4 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.tags") 

    movies_with_genres = df1.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = movies_with_genres.select("genre").distinct().rdd.flatMap(lambda x: x).collect()

    for genre in distinct_genres:
        movies_with_genres = movies_with_genres.withColumn(
            f"genre_{genre}",
            when(col("genre") == genre, 1).otherwise(0)
        )

    genre_features = movies_with_genres.groupBy("movieId").agg(
        *[max(f"genre_{genre}").alias(f"genre_{genre}") for genre in distinct_genres]
    )

    ratings_with_genres = df3.join(genre_features, on="movieId", how="inner")

    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )

    feature_cols = [col for col in user_profile.columns if col.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile)

    kmeans = KMeans(k=5, seed=42)
    model = kmeans.fit(user_features)

    user_clusters = model.transform(user_features).select("userId", "prediction")
    ratings_with_cluster = df3.join(user_clusters, on="userId")
    movie_avg_by_cluster = ratings_with_cluster.groupBy("prediction", "movieId") \
        .agg(avg("rating").alias("avg_rating"))

    movie_avg_with_titles = movie_avg_by_cluster.join(df1.select("movieId", "title", "genres"), on="movieId")

    user_seen_movies = df3.filter(col("userId") == userId).select("movieId").distinct()
    user_cluster = user_clusters.filter(col("userId") == userId).select("prediction").collect()[0][0]

    recommend_pool = movie_avg_with_titles.filter(col("prediction") == user_cluster)
    recommend_pool_unseen = recommend_pool.join(user_seen_movies, on="movieId", how="left_anti")

    top_recommendations = recommend_pool_unseen \
        .filter(col("genres") != "(no genres listed)") \
        .orderBy(col("avg_rating").desc()) \
        .limit(10) \
        .select("movieId")

    indexed = top_recommendations.withColumn(
        "index", row_number().over(Window.orderBy(col("movieId"))) - 1
    ).select("index", "movieId")


def evaluate_precision_recall_at_10(userId: int):
    recommended_movies = recommend_movies_by_user(userId)
    if recommended_movies is not None:
        recommended_movies = recommended_movies.select("movieId").limit(10)
    else:
        raise ValueError(f"No recommendations found for userId: {userId}")

    relevant_movies = df3.filter(
        (col("userId") == userId) & (col("rating") >= 4.0)
    ).select("movieId").distinct()

    true_positives = recommended_movies.join(relevant_movies, "movieId").count()
    precision = true_positives / recommended_movies.count()
    recall = true_positives / relevant_movies.count()

    return precision, recall

In [0]:
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window
import pandas as pd

def recommend_movies_by_user_return_df(userId: int):
    df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    movies_with_genres = df_movies.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = movies_with_genres.select("genre").distinct().rdd.flatMap(lambda x: x).collect()

    for genre in distinct_genres:
        movies_with_genres = movies_with_genres.withColumn(
            f"genre_{genre}",
            when(col("genre") == genre, 1).otherwise(0)
        )
    genre_features = movies_with_genres.groupBy("movieId").agg(
        *[max(f"genre_{genre}").alias(f"genre_{genre}") for genre in distinct_genres]
    )
    
    ratings_with_genres = train.join(genre_features, on="movieId", how="inner")
    
    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )
    
    assembler = VectorAssembler(inputCols=[c for c in user_profile.columns if c.startswith("pref_")], outputCol="features")
    user_features = assembler.transform(user_profile)
    
    kmeans = KMeans(k=5, seed=42)
    model = kmeans.fit(user_features)
    
    user_clusters = model.transform(user_features).select("userId", "prediction")
    
    ratings_with_cluster = train.join(user_clusters, on="userId")
    movie_avg_by_cluster = ratings_with_cluster.groupBy("prediction", "movieId").agg(avg("rating").alias("avg_rating"))
    
    movie_avg_with_titles = movie_avg_by_cluster.join(df_movies.select("movieId", "title", "genres"), on="movieId")
    
    user_seen_movies = train.filter(col("userId") == userId).select("movieId").distinct()
    
    cluster_row = user_clusters.filter(col("userId") == userId).collect()
    if not cluster_row:
        return None
    user_cluster = cluster_row[0]["prediction"]
    
    recommend_pool = movie_avg_with_titles.filter(col("prediction") == user_cluster)
    recommend_pool_unseen = recommend_pool.join(user_seen_movies, on="movieId", how="left_anti")
    
    top_recommendations = recommend_pool_unseen \
        .filter(col("genres") != "(no genres listed)") \
        .orderBy(col("avg_rating").desc()) \
        .limit(10) \
        .select("movieId")
    
    return top_recommendations

def evaluate_precision_recall_safe(userId: int, top_k: int = 10):
    try:
        rec_df = recommend_movies_by_user_return_df(userId)
    except Exception:
        return {"userId": userId, "precision": 0.0, "recall": 0.0}
    
    if rec_df is None or rec_df.count() == 0:
        return {"userId": userId, "precision": 0.0, "recall": 0.0}
    
    predicted = rec_df.select("movieId").rdd.flatMap(lambda x: x).collect()
    
    actual_df = validation.filter((col("userId") == userId) & (col("rating") >= 4.0)).select("movieId").distinct()
    actual = actual_df.rdd.flatMap(lambda x: x).collect()
    
    if len(actual) == 0:
        return {"userId": userId, "precision": 0.0, "recall": 0.0}
    
    intersection = set(predicted) & set(actual)
    precision = len(intersection) / top_k
    recall = len(intersection) / len(actual)
    
    return {"userId": userId, "precision": precision, "recall": recall}

qualified_users_df = validation.filter(col("rating") >= 4.0) \
    .groupBy("userId") \
    .count() \
    .filter("count >= 5") \
    .orderBy("count", ascending=False)

user_ids = [row["userId"] for row in qualified_users_df.collect()]

results = [evaluate_precision_recall_safe(uid) for uid in user_ids[:30]]

df_results = pd.DataFrame(results)
avg_precision = df_results["precision"].mean()
avg_recall = df_results["recall"].mean()

print(f"▶ 평균 Precision@10: {avg_precision:.4f}")
print(f"▶ 평균 Recall@10: {avg_recall:.4f}")


## 개선

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col
from sklearn.metrics import silhouette_score
import pandas as pd

def optimize_kmeans_k(user_profile_df, k_min=2, k_max=10):
    feature_cols = [col for col in user_profile_df.columns if col.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features_df = assembler.transform(user_profile_df).select("userId", "features")

    features_np = user_features_df.select("features").rdd.map(lambda x: x[0].toArray()).collect()
    features_np = pd.DataFrame(features_np).values  # numpy array

    results = []

    for k in range(k_min, k_max + 1):
        kmeans = KMeans(k=k, seed=42)
        model = kmeans.fit(user_features_df)
        predictions = model.transform(user_features_df)
        preds = predictions.select("prediction").rdd.map(lambda x: x[0]).collect()
        score = silhouette_score(features_np, preds)

        results.append({"k": k, "silhouette_score": score})
        print(f"✅ k={k}, silhouette_score={score:.4f}")

    best = max(results, key=lambda x: x["silhouette_score"])
    print(f"\n📌 최적 k: {best['k']} (Silhouette Score: {best['silhouette_score']:.4f})")

    return pd.DataFrame(results), best["k"]

In [0]:
%python
from pyspark.sql.functions import col, split, explode, avg, when, max as spark_max


df1 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
df3 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")
df2 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.links")
df4 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.tags")

# 2. 영화 장르 분해 (movies.genres → 개별 장르 컬럼으로)
movies_with_genres = df1.withColumn("genre", explode(split("genres", "\\|")))

# 3. 모든 장르 리스트 추출
distinct_genres = movies_with_genres.select("genre").distinct().rdd.flatMap(lambda x: x).collect()

# 4. 각 장르에 대해 binary 컬럼 생성 (해당 장르면 1, 아니면 0)
for genre in distinct_genres:
    movies_with_genres = movies_with_genres.withColumn(
        f"genre_{genre}", when(col("genre") == genre, 1).otherwise(0)
    )

# 5. 영화별 장르 one-hot encoding 집계
genre_features = movies_with_genres.groupBy("movieId").agg(
    *[spark_max(f"genre_{genre}").alias(f"genre_{genre}") for genre in distinct_genres]
)

# 6. 사용자 평점 데이터와 장르 결합
ratings_with_genres = df3.join(genre_features, on="movieId", how="inner")

# 7. 사용자별 장르 선호도 (평균) 계산 → 결과가 user_profile_df
user_profile_df = ratings_with_genres.groupBy("userId").agg(
    *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
)

# 8. KMeans 최적 군집 수 찾기 및 실행 (이 함수는 따로 정의되어 있어야 함)
results_df, best_k = optimize_kmeans_k(user_profile_df, k_min=2, k_max=10)

In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Unity Catalog에서 데이터 불러오기
catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")

df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies") \
                      .withColumn("movieId", col("movieId").cast("integer"))

# 2. ALS 모델 정의 (파라미터 조정)
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    rank=20,
    maxIter=15,
    regParam=0.1
)

# 3. 모델 학습
model = als.fit(train)

# 4. 추천 대상 사용자 설정
user_ids = [123]  # 하나의 사용자만 추천할 경우
user_df = spark.createDataFrame([(uid,) for uid in user_ids], ["userId"])

# 5. Top 50 추천 받아서 Top 10만 출력
userRecs = model.recommendForUserSubset(user_df, 50)

# 6. 추천 결과 정리
userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                            .select("userId", col("rec.movieId"), col("rec.rating"))

# 7. Top 10 영화만 index 부여하여 추출
windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
topN = 10

indexedRecs = userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1) \
                              .filter(col("index") < topN) \
                              .select("index", "movieId")

# 8. 결과 출력
display(indexedRecs)


In [0]:
from pyspark.sql.functions import col, explode, expr, size
from pyspark.sql import Window
from pyspark.sql.functions import row_number

def evaluate_precision_recall_at_10(model, test_df, df_movies):
    # 1. 추천 대상 사용자만 추출
    users = test_df.select("userId").distinct()

    # 2. 각 사용자에 대해 top-10 추천
    userRecs = model.recommendForUserSubset(users, 10)

    # 3. 추천 결과 explode
    recs = userRecs.select("userId", explode("recommendations").alias("rec")) \
                   .select("userId", col("rec.movieId").alias("movieId"))

    # 4. test_df에서 실제 본 영화 가져오기
    test_actual = test_df.select("userId", "movieId").distinct()

    # 5. 추천 결과와 실제 값 비교 (True Positive)
    hits = recs.join(test_actual, on=["userId", "movieId"])

    # 6. Precision@10 = (# hits) / 10
    precision_per_user = hits.groupBy("userId").count().withColumnRenamed("count", "num_hits") \
                             .withColumn("precision_at_10", col("num_hits") / 10.0)

    # 7. Recall@10 = (# hits) / (# actual items in test set for that user)
    actual_count = test_actual.groupBy("userId").count().withColumnRenamed("count", "actual_count")
    recall_per_user = hits.groupBy("userId").count().withColumnRenamed("count", "num_hits") \
                          .join(actual_count, on="userId") \
                          .withColumn("recall_at_10", col("num_hits") / col("actual_count"))

    # 8. 평균 Precision, Recall 계산
    avg_precision = precision_per_user.agg({"precision_at_10": "avg"}).first()[0]
    avg_recall = recall_per_user.agg({"recall_at_10": "avg"}).first()[0]

    print(f"📊 Precision@10: {avg_precision:.4f}")
    print(f"📊 Recall@10:    {avg_recall:.4f}")


In [0]:
evaluate_precision_recall_at_10(model, test, df_movies)

In [0]:
import matplotlib.pyplot as plt
import numpy as np

# 지표 이름
metrics = ['Precision@10', 'Recall@10']

# 개선 전, 후 값
before = [0.0100, 0.0002]
after = [0.1080, 0.0744]

x = np.arange(len(metrics))  # X축 위치
width = 0.35                 # 바 너비

fig, ax = plt.subplots()
bars1 = ax.bar(x - width/2, before, width, label='Before', color='lightcoral')
bars2 = ax.bar(x + width/2, after, width, label='After', color='skyblue')

# 레이블 및 타이틀
ax.set_ylabel('Score')
ax.set_title('Performance Comparison: Before vs After')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# 값 표시
def autolabel(bars):
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.4f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 텍스트 오프셋
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(bars1)
autolabel(bars2)

plt.ylim(0, max(after) + 0.05)
plt.tight_layout()
plt.show()


## 2. ALS 기반 사용자 영화 추천

In [0]:
        [사용자]
           │
     ┌─────▼─────┐
     │  설문 응답 │  ← 사용자의 취향(장르 등)
     └─────┬─────┘
           │
     ┌─────▼────────────┐
     │사용자 성향 벡터 생성│
     └─────┬────────────┘
           │
           ▼
[ 사용자 기반 추천 모델 ]  ← Cosine Similarity 등

           ▲
           │
     ┌─────▼─────┐
     │ 평점 데이터 │  ← ratings.csv
     └─────┬─────┘
           ▼
   [ ALS 추천 모델 ]  ← Spark ML ALS

           ▼
  ┌────────┴────────┐
  │ Hybrid 추천 조합│ ← 설문 기반 + ALS 기반 결합 (가중 평균 등)
  └────────┬────────┘
           ▼
     [추천 영화 리스트]
           ▼
       [시각화/출력]


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")
df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies") \
                      .withColumn("movieId", col("movieId").cast("integer"))

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True
)

model = als.fit(train)

user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                            .select("userId", col("rec.movieId"), col("rec.rating"))

windowSpec = Window.orderBy(col("rating").desc())
indexedRecs = userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1) \
                              .select("index", "movieId")

display(indexedRecs)

## ML Flow 사용

In [0]:
%python
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# 0. 실험 설정
mlflow.set_experiment('/Users/1dt011@msacademy.msai.kr/1dt011')

# 1. 데이터 로딩 (Unity Catalog 사용)
catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")
df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies") \
                      .withColumn("movieId", col("movieId").cast("integer"))

# 2. MLflow 실험 시작
with mlflow.start_run(run_name="ALS-Recommender-UnityCatalog"):

    # 하이퍼파라미터
    rank = 10
    maxIter = 10
    regParam = 0.1

    # ALS 모델 정의 및 학습
    als = ALS(
        userCol="userId", itemCol="movieId", ratingCol="rating",
        rank=rank, maxIter=maxIter, regParam=regParam,
        coldStartStrategy="drop", nonnegative=True
    )

    model = als.fit(train)
    predictions = model.transform(test)

    # 평가 지표 계산
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)

    # MLflow 로깅
    mlflow.log_param("rank", rank)
    mlflow.log_param("maxIter", maxIter)
    mlflow.log_param("regParam", regParam)
    mlflow.log_metric("rmse", rmse)

    # 모델 저장
    mlflow.spark.log_model(model, "als_model")

    print(f"✅ MLflow Run Completed - RMSE: {rmse:.4f}")


## 평가지표_ Precision@10 , Recall@10

In [0]:
from pyspark.sql.functions import expr, collect_set, size, array_intersect
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")

als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    coldStartStrategy="drop", nonnegative=True
)

model = als.fit(train)

user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                            .select("userId", col("rec.movieId"), col("rec.rating"))

windowSpec = Window.orderBy(col("rating").desc())
indexedRecs = userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1) \
                              .select("index", "movieId")

positive_test = test.filter(col("rating") >= 4.0) \
                    .groupBy("userId") \
                    .agg(collect_set("movieId").alias("true_items"))

userRecsAll = model.recommendForAllUsers(10)

predicted_items = userRecsAll.select("userId", 
    expr("transform(recommendations, x -> x.movieId)").alias("pred_items")
)

joined = predicted_items.join(positive_test, on="userId")

metrics = joined.withColumn("num_relevant_and_recommended", 
                                size(array_intersect("pred_items", "true_items"))) \
                .withColumn("precision_at_10", 
                                col("num_relevant_and_recommended") / expr("size(pred_items)")) \
                .withColumn("recall_at_10", 
                                col("num_relevant_and_recommended") / expr("size(true_items)"))

precision_recall = metrics.selectExpr("avg(precision_at_10) as avg_precision_at_10", 
                                      "avg(recall_at_10) as avg_recall_at_10")

precision_recall.show()


##**개선**
✅ 개선 포인트 요약
항목	변경 전	변경 후 제안

ALS 파라미터	기본값 사용	rank, maxIter, regParam 조정

추천 수	Top 10	Top 50으로 늘리고 그중 상위 N개 평가

추천 유저	고정 userId	다수 유저에 대해 평가 자동화 가능하게


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")

df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies") \
                      .withColumn("movieId", col("movieId").cast("integer"))

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    rank=20,
    maxIter=15,
    regParam=0.1
)

model = als.fit(train)

user_ids = [123]
user_df = spark.createDataFrame([(uid,) for uid in user_ids], ["userId"])

userRecs = model.recommendForUserSubset(user_df, 50)

userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                            .select("userId", col("rec.movieId"), col("rec.rating"))

windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
topN = 10

indexedRecs = userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1) \
                              .filter(col("index") < topN) \
                              .select("index", "movieId")

display(indexedRecs)


In [0]:
from pyspark.sql.functions import col, countDistinct, collect_set, size
from pyspark.sql import functions as F

# 1. 추천 결과(topNRecs)에는 userId, movieId가 포함되어 있어야 함
#    topNRecs: ALS로 추천된 Top 10 movie per user

# 2. 실제 평가 데이터에서 평점 4.0 이상인 영화만 긍정적으로 간주
actual_relevant = test.filter(col("rating") >= 4.0) \
                      .select("userId", "movieId") \
                      .distinct() \
                      .groupBy("userId") \
                      .agg(collect_set("movieId").alias("actual_movies"))

# 3. 추천 결과를 userId별로 movieId 리스트로 집계
predicted_recs = topNRecs.groupBy("userId") \
                         .agg(collect_set("movieId").alias("predicted_movies"))

# 4. 실제/예측 join
joined = predicted_recs.join(actual_relevant, on="userId", how="inner")

# 5. Precision@10, Recall@10 계산
def precision_recall_udf(predicted, actual):
    predicted_set = set(predicted)
    actual_set = set(actual)
    intersection = predicted_set & actual_set
    precision = len(intersection) / len(predicted_set) if predicted_set else 0.0
    recall = len(intersection) / len(actual_set) if actual_set else 0.0
    return (precision, recall)

from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.sql.functions import udf

schema = StructType([
    StructField("precision", DoubleType(), True),
    StructField("recall", DoubleType(), True)
])

precision_recall_udf_spark = udf(precision_recall_udf, schema)

# 6. 컬럼 생성
scored = joined.withColumn("metrics", precision_recall_udf_spark(col("predicted_movies"), col("actual_movies"))) \
               .select("userId", "metrics.*")

# 7. 평균 계산
avg_scores = scored.select(F.avg("precision").alias("avg_precision_at_10"),
                           F.avg("recall").alias("avg_recall_at_10"))

# 8. 출력
display(avg_scores)


In [0]:
import matplotlib.pyplot as plt
import numpy as np

# 지표 이름
metrics = ['Precision@10', 'Recall@10']

# 개선 전, 후 값
before = [0.00387, 0.00422]
after = [0.01356, 0.01423]

x = np.arange(len(metrics))  # X축 위치
width = 0.35                 # 바 너비

fig, ax = plt.subplots(figsize=(6, 4))
bars1 = ax.bar(x - width/2, before, width, label='Before', color='lightcoral')
bars2 = ax.bar(x + width/2, after, width, label='After', color='skyblue')

# 레이블 및 타이틀
ax.set_ylabel('Score')
ax.set_title('Avg Precision@10 and Recall@10 (Before vs After)')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# 값 표시 (소수점 3자리)
def autolabel(bars):
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 텍스트 오프셋
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(bars1)
autolabel(bars2)

plt.ylim(0, max(after) + 0.005)
plt.tight_layout()
plt.show()
