## 1. 군집화 기반 사용자 영화추천

In [0]:

┌──────────────────────┐
│ 1. 데이터 로딩        │
│ - movies.csv         │
│ - ratings.csv        │
└─────────┬────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 2. 장르 전처리                        │
│ - 장르 explode 및 원-핫 인코딩         │
└─────────┬────────────────────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 3. 사용자 장르 선호도 벡터화           │
│ - 장르 x 평점 → 사용자 프로파일 생성    │
└─────────┬────────────────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 4. 사용자 벡터 → features   │
│ - VectorAssembler 사용     │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 5. KMeans 군집화            │
│ - 사용자 군집 분류           │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────────────┐
│ 6. 클러스터 내 영화 평균 평점 계산    │
│ - 각 군집 내 인기 영화 파악          │
└─────────┬──────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 7. 특정 사용자에게 추천 수행          │
│ - 같은 클러스터 + 안 본 영화          │
│ - 평균 평점 기반 상위 N개 추천        │
└─────────┬───────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 8. 추천 결과 출력                    │
│ - movieId, title, genres, avg_rating│
└─────────────────────────────────────┘


In [0]:
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window

def recommend_movies_by_user(userId: int):
    """Display the top-10 cluster-based movie recommendations for one user.

    Users are clustered with KMeans (k=5, seed=42) on a genre profile: for
    each genre, the share of the user's rated movies that carry that genre.
    Movies the user has not rated are ranked by their average rating inside
    the user's cluster and the ten best are rendered with ``display``.

    Args:
        userId: Id of the user to recommend for.

    Returns:
        None. The displayed frame has the columns ``index`` and ``movieId``;
        ``index`` 0 is the best-ranked movie. If the user has no ratings, a
        message is printed and nothing is displayed.
    """
    catalog = "1dt_team8_databricks"
    schema = "final"
    path = f"{catalog}.{schema}"

    # 1. Load only the tables this function actually uses
    #    (links / tags were read before but never referenced).
    movies = spark.read.table(f"{path}.movies")
    ratings = spark.read.table(f"{path}.ratings")

    # 2. One-hot encode the genres of every movie
    exploded = movies.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = [row[0] for row in exploded.select("genre").distinct().collect()]
    genre_features = exploded.groupBy("movieId").agg(
        *[
            # NOTE: `max` is pyspark.sql.functions.max (imported by this cell).
            max(when(col("genre") == genre, 1).otherwise(0)).alias(f"genre_{genre}")
            for genre in distinct_genres
        ]
    )

    # 3. User genre profile: mean of the one-hot flags over the rated movies
    ratings_with_genres = ratings.join(genre_features, on="movieId", how="inner")
    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )

    # 4. Vectorize the profiles and cluster the users
    feature_cols = [c for c in user_profile.columns if c.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile)
    model = KMeans(k=5, seed=42).fit(user_features)
    user_clusters = model.transform(user_features).select("userId", "prediction")

    # Guard: a user without ratings has no cluster row (previously IndexError).
    cluster_rows = (
        user_clusters.filter(col("userId") == userId).select("prediction").collect()
    )
    if not cluster_rows:
        print(f"userId {userId} has no ratings; no cluster-based recommendation available.")
        return
    user_cluster = cluster_rows[0][0]

    # Average rating of every movie inside each cluster
    movie_avg_by_cluster = (
        ratings.join(user_clusters, on="userId")
        .groupBy("prediction", "movieId")
        .agg(avg("rating").alias("avg_rating"))
    )
    movie_avg_with_titles = movie_avg_by_cluster.join(
        movies.select("movieId", "title", "genres"), on="movieId"
    )

    # 5. Candidate pool: movies of the user's cluster that the user has not rated
    user_seen_movies = ratings.filter(col("userId") == userId).select("movieId").distinct()
    recommend_pool_unseen = movie_avg_with_titles.filter(
        col("prediction") == user_cluster
    ).join(user_seen_movies, on="movieId", how="left_anti")

    # movieId breaks ties so the top-10 is deterministic.
    ranking = [col("avg_rating").desc(), col("movieId")]
    top_recommendations = (
        recommend_pool_unseen
        .filter(col("genres") != "(no genres listed)")
        .orderBy(*ranking)
        .limit(10)
        .select("movieId", "avg_rating")
    )

    # 6. Number the rows in ranking order (not by movieId) so index 0 is the best movie
    indexed = top_recommendations.withColumn(
        "index", row_number().over(Window.orderBy(*ranking)) - 1
    ).select("index", "movieId")

    display(indexed)


In [0]:
# Example run: show the cluster-based recommendations for userId 2.
recommend_movies_by_user(2)

## ML Flow 사용

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

def run_kmeans_with_mlflow(user_profile_df, k=5, seed=42):
    """Fit KMeans on the ``pref_*`` columns and track the run in MLflow.

    Args:
        user_profile_df: DataFrame with a ``userId`` column and one or more
            feature columns whose names start with ``pref_``.
        k: Number of clusters.
        seed: Random seed passed to KMeans.

    Returns:
        DataFrame with the columns ``userId`` and ``prediction``.
    """
    # Select the MLflow experiment that receives the run
    mlflow.set_experiment("/Users/1dt011@msacademy.msai.kr/1dt011")

    pref_columns = [name for name in user_profile_df.columns if name.startswith("pref_")]
    user_features = VectorAssembler(
        inputCols=pref_columns, outputCol="features"
    ).transform(user_profile_df)

    with mlflow.start_run(run_name="KMeans_User_Clustering"):
        for param_name, param_value in (("k", k), ("seed", seed)):
            mlflow.log_param(param_name, param_value)

        fitted = KMeans(k=k, seed=seed).fit(user_features)

        # Training cost reported by the model summary, logged under the name "wsse"
        mlflow.log_metric("wsse", fitted.summary.trainingCost)

        # Store the fitted model as a run artifact
        mlflow.spark.log_model(fitted, "kmeans_model")

        # Only the user id and its cluster assignment are handed back
        assignments = fitted.transform(user_features).select("userId", "prediction")

    return assignments


In [0]:
from pyspark.sql.functions import col, split, explode, avg

# 1. Load the movie and rating tables
movies_df = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
ratings_df = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

# 2. Cast the ids to int and the rating to float
ratings_df = (
    ratings_df
    .withColumn("movieId", col("movieId").cast("int"))
    .withColumn("rating", col("rating").cast("float"))
    .withColumn("userId", col("userId").cast("int"))
)
movies_df = movies_df.withColumn("movieId", col("movieId").cast("int"))

# 3. One row per (movie, genre)
movie_genres = (
    movies_df
    .withColumn("genre", explode(split(col("genres"), "\\|")))
    .select("movieId", "genre")
)

# 4. Attach the genres to every rating
ratings_with_genre = ratings_df.join(movie_genres, on="movieId", how="inner")

# 5. Mean rating per user and genre
user_genre_pref = (
    ratings_with_genre
    .groupBy("userId", "genre")
    .agg(avg("rating").alias("avg_rating"))
)

# 6-7. Pivot to one column per genre; genres a user never rated become 0
user_profile_df = (
    user_genre_pref
    .groupBy("userId")
    .pivot("genre")
    .agg(avg("avg_rating"))
    .na.fill(0)
)

# 8. Prefix every genre column with 'pref_' (userId stays untouched)
for col_name in user_profile_df.columns:
    if col_name == "userId":
        continue
    user_profile_df = user_profile_df.withColumnRenamed(col_name, f"pref_{col_name}")

# 9. Cluster the users with the MLflow-tracked KMeans helper defined earlier
result_df = run_kmeans_with_mlflow(user_profile_df, k=5, seed=42)

# 10. Inspect the cluster assignments
display(result_df)


## 평가지표: Precision@10, Recall@10

In [0]:
%python
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window

def recommend_movies_by_user(userId: int):
    """Return the top-10 cluster-based movie recommendations for one user.

    Users are clustered with KMeans (k=5, seed=42) on a genre profile: for
    each genre, the share of the user's rated movies that carry that genre.
    Movies the user has not rated are ranked by their average rating inside
    the user's cluster.

    Args:
        userId: Id of the user to recommend for.

    Returns:
        DataFrame with the columns ``index`` and ``movieId`` (``index`` 0 is
        the best-ranked movie), or ``None`` when the user has no ratings and
        therefore no cluster.
    """
    # 1. Load only the tables this function actually uses
    #    (links / tags were read before but never referenced).
    movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    ratings = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

    # 2. One-hot encode the genres of every movie
    exploded = movies.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = [row[0] for row in exploded.select("genre").distinct().collect()]
    genre_features = exploded.groupBy("movieId").agg(
        *[
            # NOTE: `max` is pyspark.sql.functions.max (imported by this cell).
            max(when(col("genre") == genre, 1).otherwise(0)).alias(f"genre_{genre}")
            for genre in distinct_genres
        ]
    )

    # 3. User genre profile: mean of the one-hot flags over the rated movies
    ratings_with_genres = ratings.join(genre_features, on="movieId", how="inner")
    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )

    # 4. Vectorize the profiles and cluster the users
    feature_cols = [name for name in user_profile.columns if name.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile)
    model = KMeans(k=5, seed=42).fit(user_features)
    user_clusters = model.transform(user_features).select("userId", "prediction")

    # Guard: a user without ratings has no cluster row (previously IndexError).
    cluster_rows = (
        user_clusters.filter(col("userId") == userId).select("prediction").collect()
    )
    if not cluster_rows:
        return None
    user_cluster = cluster_rows[0][0]

    # Average rating of every movie inside each cluster
    movie_avg_by_cluster = (
        ratings.join(user_clusters, on="userId")
        .groupBy("prediction", "movieId")
        .agg(avg("rating").alias("avg_rating"))
    )
    movie_avg_with_titles = movie_avg_by_cluster.join(
        movies.select("movieId", "title", "genres"), on="movieId"
    )

    # 5. Candidate pool: movies of the user's cluster that the user has not rated
    user_seen_movies = ratings.filter(col("userId") == userId).select("movieId").distinct()
    recommend_pool_unseen = movie_avg_with_titles.filter(
        col("prediction") == user_cluster
    ).join(user_seen_movies, on="movieId", how="left_anti")

    # movieId breaks ties so the top-10 is deterministic.
    ranking = [col("avg_rating").desc(), col("movieId")]
    top_recommendations = (
        recommend_pool_unseen
        .filter(col("genres") != "(no genres listed)")
        .orderBy(*ranking)
        .limit(10)
        .select("movieId", "avg_rating")
    )

    # 6. Number the rows in ranking order (not by movieId) so index 0 is the best movie
    indexed = top_recommendations.withColumn(
        "index", row_number().over(Window.orderBy(*ranking)) - 1
    ).select("index", "movieId")

    # The evaluation helper in this cell needs the DataFrame back;
    # previously the function fell off the end and returned None.
    return indexed


def evaluate_precision_recall_at_10(userId: int):
    """Compute Precision@10 and Recall@10 of the cluster recommender for one user.

    A movie counts as relevant when the user rated it 4.0 or higher in the
    ``movielens-small`` ratings table.

    NOTE(review): ``recommend_movies_by_user`` removes every movie the user
    has already rated, while relevance is taken from that same ratings table,
    so the overlap is empty by construction. A held-out split is needed for
    meaningful numbers.

    Args:
        userId: Id of the user to evaluate.

    Returns:
        Tuple ``(precision, recall)``; a component is 0.0 when its
        denominator would be zero.

    Raises:
        ValueError: If ``recommend_movies_by_user`` returns ``None``.
    """
    recommended_movies = recommend_movies_by_user(userId)
    if recommended_movies is None:
        raise ValueError(f"No recommendations found for userId: {userId}")

    # Collect once: each extra Spark action would re-run the recommendation pipeline.
    recommended_ids = {
        row["movieId"] for row in recommended_movies.select("movieId").limit(10).collect()
    }

    # `df3` only exists inside recommend_movies_by_user, so read the same
    # ratings table here instead of relying on an undefined global.
    ratings = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")
    relevant_ids = {
        row["movieId"]
        for row in ratings.filter((col("userId") == userId) & (col("rating") >= 4.0))
        .select("movieId")
        .distinct()
        .collect()
    }

    true_positives = len(recommended_ids & relevant_ids)
    # Guard both ratios against empty sets (previously ZeroDivisionError).
    precision = true_positives / len(recommended_ids) if recommended_ids else 0.0
    recall = true_positives / len(relevant_ids) if relevant_ids else 0.0

    return precision, recall

In [0]:
# Example run: Precision@10 / Recall@10 of the cluster recommender for userId 26.
evaluate_precision_recall_at_10(26)

In [0]:
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window
import pandas as pd

# 1. 데이터 로딩 (이미 로딩되었다고 가정)
# train, validation, test = ...

# 2. 추천 함수 (train 데이터 기반)
def recommend_movies_by_user_return_df(userId: int):
    """Return up to ten cluster-based recommendations built from ``train``.

    Reads the global ``train`` DataFrame, clusters its users with KMeans
    (k=5, seed=42) on their genre profile and ranks the movies the user has
    not rated in ``train`` by their average rating inside the user's cluster.
    The whole pipeline, including the KMeans fit, is rebuilt on every call.

    Args:
        userId: Id of the user to recommend for.

    Returns:
        DataFrame with a single ``movieId`` column (at most 10 rows), or
        ``None`` when the user does not appear in ``train``.
    """
    # Movie metadata with the pipe-separated genre string
    movie_table = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")

    # One row per (movie, genre), then one 0/1 flag column per genre
    one_hot = movie_table.withColumn("genre", explode(split("genres", "\\|")))
    genre_names = [row[0] for row in one_hot.select("genre").distinct().collect()]
    for name in genre_names:
        flag = when(col("genre") == name, 1).otherwise(0)
        one_hot = one_hot.withColumn(f"genre_{name}", flag)

    flag_aggs = [max(f"genre_{name}").alias(f"genre_{name}") for name in genre_names]
    genre_features = one_hot.groupBy("movieId").agg(*flag_aggs)

    # User genre profile computed from the training ratings only
    pref_aggs = [avg(f"genre_{name}").alias(f"pref_{name}") for name in genre_names]
    user_profile = (
        train.join(genre_features, on="movieId", how="inner")
        .groupBy("userId")
        .agg(*pref_aggs)
    )

    pref_cols = [name for name in user_profile.columns if name.startswith("pref_")]
    user_features = VectorAssembler(inputCols=pref_cols, outputCol="features").transform(
        user_profile
    )

    fitted = KMeans(k=5, seed=42).fit(user_features)
    user_clusters = fitted.transform(user_features).select("userId", "prediction")

    # Average training rating of every movie within each cluster
    cluster_movie_avg = (
        train.join(user_clusters, on="userId")
        .groupBy("prediction", "movieId")
        .agg(avg("rating").alias("avg_rating"))
    )
    cluster_movie_info = cluster_movie_avg.join(
        movie_table.select("movieId", "title", "genres"), on="movieId"
    )

    # Movies the user already rated in train are excluded later
    already_rated = train.filter(col("userId") == userId).select("movieId").distinct()

    matches = user_clusters.filter(col("userId") == userId).collect()
    if not matches:
        return None
    own_cluster = matches[0]["prediction"]

    candidates = cluster_movie_info.filter(col("prediction") == own_cluster).join(
        already_rated, on="movieId", how="left_anti"
    )

    return (
        candidates
        .filter(col("genres") != "(no genres listed)")
        .orderBy(col("avg_rating").desc())
        .limit(10)
        .select("movieId")
    )

# 3. 평가지표 함수 (validation 또는 test 기준 실제 평점과 비교)
def evaluate_precision_recall_safe(userId: int, top_k: int = 10):
    """Compute Precision@k and Recall@k of the cluster recommender for one user.

    Relevant movies are those the user rated 4.0 or higher in the global
    ``validation`` DataFrame. Failures of the recommender are reported on
    stdout and scored as zero instead of aborting the evaluation loop.

    Args:
        userId: Id of the user to evaluate.
        top_k: Number of recommendations taken into account; also the
            precision denominator.

    Returns:
        Dict with the keys ``userId``, ``precision`` and ``recall``.
    """
    zero_scores = {"userId": userId, "precision": 0.0, "recall": 0.0}

    try:
        rec_df = recommend_movies_by_user_return_df(userId)
    except Exception as exc:
        # Best effort: keep evaluating the other users, but do not hide the failure.
        print(f"[WARN] recommendation failed for userId={userId}: {exc!r}")
        return zero_scores

    if rec_df is None:
        return zero_scores

    # Collect once (count() followed by collect() ran the pipeline twice) and
    # honour top_k so precision can never exceed 1 when top_k < 10.
    predicted = [row["movieId"] for row in rec_df.select("movieId").collect()][:top_k]
    if not predicted:
        return zero_scores

    # Movies the user actually liked (rating >= 4.0) in the validation split
    actual = {
        row["movieId"]
        for row in validation.filter((col("userId") == userId) & (col("rating") >= 4.0))
        .select("movieId")
        .distinct()
        .collect()
    }
    if not actual:
        return zero_scores

    hits = set(predicted) & actual
    return {
        "userId": userId,
        "precision": len(hits) / top_k,
        "recall": len(hits) / len(actual),
    }

# 4. Users eligible for evaluation: at least five validation ratings of 4.0 or higher
qualified_users_df = (
    validation
    .filter(col("rating") >= 4.0)
    .groupBy("userId")
    .count()
    .filter(col("count") >= 5)
    .orderBy(col("count").desc())
)

user_ids = [row["userId"] for row in qualified_users_df.collect()]

# 5. Evaluate the 30 users with the most liked movies
results = [evaluate_precision_recall_safe(uid) for uid in user_ids[:30]]

# 6. Average precision and recall over the evaluated users
df_results = pd.DataFrame(results)
avg_precision = df_results["precision"].mean()
avg_recall = df_results["recall"].mean()

print(f"▶ 평균 Precision@10: {avg_precision:.4f}")
print(f"▶ 평균 Recall@10: {avg_recall:.4f}")


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
import pandas as pd

# Train the ALS model on the training split
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=10,
    maxIter=10,
    regParam=0.1,
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",  # drop NaN predictions during evaluation
    seed=42,
)

als_model = als.fit(train)

# Top-10 recommendations for every user
userRecs = als_model.recommendForAllUsers(10)

# Users eligible for evaluation: at least five validation ratings of 4.0 or higher
qualified_users_df = (
    validation
    .filter(col("rating") >= 4.0)
    .groupBy("userId")
    .count()
    .filter(col("count") >= 5)
    .orderBy(col("count").desc())
)

# Keep the 30 users with the most liked movies
user_ids = [row["userId"] for row in qualified_users_df.collect()][:30]

# 평가지표 함수
def evaluate_als_precision_recall(userId: int):
    """Compute precision and recall of the ALS recommendations for one user.

    Recommendations come from the global ``userRecs`` DataFrame; relevant
    movies are those the user rated 4.0 or higher in the global
    ``validation`` DataFrame.

    Args:
        userId: Id of the user to evaluate.

    Returns:
        Dict with the keys ``userId``, ``precision`` and ``recall``. Both
        scores are 0.0 when the user has no recommendations or no relevant
        movies.
    """
    zero_scores = {"userId": userId, "precision": 0.0, "recall": 0.0}

    # Recommended movie ids for this user
    rec_rows = userRecs.filter(col("userId") == userId).select("recommendations").collect()
    if not rec_rows:
        return zero_scores

    predicted = [rec.movieId for rec in rec_rows[0]["recommendations"]]
    if not predicted:
        # Guard the precision denominator against an empty recommendation list.
        return zero_scores

    # Movies the user actually liked; de-duplicated so repeated rows cannot
    # inflate the recall denominator (matches the KMeans evaluator).
    actual = {
        row["movieId"]
        for row in validation.filter((col("userId") == userId) & (col("rating") >= 4.0))
        .select("movieId")
        .distinct()
        .collect()
    }
    if not actual:
        return zero_scores

    hits = set(predicted) & actual
    return {
        "userId": userId,
        "precision": len(hits) / len(predicted),
        "recall": len(hits) / len(actual),
    }

# Evaluate every selected user
results = [evaluate_als_precision_recall(uid) for uid in user_ids]

# Average the per-user scores
df_results = pd.DataFrame(results)
avg_precision = df_results["precision"].mean()
avg_recall = df_results["recall"].mean()

print(f"▶ 평균 Precision@10: {avg_precision:.4f}")
print(f"▶ 평균 Recall@10: {avg_recall:.4f}")

## 2. ALS 기반 사용자 영화 추천

In [0]:
        [사용자]
           │
     ┌─────▼─────┐
     │  설문 응답 │  ← 사용자의 취향(장르 등)
     └─────┬─────┘
           │
     ┌─────▼────────────┐
     │사용자 성향 벡터 생성│
     └─────┬────────────┘
           │
           ▼
[ 사용자 기반 추천 모델 ]  ← Cosine Similarity 등

           ▲
           │
     ┌─────▼─────┐
     │ 평점 데이터 │  ← ratings.csv
     └─────┬─────┘
           ▼
   [ ALS 추천 모델 ]  ← Spark ML ALS

           ▼
  ┌────────┴────────┐
  │ Hybrid 추천 조합│ ← 설문 기반 + ALS 기반 결합 (가중 평균 등)
  └────────┬────────┘
           ▼
     [추천 영화 리스트]
           ▼
       [시각화/출력]


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Table locations in Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

# Load the rating splits and the movie metadata
train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")
df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. ALS configuration (remaining hyperparameters keep their library defaults)
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    coldStartStrategy="drop",
)

# 3. Fit on the training split
model = als.fit(train)

# 4. Top-10 recommendations for one example user (userId 123)
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

# 5. Flatten the recommendation array into one row per movie
userRecsExploded = (
    userRecs
    .select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# 6. Zero-based index ordered by predicted rating, highest first
windowSpec = Window.orderBy(col("rating").desc())
indexedRecs = (
    userRecsExploded
    .withColumn("index", row_number().over(windowSpec) - 1)
    .select("index", "movieId")
)

# 7. Show the result
display(indexedRecs)

## ML Flow 사용

In [0]:
%python
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# 0. Select the MLflow experiment
mlflow.set_experiment('/Users/1dt011@msacademy.msai.kr/1dt011')

# 1. Load the data from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")
df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. Tracked training run
with mlflow.start_run(run_name="ALS-Recommender-UnityCatalog"):

    # Hyperparameters
    rank = 10
    maxIter = 10
    regParam = 0.1

    # Define and fit the ALS model
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        rank=rank,
        maxIter=maxIter,
        regParam=regParam,
        coldStartStrategy="drop",
        nonnegative=True,
    )
    model = als.fit(train)
    predictions = model.transform(test)

    # RMSE of the predicted ratings on the test split
    evaluator = RegressionEvaluator(
        metricName="rmse", labelCol="rating", predictionCol="prediction"
    )
    rmse = evaluator.evaluate(predictions)

    # Log hyperparameters and the metric
    mlflow.log_params({"rank": rank, "maxIter": maxIter, "regParam": regParam})
    mlflow.log_metric("rmse", rmse)

    # Store the fitted model as a run artifact
    mlflow.spark.log_model(model, "als_model")

    print(f"✅ MLflow Run Completed - RMSE: {rmse:.4f}")


## 평가지표: Precision@10, Recall@10

In [0]:
from pyspark.sql.functions import expr, collect_set, size, array_intersect
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the data from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
path = f"{catalog}.{schema}"

train = spark.read.table(f"{path}.train_df")
test = spark.read.table(f"{path}.test_df")

# 2. ALS configuration
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    coldStartStrategy="drop",
)

# 3. Fit on the training split
model = als.fit(train)

# 4. Top-10 recommendations for one example user (userId 123)
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

# 5. Flatten the recommendation array into one row per movie
userRecsExploded = (
    userRecs
    .select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# Zero-based rank ordered by predicted rating, highest first
windowSpec = Window.orderBy(col("rating").desc())
indexedRecs = (
    userRecsExploded
    .withColumn("index", row_number().over(windowSpec) - 1)
    .select("index", "movieId")
)

# 6. (indexedRecs is built above but not displayed in this cell)

# -------------------------------------- #

# 7. Ground truth: per user, the set of test movies rated 4.0 or higher
positive_test = (
    test
    .filter(col("rating") >= 4.0)
    .groupBy("userId")
    .agg(collect_set("movieId").alias("true_items"))
)

# 8. Top-10 recommendations for every user
userRecsAll = model.recommendForAllUsers(10)

# 9. Keep only the movie ids of each recommendation list
predicted_items = userRecsAll.select(
    "userId",
    expr("transform(recommendations, x -> x.movieId)").alias("pred_items"),
)

# 10. Pair predictions with the ground truth (users present in both)
joined = predicted_items.join(positive_test, on="userId")

# 11. Precision@10 and Recall@10 per user
hit_count = size(array_intersect("pred_items", "true_items"))
metrics = (
    joined
    .withColumn("num_relevant_and_recommended", hit_count)
    .withColumn(
        "precision_at_10",
        col("num_relevant_and_recommended") / size(col("pred_items")),
    )
    .withColumn(
        "recall_at_10",
        col("num_relevant_and_recommended") / size(col("true_items")),
    )
)

# 12. Average over all users
precision_recall = metrics.selectExpr(
    "avg(precision_at_10) as avg_precision_at_10",
    "avg(recall_at_10) as avg_recall_at_10",
)

# Show the result
precision_recall.show()


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the data from Unity Catalog
catalog = "1dt_team8_databricks"
schema = "final"
base_path = f"{catalog}.{schema}"

train = spark.read.table(f"{base_path}.train_df")
validation = spark.read.table(f"{base_path}.validation_df")
test = spark.read.table(f"{base_path}.test_df")

df_movies = (
    spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    .withColumn("movieId", col("movieId").cast("integer"))
)

# 2. ALS configuration with tuned hyperparameters
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=50,
    maxIter=15,
    regParam=0.05,
    nonnegative=True,
    coldStartStrategy="drop",
    seed=42,
)

# 3. Fit on the training split
model = als.fit(train)

# 4. Top-50 recommendations for every user
userRecs = model.recommendForAllUsers(50)

# To inspect a single user (e.g. userId 123), uncomment the line below
# userRecs = model.recommendForUserSubset(spark.createDataFrame([(123,)], ["userId"]), 50)

# 5. Flatten the recommendation array into one row per movie
userRecsExploded = (
    userRecs
    .select("userId", explode("recommendations").alias("rec"))
    .select("userId", col("rec.movieId"), col("rec.rating"))
)

# 6. Per user, keep the ten movies with the highest predicted rating
windowSpec = Window.partitionBy("userId").orderBy(col("rating").desc())
topNRecs = (
    userRecsExploded
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") <= 10)
    .drop("rank")
)

# 7. Attach the movie titles (optional)
result_with_titles = (
    topNRecs
    .join(df_movies, on="movieId", how="left")
    .select("userId", "movieId", "title", "rating")
)

# 8. Show the result
display(result_with_titles)


In [0]:
from pyspark.sql.functions import col, countDistinct, collect_set, size
from pyspark.sql import functions as F

# 1. topNRecs (built in the previous cell) must contain userId and movieId:
#    the ALS top-10 movies per user

# 2. Ground truth: test movies rated 4.0 or higher count as relevant
actual_relevant = (
    test
    .filter(F.col("rating") >= 4.0)
    .select("userId", "movieId")
    .distinct()
    .groupBy("userId")
    .agg(F.collect_set("movieId").alias("actual_movies"))
)

# 3. Recommended movie ids per user
predicted_recs = (
    topNRecs
    .groupBy("userId")
    .agg(F.collect_set("movieId").alias("predicted_movies"))
)

# 4. Keep the users that have both recommendations and relevant movies
joined = predicted_recs.join(actual_relevant, on="userId", how="inner")

# 5. Precision@10, Recall@10 계산
def precision_recall_udf(predicted, actual):
    """Return ``(precision, recall)`` for two collections of item ids.

    Both inputs are de-duplicated first. Precision is the share of distinct
    predicted items that are relevant, recall the share of distinct relevant
    items that were predicted; a score is 0.0 when its denominator is empty.
    """
    recommended = set(predicted)
    relevant = set(actual)
    hit_count = len(recommended.intersection(relevant))

    if recommended:
        precision = hit_count / len(recommended)
    else:
        precision = 0.0

    if relevant:
        recall = hit_count / len(relevant)
    else:
        recall = 0.0

    return (precision, recall)

from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.sql.functions import udf

# Return type of the UDF: a struct holding both scores
schema = (
    StructType()
    .add(StructField("precision", DoubleType(), True))
    .add(StructField("recall", DoubleType(), True))
)

precision_recall_udf_spark = udf(precision_recall_udf, returnType=schema)

# 6. Score every user and expand the struct into two columns
metrics_column = precision_recall_udf_spark(col("predicted_movies"), col("actual_movies"))
scored = joined.withColumn("metrics", metrics_column).select("userId", "metrics.*")

# 7. Average over all users
avg_scores = scored.select(
    F.avg("precision").alias("avg_precision_at_10"),
    F.avg("recall").alias("avg_recall_at_10"),
)

# 8. Show the result
display(avg_scores)
