## ALS 기반 전체 사용자 추천

In [0]:
# Recommendations for all users with ALS.
# ALS (Alternating Least Squares) factorizes the user-item rating matrix to learn latent factors for users and items.
# It scales to large datasets because Spark runs it in parallel.
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, explode

# Load the MovieLens tables from the catalog.
# df2 (links) and df4 (tags) are loaded but not referenced again in this cell.
df1 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
df2 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.links")
df3 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")
df4 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.tags")

# Cast the columns ALS needs to numeric types (integer ids, float rating).
ratings = df3.select(
    col("userId").cast("integer"),
    col("movieId").cast("integer"),
    col("rating").cast("float")
)

# Train/test split (80/20, fixed seed). `test` is not used in this cell.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# Define the ALS model.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",  # drop rows whose prediction is NaN
    nonnegative=True
)

# Fit the model on the training split.
model = als.fit(training)

# Top-5 recommendations for every user.
userRecs = model.recommendForAllUsers(5)
display(userRecs)

# Recommendations for one specific user (e.g. 123).
# NOTE: this rebinds `userRecs`, so everything below works on the single-user result.
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 5)
display(userRecs)

# Flatten the nested `recommendations` array into one row per (userId, movieId, rating).
userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                           .select("userId", col("rec.movieId"), col("rec.rating"))

# Use df1 as the movie metadata source; movieId is cast so it matches the ALS output type.
movies = df1.select(
    col("movieId").cast("integer"),
    "title",
    "genres"
)

# Attach title/genres to the recommended movies.
recommended_movies = userRecsExploded.join(movies, on="movieId", how="inner")
display(recommended_movies)

In [0]:
# Example survey answer from a user (preferred genres).
user_selected_genres = ['Action', 'Sci-Fi', 'Thriller']

# Prepare the movie genre data.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Spark -> pandas conversion (rows with nulls are dropped first).
# `df1` is the movies table loaded in an earlier cell.
movies_pd = df1.select("movieId", "title", "genres").dropna().toPandas()
# Turn the "|"-delimited genre string into a list of genre names.
movies_pd['genres'] = movies_pd['genres'].apply(lambda x: x.split('|'))

# One-hot encode the genres (one column per entry of mlb.classes_).
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_pd['genres'])

# Build the user's genre vector in the same column order as mlb.classes_.
user_vec = [1 if genre in user_selected_genres else 0 for genre in mlb.classes_]

# Cosine similarity between the single user vector and every movie row;
# [0] takes the one result row that corresponds to the user.
similarities = cosine_similarity([user_vec], genre_matrix)[0]
movies_pd['similarity'] = similarities

# Top-10 movies by similarity.
recommended = movies_pd.sort_values(by='similarity', ascending=False).head(10)

# Show the result.
display(recommended[['title', 'genres', 'similarity']])

In [0]:
from pyspark.sql.functions import col, split, explode, avg, desc

# 1. Load data from the CSV volume (can be skipped if already executed).
#    CSV columns are read as strings, hence the explicit casts.
movies_df = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/movies.csv").withColumn("movieId", col("movieId").cast("int"))
ratings_df = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/ratings.csv") \
                       .withColumn("userId", col("userId").cast("int")) \
                       .withColumn("movieId", col("movieId").cast("int")) \
                       .withColumn("rating", col("rating").cast("float"))

# User to recommend for (set to the desired user id).
target_user_id = 123  # example

# 2. Movies this user rated highly (rating >= 4).
high_rated_movies = ratings_df.filter((col("userId") == target_user_id) & (col("rating") >= 4.0))

# 3. Extract the genres of those movies: one row per (movie, genre).
movies_with_genres = movies_df.withColumn("genre", explode(split("genres", "\\|")))

# Distinct genres that appear among the user's highly rated movies.
user_preferred_genres = high_rated_movies.join(movies_with_genres, on="movieId") \
                                         .select("genre") \
                                         .distinct()

# 4. Among all movies in those genres, find the highly rated ones.
# All movies that have at least one preferred genre (one row per matching genre).
genre_movies = movies_with_genres.join(user_preferred_genres, on="genre")

# Average rating per movie over all users.
movie_avg_ratings = ratings_df.groupBy("movieId").agg(avg("rating").alias("avg_rating"))

# Attach the average rating; the left join keeps unrated movies (avg_rating is null),
# and dropDuplicates collapses the per-genre rows back to one row per movie.
genre_movies_with_ratings = genre_movies.join(movie_avg_ratings, on="movieId", how="left") \
                                        .select("movieId", "title", "genres", "avg_rating") \
                                        .dropDuplicates(["movieId"])

# 5. Exclude movies the user has already rated.
seen_movies = ratings_df.filter(col("userId") == target_user_id).select("movieId").distinct()

recommendations = genre_movies_with_ratings.join(seen_movies, on="movieId", how="left_anti")

# 6. Sort by average rating and keep the top N (10).
top_n_recommendations = recommendations.orderBy(desc("avg_rating")).limit(10)

# Print the result.
top_n_recommendations.select("movieId", "title", "genres", "avg_rating").show(truncate=False)


## 군집화 기반 사용자 영화추천

In [0]:

┌──────────────────────┐
│ 1. 데이터 로딩        │
│ - movies.csv         │
│ - ratings.csv        │
└─────────┬────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 2. 장르 전처리                        │
│ - 장르 explode 및 원-핫 인코딩         │
└─────────┬────────────────────────────┘
          │
          ▼
┌──────────────────────────────────────┐
│ 3. 사용자 장르 선호도 벡터화           │
│ - 장르 x 평점 → 사용자 프로파일 생성    │
└─────────┬────────────────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 4. 사용자 벡터 → features   │
│ - VectorAssembler 사용     │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────┐
│ 5. KMeans 군집화            │
│ - 사용자 군집 분류           │
└─────────┬──────────────────┘
          │
          ▼
┌────────────────────────────────────┐
│ 6. 클러스터 내 영화 평균 평점 계산    │
│ - 각 군집 내 인기 영화 파악          │
└─────────┬──────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 7. 특정 사용자에게 추천 수행          │
│ - 같은 클러스터 + 안 본 영화          │
│ - 평균 평점 기반 상위 N개 추천        │
└─────────┬───────────────────────────┘
          │
          ▼
┌─────────────────────────────────────┐
│ 8. 추천 결과 출력                    │
│ - movieId, title, genres, avg_rating│
└─────────────────────────────────────┘


In [0]:
from pyspark.sql.functions import col, split, explode, avg, when, max, row_number
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window

def recommend_movies_by_user(userId: int):
    """Display the top-10 unseen movies for ``userId`` taken from the user's KMeans cluster.

    Users are clustered (k=5) on the share of their rated movies that belong to
    each genre. Movies are then scored by their average rating inside the
    target user's cluster, movies the user already rated are removed, and the
    ten best remaining movies are displayed as ``(index, movieId)`` where
    ``index`` is the 0-based recommendation rank (0 = highest average rating).

    Args:
        userId: Id of the user to recommend for.

    Raises:
        ValueError: If ``userId`` has no ratings and therefore no cluster.
    """
    # 1. Load data (only movies and ratings are needed).
    df1 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
    df3 = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

    # 2. Genre preprocessing: one row per (movie, genre), then one 0/1 column per genre.
    movies_with_genres = df1.withColumn("genre", explode(split("genres", "\\|")))
    distinct_genres = [
        row["genre"]
        for row in movies_with_genres.select("genre").distinct().collect()
    ]

    # `max` is pyspark.sql.functions.max (imported at the top of this cell), not the builtin.
    # Aggregating the indicator directly avoids adding one column per genre in a loop.
    genre_features = movies_with_genres.groupBy("movieId").agg(
        *[
            max(when(col("genre") == genre, 1).otherwise(0)).alias(f"genre_{genre}")
            for genre in distinct_genres
        ]
    )

    # 3. User profile: fraction of the user's rated movies that carry each genre.
    ratings_with_genres = df3.join(genre_features, on="movieId", how="inner")
    user_profile = ratings_with_genres.groupBy("userId").agg(
        *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
    )

    # 4. Vectorize the profiles and cluster the users.
    feature_cols = [name for name in user_profile.columns if name.startswith("pref_")]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_features = assembler.transform(user_profile)

    kmeans = KMeans(k=5, seed=42)
    model = kmeans.fit(user_features)

    user_clusters = model.transform(user_features).select("userId", "prediction")
    ratings_with_cluster = df3.join(user_clusters, on="userId")
    movie_avg_by_cluster = ratings_with_cluster.groupBy("prediction", "movieId") \
        .agg(avg("rating").alias("avg_rating"))

    movie_avg_with_titles = movie_avg_by_cluster.join(
        df1.select("movieId", "title", "genres"), on="movieId"
    )

    # 5. Recommendation logic.
    user_seen_movies = df3.filter(col("userId") == userId).select("movieId").distinct()
    cluster_row = user_clusters.filter(col("userId") == userId).select("prediction").first()
    if cluster_row is None:
        # A user without ratings never enters user_profile, so there is no cluster to use.
        raise ValueError(f"userId {userId} has no ratings; cannot assign a cluster")
    user_cluster = cluster_row[0]

    recommend_pool = movie_avg_with_titles.filter(col("prediction") == user_cluster)
    recommend_pool_unseen = recommend_pool.join(user_seen_movies, on="movieId", how="left_anti")

    # movieId is a tie-breaker so the selected top 10 and their order are deterministic.
    top_recommendations = recommend_pool_unseen \
        .filter(col("genres") != "(no genres listed)") \
        .orderBy(col("avg_rating").desc(), col("movieId")) \
        .limit(10)

    # 6. Add the rank index. It is computed from avg_rating (not movieId) so that
    #    index 0 is the best recommendation, then avg_rating is dropped from the output.
    rank_window = Window.orderBy(col("avg_rating").desc(), col("movieId"))
    indexed = top_recommendations \
        .withColumn("index", row_number().over(rank_window) - 1) \
        .select("index", "movieId") \
        .orderBy("index")

    display(indexed)


In [0]:
# Example run: display the cluster-based recommendations for user 2.
recommend_movies_by_user(2)

In [0]:
%python
#군집화를 활용한 그 사람이 좋아할 만한 영화 추천 시스템
from pyspark.sql.functions import col, split, explode, avg, when, max
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# 1. Load data from the CSV volume; columns arrive as strings, so cast the ones used numerically.
df1 = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/movies.csv").withColumn("movieId", col("movieId").cast("int"))
df3 = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/ratings.csv") \
               .withColumn("userId", col("userId").cast("int")) \
               .withColumn("movieId", col("movieId").cast("int")) \
               .withColumn("rating", col("rating").cast("float"))

# 2. Genre preprocessing: one row per (movie, genre), then collect the distinct genre names.
movies_with_genres = df1.withColumn("genre", explode(split("genres", "\\|")))
distinct_genres = movies_with_genres.select("genre").distinct().rdd.flatMap(lambda x: x).collect()

# One-hot encode the genre: add a 0/1 column per genre to each (movie, genre) row.
for genre in distinct_genres:
    movies_with_genres = movies_with_genres.withColumn(
        f"genre_{genre}",
        when(col("genre") == genre, 1).otherwise(0)
    )

# Per-movie genre vector: max over the movie's rows gives 1 if the movie has the genre.
# `max` here is pyspark.sql.functions.max imported above, not the Python builtin.
genre_features = movies_with_genres.groupBy("movieId").agg(
    *[max(f"genre_{genre}").alias(f"genre_{genre}") for genre in distinct_genres]
)

# 3. User-genre preference: fraction of the user's rated movies that carry each genre.
ratings_with_genres = df3.join(genre_features, on="movieId", how="inner")

user_profile = ratings_with_genres.groupBy("userId").agg(
    *[avg(f"genre_{genre}").alias(f"pref_{genre}") for genre in distinct_genres]
)

# 4. Assemble the pref_* columns into a single feature vector for KMeans.
feature_cols = [col for col in user_profile.columns if col.startswith("pref_")]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
user_features = assembler.transform(user_profile)

# 5. KMeans clustering (5 clusters, fixed seed).
kmeans = KMeans(k=5, seed=42)
model = kmeans.fit(user_features)

# 6. Cluster assignment per user (`prediction` is the cluster id).
user_clusters = model.transform(user_features).select("userId", "prediction")

# 7. Average rating of each movie within each cluster.
ratings_with_cluster = df3.join(user_clusters, on="userId")
movie_avg_by_cluster = ratings_with_cluster.groupBy("prediction", "movieId").agg(avg("rating").alias("avg_rating"))

# Attach title and genres.
movie_avg_with_titles = movie_avg_by_cluster.join(df1.select("movieId", "title", "genres"), on="movieId")

# 8. Recommend only movies the target user has not rated yet.
target_user_id = 2  # <- id of the user to recommend for

user_seen_movies = df3.filter(col("userId") == target_user_id).select("movieId").distinct()

# A user without ratings has no cluster row; fail with a clear message instead of IndexError.
cluster_row = user_clusters.filter(col("userId") == target_user_id).select("prediction").first()
if cluster_row is None:
    raise ValueError(f"userId {target_user_id} has no ratings; cannot assign a cluster")
user_cluster = cluster_row[0]

recommend_pool = movie_avg_with_titles.filter(col("prediction") == user_cluster)
recommend_pool_unseen = recommend_pool.join(user_seen_movies, on="movieId", how="left_anti")

# 9. Top N recommendations (here 5) by the cluster's average rating.
top_recommendations = recommend_pool_unseen.orderBy(col("avg_rating").desc()).limit(5)

# Output: movieId, title, genres, avg_rating
top_recommendations.select("movieId", "title", "genres", "avg_rating").show(truncate=False)

# Same list without movies that have no genre information.
# This must run after top_recommendations is defined (it previously ran before, raising NameError).
top_recommendations_filtered = top_recommendations.filter(col("genres") != "(no genres listed)")
top_recommendations_filtered.select("movieId", "title", "genres", "avg_rating").show(truncate=False)

In [0]:

┌──────────────────────────────────────────────┐
│          1. 데이터 로딩 (movies, ratings)     │
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│      2. 장르 텍스트를 소문자/토큰으로 변환      │
│      (Tokenizer → CountVectorizer)           │
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│          3. TF-IDF 벡터 생성 (장르 → 벡터)     │
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│    4. 선택한 영화의 TF-IDF 벡터 추출           │
│    (target_vector)                           │
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│     5. 다른 영화들과의 코사인 유사도 계산       │
│     (UDF로 similarity 컬럼 생성)              │
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│       6. 평균 평점과 함께 영화 정보 결합        │
│       (title, genres, avg_rating, similarity)│
└──────────────────────────────────────────────┘
                        │
                        ▼
┌──────────────────────────────────────────────┐
│         7. 유사도가 높은 상위 5개 추천         │
└──────────────────────────────────────────────┘


In [0]:
%python #코싸인 유사도를 활용한 영화 정보 + 장르 기반 유사 영화 추천 
from pyspark.sql.functions import col, avg, lower, udf
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F

# 1. Load data. No casts are applied here, so every CSV column (including movieId) is a string.
df_movies = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/movies.csv")   # movies
df_ratings = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/ratings.csv")  # ratings

# 2. Id of the input movie.
target_movie_id = "14"  # kept as a string to match the uncast movieId column

# 3. Alias the DataFrames and preprocess: lowercase genres, numeric rating.
movies = df_movies.withColumn("genres", lower(col("genres"))).alias("movies")
ratings = df_ratings.withColumn("rating", col("rating").cast("float")).alias("ratings")

# 4. TF-IDF: genres -> features
# Genres are "|"-delimited (e.g. "adventure|children|fantasy"). The plain Tokenizer
# splits on whitespace only, which would keep each movie's whole genre string as a
# single token and make the similarity non-zero only for identical genre strings.
# RegexTokenizer with the "|" delimiter yields one token per genre instead.
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="genres", outputCol="genre_words", pattern="\\|")
tokenized = tokenizer.transform(movies).alias("tokenized")

# Term counts per movie over the genre vocabulary.
cv = CountVectorizer(inputCol="genre_words", outputCol="raw_features")
cv_model = cv.fit(tokenized)
vectorized = cv_model.transform(tokenized).alias("vectorized")

# Re-weight the counts by inverse document frequency.
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(vectorized)
tfidf = idf_model.transform(vectorized).alias("tfidf")

# 5. Cosine similarity: fetch the TF-IDF vector of the selected movie to the driver.
#    collect()[0][0] raises IndexError when no row matches target_movie_id.
target_vector = tfidf.filter(col("movieId") == target_movie_id).select(col("features")).collect()[0][0]

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero length.

    Both arguments must provide ``dot(other)`` and ``norm(p)`` methods.
    """
    numerator = float(v1.dot(v2))
    length_1 = float(v1.norm(2))
    length_2 = float(v2.norm(2))
    # A zero-length vector has no direction; report "no similarity" rather than divide by zero.
    if length_1 == 0 or length_2 == 0:
        return 0.0
    return numerator / (length_1 * length_2)

# Wrap the similarity against the fixed target vector as a UDF and score every movie.
cosine_sim_udf = udf(lambda x: cosine_similarity(target_vector, x), DoubleType())
tfidf = tfidf.withColumn("similarity", cosine_sim_udf(col("features")))

# 6. Average rating per movie; the key is renamed so it does not clash with movieId in the join.
avg_rating = ratings.groupBy(col("movieId").alias("r_movieId")) \
                    .agg(avg("rating").alias("avg_rating")) \
                    .alias("avg_rating")

# 7. Combine movie info: similarity + title/genres + average rating.
#    The left join keeps movies that have no ratings (avg_rating is null for them).
movies_with_info = tfidf \
    .join(
        movies.select(
            col("movieId").alias("m_movieId"),
            col("title").alias("movie_title"),
            col("genres").alias("movie_genres")
        ),
        col("movieId") == col("m_movieId")
    ) \
    .join(
        avg_rating,
        col("movieId") == col("r_movieId"),
        how="left"
    ) \
    .select(
        col("movieId").alias("movie_id"),
        col("movie_title"),
        col("movie_genres"),
        col("avg_rating"),
        col("features").alias("genre_features"),
        col("similarity")
    )

# 8. Print the selected movie's information.
print("🎬 [선택한 영화 정보]")
movies_with_info.filter(col("movie_id") == target_movie_id) \
    .select("movie_id", "movie_title", "movie_genres", "avg_rating") \
    .show(truncate=False)

# 9. Print the 5 most similar movies (the selected movie itself is excluded).
print("🎯 [장르 유사 영화 추천]")
movies_with_info.filter(col("movie_id") != target_movie_id) \
    .orderBy(col("similarity").desc()) \
    .select("movie_id", "movie_title", "movie_genres", "avg_rating", "similarity") \
    .limit(5) \
    .show(truncate=False)

In [0]:
from pyspark.sql.window import Window #평가지표 MSE

# Neighbours used for the prediction: other movies with positive similarity that
# actually have ratings. Unrated movies (null avg_rating from the left join) are
# excluded so they cannot inflate the similarity denominator without contributing
# to the weighted sum.
similar_movies = movies_with_info.filter(
    (col("movie_id") != target_movie_id)
    & (col("similarity") > 0)
    & col("avg_rating").isNotNull()
)

# Similarity sum (normaliser) and similarity-weighted rating sum in a single pass.
totals = similar_movies.agg(
    F.sum("similarity").alias("sim_sum"),
    F.sum(col("similarity") * col("avg_rating")).alias("weighted_sum"),
).first()
sim_sum = totals["sim_sum"]

# Actual rating: the target movie's own average rating (None if missing or unrated).
actual_row = movies_with_info.filter(col("movie_id") == target_movie_id) \
    .select("avg_rating").first()
actual_rating = actual_row[0] if actual_row is not None else None

if not sim_sum or actual_rating is None:
    # No usable neighbours (sum is None/0) or no ground truth: nothing to compare.
    predicted_rating = None
    mse = None
    print("📊 Mean Squared Error (MSE): n/a (no rated similar movies or no actual rating)")
else:
    # Predicted rating: similarity-weighted average of the neighbours' average ratings.
    predicted_rating = totals["weighted_sum"] / sim_sum
    # With a single target movie this is the squared error of one prediction.
    mse = (actual_rating - predicted_rating) ** 2
    print(f"📊 Mean Squared Error (MSE): {mse}")


## ALS 기반 사용자 영화 추천

In [0]:
        [사용자]
           │
     ┌─────▼─────┐
     │  설문 응답 │  ← 사용자의 취향(장르 등)
     └─────┬─────┘
           │
     ┌─────▼────────────┐
     │사용자 성향 벡터 생성│
     └─────┬────────────┘
           │
           ▼
[ 사용자 기반 추천 모델 ]  ← Cosine Similarity 등

           ▲
           │
     ┌─────▼─────┐
     │ 평점 데이터 │  ← ratings.csv
     └─────┬─────┘
           ▼
   [ ALS 추천 모델 ]  ← Spark ML ALS

           ▼
  ┌────────┴────────┐
  │ Hybrid 추천 조합│ ← 설문 기반 + ALS 기반 결합 (가중 평균 등)
  └────────┬────────┘
           ▼
     [추천 영화 리스트]
           ▼
       [시각화/출력]


In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode, row_number
from pyspark.sql.window import Window

# 1. Load the movies and ratings tables.
df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
df_ratings = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

# 2. Type conversion: integer ids and float rating for ALS.
df_ratings = df_ratings.withColumn("userId", col("userId").cast("integer")) \
                       .withColumn("movieId", col("movieId").cast("integer")) \
                       .withColumn("rating", col("rating").cast("float"))

df_movies = df_movies.withColumn("movieId", col("movieId").cast("integer"))

# 3. Train/validation split (80/20, fixed seed). `test` is not used in this cell.
(training, test) = df_ratings.randomSplit([0.8, 0.2], seed=42)

# 4. Define the ALS model.
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    coldStartStrategy="drop", nonnegative=True
)

# 5. Train the model.
model = als.fit(training)

# 6. Top-10 recommendations for one user (userId 123 as an example).
user_df = spark.createDataFrame([(123,)], ["userId"])
userRecs = model.recommendForUserSubset(user_df, 10)

# 7. Flatten the nested recommendations into one row per recommended movie.
userRecsExploded = userRecs.select("userId", explode("recommendations").alias("rec")) \
                            .select("userId", col("rec.movieId"), col("rec.rating"))

# Assign a 0-based rank index, ordered by predicted rating (highest first).
# The window has no partition, which is fine here because only one user is present.
windowSpec = Window.orderBy(col("rating").desc())
indexedRecs = userRecsExploded.withColumn("index", row_number().over(windowSpec) - 1) \
                              .select("index", "movieId")

# 8. Show the result.
display(indexedRecs)


## MLflow 사용 파이프라인

In [0]:
# 10. Predict on the test split.
#     `model`, `test` and RegressionEvaluator are not defined or imported in this cell;
#     they must already exist from earlier cells.
predictions = model.transform(test)

# 11. Define the evaluator (RMSE here).
evaluator = RegressionEvaluator(
    metricName="rmse",  # can be switched to "mse"
    labelCol="rating",
    predictionCol="prediction"
)

# 12. Compute RMSE.
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# 13. Compute MSE (optional).
evaluator_mse = RegressionEvaluator(
    metricName="mse",
    labelCol="rating",
    predictionCol="prediction"
)
mse = evaluator_mse.evaluate(predictions)
print(f"Mean Squared Error (MSE) = {mse}")

In [0]:
%python
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# 0. Select the MLflow experiment that runs are logged to (required).
mlflow.set_experiment('/Users/1dt011@msacademy.msai.kr/1dt011')

# 1. Load data and convert types.
df_movies = spark.read.table("`1dt_team8_databricks`.`movielens-small`.movies")
df_ratings = spark.read.table("`1dt_team8_databricks`.`movielens-small`.ratings")

df_ratings = df_ratings.withColumn("userId", col("userId").cast("integer")) \
                       .withColumn("movieId", col("movieId").cast("integer")) \
                       .withColumn("rating", col("rating").cast("float"))
df_movies = df_movies.withColumn("movieId", col("movieId").cast("integer"))

# 80/20 split with a fixed seed.
(training, test) = df_ratings.randomSplit([0.8, 0.2], seed=42)

# 2. Start the MLflow run; everything logged inside the block belongs to this run.
with mlflow.start_run(run_name="ALS-Recommender"):

    # Hyperparameters.
    rank = 10
    maxIter = 10
    regParam = 0.1

    als = ALS(
        userCol="userId", itemCol="movieId", ratingCol="rating",
        rank=rank, maxIter=maxIter, regParam=regParam,
        coldStartStrategy="drop", nonnegative=True
    )

    # Train on the training split and score the test split.
    model = als.fit(training)
    predictions = model.transform(test)

    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)

    # Log the hyperparameters and the metric.
    mlflow.log_param("rank", rank)
    mlflow.log_param("maxIter", maxIter)
    mlflow.log_param("regParam", regParam)
    mlflow.log_metric("rmse", rmse)

    # Log the fitted model as a run artifact named "als_model".
    mlflow.spark.log_model(model, "als_model")

    print(f"MLflow Run Completed - RMSE: {rmse:.4f}")

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# 0. Register/select the MLflow experiment used by the runs below.
mlflow.set_experiment("/Users/1dt011@msacademy.msai.kr/1dt011")
# mlflow.set_experiment("/Users/1dt011@msacademy.msai.kr/als-test")


# 1. 데이터 불러오기 및 변환
def load_data():
    """Read the movies and ratings CSV files and cast their key columns.

    Returns:
        A ``(df_ratings, df_movies)`` tuple: ratings with integer ``userId`` /
        ``movieId`` and float ``rating``, and movies with integer ``movieId``.
    """
    movies = (
        spark.read.option("header", True)
        .csv("/Volumes/1dt_team8/default/movies/movies.csv")
    )
    ratings = (
        spark.read.option("header", True)
        .csv("/Volumes/1dt_team8/default/movies/ratings.csv")
    )

    # CSV columns are read as strings; cast the ones used numerically.
    movies = movies.withColumn("movieId", col("movieId").cast("integer"))
    for column_name, spark_type in (
        ("userId", "integer"),
        ("movieId", "integer"),
        ("rating", "float"),
    ):
        ratings = ratings.withColumn(column_name, col(column_name).cast(spark_type))

    return ratings, movies

# 2. 파이프라인 구성
def train_pipeline(rank=10, maxIter=10, regParam=0.1):
    """Train an ALS pipeline, evaluate it and log the run to MLflow.

    Args:
        rank: ALS ``rank`` setting.
        maxIter: ALS ``maxIter`` setting.
        regParam: ALS ``regParam`` setting.

    Returns:
        A ``(model, df_movies)`` tuple: the fitted pipeline model and the
        movies DataFrame returned by ``load_data``.
    """
    df_ratings, df_movies = load_data()
    splits = df_ratings.randomSplit([0.8, 0.2], seed=42)
    train_data, test_data = splits[0], splits[1]

    recommender = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        rank=rank,
        maxIter=maxIter,
        regParam=regParam,
        coldStartStrategy="drop",
        nonnegative=True,
    )

    # ALS is wrapped as the single stage of a Pipeline, so the fitted ALS model
    # is later reachable as model.stages[0].
    pipeline = Pipeline(stages=[recommender])

    hyperparams = (("rank", rank), ("maxIter", maxIter), ("regParam", regParam))

    # Everything inside the block is recorded under one MLflow run.
    with mlflow.start_run(run_name="ALS-Pipeline-Run"):
        model = pipeline.fit(train_data)
        scored = model.transform(test_data)

        rmse = RegressionEvaluator(
            metricName="rmse", labelCol="rating", predictionCol="prediction"
        ).evaluate(scored)

        # Log hyperparameters, metric and the fitted model.
        for key, value in hyperparams:
            mlflow.log_param(key, value)
        mlflow.log_metric("rmse", rmse)
        mlflow.spark.log_model(model, "als_pipeline_model")

        print(f"[MLflow] Run logged with RMSE: {rmse:.4f}")

    return model, df_movies

# 3. 사용자 추천 함수
def recommend_for_user(model, user_id, df_movies, n=5):
    """Return the top-``n`` ALS recommendations for one user with movie titles.

    Args:
        model: Fitted pipeline model whose first stage is the ALS model.
        user_id: Id of the user to recommend for.
        df_movies: Movies DataFrame with ``movieId`` and ``title`` columns.
        n: Number of recommendations to request.

    Returns:
        A DataFrame with columns ``userId``, ``title`` and ``rating``.
    """
    from pyspark.sql.functions import explode

    target_user = spark.createDataFrame([(user_id,)], ["userId"])
    nested_recs = model.stages[0].recommendForUserSubset(target_user, n)

    # One row per recommended movie instead of one array per user.
    exploded = nested_recs.select("userId", explode("recommendations").alias("rec"))
    flat_recs = exploded.select("userId", col("rec.movieId"), col("rec.rating"))

    # Left join keeps a recommendation even if its movieId has no metadata row.
    with_titles = flat_recs.join(df_movies, on="movieId", how="left")
    return with_titles.select("userId", "title", "rating")

# 4. Run: train with the default hyperparameters, then show recommendations for user 123.
model, df_movies = train_pipeline()
display(recommend_for_user(model, 123, df_movies))


In [0]:
# 1. Declare notebook widgets for the hyperparameters (defaults given as strings).
dbutils.widgets.text("rank", "10")
dbutils.widgets.text("regParam", "0.1")
dbutils.widgets.text("maxIter", "10")

# 2. Read the widget values and convert them to numbers.
rank = int(dbutils.widgets.get("rank"))
regParam = float(dbutils.widgets.get("regParam"))
maxIter = int(dbutils.widgets.get("maxIter"))

# 3. Train the ALS model.
#    `ALS`, `training` and `test` are not defined in this cell; they must exist from earlier cells.
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    rank=rank, regParam=regParam, maxIter=maxIter,
    coldStartStrategy="drop", nonnegative=True
)

model = als.fit(training)

# 4. Evaluate on the test split (RMSE).
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(test)
rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction").evaluate(predictions)

# 5. Append this run's parameters and RMSE to the results table.
from pyspark.sql import Row
result_row = Row(rank=rank, regParam=regParam, maxIter=maxIter, rmse=rmse)
result_df = spark.createDataFrame([result_row])
result_df.write.mode("append").saveAsTable("als_experiment_results")

# 6. Save top-5 recommendations for all users, replacing any previous table contents.
userRecs = model.recommendForAllUsers(5)
userRecs.write.mode("overwrite").saveAsTable("als_user_recommendations")

In [0]:
%sql
SELECT * FROM als_experiment_results ORDER BY rmse ASC;

In [0]:
# 1. Input widgets (hyperparameters can be adjusted at the top of the notebook).
dbutils.widgets.text("rank", "10")
dbutils.widgets.text("regParam", "0.1")
dbutils.widgets.text("maxIter", "10")

# 2. Read the widget values and convert them to numbers.
rank = int(dbutils.widgets.get("rank"))
regParam = float(dbutils.widgets.get("regParam"))
maxIter = int(dbutils.widgets.get("maxIter"))

# 3. Load data from the CSV volume.
df_ratings = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/ratings.csv")
df_movies = spark.read.option("header", True).csv("/Volumes/1dt_team8/default/movies/movies.csv")

# 4. Type conversion: integer ids and float rating.
from pyspark.sql.functions import col
df_ratings = df_ratings.withColumn("userId", col("userId").cast("integer")) \
                       .withColumn("movieId", col("movieId").cast("integer")) \
                       .withColumn("rating", col("rating").cast("float"))
df_movies = df_movies.withColumn("movieId", col("movieId").cast("integer"))

# 5. Train/validation split (80/20, fixed seed).
(training, test) = df_ratings.randomSplit([0.8, 0.2], seed=42)

# 6. Define and train the ALS model.
from pyspark.ml.recommendation import ALS
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    rank=rank, regParam=regParam, maxIter=maxIter,
    coldStartStrategy="drop", nonnegative=True
)
model = als.fit(training)

# 7. Predict on the test split and evaluate (RMSE).
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

# 8. Append this run's parameters and RMSE to the results table.
from pyspark.sql import Row
result = Row(rank=rank, regParam=regParam, maxIter=maxIter, rmse=rmse)
result_df = spark.createDataFrame([result])
result_df.write.mode("append").saveAsTable("als_experiment_results")

# 9. Generate and save recommendations, replacing any previous table contents.
user_recs = model.recommendForAllUsers(5)  # use recommendForUserSubset() to recommend for specific users only
user_recs.write.mode("overwrite").saveAsTable("als_user_recommendations")

# 10. (Optional) Example of viewing the recommendations with movie metadata attached.
from pyspark.sql.functions import explode
recs = user_recs.select("userId", explode("recommendations").alias("rec")) \
                .select("userId", col("rec.movieId"), col("rec.rating")) \
                .join(df_movies, on="movieId", how="left")

# Sort by user ascending, then by predicted rating descending.
display(recs.orderBy("userId", "rating", ascending=[True, False]))
