전처리

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from surprise import Reader, Dataset, KNNBasic, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- 1. Utility Functions ---
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_true = y_true != 0
    if np.sum(non_zero_true) == 0:
        return 0.0
    return np.mean(np.abs((y_true[non_zero_true] - y_pred[non_zero_true]) / y_true[non_zero_true])) * 100

def calculate_metrics(true_ratings, predicted_ratings):
    mse = mean_squared_error(true_ratings, predicted_ratings)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    mape = mean_absolute_percentage_error(true_ratings, predicted_ratings)
    return mse, rmse, mae, mape

# --- 2. Data Loading and Preprocessing ---
# 파일 로드
df = pd.read_parquet('review_data_optimized.parquet')

# 필요한 컬럼 추출
df_processed = df[['user_id', 'business_id', 'stars']].copy()

# 데이터 분할 (7:1:2 비율)
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# --- 3. Surprise 라이브러리를 위한 데이터 준비 ---
reader = Reader(rating_scale=(1, 5))
train_set_for_surprise = train_df[['user_id', 'business_id', 'stars']]
data_for_train = Dataset.load_from_df(train_set_for_surprise, reader)
train_set = data_for_train.build_full_trainset()

val_set = [(row['user_id'], row['business_id'], row['stars']) for _, row in val_df.iterrows()]
test_set = [(row['user_id'], row['business_id'], row['stars']) for _, row in test_df.iterrows()]

전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)


UBCF

In [2]:
# --- UBCF (User-Based Collaborative Filtering) 모델 ---

def find_best_k_ubcf(train_set, val_set, k_values):
    """ UBCF 최적의 k 값을 찾는 함수 """
    best_k = None
    best_rmse = float('inf')
    
    print("\n--- UBCF 최적 k 값 탐색 시작 ---")
    for k in k_values:
        model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True}, k=k)
        model.fit(train_set)
        predictions = model.test(val_set)
        rmse = accuracy.rmse(predictions, verbose=False)
        print(f"k={k}: Validation RMSE = {rmse:.4f}")
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_k = k
    print(f"\nUBCF 최적 k 값: {best_k} (Validation RMSE: {best_rmse:.4f})")
    return best_k

# 최적의 k 값을 탐색할 범위 설정
k_values_to_test = [10, 20, 30, 40, 50, 70, 100]

# UBCF 최적 k 값 찾기
best_k_ubcf = find_best_k_ubcf(train_set, val_set, k_values_to_test)

# 최적 k 값으로 최종 UBCF 모델 학습 및 평가
print(f"\n--- 최적 k={best_k_ubcf}로 UBCF 최종 모델 학습 및 평가 ---")
ubcf_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True}, k=best_k_ubcf)
ubcf_model.fit(train_set)
predictions_ubcf = ubcf_model.test(test_set)
print("UBCF 모델 예측 완료.")

# UBCF 성능 지표 계산
true_ratings_ubcf = [pred.r_ui for pred in predictions_ubcf]
predicted_ratings_ubcf = [pred.est for pred in predictions_ubcf]
mse_ubcf, rmse_ubcf, mae_ubcf, mape_ubcf = calculate_metrics(true_ratings_ubcf, predicted_ratings_ubcf)

print(f"\n--- UBCF (k={best_k_ubcf}) Performance Metrics on Test Set ---")
print(f"Mean Squared Error (MSE): {mse_ubcf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_ubcf:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ubcf:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_ubcf:.2f}%")


--- UBCF 최적 k 값 탐색 시작 ---
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=10: Validation RMSE = 1.1439
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=20: Validation RMSE = 1.1247
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=30: Validation RMSE = 1.1198
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=40: Validation RMSE = 1.1175
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=50: Validation RMSE = 1.1166
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=70: Validation RMSE = 1.1159
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=100: Validation RMSE = 1.1152

UBCF 최적 k 값: 100 (Validation RMSE: 1.1152)

--- 최적 k=100로 UBCF 최종 모델 학습 및 평가 ---
Computing the cosine similarity matrix...
Done computing similarity matrix.
UBCF 모델 예측 완료.

--- UBCF (k=100) Performance Metrics on Test Set 

IBCF

In [3]:
# --- IBCF (Item-Based Collaborative Filtering) 모델 ---

def find_best_k_ibcf(train_set, val_set, k_values):
    """ IBCF 최적의 k 값을 찾는 함수 """
    best_k = None
    best_rmse = float('inf')
    
    print("\n--- IBCF 최적 k 값 탐색 시작 ---")
    for k in k_values:
        model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}, k=k)
        model.fit(train_set)
        predictions = model.test(val_set)
        rmse = accuracy.rmse(predictions, verbose=False)
        print(f"k={k}: Validation RMSE = {rmse:.4f}")
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_k = k
    print(f"\nIBCF 최적 k 값: {best_k} (Validation RMSE: {best_rmse:.4f})")
    return best_k

# 최적의 k 값을 탐색할 범위 설정 (UBCF와 동일한 값 사용)
k_values_to_test = [10, 20, 30, 40, 50, 70, 100]

# IBCF 최적 k 값 찾기
best_k_ibcf = find_best_k_ibcf(train_set, val_set, k_values_to_test)

# 최적 k 값으로 최종 IBCF 모델 학습 및 평가
print(f"\n--- 최적 k={best_k_ibcf}로 IBCF 최종 모델 학습 및 평가 ---")
ibcf_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}, k=best_k_ibcf)
ibcf_model.fit(train_set)
predictions_ibcf = ibcf_model.test(test_set)
print("IBCF 모델 예측 완료.")

# IBCF 성능 지표 계산
true_ratings_ibcf = [pred.r_ui for pred in predictions_ibcf]
predicted_ratings_ibcf = [pred.est for pred in predictions_ibcf]
mse_ibcf, rmse_ibcf, mae_ibcf, mape_ibcf = calculate_metrics(true_ratings_ibcf, predicted_ratings_ibcf)

print(f"\n--- IBCF (k={best_k_ibcf}) Performance Metrics on Test Set ---")
print(f"Mean Squared Error (MSE): {mse_ibcf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_ibcf:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ibcf:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_ibcf:.2f}%")


--- IBCF 최적 k 값 탐색 시작 ---
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=10: Validation RMSE = 1.1982
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=20: Validation RMSE = 1.1872
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=30: Validation RMSE = 1.1847
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=40: Validation RMSE = 1.1834
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=50: Validation RMSE = 1.1830
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=70: Validation RMSE = 1.1825
Computing the cosine similarity matrix...
Done computing similarity matrix.
k=100: Validation RMSE = 1.1825

IBCF 최적 k 값: 100 (Validation RMSE: 1.1825)

--- 최적 k=100로 IBCF 최종 모델 학습 및 평가 ---
Computing the cosine similarity matrix...
Done computing similarity matrix.
IBCF 모델 예측 완료.

--- IBCF (k=100) Performance Metrics on Test Set 

SVD

In [4]:
# --- SVD (Singular Value Decomposition) 모델 ---

print("\n--- SVD 최적 파라미터 탐색 시작 (GridSearchCV) ---")
# 탐색할 파라미터 그리드 정의
param_grid = {
    'n_factors': [50, 100, 150],
    'reg_all': [0.02, 0.05, 0.1]
}

# GridSearchCV는 전체 데이터셋(train_val_df)을 사용하여 교차 검증을 수행합니다.
data_for_svd_tuning = Dataset.load_from_df(train_val_df[['user_id', 'business_id', 'stars']], reader)

gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=-1)
gs_svd.fit(data_for_svd_tuning)

# 최적 파라미터와 최고 RMSE 출력
best_params_svd = gs_svd.best_params['rmse']
best_rmse_svd = gs_svd.best_score['rmse']

print(f"\nSVD 최적 파라미터: {best_params_svd}")
print(f"SVD 최적 Validation RMSE: {best_rmse_svd:.4f}")

# 최적 파라미터로 최종 SVD 모델 학습 및 평가
print(f"\n--- SVD 최종 모델 학습 및 평가 ---")
svd_model = SVD(**best_params_svd)
svd_model.fit(train_set)
predictions_svd = svd_model.test(test_set)
print("SVD 모델 예측 완료.")

# SVD 성능 지표 계산
true_ratings_svd = [pred.r_ui for pred in predictions_svd]
predicted_ratings_svd = [pred.est for pred in predictions_svd]
mse_svd, rmse_svd, mae_svd, mape_svd = calculate_metrics(true_ratings_svd, predicted_ratings_svd)

print(f"\n--- SVD Performance Metrics on Test Set ---")
print(f"Optimal Parameters: {best_params_svd}")
print(f"Mean Squared Error (MSE): {mse_svd:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_svd:.4f}")
print(f"Mean Absolute Error (MAE): {mae_svd:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_svd:.2f}%")


--- SVD 최적 파라미터 탐색 시작 (GridSearchCV) ---

SVD 최적 파라미터: {'n_factors': 50, 'reg_all': 0.1}
SVD 최적 Validation RMSE: 1.0472

--- SVD 최종 모델 학습 및 평가 ---
SVD 모델 예측 완료.

--- SVD Performance Metrics on Test Set ---
Optimal Parameters: {'n_factors': 50, 'reg_all': 0.1}
Mean Squared Error (MSE): 1.0718
Root Mean Squared Error (RMSE): 1.0353
Mean Absolute Error (MAE): 0.8108
Mean Absolute Percentage Error (MAPE): 33.10%
