In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os

# -------------------- Data loading and preprocessing --------------------
# Load the review table from 'review_data_optimized.parquet' (pyarrow engine).
df = pd.read_parquet('review_data_optimized.parquet', engine='pyarrow')

# One encoder per id column; each is fitted on the full table so the integer
# codes can later be used as embedding row indices.
user_encoder, business_encoder = LabelEncoder(), LabelEncoder()

# Keep only the three columns used downstream, then append the encoded ids
# (user first, business second) to that independent frame.
df_processed = df.loc[:, ['user_id', 'business_id', 'stars']].assign(
    user_encoded=lambda frame: user_encoder.fit_transform(frame['user_id']),
    business_encoded=lambda frame: business_encoder.fit_transform(frame['business_id']),
)

# Vocabulary sizes: one entry per distinct id learned by each encoder.
num_users = user_encoder.classes_.shape[0]
num_businesses = business_encoder.classes_.shape[0]

# -------------------- Dataset definition --------------------
class NeuMFDataset(Dataset):
    """Tensor-backed dataset of (user index, item index, rating) triples.

    The constructor reads the 'user_encoded', 'business_encoded' and 'stars'
    columns of ``df`` and stores each one as a tensor attribute
    (``user_ids``, ``item_ids``, ``ratings``).
    """

    @staticmethod
    def _column_tensor(df, column, dtype):
        """Copy the values of ``df[column]`` into a new tensor of ``dtype``."""
        return torch.tensor(df[column].values, dtype=dtype)

    def __init__(self, df):
        # Indices are stored as int64 (long); ratings as float32.
        self.user_ids = self._column_tensor(df, 'user_encoded', torch.long)
        self.item_ids = self._column_tensor(df, 'business_encoded', torch.long)
        self.ratings = self._column_tensor(df, 'stars', torch.float)

    def __len__(self):
        """Return the number of stored samples."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user_id, item_id, rating)`` tuple at position ``idx``."""
        columns = (self.user_ids, self.item_ids, self.ratings)
        return tuple(column[idx] for column in columns)

# -------------------- Model definition --------------------
class NeuMF(nn.Module):
    """NeuMF-style rating predictor combining a GMF branch and an MLP branch.

    For each (user, item) pair two feature vectors are computed and fused by
    a final linear layer into one scalar:

    * GMF branch: element-wise product of a user and an item embedding, each
      of width ``mf_dim``.
    * MLP branch: a user and an item embedding, each of width
      ``mlp_dims[0] // 2``, concatenated and passed through a stack of
      ``Linear`` + ``ReLU`` layers whose output widths are ``mlp_dims[1:]``.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        mf_dim: Width of the GMF embeddings.
        mlp_dims: Layer widths of the MLP branch. The first entry is the
            width of the concatenated user+item embedding and must be even;
            the remaining entries are the hidden layer widths. With a single
            entry the MLP branch is the plain concatenation.

    Raises:
        ValueError: If ``mlp_dims`` is empty or its first entry is odd (the
            two halves could not add up to the first layer's input width).
    """

    def __init__(self, num_users, num_items, mf_dim=16, mlp_dims=(64, 32)):
        super().__init__()

        # Work on a private list so any sequence type is accepted and the
        # caller's object is never shared with the module.
        mlp_dims = list(mlp_dims)
        if not mlp_dims:
            raise ValueError("mlp_dims must contain at least one layer width")
        if mlp_dims[0] % 2 != 0:
            # Fail here instead of with a shape mismatch inside forward().
            raise ValueError(
                f"mlp_dims[0] must be even so it can be split between the "
                f"user and item embeddings, got {mlp_dims[0]}"
            )

        # Module creation order is kept stable: it determines both the
        # state_dict key order and the order in which random init is drawn.
        self.user_embedding_gmf = nn.Embedding(num_users, mf_dim)
        self.item_embedding_gmf = nn.Embedding(num_items, mf_dim)

        half_width = mlp_dims[0] // 2
        self.user_embedding_mlp = nn.Embedding(num_users, half_width)
        self.item_embedding_mlp = nn.Embedding(num_items, half_width)

        mlp_layers = []
        input_dim = mlp_dims[0]
        for dim in mlp_dims[1:]:
            mlp_layers.append(nn.Linear(input_dim, dim))
            mlp_layers.append(nn.ReLU())
            input_dim = dim
        self.mlp = nn.Sequential(*mlp_layers)

        # The fusion layer sees the GMF vector followed by the MLP output.
        self.final_layer = nn.Linear(mf_dim + mlp_dims[-1], 1)

    def forward(self, user_ids, item_ids):
        """Predict one rating per (user, item) pair.

        Args:
            user_ids: 1-D integer index tensor of user rows.
            item_ids: 1-D integer index tensor of item rows, same length as
                ``user_ids`` (the concatenations along ``dim=1`` rely on it).

        Returns:
            A flattened 1-D tensor with one prediction per input pair.
        """
        gmf_user = self.user_embedding_gmf(user_ids)
        gmf_item = self.item_embedding_gmf(item_ids)
        gmf_output = gmf_user * gmf_item

        mlp_user = self.user_embedding_mlp(user_ids)
        mlp_item = self.item_embedding_mlp(item_ids)
        mlp_input = torch.cat((mlp_user, mlp_item), dim=1)
        mlp_output = self.mlp(mlp_input)

        concat = torch.cat((gmf_output, mlp_output), dim=1)
        prediction = self.final_layer(concat)
        # Drop the trailing size-1 dimension so the output lines up with a
        # 1-D target tensor.
        return prediction.view(-1)

# -------------------- Evaluation metric helper --------------------
def mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return the mean absolute percentage error (MAPE), in percent.

    Each absolute error is divided by ``max(|y_true|, epsilon)``. Clamping
    the magnitude (rather than adding ``epsilon`` to the signed target)
    leaves ordinary targets unbiased and keeps the denominator strictly
    positive, including for negative targets close to ``-epsilon``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
        epsilon: Lower bound applied to ``|y_true|`` in the denominator.

    Returns:
        The mean of the per-element percentage errors as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = np.maximum(np.abs(y_true), epsilon)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# -------------------- Repeated training and evaluation --------------------
# The whole split/train/test cycle is repeated num_runs times and the test
# metrics of every run are collected for the final average.
num_runs = 5
mse_list, rmse_list, mae_list, mape_list = [], [], [], []

embedding_dim = 16
mlp_dims = [64, 32]
learning_rate = 0.001
epochs = 50
patience = 5        # epochs without validation improvement before stopping
min_delta = 0.0001  # smallest RMSE decrease that counts as an improvement
batch_size = 128


def _collect_predictions(model, loader, desc):
    """Run ``model`` over ``loader`` in eval mode without gradients.

    Returns ``(targets, predictions)`` as two flat Python lists, in loader
    order. ``desc`` is the label shown on the tqdm progress bar.
    """
    model.eval()
    targets, predictions = [], []
    with torch.no_grad():
        for user_ids, item_ids, ratings in tqdm(loader, desc=desc, leave=False):
            predictions.extend(model(user_ids, item_ids).tolist())
            targets.extend(ratings.tolist())
    return targets, predictions


for i in range(num_runs):
    print(f"\n==================== {i+1}번째 반복 시작 ====================")

    # Seed PyTorch per run so weight initialisation and batch shuffling are
    # repeatable, in line with the seeded data splits below.
    torch.manual_seed(42 + i)

    # A different seed per run yields a different random split each time.
    # 20% of the rows go to test; 1/8 of the remaining 80% (10% overall)
    # goes to validation, leaving 70% for training.
    # NOTE(review): rows are split at random, so a user or business may occur
    # only in the validation/test part and then keeps its initial embedding.
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42 + i)
    val_size_ratio = 1 / 8
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42 + i)

    train_dataset = NeuMFDataset(train_df)
    val_dataset = NeuMFDataset(val_df)
    test_dataset = NeuMFDataset(test_df)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model, loss and optimizer are re-created from scratch for every run.
    model = NeuMF(num_users, num_businesses, embedding_dim, mlp_dims)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model_path = f'best_neumf_model_run_{i+1}.pt'

    best_val_rmse = float('inf')
    epochs_no_improve = 0
    # Set only when THIS run writes model_path, so a stale file left by an
    # earlier execution is never mistaken for this run's best model.
    checkpoint_saved = False

    # Training loop with early stopping on validation RMSE.
    for epoch in range(epochs):
        model.train()
        train_bar = tqdm(train_loader, desc=f"[Run {i+1}, Epoch {epoch+1}] Training", leave=False)
        for user_ids, item_ids, ratings in train_bar:
            optimizer.zero_grad()
            predictions = model(user_ids, item_ids)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()
            train_bar.set_postfix(loss=loss.item())

        val_true, val_predictions = _collect_predictions(
            model, val_loader, f"[Run {i+1}, Epoch {epoch+1}] Validating"
        )
        val_rmse = np.sqrt(mean_squared_error(val_true, val_predictions))

        if val_rmse < best_val_rmse - min_delta:
            # New best: remember it and checkpoint the weights.
            best_val_rmse = val_rmse
            epochs_no_improve = 0
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"조기 종료 발생. (Run {i+1})")
                break

    # Evaluate on the test split with the best checkpoint of this run.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path))
        print(f"최적 모델 로드 완료: {model_path}")
    else:
        print(f"최적 모델을 찾지 못해 현재 모델을 사용합니다. (Run {i+1})")

    test_true, test_preds = _collect_predictions(model, test_loader, f"Testing (Run {i+1})")

    test_mse = mean_squared_error(test_true, test_preds)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(test_true, test_preds)
    test_mape = mean_absolute_percentage_error(test_true, test_preds)

    print(f"\n✅ [NeuMF] {i+1}번째 테스트 결과:")
    print(f" - MSE  : {test_mse:.4f}")
    print(f" - RMSE : {test_rmse:.4f}")
    print(f" - MAE  : {test_mae:.4f}")
    print(f" - MAPE : {test_mape:.2f}%")

    mse_list.append(test_mse)
    rmse_list.append(test_rmse)
    mae_list.append(test_mae)
    mape_list.append(test_mape)

# Average the per-run test metrics and report them.
avg_mse = np.mean(mse_list)
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mape = np.mean(mape_list)

# The header reports the actual number of runs instead of a hard-coded 5.
print(f"\n\n==================== {num_runs}회 반복 최종 평균 결과 ====================")
print(f" - 평균 MSE  : {avg_mse:.4f}")
print(f" - 평균 RMSE : {avg_rmse:.4f}")
print(f" - 평균 MAE  : {avg_mae:.4f}")
print(f" - 평균 MAPE : {avg_mape:.2f}%")






                                                                                           

조기 종료 발생. (Run 1)
최적 모델 로드 완료: best_neumf_model_run_1.pt


                                                                   


✅ [NeuMF] 1번째 테스트 결과:
 - MSE  : 1.1683
 - RMSE : 1.0809
 - MAE  : 0.8567
 - MAPE : 34.13%



                                                                                           

조기 종료 발생. (Run 2)
최적 모델 로드 완료: best_neumf_model_run_2.pt


                                                                   


✅ [NeuMF] 2번째 테스트 결과:
 - MSE  : 1.1710
 - RMSE : 1.0822
 - MAE  : 0.8525
 - MAPE : 34.54%



                                                                                           

조기 종료 발생. (Run 3)
최적 모델 로드 완료: best_neumf_model_run_3.pt


                                                                   


✅ [NeuMF] 3번째 테스트 결과:
 - MSE  : 1.1788
 - RMSE : 1.0857
 - MAE  : 0.8466
 - MAPE : 34.79%



                                                                                           

조기 종료 발생. (Run 4)
최적 모델 로드 완료: best_neumf_model_run_4.pt


                                                                   


✅ [NeuMF] 4번째 테스트 결과:
 - MSE  : 1.1820
 - RMSE : 1.0872
 - MAE  : 0.8546
 - MAPE : 34.80%



                                                                                            

조기 종료 발생. (Run 5)
최적 모델 로드 완료: best_neumf_model_run_5.pt


                                                                   


✅ [NeuMF] 5번째 테스트 결과:
 - MSE  : 1.1852
 - RMSE : 1.0887
 - MAE  : 0.8501
 - MAPE : 34.58%


 - 평균 MSE  : 1.1771
 - 평균 RMSE : 1.0849
 - 평균 MAE  : 0.8521
 - 평균 MAPE : 34.57%


