In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os

# --- 1. Utility Functions ---

# MAPE helper that avoids division by zero by ignoring zero-valued targets.
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Entries whose true value is exactly 0 are excluded from the average.
    If no non-zero true value exists, 0.0 is returned.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    usable = actual != 0
    if np.sum(usable) == 0:
        # No usable target at all: report 0 rather than dividing by zero.
        return 0.0
    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return np.mean(relative_error) * 100

# --- 2. Data Loading and Preprocessing ---

# Load the review file (JSON Lines: one record per line).
# IMPORTANT: Adjust this path to where your JSON file is located on your local machine.
df = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)

# Keep only the columns used below; work on a copy so the new columns are
# added to an independent frame rather than a slice of `df`.
df_processed = df[['user_id', 'business_id', 'stars', 'sentiment_vector']].copy()

# Encode user_id and business_id as contiguous integer IDs.
# The encoders are fitted on the full frame (before the split), so every ID in
# the validation/test splits has an index in the embedding tables.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

# Sizes of the two embedding tables.
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# Data split
# Split into 70/10/20 (train/validation/test), the ratio given in the paper.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 # 10% of total data (1/8 of 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

# Report the absolute and relative size of each split.
print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# Determine sentiment_vector_dim dynamically
# (length of the first row's vector; falls back to 15 when the frame is empty).
sentiment_vector_dim = len(df_processed['sentiment_vector'].iloc[0]) if not df_processed.empty else 15

# --- 3. PyTorch Dataset and DataLoader Definition ---
class ReviewDataset(Dataset):
    """In-memory dataset of (user id, business id, sentiment vector, stars).

    All four columns of the given DataFrame are converted to tensors once, at
    construction time; indexing then just slices those tensors.
    """

    def __init__(self, df):
        # Stack the per-row sentiment lists into one 2-D array before conversion.
        stacked_sentiment = np.array(df['sentiment_vector'].tolist())

        self.user_ids = torch.tensor(df['user_encoded'].values, dtype=torch.long)
        self.business_ids = torch.tensor(df['business_encoded'].values, dtype=torch.long)
        self.sentiment_vectors = torch.tensor(stacked_sentiment, dtype=torch.float)
        self.stars = torch.tensor(df['stars'].values, dtype=torch.float)

    def __len__(self):
        # One star rating per sample, so its length is the dataset size.
        return len(self.stars)

    def __getitem__(self, idx):
        sample = (
            self.user_ids[idx],
            self.business_ids[idx],
            self.sentiment_vectors[idx],
            self.stars[idx],
        )
        return sample

# --- 4. Model Architecture Definition ---
class CustomerRestaurantInteractionModule(nn.Module):
    """Embeds a (user, business) pair and passes the concatenation through an MLP.

    `mlp_dims` lists the hidden widths; each Linear layer is followed by a ReLU.
    `output_dim` is the last width, or `2 * embedding_dim` when `mlp_dims` is
    empty (the MLP is then an empty Sequential).
    """

    def __init__(self, num_users, num_businesses, embedding_dim, mlp_dims):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)

        # Width of every stage, starting with the concatenated embedding pair.
        widths = [embedding_dim * 2, *mlp_dims]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        self.mlp = nn.Sequential(*stages)
        self.output_dim = widths[-1]

    def forward(self, user_ids, business_ids):
        pair = torch.cat(
            (self.user_embedding(user_ids), self.business_embedding(business_ids)),
            dim=1,
        )
        return self.mlp(pair)

class ReviewAspectModule(nn.Module):
    """MLP encoder for the per-review sentiment vector.

    `aspect_mlp_dims` lists the hidden widths; each Linear layer is followed by
    a ReLU. `output_dim` is the last width, or `sentiment_vector_dim` when no
    layers are requested (the MLP is then an empty Sequential).
    """

    def __init__(self, sentiment_vector_dim, aspect_mlp_dims):
        super().__init__()
        widths = [sentiment_vector_dim, *aspect_mlp_dims]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        self.mlp = nn.Sequential(*stages)
        self.output_dim = widths[-1]

    def forward(self, sentiment_vectors):
        return self.mlp(sentiment_vectors)

class AATRec(nn.Module):
    """Rating regressor that fuses user/business interaction and review-aspect features.

    The model runs `CustomerRestaurantInteractionModule` on the ID pair and
    `ReviewAspectModule` on the sentiment vector, concatenates both outputs and
    feeds them through a prediction MLP ending in a single linear unit.

    Args:
        num_users: number of rows in the user embedding table.
        num_businesses: number of rows in the business embedding table.
        embedding_dim: width of each ID embedding.
        user_biz_mlp_dims: hidden widths of the interaction MLP.
        aspect_mlp_dims: hidden widths of the aspect MLP.
        final_mlp_dims: hidden widths of the prediction MLP.
        sentiment_vector_dim: length of each sentiment vector.
    """

    def __init__(self, num_users, num_businesses, embedding_dim,
                 user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                 sentiment_vector_dim):
        super().__init__()
        self.customer_restaurant_interaction_module = CustomerRestaurantInteractionModule(
            num_users, num_businesses, embedding_dim, user_biz_mlp_dims
        )
        self.review_aspect_module = ReviewAspectModule(
            sentiment_vector_dim, aspect_mlp_dims
        )

        # The prediction MLP consumes both feature blocks side by side.
        final_input_dim = (self.customer_restaurant_interaction_module.output_dim
                           + self.review_aspect_module.output_dim)

        layers = []
        input_dim = final_input_dim
        for dim in final_mlp_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim
        layers.append(nn.Linear(input_dim, 1))  # Final output is rating (1-dimensional)
        self.prediction_mlp = nn.Sequential(*layers)

    def forward(self, user_ids, business_ids, sentiment_vectors):
        """Return one predicted rating per sample as a 1-D tensor of length batch."""
        user_biz_features = self.customer_restaurant_interaction_module(user_ids, business_ids)
        aspect_features = self.review_aspect_module(sentiment_vectors)
        combined_features = torch.cat((user_biz_features, aspect_features), dim=1)
        predicted_rating = self.prediction_mlp(combined_features)
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a batch of one, yielding a 0-dim tensor whose
        # .tolist() is a float (breaking list.extend) and whose shape no longer
        # matches the (1,) target passed to the loss.
        return predicted_rating.squeeze(-1)

# --- 5. Device Configuration (GPU Setup) ---
# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 6. Dataset and DataLoader Creation ---
# One ReviewDataset per split; the DataLoaders are built further down once the
# batch size is known.
train_dataset = ReviewDataset(train_df)
val_dataset = ReviewDataset(val_df)
test_dataset = ReviewDataset(test_df)

# --- 7. Apply the Given Best Parameters ---
# Previously, these were found via grid search. Now, we explicitly set them.
best_params = {
    'aspect_mlp_hidden_dims': [64, 32],
    'batch_size': 128,
    'embedding_dim': 64,
    'final_mlp_hidden_dims': [32, 16],
    'learning_rate': 0.001,
    'user_biz_mlp_hidden_dims': [128, 64]
}

print("\n" + "="*50)
print(f"Applying pre-selected Best Parameters: {best_params}")
print("="*50)

# --- 8. Final Model Training and Testing (Using Best Parameters) ---
# Unpack the hyper-parameters into individually named variables.
final_embedding_dim = best_params['embedding_dim']
final_learning_rate = best_params['learning_rate']
final_batch_size = best_params['batch_size']
final_user_biz_mlp_dims = best_params['user_biz_mlp_hidden_dims']
final_aspect_mlp_dims = best_params['aspect_mlp_hidden_dims']
final_final_mlp_dims = best_params['final_mlp_hidden_dims']

final_model = AATRec(num_users, num_businesses, final_embedding_dim,
                     final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims,
                     sentiment_vector_dim).to(device) # Move final model to device

# MSE is the training objective; RMSE on the validation split drives early stopping.
final_criterion = nn.MSELoss()
final_optimizer = optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Only the training loader shuffles; validation/test keep a fixed order.
final_train_loader = DataLoader(train_dataset, batch_size=final_batch_size, shuffle=True)
final_val_loader = DataLoader(val_dataset, batch_size=final_batch_size, shuffle=False)
final_test_loader = DataLoader(test_dataset, batch_size=final_batch_size, shuffle=False)

final_epochs = 50 # Ample epochs for final training
final_patience = 10 # More patience for final training
final_min_delta = 0.0005 # Stricter improvement criterion

# Early-stopping state and the path of the best-so-far checkpoint.
best_final_val_rmse = float('inf')
epochs_no_improve_final = 0
final_model_path = 'final_best_aat_rec_model.pt'

print("\n--- Training Final Model with Best Parameters ---")
for epoch in range(final_epochs):
    # Training phase
    final_model.train()
    total_train_loss = 0  # sum of per-batch loss values; averaged over batches when printed
    for user_ids, business_ids, sentiment_vectors, stars in final_train_loader:
        # Move the whole batch to the model's device.
        user_ids, business_ids, sentiment_vectors, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

        final_optimizer.zero_grad()
        predictions = final_model(user_ids, business_ids, sentiment_vectors)
        loss = final_criterion(predictions, stars)
        loss.backward()
        final_optimizer.step()
        total_train_loss += loss.item()

    # Validation phase
    final_model.eval()
    val_predictions = []
    val_true_ratings = []
    with torch.no_grad():
        for user_ids, business_ids, sentiment_vectors, stars in final_val_loader:
            user_ids, business_ids, sentiment_vectors, stars = \
                user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

            predictions = final_model(user_ids, business_ids, sentiment_vectors)
            # Collect as plain Python floats so sklearn can score the full split at once.
            val_predictions.extend(predictions.cpu().tolist())
            val_true_ratings.extend(stars.cpu().tolist())

    current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

    print(f"Final Train Epoch {epoch+1}/{final_epochs}, "
          f"Train Loss: {total_train_loss / len(final_train_loader):.4f}, "
          f"Val RMSE: {current_val_rmse:.4f}")

    # Early stopping logic for final model:
    # an epoch counts as an improvement only if the validation RMSE beats the
    # best value so far by more than final_min_delta; the weights are saved then.
    if current_val_rmse < best_final_val_rmse - final_min_delta:
        best_final_val_rmse = current_val_rmse
        epochs_no_improve_final = 0
        torch.save(final_model.state_dict(), final_model_path)
        print(f"  --> RMSE improved. Model saved: {best_final_val_rmse:.4f}")
    else:
        epochs_no_improve_final += 1
        print(f"  --> RMSE not improved. ({epochs_no_improve_final}/{final_patience})")
        # Stop after final_patience consecutive epochs without improvement.
        if epochs_no_improve_final == final_patience:
            print(f"Early stopping - No validation RMSE improvement for {final_patience} epochs.")
            break

# --- 9. Final Model Testing ---
print("\n--- Evaluating Final Model on Test Set ---")
# Restore the checkpoint written at the best validation RMSE; if the file is
# missing, fall back to the weights the model has at the end of training.
# NOTE(review): a checkpoint left over from an earlier run with the same file
# name would also satisfy this check - confirm the file is from this run.
if os.path.exists(final_model_path):
    final_model.load_state_dict(torch.load(final_model_path))
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

final_model.eval()
test_predictions = []
true_ratings = []

# Predict the whole test split without tracking gradients.
with torch.no_grad():
    for user_ids, business_ids, sentiment_vectors, stars in final_test_loader:
        user_ids, business_ids, sentiment_vectors, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), stars.to(device)

        predictions = final_model(user_ids, business_ids, sentiment_vectors)
        test_predictions.extend(predictions.cpu().tolist())
        true_ratings.extend(stars.cpu().tolist())

# Regression metrics over the full test split.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Clean up temporary models directory if it was created by the original script
if os.path.exists('temp_models'):
    import shutil
    shutil.rmtree('temp_models')
    print("\nCleaned up 'temp_models' directory.")


전체 데이터 수: 451185
학습 데이터 수: 315829 (70.00%)
검증 데이터 수: 45119 (10.00%)
테스트 데이터 수: 90237 (20.00%)
Using device: cuda

Applying pre-selected Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters ---
Final Train Epoch 1/50, Train Loss: 0.7284, Val RMSE: 0.7046
  --> RMSE improved. Model saved: 0.7046
Final Train Epoch 2/50, Train Loss: 0.4770, Val RMSE: 0.6902
  --> RMSE improved. Model saved: 0.6902
Final Train Epoch 3/50, Train Loss: 0.4411, Val RMSE: 0.6864
  --> RMSE improved. Model saved: 0.6864
Final Train Epoch 4/50, Train Loss: 0.4118, Val RMSE: 0.7170
  --> RMSE not improved. (1/10)
Final Train Epoch 5/50, Train Loss: 0.3887, Val RMSE: 0.6884
  --> RMSE not improved. (2/10)
Final Train Epoch 6/50, Train Loss: 0.3658, Val RMSE: 0.7025
  --> RMSE not improved. (3/10)
Final Train Epoch 7/50, Train Loss: 

In [1]:
import pandas as pd # 여전히 LabelEncoder, DataFrame 초기화 등에 필요하지만, 대용량 파일 로드에는 사용하지 않음
import numpy as np
import torch
import torch.nn as nn
import json # JSONL 파일 처리를 위해 필요
import os

# --- 1. Dimensionality Reducer MLP Definition ---
class DimensionalityReducerMLP(nn.Module):
    """
    MLP that shrinks an input vector in three steps:
    input_dim -> 512 -> 128 -> output_dim (3072 -> 512 -> 128 -> 32 by default).
    """

    def __init__(self, input_dim=3072, output_dim=32):
        super().__init__()
        # Three linear stages; only the outer widths are configurable.
        self.layer1 = nn.Linear(input_dim, 512)
        self.layer2 = nn.Linear(512, 128)
        self.layer3 = nn.Linear(128, output_dim)

        # Shared activation for the two hidden stages.
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden_512 = self.relu(self.layer1(x))
        hidden_128 = self.relu(self.layer2(hidden_512))
        # No activation on the final projection.
        return self.layer3(hidden_128)

# --- 2. Configuration ---
# Large JSONL file that holds the original 3072-dimensional embeddings
input_file_path = 'review_business_5up_with_embedded_vector.jsonl'
# New JSONL file that receives the embeddings reduced to 32 dimensions
output_file_path = 'review_business_5up_with_reduced_embedding.jsonl'

original_embedding_dim = 3072
reduced_embedding_dim = 32
processing_batch_size = 1024 # Records processed per batch (tune to available memory)
# Seed for the reducer's weight initialisation. The reducer is never trained in
# this script, so its random weights fully determine the output file; fixing
# the seed makes repeated runs write the same reduced embeddings.
reducer_seed = 42

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 3. Initialize MLP Model ---
# NOTE: no weights are loaded and no training happens below - the network acts
# as a fixed random non-linear projection of the embeddings.
torch.manual_seed(reducer_seed)
reducer_mlp = DimensionalityReducerMLP(input_dim=original_embedding_dim, output_dim=reduced_embedding_dim).to(device)
reducer_mlp.eval() # Inference mode (would disable dropout/batchnorm if the model had any)

# --- 4. Process Embeddings in Chunks and Save ---
print(f"Processing embeddings from '{input_file_path}' and saving to '{output_file_path}'...")
print(f"Reducing from {original_embedding_dim} to {reduced_embedding_dim} dimensions in batches of {processing_batch_size}.")

# Counting the total number of lines up front was removed (tqdm is no longer used).
# If that counting step takes too long, total=None can be used instead to show only approximate progress.
# try:
#     print("Counting total lines in the input file... (This might take a while for large files)")
#     with open(input_file_path, 'r', encoding='utf-8') as f:
#         total_lines = sum(1 for line in f)
#     print(f"Total lines found: {total_lines}")
# except FileNotFoundError:
#     print(f"Error: Input file not found at {input_file_path}. Please check the path.")
#     exit()
# except Exception as e:
#     print(f"Error counting lines in input file: {e}")
#     total_lines = None # 총 라인 수를 알 수 없으면 None으로 설정하여 tqdm이 무한 진행률로 표시


def _flush_batch(batch_records, batch_vectors, outfile):
    """Reduce one batch of embeddings and append its records to `outfile` as JSONL.

    `batch_vectors[i]` is the validated float32 embedding of `batch_records[i]`.
    Each record dict is modified in place: 'reduced_embedding' is added and the
    original 'embedding' is removed before the record is serialised.
    """
    embeddings_to_process = torch.from_numpy(np.stack(batch_vectors)).to(device)

    with torch.no_grad():
        reduced_embeddings = reducer_mlp(embeddings_to_process).cpu().tolist()

    for d, reduced in zip(batch_records, reduced_embeddings):
        d['reduced_embedding'] = reduced
        del d['embedding']
        outfile.write(json.dumps(d, ensure_ascii=False) + '\n')


# Open the output file in 'w' mode so it is created fresh or overwritten.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
     open(output_file_path, 'w', encoding='utf-8') as outfile:

    batch_records = []  # original records of the current batch
    batch_vectors = []  # their validated embeddings, in the same order
    line_count = 0      # number of input lines read so far
    skipped_count = 0   # number of input lines that were not written
    dim_hint_shown = False

    for line in infile:
        line_count += 1

        # Only JSON parsing is guarded here: every other per-record problem is
        # checked explicitly below, so one bad record can never leave a
        # half-built batch behind that fails again on every following line.
        try:
            record = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Skipping malformed JSON line (line {line_count}): {line.strip()} - Error: {e}")
            skipped_count += 1
            continue

        # The record must be an object that carries an 'embedding' key
        if not isinstance(record, dict) or 'embedding' not in record:
            print(f"Warning: 'embedding' key not found in record (line {line_count}), skipping. Line: {line.strip()}")
            skipped_count += 1
            continue
        embedding = record['embedding']
        if not isinstance(embedding, list) or len(embedding) == 0:
            print(f"Warning: 'embedding' is not a valid list or is empty (line {line_count}), skipping. Line: {line.strip()}")
            skipped_count += 1
            continue

        # Convert now so non-numeric or nested values are rejected per record
        # instead of failing later for the whole batch.
        try:
            vector = np.asarray(embedding, dtype=np.float32)
        except (TypeError, ValueError) as e:
            print(f"Warning: 'embedding' could not be converted to floats (line {line_count}), skipping. Error: {e}")
            skipped_count += 1
            continue

        # Check the dimension of every record, not only the first one: a
        # single vector of another length would make the batch unstackable.
        if vector.shape != (original_embedding_dim,):
            print(f"Warning: Expected embedding dimension is {original_embedding_dim}, but found shape {vector.shape} (line {line_count}), skipping.")
            if not dim_hint_shown:
                print("Please ensure 'original_embedding_dim' matches your actual embedding dimension.")
                dim_hint_shown = True
            skipped_count += 1
            continue

        batch_records.append(record)
        batch_vectors.append(vector)

        if len(batch_records) >= processing_batch_size:
            _flush_batch(batch_records, batch_vectors, outfile)
            batch_records = []  # reset for the next batch
            batch_vectors = []
            print(f"Processed {line_count} lines...") # progress report

    # Handle the remaining records (last, partially filled batch)
    if batch_records:
        _flush_batch(batch_records, batch_vectors, outfile)
    print(f"Finished processing all {line_count} lines.")
    if skipped_count:
        print(f"Skipped {skipped_count} lines that could not be processed.")

print(f"Embedding reduction and saving complete. Processed data saved to '{output_file_path}'")

Using device: cuda
Processing embeddings from 'review_business_5up_with_embedded_vector.jsonl' and saving to 'review_business_5up_with_reduced_embedding.jsonl'...
Reducing from 3072 to 32 dimensions in batches of 1024.
Processed 1024 lines...
Processed 2048 lines...
Processed 3072 lines...
Processed 4096 lines...
Processed 5120 lines...
Processed 6144 lines...
Processed 7168 lines...
Processed 8192 lines...
Processed 9216 lines...
Processed 10240 lines...
Processed 11264 lines...
Processed 12288 lines...
Processed 13312 lines...
Processed 14336 lines...
Processed 15360 lines...
Processed 16384 lines...
Processed 17408 lines...
Processed 18432 lines...
Processed 19456 lines...
Processed 20480 lines...
Processed 21504 lines...
Processed 22528 lines...
Processed 23552 lines...
Processed 24576 lines...
Processed 25600 lines...
Processed 26624 lines...
Processed 27648 lines...
Processed 28672 lines...
Processed 29696 lines...
Processed 30720 lines...
Processed 31744 lines...
Processed 32768

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os

# --- 1. Utility Functions ---

# MAPE helper that avoids division by zero by ignoring zero-valued targets.
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Entries whose true value is exactly 0 are excluded from the average.
    If no non-zero true value exists, 0.0 is returned.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    usable = actual != 0
    if np.sum(usable) == 0:
        # No usable target at all: report 0 rather than dividing by zero.
        return 0.0
    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return np.mean(relative_error) * 100

# --- 2. Data Loading and Preprocessing ---

# Load the first file: the original file that contains sentiment_vector
# IMPORTANT: Adjust this path to where your JSON file is located on your local machine.
print("Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...")
df_sentiment = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)
print(f"Sentiment data loaded. Total records: {len(df_sentiment)}")

# Load the second file: the file that contains reduced_embedding
# IMPORTANT: Adjust this path to where your JSONL file is located on your local machine.
print("Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...")
df_reduced_emb = pd.read_json('review_business_5up_with_reduced_embedding.jsonl', lines=True)
print(f"Reduced embedding data loaded. Total records: {len(df_reduced_emb)}")

# Merge the two DataFrames
# The merge key is review_id, which is assumed to be a unique ID per review.
# Only the columns that are needed are selected before merging.
# NOTE(review): uniqueness of review_id is not checked here; duplicates on
# either side would multiply rows in the merged frame - confirm upstream.
df_processed = pd.merge(
    df_sentiment[['review_id', 'user_id', 'business_id', 'stars', 'sentiment_vector']],
    df_reduced_emb[['review_id', 'reduced_embedding']],
    on='review_id',
    how='inner' # keep only reviews present on both sides
)
print(f"Data merged successfully. Total processed records: {len(df_processed)}")


# Encode user_id and business_id as contiguous integer IDs.
# The encoders are fitted on the full merged frame (before the split).
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

# Sizes of the two embedding tables.
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# Data split
# Split into 70/10/20 (train/validation/test), the ratio given in the paper.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 # 10% of total data (1/8 of 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

# Report the absolute and relative size of each split.
print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# Embedding dimensions (read from the first row; the literals are fallbacks for an empty frame)
original_sentiment_dim = len(df_processed['sentiment_vector'].iloc[0]) if not df_processed.empty else 15 # the existing 15-dimensional sentiment vector
loaded_embedding_dim = len(df_processed['reduced_embedding'].iloc[0]) if not df_processed.empty else 32 # embedding already reduced to 32 dimensions

# --- 3. PyTorch Dataset and DataLoader Definition ---
class ReviewDataset(Dataset):
    """In-memory dataset of (user id, business id, sentiment vector, embedding, stars).

    All columns of the given DataFrame are converted to tensors once, at
    construction time; indexing then just slices those tensors.
    """

    def __init__(self, df):
        # Stack the per-row lists into 2-D arrays before conversion.
        stacked_sentiment = np.array(df['sentiment_vector'].tolist())
        # The 'reduced_embedding' column supplies the text embedding.
        stacked_embedding = np.array(df['reduced_embedding'].tolist())

        self.user_ids = torch.tensor(df['user_encoded'].values, dtype=torch.long)
        self.business_ids = torch.tensor(df['business_encoded'].values, dtype=torch.long)
        self.sentiment_vectors = torch.tensor(stacked_sentiment, dtype=torch.float)
        self.embeddings = torch.tensor(stacked_embedding, dtype=torch.float)
        self.stars = torch.tensor(df['stars'].values, dtype=torch.float)

    def __len__(self):
        # One star rating per sample, so its length is the dataset size.
        return len(self.stars)

    def __getitem__(self, idx):
        # The embedding sits between the sentiment vector and the target.
        sample = (
            self.user_ids[idx],
            self.business_ids[idx],
            self.sentiment_vectors[idx],
            self.embeddings[idx],
            self.stars[idx],
        )
        return sample

# --- 4. Model Architecture Definition ---
class CustomerRestaurantInteractionModule(nn.Module):
    """Embeds a (user, business) pair and passes the concatenation through an MLP.

    `mlp_dims` lists the hidden widths; each Linear layer is followed by a ReLU.
    `output_dim` is the last width, or `2 * embedding_dim` when `mlp_dims` is
    empty (the MLP is then an empty Sequential).
    """

    def __init__(self, num_users, num_businesses, embedding_dim, mlp_dims):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)

        # Width of every stage, starting with the concatenated embedding pair.
        widths = [embedding_dim * 2, *mlp_dims]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        self.mlp = nn.Sequential(*stages)
        self.output_dim = widths[-1]

    def forward(self, user_ids, business_ids):
        pair = torch.cat(
            (self.user_embedding(user_ids), self.business_embedding(business_ids)),
            dim=1,
        )
        return self.mlp(pair)

class ReviewAspectModule(nn.Module):
    """MLP over the sentiment vector concatenated with a pre-reduced embedding.

    The module takes `loaded_embedding_dim` (the width of the embedding as
    loaded) and performs no dimensionality reduction of its own. Each Linear
    layer in `aspect_mlp_dims` is followed by a ReLU. `output_dim` is the last
    width, or the concatenated input width when no layers are requested.
    """

    def __init__(self, original_sentiment_dim, loaded_embedding_dim, aspect_mlp_dims):
        super().__init__()
        # Input width = sentiment width + embedding width (e.g. 15 + 32 = 47).
        widths = [original_sentiment_dim + loaded_embedding_dim, *aspect_mlp_dims]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        self.mlp = nn.Sequential(*stages)
        self.output_dim = widths[-1]

    def forward(self, sentiment_vectors, embeddings):
        # The embeddings are used as given: join them directly to the sentiment vectors.
        joined = torch.cat((sentiment_vectors, embeddings), dim=1)
        return self.mlp(joined)

class AATRec(nn.Module):
    """Rating regressor that fuses user/business interaction and review features.

    The model runs `CustomerRestaurantInteractionModule` on the ID pair and
    `ReviewAspectModule` on the sentiment vector plus the pre-reduced embedding,
    concatenates both outputs and feeds them through a prediction MLP ending in
    a single linear unit.

    Args:
        num_users: number of rows in the user embedding table.
        num_businesses: number of rows in the business embedding table.
        embedding_dim: width of each ID embedding.
        user_biz_mlp_dims: hidden widths of the interaction MLP.
        aspect_mlp_dims: hidden widths of the aspect MLP.
        final_mlp_dims: hidden widths of the prediction MLP.
        original_sentiment_dim: length of each sentiment vector.
        loaded_embedding_dim: length of each pre-reduced embedding.
    """

    def __init__(self, num_users, num_businesses, embedding_dim,
                 user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                 original_sentiment_dim, loaded_embedding_dim):
        super().__init__()
        self.customer_restaurant_interaction_module = CustomerRestaurantInteractionModule(
            num_users, num_businesses, embedding_dim, user_biz_mlp_dims
        )
        self.review_aspect_module = ReviewAspectModule(
            original_sentiment_dim, loaded_embedding_dim, aspect_mlp_dims
        )

        # The prediction MLP consumes both feature blocks side by side.
        final_input_dim = (self.customer_restaurant_interaction_module.output_dim
                           + self.review_aspect_module.output_dim)

        layers = []
        input_dim = final_input_dim
        for dim in final_mlp_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim
        layers.append(nn.Linear(input_dim, 1))  # Final output is rating (1-dimensional)
        self.prediction_mlp = nn.Sequential(*layers)

    def forward(self, user_ids, business_ids, sentiment_vectors, embeddings):
        """Return one predicted rating per sample as a 1-D tensor of length batch."""
        user_biz_features = self.customer_restaurant_interaction_module(user_ids, business_ids)
        # Both review-side inputs go to the aspect module together.
        aspect_features = self.review_aspect_module(sentiment_vectors, embeddings)
        combined_features = torch.cat((user_biz_features, aspect_features), dim=1)
        predicted_rating = self.prediction_mlp(combined_features)
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a batch of one, yielding a 0-dim tensor whose
        # .tolist() is a float (breaking list.extend) and whose shape no longer
        # matches the (1,) target passed to the loss.
        return predicted_rating.squeeze(-1)

# --- 5. Device Configuration (GPU Setup) ---
# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 6. Dataset and DataLoader Creation ---
# One ReviewDataset per split; the DataLoaders are built further down once the
# batch size is known.
train_dataset = ReviewDataset(train_df)
val_dataset = ReviewDataset(val_df)
test_dataset = ReviewDataset(test_df)

# --- 7. Apply the Given Best Parameters ---
# Previously, these were found via grid search. Now, we explicitly set them.
best_params = {
    'aspect_mlp_hidden_dims': [64, 32],
    'batch_size': 128,
    'embedding_dim': 64,
    'final_mlp_hidden_dims': [32, 16],
    'learning_rate': 0.001,
    'user_biz_mlp_hidden_dims': [128, 64]
}

print("\n" + "="*50)
print(f"Applying pre-selected Best Parameters: {best_params}")
print("="*50)

# --- 8. Final Model Training and Testing (Using Best Parameters) ---
# Unpack the hyper-parameters into individually named variables.
final_embedding_dim = best_params['embedding_dim']
final_learning_rate = best_params['learning_rate']
final_batch_size = best_params['batch_size']
final_user_biz_mlp_dims = best_params['user_biz_mlp_hidden_dims']
final_aspect_mlp_dims = best_params['aspect_mlp_hidden_dims']
final_final_mlp_dims = best_params['final_mlp_hidden_dims']

# Pass the changed embedding-dimension arguments when initialising AATRec
final_model = AATRec(num_users, num_businesses, final_embedding_dim,
                     final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims,
                     original_sentiment_dim, loaded_embedding_dim).to(device) # pass loaded_embedding_dim

# MSE is the training objective; RMSE on the validation split drives early stopping.
final_criterion = nn.MSELoss()
final_optimizer = optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Only the training loader shuffles; validation/test keep a fixed order.
final_train_loader = DataLoader(train_dataset, batch_size=final_batch_size, shuffle=True)
final_val_loader = DataLoader(val_dataset, batch_size=final_batch_size, shuffle=False)
final_test_loader = DataLoader(test_dataset, batch_size=final_batch_size, shuffle=False)

final_epochs = 50 # Ample epochs for final training
final_patience = 10 # More patience for final training
final_min_delta = 0.0005 # Stricter improvement criterion

# Early-stopping state.
best_final_val_rmse = float('inf')
epochs_no_improve_final = 0
# Checkpoint path changed (to tell it apart from the earlier model that handled 3072-dim embeddings)
final_model_path = 'final_best_aat_rec_model_with_pre_reduced_embedding.pt'

print("\n--- Training Final Model with Best Parameters ---")
for epoch in range(final_epochs):
    # Training phase
    final_model.train()
    total_train_loss = 0  # sum of per-batch loss values; averaged over batches when printed
    for user_ids, business_ids, sentiment_vectors, embeddings, stars in final_train_loader:
        # Move the whole batch to the model's device.
        user_ids, business_ids, sentiment_vectors, embeddings, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), \
            embeddings.to(device), stars.to(device) # embeddings are already 32-dimensional

        final_optimizer.zero_grad()
        predictions = final_model(user_ids, business_ids, sentiment_vectors, embeddings)
        loss = final_criterion(predictions, stars)
        loss.backward()
        final_optimizer.step()
        total_train_loss += loss.item()

    # Validation phase
    final_model.eval()
    val_predictions = []
    val_true_ratings = []
    with torch.no_grad():
        for user_ids, business_ids, sentiment_vectors, embeddings, stars in final_val_loader:
            user_ids, business_ids, sentiment_vectors, embeddings, stars = \
                user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), \
                embeddings.to(device), stars.to(device) # embeddings are already 32-dimensional

            predictions = final_model(user_ids, business_ids, sentiment_vectors, embeddings)
            # Collect as plain Python floats so sklearn can score the full split at once.
            val_predictions.extend(predictions.cpu().tolist())
            val_true_ratings.extend(stars.cpu().tolist())

    current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

    print(f"Epoch {epoch+1}/{final_epochs}, "
          f"Train Loss (Avg): {total_train_loss / len(final_train_loader):.4f}, "
          f"Val RMSE: {current_val_rmse:.4f}")

    # Early stopping logic for final model:
    # an epoch counts as an improvement only if the validation RMSE beats the
    # best value so far by more than final_min_delta; the weights are saved then.
    if current_val_rmse < best_final_val_rmse - final_min_delta:
        best_final_val_rmse = current_val_rmse
        epochs_no_improve_final = 0
        torch.save(final_model.state_dict(), final_model_path)
        print(f"  --> RMSE improved. Model saved: {best_final_val_rmse:.4f}")
    else:
        epochs_no_improve_final += 1
        print(f"  --> RMSE not improved. ({epochs_no_improve_final}/{final_patience})")
        # Stop after final_patience consecutive epochs without improvement.
        if epochs_no_improve_final == final_patience:
            print(f"Early stopping - No validation RMSE improvement for {final_patience} epochs.")
            break

# --- 9. Final Model Testing ---
print("\n--- Evaluating Final Model on Test Set ---")
# Restore the checkpoint written at the best validation RMSE; if the file is
# missing, fall back to the weights the model has at the end of training.
# NOTE(review): a checkpoint left over from an earlier run with the same file
# name would also satisfy this check - confirm the file is from this run.
if os.path.exists(final_model_path):
    final_model.load_state_dict(torch.load(final_model_path))
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

final_model.eval()
test_predictions = []
true_ratings = []

# Report test progress
print("  Starting test set evaluation...")
with torch.no_grad():
    for user_ids, business_ids, sentiment_vectors, embeddings, stars in final_test_loader:
        user_ids, business_ids, sentiment_vectors, embeddings, stars = \
            user_ids.to(device), business_ids.to(device), sentiment_vectors.to(device), \
            embeddings.to(device), stars.to(device) # embeddings are already 32-dimensional

        predictions = final_model(user_ids, business_ids, sentiment_vectors, embeddings)
        test_predictions.extend(predictions.cpu().tolist())
        true_ratings.extend(stars.cpu().tolist())

print("  Test set evaluation completed.")

# Regression metrics over the full test split.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Clean up temporary models directory if it was created by the original script
if os.path.exists('temp_models'):
    import shutil
    shutil.rmtree('temp_models')
    print("\nCleaned up 'temp_models' directory.")

Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...
Sentiment data loaded. Total records: 447796
Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...
Reduced embedding data loaded. Total records: 447796
Data merged successfully. Total processed records: 447796
전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)
Using device: cuda

Applying pre-selected Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters ---
Epoch 1/50, Train Loss (Avg): 0.6682, Val RMSE: 0.7050
  --> RMSE improved. Model saved: 0.7050
Epoch 2/50, Train Loss (Avg): 0.4715, Val RMSE: 0.7007
  --> RMSE improved. Model saved: 0.7007
Epoch 3/50, Train Loss (Avg): 0.4326, Val RMSE: 0.6832
  --> RMSE improved. Model saved: 

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os

# --- 1. Utility Functions ---

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Entries whose true value is zero are left out of the average so the
    relative error never divides by zero. When every true value is zero
    (or the input is empty) the function returns ``0.0``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Boolean mask of the positions that can safely be used as a divisor.
    usable = actual != 0
    if not np.sum(usable):
        return 0.0

    actual, predicted = actual[usable], predicted[usable]
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

# --- 2. Data Loading and Preprocessing ---

# Load the first file. It also carries a sentiment_vector column, but that
# column is not used in this experiment: only review_id, user_id, business_id
# and stars are taken from it in the merge below.
# IMPORTANT: Adjust this path to where your JSON file is located on your local machine.
print("Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json' (for review_id only)...")
df_sentiment = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)
print(f"Sentiment data loaded. Total records: {len(df_sentiment)}")

# Load the second file, which holds the reduced_embedding column.
# IMPORTANT: Adjust this path to where your JSONL file is located on your local machine.
print("Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...")
df_reduced_emb = pd.read_json('review_business_5up_with_reduced_embedding.jsonl', lines=True)
print(f"Reduced embedding data loaded. Total records: {len(df_reduced_emb)}")

# Join the two frames on review_id, leaving sentiment_vector out of the result.
# The join assumes review_id identifies exactly one review in each file.
# validate='one_to_one' enforces that assumption: pandas raises
# pandas.errors.MergeError instead of silently multiplying rows (which could
# also place copies of one review in both the training and the test split).
df_processed = pd.merge(
    df_sentiment[['review_id', 'user_id', 'business_id', 'stars']],  # sentiment_vector excluded
    df_reduced_emb[['review_id', 'reduced_embedding']],
    on='review_id',
    how='inner',  # keep only reviews present in both files
    validate='one_to_one',
)
print(f"Data merged successfully. Total processed records: {len(df_processed)}")

# Encode user_id and business_id as consecutive integer ids.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# Data split.
# 70/10/20 train/validation/test ratio, as proposed in the paper.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 # 10% of total data (1/8 of 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# Embedding width. original_sentiment_dim is no longer used.
# The width is read from the first row; 32 is the fallback for an empty frame
# (the original note states the embeddings were already reduced to 32 dimensions).
loaded_embedding_dim = len(df_processed['reduced_embedding'].iloc[0]) if not df_processed.empty else 32

# --- 3. PyTorch Dataset and DataLoader Definition ---
class ReviewDataset(Dataset):
    """Dataset of (user id, business id, reduced embedding, stars) tensors.

    All four columns of ``df`` are converted to tensors once, in the
    constructor. Sentiment vectors are not part of this dataset.
    """

    def __init__(self, df):
        def column(name, dtype):
            # Convert one plain DataFrame column into a tensor of `dtype`.
            return torch.tensor(df[name].values, dtype=dtype)

        self.user_ids = column('user_encoded', torch.long)
        self.business_ids = column('business_encoded', torch.long)
        # Each cell holds a list of floats; stack the rows into one 2-D array first.
        embedding_matrix = np.array(df['reduced_embedding'].tolist())
        self.embeddings = torch.tensor(embedding_matrix, dtype=torch.float)
        self.stars = column('stars', torch.float)

    def __len__(self):
        return self.stars.size(0)

    def __getitem__(self, idx):
        # Only the embedding is returned as review content (no sentiment vector).
        fields = (self.user_ids, self.business_ids, self.embeddings, self.stars)
        return tuple(field[idx] for field in fields)

# --- 4. Model Architecture Definition ---
class CustomerRestaurantInteractionModule(nn.Module):
    """Embed a (user, business) id pair and feed the joined embeddings to an MLP.

    Args:
        num_users: Number of rows in the user embedding table.
        num_businesses: Number of rows in the business embedding table.
        embedding_dim: Width of each of the two embeddings.
        mlp_dims: Output width of every Linear+ReLU stage, in order. An empty
            sequence leaves the concatenated embeddings unchanged.

    Attributes:
        output_dim: Width of the tensor returned by ``forward``.
    """

    def __init__(self, num_users, num_businesses, embedding_dim, mlp_dims):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)

        # widths[i] -> widths[i + 1] describes the i-th Linear layer; the first
        # entry is the size of the concatenated user/business embedding.
        widths = [embedding_dim * 2, *mlp_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        self.mlp = nn.Sequential(*stack)
        self.output_dim = widths[-1]

    def forward(self, user_ids, business_ids):
        # Concatenate along the feature axis, then apply the MLP.
        pair = torch.cat(
            (self.user_embedding(user_ids), self.business_embedding(business_ids)),
            dim=1,
        )
        return self.mlp(pair)

class ReviewAspectModule(nn.Module):
    """MLP applied to the pre-computed review embedding.

    The module no longer takes ``original_sentiment_dim``; its only input is
    the loaded embedding.

    Args:
        loaded_embedding_dim: Width of the incoming embedding, which is also
            the input width of the first Linear layer.
        aspect_mlp_dims: Output width of every Linear+ReLU stage, in order. An
            empty sequence makes the module return its input unchanged.

    Attributes:
        output_dim: Width of the tensor returned by ``forward``.
    """

    def __init__(self, loaded_embedding_dim, aspect_mlp_dims):
        super().__init__()

        stages = []
        previous_width = loaded_embedding_dim
        for width in aspect_mlp_dims:
            stages += [nn.Linear(previous_width, width), nn.ReLU()]
            previous_width = width
        self.mlp = nn.Sequential(*stages)
        # After the loop this is the last MLP width, or the input width when
        # no stage was requested.
        self.output_dim = previous_width

    def forward(self, embeddings):
        # The embeddings are used as they are; no sentiment vector is mixed in.
        return self.mlp(embeddings)

class AATRec(nn.Module):
    """Rating predictor combining user/business interaction and review features.

    The outputs of ``CustomerRestaurantInteractionModule`` and
    ``ReviewAspectModule`` are concatenated and passed through a final MLP
    that ends in a single-unit Linear layer (the predicted rating).

    Args:
        num_users: Number of distinct encoded users.
        num_businesses: Number of distinct encoded businesses.
        embedding_dim: Width of the user and business id embeddings.
        user_biz_mlp_dims: Hidden widths of the interaction MLP.
        aspect_mlp_dims: Hidden widths of the review-embedding MLP.
        final_mlp_dims: Hidden widths of the prediction MLP (before the
            final 1-unit layer).
        loaded_embedding_dim: Width of the pre-computed review embedding.
            (``original_sentiment_dim`` is no longer a parameter.)
    """

    def __init__(self, num_users, num_businesses, embedding_dim,
                 user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                 loaded_embedding_dim):
        super().__init__()
        self.customer_restaurant_interaction_module = CustomerRestaurantInteractionModule(
            num_users, num_businesses, embedding_dim, user_biz_mlp_dims
        )
        self.review_aspect_module = ReviewAspectModule(
            loaded_embedding_dim, aspect_mlp_dims
        )

        # The prediction MLP consumes both feature vectors side by side.
        final_input_dim = (
            self.customer_restaurant_interaction_module.output_dim
            + self.review_aspect_module.output_dim
        )

        layers = []
        input_dim = final_input_dim
        for dim in final_mlp_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim
        layers.append(nn.Linear(input_dim, 1))  # Final output is rating (1-dimensional)
        self.prediction_mlp = nn.Sequential(*layers)

    def forward(self, user_ids, business_ids, embeddings):
        """Return one predicted rating per sample, as a tensor of shape ``(batch,)``."""
        user_biz_features = self.customer_restaurant_interaction_module(user_ids, business_ids)
        # Only the embeddings go into the review module (no sentiment vector).
        aspect_features = self.review_aspect_module(embeddings)
        combined_features = torch.cat((user_biz_features, aspect_features), dim=1)
        predicted_rating = self.prediction_mlp(combined_features)
        # Drop only the trailing unit axis. A bare squeeze() would also remove
        # the batch axis for a batch of one, yielding a 0-dim tensor that
        # breaks `.tolist()`-based collection and broadcasts inside MSELoss.
        return predicted_rating.squeeze(-1)

# --- 5. Device Configuration (GPU Setup) ---
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- 6. Dataset and DataLoader Creation ---
# One dataset per split, built in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    ReviewDataset(split_df) for split_df in (train_df, val_df, test_df)
)

# --- 7. Apply the Given Best Parameters ---
# These values used to come out of a grid search; here they are fixed by hand.
best_params = {
    'aspect_mlp_hidden_dims': [64, 32],
    'batch_size': 128,
    'embedding_dim': 64,
    'final_mlp_hidden_dims': [32, 16],
    'learning_rate': 0.001,
    'user_biz_mlp_hidden_dims': [128, 64]
}

banner = "="*50
print("\n" + banner)
print(f"Applying pre-selected Best Parameters: {best_params}")
print(banner)

# --- 8. Final Model Training and Testing (Using Best Parameters) ---
# Scalar hyperparameters first, then the three MLP layouts.
final_embedding_dim, final_learning_rate, final_batch_size = (
    best_params['embedding_dim'],
    best_params['learning_rate'],
    best_params['batch_size'],
)
final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims = (
    best_params['user_biz_mlp_hidden_dims'],
    best_params['aspect_mlp_hidden_dims'],
    best_params['final_mlp_hidden_dims'],
)

# The model takes the loaded embedding width; original_sentiment_dim is gone.
final_model = AATRec(
    num_users=num_users,
    num_businesses=num_businesses,
    embedding_dim=final_embedding_dim,
    user_biz_mlp_dims=final_user_biz_mlp_dims,
    aspect_mlp_dims=final_aspect_mlp_dims,
    final_mlp_dims=final_final_mlp_dims,
    loaded_embedding_dim=loaded_embedding_dim,
)
final_model = final_model.to(device)

final_criterion = nn.MSELoss()
final_optimizer = optim.Adam(final_model.parameters(), lr=final_learning_rate)

# Only the training loader shuffles; validation and test keep dataset order.
final_train_loader = DataLoader(train_dataset, batch_size=final_batch_size, shuffle=True)
final_val_loader, final_test_loader = (
    DataLoader(eval_dataset, batch_size=final_batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# Early-stopping configuration and running state.
final_epochs = 50          # upper bound on training epochs
final_patience = 10        # epochs without improvement tolerated before stopping
final_min_delta = 0.0005   # minimum RMSE decrease that counts as an improvement
best_final_val_rmse = float('inf')
epochs_no_improve_final = 0

# Checkpoint file name; it marks the model as trained without the sentiment vector.
final_model_path = 'final_best_aat_rec_model_no_sentiment.pt'

print("\n--- Training Final Model with Best Parameters (Sentiment Vector Excluded) ---")
for epoch in range(final_epochs):
    # ---- Training pass ----
    final_model.train()
    total_train_loss = 0
    for batch in final_train_loader:
        # Each batch is (user ids, business ids, embeddings, stars); no sentiment vectors.
        user_ids, business_ids, embeddings, stars = (part.to(device) for part in batch)

        final_optimizer.zero_grad()
        predictions = final_model(user_ids, business_ids, embeddings)
        loss = final_criterion(predictions, stars)
        loss.backward()
        final_optimizer.step()
        total_train_loss += loss.item()

    # ---- Validation pass (no gradients) ----
    final_model.eval()
    val_predictions, val_true_ratings = [], []
    with torch.no_grad():
        for batch in final_val_loader:
            user_ids, business_ids, embeddings, stars = (part.to(device) for part in batch)

            predictions = final_model(user_ids, business_ids, embeddings)
            val_predictions += predictions.cpu().tolist()
            val_true_ratings += stars.cpu().tolist()

    current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

    print(f"Epoch {epoch+1}/{final_epochs}, "
          f"Train Loss (Avg): {total_train_loss / len(final_train_loader):.4f}, "
          f"Val RMSE: {current_val_rmse:.4f}")

    # ---- Early stopping ----
    # An epoch counts as an improvement only if it beats the best RMSE by more
    # than final_min_delta.
    improved = current_val_rmse < best_final_val_rmse - final_min_delta
    if not improved:
        epochs_no_improve_final += 1
        print(f"  --> RMSE not improved. ({epochs_no_improve_final}/{final_patience})")
        if epochs_no_improve_final == final_patience:
            print(f"Early stopping - No validation RMSE improvement for {final_patience} epochs.")
            break
        continue

    # New best: reset the patience counter and checkpoint the weights.
    best_final_val_rmse = current_val_rmse
    epochs_no_improve_final = 0
    torch.save(final_model.state_dict(), final_model_path)
    print(f"  --> RMSE improved. Model saved: {best_final_val_rmse:.4f}")

# --- 9. Final Model Testing ---
print("\n--- Evaluating Final Model on Test Set (Sentiment Vector Excluded) ---")
if os.path.exists(final_model_path):
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU still loads when this runs on a CPU-only
    # machine (the device is chosen at run time).
    final_model.load_state_dict(torch.load(final_model_path, map_location=device))
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

final_model.eval()
test_predictions = []
true_ratings = []

# Report test progress.
print("  Starting test set evaluation...")
with torch.no_grad():
    # Each batch is (user ids, business ids, embeddings, stars); no sentiment vectors.
    for user_ids, business_ids, embeddings, stars in final_test_loader:
        user_ids, business_ids, embeddings, stars = \
            user_ids.to(device), business_ids.to(device), \
            embeddings.to(device), stars.to(device)

        predictions = final_model(user_ids, business_ids, embeddings)
        # Move results back to the CPU and collect them as plain Python floats.
        test_predictions.extend(predictions.cpu().tolist())
        true_ratings.extend(stars.cpu().tolist())

print("  Test set evaluation completed.")

# Regression metrics over the whole test set; RMSE is derived from the MSE.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters, Sentiment Vector Excluded) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Remove the 'temp_models' scratch directory (which the original script may
# have created) when it is present in the working directory.
temp_models_dir = 'temp_models'
if os.path.exists(temp_models_dir):
    import shutil

    shutil.rmtree(temp_models_dir)
    print("\nCleaned up 'temp_models' directory.")

Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json' (for review_id only)...
Sentiment data loaded. Total records: 447796
Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...
Reduced embedding data loaded. Total records: 447796
Data merged successfully. Total processed records: 447796
전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)
Using device: cuda

Applying pre-selected Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters (Sentiment Vector Excluded) ---
Epoch 1/50, Train Loss (Avg): 1.4664, Val RMSE: 1.0917
  --> RMSE improved. Model saved: 1.0917
Epoch 2/50, Train Loss (Avg): 0.9607, Val RMSE: 0.9272
  --> RMSE improved. Model saved: 0.9272
Epoch 3/50, Train Loss (Avg): 0.7682, Va