In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import re
from collections import Counter
import random
import json # Import json for saving results

# --- 유틸리티 함수 ---
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculates MAPE, preventing division by zero.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_true = y_true != 0
    if np.sum(non_zero_true) == 0:
        return 0.0
    return np.mean(np.abs((y_true[non_zero_true] - y_pred[non_zero_true]) / y_true[non_zero_true])) * 100

# --- 장치 설정 (GPU 사용 가능 시) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 데이터 전처리 및 어휘 구축 ---
def preprocess_text(text):
    """
    Converts text to lowercase, keeps only alphabets, numbers, and spaces, then tokenizes.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

class Vocabulary:
    """
    Builds a vocabulary from text data and converts words to integer IDs.
    """
    def __init__(self, min_freq):
        self.stoi = {"<PAD>": 0, "<UNK>": 1} # string_to_int: Defines padding and unknown tokens
        self.itos = {0: "<PAD>", 1: "<UNK>"} # int_to_string
        self.freq = Counter()
        self.min_freq = min_freq
    
    def build_vocabulary(self, text_list):
        """
        Builds the vocabulary based on the given text list.
        Words appearing less than min_freq times are treated as <UNK> tokens.
        """
        for text in text_list:
            self.freq.update(text)
        
        idx = 2 # Start index after <PAD> and <UNK>
        for word, count in self.freq.items():
            if count >= self.min_freq:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
    
    def numericalize(self, text):
        """
        Converts text (list of words) to a sequence of integer IDs.
        """
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in text]

# --- 파일 로드 ---
try:
    df = pd.read_json('review.json', lines=True) # Use lines=True for JSONL format
except ValueError:
    print("Trying to read JSON without lines=True (assuming a single JSON object or array of objects).")
    df = pd.read_json('review.json') # Fallback for standard JSON

# --- 필요한 컬럼 추출 및 인코딩 ---
df_processed = df[['user_id', 'business_id', 'stars', 'text']].copy()

user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# --- 텍스트 전처리 및 어휘 구축 실행 ---
all_texts = df_processed['text'].apply(preprocess_text).tolist()
min_word_freq = 5 # Set minimum word frequency (adjustable)
vocab = Vocabulary(min_word_freq)
vocab.build_vocabulary(all_texts)
vocab_size = len(vocab.stoi)
print(f"Vocabulary size: {vocab_size}")

# Convert review text to integer ID sequences and apply padding/truncation
MAX_REVIEW_LEN = 100 # Maximum length of review text (adjustable)
df_processed.loc[:, 'numericalized_text'] = df_processed['text'].apply(vocab.numericalize)
df_processed['numericalized_text'] = df_processed['numericalized_text'].apply(
    lambda x: x[:MAX_REVIEW_LEN] if len(x) > MAX_REVIEW_LEN else x + [vocab.stoi["<PAD>"]] * (MAX_REVIEW_LEN - len(x))
)

# --- PyTorch Dataset 및 DataLoader 정의 ---
class AFRAMDataset(Dataset):
    """
    PyTorch Dataset class for AFRAM model training.
    Returns user ID, business ID, numericalized review text, and star rating.
    """
    def __init__(self, df):
        self.user_ids = torch.tensor(df['user_encoded'].values, dtype=torch.long)
        self.business_ids = torch.tensor(df['business_encoded'].values, dtype=torch.long)
        self.reviews = torch.tensor(np.array(df['numericalized_text'].tolist()), dtype=torch.long)
        self.stars = torch.tensor(df['stars'].values, dtype=torch.float)

    def __len__(self):
        return len(self.stars)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.business_ids[idx], self.reviews[idx], self.stars[idx]

# --- AFRAM 모델 아키텍처 정의 ---
class TextEncoderWithAttention(nn.Module):
    """
    Extracts features from review text using CNN, LSTM, and attention mechanism.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate):
        super(TextEncoderWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        
        self.attn_proj = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.v = nn.Parameter(torch.rand(hidden_dim * 2, 1))
        
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text_seq):
        embedded = self.embedding(text_seq)
        embedded = embedded.permute(0, 2, 1)
        
        conv_out = torch.relu(self.conv(embedded))
        conv_out = conv_out.permute(0, 2, 1)

        lstm_out, _ = self.lstm(self.dropout(conv_out))
        
        attn_weights = torch.tanh(self.attn_proj(lstm_out))
        v_expanded = self.v.unsqueeze(0).expand(attn_weights.shape[0], -1, -1)
        
        scores = torch.bmm(attn_weights, v_expanded)
        attention_weights = torch.softmax(scores, dim=1)
        
        context_vector = torch.sum(lstm_out * attention_weights, dim=1)
        
        return context_vector

class AFRAMModel(nn.Module):
    """
    Implements the full model structure from the AFRAM paper.
    Combines user-business interaction and review text features to predict ratings.
    """
    def __init__(self, num_users, num_businesses, vocab_size, embedding_dim,
                 text_encoder_hidden_dim, user_item_mlp_dims, final_mlp_dims, dropout_rate):
        super(AFRAMModel, self).__init__()
        
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)
        
        self.review_encoder = TextEncoderWithAttention(vocab_size, embedding_dim, text_encoder_hidden_dim, dropout_rate)

        user_item_mlp_input_dim = embedding_dim * 2
        user_item_layers = []
        for dim in user_item_mlp_dims:
            user_item_layers.append(nn.Linear(user_item_mlp_input_dim, dim))
            user_item_layers.append(nn.ReLU())
            user_item_mlp_input_dim = dim
        self.user_item_mlp = nn.Sequential(*user_item_layers)
        self.user_item_mlp_output_dim = user_item_mlp_dims[-1] if user_item_mlp_dims else embedding_dim * 2

        final_mlp_input_dim = self.user_item_mlp_output_dim + \
                              text_encoder_hidden_dim * 2
        
        final_layers = []
        for dim in final_mlp_dims:
            final_layers.append(nn.Linear(final_mlp_input_dim, dim))
            final_layers.append(nn.ReLU())
            final_mlp_input_dim = dim
        final_layers.append(nn.Linear(final_mlp_input_dim, 1))
        self.prediction_mlp = nn.Sequential(*final_layers)

    def forward(self, user_ids, business_ids, reviews):
        user_vec = self.user_embedding(user_ids)
        business_vec = self.business_embedding(business_ids)
        
        user_item_combined = torch.cat((user_vec, business_vec), dim=1)
        user_item_features = self.user_item_mlp(user_item_combined)

        review_features = self.review_encoder(reviews)
        
        combined_features = torch.cat((user_item_features, review_features), dim=1)
        
        predicted_rating = self.prediction_mlp(combined_features)
        return predicted_rating.squeeze()

# --- 하이퍼파라미터 탐색 공간 정의 ---
param_grid = {
    'embedding_dim': [32, 64, 128],
    'text_encoder_hidden_dim': [64, 128, 256],
    'learning_rate': [0.0005, 0.001, 0.002],
    'batch_size': [128, 256, 512],
    'user_item_mlp_dims': [[64, 32], [128, 64], [256, 128]],
    'final_mlp_dims': [[32, 16], [64, 32], [128, 64]],
    'dropout_rate': [0.1, 0.2, 0.3]
}

num_trials = 10 # Number of random combinations to try
best_params = None
best_rmse = float('inf')
results_log = []

# Fixed random state for data splitting for consistency across hyperparameter trials
# You typically want to use the same data split for all trials when tuning.
DATA_SPLIT_RANDOM_STATE = 42

print(f"\n--- Starting Hyperparameter Search with {num_trials} trials ---")
print(f"Data Split Random State: {DATA_SPLIT_RANDOM_STATE}")

# Data splitting (7:1:2 ratio) - fixed for hyperparameter tuning
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=DATA_SPLIT_RANDOM_STATE)
train_df, val_df = train_test_split(train_val_df, test_size=1/8, random_state=DATA_SPLIT_RANDOM_STATE)

print(f"Fixed Data Split: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

# Create Dataset objects (once for all trials)
train_dataset = AFRAMDataset(train_df)
val_dataset = AFRAMDataset(val_df)
test_dataset = AFRAMDataset(test_df)

for trial_num in range(num_trials):
    print(f"\n--- Trial {trial_num + 1}/{num_trials} ---")

    # Randomly sample a set of hyperparameters for this trial
    current_params = {k: random.choice(v) for k, v in param_grid.items()}
    print(f"Current Parameters: {current_params}")

    # Create DataLoaders for the current trial (batch_size might change)
    train_loader = DataLoader(train_dataset, batch_size=current_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=current_params['batch_size'], shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=current_params['batch_size'], shuffle=False)

    # Unpack parameters
    embedding_dim = current_params['embedding_dim']
    text_encoder_hidden_dim = current_params['text_encoder_hidden_dim']
    learning_rate = current_params['learning_rate']
    batch_size = current_params['batch_size']
    user_item_mlp_dims = current_params['user_item_mlp_dims']
    final_mlp_dims = current_params['final_mlp_dims']
    dropout_rate = current_params['dropout_rate']

    epochs = 50 # Maximum number of epochs for each trial
    patience = 7 # Slightly reduced patience for faster trials
    min_delta = 0.0005

    # Create new model instance and move to GPU for each trial
    model = AFRAMModel(num_users, num_businesses, vocab_size, embedding_dim,
                       text_encoder_hidden_dim, user_item_mlp_dims, final_mlp_dims, dropout_rate).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    trial_best_val_rmse = float('inf')
    epochs_no_improve = 0
    trial_model_save_path = f'temp_best_model_trial_{trial_num+1}.pt' # Temporary model path for each trial

    # --- Training Loop (with Early Stopping) ---
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for user_ids, business_ids, reviews, stars in train_loader:
            user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
            optimizer.zero_grad()
            predictions = model(user_ids, business_ids, reviews)
            loss = criterion(predictions, stars)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        model.eval()
        total_val_loss = 0
        val_predictions = []
        val_true_ratings = []
        with torch.no_grad():
            for user_ids, business_ids, reviews, stars in val_loader:
                user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
                predictions = model(user_ids, business_ids, reviews)
                loss = criterion(predictions, stars)
                total_val_loss += loss.item()
                val_predictions.extend(predictions.tolist())
                val_true_ratings.extend(stars.tolist())

        current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

        if current_val_rmse < trial_best_val_rmse - min_delta:
            trial_best_val_rmse = current_val_rmse
            epochs_no_improve = 0
            torch.save(model.state_dict(), trial_model_save_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                # print(f"    Early stopping at epoch {epoch+1}.") # Can uncomment for more verbose output
                break
    
    # --- Evaluate the best model from this trial on the Test Set ---
    if os.path.exists(trial_model_save_path):
        model.load_state_dict(torch.load(trial_model_save_path))
    else:
        print(f"Warning: Model for trial {trial_num+1} not saved. Testing with last state.")

    model.eval()
    test_predictions = []
    true_ratings = []
    with torch.no_grad():
        for user_ids, business_ids, reviews, stars in test_loader:
            user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
            predictions = model(user_ids, business_ids, reviews)
            test_predictions.extend(predictions.tolist())
            true_ratings.extend(stars.tolist())

    mse = mean_squared_error(true_ratings, test_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_ratings, test_predictions)
    mape = mean_absolute_percentage_error(true_ratings, test_predictions)

    print(f"  Trial {trial_num + 1} Test RMSE: {rmse:.4f}")

    # Log results for this trial
    trial_results = {
        'trial_num': trial_num + 1,
        'parameters': current_params,
        'test_mse': mse,
        'test_rmse': rmse,
        'test_mae': mae,
        'test_mape': mape
    }
    results_log.append(trial_results)

    # Check if this trial yielded the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = current_params
        # Optionally save the best model found during hyperparameter search
        torch.save(model.state_dict(), 'best_overall_afram_model.pt')
        print(f"  --> New best RMSE found: {best_rmse:.4f} with params: {best_params}")

    # Clean up temporary model file
    if os.path.exists(trial_model_save_path):
        os.remove(trial_model_save_path)

# --- Final Output ---
print(f"\n--- Hyperparameter Search Completed ---")
print(f"Best RMSE found: {best_rmse:.4f}")
print(f"Best Parameters: {best_params}")

# Optionally save the full results log to a JSON file
with open('hyperparameter_search_results.json', 'w') as f:
    json.dump(results_log, f, indent=4)
print(f"Full results logged to 'hyperparameter_search_results.json'")
print(f"Best model weights saved to 'best_overall_afram_model.pt'")

Using device: cuda
Vocabulary size: 51962

--- Starting Hyperparameter Search with 10 trials ---
Data Split Random State: 42
Fixed Data Split: Train=313456, Val=44780, Test=89560

--- Trial 1/10 ---
Current Parameters: {'embedding_dim': 64, 'text_encoder_hidden_dim': 256, 'learning_rate': 0.002, 'batch_size': 512, 'user_item_mlp_dims': [128, 64], 'final_mlp_dims': [32, 16], 'dropout_rate': 0.3}
  Trial 1 Test RMSE: 0.9531
  --> New best RMSE found: 0.9531 with params: {'embedding_dim': 64, 'text_encoder_hidden_dim': 256, 'learning_rate': 0.002, 'batch_size': 512, 'user_item_mlp_dims': [128, 64], 'final_mlp_dims': [32, 16], 'dropout_rate': 0.3}

--- Trial 2/10 ---
Current Parameters: {'embedding_dim': 128, 'text_encoder_hidden_dim': 256, 'learning_rate': 0.001, 'batch_size': 256, 'user_item_mlp_dims': [128, 64], 'final_mlp_dims': [64, 32], 'dropout_rate': 0.1}
  Trial 2 Test RMSE: 0.9001
  --> New best RMSE found: 0.9001 with params: {'embedding_dim': 128, 'text_encoder_hidden_dim': 256

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import re
from collections import Counter
import random

# --- 유틸리티 함수 ---
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculates MAPE, preventing division by zero.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_true = y_true != 0
    if np.sum(non_zero_true) == 0:
        return 0.0
    return np.mean(np.abs((y_true[non_zero_true] - y_pred[non_zero_true]) / y_true[non_zero_true])) * 100

# --- 장치 설정 (GPU 사용 가능 시) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 데이터 전처리 및 어휘 구축 ---
def preprocess_text(text):
    """
    Converts text to lowercase, keeps only alphabets, numbers, and spaces, then tokenizes.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

class Vocabulary:
    """
    Builds a vocabulary from text data and converts words to integer IDs.
    """
    def __init__(self, min_freq):
        self.stoi = {"<PAD>": 0, "<UNK>": 1} # string_to_int: Defines padding and unknown tokens
        self.itos = {0: "<PAD>", 1: "<UNK>"} # int_to_string
        self.freq = Counter()
        self.min_freq = min_freq
    
    def build_vocabulary(self, text_list):
        """
        Builds the vocabulary based on the given text list.
        Words appearing less than min_freq times are treated as <UNK> tokens.
        """
        for text in text_list:
            self.freq.update(text)
        
        idx = 2 # Start index after <PAD> and <UNK>
        for word, count in self.freq.items():
            if count >= self.min_freq:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
    
    def numericalize(self, text):
        """
        Converts text (list of words) to a sequence of integer IDs.
        """
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in text]

# --- 파일 로드 ---
# Changed to read review.json
try:
    df = pd.read_json('review.json', lines=True) # Use lines=True for JSONL format
except ValueError:
    print("Trying to read JSON without lines=True (assuming a single JSON object or array of objects).")
    df = pd.read_json('review.json') # Fallback for standard JSON

# --- 필요한 컬럼 추출 및 인코딩 ---
df_processed = df[['user_id', 'business_id', 'stars', 'text']].copy()

user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# --- 텍스트 전처리 및 어휘 구축 실행 ---
all_texts = df_processed['text'].apply(preprocess_text).tolist()
min_word_freq = 5 # Set minimum word frequency (adjustable)
vocab = Vocabulary(min_word_freq)
vocab.build_vocabulary(all_texts)
vocab_size = len(vocab.stoi)
print(f"Vocabulary size: {vocab_size}")

# Convert review text to integer ID sequences and apply padding/truncation
MAX_REVIEW_LEN = 100 # Maximum length of review text (adjustable)
df_processed.loc[:, 'numericalized_text'] = df_processed['text'].apply(vocab.numericalize)
df_processed['numericalized_text'] = df_processed['numericalized_text'].apply(
    lambda x: x[:MAX_REVIEW_LEN] if len(x) > MAX_REVIEW_LEN else x + [vocab.stoi["<PAD>"]] * (MAX_REVIEW_LEN - len(x))
)

# --- PyTorch Dataset 및 DataLoader 정의 ---
class AFRAMDataset(Dataset):
    """
    PyTorch Dataset class for AFRAM model training.
    Returns user ID, business ID, numericalized review text, and star rating.
    """
    def __init__(self, df):
        self.user_ids = torch.tensor(df['user_encoded'].values, dtype=torch.long)
        self.business_ids = torch.tensor(df['business_encoded'].values, dtype=torch.long)
        self.reviews = torch.tensor(np.array(df['numericalized_text'].tolist()), dtype=torch.long)
        self.stars = torch.tensor(df['stars'].values, dtype=torch.float)

    def __len__(self):
        return len(self.stars)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.business_ids[idx], self.reviews[idx], self.stars[idx]

# --- AFRAM 모델 아키텍처 정의 ---
class TextEncoderWithAttention(nn.Module):
    """
    Extracts features from review text using CNN, LSTM, and attention mechanism.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate):
        super(TextEncoderWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # Convolutional Layer (CNN)
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=hidden_dim, kernel_size=3, padding=1)
        # Bidirectional LSTM Layer
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        
        # Attention Layer (Bahdanau-style Additive Attention)
        self.attn_proj = nn.Linear(hidden_dim * 2, hidden_dim * 2) # Matches Bi-LSTM output dimension
        self.v = nn.Parameter(torch.rand(hidden_dim * 2, 1)) # Learnable attention weight vector
        
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text_seq):
        # text_seq: (batch_size, seq_len)
        embedded = self.embedding(text_seq) # (batch_size, seq_len, embedding_dim)
        embedded = embedded.permute(0, 2, 1) # Change dimension for Conv1d (batch_size, embedding_dim, seq_len)
        
        conv_out = torch.relu(self.conv(embedded)) # (batch_size, hidden_dim, seq_len)
        conv_out = conv_out.permute(0, 2, 1) # Change dimension for LSTM (batch_size, seq_len, hidden_dim)

        lstm_out, _ = self.lstm(self.dropout(conv_out)) # (batch_size, seq_len, hidden_dim * 2) (Bi-LSTM)
        
        # Apply attention mechanism
        attn_weights = torch.tanh(self.attn_proj(lstm_out)) # (batch_size, seq_len, hidden_dim * 2)
        v_expanded = self.v.unsqueeze(0).expand(attn_weights.shape[0], -1, -1) # Expand v to match batch size
        
        scores = torch.bmm(attn_weights, v_expanded) # Calculate attention scores (batch_size, seq_len, 1)
        attention_weights = torch.softmax(scores, dim=1) # Normalize weights with softmax (sum=1)
        
        # Calculate context vector (weighted sum): weighted average of LSTM output applying attention weights
        context_vector = torch.sum(lstm_out * attention_weights, dim=1) # (batch_size, hidden_dim * 2)
        
        return context_vector # This vector represents the "aspect features" of the review.

class AFRAMModel(nn.Module):
    """
    Implements the full model structure from the AFRAM paper.
    Combines user-business interaction and review text features to predict ratings.
    """
    def __init__(self, num_users, num_businesses, vocab_size, embedding_dim,
                 text_encoder_hidden_dim, user_item_mlp_dims, final_mlp_dims, dropout_rate):
        super(AFRAMModel, self).__init__()
        
        # User and business embedding layers
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.business_embedding = nn.Embedding(num_businesses, embedding_dim)
        
        # Module to encode review text (including attention)
        self.review_encoder = TextEncoderWithAttention(vocab_size, embedding_dim, text_encoder_hidden_dim, dropout_rate)

        # User-business interaction MLP (Customer-Restaurant Interaction Module in the paper)
        user_item_mlp_input_dim = embedding_dim * 2
        user_item_layers = []
        for dim in user_item_mlp_dims:
            user_item_layers.append(nn.Linear(user_item_mlp_input_dim, dim))
            user_item_layers.append(nn.ReLU())
            user_item_mlp_input_dim = dim
        self.user_item_mlp = nn.Sequential(*user_item_layers)
        self.user_item_mlp_output_dim = user_item_mlp_dims[-1] if user_item_mlp_dims else embedding_dim * 2

        # Final rating prediction MLP (Rating Prediction Module in the paper)
        final_mlp_input_dim = self.user_item_mlp_output_dim + \
                              text_encoder_hidden_dim * 2 # Output dimension of review_encoder (Bi-LSTM, so hidden_dim * 2)
        
        final_layers = []
        for dim in final_mlp_dims:
            final_layers.append(nn.Linear(final_mlp_input_dim, dim))
            final_layers.append(nn.ReLU())
            final_mlp_input_dim = dim
        final_layers.append(nn.Linear(final_mlp_input_dim, 1)) # Final output is rating (1 dimension)
        self.prediction_mlp = nn.Sequential(*final_layers)

    def forward(self, user_ids, business_ids, reviews):
        # Get user and business embedding vectors
        user_vec = self.user_embedding(user_ids)
        business_vec = self.business_embedding(business_ids)
        
        # Combine user-business embeddings and pass through MLP to generate interaction features
        user_item_combined = torch.cat((user_vec, business_vec), dim=1)
        user_item_features = self.user_item_mlp(user_item_combined)

        # Pass review text through review encoder to extract text features (aspect features)
        review_features = self.review_encoder(reviews)
        
        # Combine interaction features and review text features
        combined_features = torch.cat((user_item_features, review_features), dim=1)
        
        # Pass through final rating prediction MLP to return result
        predicted_rating = self.prediction_mlp(combined_features)
        return predicted_rating.squeeze() # Reduce dimension to return 1D rating

# --- 모델 학습 및 평가 (다중 반복 및 평균) ---
# Hyperparameters to use (can be tuned)
params = {
    'embedding_dim': 64,
    'text_encoder_hidden_dim': 256,
    'learning_rate': 0.0005,
    'batch_size': 256,
    'user_item_mlp_dims': [64, 32],
    'final_mlp_dims': [32, 16],
    'dropout_rate': 0.3
}

# Lists to store results from each run
all_mse = []
all_rmse = []
all_mae = []
all_mape = []

num_runs = 5 # Number of repetitions

print(f"\n--- Starting {num_runs} runs of training and evaluation ---")
print(f"Base Random State: 42")
print(f"Parameters: {params}")

for i in range(num_runs):
    current_random_state = 42 + i
    print(f"\n--- Run {i+1}/{num_runs} (Random State: {current_random_state}) ---")

    # Data splitting (7:1:2 ratio) - split with a different random_state each iteration
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=current_random_state)
    train_df, val_df = train_test_split(train_val_df, test_size=1/8, random_state=current_random_state)

    print(f"Run {i+1} Data Split: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

    # Create Dataset objects
    train_dataset = AFRAMDataset(train_df)
    val_dataset = AFRAMDataset(val_df)
    test_dataset = AFRAMDataset(test_df)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

    # Unpack parameters
    embedding_dim = params['embedding_dim']
    text_encoder_hidden_dim = params['text_encoder_hidden_dim']
    learning_rate = params['learning_rate']
    batch_size = params['batch_size']
    user_item_mlp_dims = params['user_item_mlp_dims']
    final_mlp_dims = params['final_mlp_dims']
    dropout_rate = params['dropout_rate']

    epochs = 50 # Maximum number of epochs
    patience = 10 # Number of epochs to wait for validation performance improvement for early stopping
    min_delta = 0.0005 # Minimum change to be considered an improvement

    best_val_rmse = float('inf')
    epochs_no_improve = 0
    model_save_path = f'best_afram_model_run_{i+1}.pt' # Unique model save path for each run

    # Create new model instance and move to GPU (new instance for each run)
    model = AFRAMModel(num_users, num_businesses, vocab_size, embedding_dim,
                       text_encoder_hidden_dim, user_item_mlp_dims, final_mlp_dims, dropout_rate).to(device)

    criterion = nn.MSELoss() # Loss function: MSE
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Optimizer: Adam

    # --- Training Loop (with Early Stopping) ---
    for epoch in range(epochs):
        model.train() # Set model to training mode
        total_train_loss = 0
        for user_ids, business_ids, reviews, stars in train_loader:
            # Move data to GPU
            user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
            
            optimizer.zero_grad() # Initialize optimizer gradients
            predictions = model(user_ids, business_ids, reviews) # Perform prediction
            loss = criterion(predictions, stars) # Calculate loss
            loss.backward() # Backpropagation
            optimizer.step() # Update parameters
            total_train_loss += loss.item()

        model.eval() # Set model to evaluation mode
        total_val_loss = 0
        val_predictions = []
        val_true_ratings = []
        with torch.no_grad(): # Disable gradient calculation (save memory, speed up)
            for user_ids, business_ids, reviews, stars in val_loader:
                # Move data to GPU
                user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
                
                predictions = model(user_ids, business_ids, reviews)
                loss = criterion(predictions, stars)
                total_val_loss += loss.item()
                val_predictions.extend(predictions.tolist())
                val_true_ratings.extend(stars.tolist())

        current_val_rmse = np.sqrt(mean_squared_error(val_true_ratings, val_predictions))

        # Early stopping logic
        if current_val_rmse < best_val_rmse - min_delta:
            best_val_rmse = current_val_rmse
            epochs_no_improve = 0
            torch.save(model.state_dict(), model_save_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"  Early stopping! No improvement in RMSE for {patience} epochs.")
                break

    # --- Final Model Testing ---
    if os.path.exists(model_save_path):
        model.load_state_dict(torch.load(model_save_path))
        print(f"Loaded best model weights from {model_save_path}")
    else:
        print(f"Could not find best model weights at '{model_save_path}'. Testing with current model state.")

    model.eval()
    test_predictions = []
    true_ratings = []

    with torch.no_grad():
        for user_ids, business_ids, reviews, stars in test_loader:
            user_ids, business_ids, reviews, stars = user_ids.to(device), business_ids.to(device), reviews.to(device), stars.to(device)
            predictions = model(user_ids, business_ids, reviews)
            test_predictions.extend(predictions.tolist())
            true_ratings.extend(stars.tolist())

    mse = mean_squared_error(true_ratings, test_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true_ratings, test_predictions)
    mape = mean_absolute_percentage_error(true_ratings, test_predictions)

    print(f"--- Run {i+1} Performance on Test Set ---")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

    all_mse.append(mse)
    all_rmse.append(rmse)
    all_mae.append(mae)
    all_mape.append(mape)

# --- Average Performance Output ---
print(f"\n--- Average Performance over {num_runs} Runs ---")
print(f"Average MSE: {np.mean(all_mse):.4f} +/- {np.std(all_mse):.4f}")
print(f"Average RMSE: {np.mean(all_rmse):.4f} +/- {np.std(all_rmse):.4f}")
print(f"Average MAE: {np.mean(all_mae):.4f} +/- {np.std(all_mae):.4f}")
print(f"Average MAPE: {np.mean(all_mape):.2f}% +/- {np.std(all_mape):.2f}%")

Using device: cuda
Vocabulary size: 51962

--- Starting 5 runs of training and evaluation ---
Base Random State: 42
Parameters: {'embedding_dim': 64, 'text_encoder_hidden_dim': 256, 'learning_rate': 0.0005, 'batch_size': 256, 'user_item_mlp_dims': [64, 32], 'final_mlp_dims': [32, 16], 'dropout_rate': 0.3}

--- Run 1/5 (Random State: 42) ---
Run 1 Data Split: Train=313456, Val=44780, Test=89560
  Early stopping! No improvement in RMSE for 10 epochs.
Loaded best model weights from best_afram_model_run_1.pt
--- Run 1 Performance on Test Set ---
Mean Squared Error (MSE): 0.8070
Root Mean Squared Error (RMSE): 0.8984
Mean Absolute Error (MAE): 0.6806
Mean Absolute Percentage Error (MAPE): 25.99%

--- Run 2/5 (Random State: 43) ---
Run 2 Data Split: Train=313456, Val=44780, Test=89560
  Early stopping! No improvement in RMSE for 10 epochs.
Loaded best model weights from best_afram_model_run_2.pt
--- Run 2 Performance on Test Set ---
Mean Squared Error (MSE): 0.8661
Root Mean Squared Error (R