In [11]:
# Baseline (existing) model: ABSA sentiment vector only
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks


# MAPE that avoids division-by-zero errors
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Samples whose true value is 0 are excluded, because their relative error
    is undefined.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions holding the same number of elements
            as ``y_true``. Any shape is accepted (e.g. an ``(n, 1)`` column
            as well as a flat ``(n,)`` vector).

    Returns:
        The MAPE in percent, or ``0.0`` when every true value is 0.
    """
    # Flatten both inputs so that an (n, 1) prediction column paired with an
    # (n,) target does not broadcast into a (k, k) matrix after masking,
    # which would silently produce a wrong average.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    non_zero_true = y_true != 0
    if not non_zero_true.any():
        return 0.0  # every y_true is 0: MAPE is reported as 0 by convention here

    relative_error = (y_true[non_zero_true] - y_pred[non_zero_true]) / y_true[non_zero_true]
    return np.mean(np.abs(relative_error)) * 100


# ABSA sentiment vectors (15-dim per the original note; the actual width is read from the data below).
# lines=True: the file is parsed as line-delimited JSON, one record per line.
print("Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...")
df = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)
print(f"Sentiment data loaded. Total records: {len(df)}")

# Keep only the columns used below; .copy() so the encoded columns are added to an
# independent frame rather than to a slice of df.
df_processed = df[['user_id', 'business_id', 'stars', 'sentiment_vector']].copy()
print(f"Data processed. Total records: {len(df_processed)}")

# Integer-encode the raw IDs; the codes are what the model's 'user_id' / 'business_id' inputs receive.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

# The encoders are fitted on the full dataset, before any split, so every ID in
# validation/test already has a code.
df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

# Number of distinct IDs; passed later as the embedding table sizes.
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# Split the data 7:1:2 (train : validation : test).
# First hold out 20% for test, then take 1/8 of the remaining 80% for validation.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 # 10% of total data (1/8 of 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

# Report the split sizes (the Korean labels are runtime output: total / train / validation / test counts).
print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")


# Sentiment-vector input width: length of the first row's vector, with 15 as the fallback for an empty frame.
sentiment_vector_dim = len(df_processed['sentiment_vector'].iloc[0]) if not df_processed.empty else 15


# Turn each split's column of per-row vectors into one NumPy array for Keras.
# Assumes every row's vector has the same length, giving a 2-D array — TODO confirm.
train_sentiment_vectors = np.array(train_df['sentiment_vector'].tolist())
val_sentiment_vectors = np.array(val_df['sentiment_vector'].tolist())
test_sentiment_vectors = np.array(test_df['sentiment_vector'].tolist())


def build_asrec_model(num_users, num_businesses, embedding_dim,
                       user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                       sentiment_vector_dim):
    """Assemble the three-input rating-regression network (uncompiled).

    The network has two branches that are concatenated and fed to a final MLP:
    a user/business branch built from two embedding lookups, and an ABSA
    branch built from the sentiment vector.

    Args:
        num_users: Size of the user embedding table.
        num_businesses: Size of the business embedding table.
        embedding_dim: Width of both embedding tables.
        user_biz_mlp_dims: Hidden widths of the user/business branch MLP.
        aspect_mlp_dims: Hidden widths of the sentiment branch MLP.
        final_mlp_dims: Hidden widths of the MLP applied after the branches
            are concatenated.
        sentiment_vector_dim: Width of the 'sentiment_vector' input.

    Returns:
        A Keras model with inputs named 'user_id', 'business_id' and
        'sentiment_vector', and a single linear output named 'output_rating'.
    """

    def _relu_tower(tensor, widths):
        # One ReLU Dense layer per entry of `widths`, applied in order.
        for width in widths:
            tensor = layers.Dense(width, activation='relu')(tensor)
        return tensor

    # --- User/business interaction branch ---
    user_in = keras.Input(shape=(1,), name='user_id')
    biz_in = keras.Input(shape=(1,), name='business_id')

    user_lookup = layers.Embedding(num_users, embedding_dim, name='user_embedding')(user_in)
    user_flat = layers.Flatten()(user_lookup)

    biz_lookup = layers.Embedding(num_businesses, embedding_dim, name='business_embedding')(biz_in)
    biz_flat = layers.Flatten()(biz_lookup)

    pair_vec = layers.concatenate([user_flat, biz_flat], axis=1)
    pair_features = _relu_tower(pair_vec, user_biz_mlp_dims)

    # --- ABSA (sentiment) branch ---
    sentiment_in = keras.Input(shape=(sentiment_vector_dim,), name='sentiment_vector')
    sentiment_features = _relu_tower(sentiment_in, aspect_mlp_dims)

    # --- Final prediction head ---
    merged = layers.concatenate([pair_features, sentiment_features], axis=1)
    hidden = _relu_tower(merged, final_mlp_dims)
    rating = layers.Dense(1, activation='linear', name='output_rating')(hidden)

    return models.Model(inputs=[user_in, biz_in, sentiment_in], outputs=rating)

# Hyperparameters for the final run (hard-coded here; presumably chosen by an
# earlier search that is not part of this cell — TODO confirm).
best_params = {
    'aspect_mlp_hidden_dims': [64, 32],
    'batch_size': 128,
    'embedding_dim': 64,
    'final_mlp_hidden_dims': [32, 16],
    'learning_rate': 0.001,
    'user_biz_mlp_hidden_dims': [128, 64]
}

print("\n" + "="*50)
print(f"Best Parameters: {best_params}")
print("="*50)


# Unpack the dictionary into the names used by the build/compile/fit calls below.
final_embedding_dim = best_params['embedding_dim']
final_learning_rate = best_params['learning_rate']
final_batch_size = best_params['batch_size']
final_user_biz_mlp_dims = best_params['user_biz_mlp_hidden_dims']
final_aspect_mlp_dims = best_params['aspect_mlp_hidden_dims']
final_final_mlp_dims = best_params['final_mlp_hidden_dims']

# Final model
final_model = build_asrec_model(num_users, num_businesses, final_embedding_dim,
                                 final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims,
                                 sentiment_vector_dim)

# Train on MSE; the RMSE metric is named 'rmse', so its validation value is
# logged as 'val_rmse', which is the quantity both callbacks monitor.
final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=final_learning_rate),
                    loss='mse',
                    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])

# Checkpoint file for the best model of this run.
final_model_path = 'final_best_as_rec_model_sentiment_only.keras'

# Stop once val_rmse has gone 10 epochs without improving by at least 0.0005,
# and put the best weights seen back on the model.
early_stopping_callback = callbacks.EarlyStopping(
    monitor='val_rmse',
    patience=10,
    min_delta=0.0005,
    mode='min',
    restore_best_weights=True
)

# Write the model to final_model_path only on epochs where val_rmse improves.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=final_model_path,
    monitor='val_rmse',
    save_best_only=True,
    mode='min',
    verbose=1
)

print("\n--- Training Final Model with Best Parameters ---")
# The dict keys must match the Input layer names declared in build_asrec_model.
history = final_model.fit(
    {'user_id': train_df['user_encoded'],
     'business_id': train_df['business_encoded'],
     'sentiment_vector': train_sentiment_vectors},
    train_df['stars'],
    batch_size=final_batch_size,
    epochs=50, 
    validation_data=(
        {'user_id': val_df['user_encoded'],
         'business_id': val_df['business_encoded'],
         'sentiment_vector': val_sentiment_vectors},
        val_df['stars']
    ),
    callbacks=[early_stopping_callback, model_checkpoint_callback],
    verbose=1
)

print("\n--- Evaluating Final Model on Test Set ---")
# Prefer the checkpointed model on disk; otherwise evaluate the in-memory model.
# NOTE(review): a file left at this path by an earlier run would also pass this
# existence check — confirm that is acceptable.
if os.path.exists(final_model_path):
    final_model = keras.models.load_model(final_model_path)
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

# .flatten() makes the predictions 1-D so they line up element-wise with
# true_ratings (the model's output layer has a single unit).
test_predictions = final_model.predict(
    {'user_id': test_df['user_encoded'],
     'business_id': test_df['business_encoded'],
     'sentiment_vector': test_sentiment_vectors}
).flatten()

true_ratings = test_df['stars'].values

# Test-set metrics; RMSE is derived from the sklearn MSE, MAPE uses the local helper.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")


# Remove a 'temp_models' directory if one exists. Nothing in this cell creates
# it; presumably it is left over from a separate tuning step — TODO confirm.
if os.path.exists('temp_models'):
    import shutil
    shutil.rmtree('temp_models')
    print("\nCleaned up 'temp_models' directory.")


Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...
Sentiment data loaded. Total records: 447796
Data processed. Total records: 447796
전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)

Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters ---
Epoch 1/50
[1m2437/2449[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 1.1951 - mae: 0.7401 - rmse: 1.0054
Epoch 1: val_rmse improved from inf to 0.67721, saving model to final_best_as_rec_model_sentiment_only.keras
[1m2449/2449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 1.1920 - mae: 0.7392 - rmse: 1.0041 - val_loss: 0.4586 - val_mae: 0.5305 - val_rmse: 0.6772
Epoch 2/50
[1m2446/2449[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0

In [14]:
# Gemini variant: ABSA sentiment vector plus the pre-reduced Gemini embedding

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks


# MAPE that avoids division-by-zero errors
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Samples whose true value is 0 are excluded, because their relative error
    is undefined.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions holding the same number of elements
            as ``y_true``. Any shape is accepted (e.g. an ``(n, 1)`` column
            as well as a flat ``(n,)`` vector).

    Returns:
        The MAPE in percent, or ``0.0`` when every true value is 0.
    """
    # Flatten both inputs so that an (n, 1) prediction column paired with an
    # (n,) target does not broadcast into a (k, k) matrix after masking,
    # which would silently produce a wrong average.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    non_zero_true = y_true != 0
    if not non_zero_true.any():
        return 0.0  # every y_true is 0: MAPE is reported as 0 by convention here

    relative_error = (y_true[non_zero_true] - y_pred[non_zero_true]) / y_true[non_zero_true]
    return np.mean(np.abs(relative_error)) * 100



# ABSA sentiment vectors (15-dim per the original note; the actual width is read from the data below).
# lines=True: the file is parsed as line-delimited JSON, one record per line.
print("Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...")
df_sentiment = pd.read_json('review_business_5up_5aspect_3sentiment_vectorized_clean.json', lines=True)
print(f"Sentiment data loaded. Total records: {len(df_sentiment)}")

# Gemini embeddings, already reduced to 32 dimensions (per the original note).
print("Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...")
df_reduced_emb = pd.read_json('review_business_5up_with_reduced_embedding.jsonl', lines=True)
print(f"Reduced embedding data loaded. Total records: {len(df_reduced_emb)}")

# Merge the two sources on review_id, keeping only reviews present in both.
# NOTE(review): a review_id that is duplicated in either frame would multiply rows
# in an inner join; consider validate='one_to_one' if uniqueness is expected — confirm.
df_processed = pd.merge(
    df_sentiment[['review_id', 'user_id', 'business_id', 'stars', 'sentiment_vector']],
    df_reduced_emb[['review_id', 'reduced_embedding']],
    on='review_id',
    how='inner'
)
print(f"Data merged successfully. Total processed records: {len(df_processed)}")


# Integer-encode the raw IDs; the codes are what the model's 'user_id' / 'business_id' inputs receive.
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

# The encoders are fitted on the full merged dataset, before any split, so every ID in
# validation/test already has a code.
df_processed.loc[:, 'user_encoded'] = user_encoder.fit_transform(df_processed['user_id'])
df_processed.loc[:, 'business_encoded'] = business_encoder.fit_transform(df_processed['business_id'])

# Number of distinct IDs; passed later as the embedding table sizes.
num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

# Split the data 7:1:2 (train : validation : test).
# First hold out 20% for test, then take 1/8 of the remaining 80% for validation.
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 # 10% of the total data (1/8 of the remaining 80%)
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)

# Report the split sizes (the Korean labels are runtime output: total / train / validation / test counts).
print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

# Input widths, read from the first row of each column; the literals are fallbacks for an empty frame.
original_sentiment_dim = len(df_processed['sentiment_vector'].iloc[0]) if not df_processed.empty else 15 # original 15-dim sentiment vector
loaded_embedding_dim = len(df_processed['reduced_embedding'].iloc[0]) if not df_processed.empty else 32 # embedding already reduced to 32 dims

# Turn each split's columns of per-row vectors into NumPy arrays for Keras.
# Assumes every row's vector has the same length, giving 2-D arrays — TODO confirm.
train_sentiment_vectors = np.array(train_df['sentiment_vector'].tolist())
train_embeddings = np.array(train_df['reduced_embedding'].tolist())
val_sentiment_vectors = np.array(val_df['sentiment_vector'].tolist())
val_embeddings = np.array(val_df['reduced_embedding'].tolist())
test_sentiment_vectors = np.array(test_df['sentiment_vector'].tolist())
test_embeddings = np.array(test_df['reduced_embedding'].tolist())


def build_asrec_model(num_users, num_businesses, embedding_dim,
                       user_biz_mlp_dims, aspect_mlp_dims, final_mlp_dims,
                       original_sentiment_dim, loaded_embedding_dim):
    """Assemble the four-input rating-regression network (uncompiled).

    The network has two branches that are concatenated and fed to a final MLP:
    a user/business branch built from two embedding lookups, and an ABSA
    branch built from the sentiment vector concatenated with the pre-reduced
    review embedding.

    Args:
        num_users: Size of the user embedding table.
        num_businesses: Size of the business embedding table.
        embedding_dim: Width of both embedding tables.
        user_biz_mlp_dims: Hidden widths of the user/business branch MLP.
        aspect_mlp_dims: Hidden widths of the ABSA branch MLP.
        final_mlp_dims: Hidden widths of the MLP applied after the branches
            are concatenated.
        original_sentiment_dim: Width of the 'sentiment_vector' input.
        loaded_embedding_dim: Width of the 'reduced_embedding' input.

    Returns:
        A Keras model with inputs named 'user_id', 'business_id',
        'sentiment_vector' and 'reduced_embedding', and a single linear
        output named 'output_rating'.
    """

    def _relu_tower(tensor, widths):
        # One ReLU Dense layer per entry of `widths`, applied in order.
        for width in widths:
            tensor = layers.Dense(width, activation='relu')(tensor)
        return tensor

    # --- User/business interaction branch ---
    user_in = keras.Input(shape=(1,), name='user_id')
    biz_in = keras.Input(shape=(1,), name='business_id')

    user_lookup = layers.Embedding(num_users, embedding_dim, name='user_embedding')(user_in)
    user_flat = layers.Flatten()(user_lookup)

    biz_lookup = layers.Embedding(num_businesses, embedding_dim, name='business_embedding')(biz_in)
    biz_flat = layers.Flatten()(biz_lookup)

    pair_vec = layers.concatenate([user_flat, biz_flat], axis=1)
    pair_features = _relu_tower(pair_vec, user_biz_mlp_dims)

    # --- ABSA branch: sentiment vector joined with the reduced embedding ---
    sentiment_in = keras.Input(shape=(original_sentiment_dim,), name='sentiment_vector')
    reduced_in = keras.Input(shape=(loaded_embedding_dim,), name='reduced_embedding')

    review_vec = layers.concatenate([sentiment_in, reduced_in], axis=1)
    review_features = _relu_tower(review_vec, aspect_mlp_dims)

    # --- Final prediction head ---
    merged = layers.concatenate([pair_features, review_features], axis=1)
    hidden = _relu_tower(merged, final_mlp_dims)
    rating = layers.Dense(1, activation='linear', name='output_rating')(hidden)

    return models.Model(inputs=[user_in, biz_in, sentiment_in, reduced_in],
                        outputs=rating)

# Hyperparameters for the final run (hard-coded here; presumably chosen by an
# earlier search that is not part of this cell — TODO confirm).
best_params = {
    'aspect_mlp_hidden_dims': [64, 32],
    'batch_size': 128,
    'embedding_dim': 64,
    'final_mlp_hidden_dims': [32, 16],
    'learning_rate': 0.001,
    'user_biz_mlp_hidden_dims': [128, 64]
}

print("\n" + "="*50)
print(f"Best Parameters: {best_params}")
print("="*50)

# Unpack the dictionary into the names used by the build/compile/fit calls below.
final_embedding_dim = best_params['embedding_dim']
final_learning_rate = best_params['learning_rate']
final_batch_size = best_params['batch_size']
final_user_biz_mlp_dims = best_params['user_biz_mlp_hidden_dims']
final_aspect_mlp_dims = best_params['aspect_mlp_hidden_dims']
final_final_mlp_dims = best_params['final_mlp_hidden_dims']


# Final model: same hyperparameters, with the reduced embedding as an extra input.
final_model = build_asrec_model(num_users, num_businesses, final_embedding_dim,
                                 final_user_biz_mlp_dims, final_aspect_mlp_dims, final_final_mlp_dims,
                                 original_sentiment_dim, loaded_embedding_dim)

# Train on MSE; the RMSE metric is named 'rmse', so its validation value is
# logged as 'val_rmse', which is the quantity both callbacks monitor.
final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=final_learning_rate),
                    loss='mse',
                    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])

# Checkpoint file for the best model of this run.
final_model_path = 'final_best_as_rec_model_with_pre_reduced_embedding.keras'

# Stop once val_rmse has gone 10 epochs without improving by at least 0.0005,
# and put the best weights seen back on the model.
early_stopping_callback = callbacks.EarlyStopping(
    monitor='val_rmse',
    patience=10,
    min_delta=0.0005,
    mode='min',
    restore_best_weights=True
)

# Write the model to final_model_path only on epochs where val_rmse improves.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=final_model_path,
    monitor='val_rmse',
    save_best_only=True,
    mode='min',
    verbose=1
)

print("\n--- Training Final Model with Best Parameters ---")
# The dict keys must match the Input layer names declared in build_asrec_model.
history = final_model.fit(
    {'user_id': train_df['user_encoded'],
     'business_id': train_df['business_encoded'],
     'sentiment_vector': train_sentiment_vectors,
     'reduced_embedding': train_embeddings},
    train_df['stars'],
    batch_size=final_batch_size,
    epochs=50,
    validation_data=(
        {'user_id': val_df['user_encoded'],
         'business_id': val_df['business_encoded'],
         'sentiment_vector': val_sentiment_vectors,
         'reduced_embedding': val_embeddings},
        val_df['stars']
    ),
    callbacks=[early_stopping_callback, model_checkpoint_callback],
    verbose=1
)

print("\n--- Evaluating Final Model on Test Set ---")
# Prefer the checkpointed model on disk; otherwise evaluate the in-memory model.
# NOTE(review): a file left at this path by an earlier run would also pass this
# existence check — confirm that is acceptable.
if os.path.exists(final_model_path):
    final_model = keras.models.load_model(final_model_path)
    print(f"Loaded best model weights from {final_model_path}")
else:
    print(f"Could not find optimal final model weights at '{final_model_path}'. Testing with current model state.")

# .flatten() makes the predictions 1-D so they line up element-wise with
# true_ratings (the model's output layer has a single unit).
test_predictions = final_model.predict(
    {'user_id': test_df['user_encoded'],
     'business_id': test_df['business_encoded'],
     'sentiment_vector': test_sentiment_vectors,
     'reduced_embedding': test_embeddings}
).flatten()

true_ratings = test_df['stars'].values

# Test-set metrics; RMSE is derived from the sklearn MSE, MAPE uses the local helper.
mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)

print(f"\n--- Performance Evaluation (Final Model with Best Parameters) ---")
print(f"Selected Hyperparameters: {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# Remove a 'temp_models' directory if one exists. Nothing in this cell creates
# it; presumably it is left over from a separate tuning step — TODO confirm.
if os.path.exists('temp_models'):
    import shutil
    shutil.rmtree('temp_models')
    print("\nCleaned up 'temp_models' directory.")


Loading sentiment data from 'review_business_5up_5aspect_3sentiment_vectorized_clean.json'...
Sentiment data loaded. Total records: 447796
Loading reduced embedding data from 'review_business_5up_with_reduced_embedding.jsonl'...
Reduced embedding data loaded. Total records: 447796
Data merged successfully. Total processed records: 447796
전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)

Best Parameters: {'aspect_mlp_hidden_dims': [64, 32], 'batch_size': 128, 'embedding_dim': 64, 'final_mlp_hidden_dims': [32, 16], 'learning_rate': 0.001, 'user_biz_mlp_hidden_dims': [128, 64]}

--- Training Final Model with Best Parameters ---
Epoch 1/50
[1m2445/2449[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 1.5642 - mae: 0.8160 - rmse: 1.1362
Epoch 1: val_rmse improved from inf to 0.67386, saving model to final_best_as_rec_model_with_pre_reduced_embedding.keras
[1m2449/2449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 