라이브러리

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

파라미터

In [14]:
PARQUET_PATH = 'review_data_optimized.parquet'
MODEL_SAVE_PATH = 'final_best_hybrid_gemini_model.keras'

best_params = {
    'user_embedding_dim': 64,
    'business_embedding_dim': 64,
    'gemini_mlp_dims': [1536, 768, 384],
    'user_biz_mlp_dims': [128, 64],
    'final_mlp_dims': [256, 128, 64],
    'learning_rate': 0.001,
    'batch_size': 128
}

df = pd.read_parquet(PARQUET_PATH)
print(" 데이터 로드")

 데이터 로드


데이터 분할 및 전처리

In [15]:
df_processed = df[['user_id', 'business_id', 'stars', 'embedding']].copy()

# 데이터 7:1:2로 분할
train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
val_size_ratio = 1 / 8 
train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=42)


user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_df.loc[:, 'user_encoded'] = user_encoder.fit_transform(train_df['user_id'])
train_df.loc[:, 'business_encoded'] = business_encoder.fit_transform(train_df['business_id'])


user_mapping = {label: i for i, label in enumerate(user_encoder.classes_)}
business_mapping = {label: i for i, label in enumerate(business_encoder.classes_)}


val_df.loc[:, 'user_encoded'] = val_df['user_id'].map(user_mapping).fillna(-1).astype(int)
val_df.loc[:, 'business_encoded'] = val_df['business_id'].map(business_mapping).fillna(-1).astype(int)

test_df.loc[:, 'user_encoded'] = test_df['user_id'].map(user_mapping).fillna(-1).astype(int)
test_df.loc[:, 'business_encoded'] = test_df['business_id'].map(business_mapping).fillna(-1).astype(int)

num_users = len(user_encoder.classes_)
num_businesses = len(business_encoder.classes_)

print(f"전체 데이터 수: {len(df_processed)}")
print(f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)")
print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
print(f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)")

train_embeddings = np.array(train_df['embedding'].tolist())
val_embeddings = np.array(val_df['embedding'].tolist())
test_embeddings = np.array(test_df['embedding'].tolist())

gemini_embedding_dim = len(train_df['embedding'].iloc[0]) if not train_df.empty else 3072
print(f"제미니 임베딩 차원: {gemini_embedding_dim}")

전체 데이터 수: 447796
학습 데이터 수: 313456 (70.00%)
검증 데이터 수: 44780 (10.00%)
테스트 데이터 수: 89560 (20.00%)
제미니 임베딩 차원: 3072


모델 구축

In [16]:
def build_hybrid_gemini_model(num_users, num_businesses, user_embedding_dim, business_embedding_dim,
                             gemini_embedding_dim, user_biz_mlp_dims, gemini_mlp_dims, final_mlp_dims):
    
    # 사용자-비즈니스 상호작용 모듈
    user_input = keras.Input(shape=(1,), name='user_id')
    business_input = keras.Input(shape=(1,), name='business_id')
    user_embedding = layers.Embedding(num_users, user_embedding_dim, name='user_embedding')(user_input)
    user_vec = layers.Flatten()(user_embedding)
    business_embedding = layers.Embedding(num_businesses, business_embedding_dim, name='business_embedding')(business_input)
    business_vec = layers.Flatten()(business_embedding)
    combined_vec = layers.concatenate([user_vec, business_vec], axis=1)
    interaction_features = combined_vec
    for dim in user_biz_mlp_dims:
        interaction_features = layers.Dense(dim, activation='relu')(interaction_features)

    # 제미니 임베딩 모듈
    gemini_input = keras.Input(shape=(gemini_embedding_dim,), name='gemini_embedding')
    gemini_features = gemini_input
    for dim in gemini_mlp_dims:
        gemini_features = layers.Dense(dim, activation='relu')(gemini_features)
    
    # 최종 예측 모듈
    final_combined_features = layers.concatenate([interaction_features, gemini_features], axis=1)
    predicted_rating = final_combined_features
    for dim in final_mlp_dims:
        predicted_rating = layers.Dense(dim, activation='relu')(predicted_rating)
    predicted_rating = layers.Dense(1, activation='linear', name='output_rating')(predicted_rating)
    
    model = models.Model(inputs=[user_input, business_input, gemini_input], outputs=predicted_rating)
    return model

학습

In [17]:
print("\n" + "="*50)
print(f"파라미터 : {best_params}")
print("="*50)

# 최종 모델
final_model = build_hybrid_gemini_model(
    num_users, num_businesses,
    best_params['user_embedding_dim'], best_params['business_embedding_dim'],
    gemini_embedding_dim,
    best_params['user_biz_mlp_dims'], best_params['gemini_mlp_dims'],
    best_params['final_mlp_dims']
)

final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=best_params['learning_rate']),
                    loss='mse',
                    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])

early_stopping_callback = callbacks.EarlyStopping(
    monitor='val_rmse',
    patience=5,
    min_delta=0.0005,
    mode='min',
    restore_best_weights=True
)

model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=MODEL_SAVE_PATH,
    monitor='val_rmse',
    save_best_only=True,
    mode='min',
    verbose=1
)


history = final_model.fit(
    {'user_id': train_df['user_encoded'],
     'business_id': train_df['business_encoded'],
     'gemini_embedding': train_embeddings},
    train_df['stars'],
    batch_size=best_params['batch_size'],
    epochs=50,
    validation_data=(
        {'user_id': val_df['user_encoded'],
         'business_id': val_df['business_encoded'],
         'gemini_embedding': val_embeddings},
        val_df['stars']
    ),
    callbacks=[early_stopping_callback, model_checkpoint_callback],
    verbose=1
)


파라미터 : {'user_embedding_dim': 64, 'business_embedding_dim': 64, 'gemini_mlp_dims': [1536, 768, 384], 'user_biz_mlp_dims': [128, 64], 'final_mlp_dims': [256, 128, 64], 'learning_rate': 0.001, 'batch_size': 128}
Epoch 1/50
[1m2447/2449[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - loss: 0.5193 - mae: 0.4833 - rmse: 0.6662
Epoch 1: val_rmse improved from inf to 0.48354, saving model to final_best_hybrid_gemini_model.keras
[1m2449/2449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 19ms/step - loss: 0.5191 - mae: 0.4832 - rmse: 0.6661 - val_loss: 0.2338 - val_mae: 0.3726 - val_rmse: 0.4835
Epoch 2/50
[1m2447/2449[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - loss: 0.2135 - mae: 0.3565 - rmse: 0.4620
Epoch 2: val_rmse improved from 0.48354 to 0.47346, saving model to final_best_hybrid_gemini_model.keras
[1m2449/2449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - loss: 0.2135 - mae: 0.3565 - rmse: 0.4620 - val_loss: 0.2

모델 평가

In [19]:
if os.path.exists(MODEL_SAVE_PATH):
    final_model = keras.models.load_model(MODEL_SAVE_PATH)
    print(f" 최적 파라미터 : {MODEL_SAVE_PATH}")
else:
    print(f"최적 파라미터 찾을 수 없음")

test_predictions = final_model.predict(
    {'user_id': test_df['user_encoded'],
     'business_id': test_df['business_encoded'],
     'gemini_embedding': test_embeddings}
).flatten()

true_ratings = test_df['stars'].values

mse = mean_squared_error(true_ratings, test_predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_ratings, test_predictions)
mape = mean_absolute_percentage_error(true_ratings, test_predictions)


print(f"파라미터 : {best_params}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")

 최적 파라미터 : final_best_hybrid_gemini_model.keras
[1m2799/2799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
파라미터 : {'user_embedding_dim': 64, 'business_embedding_dim': 64, 'gemini_mlp_dims': [1536, 768, 384], 'user_biz_mlp_dims': [128, 64], 'final_mlp_dims': [256, 128, 64], 'learning_rate': 0.001, 'batch_size': 128}
Mean Squared Error (MSE): 0.2210
Root Mean Squared Error (RMSE): 0.4701
Mean Absolute Error (MAE): 0.3638
Mean Absolute Percentage Error (MAPE): 0.1226
