In [3]:
import pandas as pd

df = pd.read_csv('interpark_reviews.csv')
print(df.columns)

Index(['url', 'title', 'review', 'view', 'likes', 'stars', 'blank', 'userid',
       'date', 'star_rating'],
      dtype='object')


In [7]:
r_cols = ['title','userid','star_rating']
DF = df[r_cols].copy(deep=True)

# "NONAME"이 포함된 행 제거
DF = DF[DF["title"] != "NO NAME"]

                                 title      userid  star_rating
0         2024 Someday Christmas in 여수  tmddu***..          5.0
1                2024 성시경 연말 콘서트 〈성시경〉  blonc***..          5.0
2      2024 영탁 단독 콘서트 “TAK SHOW3” - 부산  sunny***..          5.0
3                       벤슨 분 첫 단독 내한공연      zx5***          5.0
4  2024-25 이무진 전국투어 콘서트 ［별책부록］ - 서울 앵콜     orol***          5.0
(43919, 3)


In [11]:
# 중복 제거
DF = DF.drop_duplicates()
print(DF.shape)

(36181, 3)


In [12]:
# user encoding
user_dict = {}
for user in set(DF["userid"]):
    user_dict[user] = len(user_dict)
n_users = len(user_dict)
print(n_users)

22257


In [15]:
# Item encoding
from sklearn.utils import shuffle
item_dict = {}
start_point = n_users
for item in set(DF["title"]):
    item_dict[item] = start_point + len(item_dict)
n_items = len(item_dict)
start_point += n_items
num_x = start_point
DF = shuffle(DF,random_state=1)

In [16]:
# generate X data
import numpy as np
data = [] # 변수 x의 값을 [인덱스,값]의 형태로 저장
y = [] # 평점 데이터
w0 = np.mean(DF['star_rating']) # 전체 편향값

for i in range(len(DF)):
    case = DF.iloc[i]
    x_index = []
    x_value = []
    x_index.append(user_dict[case["userid"]]) # user encoding
    x_value.append(1)
    x_index.append(item_dict[case["title"]])
    x_value.append(1)
    data.append([x_index, x_value])
    y.append(case["star_rating"]-w0)
    if (i%5000)==0:
        print('Encoding',i,'cases...')

Encoding 0 cases...
Encoding 5000 cases...
Encoding 10000 cases...
Encoding 15000 cases...
Encoding 20000 cases...
Encoding 25000 cases...
Encoding 30000 cases...
Encoding 35000 cases...


In [17]:
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# FM 구현
class FM() :
    def __init__(self,N,K,data,y,alpha,beta,train_ratio = 0.75,
                 iterations=100,tolerance=0.005,l2_reg=True,verbose=True):
        self.K=K # latent feature의 수
        self.N=N # 변수 x의 수
        self.n_case = len(data)
        self.alpha = alpha # 학습률
        self.beta = beta # 정규화 계수
        self.iterations = iterations # 반복횟수
        self.tolerance = tolerance # 반복을 중단하는 RMSE의 기준인 tolerance
        self.l2_reg = l2_reg # 정규화를 할지 여부를 나타내는 값
        self.verbose = verbose # 학습 상황을 표시할지 나타내는 값
        
        # 변수의 편향을 나타내는 w벡터 초기화
        self.w = np.random.normal(scale=1./self.N,size = (self.N))
        # 잠재요인 행렬 v 초기화
        self.v = np.random.normal(scale=1./self.K,size = (self.N,self.K))
        
        # Train/Test 분리
        cutoff = int(train_ratio*len(data))
        self.train_x = data[:cutoff]
        self.train_y = y[:cutoff]
        self.test_x = data[cutoff:]
        self.test_y = y[cutoff:]
        
    # Training 하면서 RMSE 계산
    def test(self):
        # SGD를 iterations 숫자만큼 진행
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            # SGD & Train RMSE 계산
            rmse1 = self.sgd(self.train_x,self.train_y)
            # Test RMSE 계산
            rmse2 = self.test_rmse(self.test_x,self.test_y)
            training_process.append([i,rmse1,rmse2])
            
            if self.verbose:
                if(i+1)%10==0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1,rmse1,rmse2))
            
            if best_RMSE > rmse2:
                best_RMSE = rmse2
                best_iteration = i
            # RMSE가 정해진 tolerance보다 더 악화되었으면 학습을 중단
            elif(rmse2-best_RMSE) > self.tolerance: break
        
        print(best_iteration,best_RMSE)
        return training_process
    
            
    # w,v 업데이트를 위한 Stochastic gradient descent
    def sgd(self,x_data,y_data):
        y_pred = []
        for data,y in zip(x_data,y_data):
            x_idx = data[0] # x의 인덱스
            x_0 = np.array(data[1]) # 해당 x의 값
            x_1 = x_0.reshape(-1,1) # x의 값을 2차원으로 변형 (2차원인 v행렬과 연산을 위해서)
            
            # 편향값 계산
            bias_score = np.sum(self.w[x_idx]*x_0)
            
            # score 계산
            vx = self.v[x_idx] * (x_1) # v matrix * x
            sum_vx = np.sum(vx,axis = 0) # sigma(vx)
            sum_vx_2 = np.sum(vx*vx,axis = 0) # (v matrix * x)의 제곱
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2) # FM 변형식
            
            # 예측값 계산
            y_hat = bias_score + latent_score
            y_pred.append(y_hat)
            error = y - y_hat
            
            # w,v 업데이트
            if self.l2_reg: # 정규화하는 경우의 업데이트
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx*x_1) - self.beta * self.v[x_idx])
            else: # 정규화하지 않는 경우 (update rule)
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx*x_1))
        return RMSE(y_data,y_pred)
    
    def test_rmse(self,x_data,y_data):
        y_pred = []
        for data,y in zip(x_data,y_data):
            y_hat = self.predict(data[0],data[1])
            y_pred.append(y_hat)
        return RMSE(y_data,y_pred)
        
    # 데이터 중 하나의 행에 대한 예측값을 계산하는 함수
    # 위의 sgd() 함수에서 계산하는것과 동일
    def predict(self,idx,x):
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1,1)
        
        # 편향값 계산
        bias_score = np.sum(self.w[idx]*x_0)
        
        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx,axis = 0)
        sum_vx_2 = np.sum(vx*vx,axis = 0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)
        
        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

K = 350
fm1 = FM(num_x,K,data,y,alpha=0.0014,beta=0.075,train_ratio=0.75,iterations=200,tolerance=0.005,l2_reg=True,verbose=True)
result = fm1.test()
        

Iteration: 10 ; Train RMSE = 0.563515 ; Test RMSE = 0.594920
Iteration: 20 ; Train RMSE = 0.540284 ; Test RMSE = 0.581873
Iteration: 30 ; Train RMSE = 0.526311 ; Test RMSE = 0.576901
Iteration: 40 ; Train RMSE = 0.515714 ; Test RMSE = 0.574688
Iteration: 50 ; Train RMSE = 0.506733 ; Test RMSE = 0.573660
Iteration: 60 ; Train RMSE = 0.498690 ; Test RMSE = 0.573228
Iteration: 70 ; Train RMSE = 0.491267 ; Test RMSE = 0.573133
Iteration: 80 ; Train RMSE = 0.484287 ; Test RMSE = 0.573243
Iteration: 90 ; Train RMSE = 0.477634 ; Test RMSE = 0.573483
Iteration: 100 ; Train RMSE = 0.471218 ; Test RMSE = 0.573811
Iteration: 110 ; Train RMSE = 0.464960 ; Test RMSE = 0.574198
Iteration: 120 ; Train RMSE = 0.458781 ; Test RMSE = 0.574624
Iteration: 130 ; Train RMSE = 0.452594 ; Test RMSE = 0.575076
Iteration: 140 ; Train RMSE = 0.446300 ; Test RMSE = 0.575545
Iteration: 150 ; Train RMSE = 0.439794 ; Test RMSE = 0.576022
Iteration: 160 ; Train RMSE = 0.432972 ; Test RMSE = 0.576500
Iteration: 170 ; 

## implicit 데이터를 활용. 
## wide & deep 모델 + 행렬요인화

In [20]:
!pip install tensorflow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [26]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten, Dense, Concatenate, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax

# 유저 및 아이템 임베딩 크기 설정
embedding_dim = 5 # 작은 데이터셋 (수천~수만 개의 샘플)

# 유저 입력 (성별, 나이, 선호 장르, 선호 분위기, 선호 스타일, 선호 타입)
gender_input = Input(shape=(1,))
age_input = Input(shape=(1,))  # 연속형 변수
pref_genre_input = Input(shape=(1,))
pref_mood_input = Input(shape=(1,))
pref_style_input = Input(shape=(1,))
pref_type_input = Input(shape=(1,))

gender_embedding = Embedding(input_dim=num_genders, output_dim=2)(gender_input)
pref_genre_embedding = Embedding(input_dim=num_genres, output_dim=8)(pref_genre_input)
pref_mood_embedding = Embedding(input_dim=num_moods, output_dim=4)(pref_mood_input)
pref_style_embedding = Embedding(input_dim=num_styles, output_dim=4)(pref_style_input)
pref_type_embedding = Embedding(input_dim=num_types, output_dim=4)(pref_type_input)

# 아이템 입력 (장르, 분위기, 스타일, 타입, 시간, 장소)
item_genre_input = Input(shape=(1,))
item_mood_input = Input(shape=(1,))
item_style_input = Input(shape=(1,))
item_type_input = Input(shape=(1,))
time_input = Input(shape=(1,))  # 연속형 변수
place_input = Input(shape=(1,))

item_genre_embedding = Embedding(input_dim=num_genres, output_dim=8)(item_genre_input)
item_mood_embedding = Embedding(input_dim=num_moods, output_dim=4)(item_mood_input)
item_style_embedding = Embedding(input_dim=num_styles, output_dim=4)(item_style_input)
item_type_embedding = Embedding(input_dim=num_types, output_dim=4)(item_type_input)
place_embedding = Embedding(input_dim=num_places, output_dim=4)(place_input)

# 사용자 행동 입력 (조회수, 즐겨찾기 유무)
view_count_input = Input(shape=(1,))  # 연속형 변수
favorite_input = Input(shape=(1,))    # 이진 변수

# Flatten all embeddings
flattened_embeddings = [
    Flatten()(gender_embedding),
    Flatten()(pref_genre_embedding),
    Flatten()(pref_mood_embedding),
    Flatten()(pref_style_embedding),
    Flatten()(pref_type_embedding),
    Flatten()(item_genre_embedding),
    Flatten()(item_mood_embedding),
    Flatten()(item_style_embedding),
    Flatten()(item_type_embedding),
    Flatten()(place_embedding)
]

# 연속형 변수들은 그대로 사용
continuous_features = [
    age_input,
    time_input,
    view_count_input,
    favorite_input
]

In [27]:
# Wide 부분 (선형 회귀)
wide_features = Concatenate()([user_input, item_input, behavior_input])
wide_output = Dense(1, activation='sigmoid')(wide_features)

In [28]:
# Deep 부분
deep_features = Concatenate()([user_embedding, item_embedding, behavior_dense])
deep_output = Dense(128, activation='relu')(deep_features)
deep_output = Dense(64, activation='relu')(deep_output)
deep_output = Dense(1)(deep_output)

In [29]:
# Wide & Deep 결합
final_output = Concatenate()([wide_output, deep_output])
final_output = Dense(1, activation='sigmoid')(final_output)

model = Model(inputs=[user_input, item_input, behavior_input], outputs=final_output)
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy', # 이진 크로스엔트로피 -> 공연을 좋아할 확률을 나타내기 때문
    metrics=['AUC-ROC', 'Precision', 'Recall']
)
model.summary()