## 4. Matrix Factorization

In [5]:
import os
import numpy as np
import pandas as pd

In [6]:
# Load the tab-separated rating log (u.data) into a DataFrame.
base_src = '../data'
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep="\t", names=r_cols, encoding='latin-1')
# Keep only the three columns used below (timestamp is dropped) and cast them to int.
kept_cols = [col for col in r_cols if col != 'timestamp']
ratings = ratings[kept_cols].astype(int)

In [7]:
# Preview the first rows to check the user_id / movie_id / rating columns loaded as integers.
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [30]:
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Cells of the
    rating matrix equal to 0 are treated as "not rated" and are never used for
    training or for the RMSE.
    """

    def __init__(self, ratings, hyper_params):
        """Store the rating matrix and the hyper-parameters.

        Parameters
        ----------
        ratings : array-like of shape (num_users, num_items)
            Dense rating matrix; 0 marks a missing rating. It is copied.
        hyper_params : dict
            Required keys: 'K' (number of latent factors), 'alpha' (learning
            rate), 'beta' (regularization rate), 'iterations' (SGD epochs) and
            'verbose' (print the train RMSE every 10 epochs).
            Optional key: 'random_state' (int) to make initialisation and
            shuffling reproducible; when absent the global NumPy random state
            is used.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = hyper_params['K']
        self.alpha = hyper_params['alpha']
        self.beta = hyper_params['beta']
        self.iterations = hyper_params['iterations']
        self.verbose = hyper_params['verbose']
        self.random_state = hyper_params.get('random_state')

    def rmse(self):
        """Return the RMSE over all rated (non-zero) cells of ``self.R``.

        Side effects: ``self.prediction`` and ``self.errors`` are replaced by
        arrays holding the prediction / error of every rated cell, in the
        order returned by ``self.R.nonzero()``.
        """
        xs, ys = self.R.nonzero()
        # One matrix product instead of a Python-level loop over every rating;
        # the intermediate has the same shape as the dense matrix R.
        scores = (self.b
                  + self.b_u[:, np.newaxis]
                  + self.b_d[np.newaxis, :]
                  + self.P.dot(self.Q.T))
        self.prediction = scores[xs, ys]
        self.errors = self.R[xs, ys] - self.prediction
        return np.sqrt(np.mean(self.errors**2))

    def train(self):
        """Initialise the parameters, run SGD and return the training history.

        Returns
        -------
        list of (int, float)
            ``(epoch, train_rmse)`` for every epoch, epochs counted from 1.
        """
        # A private RandomState is used only when a seed was supplied, so the
        # default behaviour (global NumPy random state) is unchanged.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.P = rng.normal(scale = 1. / self.K, size = (self.num_users, self.K))
        self.Q = rng.normal(scale = 1. / self.K, size = (self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        # Global bias: mean of the observed ratings only.
        self.b = np.mean(self.R[self.R.nonzero()])

        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        training_process = []
        for i in range(self.iterations):
            rng.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            training_process.append((i+1, rmse))
            if self.verbose and (i+1) % 10 == 0:
                print("Iteration : % d ; train RMSE = %.4f"%(i+1, rmse))
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating for user index ``i`` and item index ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
            self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

            # Both factor gradients must be evaluated at the same point, so
            # keep the pre-update user vector for the item update.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])


# Dense user x movie matrix; pairs without a rating become 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# K latent factors, learning rate alpha, regularization beta, number of SGD epochs.
hyper_params = dict(K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)

mf = MF(R_temp, hyper_params)
train_process = mf.train()

Iteration :  10 ; train RMSE = 0.9585
Iteration :  20 ; train RMSE = 0.9373
Iteration :  30 ; train RMSE = 0.9280
Iteration :  40 ; train RMSE = 0.9224
Iteration :  50 ; train RMSE = 0.9181
Iteration :  60 ; train RMSE = 0.9141
Iteration :  70 ; train RMSE = 0.9092
Iteration :  80 ; train RMSE = 0.9024
Iteration :  90 ; train RMSE = 0.8929
Iteration :  100 ; train RMSE = 0.8806


### train, test set 분리

In [13]:
import os
import numpy as np
import pandas as pd

In [14]:
# Load the tab-separated rating log (u.data) into a DataFrame.
base_src = '../data'
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep="\t", names=r_cols, encoding='latin-1')
# Keep only the three columns used below (timestamp is dropped) and cast them to int.
kept_cols = [col for col in r_cols if col != 'timestamp']
ratings = ratings[kept_cols].astype(int)

In [22]:
# Split the ratings into train and test sets.
from sklearn.utils import shuffle
TRAIN_SIZE = 0.75
# Restore the original row order before shuffling: `ratings` is overwritten
# here, so without this a second run of the cell would permute the already
# shuffled frame and silently produce a different split. On a fresh frame
# sort_index() is a no-op, so the first-run result is unchanged.
ratings = shuffle(ratings.sort_index(), random_state=2023)
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]

In [23]:
# Show (rows, columns) of each part to confirm the TRAIN_SIZE = 0.75 split.
print(ratings_train.shape)
print(ratings_test.shape)

(75000, 3)
(25000, 3)


In [28]:
class NEW_MF():
    """Biased matrix factorization (SGD) with a held-out test set.

    A rating is modelled as ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Cells of the
    rating matrix equal to 0 are treated as "not rated". The class also keeps
    id <-> positional-index maps so callers can work with raw user/item ids.
    """

    def __init__(self, ratings, hyper_params):
        """Store the rating matrix, the hyper-parameters and the id maps.

        Parameters
        ----------
        ratings : pandas.DataFrame or array-like, shape (num_users, num_items)
            Dense rating matrix; 0 marks a missing rating. It is copied. For a
            DataFrame the index holds user ids and the columns hold item ids;
            for any other array-like the positional indices are used as ids.
        hyper_params : dict
            Required keys: 'K' (number of latent factors), 'alpha' (learning
            rate), 'beta' (regularization rate), 'iterations' (SGD epochs) and
            'verbose' (print RMSE every 10 epochs).
            Optional key: 'random_state' (int) to make initialisation and
            shuffling reproducible; when absent the global NumPy random state
            is used.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = hyper_params['K']
        self.alpha = hyper_params['alpha']
        self.beta = hyper_params['beta']
        self.iterations = hyper_params['iterations']
        self.verbose = hyper_params['verbose']
        self.random_state = hyper_params.get('random_state')
        # Empty until set_test() is called; test_rmse() reports NaN meanwhile.
        self.test_set = []

        # MovieLens user/item ids happen to be consecutive integers, but real
        # data may not be, so map every id that actually occurs to its
        # row/column position (and back).
        if isinstance(ratings, pd.DataFrame):
            user_ids, item_ids = ratings.index, ratings.columns
        else:
            user_ids, item_ids = range(self.num_users), range(self.num_items)

        self.item_id_index = {one_id: i for i, one_id in enumerate(item_ids)}
        self.index_item_id = {i: one_id for one_id, i in self.item_id_index.items()}
        self.user_id_index = {one_id: i for i, one_id in enumerate(user_ids)}
        self.index_user_id = {i: one_id for one_id, i in self.user_id_index.items()}

    def rmse(self):
        """Return the RMSE over all rated (non-zero) cells of ``self.R``.

        Side effects: ``self.predictions`` and ``self.errors`` are replaced by
        arrays holding the prediction / error of every rated cell, in the
        order returned by ``self.R.nonzero()``.
        """
        xs, ys = self.R.nonzero()
        # One matrix product instead of a Python-level loop over every rating.
        self.predictions = self.full_prediction()[xs, ys]
        self.errors = self.R[xs, ys] - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            # Error of the current prediction for user i, item j.
            e = r - self.get_prediction(i, j)

            # User and item bias updates.
            self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
            self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

            # Both factor gradients must be evaluated at the same point, so
            # keep the pre-update user vector for the item update.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * ((e * self.Q[j, :]) - (self.beta * self.P[i, :]))
            self.Q[j, :] += self.alpha * ((e * p_old) - (self.beta * self.Q[j, :]))

    def get_prediction(self, i, j):
        """Return the predicted rating for user index ``i`` and item index ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    def set_test(self, ratings_test):
        """Register held-out ratings and remove them from the training matrix.

        Parameters
        ----------
        ratings_test : pandas.DataFrame
            Columns are read by position: user id, item id, rating.

        Returns
        -------
        list of [user_index, item_index, rating]
            Also stored as ``self.test_set``. The matching cells of ``self.R``
            are set to 0 so that they are not used for training.

        Raises
        ------
        KeyError
            If a user id or item id is not present in the rating matrix.
        """
        test_set = []
        # Iterate the three columns once instead of doing a scalar .iloc
        # lookup per cell.
        rows = zip(ratings_test.iloc[:, 0],
                   ratings_test.iloc[:, 1],
                   ratings_test.iloc[:, 2])
        for user_id, item_id, rating in rows:
            x = self.user_id_index[user_id]
            y = self.item_id_index[item_id]
            test_set.append([x, y, rating])
            self.R[x, y] = 0
        self.test_set = test_set
        return test_set

    def test_rmse(self):
        """Return the RMSE on ``self.test_set``, or NaN when no test set is registered."""
        if not self.test_set:
            return float('nan')
        rows, cols, actual = zip(*self.test_set)
        predicted = self.full_prediction()[np.asarray(rows), np.asarray(cols)]
        errors = np.asarray(actual, dtype=float) - predicted
        return np.sqrt(np.mean(errors**2))

    def train(self):
        """Initialise the parameters, run SGD and return the training history.

        Returns
        -------
        list of (int, float, float)
            ``(epoch, train_rmse, test_rmse)`` for every epoch, epochs counted
            from 1. ``test_rmse`` is NaN when no test set was registered.
        """
        # A private RandomState is used only when a seed was supplied, so the
        # default behaviour (global NumPy random state) is unchanged.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.P = rng.normal(scale = 1./self.K,
                            size = (self.num_users, self.K))
        self.Q = rng.normal(scale = 1./self.K,
                            size = (self.num_items, self.K))
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        # Global bias: mean of the observed training ratings only.
        self.b = np.mean(self.R[self.R.nonzero()])

        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        training_process = []
        for i in range(self.iterations):
            rng.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()       # train
            rmse2 = self.test_rmse()  # test
            training_process.append((i+1, rmse1, rmse2))
            if self.verbose and (i+1) % 10 == 0:
                print("Iteration : % d ; Train RMSE = %.4f ; Test RMSE = %.4f"% (i+1, rmse1, rmse2))
        return training_process

    def test(self):
        """Backward-compatible name for :meth:`train` (it trains and reports both RMSEs)."""
        return self.train()

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for a raw ``user_id`` / ``item_id`` pair."""
        return self.get_prediction(self.user_id_index[user_id],
                                   self.item_id_index[item_id])

    def full_prediction(self):
        """Return the (num_users, num_items) matrix of predicted ratings."""
        return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)

# Dense user x movie matrix built from all ratings; pairs without a rating become 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# K latent factors, learning rate alpha, regularization beta, number of SGD epochs.
hyper_params = dict(K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)

mf = NEW_MF(R_temp, hyper_params)
# Register the held-out ratings (they are zeroed in the model's matrix) before training.
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration :  10 ; Train RMSE = 0.9673 ; Test RMSE = 0.9814
Iteration :  20 ; Train RMSE = 0.9427 ; Test RMSE = 0.9608
Iteration :  30 ; Train RMSE = 0.9317 ; Test RMSE = 0.9521
Iteration :  40 ; Train RMSE = 0.9250 ; Test RMSE = 0.9473
Iteration :  50 ; Train RMSE = 0.9203 ; Test RMSE = 0.9444
Iteration :  60 ; Train RMSE = 0.9164 ; Test RMSE = 0.9424
Iteration :  70 ; Train RMSE = 0.9128 ; Test RMSE = 0.9409
Iteration :  80 ; Train RMSE = 0.9088 ; Test RMSE = 0.9395
Iteration :  90 ; Train RMSE = 0.9041 ; Test RMSE = 0.9381
Iteration :  100 ; Train RMSE = 0.8980 ; Test RMSE = 0.9364


In [29]:
# Inspect the first held-out row (user_id, movie_id, rating).
ratings_test.iloc[0]

user_id     532
movie_id    373
rating        3
Name: 29900, dtype: int64

In [None]:
# Predicted rating for every (user, item) pair as one dense matrix.
print(mf.full_prediction())

In [None]:
print(mf.get_one_prediction(1,2)) # predicted rating of user id 1 for item id 2

### 4.5 MF의 최적 파라미터 찾기

In [None]:
# Search for the best number of latent factors K.
# `index` collects the K values tried and `results` the matching training histories.
results = []
index = []

R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

for K in range(50, 261, 10):
    print(f'K : {K}')
    hyper_params = dict(K=K, alpha=0.001, beta=0.02, iterations=300, verbose=True)
    mf = NEW_MF(R_temp, hyper_params)
    test_set = mf.set_test(ratings_test)
    result = mf.test()
    index.append(K)
    results.append(result)

In [None]:
# For every K tried, record [K, best iteration (1-based), lowest test RMSE].
# Iterate over `results` (one history per K), not over the last single
# history, and avoid rebinding `result` or shadowing the builtin `min`.
summary = []
for k_value, history in zip(index, results):
    # Each history entry is (iteration, train_rmse, test_rmse).
    test_rmses = [entry[2] for entry in history]
    best_rmse = min(test_rmses)
    best_pos = test_rmses.index(best_rmse)
    summary.append([k_value, best_pos + 1, best_rmse])