## Matrix Factorization 실습

KNN과 동일한 ratings 데이터의 모델 기반 협업필터링 방법 중 하나.

In [1]:
import pandas as pd
import numpy as np

np.random.seed(2021)

## 1. Data

### 1.1 Data Load

- 유저-영화 평점 데이터를 이용해 유저가 아직 평가하지 않은 영화를 추천.
- 유저 고유 아이디를 나타내는 userId, 영화 고유 아이디를 나타내는 movieId, 유저가 영화를 평가한 점수 rating 컬럼을 이용.

In [2]:
ratings = pd.read_csv('ratings_small.csv')
ratings = ratings[['userId', 'movieId', 'rating']]

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


다른 두 데이터를 이용해 ratings 데이터의 movieId에 맞는 영화 제목을 얻기.

- 실제 영화 제목과 mapping

In [4]:
movies = pd.read_csv('movies_metadata.csv')
links = pd.read_csv('links_small.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### 1.2 Data Preprocessing

movies에서 'tt숫자'로 이루어진 imdb_id에서 숫자 부분과 links 데이터의 '숫자'로 이루어진 imdbId와 연결.

In [5]:
movies = movies.fillna('')
movies = movies[movies['imdb_id'].str.startswith('tt')]
movies['imdbId'] = movies['imdb_id'].apply(lambda x: int(x[2:]))
movies = movies.merge(links, on='imdbId')

In [6]:
movies = movies[['title', 'movieId']]
movies = movies.set_index('movieId')

In [7]:
movies.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
1,Toy Story
2,Jumanji
3,Grumpier Old Men
4,Waiting to Exhale
5,Father of the Bride Part II


pivot 함수를 이용해 유저 아이디가 인덱스이고, 영화 아이디가 컬럼, 값이 평가 점수인 user_movie_matrix를 생성.

결측값은 0으로 대체.

In [8]:
user_movie_matrix = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
user_movie_matrix = user_movie_matrix.fillna(0)

In [9]:
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Matrix Factorization

### 2.1 초기 세팅

### 2.1.1 정답 R

In [10]:
R = user_movie_matrix.values

In [11]:
n_user = R.shape[0] # 전체 유저 수
n_item = R.shape[1] # 전체 영화 수

### 2.1.2 잠재 요인 행렬

유저와 영화별로 잠재 요인 크기가 10인 행렬을 선언.

In [12]:
K = 10

### 2.1.3 P와 Q 랜덤 값으로 초기화

유저 행렬 P와 영화 행렬 Q를 랜덤 값으로 초기화.

In [13]:
P = np.random.normal(size=(n_user, K))
Q = np.random.normal(size=(n_item, K))

In [14]:
P

array([[ 1.48860905,  0.67601087, -0.41845137, ...,  0.64500184,
         0.10641374,  0.42215483],
       [ 0.12420684, -0.83795346,  0.4090157 , ..., -0.22508127,
        -1.33620597,  0.30372151],
       [-0.72015884,  2.5449146 ,  1.31729112, ...,  1.37626076,
        -0.47218397,  0.5240849 ],
       ...,
       [-0.34036392,  1.10504404,  0.25446956, ..., -0.20915116,
         0.65492966, -0.3958868 ],
       [-0.31165161,  1.78026007,  1.08668056, ...,  0.03222073,
        -0.52333827, -0.11044398],
       [-1.2146398 , -0.10685361,  0.845032  , ..., -1.02719008,
         0.00569836,  0.22101445]])

In [15]:
Q

array([[ 0.30194165,  0.36629183, -0.52061911, ..., -0.43741366,
         1.19149681,  0.03748171],
       [-0.02156433, -1.76596912, -0.05909484, ...,  0.45219164,
        -0.99925363,  1.92936678],
       [-0.26655993, -0.48104382, -0.16922735, ...,  0.48428921,
        -0.04504006, -0.35068684],
       ...,
       [-0.33373493, -0.76955212, -1.0908092 , ...,  0.88754135,
        -2.14405834,  1.25667084],
       [-0.32719638, -0.73017883,  0.04958502, ...,  0.20299266,
         0.02776886,  0.30185611],
       [ 0.0813312 ,  0.29697644,  1.11559121, ..., -1.66948007,
        -0.15183078,  0.60258872]])

### 2.2 Gradient Descent를 이용한 잠재 요인 행렬 학습

유저 '670'이 영화 '0'에 평가한 점수를 학습하는 과정.

In [16]:
user_id = 670
item_id = 0

### 2.2.1 R hat을 계산

In [17]:
pred = P[user_id, :].dot(Q[item_id, :].T)
pred

1.0134653914338267

### 2.2.2 R과 R hat의 오차를 계산

In [18]:
real = R[user_id, item_id]
real

5.0

In [19]:
error = real - pred
error

3.9865346085661733

### 2.2.3 Gradient Descent를 이용한 P와 Q를 업데이트

In [20]:
learning_rate = 0.01

In [21]:
dp = 2 * error * Q[item_id, :]
dq = 2 * error * P[user_id, :]

In [22]:
P[user_id, :] += learning_rate * dp
Q[item_id, :] += learning_rate * dq

In [24]:
P[user_id] # 업데이트된 P

array([-1.19056579, -0.07764891,  0.80352268,  0.58764539,  1.12716065,
       -0.30665716,  1.03069912, -1.06206537,  0.10069722,  0.2240029 ])

업데이트된 P와 Q를 이용해 오차가 감소했음을 알 수 있음.

In [25]:
pred = P[user_id, :].dot(Q[item_id, :].T)
error = real - pred
error

2.7414918543707754

### 2.2.4 업데이트 과정을 반복

In [26]:
epochs = 10
real = R[user_id, item_id]

for epoch in range(epochs):
    pred = P[user_id, :].dot(Q[item_id, :].T)
    error = real - pred
    
    dp = 2 * error * Q[item_id, :]
    dq = 2 * error * P[user_id, :]
    
    P[user_id, :] += learning_rate * dp
    Q[item_id, :] += learning_rate * dq
    
    print(f"Epoch{epoch}: {round(error, 3)}")

Epoch0: 2.741
Epoch1: 1.86
Epoch2: 1.242
Epoch3: 0.817
Epoch4: 0.532
Epoch5: 0.344
Epoch6: 0.221
Epoch7: 0.141
Epoch8: 0.09
Epoch9: 0.058


### 2.3 전체 데이터를 이용해 P와 Q 업데이트

In [27]:
K = 10

P = np.random.normal(size=(n_user, K))
Q = np.random.normal(size=(n_item, K))

epochs = 5
learning_rate = 0.01

for epoch in range(1, epochs + 1):
    total_error = 0
    iteration = 0
    
    # 모든 유저에 대해 반복
    for user_id in range(n_user):
        # 모든 아이템에 대해 반복
        for item_id in range(n_item):
            
            real = R[user_id, item_id]
            
            # 평가하지 않은 경우 제외
            if real == 0:
                continue
                
            # P와 Q 업데이트
            pred = P[user_id, :].dot(Q[item_id, :].T)
            error = real - pred
            
            dp = 2 * error * Q[item_id, :]
            dq = 2 * error * P[user_id, :]
            
            P[user_id, :] += learning_rate * dp
            Q[item_id, :] += learning_rate * dq
            
            total_error += (error ** 2)
            iteration += 1
            
    print(f"Epoch{epoch}: {round(np.sqrt(total_error / iteration), 5)}")

Epoch1: 2.98596
Epoch2: 1.67461
Epoch3: 1.3254
Epoch4: 1.1665
Epoch5: 1.06908


### 2.4 영화 추천하기

모든 영화에 대해서 점수를 예측하고 예측 평가 점수가 높은 영화를 유저에게 추천.

In [35]:
user_id = 124

In [36]:
# 유저 '124' 잠재요인에 모든 영화의 잠재요인을 곱해 평점 예측
prediction = P[[user_id], :].dot(Q.T)[0]

In [37]:
prediction

array([ 4.07742196,  3.43959439,  3.33107706, ..., -0.12097473,
        2.20315537,  1.47067927])

In [38]:
# 영화 아이디별 예측 평가 점수를 내림차순으로 정렬
prediction = pd.Series(
    data=prediction,
    index=user_movie_matrix.columns,
).sort_values(ascending=False)

In [39]:
prediction

movieId
2305      7.600884
2481      7.418646
3192      7.356615
118326    7.112933
52528     7.098605
            ...   
26180    -5.148957
59834    -5.174445
5864     -5.540053
4984     -5.771524
137595   -6.315502
Length: 9066, dtype: float64

In [40]:
# 아직 평가하지 않은 영화만 추출
prediction = prediction[user_movie_matrix.loc[user_id] == 0]

In [41]:
# 예측 평가 점수 상위 10개의 영화 아이디 추출
recommend = prediction[:10].index

In [42]:
recommend

Int64Index([2305, 2481, 3192, 118326, 52528, 74370, 25744, 5288, 1652, 844], dtype='int64', name='movieId')

In [43]:
movies.loc[recommend]

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
2305,Slam
2481,My Name Is Joe
3192,The Terrorist
118326,By the Gun
52528,Tristana
74370,The House of the Devil
25744,Häxan
5288,The Atomic Cafe
1652,Year of the Horse
844,The Story of Xinghua
