# collaborative filtering based on Matrix Factorization

- referece : 파이썬 머신러닝 완벽가이드 (권철민, 위키북스), ch. 9.7
- https://github.com/wikibook/pymldg-rev
- dataset : movielens modified w/o rating info.

In [42]:
import numpy as np
from sklearn.metrics import mean_squared_error

# 실제 R 행렬과 예측 행렬의 RMSE 계산
def get_rmse(R, P, Q, non_zeros):
    error = 0
    # 두개의 분해된 행렬 P와 Q.T의 내적 곱으로 예측 R행렬 생성
    full_pred_matrix = np.dot(P, Q.T)

    # 실제 R 행렬에서 NaN이 아닌 값의 위치 인덱스를 추출하여 실제 R 행렬과 에측 행렬의 RMSE 추출
    x_non_zero_idx = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_idx = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_idx, y_non_zero_idx]

    ffull_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_idx, y_non_zero_idx]

    mse = mean_squared_error(R_non_zeros, ffull_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)

    return rmse

In [43]:
from tqdm import tqdm

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    num_users, num_items = R.shape
    # P와 Q 매트릭스의 크기를 지정하고 정규분포를 가진 랜덤한 값으로 입력합니다.
    np.random.seed()
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0

    # R > 0 인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 젖아.
    non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0 ]

    # SGD기법으로 P와 Q 맽릭스를 계속 업데이트
    for step in range(steps):
        print("step : ", {steps})
        for i, j, r in non_zeros:
            # 실제 값과 예측 값의 차이인 오류 값 구함
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # Regularization을 반영한 SGD 업데이트 공식 적용
            P[i,:] = P[i,:] + learning_rate * (eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate * (eij * P[i, :] - r_lambda*Q[j,:])

        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10 ) == 0:
            print("### iteration step :", step," rmse : ", rmse)

        return P, Q

In [44]:
import pandas as pd
import numpy as np

ratings = pd.read_csv('./data/train/train_ratings.csv')
ratings['rating'] = 1.0

ratings_df = ratings[['user', 'item', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index='user', columns='item')
ratings_matrix

# column = 'title'로 title 컬럼으로 pivot 수행
ratings_matrix = rating_movies.pivot_table('rating', index ='user', columns='item' )

# # NaN 값을 모두 0으로 변환
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138486,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01)
pred_matrix = np.dot(P, Q.T)

step :  {200}
### iteration step : 0  rmse :  0.5823203149938739


In [110]:
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index=ratings_matrix.index, columns=ratings_matrix.columns )
ratings_pred_matrix.head(3)

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.002598,-0.005692,-0.001228,0.009116,-0.003179,0.003702,-0.000288,0.002416,0.003751,0.002511,...,-0.004565,0.00114,-0.000247,0.009733,-0.000216,0.00399,-0.001047,0.002761,-0.001517,-0.001003
14,-0.000814,0.001535,0.004758,0.004233,0.000817,0.006391,0.003105,0.006867,0.001004,0.004979,...,9.9e-05,-0.001352,0.001345,0.003179,-0.003674,-0.001134,0.001611,0.007131,-0.000571,-0.00626
18,-0.003813,-0.003089,-0.00476,-0.000866,0.001158,-0.002854,0.001033,0.003577,0.003881,-0.004878,...,0.003764,0.003226,0.000919,0.001049,0.002116,0.006161,0.003884,-0.003958,0.002755,0.00126


In [162]:
def get_unseen_movies(ratings_matrix, userId):
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [163]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [164]:
list(ratings_df['item'].unique())

[4643,
 170,
 531,
 616,
 2140,
 2722,
 2313,
 2688,
 2428,
 3113,
 1591,
 2600,
 8169,
 2572,
 58293,
 7541,
 1367,
 32,
 4792,
 7444,
 53953,
 56949,
 6502,
 53000,
 51662,
 5151,
 35836,
 7293,
 33585,
 8810,
 56801,
 5377,
 344,
 19,
 410,
 2124,
 828,
 1274,
 8977,
 1032,
 1214,
 1200,
 1320,
 3897,
 7173,
 1225,
 2858,
 59418,
 45361,
 2706,
 1321,
 2793,
 33085,
 4235,
 3892,
 4340,
 27660,
 43556,
 47124,
 2294,
 48304,
 150,
 31184,
 34338,
 1917,
 50162,
 2827,
 27368,
 4366,
 2153,
 30812,
 3525,
 1270,
 2011,
 2012,
 8973,
 1255,
 2018,
 541,
 4878,
 7361,
 31658,
 2571,
 7099,
 260,
 1196,
 60069,
 160,
 1882,
 60037,
 880,
 36509,
 405,
 3826,
 4133,
 673,
 6541,
 611,
 172,
 4638,
 5171,
 208,
 4887,
 5459,
 60760,
 8361,
 60514,
 1544,
 1876,
 442,
 32213,
 5219,
 1690,
 2717,
 27608,
 52722,
 780,
 6934,
 52287,
 3745,
 45499,
 37830,
 60040,
 34319,
 8644,
 6365,
 34048,
 316,
 3300,
 7022,
 7254,
 57368,
 1584,
 2232,
 1748,
 1253,
 49278,
 42718,
 1097,
 7481,
 5903

In [166]:
mp_items = []
users = []

for u in tqdm(list(ratings_df['user'].unique())):
    # 사용자가 관람하지 않는 영화명 추출   
    unseen_list = get_unseen_movies(ratings_matrix, u)

    # 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천 
    recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, u, unseen_list, top_n=10)

    mp_items= mp_items + list(recomm_movies.index)
    users = users + [u, u, u, u, u, u, u, u, u, u]

# 평점 데이타를 DataFrame으로 생성
test_df = pd.DataFrame(zip(users,mp_items), columns=['user','item'])
test_df.to_csv("most_popular_submission.csv", index=False)

100%|██████████| 31360/31360 [21:40<00:00, 24.12it/s] 
