📖 참고 : 패스트캠퍼스 - 딥러닝을 활용한 추천시스템 구현 올인원 패키지 Online.

# Neighborhood-based Collaborative Filtering

-----

# 1. 라이브러리 호출 및 데이터 읽기

In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
path = 'data/movielens'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [3]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


# 2. Sparse Matrix 만들기

- sparse matrix = 
(user, movie)

In [4]:
sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
sparse_matrix.index.name = 'movieId'

sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,,2.5,,5.0
2,,,,,,,,4.0,,,...,,4.0,,,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,,,,,,,,,,,...,,,,,,,,,,
193579,,,,,,,,,,,...,,,,,,,,,,
193581,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


# 3. 코사인 유사도

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def cossim_matrix(a, b):
    cossim_values = cosine_similarity(a.values, b.values)
    cossim_df = pd.DataFrame(data=cossim_values, columns = a.index.values, index=a.index)

    return cossim_df

# 4. Neighborhood-based 협업필터링 추천점수 계산하기

### Item-based

In [6]:
item_sparse_matrix = sparse_matrix.fillna(0)
item_sparse_matrix.shape

(8938, 610)

In [7]:
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.304336,0.267816,0.040259,0.221228,0.266544,0.149392,0.132943,0.182044,0.296838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.304336,1.000000,0.226138,0.052482,0.154783,0.209716,0.189420,0.068012,0.027945,0.303157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.267816,0.226138,1.000000,0.000000,0.306435,0.245555,0.368724,0.168267,0.253679,0.178219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.040259,0.052482,0.000000,1.000000,0.095673,0.068708,0.205962,0.000000,0.000000,0.044835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.221228,0.154783,0.306435,0.095673,1.000000,0.238683,0.343507,0.204088,0.222925,0.150729,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193579,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [8]:
userId_grouped = train_df.groupby('userId')

item_prediction_result_df = pd.DataFrame(index=list(userId_grouped.indices.keys()), columns=item_sparse_matrix.index)
item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [9]:
for userId, group in tqdm(userId_grouped):
    user_sim = item_cossim_df.loc[group['movieId']]
    user_rating = group['rating']

    sim_sum = user_sim.sum(axis=0)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    item_prediction_result_df.loc[userId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/610 [00:00<?, ?it/s]

In [10]:
item_prediction_result_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
1,4.202787,4.187832,4.176092,3.032719,4.045611,4.240191,3.993299,3.905474,3.702791,4.188552,...,0.979003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.382727
2,3.192613,3.152069,2.461376,0.280903,2.619162,2.975403,1.858767,1.687683,1.092676,3.022348,...,0.387743,1.708851,1.708851,1.708851,1.708851,1.708851,1.708851,1.708851,1.708851,2.326659
3,1.323152,1.204457,1.396351,0.217016,0.819716,1.59889,0.802514,0.755008,0.86708,1.553646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.48994,3.488045,3.388644,3.150166,3.304884,3.502666,3.317941,2.906289,3.034133,3.416967,...,1.033496,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,1.962996
5,3.279163,3.1278,2.999465,2.565822,2.941788,3.168687,3.007926,2.678086,2.276683,3.015567,...,0.391638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562705
6,3.585973,3.572326,3.536148,3.33866,3.514443,3.5477,3.538256,3.420565,3.340599,3.552752,...,0.446497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.672442
7,3.360095,3.253378,3.245163,2.225812,3.211405,3.338081,3.183199,2.891798,2.891935,3.351496,...,0.872628,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,2.50752
8,3.261226,3.22385,3.03702,2.581285,2.945482,3.215053,3.061258,2.6713,2.370005,3.174262,...,0.597653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.730709
9,2.807968,2.621562,2.282086,0.767416,2.250433,2.641792,2.04582,1.599452,1.070717,2.63652,...,0.458811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.070416,3.043548,2.859834,0.908598,2.979648,2.904999,2.752276,2.240635,1.588792,3.056238,...,0.63253,1.211363,1.211363,1.211363,1.211363,1.211363,1.211363,1.211363,1.211363,2.333355


### User-based

In [11]:
user_sparse_matrix = sparse_matrix.fillna(0).transpose()

In [12]:
user_cossim_df = cossim_matrix(user_sparse_matrix, user_sparse_matrix)
user_cossim_df # 1번 유저와 4번 유저 연관성 높은편

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.016665,0.070570,0.160438,0.075410,0.087404,0.123664,0.069225,0.037416,0.009900,...,0.056163,0.124896,0.162324,0.050852,0.101583,0.128198,0.240652,0.225897,0.063984,0.095228
2,0.016665,1.000000,0.000000,0.004295,0.020560,0.030996,0.027726,0.000000,0.000000,0.057112,...,0.151666,0.019379,0.006645,0.000000,0.000000,0.028180,0.000000,0.046286,0.033522,0.090288
3,0.070570,0.000000,1.000000,0.002677,0.000000,0.003477,0.000000,0.000000,0.000000,0.000000,...,0.002878,0.002174,0.027609,0.000000,0.000000,0.012090,0.000000,0.023927,0.000000,0.018332
4,0.160438,0.004295,0.002677,1.000000,0.121648,0.093634,0.106495,0.047930,0.000000,0.034281,...,0.065328,0.116656,0.251105,0.056396,0.075940,0.187236,0.102819,0.109515,0.038805,0.086941
5,0.075410,0.020560,0.000000,0.121648,1.000000,0.164390,0.075932,0.302418,0.000000,0.000000,...,0.090020,0.306634,0.103646,0.145354,0.119800,0.071333,0.064705,0.111755,0.159946,0.049236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.128198,0.028180,0.012090,0.187236,0.071333,0.076207,0.159508,0.051044,0.034675,0.066999,...,0.123464,0.083967,0.238107,0.068227,0.110706,1.000000,0.101661,0.218106,0.073027,0.164355
607,0.240652,0.000000,0.000000,0.102819,0.064705,0.121999,0.167129,0.167671,0.000000,0.010033,...,0.064124,0.196274,0.176603,0.111658,0.116814,0.101661,1.000000,0.233010,0.142659,0.114225
608,0.225897,0.046286,0.023927,0.109515,0.111755,0.144931,0.274492,0.166382,0.053714,0.054227,...,0.134966,0.183543,0.198187,0.130758,0.159466,0.218106,0.233010,1.000000,0.107489,0.250085
609,0.063984,0.033522,0.000000,0.038805,0.159946,0.168015,0.080615,0.372351,0.000000,0.027156,...,0.028968,0.315137,0.051141,0.183182,0.079507,0.073027,0.142659,0.107489,1.000000,0.047981


In [13]:
movieId_grouped = train_df.groupby('movieId')
user_prediction_result_df = pd.DataFrame(index=list(movieId_grouped.indices.keys()), columns=user_sparse_matrix.index)
user_prediction_result_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,,,,,,,,,,,...,,,,,,,,,,
193579,,,,,,,,,,,...,,,,,,,,,,
193581,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [14]:
for movieId, group in tqdm(movieId_grouped):
    user_sim = user_cossim_df.loc[group['userId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0)
    
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    user_prediction_result_df.loc[movieId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/8938 [00:00<?, ?it/s]

In [15]:
print(item_prediction_result_df.shape)
print(user_prediction_result_df.transpose().shape)

(610, 8938)
(610, 8938)


In [16]:
print(item_prediction_result_df.head())
print(user_prediction_result_df.transpose().head())

user_prediction_result_df = user_prediction_result_df.transpose()

movieId    1         2         3         4         5         6         7       \
1        4.202787  4.187832  4.176092  3.032719  4.045611  4.240191  3.993299   
2        3.192613  3.152069  2.461376  0.280903  2.619162  2.975403  1.858767   
3        1.323152  1.204457  1.396351  0.217016  0.819716   1.59889  0.802514   
4         3.48994  3.488045  3.388644  3.150166  3.304884  3.502666  3.317941   
5        3.279163    3.1278  2.999465  2.565822  2.941788  3.168687  3.007926   

movieId    8         9         10      ...    190219    191005    193565  \
1        3.905474  3.702791  4.188552  ...  0.979003       0.0       0.0   
2        1.687683  1.092676  3.022348  ...  0.387743  1.708851  1.708851   
3        0.755008   0.86708  1.553646  ...       0.0       0.0       0.0   
4        2.906289  3.034133  3.416967  ...  1.033496  0.505535  0.505535   
5        2.678086  2.276683  3.015567  ...  0.391638       0.0       0.0   

movieId    193567    193571    193573    193579    19358

# 5. RMSE로 추천시스템 성능 평가하기

In [17]:
def evaluate(test_df, prediction_result_df):
  groups_with_movie_ids = test_df.groupby(by='movieId')
  groups_with_user_ids = test_df.groupby(by='userId')
  intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
  intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(groups_with_user_ids.indices.keys()))))

  print(len(intersection_movie_ids))
  print(len(intersection_user_ids))

  compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]
  # compressed_prediction_df

  # test_df에 대해서 RMSE 계산
  grouped = test_df.groupby(by='userId')
  result_df = pd.DataFrame(columns=['rmse'])
  for userId, group in tqdm(grouped):
      if userId in intersection_user_ids:
          pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
          pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
          actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})

          final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
          final_df = final_df.round(4) # 반올림
    
  return final_df

In [18]:
evaluate(test_df, user_prediction_result_df)

4385
610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,actual_rating,movieId,pred_rating
0,5.0,3527,3.285668
1,3.5,84772,2.211593
2,3.5,103141,2.835437
3,4.0,81132,0.450808
4,4.5,130634,1.022444
...,...,...,...
218,4.0,106100,2.868226
219,4.0,111759,3.513955
220,1.0,4852,0.202402
221,3.0,2628,2.937769


In [19]:
evaluate(test_df, item_prediction_result_df)

4385
610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,actual_rating,movieId,pred_rating
0,5.0,3527,3.797414
1,3.5,84772,3.730651
2,3.5,103141,3.689697
3,4.0,81132,3.334023
4,4.5,130634,3.451202
...,...,...,...
218,4.0,106100,3.729685
219,4.0,111759,3.758423
220,1.0,4852,3.648878
221,3.0,2628,3.812378


In [20]:
result_df = evaluate(test_df, user_prediction_result_df)
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

4385
610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


  0%|          | 0/610 [00:00<?, ?it/s]

     actual_rating  movieId pred_rating
0              5.0     3527    3.285668
1              3.5    84772    2.211593
2              3.5   103141    2.835437
3              4.0    81132    0.450808
4              4.5   130634    1.022444
..             ...      ...         ...
218            4.0   106100    2.868226
219            4.0   111759    3.513955
220            1.0     4852    0.202402
221            3.0     2628    2.937769
222            5.0     1953    3.172221

[223 rows x 3 columns]
RMSE: 1.6949489761800296


In [21]:
result_df = evaluate(test_df, item_prediction_result_df)
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

4385
610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


  0%|          | 0/610 [00:00<?, ?it/s]

     actual_rating  movieId pred_rating
0              5.0     3527    3.797414
1              3.5    84772    3.730651
2              3.5   103141    3.689697
3              4.0    81132    3.334023
4              4.5   130634    3.451202
..             ...      ...         ...
218            4.0   106100    3.729685
219            4.0   111759    3.758423
220            1.0     4852    3.648878
221            3.0     2628    3.812378
222            5.0     1953    3.866463

[223 rows x 3 columns]
RMSE: 0.8145193961484049


# 6. 결론

- Item-based 협업필터링의 RMSE가 Use-based 협업필터링보다 낮다.