# 추천 시스템
* 컨텐츠 기반 필터링
* 협업 필터링

# Surprise
* 추천 시스템 개발을 위한 라이브러리

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 1.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1623224 sha256=1cd8c04d10776c7b6db44eafa1d6aa5eee6130409e16a144639a0f32e9f84a71
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [3]:
data = Dataset.load_builtin('ml-100k', prompt = False)
data.raw_ratings[:10]

Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
model = SVD()


In [5]:
cross_validate(model, data, measures = ['rmse', 'mae'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9425  0.9358  0.9403  0.9330  0.9278  0.9359  0.0052  
MAE (testset)     0.7419  0.7407  0.7408  0.7347  0.7323  0.7381  0.0038  
Fit time          4.80    7.03    5.77    5.12    4.27    5.40    0.95    
Test time         0.39    0.15    0.24    0.18    0.19    0.23    0.08    


{'fit_time': (4.796828031539917,
  7.0259318351745605,
  5.765401840209961,
  5.116170406341553,
  4.270819664001465),
 'test_mae': array([0.7418925 , 0.74073523, 0.74083019, 0.7346977 , 0.73229092]),
 'test_rmse': array([0.94245207, 0.93580898, 0.94026854, 0.93303782, 0.92778929]),
 'test_time': (0.387096643447876,
  0.14985132217407227,
  0.2426133155822754,
  0.17836332321166992,
  0.18712472915649414)}

# 컨텐츠 기반 필터링
* 컨텐츠 기반 필터링은 이전의 행동과 명시적 피드백을 통해 좋아하는 것과 유사한 항목을 추천
* 유사도를 기반

In [6]:
import numpy as np
from surprise import Dataset

In [7]:
data = Dataset.load_builtin('ml-100k', prompt= True)
raw_data = np.array(data.raw_ratings, dtype = int)

In [8]:
raw_data[:,0] -= 1
raw_data[:,1] -= 1

In [9]:
n_users = np.max(raw_data[:,0])
n_movies = np.max(raw_data[:,1])
shape = (n_users + 1, n_movies +1)
shape

(943, 1682)

In [10]:
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = 1.
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [11]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1,-1,[]

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    similarity = np.dot(my_vector, user_vector)
    if similarity > best_match:
      best_mach = similarity
      best_match_id = user_id
      best_match_vector = user_vector
print('Best Match: {}, Best Match ID : {}'.format(best_match, best_match_id))

Best Match: -1, Best Match ID : 942


In [12]:
recommand_list = []
for i, log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1and log2 > 0.:
    recommand_list.append(i)
print(recommand_list)

[273, 280, 281, 283, 317, 355, 366, 372, 384, 385, 390, 392, 398, 400, 401, 402, 404, 405, 411, 414, 418, 420, 422, 425, 426, 430, 442, 448, 449, 467, 469, 470, 474, 484, 507, 525, 540, 545, 548, 558, 565, 567, 568, 569, 575, 580, 584, 594, 608, 613, 624, 654, 671, 684, 716, 719, 720, 721, 723, 731, 738, 755, 762, 764, 784, 793, 795, 807, 815, 823, 824, 830, 839, 927, 940, 942, 1010, 1027, 1043, 1046, 1066, 1073, 1187, 1227, 1329]


* 코사인 유사도를 사용해 추천
* 두 벡터가 이루고 있는 각을 계산

In [13]:
def cross_cos_similarity(v1,v2):
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))

  dot = np.dot(v1,v2)
  return dot / (norm1 * norm2)

In [14]:
my_id, my_vector = 0, adj_matrix[0]
best_match, best_match_id, best_match_vector = -1,-1,[]

for user_id, user_vector in enumerate(adj_matrix):
  if my_id != user_id:
    cos_similarity = cross_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_mach = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector
print('Best Match: {}, Best Match ID : {}'.format(best_match, best_match_id))

Best Match: -1, Best Match ID : 942


# 협업 필터링
* 사용자와 항목의 유사성을 동시에 고려
* 기존에 내 관심사가 아닌 항목이라도 추천
* 자동으로 임베딩 학습

In [15]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [16]:
data = Dataset.load_builtin('ml-100k', prompt = False)

## KNNbasic을 사용한 협업 필터링

In [17]:
model = KNNBasic()
cross_validate(model, data, measures = ['rmse', 'mae'], cv =5, n_jobs = 4, verbose = True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9811  0.9637  0.9748  0.9869  0.9834  0.9780  0.0082  
MAE (testset)     0.7742  0.7615  0.7707  0.7796  0.7760  0.7724  0.0062  
Fit time          0.35    0.41    0.62    0.60    0.51    0.50    0.11    
Test time         6.58    8.10    8.43    6.39    4.03    6.71    1.56    


{'fit_time': (0.34525132179260254,
  0.4105257987976074,
  0.6234827041625977,
  0.5984477996826172,
  0.5137319564819336),
 'test_mae': array([0.77419307, 0.76145315, 0.77068052, 0.77958988, 0.77604258]),
 'test_rmse': array([0.98108614, 0.96368066, 0.97480789, 0.98694947, 0.98335157]),
 'test_time': (6.575359106063843,
  8.103976249694824,
  8.427873373031616,
  6.393515348434448,
  4.030378580093384)}

## SVD를 사용한 협업 필터링

In [18]:
model = SVD()
cross_validate(model, data, measures = ['rmse', 'mae'], cv =5, n_jobs = 4, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9291  0.9444  0.9328  0.9333  0.9419  0.9363  0.0058  
MAE (testset)     0.7327  0.7435  0.7351  0.7359  0.7393  0.7373  0.0037  
Fit time          10.02   12.40   12.54   10.38   6.97    10.46   2.02    
Test time         0.46    0.46    0.40    0.23    0.14    0.34    0.13    


{'fit_time': (10.021586656570435,
  12.404861450195312,
  12.539223194122314,
  10.37698221206665,
  6.96793794631958),
 'test_mae': array([0.73274948, 0.74349623, 0.73508158, 0.73586089, 0.73932489]),
 'test_rmse': array([0.92913807, 0.94436634, 0.93278916, 0.93331709, 0.9419154 ]),
 'test_time': (0.4574716091156006,
  0.46450304985046387,
  0.3977944850921631,
  0.2252507209777832,
  0.1365509033203125)}

## NMF를 사용한 협업 필터링

In [19]:
model = NMF()
cross_validate(model, data, measures = ['rmse', 'mae'], cv =5, n_jobs = 4, verbose = True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9585  0.9674  0.9617  0.9589  0.9646  0.9622  0.0034  
MAE (testset)     0.7550  0.7607  0.7551  0.7527  0.7580  0.7563  0.0028  
Fit time          11.15   13.26   13.56   11.31   7.46    11.35   2.18    
Test time         0.47    0.34    0.31    0.19    0.12    0.29    0.12    


{'fit_time': (11.149754524230957,
  13.259210348129272,
  13.564266681671143,
  11.307984113693237,
  7.460906267166138),
 'test_mae': array([0.75495165, 0.76066651, 0.75510639, 0.75268224, 0.75797074]),
 'test_rmse': array([0.95851408, 0.96744615, 0.96168184, 0.95887852, 0.96460414]),
 'test_time': (0.4666166305541992,
  0.3439757823944092,
  0.3088560104370117,
  0.19393539428710938,
  0.11891746520996094)}

## SVD++를 사용한 협업 필터링

In [20]:
model = SVDpp()
cross_validate(model, data, measures = ['rmse', 'mae'], cv =5, n_jobs = 4, verbose = True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9195  0.9186  0.9228  0.9195  0.9176  0.9196  0.0017  
MAE (testset)     0.7215  0.7196  0.7240  0.7204  0.7198  0.7211  0.0016  
Fit time          561.05  559.69  564.03  564.95  141.27  478.20  168.48  
Test time         10.08   10.18   7.95    5.60    2.59    7.28    2.88    


{'fit_time': (561.049090385437,
  559.6857116222382,
  564.0267977714539,
  564.9547748565674,
  141.26636576652527),
 'test_mae': array([0.72150413, 0.71956229, 0.72401237, 0.72044898, 0.71984764]),
 'test_rmse': array([0.91954038, 0.91856159, 0.92275322, 0.9195138 , 0.91755947]),
 'test_time': (10.078526258468628,
  10.182770729064941,
  7.946866273880005,
  5.599149227142334,
  2.586862087249756)}

# 하이브리드
* 컨텐츠 기반 필터링과 협업 기반의 필터링 조합

In [60]:
import numpy as np
from sklearn.decomposition import randomized_svd, non_negative_factorization
from surprise import Dataset

In [61]:
data = Dataset.load_builtin('ml-100k', prompt= False)
raw_data = np.array(data.raw_ratings, dtype = int)
raw_data[:, 0] -= 1
raw_data[:, 1] -= 1

In [62]:
n_users = np.max(raw_data[:, 0])
n_movies =np.max(raw_data[:, 1])
shape = (n_users +1, n_movies +1)
shape

(943, 1682)

In [63]:
adj_matrix = np.ndarray(shape, dtype = int)
for user_id, movie_id, rating, time in raw_data:
  adj_matrix[user_id][movie_id] = rating

In [64]:
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [65]:
U, S, V = randomized_svd(adj_matrix, n_components= 2)
S = np.diag(S)



In [66]:
print(U.shape)
print(S.shape)
print(V.shape)

(943, 2)
(2, 2)
(2, 1682)


In [67]:
np.matmul(np.matmul(U,S), V)

array([[ 3.91732663e+00,  1.47276644e+00,  7.98261988e-01, ...,
         6.24907189e-04,  1.41100852e-02,  1.36545878e-02],
       [ 1.85777226e+00,  3.96191175e-01,  5.05705740e-01, ...,
         5.38862978e-03,  1.77237914e-03,  5.26968095e-04],
       [ 8.94989517e-01,  1.71578497e-01,  2.51738682e-01, ...,
         2.92094923e-03,  5.39937171e-04, -1.25733753e-04],
       ...,
       [ 9.92051955e-01,  2.10814957e-01,  2.70363365e-01, ...,
         2.89019297e-03,  9.34221962e-04,  2.66612193e-04],
       [ 1.30425401e+00,  5.27669941e-01,  2.50080165e-01, ...,
        -4.20677765e-04,  5.30525683e-03,  5.28069948e-03],
       [ 2.82999397e+00,  9.70812247e-01,  6.15871694e-01, ...,
         2.02091492e-03,  8.67740813e-03,  8.03107892e-03]])

* 사용자 기반 추천
* 나와 비슷한 취향을 가진 다른 사용자의 행동을 추천
* 사용자 특징 벡터의 유사도 사용

In [68]:
my_id, my_vector = 0,U[0]
best_match, best_match_id, best_match_vector = -1,-1,[]

for user_id, user_vector in enumerate(U):
  if my_id != user_id:
    cos_similarity = cross_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
      best_mach = cos_similarity
      best_match_id = user_id
      best_match_vector = user_vector
print('Best Match: {}, Best Match ID : {}'.format(best_match, best_match_id))

Best Match: -1, Best Match ID : 942


In [69]:
recommand_list = []

for i , log in enumerate(zip(my_vector, best_match_vector)):
  log1, log2 = log
  if log1 < 1. and log2 >0:
    recommand_list.append(i)
print(recommand_list)

[0]
