# 콘텐츠기반 필터링

In [1]:
import numpy as np
import pandas as pd

In [2]:
!pip install scikit-surprise > /dev/null

In [3]:
from surprise import Dataset

In [4]:
data = Dataset.load_builtin('ml-100k', prompt = False)
df= pd.DataFrame(data.raw_ratings, columns = ['user_id', 'movie_id','rating','timestamp'])
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [5]:
df.shape

(100000, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100000 non-null  object 
 1   movie_id   100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.1+ MB


In [7]:
df.user_id.min(), df.user_id.max()

('1', '99')

## 인접행렬 생성(Adjacent Matrix)
- row : user
- col : movie
- contents : rating

In [8]:
# 데이터가 array가 아님
raw_data = np.array(data.raw_ratings, dtype = int)

In [9]:
np.min(raw_data, axis = 0)

array([        1,         1,         1, 874724710])

In [10]:
np.max(raw_data, axis = 0)

array([      943,      1682,         5, 893286638])

In [11]:
# user_id , movie_id 를 0 부터 시작하도록 변경
raw_data[:, :2] -= 1

In [12]:
np.min(raw_data, axis = 0)

array([        0,         0,         1, 874724710])

### 행 num, 열 num 필요

In [13]:
nrows = df['user_id'].nunique()
ncols = df['movie_id'].nunique()
nrows, ncols

(943, 1682)

### 1.본 영화/ 안 본 영화 (= 1 / 0) 구분

In [14]:
adj_mat = np.zeros([nrows, ncols], int)
for user_id, movie_id, _, _ in raw_data:
    adj_mat[user_id, movie_id] = 1
adj_mat[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [15]:
# 0번이 '나'일 때
my_id, my_vector = 0, adj_mat[0]

#### 유사도
- 이진벡터의 내적

In [16]:
# 나와 10번 사용자 사이의 유사도
np.dot(my_vector, adj_mat
[10])

71

In [17]:
# 누가 나랑 닮았는지
best_score = 0 
best_match_id = 0

for i in range(1, len(adj_mat)):
    dot = np.dot(my_vector, adj_mat[i])
    if dot > best_score:
         best_score, best_match_id = dot, i
best_score ; best_match_id

275

In [18]:
best_vector = adj_mat[best_match_id]
my_vector[100:110]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [19]:
best_vector[100:110]

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0])

In [20]:
# 내가 보지 않은 영화 중에서 best_match가 본 영화 추천
recommend = []
for u , (my_view, best_match_view) in enumerate(zip(my_vector, best_vector)):
    if my_view == 0 and best_match_view == 1:
        recommend.append(i)

In [21]:
len(recommend), recommend[:10]

(335, [942, 942, 942, 942, 942, 942, 942, 942, 942, 942])

### 2.평점점수를 주는 경우

In [22]:
adj_mat = np.zeros([nrows, ncols], int)
for user_id, movie_id, rating, _ in raw_data:
    adj_mat[user_id, movie_id] = rating
adj_mat[:5]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0]])

#### 2-1. 유사도 - 유클리드거리
- 유클리드 거리가 작을수록 닮음.

In [23]:
# 0번이 '나'일 때
my_id, my_vector = 0, adj_mat[0]

In [24]:
# 누가 나랑 닮았는지
best_score = 100000 
best_match_id = 0

for i in range(1, len(adj_mat)):
    euc = np.sqrt(np.sum(np.square(my_vector - adj_mat[i])))
    if euc < best_score:
         best_score, best_match_id = euc, i
best_score, best_match_id

(55.06359959174482, 737)

In [25]:
print(adj_mat[0])
print(adj_mat[1])

[5 3 4 ... 0 0 0]
[4 0 0 ... 0 0 0]


In [26]:
np.square(my_vector - adj_mat[1])

array([ 1,  9, 16, ...,  0,  0,  0])

In [27]:
#np.square(my_vector,adj_mat[1])

In [28]:
# 내가 보지 않은 영화 중에서 best_match가 본 영화 추천
best_vector = adj_mat[best_match_id]
recommend = []
for i, (my_view, best_match_view) in enumerate(zip(my_vector, best_vector)):
    if my_view == 0 and best_match_view >= 1:
        recommend.append(i)
len(recommend), recommend[:10]

(48, [297, 312, 317, 342, 356, 366, 379, 384, 392, 402])

In [29]:
# 내가 보지 않은 영화 중에서 best_match가 본 평점이 좋은 영화 추천
best_vector = adj_mat[best_match_id]
recommend = []
for i, (my_view, best_match_view) in enumerate(zip(my_vector, best_vector)):
    if my_view == 0 and best_match_view == 4:
        recommend.append(i)
len(recommend), recommend[:10]

(15, [356, 422, 433, 454, 469, 473, 495, 510, 527, 650])

#### 2-2. 코사인 유사도

In [30]:
def cos_similarity(v1, v2):
    v1_norm = np.sqrt(np.sum(np.square(v1)))
    v2_norm = np.sqrt(np.sum(np.square(v2)))
    return np.dot(v1, v2) / (v1_norm * v2_norm)

In [31]:
# 누가 나랑 닮았는지
best_score = 0
best_match_id = 0

for i in range(1, len(adj_mat)):
    cos_sim = cos_similarity(my_vector, adj_mat[i])
    if cos_sim > best_score:
         best_score, best_match_id = cos_sim, i
best_score, best_match_id

(0.569065731527988, 915)

In [32]:
# 내가 보지 않은 영화 중에서 best_match가 본 영화 추천
best_vector = adj_mat[best_match_id]
recommend = []
for i, (my_view, best_match_view) in enumerate(zip(my_vector, best_vector)):
    if my_view == 0 and best_match_view >= 1:
        recommend.append(i)
len(recommend), recommend[:10]

(162, [272, 275, 279, 280, 283, 285, 289, 294, 297, 316])

In [33]:
# 내가 보지 않은 영화 중에서 best_match가 본 평점이 좋은 영화 추천
best_vector = adj_mat[best_match_id]
recommend = []
for i, (my_view, best_match_view) in enumerate(zip(my_vector, best_vector)):
    if my_view == 0 and best_match_view == 4:
        recommend.append(i)
len(recommend), recommend[:10]

(51, [275, 285, 316, 317, 381, 386, 426, 427, 460, 461])