# Collaborative filtering

# 데이터 읽어오기

In [1]:
import pandas as pd
import numpy as np
movies = pd.read_csv("./dataset/tmdb_5000_movies.csv")         # tmdb_5000_movies.csv dataframe으로 읽어오기
movies = movies[["id","genres","vote_average","vote_count","popularity","original_title", "keywords","overview"]]
movies.drop(["genres","vote_average","vote_count","popularity", "keywords","overview"],axis=1,inplace=True)

print(movies.shape)
movies.head()

(4803, 2)


Unnamed: 0,id,original_title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [2]:
# 컬럼명 변경
movies.columns = ['tmdbId', 'original_title']
movies.head()

Unnamed: 0,tmdbId,original_title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [3]:
# links.csv dataframe으로 읽어오기
links = pd.read_csv("./dataset/links_small.csv")
links = links[["movieId","imdbId","tmdbId"]]
links.drop(['imdbId'],axis=1,inplace=True)

print(links.shape)
links.head()

(9125, 2)


Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [4]:
# ratings.csv dataframe으로 읽어오기
ratings = pd.read_csv("./dataset/ratings_small.csv")   
ratings = ratings[["userId","movieId","rating","timestamp"]]
ratings.drop(["timestamp"],axis=1,inplace=True)

print(ratings.shape)
ratings.head()

(100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


## 결측값 확인


In [5]:
# null 값 개수 확인
movies.isnull().sum()

tmdbId            0
original_title    0
dtype: int64

In [6]:
# null 값 개수 확인
links.isnull().sum()

movieId     0
tmdbId     13
dtype: int64

In [7]:
# 결측값 있는 로우 제거
links = links.dropna(axis=0)

# 타입변환
links["tmdbId"] = links["tmdbId"].astype("int")

links.head()

Unnamed: 0,movieId,tmdbId
0,1,862
1,2,8844
2,3,15602
3,4,31357
4,5,11862


In [8]:
# null 값 개수 확인
ratings.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

## 데이터 프레임끼리 inner join

In [9]:
join_movie = pd.merge(ratings, links, left_on='movieId', right_on='movieId', how='inner')
print(join_movie)

       userId  movieId  rating  tmdbId
0           1       31     2.5    9909
1           7       31     3.0    9909
2          31       31     4.0    9909
3          32       31     4.0    9909
4          36       31     3.0    9909
...       ...      ...     ...     ...
99928     664    64997     2.5   34812
99929     664    72380     3.5   22825
99930     665      129     3.0  110972
99931     665     4736     1.0   26602
99932     668     6425     1.0   36807

[99933 rows x 4 columns]


In [10]:
join_movie = pd.merge(join_movie, movies, left_on='tmdbId', right_on='tmdbId', how='inner')
print(join_movie)

       userId  movieId  rating  tmdbId  original_title
0           1     1061     3.0     819        Sleepers
1          19     1061     3.0     819        Sleepers
2          23     1061     3.5     819        Sleepers
3          30     1061     3.0     819        Sleepers
4          70     1061     5.0     819        Sleepers
...       ...      ...     ...     ...             ...
66942     663   134528     3.5  222936           Aloha
66943     663   137595     3.0  264999  Magic Mike XXL
66944     664    60832     3.0   12192       Pathology
66945     664    72380     3.5   22825         The Box
66946     665     4736     1.0   26602    Summer Catch

[66947 rows x 5 columns]


In [11]:
join_movie["original_title"].value_counts()

Forrest Gump                 341
Pulp Fiction                 324
The Shawshank Redemption     311
The Silence of the Lambs     304
Star Wars                    291
                            ... 
The Boy                        1
디워                             1
The Alamo                      1
A Thousand Words               1
Life or Something Like It      1
Name: original_title, Length: 3393, dtype: int64

In [12]:
# # movies에 있던 모든 영화가 join이 잘 됐는지 확인 -> 여긴 발표할 때는 빼도 될 듯..
# after = join_movie["tmdbId"].unique()
# print(len(after))
# print(movies.shape)

# latent factor 알고리즘

### 피벗테이블 생성

In [13]:
user_movie_rating = join_movie.pivot_table('rating', index = 'userId', columns='tmdbId').fillna(0)
print(user_movie_rating.shape)
user_movie_rating.head()

(671, 3394)


tmdbId,5,11,12,13,14,16,18,19,20,22,...,325173,328111,328425,329833,332411,332567,333371,334074,342521,347969
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,3.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 원본 행렬에서 값이 0이 아닌 부분 인덱스 찾기

In [14]:
data = user_movie_rating.values

nonZeroX=[]
nonZeroY=[]

for i in range (0, data.shape[0]):
  for j in range(0, data.shape[1]):
    # print(mat[i][j])
    if (data[i][j] != 0):
      nonZeroX.append(i)
      nonZeroY.append(j)

### 원본 행렬에서 값이 0이 아닌 부분 값들 찾기
* 이후에 오차 계산할 때 사용

In [15]:
nonZeroData = []
for i,j in zip (nonZeroX, nonZeroY):
  nonZeroData.append(data[i][j])

In [16]:
# 개수 확인
print(len(nonZeroData))

66947


## 학습

* SGD (Stochastic Gradient Descent)
 - http://ntucsu.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf.pdf
 - https://velog.io/@vvakki_/Matrix-Factorization-2
 - https://soobarkbar.tistory.com/105
 - 구현이 쉽고, 비교적 빠른 실행 시간

* RMSE (Root Mean Square Error)
  - 오차: 실제 값과 예측 값의 차이
  - 오차들의 제곱을 모두 더한 뒤 평균내고 루트 씌움

In [17]:
from sklearn.metrics import mean_squared_error

# 분해할 행렬의 shape
num_users, num_items = user_movie_rating.shape
K = 150

# 랜덤 값으로 채움
np.random.seed(1)
P = np.random.normal(scale=1.0/K, size=(num_users, K))
Q = np.random.normal(scale=1.0/K, size=(num_items, K))

# 과적합 되지 않게 보정
r_lambda = 0.03

# 학습률
lr = 0.02

# 반복 횟수
step = 100



for epoch in range(1, step+1):
  for i,j in zip (nonZeroX, nonZeroY):
    # 결과 행렬의 i, j 항은 P행렬의 i행과 Q행렬의 j행렬의 행렬곱 값이다.
    # 에러 구함
    error = data[i][j] - np.dot(P[i, :], Q[j, :].T)
    
    # 구한 에러를 바탕으로 SGD를 이용해 P, Q 행렬 보정
    P[i, :] = P[i, :] + lr*(error * Q[j, :] - r_lambda*P[i, :])
    Q[j, :] = Q[j, :] + lr*(error * P[i, :] - r_lambda*Q[j, :])


  # 보정한 행렬을 바탕으로 예측
  prediction = np.dot(P, Q.T)
  errorList = []
  # 원본 행렬에서 0이 아닌 값이 있는 위치에 있는 값들을 가져옴
  for i,j in zip (nonZeroX, nonZeroY):
    errorList.append(prediction[i][j])

  if (epoch) % 10 == 0:
      # 오차 계산
      rmse = mean_squared_error(nonZeroData, errorList, squared=False)
      print("epoch: {}, rmse: {:.6f}" .format(epoch, rmse))


epoch: 10, rmse: 0.741029
epoch: 20, rmse: 0.440402
epoch: 30, rmse: 0.302979
epoch: 40, rmse: 0.256121
epoch: 50, rmse: 0.238608
epoch: 60, rmse: 0.230862
epoch: 70, rmse: 0.226846
epoch: 80, rmse: 0.224479
epoch: 90, rmse: 0.222942
epoch: 100, rmse: 0.221869


## 예측한 행렬 확인

In [18]:
# 예측한 행렬 보이기
pred_matrix = np.dot(P, Q.T)
pred_matrix_df = pd.DataFrame(pred_matrix, columns = user_movie_rating.columns, index = user_movie_rating.index)
print(pred_matrix_df.shape)
pred_matrix_df.head()

(671, 3394)


tmdbId,5,11,12,13,14,16,18,19,20,22,...,325173,328111,328425,329833,332411,332567,333371,334074,342521,347969
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.018751,2.027506,2.435144,2.871769,2.817492,1.930649,2.823132,2.634303,2.821766,2.666339,...,1.678784,2.044131,1.886671,1.571274,0.835638,1.617979,2.811076,1.742602,0.44204,1.273466
2,2.427161,3.817115,3.434385,3.138006,4.241183,3.600734,3.015552,3.931935,3.837951,3.102039,...,2.171388,3.197801,2.751044,2.007929,1.082741,1.914578,3.754777,2.405215,0.5311,1.936493
3,4.023255,3.817229,3.136991,4.789111,3.940762,3.634627,3.656024,2.934828,3.645171,3.174454,...,2.281968,3.247178,2.450493,2.149517,1.13941,1.768384,4.150747,2.459349,0.476006,1.980283
4,3.864301,5.227648,4.769728,4.83727,4.691342,4.66492,4.738351,4.802329,4.629582,4.70306,...,3.0806,3.54918,3.211044,2.582616,1.532466,2.590606,4.941381,2.994904,0.737824,2.199557
5,3.409294,3.917637,4.113348,4.193443,3.681283,3.373367,3.571902,3.090779,3.836153,3.808318,...,2.407582,3.757537,2.770761,2.363732,1.200014,1.88403,4.277915,2.610122,0.493224,2.216691


#### 0.5점 단위로 끊어서 확인

In [19]:
# 예측 행렬을 원본 점수처럼 0.5점 단위로 끊어서 표시하기

thres = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


for i in range(0, pred_matrix.shape[0]):
  for j in range(0, pred_matrix.shape[1]):
    for k in thres:
      if pred_matrix[i][j] < 0:
        pred_matrix[i][j] = 0.0
        break
      elif pred_matrix[i][j] > 5.0:
        pred_matrix[i][j] = 5.0
        break      
      elif (pred_matrix[i][j]>= k-0.25) and (pred_matrix[i][j]< k+0.25):
        pred_matrix[i][j] = k
        break


pred_matrix_df_05 = pd.DataFrame(pred_matrix, columns = user_movie_rating.columns, index = user_movie_rating.index)
print(pred_matrix_df_05.shape)
pred_matrix_df_05.head()       


(671, 3394)


tmdbId,5,11,12,13,14,16,18,19,20,22,...,325173,328111,328425,329833,332411,332567,333371,334074,342521,347969
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,2.5,3.0,3.0,2.0,3.0,2.5,3.0,2.5,...,1.5,2.0,2.0,1.5,1.0,1.5,3.0,1.5,0.5,1.5
2,2.5,4.0,3.5,3.0,4.0,3.5,3.0,4.0,4.0,3.0,...,2.0,3.0,3.0,2.0,1.0,2.0,4.0,2.5,0.5,2.0
3,4.0,4.0,3.0,5.0,4.0,3.5,3.5,3.0,3.5,3.0,...,2.5,3.0,2.5,2.0,1.0,2.0,4.0,2.5,0.5,2.0
4,4.0,5.0,5.0,5.0,4.5,4.5,4.5,5.0,4.5,4.5,...,3.0,3.5,3.0,2.5,1.5,2.5,5.0,3.0,0.5,2.0
5,3.5,4.0,4.0,4.0,3.5,3.5,3.5,3.0,4.0,4.0,...,2.5,4.0,3.0,2.5,1.0,2.0,4.5,2.5,0.5,2.0


# Content-based filtering

## 1. CSV 파일 불러와 DataFrame으로 저장

In [20]:
import pandas as pd
content_movies = pd.read_csv("./dataset/tmdb_5000_movies.csv")         # tmdb_5000_movies.csv dataframe으로 읽어오기
content_movies = content_movies[['id',"original_title", "overview"]]
content_movies.columns=['tmdbId','original_title','overview']
content_movies["overview"] = content_movies["overview"].astype("str")
content_movies

Unnamed: 0,tmdbId,original_title,overview
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...
4798,9367,El Mariachi,El Mariachi just wants to play his guitar and ...
4799,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...
4800,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
4801,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...


"overview" column 모두 소문자로, 문자+숫자(\w)만 남기고 나머지는 띄어쓰기로 대체\
https://wikidocs.net/21703 참고

In [21]:
import re
content_movies['overview'].apply(lambda x:re.sub('\W',' ',x.lower()))

0       in the 22nd century  a paraplegic marine is di...
1       captain barbossa  long believed to be dead  ha...
2       a cryptic message from bond s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war weary  former military ca...
                              ...                        
4798    el mariachi just wants to play his guitar and ...
4799    a newlywed couple s honeymoon is upended by th...
4800     signed  sealed  delivered  introduces a dedic...
4801    when ambitious new york attorney sam is sent t...
4802    ever since the second grade when he first saw ...
Name: overview, Length: 4803, dtype: object

TF-IDF 사용   
=>텍스트를 수치화한다 by 특징 추출   
가장 기본은 countVectorizer. 하지만 조사, 관사 등 **의미 없는 단어에 높은 수치를 부여** 할 위험이 있음   
https://chan-lab.tistory.com/24?category=810217 참고

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 단어 묶음을 1-2개로 설정
# ex) go home, very nice 등과 같은 단어도 인덱스로 받는다
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = tfidf_vec.fit_transform(content_movies['overview'])# 단어 학습

## cosine similarity

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
print("tfidf_matrix shape: ",tfidf_matrix.shape) # (데이터 개수, unique한 단어 개수)
plot_similarity = cosine_similarity(tfidf_matrix,tfidf_matrix) # 줄거리 간 cosine 유사도 구하기 - tfidf_matrix 사용
print("### COSINE Similarity ###")
print(plot_similarity)
similar_index = np.argsort(-plot_similarity)  # 유사도 높은 순서대로 index 정렬
print("### 유사도 기준 index 정렬 ###") 
print(similar_index)

tfidf_matrix shape:  (4803, 154844)
### COSINE Similarity ###
[[1.         0.01514413 0.00614504 ... 0.01195829 0.00572386 0.006304  ]
 [0.01514413 1.         0.01308527 ... 0.0176922  0.00997908 0.00666831]
 [0.00614504 0.01308527 1.         ... 0.01289    0.00565554 0.00612954]
 ...
 [0.01195829 0.0176922  0.01289    ... 1.         0.01532978 0.00900306]
 [0.00572386 0.00997908 0.00565554 ... 0.01532978 1.         0.01649947]
 [0.006304   0.00666831 0.00612954 ... 0.00900306 0.01649947 1.        ]]
### 유사도 기준 index 정렬 ###
[[   0 3604  634 ... 4140 2596 2669]
 [   1 2379 2542 ...  161 2656 4458]
 [   2 1343 3162 ... 4144 4148 4180]
 ...
 [4800 4034  569 ... 2853 4140 1038]
 [4801 2017 1480 ... 4140 4458 2108]
 [4802 2586  868 ... 3988 4513 3152]]


In [26]:
def recommend_movies(user_id,user_movie):
    
    ### contents-based filtering
    
    # input_movie에 해당하는 index 값 가져오기
    movie_index = content_movies[content_movies['original_title']==user_movie].index.values    
    similar_movies = similar_index[movie_index, :]  # 유사도 상위 10개 index 가져오기
    # 인덱스로 사용하기 위해서는 1차원으로 변형
    similar_movies_index = similar_movies.reshape(-1,)             # similar_movies 1차원 변형
    contents_based_movies=content_movies.iloc[similar_movies_index]
    
    
    ### collaborative filtering
    
    user_id=user_id-1
    # collaborative filtering 수행 결과를 평점 높은 순으로 정렬
    collaborative_filtering_movies=pred_matrix_df_05.iloc[user_id].sort_values(ascending=False)

    # 사용자 데이터 추출
    user_data=join_movie[join_movie.userId==user_id]
    # 사용자가 이미 본 영화는 삭제
    recommendations=movies[-movies['tmdbId'].isin(user_data['movieId'])]
    recommendations.columns=['tmdbId','original_title']
    # 데이터 합치기
    recommendations=recommendations.merge(pd.DataFrame(collaborative_filtering_movies).reset_index(),on='tmdbId')
    
    recommendations.columns=['tmdbId','original_title','Predictions']
    recommendations.drop(['original_title'],axis='columns',inplace=True)
    
    # contents-based + collaborative filtering 결과 합치기
    final_recommendations = pd.merge(recommendations, contents_based_movies, left_on='tmdbId', right_on='tmdbId', how='inner')
    
    final_recommendations["Predictions"] = final_recommendations["Predictions"].astype("int")
    final_recommendations=final_recommendations.sort_values(by=['Predictions'],axis=0,ascending=False)
    final_recommendations=final_recommendations.head(10)

    return final_recommendations

In [27]:
recommend_movies(330,'The Sound of Music')

Unnamed: 0,tmdbId,Predictions,original_title,overview
2910,11386,5,The Crying Game,Irish Republican Army member Fergus (Stephen R...
1679,4518,5,Elizabeth,The story of the ascension to the throne and t...
3277,14275,5,Hoop Dreams,This documentary follows two inner-city Chicag...
1629,424,5,Schindler's List,The true story of how businessman Oskar Schind...
2611,15121,5,The Sound of Music,Film adaptation of a classic Rodgers and Hamme...
2562,31174,5,Richard III,Shakespeare's Play transplanted into a 1930s s...
673,11128,4,Ladder 49,"Under the watchful eye of his mentor, Captain ..."
1364,11978,4,Men of Honor,Against formidable odds -- and an old-school d...
1320,18320,4,The Young Victoria,From Academy Award® winners Graham King and Ma...
2682,8390,4,"Definitely, Maybe",When Will decides to tell his daughter the sto...
