# 🎞 간단한 영화 평점 예측(추천) 시스템

- 영화 평점 예측(추천) 시스템
- 데이터 : MovieLens
- 알고리즘
    1. 랜덤으로 평점 예측하기
    2. 영화 평균 평점기반 예측하기
    3. 사용자 평균 평점기반 예측하기
    4. Rule 기반 영화 평점 예측하기
- 평가 : RMSE

----

# 1. 라이브러리 호출 및 데이터 읽기

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the MovieLens CSV files (relative to the working directory).
path = './data/movielens'

# One row per rating event.
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
# Movie metadata, indexed by movieId so a movie can be fetched with .loc[movieId].
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
# Tag data; loaded here but not referenced in the visible part of this notebook.
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [3]:
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# 2. EDA

### rating 데이터 정보 확인하기
- 몇 명의 유저가 몇 개의 영화에 평점을 줬는지

In [4]:
# NOTE: despite the num_ prefix, both names hold the arrays of distinct ids;
# the actual counts are taken from the array sizes when printing.
num_users, num_movies = (
    ratings_df[column].unique() for column in ('userId', 'movieId')
)

print("총 유저 수: ", num_users.size)
print("총 영화 수: ", num_movies.size)

총 유저 수:  610
총 영화 수:  9724


- 각 유저가 어떤 평점을 줬는지 sparse matrix

In [5]:
# Wide rating table: one row per movieId, one column per userId.
# NOTE: despite the variable name, rows are movies and columns are users.
user_movie_matrix = ratings_df.pivot(index='movieId', columns='userId', values='rating')
# (movie, user) pairs without a rating come out of the pivot as NaN; store them as 0.
user_movie_matrix = user_movie_matrix.fillna(0)

In [6]:
user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# convert dataframe of movie features to scipy sparse matrix => csr_matrix
# Rows and columns keep the order of user_movie_matrix (rows = movies,
# columns = users); the 0 fill values are not stored as entries.
sparse_mat = csr_matrix(user_movie_matrix.values)

In [8]:
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


### 해석
- 0번 행(첫 번째 영화, movieId 1)은 많은 유저에게 평가 받음
- 9723번 행(마지막 영화)은 한 명의 유저에게만 평가 받음

### 데이터 프레임으로 확인

In [9]:
# Number of movies each user has rated.
# A cell equal to 0 means "no rating" (the pivot was filled with 0), so the
# non-zero cells of every user column are counted directly. This does not rely
# on 0 being the most frequent value in the column, which dropping the first
# entry of value_counts() silently assumed.
user_info_df = pd.DataFrame({'movies_rated': (user_movie_matrix != 0).sum(axis=0)})

In [10]:
user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [11]:
# Number of users who rated each movie.
# A cell equal to 0 means "no rating" (the pivot was filled with 0), so the
# non-zero cells of every movie row are counted directly. This does not rely
# on 0 being the most frequent value in the row, which dropping the first
# entry of value_counts() silently assumed.
movie_info_df = pd.DataFrame({'users_rated': (user_movie_matrix != 0).sum(axis=1)})

In [12]:
movie_info_df

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


### 영화마다 평가 개수가 크게 다름 -> 데이터가 편향(bias)되어 있음을 알 수 있음

# 3. 데이터 셋 나누기

In [13]:
# Hold out 20% of the ratings for evaluation; random_state fixes the split.
# Explicit copies make both frames independent objects, so the prediction
# columns added to test_df later do not trigger (suppressed)
# SettingWithCopyWarning ambiguity.
train_df, test_df = (
    part.copy()
    for part in train_test_split(ratings_df, test_size=0.2, random_state=42)
)

In [14]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


# 4. 추천 알고리즘 만들기

### 1) 랜덤으로 평점 예측하기

In [15]:
# Candidate rating values used for random predictions: 0.5, 1.0, ..., 5.0.
# linspace states the endpoint and the number of values explicitly, which
# avoids the length ambiguity np.arange can have with a non-integer step.
ratings_range = np.linspace(0.5, 5.0, num=10)

In [16]:
import random

# Baseline 1: draw one rating value at random for every row of the test set.
# The generator is not seeded, so the values differ from run to run.
pred_random = [random.choice(ratings_range) for _ in range(len(test_df))]
pred_random[:10]

[4.0, 1.5, 3.5, 4.5, 3.0, 2.5, 4.5, 3.0, 1.5, 4.5]

In [17]:
test_df['pred_ratings_random'] = pred_random

In [18]:
# Score the random baseline: mean squared error against the true ratings,
# then its square root (RMSE).
mse = mean_squared_error(y_true = test_df['rating'].values, y_pred = test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse, rmse) # e.g. for a true rating of 3, an error of this size means predictions around 1.1 ~ 4.9

3.6643692978976596 1.9142542406633607


### 2) 영화 평균 평점기반 예측하기

1. train set의 모든 영화에 대해서 평균 평점 구하기
2. test set 예측할 때, train set의 영화 평균 평점 활용하기. 만약 없다면, random으로 선택하기

In [19]:
# Per-movie column means over the training ratings, indexed by movieId.
# NOTE: .mean() is applied to every column, so userId and timestamp are
# averaged as well; only the 'rating' column is meaningful as a prediction.
train_movie_df = train_df.groupby('movieId').mean()

train_movie_df.head()

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,317.741379,3.893678,1142680000.0
2,329.538462,3.373626,1145753000.0
3,284.9,3.1625,1005732000.0
4,242.5,2.25,905213300.0
5,329.205882,2.955882,1006175000.0


In [20]:
def avg_rating_prediction(training_set, x):
    """Predict a rating from a table of per-key average ratings.

    Args:
        training_set: DataFrame whose index holds the lookup keys (the
            notebook passes frames indexed by movieId or userId) and which
            has a 'rating' column.
        x: Key to look up in ``training_set.index``.

    Returns:
        The 'rating' value stored for ``x`` when the key is in the index;
        otherwise a value picked at random from ``ratings_range``.
    """
    if x not in training_set.index:
        # Key never seen in the training data: fall back to a random rating.
        return random.choice(ratings_range)
    # One label-based scalar lookup. The chained form .loc[x]['rating'] first
    # materialises the whole row as a Series (upcasting mixed dtypes) on
    # every call, which is wasteful when applied to each test row.
    return training_set.loc[x, 'rating']

In [21]:
# Baseline 2: predict every test rating with the mean training rating of its
# movie; avg_rating_prediction picks a random value for movies not in the
# training table.
test_df['pred_rating_movie'] = test_df['movieId'].apply(
    lambda movie_id: avg_rating_prediction(train_movie_df, movie_id)
)

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie
67037,432,77866,4.5,1335139641,4.0,2.9
42175,288,474,3.0,978465565,1.5,3.754386
93850,599,4351,3.0,1498524542,3.5,3.25
6187,42,2987,4.0,996262677,4.5,3.578947
12229,75,1610,4.0,1158989841,3.0,3.80137


In [22]:
# Score the movie-average baseline: MSE against the true ratings, then RMSE.
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_movie'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.0545155311733276 1.0268960663929567


### 3) 사용자 평균 평점기반 예측하기
1. train set의 모든 유저가 준 평균 평점
2. test set 예측할 때, 유저가 train set에서 준 평균 평점을 활용. 유저가 없을 경우 random 평점 적용

In [23]:
# Per-user column means over the training ratings, indexed by userId.
# NOTE: .mean() is applied to every column, so movieId and timestamp are
# averaged as well; only the 'rating' column is meaningful as a prediction.
train_user_df = train_df.groupby('userId').mean()

train_user_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1810.336788,4.331606,964982400.0
2,67608.08,3.92,1445715000.0
3,8325.516129,2.580645,1306464000.0
4,1899.694118,3.464706,965721200.0
5,366.842105,3.657895,847435100.0


In [24]:
# Baseline 3: predict every test rating with the mean training rating of its
# user; avg_rating_prediction picks a random value for users not in the
# training table.
test_df['pred_rating_user'] = test_df['userId'].apply(
    lambda user_id: avg_rating_prediction(train_user_df, user_id)
)

In [25]:
# Score the user-average baseline: MSE against the true ratings, then RMSE.
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_user'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.9020188826241128 0.9497467465720074


### 4) Rule 기반 영화 평점 예측하기
1. train set에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용하여, 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측

In [26]:
# Wide rating table built from the training split only:
# one row per movieId, one column per userId.
train_user_movie_matrix = train_df.pivot(index='movieId', columns='userId', values='rating')
# Missing (movie, user) pairs come out of the pivot as NaN; store them as 0.
train_user_movie_matrix = train_user_movie_matrix.fillna(0)

In [27]:
# One-hot encode the '|'-separated genre string of every movie:
# one 0/1 column per genre, indexed by movieId like movies_df.
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)
# Keep only the movies that occur in the training ratings.
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

(9742, 20)
(8983, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7347,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
71462,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2115,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1127,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2409,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Mean training rating per movie (a Series indexed by movieId).
# Zeros in the pivoted matrix mean "not rated", so they are turned into NaN
# and therefore skipped by mean(). np.nan is used because the np.NaN alias was
# removed in NumPy 2.0; replace() already returns a new object, so no explicit
# copy of the matrix is needed.
train_movie_avg_ratings_df = train_user_movie_matrix.replace(0, np.nan).mean(axis=1)

train_movie_avg_ratings_df.head()

movieId
1    3.893678
2    3.373626
3    3.162500
4    2.250000
5    2.955882
dtype: float64

In [29]:
# Average rating per genre: the mean of the per-movie average ratings over all
# training movies flagged with that genre.
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
    # movieIds of the training movies that carry this genre flag.
    genre_movie_ids = genres_df[genres_df[genre] == 1].index
    genre_avg_rating = train_movie_avg_ratings_df.loc[genre_movie_ids].mean()
    # Write with a single .loc[row, col] call. The chained form
    # .loc[genre]['avg_ratings'] = ... assigns into an intermediate object and
    # is not guaranteed to update the frame (it never does under pandas
    # Copy-on-Write).
    genres_avg_ratings_df.loc[genre, 'avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.35679
Action,3.121847
Adventure,3.205439
Animation,3.477723
Children,3.115097
Comedy,3.170161
Crime,3.313487
Documentary,3.777727
Drama,3.426015
Fantasy,3.220115


In [30]:
def get_genre_avg_ratings(x):
    """Return the mean of the per-genre average ratings for movie ``x``.

    The movie's '|'-separated genre string is read from ``movies_df`` and each
    genre's value is read from the 'avg_ratings' column of
    ``genres_avg_ratings_df``; the result is their arithmetic mean.

    Args:
        x: movieId, used as a label into ``movies_df``.

    Returns:
        Sum of the genre averages divided by the number of genres.
    """
    movie_genres = movies_df.loc[x, 'genres'].split('|')
    total = 0
    for name in movie_genres:
        total = total + genres_avg_ratings_df.loc[name, 'avg_ratings']
    return total / len(movie_genres)

In [31]:
# Register tqdm's progress_apply on pandas objects, then predict each test
# rating from the genre averages of its movie (with a progress bar).
tqdm.pandas()
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(get_genre_avg_ratings)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 20168/20168 [00:11<00:00, 1775.50it/s]


In [32]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre
67037,432,77866,4.5,1335139641,4.0,2.900000,3.628571,3.335160
42175,288,474,3.0,978465565,1.5,3.754386,3.131737,3.142915
93850,599,4351,3.0,1498524542,3.5,3.250000,2.651029,3.199772
6187,42,2987,4.0,996262677,4.5,3.578947,3.541547,3.262930
12229,75,1610,4.0,1158989841,3.0,3.801370,3.322034,3.163756
...,...,...,...,...,...,...,...,...
57416,380,5048,2.0,1494268065,2.0,2.875000,3.675358,3.163565
67290,434,54272,3.5,1270606860,3.5,3.592105,3.738220,3.323942
33423,226,5989,4.5,1162428551,3.5,3.948454,3.474874,3.369751
98552,607,1320,3.0,963080497,1.0,3.183333,3.744828,3.085319


In [33]:
# Score the genre-average predictor: MSE against the true ratings, then RMSE.
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.1333817433607896 1.0646040312533058


2. user의 평균 영화 평점을 normalize해서 확인하기, 평점 측정 수, 표준편차 등 활용가능 

In [34]:
# Per-user statistics of the training ratings: mean, standard deviation and
# number of ratings, indexed by userId.
ratings_by_user = train_df.groupby('userId')['rating']
train_user_info_df = pd.DataFrame({
    'avg_ratings': ratings_by_user.mean(),
    'std_ratings': ratings_by_user.std(),
    'count_ratings': ratings_by_user.count(),
})

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.331606,0.812587,193
2,3.920000,0.837655,25
3,2.580645,2.125625,31
4,3.464706,1.372356,170
5,3.657895,1.046908,38
...,...,...,...
606,3.657002,0.706503,914
607,3.744828,0.977378,145
608,3.117820,1.081198,679
609,3.290323,0.461414,31


In [35]:
# Range and mean of the per-user rating counts.
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

# Mean-centred count divided by the count range, computed for the whole
# column at once.
train_user_info_df['weights'] = (
    (train_user_info_df['count_ratings'] - avg_count) / (max_count - min_count)
)

In [36]:
from sklearn import preprocessing

# Rescale every column of train_user_info_df independently with a min-max
# scaler and wrap the result back into a DataFrame with the same labels.
# 'weights' is a linear function of 'count_ratings', so after min-max scaling
# the two columns carry identical values.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(np_scaled, columns = train_user_info_df.columns, index=train_user_info_df.index)
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.817711,0.382282,0.085349,0.085349
2,0.705455,0.394075,0.005690,0.005690
3,0.340176,1.000000,0.008535,0.008535
4,0.581283,0.645625,0.074443,0.074443
5,0.633971,0.492518,0.011854,0.011854
...,...,...,...,...
606,0.633728,0.332374,0.427217,0.427217
607,0.657680,0.459808,0.062589,0.062589
608,0.486678,0.508649,0.315789,0.315789
609,0.533724,0.217072,0.008535,0.008535


In [37]:
# Multiply the min-max scaled per-user mean by 5.
# NOTE(review): the result is the user's relative position between the lowest
# and highest user mean, times 5 - not the user's own mean rating. Confirm
# this is the intended predictor.
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,normalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.817711,0.382282,0.085349,0.085349,4.088554
2,0.705455,0.394075,0.005690,0.005690,3.527273
3,0.340176,1.000000,0.008535,0.008535,1.700880
4,0.581283,0.645625,0.074443,0.074443,2.906417
5,0.633971,0.492518,0.011854,0.011854,3.169856
...,...,...,...,...,...
606,0.633728,0.332374,0.427217,0.427217,3.168639
607,0.657680,0.459808,0.062589,0.062589,3.288401
608,0.486678,0.508649,0.315789,0.315789,2.433391
609,0.533724,0.217072,0.008535,0.008535,2.668622


In [38]:
# Predict every test rating with the user's scaled mean rating.
# The label lookup raises KeyError for a userId that is missing from
# df_normalized (there is no random fallback here).
test_df['pred_rating_normalized'] = test_df['userId'].apply(
    lambda user_id: df_normalized.loc[user_id, 'normalized_avg_ratings']
)
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre,pred_rating_normalized
67037,432,77866,4.5,1335139641,4.0,2.900000,3.628571,3.335160,3.129870
42175,288,474,3.0,978465565,1.5,3.754386,3.131737,3.142915,2.452368
93850,599,4351,3.0,1498524542,3.5,3.250000,2.651029,3.199772,1.796857
6187,42,2987,4.0,996262677,4.5,3.578947,3.541547,3.262930,3.011201
12229,75,1610,4.0,1158989841,3.0,3.801370,3.322034,3.163756,2.711864
...,...,...,...,...,...,...,...,...,...
57416,380,5048,2.0,1494268065,2.0,2.875000,3.675358,3.163565,3.193670
67290,434,54272,3.5,1270606860,3.5,3.592105,3.738220,3.323942,3.279391
33423,226,5989,4.5,1162428551,3.5,3.948454,3.474874,3.369751,2.920283
98552,607,1320,3.0,963080497,1.0,3.183333,3.744828,3.085319,3.288401


In [39]:
# Score the normalized user-average predictor: MSE against the true ratings, then RMSE.
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_normalized'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.2273721504015616 1.1078682910894966


------

# 후기

- 추천 시스템의 기본에 대해 알 수 있었다.
- 판다스의 여러 가지 기능에 대해 더 공부해봐야겠다.
- 얼른 다른 알고리즘들을 적용해보고 싶다는 생각이 들었다.