In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [12]:
# 데이터 불러오기
base_src = '../data'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src,
                    sep = "|",
                    names = u_cols,
                    encoding = 'latin-1')
users = users.set_index('user_id')

i_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title', 'release date', 'video release date',
          'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation',
          'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv(i_item_src,
                    sep = "|",
                    names = i_cols,
                    encoding = 'latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src,
                    sep = "\t",
                    names = r_cols,
                    encoding = 'latin-1')

# ratings DataFrame에서 timestamp 제거
ratings = ratings.drop('timestamp', axis = 1)

# 정확도 (RMSE)를 계산하는 함수
def RMSE(y_ture, y_pred):
    return np.sqrt(np.mean((np.array(y_ture) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# 데이터 셋 만들기
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.25,
                                                    stratify = y) # 계층화 추출
ratings_matrix = x_train.pivot(index = "user_id", columns = "movie_id", values = "rating")



In [13]:
# 코사인 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index = ratings_matrix.index,
                               columns = ratings_matrix.index)

In [14]:
user_similarity.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.140663,0.039134,0.064815,0.28162,0.29078,0.307716,0.2821,0.040946,0.25499,...,0.273897,0.064947,0.215023,0.115342,0.146232,0.085477,0.254637,0.035533,0.138319,0.317781
2,0.140663,1.0,0.117788,0.196684,0.054211,0.196596,0.060608,0.110063,0.101472,0.067412,...,0.152855,0.204764,0.276731,0.476402,0.318135,0.174263,0.183083,0.071884,0.121517,0.114893
3,0.039134,0.117788,1.0,0.319002,0.0,0.082856,0.045072,0.054732,0.083413,0.080111,...,0.023901,0.0,0.073228,0.069932,0.089505,0.0,0.104696,0.051704,0.158771,0.0
4,0.064815,0.196684,0.319002,1.0,0.0,0.049645,0.063456,0.197152,0.133871,0.036842,...,0.053703,0.049026,0.079774,0.265372,0.166956,0.0,0.210735,0.206531,0.16816,0.045358
5,0.28162,0.054211,0.0,0.0,1.0,0.179655,0.245795,0.150137,0.031561,0.104972,...,0.234222,0.054612,0.081105,0.028153,0.135078,0.053117,0.159443,0.070183,0.101943,0.249954


In [15]:
# 주어진 영화(movie_id)의 가중 평균 rating을 계산하는 함수
def CF_simple(user_id, movie_id): # user_id와 movie_id를 이용해서
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy() # 유사도를 얻어내고
        movie_ratings = ratings_matrix[movie_id].copy() # 다른 사람들의 평점 정보를 얻어
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum() # 가중평균을 한다.
    else:
        mean_rating = 3.0
    return mean_rating

In [16]:
# 정확도 계산
score(CF_simple)

1.0152192405149216

### 3.4 이웃을 고려한 CF

In [17]:
# score 함수 수정 : 유사집단의 크기를 미리 정하기 위해 기존 score 함수에 neighbor_size 인자값 추가
def score(model, neighbor_size = 0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [22]:
def CF_knn(user_id, movie_id, neighbor_size = 0):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0: # simple CF
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0

    return mean_rating

# 정확도 계산
score(CF_knn, neighbor_size=30)

1.0066534267432241

In [23]:
# 실제 주어진 사용자에 대해 추천을 받는 기능 구현 (전체 데이터 사용)
ratings_matrix = ratings.pivot(index = "user_id", columns = "movie_id", values = "rating")
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index = ratings_matrix.index,
                               columns = ratings_matrix.index)

def recom_movie(user_id, n_items, neighbor_size = 30):
    user_movie = ratings_matrix.loc[user_id].copy()

    for movie in ratings_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0

        else:
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size)
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id = 729, n_items = 5, neighbor_size = 30)

movie_id
1189                      Prefontaine (1997)
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1500               Santa with Muscles (1996)
22                         Braveheart (1995)
Name: title, dtype: object

### 3.5 최적의 이웃 크기 결정

In [24]:
# neighbor size를 10,20,..., 60인 경우에 대해 RMSE를 계산하고 이를 출력
for neighbor_size in [10,20,30,40,50,60]:
    print('Neighbor size = %d : RMSE = %.4f'%(neighbor_size, score(CF_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 0.8064
Neighbor size = 20 : RMSE = 0.8738
Neighbor size = 30 : RMSE = 0.8999
Neighbor size = 40 : RMSE = 0.9138
Neighbor size = 50 : RMSE = 0.9226
Neighbor size = 60 : RMSE = 0.9288


### 3.6 사용자의 평가경향을 고려한 CF

In [25]:
# train, test set 분할
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size = 0.25,
                                                    stratify = y) # 계층화 추출
ratings_matrix = x_train.pivot(index = "user_id", columns = "movie_id", values = "rating")

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,
                               index = ratings_matrix.index,
                               columns = ratings_matrix.index)

In [27]:
# 사용자의 평가경향을 고려한 함수
rating_mean = ratings_matrix.mean(axis =1)
rating_bias = (ratings_matrix.T - rating_mean).T

def CF_knn_bias(user_id, movie_id, neighbor_size = 0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]

        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]

    return prediction

score(CF_knn_bias, 30)

0.9425196971784293

### 3.7 그 외의 CF 정확도 개선 방법

In [29]:
# 신뢰도를 반영한 추천
rating_binary_1 = np.array(ratings_matrix > 0).astype(float)
rating_binary_2 = rating_binary_1.T

counts = np.dot(rating_binary_1, rating_binary_2)
counts = pd.DataFrame(counts, index = ratings_matrix.index, columns = ratings_matrix.index).fillna(0)
print(counts.shape)

(943, 943)


In [30]:
counts.head() # user들 간의 공통으로 평가한 아이템의 개수

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,204.0,12.0,4.0,5.0,44.0,47.0,75.0,18.0,4.0,41.0,...,40.0,7.0,36.0,14.0,20.0,7.0,25.0,5.0,16.0,37.0
2,12.0,46.0,6.0,4.0,3.0,22.0,10.0,4.0,3.0,6.0,...,7.0,6.0,19.0,12.0,8.0,8.0,9.0,4.0,7.0,4.0
3,4.0,6.0,40.0,8.0,0.0,7.0,6.0,3.0,0.0,4.0,...,2.0,1.0,9.0,4.0,5.0,2.0,7.0,3.0,7.0,1.0
4,5.0,4.0,8.0,18.0,1.0,5.0,6.0,4.0,1.0,3.0,...,2.0,1.0,4.0,4.0,3.0,1.0,7.0,4.0,7.0,2.0
5,44.0,3.0,0.0,1.0,131.0,23.0,63.0,12.0,4.0,23.0,...,34.0,4.0,9.0,4.0,10.0,5.0,17.0,3.0,7.0,32.0


In [32]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size = 0):
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()

        no_rating = movie_ratings.isnull()
        common_counts = counts[user_id]
        low_significance = common_counts < SIG_LEVEL
        none_rating_idx = movie_ratings[no_rating | low_significance].index

        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)

        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > MIN_RATINGS:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]

    # 추가 - rating의 범위를 제한해서 RMSE를 낮추자
    if prediction <=1:
        prediction = 1
    elif prediction >=5:
        prediction = 5

    return prediction

SIG_LEVEL = 3
MIN_RATINGS = 3
score(CF_knn_bias_sig, 30)

0.9424551104388059

### 3.8 사용자 기반 CF과 아이템 기반 CF

In [33]:
# score 함수 재설정
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [36]:
rating_matrix_t = np.transpose(ratings_matrix)

matrix_dummy = ratings_matrix.T.copy().fillna(0)

item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity,
                               index = rating_matrix_t.index, columns = rating_matrix_t.index)

In [38]:
def CF_IBCF(user_id, movie_id):
    if movie_id in item_similarity.columns:
        sim_scores = item_similarity[movie_id]
        user_rating = rating_matrix_t[user_id]
        none_rating_idx = user_rating[user_rating.isnull()].index
        user_rating = user_rating.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
    else:
        mean_rating = 3.0

    return mean_rating

score(CF_IBCF)

1.0133776297201365