In [51]:
%matplotlib inline
import pandas as pd

In [81]:
# Load the item metadata, the training ratings, and a reproducible 20% sample
# (fixed random_state) of the validation ratings.
metadata = pd.read_csv('metadata.csv')
ratings = pd.read_csv('ratings-train.csv')
ratings_valid = (
    pd.read_csv('ratings-valid.csv')
    .sample(frac=0.2, random_state=17)
)

In [56]:
# RMSE 함수 (page. 23)
import math
def rmse(expected, answer):
    """Root-mean-square error of predicted ratings against the true ratings.

    Both frames must carry 'userid', 'itemid' and 'rating' columns.

    Args:
        expected: predicted ratings.
        answer: ground-truth ratings; every row of this frame is scored.

    Returns:
        The RMSE as a float. A (userid, itemid) pair of `answer` that has no
        prediction, or whose prediction is NaN, is scored as if 0 was predicted.
    """
    # Left join keeps every ground-truth row; after the join 'rating_x' is the
    # true rating and 'rating_y' the predicted one.
    pairs = answer.merge(expected, how='left', on=['userid', 'itemid'])
    predicted = pairs['rating_y'].fillna(0)
    squared_error = (pairs['rating_x'] - predicted) ** 2
    return math.sqrt(squared_error.mean())

# Jaccard similarity between two given users (p. 28).
# `users`: the distinct user ids of the training ratings.
users = ratings['userid'].unique()
# `i_dict`: user id -> set of item ids that user rated.
# Built with one groupby pass instead of filtering the whole ratings frame once
# per user. Seeding with empty sets keeps an entry for every id in `users`
# (groupby skips missing keys), exactly as the per-user filter did.
i_dict = {user: set() for user in users}
i_dict.update(
    (user, set(items))
    for user, items in ratings.groupby('userid', sort=False)['itemid']
)
def sim(u, v):
    """Jaccard similarity between the rated-item sets of users `u` and `v`.

    Args:
        u: user id.
        v: user id.

    Returns:
        |I_u & I_v| / |I_u | I_v| as a float in [0, 1]. Returns 0.0 when either
        user has no items in `i_dict`, including users that do not appear in
        `i_dict` at all (previously this raised KeyError).
    """
    items_u = i_dict.get(u)
    items_v = i_dict.get(v)

    # Missing or empty item set: nothing can overlap, and this also avoids a
    # division by zero on an empty union.
    if not items_u or not items_v:
        return 0.0
    return len(items_u & items_v) / float(len(items_u | items_v))

# Find the k users most similar to user u (p. 29).
def similar_users(u, k):
    """Return the `k` users with the highest `sim` score to `u`.

    Args:
        u: user id to find neighbours for; `u` itself is never returned.
        k: number of neighbours to keep.

    Returns:
        DataFrame with a single 'userid' column, most similar user first.
        Ties on similarity are ordered by user id, descending, because the
        (similarity, user) tuples are sorted in reverse.
    """
    scored = []
    for v in users:
        if u != v:
            scored.append((sim(u, v), v))
    scored.sort(reverse=True)

    nearest = [v for _, v in scored[:k]]
    return pd.DataFrame(nearest, columns=['userid'])

# predict function using User-User Collaborative Filtering (improved version, p. 31, homework).
# `r_mean`: one row per user with columns ['userid', 'mean_rating'], used to
# mean-centre the ratings.
r_mean = (
    ratings.groupby('userid')['rating']
    .mean()
    .rename('mean_rating')
    .reset_index()
)
def predict(u, i):
    """Predict user `u`'s rating of item `i` with mean-centred user-user CF.

    The prediction is the user's own mean rating plus the similarity-weighted
    average deviation (rating minus that neighbour's mean) of the 5 most
    similar users who rated `i`.

    Args:
        u: user id.
        i: item id.

    Returns:
        The predicted rating as a float.
        * If `u` has no ratings in the training data, the global mean rating
          is returned (previously this raised).
        * If the similarities to all neighbours sum to 0, or there are no
          neighbours, the user's own mean is returned (previously 0/0 = NaN).
    """
    own_mean = r_mean.loc[r_mean['userid'] == u, 'mean_rating']
    if own_mean.empty:
        # Cold-start user: no history to build neighbours or a personal mean from.
        return ratings['rating'].mean()
    u_mean = own_mean.iloc[0]

    topk_users = similar_users(u, 5)
    # A list comprehension (rather than DataFrame.apply) also works when the
    # neighbour frame is empty.
    topk_users['sim'] = [sim(u, v) for v in topk_users['userid']]

    # NOTE(review): the denominator sums the similarity of all k neighbours,
    # including those who did not rate `i`, as in the original code. Confirm
    # this matches the intended formula.
    sim_sum = topk_users['sim'].sum()
    if not sim_sum > 0:
        # No usable neighbour signal; avoid dividing by zero.
        return u_mean

    merged = pd.merge(topk_users, r_mean, on='userid')
    # The inner join keeps only the neighbours who actually rated item i.
    merged = pd.merge(merged, ratings[ratings['itemid'] == i], on='userid')
    weighted_sum = ((merged['rating'] - merged['mean_rating']) * merged['sim']).sum()

    return weighted_sum / sim_sum + u_mean

# Replace every validation rating with the model's prediction for that
# (userid, itemid) pair, then score the predictions with RMSE.
expected = ratings_valid.copy()
expected['rating'] = [
    predict(uid, iid)
    for uid, iid in zip(expected['userid'], expected['itemid'])
]

rmse(expected, ratings_valid)

2.1183615304666623

In [50]:
# Convert the ratings into the data type the surprise library works with (p. 44).
from surprise import Dataset, Reader
reader = Reader(rating_scale=(0, 10))
# load_from_df reads the frame's columns positionally as (user, item, rating),
# so select them explicitly rather than relying on the CSV's column order or
# on it having no extra columns.
ds_train = Dataset.load_from_df(
    ratings[['userid', 'itemid', 'rating']], reader
).build_full_trainset()

In [59]:
# Try the surprise library's User-User CF model (no mean correction, p. 44).
from surprise import KNNBasic

model = KNNBasic(k=5)
model.fit(ds_train)


def predict(u, i):
    """Return the current `model`'s estimated rating of item `i` for user `u`."""
    prediction = model.predict(u, i)
    return prediction.est


# Predict every (userid, itemid) pair of the validation sample and score with RMSE.
expected = ratings_valid.copy()
expected['rating'] = [
    predict(uid, iid)
    for uid, iid in zip(expected['userid'], expected['itemid'])
]

rmse(expected, ratings_valid)

Computing the msd similarity matrix...
Done computing similarity matrix.


2.275156622021938

In [60]:
# Try the surprise library's User-User CF model (with mean correction, p. 44).
from surprise import KNNWithMeans

model = KNNWithMeans(k=5)
model.fit(ds_train)


def predict(u, i):
    """Return the current `model`'s estimated rating of item `i` for user `u`."""
    prediction = model.predict(u, i)
    return prediction.est


# Predict every (userid, itemid) pair of the validation sample and score with RMSE.
expected = ratings_valid.copy()
expected['rating'] = [
    predict(uid, iid)
    for uid, iid in zip(expected['userid'], expected['itemid'])
]

rmse(expected, ratings_valid)

Computing the msd similarity matrix...
Done computing similarity matrix.


2.1668738952586426

In [245]:
# Try the surprise library's matrix factorization model (p. 44).
from surprise import SVD

# Unbiased SVD with a fixed seed so the run is repeatable.
model = SVD(biased=False, random_state=17)
model.fit(ds_train)


def predict(u, i):
    """Return the current `model`'s estimated rating of item `i` for user `u`."""
    prediction = model.predict(u, i)
    return prediction.est


# Predict every (userid, itemid) pair of the validation sample and score with RMSE.
expected = ratings_valid.copy()
expected['rating'] = [
    predict(uid, iid)
    for uid, iid in zip(expected['userid'], expected['itemid'])
]

rmse(expected, ratings_valid)

2.114879457135263

In [252]:
# Try different options for the surprise library's matrix factorization model (p. 44).
from surprise import SVD

# Biased SVD with 200 factors trained for 100 epochs; fixed seed for repeatability.
model = SVD(n_factors=200, biased=True, n_epochs=100, random_state=17)
model.fit(ds_train)


def predict(u, i):
    """Return the current `model`'s estimated rating of item `i` for user `u`."""
    prediction = model.predict(u, i)
    return prediction.est


# Predict every (userid, itemid) pair of the validation sample and score with RMSE.
expected = ratings_valid.copy()
expected['rating'] = [
    predict(uid, iid)
    for uid, iid in zip(expected['userid'], expected['itemid'])
]

rmse(expected, ratings_valid)

1.9558362434755228

In [260]:
# List the movies user 'QjEzc3k=' rated 10, then print the 10-point movie lists of
# the two users with the highest cosine similarity to that user (p. 48).

def show_hist(u, min_rating=0):
    """Return the titles and ratings of the items user `u` rated in `ratings`.

    Args:
        u: user id as stored in ``ratings['userid']``.
        min_rating: keep only rows whose rating is at least this value. The
            default of 0 reproduces the previously hard-coded ``rating >= 0``
            filter; pass 10 to list only the items rated 10, which is what the
            comment above this cell describes.

    Returns:
        DataFrame with columns ['title', 'rating']. Items without a matching
        row in `metadata` are dropped by the inner join.
    """
    rated = ratings[ratings['userid'] == u]
    rated = pd.merge(rated, metadata, on='itemid')
    return rated.loc[rated['rating'] >= min_rating, ['title', 'rating']]

import numpy as np

u = 'QjEzc3k='
# Row index of this user inside the trained model.
inner_u = ds_train.to_inner_uid(u)

# Scale every row of model.pu to unit L2 norm, so that a dot product between
# two rows equals their cosine similarity.
ufeat = np.divide(model.pu, np.linalg.norm(model.pu, axis=1, keepdims=True))
p_u = ufeat[inner_u, :]
show_hist(u)

Unnamed: 0,title,rating
0,써니 (2011),10
1,레옹 (1994),0
2,조커 (2019),0
3,이레셔널 맨 (2015),0
4,"스탠바이, 웬디 (2017)",10
5,타짜: 원 아이드 잭 (2019),0
6,82년생 김지영 (2019),10
7,애드 아스트라 (2019),0
8,맨하탄 녹턴 (2016),10
9,써드 퍼슨 (2013),0


In [254]:
# Dot product of every normalised user vector with p_u, i.e. cosine similarity to u.
sims = ufeat.dot(p_u)
# Exclude the user themself from the ranking.
sims[inner_u] = -np.inf
# Raw ids of the two users with the highest similarity.
cands = list(map(ds_train.to_raw_uid, np.argsort(-sims)[:2]))

In [255]:
# Rating history of the user ranked most similar to u.
show_hist(cands[0])

Unnamed: 0,title,rating
1,토이 스토리 4 (2019),10
3,어메이징 스파이더맨 (2012),10
4,고스트버스터즈 (2016),10
6,예스터데이 (2019),10
8,82년생 김지영 (2019),10
9,맨하탄 녹턴 (2016),10
11,크리스틴 (2016),10
12,메기 (2018),10
14,스노우 화이트 (2019),10
15,더 글래스 캐슬 (2017),10


In [256]:
# Rating history of the user ranked second most similar to u.
show_hist(cands[1])

Unnamed: 0,title,rating
1,태극기 휘날리며 (2004),10
2,아쿠아맨 (2018),10
6,제로 다크 서티 (2012),10
8,더 포스트 (2017),10
9,존 윅 (2014),10
10,스파이더맨 2 (2004),10
11,유전 (2018),10
14,타이타닉 (1997),10
16,매드맥스: 분노의 도로 (2015),10
20,괴물 (2006),10


In [258]:
# Export data for visualising similar entities (p. 49).
# Each row of model.qi becomes one tab-separated line, with no header or index column.
df_qi = pd.DataFrame(model.qi)
df_qi.to_csv('qi.tsv', sep='\t', index=False, header=False)

In [242]:
# Raw item ids in the order ds_train.all_items() yields them; presumably the
# same order as the rows of model.qi written to qi.tsv (verify).
df_iids = pd.DataFrame(
    data=[ds_train.to_raw_iid(i) for i in ds_train.all_items()],
    columns=['itemid'],
)
# Left-join against de-duplicated metadata so exactly one line is written per
# item, in the same order as df_iids. An inner join silently drops items that
# have no metadata row (and duplicates items listed twice), which would shift
# every following title relative to the exported vectors.
df_titles = pd.merge(
    df_iids,
    metadata[['itemid', 'title']].drop_duplicates(subset='itemid'),
    on='itemid',
    how='left',
)
# Fall back to the raw item id when no title is available.
df_titles['title'] = df_titles['title'].fillna(df_titles['itemid'].astype(str))
assert len(df_titles) == len(df_iids), 'titles.tsv must contain one line per item'
df_titles['title'].to_csv('titles.tsv', sep='\t', header=False, index=False)