In [1]:
%matplotlib inline
import pandas as pd

In [2]:
# Load the data (the validation set is a 20% uniform random sample).
metadata = pd.read_csv('metadata.csv')
ratings_train = pd.read_csv('ratings-train.csv')
ratings_valid = (
    pd.read_csv('ratings-valid.csv')
    .sample(frac=0.2, random_state=17)  # fixed seed keeps the sample repeatable
)

In [5]:
# RMSE
def rmse(expected, answer):
    """Root-mean-square error of predicted ratings against true ratings.

    Args:
        expected: DataFrame of predictions with 'userid', 'itemid' and
            'rating' columns.
        answer: DataFrame of true ratings with the same three columns.

    Returns:
        The RMSE over the rows of ``answer``. A row of ``answer`` with no
        matching (userid, itemid) prediction is scored as if the prediction
        were 0; a NaN prediction is treated the same way.
    """
    # Left join keeps every true rating; after the join 'rating_x' is the true
    # rating and 'rating_y' the prediction.
    paired = answer.merge(expected, how='left', on=['userid', 'itemid'])
    predicted = paired['rating_y'].fillna(0)
    residual = paired['rating_x'] - predicted
    return (residual ** 2).mean() ** 0.5

In [6]:
# Jaccard similarity between the item sets of two users:
# u, v => len(i_dict[u] & i_dict[v]) / len(i_dict[u] | i_dict[v])
all_users = ratings_train['userid'].unique()
# Collect each user's rated items in one groupby pass instead of re-filtering
# the whole ratings frame once per user.
_items_by_user = {u: set(items)
                  for u, items in ratings_train.groupby('userid', sort=False)['itemid']}
# Key the lookup by all_users so every id listed there has an entry (an empty
# set for any id that groupby skips, e.g. a missing userid).
i_dict = {u: _items_by_user.get(u, set()) for u in all_users}
def sim(u, v):
    """Return the Jaccard similarity of users ``u`` and ``v``.

    The similarity is ``|I_u & I_v| / |I_u | I_v|``, where ``I_x`` is the set
    stored for user ``x`` in the module-level ``i_dict``.

    Args:
        u: Id of the first user (a key of ``i_dict``).
        v: Id of the second user (a key of ``i_dict``).

    Returns:
        A float in [0, 1]; 0.0 when both item sets are empty.

    Raises:
        KeyError: If either id is not a key of ``i_dict``.
    """
    items_u = i_dict[u]
    items_v = i_dict[v]

    union = items_u | items_v
    if not union:
        # Both sets are empty: the ratio is undefined, report no similarity.
        return 0.0
    return len(items_u & items_v) / len(union)

In [8]:
# Spot-check: Jaccard similarity for one pair of user ids.
sim('TERhUA==', 'Q1ladXM=')

0.08333333333333333

In [9]:
def similar_users(u, k):
    """Return the ``k`` users most similar to ``u`` as a one-column DataFrame.

    Every other id in the module-level ``all_users`` is scored with ``sim``;
    the result is ordered by descending similarity, with ties broken by
    descending user id.

    Args:
        u: Id of the reference user; it is excluded from the result.
        k: Maximum number of neighbours to return.

    Returns:
        A DataFrame with a single 'userid' column holding at most ``k`` ids.
    """
    scored = []
    for other in all_users:
        if u != other:
            scored.append((sim(u, other), other))

    # Tuples compare by similarity first, then by user id.
    scored.sort(reverse=True)
    neighbours = [other for _, other in scored[:k]]

    return pd.DataFrame(neighbours, columns=['userid'])

In [11]:
# Spot-check: the 5 users most similar to one user id.
similar_users('TERhUA==', 5)

Unnamed: 0,userid
0,YzkyQQ==
1,NGdmcVQ=
2,M2hETGQ=
3,V0NyaQ==
4,QTB5d0E=


In [40]:
# Per-user mean training rating, as a frame with 'userid' and 'rating' columns.
r_mean = ratings_train.groupby('userid')['rating'].mean().reset_index()

In [50]:
# Per-user mean training rating ('userid', 'rating'); predict() reads it as a
# global. NOTE(review): this repeats the statement of the preceding cell verbatim.
r_mean = ratings_train.groupby('userid')['rating'].mean().reset_index()
def predict(u, i):
    """Predict user ``u``'s rating of item ``i`` with mean-centred user-user CF.

    The 10 users most similar to ``u`` are looked up with ``similar_users``;
    those among them who rated ``i`` in ``ratings_train`` contribute their
    deviation from their own mean rating, weighted by ``sim``:

        r_ui = mean_u + sum(sim_uv * (r_vi - mean_v)) / sum(sim_uv)

    Args:
        u: Id of the target user.
        i: Id of the item to score.

    Returns:
        The predicted rating. Falls back to ``u``'s mean training rating when
        none of the neighbours rated ``i`` or when the contributing
        similarities sum to zero.

    Raises:
        IndexError: If ``u`` has no row in ``r_mean``.
    """
    topk = similar_users(u, 10)
    # A list comprehension yields the same values as a row-wise
    # DataFrame.apply without building a Series per row.
    topk['sim'] = [sim(u, v) for v in topk['userid']]

    # Keep only the neighbours who rated item i, then attach each neighbour's
    # mean rating. Both right-hand frames carry a 'rating' column, so after the
    # second merge 'rating_x' is the neighbour's rating of i and 'rating_y' is
    # the neighbour's mean rating.
    rated_i = ratings_train[ratings_train['itemid'] == i]
    joined = pd.merge(topk, rated_i, on='userid')
    joined = pd.merge(joined, r_mean, on='userid')

    mean_u = r_mean[r_mean['userid'] == u]['rating'].iloc[0]
    sim_sum = joined['sim'].sum()
    r_ui = mean_u
    # Guard on the similarity mass, not the row count: neighbours whose
    # similarity is 0 would otherwise produce 0/0 = NaN, which rmse() then
    # silently scores as a rating of 0. An empty frame also sums to 0.
    if sim_sum != 0:
        weighted_dev = joined['sim'] * (joined['rating_x'] - joined['rating_y'])
        r_ui += weighted_dev.sum() / sim_sum

    return r_ui
    
# Score every validation (userid, itemid) pair with predict(), overwrite the
# 'rating' column of a copy with those scores, and report the RMSE.
expected = ratings_valid.assign(
    rating=ratings_valid.apply(
        lambda row: predict(row['userid'], row['itemid']), axis=1))

rmse(expected, ratings_valid)

2.191210568409944

In [51]:
# Try out the Surprise library's User-User CF (p. 44)
from surprise import Reader, Dataset
reader = Reader(rating_scale=(0, 10))
# load_from_df expects exactly the (user, item, rating) columns, in that order.
# Selecting them explicitly matches how the combined trainset is built later in
# this notebook and keeps this cell re-runnable after extra columns have been
# added to ratings_train.
train_ds = Dataset.load_from_df(
    ratings_train[['userid', 'itemid', 'rating']], reader).build_full_trainset()

In [72]:
from surprise import KNNBasic, KNNWithMeans, SVD

# Fit an SVD model on the training set; the seed is fixed so the fit is
# repeatable.
model = SVD(
    n_factors=200,
    n_epochs=200,
    lr_all=0.0052,
    reg_all=0.04,
    random_state=17,
    verbose=True,  # prints one "Processing epoch N" line per epoch
)
model.fit(train_ds)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f531df0eac8>

In [73]:
def predict(u, i):
    """Return the global ``model``'s estimated rating of item ``i`` by user ``u``.

    NOTE(review): this redefines ``predict`` and shadows the user-user CF
    version defined earlier in the notebook.
    """
    prediction = model.predict(u, i)
    return prediction.est

# Score every validation (userid, itemid) pair with the current predict(),
# overwrite the 'rating' column of a copy with those scores, and report the RMSE.
expected = ratings_valid.assign(
    rating=ratings_valid.apply(
        lambda row: predict(row['userid'], row['itemid']), axis=1))

rmse(expected, ratings_valid)

1.9359638225728149

In [75]:
# Shape of the fitted model's item-factor matrix `qi`; the second dimension
# equals the n_factors the model was built with.
model.qi.shape

(5834, 200)

In [76]:
# Wrap the item factors in a DataFrame (one row per row of model.qi) for export.
df_qi = pd.DataFrame(model.qi)

In [86]:
# Write the item factors as tab-separated values with no header and no index.
df_qi.to_csv('qi.tsv', sep='\t', header=False, index=False)

In [87]:
# Raw item ids in the trainset's inner-id iteration order.
df_iids = pd.DataFrame([train_ds.to_raw_iid(i) for i in train_ds.all_items()],
                       columns=['itemid'])
# NOTE(review): assumes titles.tsv is meant to line up row-for-row with the
# item factors exported to qi.tsv - confirm.
# Left-join against de-duplicated metadata: an inner join drops items that have
# no metadata row, and duplicated metadata rows add lines; either one shifts
# every later title off its item row.
df_titles = pd.merge(df_iids,
                     metadata[['itemid', 'title']].drop_duplicates(subset='itemid'),
                     on='itemid', how='left')
# Items without a title fall back to their raw item id.
df_titles['title'] = df_titles['title'].fillna(df_titles['itemid'].astype(str))
assert len(df_titles) == len(df_iids), 'titles must stay aligned with the item rows'
df_titles['title'].to_csv('titles.tsv', sep='\t', header=False, index=False)

In [96]:
# Load the custom ratings file and tag each source with a 'dataitgirls' flag
# (1 = rows from the custom file, 0 = rows from the original training ratings).
# NOTE(review): the last assignment adds a column to ratings_train in place, so
# an earlier cell that passes the whole ratings_train frame to another API sees
# the extra column if it is re-run after this one.
ratings_custom = pd.read_csv('영화 평가 - 시트1.csv')

ratings_custom['dataitgirls'] = 1
ratings_train['dataitgirls'] = 0

In [97]:
# Stack the custom ratings on top of the original training ratings.
ratings_concat = pd.concat((ratings_custom, ratings_train))

In [100]:
# Rebuild the Surprise trainset from the combined ratings, passing only the
# user, item and rating columns. This rebinds train_ds.
train_ds = Dataset.load_from_df(
    ratings_concat[['userid', 'itemid', 'rating']], reader).build_full_trainset()

In [101]:
# Refit SVD on the combined trainset with a fixed seed. This rebinds model.
# biased=False is Surprise's switch for fitting without baseline (bias) terms.
model = SVD(n_factors=150, n_epochs=100, random_state=17, biased=False)
model.fit(train_ds)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f531dfe7f98>

In [104]:
# Distinct user ids in the custom ratings, shown together with their count.
people = ratings_custom['userid'].unique()
people, len(people)

(array(['adela', 'bomin', 'chiwan', 'dahye', 'danbi', 'gilim', 'hansol',
        'Heeyawl', 'kwang', 'mihyeon', 'minju', 'Song', 'sunmi', 'wooju',
        'yeeun', 'yeseul'], dtype=object), 16)

In [106]:
# Spot-check: the trainset's inner user id for one raw user id.
train_ds.to_inner_uid('chiwan')

2

In [107]:
# Gather the user-factor row of every custom-ratings user, in the order the
# ids appear in `people`.
names = list(people)
pu = []
for name in names:
    # model.pu is indexed by the trainset's inner user id, not the raw id.
    inner_uid = train_ds.to_inner_uid(name)
    pu.append(model.pu[inner_uid])

In [110]:
# One row of user factors per collected user, in the same order as `names`.
df_pu = pd.DataFrame(pu)

In [113]:
# Single-column frame of the user ids, in the same order as the rows of `pu`.
df_names = pd.DataFrame(names)

In [115]:
# Export the user factors and the matching user ids as headerless, index-free
# TSV files; both frames were built from the same loop, so row k of names.tsv
# labels row k of pu.tsv.
df_pu.to_csv('pu.tsv', sep='\t', index=False, header=False)
df_names.to_csv('names.tsv', sep='\t', index=False, header=False)