# Memory Based 협업필터링

## (1) 사용자 기반

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
가 = [5, 2, 4, 3, np.nan]
나 = [1, 4, 1, np.nan, 4]
다 = [4, 4, np.nan, 5, 3]
라 = [np.nan, 2, 2, 4, 3]
마 = [4, np.nan, 4, 4, 2]
바 = [4, 2, 1, np.nan, 5]

col = ['건축학개론', '늑대소년', '너의결혼식', '해리포터', '반지의제왕']

In [3]:
dic = {'가':가, '나':나, '다':다, '라':라, '마':마, '바':바}

data = pd.DataFrame(dic).T
data.columns = col
data

Unnamed: 0,건축학개론,늑대소년,너의결혼식,해리포터,반지의제왕
가,5.0,2.0,4.0,3.0,
나,1.0,4.0,1.0,,4.0
다,4.0,4.0,,5.0,3.0
라,,2.0,2.0,4.0,3.0
마,4.0,,4.0,4.0,2.0
바,4.0,2.0,1.0,,5.0


In [4]:
def vector_size(lst):
    return np.sqrt((lst * lst).sum())

def cosine_similarity(data, a, b):
    # 데이터 불러오기
    one = data.loc[a]
    two = data.loc[b]
    
    # 평점을 매긴 영화 이름 불러오기
    one_idx = one[one>=0].index
    two_idx = two[two>=0].index
    
    # 두명의 공통 영화 불러오기
    lst = list(set(one_idx) & set(two_idx))
    
    one = one[lst]
    two = two[lst]
    
    result = (one*two).sum() / (vector_size(one) * vector_size(two))
    
    return np.round(result, 2)

def user_matrix(df):
    dic = {}
    for i in tqdm(df.index):
        lst = []
        for j in tqdm(df.index):
            lst.append(cosine_similarity(df, i, j))

        dic[i] = lst

    return pd.DataFrame(dic, index=df.index)

In [5]:
u_matrix = user_matrix(data)
u_matrix

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 73.82it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 125.00it/s][A
 33%|████████████████████████████                                                        | 2/6 [00:00<00:00, 13.77it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 150.89it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 125.08it/s][A
 67%|████████████████████████████████████████████████████████                            | 4/6 [00:00<00:00, 14.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 91.87it/s][A

100%|█████████████████

Unnamed: 0,가,나,다,라,마,바
가,1.0,0.6,0.92,0.91,0.98,0.91
나,0.6,1.0,0.87,0.93,0.63,0.83
다,0.92,0.87,1.0,0.97,0.99,0.91
라,0.91,0.93,0.97,1.0,0.93,0.93
마,0.98,0.63,0.99,0.93,1.0,0.77
바,0.91,0.83,0.91,0.93,0.77,1.0


## '마'의 늑대소년 점수 예측
## - 전체를 대상으로 유사도 기반의 weighted sum 값

In [6]:
cosine_similarity_other = u_matrix.loc['마'].drop(['마'], axis=0)
movie_rating_other = data['늑대소년'].drop(['마'], axis=0)
org_user_matrix_score = (cosine_similarity_other * movie_rating_other).sum() / cosine_similarity_other.sum()
org_user_matrix_score

2.7534883720930234

## (2) 아이템 기반

In [7]:
def item_matrix(df):
    dic = {}
    for i in df.T.index:
        lst = []
        for j in df.T.index:
            lst.append(cosine_similarity(df.T, i, j))

        dic[i] = lst

    return pd.DataFrame(dic, index=df.T.index)

In [8]:
i_matrix = item_matrix(data)
i_matrix

Unnamed: 0,건축학개론,늑대소년,너의결혼식,해리포터,반지의제왕
건축학개론,1.0,0.79,0.92,0.96,0.86
늑대소년,0.79,1.0,0.73,0.98,0.91
너의결혼식,0.92,0.73,1.0,0.94,0.67
해리포터,0.96,0.98,0.94,1.0,0.99
반지의제왕,0.86,0.91,0.67,0.99,1.0


## '마'의 늑대소년 점수 예측
## - 전체를 대상으로 유사도 기반의 weighted sum 값

In [9]:
cosine_similarity_other = i_matrix.loc['늑대소년'].drop(['늑대소년'], axis=0)
movie_rating_other = data.loc['마'].drop(['늑대소년'], axis=0)
org_item_matrix_score = (cosine_similarity_other * movie_rating_other).sum() / cosine_similarity_other.sum()
org_item_matrix_score

3.466275659824047

# [1,1,1,1,1] [5,5,5,5,5] 인 경우 코사인 유사도가 높게 나오는 문제점 발생

## 사용자의 평균값을 빼줌으로써 보정한 코사인 유사도

In [10]:
def vector_size_transform(lst, mean):
    return np.sqrt(((lst-mean) * (lst-mean)).sum())

def cosine_similarity_transform(data, a, b):
    e = 1e-5
    # 데이터 불러오기
    one = data.loc[a]
    two = data.loc[b]
    
    one_idx = one[one>=0].index
    two_idx = two[two>=0].index
    
    # 두명의 공통 영화 불러오기
    lst = list(set(one_idx) & set(two_idx))
    
    one = one[lst]
    two = two[lst]
    
    result = ((one-one.mean()) * (two-two.mean())).sum() / (vector_size_transform(one, one.mean()) * vector_size_transform(two, two.mean()) + e)
    
    return np.round(result, 2)


def user_matrix_transform(df):
    dic = {}
    for i in tqdm(df.index):
        lst = []
        for j in tqdm(df.index):
            lst.append(cosine_similarity_transform(df, i, j))

        dic[i] = lst

    return pd.DataFrame(dic, index=df.index)


def item_matrix_transform(df):
    dic = {}
    for i in df.T.index:
        lst = []
        for j in df.T.index:
            lst.append(cosine_similarity_transform(df.T, i, j))

        dic[i] = lst

    return pd.DataFrame(dic, index=df.T.index)

In [11]:
u_matrix2 = user_matrix_transform(data)
u_matrix2

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]
  0%|                                                                                            | 0/6 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 53.02it/s][A
 17%|██████████████                                                                      | 1/6 [00:00<00:00,  7.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 61.73it/s][A
 33%|████████████████████████████                                                        | 2/6 [00:00<00:00,  8.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 83.43it/s][A

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 75.05it/s][A
 67%|███████████████████

Unnamed: 0,가,나,다,라,마,바
가,1.0,-0.94,-0.19,0.0,0.0,0.5
나,-0.94,1.0,-0.5,0.5,-1.0,0.32
다,-0.19,-0.5,1.0,0.5,0.87,-0.76
라,0.0,0.5,0.5,1.0,0.0,0.97
마,0.0,-1.0,0.87,0.0,1.0,-0.69
바,0.5,0.32,-0.76,0.97,-0.69,1.0


In [12]:
cosine_similarity_other = u_matrix2.loc['마'].drop(['마'], axis=0)
movie_rating_other = data['늑대소년'].drop(['마'], axis=0)
transform_user_matrix_score = (cosine_similarity_other * movie_rating_other).sum() / cosine_similarity_other.sum()
transform_user_matrix_score

2.317073170731707

In [14]:
i_matrix2 = item_matrix_transform(data)
i_matrix2

Unnamed: 0,건축학개론,늑대소년,너의결혼식,해리포터,반지의제왕
건축학개론,1.0,-0.67,0.67,-0.87,-0.26
늑대소년,-0.67,1.0,-0.47,0.87,-0.3
너의결혼식,0.67,-0.47,1.0,-0.5,-0.91
해리포터,-0.87,0.87,-0.5,1.0,0.5
반지의제왕,-0.26,-0.3,-0.91,0.5,1.0


In [15]:
cosine_similarity_other = i_matrix2.loc['늑대소년'].drop(['늑대소년'], axis=0)
movie_rating_other = data.loc['마'].drop(['늑대소년'], axis=0)
transform_item_matrix_score = (cosine_similarity_other * movie_rating_other).sum() / cosine_similarity_other.sum()
transform_item_matrix_score

2.9473684210526323

## 영화 평점 데이터로 실습

In [16]:
rating_data = pd.read_csv('../data/movie_data/ratings.csv')
movie_data = pd.read_csv('../data/movie_data/movies.csv')

In [17]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [18]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## 불필요한 column제거

In [19]:
rating_data.drop('timestamp', axis=1, inplace=True)
rating_data.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [20]:
movie_data.drop('genres', axis=1, inplace=True)
movie_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [21]:
user_movie_data = pd.merge(rating_data, movie_data, on='movieId')
user_movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,31,2.5,Dangerous Minds (1995)
1,7,31,3.0,Dangerous Minds (1995)
2,31,31,4.0,Dangerous Minds (1995)
3,32,31,4.0,Dangerous Minds (1995)
4,36,31,3.0,Dangerous Minds (1995)


## user-item matrix

In [22]:
user_movie_rating = user_movie_data.pivot_table(index = 'userId', columns='title', values='rating').fillna(0)
user_movie_rating.head()

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## item-user matrix

In [23]:
moive_user_rating = user_movie_rating.T
moive_user_rating.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies (1934),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 10명의 유저 user matrix 구하기

In [24]:
u_matrix = user_matrix(user_movie_rating[:10])
u_matrix

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s][A
 10%|████████▎                                                                          | 1/10 [00:00<00:01,  4.73it/s][A
 20%|████████████████▌                                                                  | 2/10 [00:00<00:01,  4.94it/s][A
 30%|████████████████████████▉                                                          | 3/10 [00:00<00:01,  5.65it/s][A
 40%|█████████████████████████████████▏                                                 | 4/10 [00:00<00:01,  6.00it/s][A
 50%|█████████████████████████████████████████▌                                         | 5/10 [00:00<00:00,  5.88it/s][A
 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:01<00:00,  5.58it/s][A
 70%|██████████████

 50%|█████████████████████████████████████████▌                                         | 5/10 [00:00<00:00,  5.00it/s][A
 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:01<00:00,  4.94it/s][A
 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:01<00:00,  5.19it/s][A
 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:01<00:00,  5.12it/s][A
 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [00:01<00:00,  4.91it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.98it/s][A
 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:12<00:08,  2.01s/it]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s][A
 10%|████████▎     

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.0,0.0,0.0,0.07,0.02,0.0,0.08,0.0,0.01,0.0
2,0.0,1.0,0.12,0.12,0.1,0.0,0.21,0.11,0.11,0.04
3,0.0,0.12,1.0,0.08,0.15,0.06,0.15,0.25,0.13,0.11
4,0.07,0.12,0.08,1.0,0.13,0.08,0.32,0.19,0.03,0.14
5,0.02,0.1,0.15,0.13,1.0,0.06,0.1,0.17,0.09,0.03
6,0.0,0.0,0.06,0.08,0.06,1.0,0.0,0.13,0.02,0.05
7,0.08,0.21,0.15,0.32,0.1,0.0,1.0,0.15,0.06,0.19
8,0.0,0.11,0.25,0.19,0.17,0.13,0.15,1.0,0.16,0.16
9,0.01,0.11,0.13,0.03,0.09,0.02,0.06,0.16,1.0,0.13
10,0.0,0.04,0.11,0.14,0.03,0.05,0.19,0.16,0.13,1.0


In [82]:
def predict_movie_score(user_id):
    sim = u_matrix.loc[user_id].drop([user_id], axis=0).values
    
    score = []
    for i in user_movie_rating.iloc[:10].columns[:10]:
        temp = user_movie_rating.iloc[:10][i].drop([user_id], axis=0)
        score.append(((temp * sim).sum() / sim.sum(), i))
    
    # Top - 5 선택
    movie = []
    for i in sorted(score, key=lambda x: x[0], reverse=True)[:5]:
        movie.append(i[1])
        
    # user_id가 본 영화 제외
    seen_movie = user_movie_rating.loc[user_id][user_movie_rating.loc[user_id] != 0].index
    
    return movie[movie not in seen_movie.values]

In [83]:
predict_movie_score(8)

  return movie[movie not in seen_movie.values]


'"Great Performances" Cats (1998)'