<a href="https://colab.research.google.com/github/bwowby/DS/blob/master/collaborate_filtering_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

### Load Data

In [None]:
# Shortened URL of the ratings CSV that is read with pd.read_csv below.
ratings_url = "https://bit.ly/dsml-01-ratings2"

In [None]:
# Read the ratings CSV from `ratings_url` into a DataFrame and preview the
# first rows.
data = pd.read_csv(ratings_url)
data.head()

Unnamed: 0,사람,책,평점
0,민지,백설공주,5.0
1,민지,신데렐라,4.0
2,민지,어린왕자,1.0
3,민지,흥부전,3.0
4,현우,노인과바다,3.0


In [None]:
# Reshape the long-format ratings into a person-by-book matrix; combinations
# with no rating show up as NaN.
ratings = data.pivot_table(values="평점", index="사람", columns="책")
ratings

책,노인과바다,백설공주,신데렐라,어린왕자,콩쥐팥쥐,흥부전
사람,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
민수,3.0,4.0,4.0,3.0,4.0,
민지,,5.0,4.0,1.0,,3.0
지민,4.0,1.0,,5.0,2.0,3.0
지연,5.0,,3.0,4.0,3.0,3.0
현우,3.0,,2.0,,1.0,2.0


### 코사인 유사도 계산하기

In [None]:
''' u v 사용자
u = [3,4,3]
v = [3,2,4]
이 두명의 사용자의 유사도를 계산하자
'''
# The note above introduces two example users, u and v, whose similarity is
# computed in the following cells.
# NumPy arrays are used so the vector arithmetic stays simple.
u, v = np.array([3, 4, 3]), np.array([3, 2, 4])

In [None]:
# Numerator of the cosine similarity: multiply element-wise, then sum.
uvdot = (v * u).sum()
# Not the last expression of the cell, so this value is not displayed.
uvdot

# Denominator: sum of squares of each vector, square root applied afterwards.
norm1, norm2 = (u * u).sum(), (v * v).sum()

score = uvdot / np.sqrt(norm2 * norm1)
score

0.9235481451827989

In [None]:
# The similarity assumes a rating exists; wherever either user has no rating
# the position must be skipped, so only items rated by both users are used.
u, v = np.array([np.nan, 4, 3]), np.array([3, 2, np.nan])


In [None]:
# Pick out the entries that are not NaN. Only the final expression of the
# cell is displayed, so the first mask below is evaluated but not shown.
~np.isnan(u)
(np.isfinite(u), np.isfinite(v))

(array([False,  True,  True]), array([ True,  True, False]))

In [None]:
# Boolean mask of the positions where both users have a rating, then keep
# only those positions in each vector.
mask = np.isfinite(v) & np.isfinite(u)
u, v = u[mask], v[mask]
(u, v)

(array([4.]), array([2.]))

In [None]:
# Wrap the steps above in a reusable function
def get_cosine_similarity(u, v):
    """Return the cosine similarity of two rating vectors over co-rated items.

    Positions where either vector is missing (NaN) or otherwise non-finite
    are ignored, so only items rated by both users contribute.

    Parameters
    ----------
    u, v : numpy.ndarray, pandas.Series, list or tuple
        Rating vectors of equal length. Lists and tuples are converted to
        float arrays; arrays and Series are used as given.

    Returns
    -------
    float
        Dot product of the co-rated entries divided by the product of their
        Euclidean norms. NaN is returned when the similarity is undefined,
        i.e. when there are no co-rated items or one of the masked vectors
        has zero norm.
    """
    # Plain Python sequences do not support boolean-mask indexing, so turn
    # them into float arrays first.
    if isinstance(u, (list, tuple)):
        u = np.asarray(u, dtype=float)
    if isinstance(v, (list, tuple)):
        v = np.asarray(v, dtype=float)

    # Keep only the positions where both vectors hold a finite rating.
    mask = np.isfinite(u) & np.isfinite(v)
    u = u[mask]
    v = v[mask]

    uvdot = (u * v).sum()
    norm1 = (u ** 2).sum()
    norm2 = (v ** 2).sum()
    denominator = np.sqrt(norm1 * norm2)

    # A zero denominator means 0/0: report "undefined" explicitly instead of
    # relying on a floating-point division that emits a RuntimeWarning.
    if denominator == 0:
        return np.nan

    return uvdot / denominator

# Try the function on vectors where one user has a missing rating.
u, v = np.array([np.nan, 4, 3]), np.array([3, 2, 4])

get_cosine_similarity(u, v)

0.8944271909999159

In [None]:
# Compare two rows (people) taken from the ratings matrix.
u, v = ratings.loc["민수"], ratings.loc["지민"]

get_cosine_similarity(u, v)

0.8132062148225916

In [None]:
#전체 사용자에 대한 유사도 구하기
from itertools import product

def get_cosine_similarity_table(ratings):
    """Build a table of cosine similarities for every ordered pair of rows.

    Each pair of index labels of `ratings` (including a label paired with
    itself) is looked up with `.loc`, scored with `get_cosine_similarity`,
    and the resulting (u, v, score) records are pivoted into a table.

    Parameters
    ----------
    ratings : pandas.DataFrame
        One row per user; rows are selected by index label.

    Returns
    -------
    pandas.DataFrame
        Result of `pd.pivot_table` with index "u", columns "v" and the
        scores as values.
    """
    # One record per ordered pair of row labels, in itertools.product order.
    records = [
        {
            'u': uname,
            'v': vname,
            'score': get_cosine_similarity(ratings.loc[uname], ratings.loc[vname]),
        }
        for uname, vname in product(ratings.index, repeat=2)
    ]

    pair_scores = pd.DataFrame(records)
    return pd.pivot_table(pair_scores, index="u", columns="v", values="score")

# Build the similarity table for every pair of users in `ratings`.
similarity_table = get_cosine_similarity_table(ratings)
# The original, misspelled name is kept as an alias so any code that still
# refers to it keeps working.
similarty_table = similarity_table
similarity_table

v,민수,민지,지민,지연,현우
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
민수,1.0,0.939827,0.813206,0.938986,0.876523
민지,0.939827,1.0,0.542857,0.840841,0.989949
지민,0.813206,0.542857,1.0,0.974406,0.992583
지연,0.938986,0.840841,0.974406,1.0,0.980581
현우,0.876523,0.989949,0.992583,0.980581,1.0
