- 아주 간단한 샘플 데이터로 사용자 기반 협업 필터링 알고리즘을 구현
    - 유클리디안 거리, 코사인 유사도
- 캐글의 영화 평점 데이터로 알고리즘을 구현
    - 유사도, 그룹핑 개수 설정

- 네이버의 추천시스템
- https://m.blog.naver.com/naver_diary/220936643956


In [None]:
# 샘플데이터 만들기 AiRS


In [1]:
import numpy as np
import pandas as pd
from scipy import spatial

In [2]:

# Row (user) and column (article) labels for the sample rating matrix.
columns = [
    "article_1", "article_2", "article_3",
    "article_4", "article_5", "article_6",
]
index = ["user_1", "user_2", "user_3", "user_4", "user_5"]

# One row per user, one column per article.
# The similarity helpers in this notebook drop 0 entries, i.e. 0 means "not rated".
data = np.array([
    [5, 3, 0, 0, 2, 3],
    [2, 0, 0, 1, 4, 1],
    [0, 0, 5, 3, 1, 2],
    [4, 0, 4, 5, 0, 5],
    [0, 0, 1, 2, 0, 0],
])

# Labelled user x article frame used by every later cell.
sample_df = pd.DataFrame(data=data, index=index, columns=columns)
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,5,3,0,0,2,3
user_2,2,0,0,1,4,1
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5
user_5,0,0,1,2,0,0


In [3]:
def intersect_nonzero(vector_1, vector_2):
    """Restrict two rating vectors to the positions that are non-zero in both.

    Args:
        vector_1: 1-D numpy array of ratings.
        vector_2: 1-D numpy array of ratings.

    Returns:
        A ``(vector_1, vector_2)`` tuple holding only the entries at the
        shared non-zero positions, in ascending position order. Both arrays
        are empty when no position is non-zero in both inputs.
    """
    nonzero_1 = vector_1.nonzero()[0]
    nonzero_2 = vector_2.nonzero()[0]

    # intersect1d returns the sorted, unique positions present in both.
    shared = np.intersect1d(nonzero_1, nonzero_2)

    return vector_1[shared], vector_2[shared]

In [4]:
# Keep only the ratings for articles that both user_1 and user_2 rated
# (non-zero in both rows).
intersect_nonzero(
    sample_df.loc["user_1"].values,
    sample_df.loc["user_2"].values
)

(array([5, 2, 3]), array([2, 4, 1]))

In [6]:
# user_1 and user_5 have no article that is non-zero in both rows,
# so both returned arrays are empty.
intersect_nonzero(
    sample_df.loc["user_1"].values,
    sample_df.loc["user_5"].values
)

(array([], dtype=int32), array([], dtype=int32))

In [8]:
def euclidean_similarity(vec1, vec2):
    """Euclidean distance between two rating vectors over their co-rated items.

    Only positions that are non-zero in both vectors take part. Despite the
    name, the returned value is a distance: 0 means the shared ratings are
    identical and larger values mean the users are less alike.

    Args:
        vec1: 1-D numpy array of ratings.
        vec2: 1-D numpy array of ratings.

    Returns:
        The L2 norm of the difference of the shared ratings, or ``None`` when
        the two vectors share no non-zero position.
    """
    shared_1, shared_2 = intersect_nonzero(vec1, vec2)

    # Nothing in common: there is no distance to report.
    if len(shared_1) == 0 and len(shared_2) == 0:
        return None

    difference = shared_1 - shared_2
    return np.linalg.norm(difference)

In [9]:
# Euclidean measure between user_1 and user_2 over the articles both rated.
result = euclidean_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_2"].values,
)
print(result)

4.12310562562


In [10]:
# test code 2
# user_1 and user_5 share no rated article, so the function returns None.
result = euclidean_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_5"].values,
)
print(result)

None


In [14]:
def cosine_similarity(v1, v2):
    """Cosine similarity between two rating vectors over their co-rated items.

    Only positions that are non-zero in both vectors take part.

    Args:
        v1: 1-D numpy array of ratings.
        v2: 1-D numpy array of ratings.

    Returns:
        ``1 - cosine distance`` of the shared ratings, or ``-1`` when the two
        vectors share no non-zero position.
    """
    shared_1, shared_2 = intersect_nonzero(v1, v2)

    # No co-rated item: report -1 as the similarity.
    if len(shared_1) == 0 and len(shared_2) == 0:
        return -1

    cosine_distance = spatial.distance.cosine(shared_1, shared_2)
    return 1 - cosine_distance

In [15]:
# test code 1
# Cosine similarity between user_1 and user_2 over the articles both rated.
result = cosine_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_2"].values
)
print(result)

0.743391941675


In [16]:
# test code 2
# user_1 and user_5 share no rated article, so the function returns -1.
result = cosine_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_5"].values,
)
print(result)

-1


In [20]:
def similarity_matrix(df, similarity_func=cosine_similarity):
    """Score every pair of rows in ``df`` with ``similarity_func``.

    Args:
        df: DataFrame with one row per user.
        similarity_func: callable taking two row value arrays and returning
            a score (or ``None``).

    Returns:
        A square DataFrame indexed and labelled by ``df.index`` whose cell
        ``[a, b]`` is ``similarity_func(row_a, row_b)``. Missing scores are
        replaced with the largest value found in ``df``.
    """
    labels = df.index

    # Full pairwise table, including each row against itself.
    scores = [
        [similarity_func(left.values, right.values) for _, right in df.iterrows()]
        for _, left in df.iterrows()
    ]

    result = pd.DataFrame(scores, index=labels, columns=labels)

    # Pairs without a score are filled with the maximum entry of df.
    return result.fillna(df.max().max())

In [21]:
# Build the user-by-user matrix with the Euclidean measure
sm_df = similarity_matrix(sample_df, euclidean_similarity)   
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,0.0,4.123106,1.414214,2.236068,5.0
user_2,4.123106,0.0,3.741657,6.0,1.0
user_3,1.414214,3.741657,0.0,3.741657,4.123106
user_4,2.236068,6.0,3.741657,0.0,4.242641
user_5,5.0,1.0,4.123106,4.242641,0.0


In [22]:
# Build the user-by-user matrix with cosine similarity (the default similarity_func)
sm_df = similarity_matrix(sample_df)   
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,1.0,0.743392,0.992278,0.937425,-1.0
user_2,0.743392,1.0,0.566947,0.904534,1.0
user_3,0.992278,0.566947,1.0,0.898563,0.843661
user_4,0.937425,0.904534,0.898563,1.0,0.977802
user_5,-1.0,1.0,0.843661,0.977802,1.0


In [24]:
def pred_score(df, sm_df, user,closer_count=2):
    """Predict ``user``'s ratings from the top-scoring other users.

    Args:
        df: user x item rating DataFrame; 0 entries are ignored when averaging.
        sm_df: square user x user score DataFrame whose rows and columns
            include ``user``.
        user: label of the user to predict for.
        closer_count: how many of the highest-scoring other users to use.

    Returns:
        ``(ms_df, pred_df)`` where ``ms_df`` holds the selected users' rows
        of ``df`` and ``pred_df`` has two rows: ``"user"`` (the user's own
        ratings) and ``"pred"`` (per-item mean of the selected users'
        non-zero ratings, 0 when none of them rated the item).

    Note:
        Neighbours are ranked in descending order of ``sm_df[user]``, so the
        largest scores are picked first.
    """
    user_ratings = df.loc[user]

    # Rank every other user by the score in ``user``'s column, largest first.
    ranked = sm_df.drop(user).sort_values(user, ascending=False)
    neighbours = ranked.iloc[:closer_count].index
    ms_df = df.loc[neighbours]

    def nonzero_mean(ratings):
        # Average over the non-zero ratings only; 0 when there are none.
        rated = np.count_nonzero(ratings)
        return sum(ratings) / rated if rated else 0

    mean_vec = [nonzero_mean(item_ratings.values) for _, item_ratings in ms_df.items()]

    pred_df = pd.DataFrame(columns=df.columns)
    pred_df.loc["user"] = user_ratings
    pred_df.loc["pred"] = mean_vec

    return ms_df, pred_df

In [25]:
# Predict user_1's ratings from the current sm_df with the default closer_count of 2.
ms_df, pred_df = pred_score(sample_df, sm_df, "user_1")

In [26]:
ms_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5


In [28]:
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user,5,3,0.0,0,2,3.0
pred,4,0,4.5,4,1,3.5


In [32]:
# Prediction matrices

def pred_matrix(df, similarity_func=cosine_similarity, closer_count=2):
    """Predict ratings for every user and split out the not-yet-rated items.

    Args:
        df: user x item rating DataFrame; 0 means the user has not rated the item.
        similarity_func: pairwise scoring function handed to ``similarity_matrix``.
        closer_count: number of neighbours handed to ``pred_score`` for each user.

    Returns:
        ``(non_zero_df, is_zero_df)``, both shaped and labelled like ``df``:
        ``non_zero_df`` holds the predicted rating for every user/item pair;
        ``is_zero_df`` holds the same predictions with every item the user
        has already rated (non-zero in ``df``) reset to 0.
    """
    sm_df = similarity_matrix(df, similarity_func)

    all_preds = []
    unrated_preds = []

    for user in df.index:
        _, pred_df = pred_score(df, sm_df, user, closer_count)

        # Work on plain float arrays: this avoids the removed Series.nonzero()
        # and chained assignment on a temporary row of pred_df.
        pred = np.asarray(pred_df.loc["pred"], dtype=float)
        already_rated = np.asarray(pred_df.loc["user"], dtype=float) != 0

        all_preds.append(pred)
        # Keep predictions only where the user has no rating yet.
        unrated_preds.append(np.where(already_rated, 0.0, pred))

    non_zero_df = pd.DataFrame(all_preds, columns=df.columns, index=df.index)
    is_zero_df = pd.DataFrame(unrated_preds, columns=df.columns, index=df.index)

    return non_zero_df, is_zero_df

In [33]:
# test code - euclidean_similarity
# non_zero_df: predictions for every item; is_zero_df: predictions only for items the user has not rated.
non_zero_df, is_zero_df = pred_matrix(sample_df, euclidean_similarity) 

In [34]:
non_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,2.0,0.0,1.0,1.5,4.0,1.0
user_2,4.5,3.0,4.0,5.0,2.0,4.0
user_3,2.0,0.0,1.0,1.5,4.0,1.0
user_4,2.0,0.0,1.0,1.5,4.0,1.0
user_5,4.5,3.0,4.0,5.0,2.0,4.0


In [35]:
is_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,1.0,1.5,0.0,0.0
user_2,0.0,3.0,4.0,0.0,0.0,0.0
user_3,2.0,0.0,0.0,0.0,0.0,0.0
user_4,0.0,0.0,0.0,0.0,4.0,0.0
user_5,4.5,3.0,0.0,0.0,2.0,4.0


In [36]:
# test code - cosine_similarity
# Rebinds non_zero_df / is_zero_df using the default cosine similarity.
non_zero_df, is_zero_df = pred_matrix(sample_df) 

In [37]:
non_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,4.0,0.0,4.5,4.0,1.0,3.5
user_2,4.0,0.0,2.5,3.5,0.0,5.0
user_3,4.5,3.0,4.0,5.0,2.0,4.0
user_4,5.0,3.0,1.0,2.0,2.0,3.0
user_5,3.0,0.0,4.0,3.0,4.0,3.0


In [38]:
is_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,4.5,4.0,0.0,0.0
user_2,0.0,0.0,2.5,0.0,0.0,0.0
user_3,4.5,3.0,0.0,0.0,0.0,0.0
user_4,0.0,3.0,0.0,0.0,2.0,0.0
user_5,3.0,0.0,0.0,0.0,4.0,3.0


In [39]:
def recommand_result(target):
    """List the items to recommend to ``target``, highest prediction first.

    Reads the module-level ``is_zero_df`` and keeps only the items whose
    value in ``target``'s row is greater than 0.

    Args:
        target: row label in ``is_zero_df``.

    Returns:
        A list of column labels ordered by descending value.
    """
    ranked = is_zero_df.loc[target].sort_values(ascending=False)
    recommended = ranked[ranked > 0]

    return list(recommended.index)

In [40]:
# Recommendations for the fifth row label of sample_df (user_5).
recommand_result(sample_df.index[4])

['article_5', 'article_6', 'article_1']

In [41]:
# Print the recommendation list for every user in sample_df.
for user in sample_df.index:
    print(user, recommand_result(user))
    

user_1 ['article_3', 'article_4']
user_2 ['article_3']
user_3 ['article_1', 'article_2']
user_4 ['article_2', 'article_5']
user_5 ['article_5', 'article_6', 'article_1']


In [42]:
def remove_zero_datas(value, pred):
    """Drop the positions where ``value`` is zero from both sequences.

    Args:
        value: 1-D array-like of actual ratings (list, numpy array or pandas
            Series); 0 marks an entry to exclude.
        pred: 1-D array-like of predictions, aligned by position with ``value``.

    Returns:
        ``(value, pred, idx)`` as numpy arrays: the kept actual values, the
        predictions at the same positions, and the kept integer positions.
    """
    # Normalise to ndarrays first: pandas >= 1.0 has no Series.nonzero(),
    # and plain lists never had it.
    value = np.asarray(value)
    pred = np.asarray(pred)

    idx = np.nonzero(value)[0]

    return value[idx], pred[idx], idx

In [43]:
def mse(value, pred):
    """Mean squared error over the positions where ``value`` is non-zero.

    Args:
        value: actual ratings; zero entries are excluded.
        pred: predicted ratings aligned with ``value``.

    Returns:
        Sum of squared differences divided by the number of kept positions.
    """
    actual, predicted, kept = remove_zero_datas(value, pred)

    squared_errors = (actual - predicted) ** 2
    return sum(squared_errors) / len(kept)

# MSE between the "user" and "pred" rows of the current pred_df.
mse(pred_df.loc["user"], pred_df.loc["pred"])

2.8125

In [44]:
def rmse(value, pred):
    """Root mean squared error over the positions where ``value`` is non-zero.

    Args:
        value: actual ratings; zero entries are excluded.
        pred: predicted ratings aligned with ``value``.

    Returns:
        Square root of the mean squared difference over the kept positions.
    """
    actual, predicted, kept = remove_zero_datas(value, pred)

    squared_errors = (actual - predicted) ** 2
    mean_squared = sum(squared_errors) / len(kept)
    return np.sqrt(mean_squared)

# RMSE between the "user" and "pred" rows of the current pred_df.
rmse(pred_df.loc["user"], pred_df.loc["pred"])


1.6770509831248424

In [51]:
def mae(value, pred):
    """Mean absolute error over the positions where ``value`` is non-zero.

    Args:
        value: actual ratings; zero entries are excluded.
        pred: predicted ratings aligned with ``value``.

    Returns:
        Sum of the absolute differences, each divided by the number of kept
        positions before summing.
    """
    actual, predicted, kept = remove_zero_datas(value, pred)

    # Each term is scaled first and then summed, as in the original formula.
    scaled_errors = np.absolute(actual - predicted) / len(kept)
    return sum(scaled_errors)

# MAE between the "user" and "pred" rows of the current pred_df.
mae(pred_df.loc["user"], pred_df.loc["pred"])

1.375

In [52]:
def evaluate(df, closer_count=2, similarity_func=cosine_similarity):
    """Average the prediction error over all users for each metric.

    For every user, ratings are predicted with ``pred_score`` and compared
    with the user's own ratings using ``mse``, ``rmse`` and ``mae``.

    Args:
        df: user x item rating DataFrame.
        closer_count: number of neighbours used for each prediction.
        similarity_func: pairwise scoring function handed to ``similarity_matrix``.

    Returns:
        Dict mapping each metric's function name (``"mse"``, ``"rmse"``,
        ``"mae"``) to its mean value across the users of ``df``.
    """
    users = df.index
    sm_df = similarity_matrix(df, similarity_func)

    # The predictions do not depend on the metric, so build them once per
    # user instead of once per (metric, user) pair.
    pred_frames = [pred_score(df, sm_df, user, closer_count)[1] for user in users]

    evaluate_results = {}
    for algorithm in (mse, rmse, mae):
        scores = [
            algorithm(pred_df.loc["user"], pred_df.loc["pred"])
            for pred_df in pred_frames
        ]
        evaluate_results[algorithm.__name__] = sum(scores) / len(users)

    return evaluate_results

In [53]:
# test code 1
# closer_count : 1, similarity : cosine
# Mean MSE / RMSE / MAE across all users using a single neighbour.
evaluate(sample_df, 1)

{'mse': 7.85, 'rmse': 2.6371561507692038, 'mae': 2.35}

In [54]:
# test code 2
# closer_count : 2, similarity : cosine
# Mean MSE / RMSE / MAE across all users using two neighbours.
evaluate(sample_df, 2)

{'mse': 5.325, 'rmse': 2.2284347104730364, 'mae': 2.05}

In [55]:
# test code 3
# closer_count : 1, similarity : euclidean
# NOTE(review): euclidean_similarity returns a distance (larger = less alike), while
# pred_score ranks neighbours in descending order - confirm the intended neighbours are used.
evaluate(sample_df, 1, euclidean_similarity)

{'mse': 9.15, 'rmse': 2.913055312224881, 'mae': 2.75}

In [56]:
# test code 4
# closer_count : 2, similarity : euclidean
# NOTE(review): euclidean_similarity returns a distance (larger = less alike), while
# pred_score ranks neighbours in descending order - confirm the intended neighbours are used.
evaluate(sample_df, 2, euclidean_similarity)

{'mse': 8.3375, 'rmse': 2.8773879752811631, 'mae': 2.775}