#### Dataframe
- sample_df : 샘플 데이터
- sm_df : 유사도
- closer_count : 유사한 사용자 평균
- ms_df : 유사도 평균값
- pred_df : 예측값
- recommend_df : 추천결과

##### Process
1. 샘플데이터
2. 유사도 행렬
3. 유사도 평균값 행렬
4. 예측 행렬
5. 기사 추천 리스트
6. 성능측정

In [1]:
import numpy as np
import pandas as pd
from scipy import spatial

### 1. 샘플데이터

In [2]:
# 샘플 데이터에 대한 인덱스와 컬럼을 정의
columns = ["article_1","article_2","article_3","article_4","article_5","article_6"]
index = ["user_1", "user_2", "user_3", "user_4", "user_5"]

# 샘플 데이터에 대한 행렬을 만듦
data = np.array([
    [5,3,0,0,2,3],
    [2,0,0,1,4,1],
    [0,0,5,3,1,2],
    [4,0,4,5,0,5],
    [0,0,1,2,0,0],
])

# 샘플 데이터 프레임을 만듦
sample_df = pd.DataFrame(data, columns=columns, index=index)
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,5,3,0,0,2,3
user_2,2,0,0,1,4,1
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5
user_5,0,0,1,2,0,0


### 2. 유사도 행렬

- 0이 아닌 벡터 값에 대한 인덱스 교집합

In [3]:
def intersect_nonzero(vector_1, vector_2):
    
    # vector_1과 vector_2에서 0이 아닌 위치 값을 구함
    idx = np.intersect1d(vector_1.nonzero(), vector_2.nonzero())
    
    # vector_1과 vector_2에서 0 아닌 위치값에 있는 데이터를 대입
    vector_1 = vector_1[idx]
    vector_2 = vector_2[idx]
    
    return vector_1, vector_2

In [4]:
# test code 1
intersect_nonzero(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_2"].values,
)

(array([5, 2, 3]), array([2, 4, 1]))

In [5]:
# test code 2
intersect_nonzero(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_5"].values,
)

(array([], dtype=int64), array([], dtype=int64))

- 유클리디안 거리 유사도

In [6]:
def euclidean_similarity(vector_1, vector_2):  
    
    # 0이 아닌 벡터데이터 추출
    vector_1, vector_2 = intersect_nonzero(vector_1, vector_2)
    
    # vector_1이나 vector_2의 데이터가 없을때 None을 return
    if not(len(vector_1) or len(vector_2)):
        return None
    
    # 유클리디안 거리 유사도 리턴
    return np.linalg.norm(vector_1 - vector_2)

In [7]:
# test code 1
result = euclidean_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_2"].values,
)
print(result)

4.123105625617661


In [8]:
# test code 2
result = euclidean_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_5"].values,
)
print(result)

None


- 코사인 유사도

In [9]:
def cosine_similarity(vector_1, vector_2):
    
    # 0이 아닌 벡터데이터 추출
    vector_1, vector_2 = intersect_nonzero(vector_1, vector_2)
    
    # vector_1이나 vector_2의 데이터가 없을때 가장 먼 거리 값이 -1을 대입
    if not(len(vector_1) or len(vector_2)):
        return -1
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [10]:
# test code 1
result = cosine_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_2"].values,
)
print(result)

0.7433919416750282


In [11]:
# test code 2
result = cosine_similarity(
    sample_df.loc["user_1"].values, 
    sample_df.loc["user_5"].values,
)
print(result)

-1


- 유사도 행렬 함수 - similarity matrix

In [12]:
def similarity_matrix(sample_df, similarity_func=cosine_similarity):

    matrix = []
    
    for idx1, row1 in sample_df.iterrows():
        row = []
        for idx2, row2 in sample_df.iterrows():
            row.append(similarity_func(row1.values, row2.values))
        matrix.append(row)
    
    sm_df = pd.DataFrame(matrix, columns=sample_df.index, index=sample_df.index) 
    
    # NaN 값은 최대 값으로 채움
    sm_df.fillna(sm_df.max().max(), inplace=True)
    
    return sm_df

In [13]:
# 유클리디안 거리 유사도로 행렬 구함
sm_df = similarity_matrix(sample_df, euclidean_similarity)   
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,0.0,4.123106,1.414214,2.236068,6.0
user_2,4.123106,0.0,3.741657,6.0,1.0
user_3,1.414214,3.741657,0.0,3.741657,4.123106
user_4,2.236068,6.0,3.741657,0.0,4.242641
user_5,6.0,1.0,4.123106,4.242641,0.0


In [14]:
# 코사인 유사도로 행렬 구함
sm_df = similarity_matrix(sample_df)   
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,1.0,0.743392,0.992278,0.937425,-1.0
user_2,0.743392,1.0,0.566947,0.904534,1.0
user_3,0.992278,0.566947,1.0,0.898563,0.843661
user_4,0.937425,0.904534,0.898563,1.0,0.977802
user_5,-1.0,1.0,0.843661,0.977802,1.0


### 3. 유사도 평균값 행렬 
- ms_df : mean score  
- pred_df : prediction

In [15]:
def pred_score(sample_df, sm_df, user, closer_count=2):
    """
    returns : ms_df : mean score dateframe
              pred_df : prediction dataframe
    """
    
    # user 데이터 백업
    user_vec = sample_df.loc[user]
    
    # user 데이터 제거
    ms_df = sm_df.drop(user)
    
    # 유사도 순으로 정렬
    ms_df = ms_df.sort_values(user, ascending=False)
    
    # 유사도가 가까운 샘플만 남기기
    ms_df = ms_df[:closer_count]
    
    # 유사도가 가까운 샘플
    ms_df = sample_df.loc[ms_df.index]

    # 유사도가 가까운 샘플의 평균
    # 컬럼 데이터를 합쳐서 나눠줄때 0인 데이터는 제외
    mean_vec = []
    for idx, column in ms_df.items():

        # 0이 아닌 컬럼 데이터의 갯수 구하기
        non_zero_count = len(np.nonzero(column.values)[0])
        
        # 컬럼 값을 다 더해서 0이 아닌 컬럼의 갯수로 나눠 평균을 구함
        # 0이 아닌 컬럼의 데이터 갯수가 0개이면 0을 리턴
        mean = 0 if non_zero_count == 0 else sum(column.values) / non_zero_count
        
        # mean_vec에 평균값 추가하기
        mean_vec.append(mean)
        
    pred_df = pd.DataFrame(columns=sample_df.columns)
    pred_df.loc["user"] = sample_df.loc[user]
    pred_df.loc["pred"] = mean_vec
    
    return ms_df, pred_df

In [16]:
ms_df, pred_df = pred_score(sample_df, sm_df, "user_1")

In [17]:
ms_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5


In [18]:
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user,5,3,0.0,0,2,3.0
pred,4,0,4.5,4,1,3.5


### 4. 예측 행렬
- 모든 User에 대한 예측 행렬 구하기

In [19]:
def pred_matrix(sample_df, similarity_func=cosine_similarity):
    """
    non_zero_df : not removed zero datas
    is_zero_df : removed zero datas
    """
    
    # 유사도 데이터 프레임
    sm_df = similarity_matrix(sample_df, similarity_func)
    
    # 사용자 리스트
    users = sample_df.index
    
    # 예측 매트릭스의 벡터 데이터
    pred_vecs_1 = []
    pred_vecs_2 = []
    
    # 사용자 별로 예측 벡터 를 만들어 pred_vecs에 저장
    for user in users:
        
        # 한 사용자에 대한 예측 벡터
        ms_df, pred_df = pred_score(sample_df, sm_df, user)
        
        # 예측 벡터를 깊은 복사하여 pred_vecs_1 변수에 저장
        pred_vecs_1.append(pred_df.loc["pred"].copy())
        
        # 사용자 벡터에서 0이 아닌 위치 값의 데이터를 예측 벡터에서 제거
        # 사용자 벡터에서 0이 아닌 데이터는 이미 컨텐츠를 소비한 데이터 이므로 추천에서 제거하기 위함
        idx = pred_df.loc["user"].nonzero()[0]
        pred_df.loc["pred"][idx] = 0
        
        # 예측 벡터를 pred_vecs_2 변수에 저장
        pred_vecs_2.append(pred_df.loc["pred"])
    
    # pred_vecs를 이용하여 데이터 프레임으로 만듦
    non_zero_df = pd.DataFrame(pred_vecs_1, columns=sample_df.columns, index=sample_df.index) 
    is_zero_df = pd.DataFrame(pred_vecs_2, columns=sample_df.columns, index=sample_df.index)
    
    return non_zero_df, is_zero_df

In [20]:
# test code - euclidean_similarity
non_zero_df, is_zero_df = pred_matrix(sample_df, euclidean_similarity) 

In [21]:
non_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,2.0,0.0,1.0,1.5,4.0,1.0
user_2,4.5,3.0,4.0,5.0,2.0,4.0
user_3,2.0,0.0,1.0,1.5,4.0,1.0
user_4,2.0,0.0,1.0,1.5,4.0,1.0
user_5,4.5,3.0,4.0,5.0,2.0,4.0


In [22]:
is_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,1.0,1.5,0.0,0.0
user_2,0.0,3.0,4.0,0.0,0.0,0.0
user_3,2.0,0.0,0.0,0.0,0.0,0.0
user_4,0.0,0.0,0.0,0.0,4.0,0.0
user_5,4.5,3.0,0.0,0.0,2.0,4.0


In [23]:
# test code - cosine_similarity
non_zero_df, is_zero_df = pred_matrix(sample_df) 

In [24]:
non_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,4.0,0.0,4.5,4.0,1.0,3.5
user_2,4.0,0.0,2.5,3.5,0.0,5.0
user_3,4.5,3.0,4.0,5.0,2.0,4.0
user_4,5.0,3.0,1.0,2.0,2.0,3.0
user_5,3.0,0.0,4.0,3.0,4.0,3.0


In [25]:
is_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,4.5,4.0,0.0,0.0
user_2,0.0,0.0,2.5,0.0,0.0,0.0
user_3,4.5,3.0,0.0,0.0,0.0,0.0
user_4,0.0,3.0,0.0,0.0,2.0,0.0
user_5,3.0,0.0,0.0,0.0,4.0,3.0


### 5. 기사 추천 리스트 구하기

In [26]:
def recommand_result(target):
    
    # 데이터 프레임의 row 데이터를 소팅
    # 0을 초과 하는 데이터를 비교연산
    idx = is_zero_df.loc[target].sort_values(ascending=False) > 0
    
    # 0을 초과하는 데이터에 대한 index 값을 리스트로 리턴
    return list(idx[idx == True].index)

In [27]:
# test code - one user
recommand_result(sample_df.index[4])

['article_5', 'article_6', 'article_1']

In [28]:
# test code - all user
for user in sample_df.index:
    print(user, recommand_result(user))

user_1 ['article_3', 'article_4']
user_2 ['article_3']
user_3 ['article_1', 'article_2']
user_4 ['article_2', 'article_5']
user_5 ['article_5', 'article_6', 'article_1']


### 6. 성능 측정

In [29]:
def remove_zero_datas(value, pred):
    
    # user 데이터에서 0인 데이터 제거
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    return value, pred, idx

- MSE : Mean Squared Error

- $ {\displaystyle MSE = \frac{1}{n}\sum_{i=1}^n{(Y_i-\hat{Y}_i)^2} } $

- mean : $ {\displaystyle {\frac {1}{n}}\sum _{i=1}^{n} } $

squares of the errors : $ {\displaystyle (Y_i-\hat{Y}_i)^{2}}$

In [30]:
# 한명의 user에 대한 MSE 값
def mse(value, pred):
    
    # user 데이터에서 0인 데이터 제거
    value, pred, idx = remove_zero_datas(value, pred)
    
    # 수식 계산후 결과 리턴
    return sum((value - pred)**2) / len(idx)

# test code
mse(pred_df.loc["user"], pred_df.loc["pred"])

2.8125

- RMSE : Root Mean Square Error

- $ {\displaystyle RMSE = \sqrt {\frac{1}{n}\sum_{i=1}^n{(Y_i-\hat{Y}_i)^2}} } $

In [31]:
# 한명의 user에 대한 RMSE 값
def rmse(value, pred):

    # user 데이터에서 0인 데이터 제거
    value, pred, idx  = remove_zero_datas(value, pred)
    
    # 수식 계산후 결과 리턴
    return np.sqrt(sum((value - pred)**2) / len(idx))

# test code
rmse(pred_df.loc["user"], pred_df.loc["pred"])

1.6770509831248424

- MAE : Mean Absolute Error

- ${\displaystyle MAE ={\frac {\sum _{i=1}^{n} |y_{i}-x_{i}|}{n}} }$

In [32]:
# 한명의 user에 대한 MAE 값
def mae(value, pred):
    
    # user 데이터에서 0인 데이터 제거
    value, pred, idx  = remove_zero_datas(value, pred)

    # 수식 계산후 결과 리턴
    return sum(np.absolute(value - pred)) / len(idx)

# test code
mae(pred_df.loc["user"], pred_df.loc["pred"])

1.375

- Evaluate

In [33]:
# 전체 user에 대한 평가
def evaluate(sample_df, closer_count=2, similarity_func=cosine_similarity):
    
    # user 리스트
    users = sample_df.index
    evaluate_results = {}
    
    # 유사도 행렬
    sm_df = similarity_matrix(sample_df, similarity_func)
    
    # 모든 user에 대해서 mae 값을 구함
    # 유사도 행렬 함수
    algorithms = [mse,rmse, mae]
    for algorithm in algorithms:
        evaluate_list = []
        for user in users:
            ms_df, pred_df = pred_score(sample_df, sm_df, user, closer_count)
            evaluate_value = algorithm(pred_df.loc["user"], pred_df.loc["pred"])
            evaluate_list.append(evaluate_value)
        
        # evaluate_results에 결과 데이터 저장     
        evaluate_results[algorithm.__name__] = sum(evaluate_list) / len(users)
        
    return evaluate_results

In [34]:
# test code 1
# closer_count : 1, similarity : cosine
evaluate(sample_df, 1)

{'mae': 2.35, 'mse': 7.85, 'rmse': 2.637156150769204}

In [35]:
# test code 2
# closer_count : 2, similarity : cosine
evaluate(sample_df, 2)

{'mae': 2.05, 'mse': 5.325, 'rmse': 2.2284347104730364}

In [36]:
# test code 3
# closer_count : 1, similarity : euclidean
evaluate(sample_df, 1, euclidean_similarity)

{'mae': 2.75, 'mse': 9.15, 'rmse': 2.913055312224881}

In [37]:
# test code 4
# closer_count : 2, similarity : euclidean
evaluate(sample_df, 2, euclidean_similarity)

{'mae': 2.775, 'mse': 8.3375, 'rmse': 2.877387975281163}

#### 6. Process

- 샘플 데이터

In [38]:
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,5,3,0,0,2,3
user_2,2,0,0,1,4,1
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5
user_5,0,0,1,2,0,0


- 유사도 행렬

In [39]:
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,1.0,0.743392,0.992278,0.937425,-1.0
user_2,0.743392,1.0,0.566947,0.904534,1.0
user_3,0.992278,0.566947,1.0,0.898563,0.843661
user_4,0.937425,0.904534,0.898563,1.0,0.977802
user_5,-1.0,1.0,0.843661,0.977802,1.0


- 예측 행렬

In [40]:
non_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,4.0,0.0,4.5,4.0,1.0,3.5
user_2,4.0,0.0,2.5,3.5,0.0,5.0
user_3,4.5,3.0,4.0,5.0,2.0,4.0
user_4,5.0,3.0,1.0,2.0,2.0,3.0
user_5,3.0,0.0,4.0,3.0,4.0,3.0


- 추천 행렬

In [41]:
is_zero_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,4.5,4.0,0.0,0.0
user_2,0.0,0.0,2.5,0.0,0.0,0.0
user_3,4.5,3.0,0.0,0.0,0.0,0.0
user_4,0.0,3.0,0.0,0.0,2.0,0.0
user_5,3.0,0.0,0.0,0.0,4.0,3.0


- 추천

In [42]:
recommand_result(sample_df.index[4])

['article_5', 'article_6', 'article_1']

- 평가 지표

In [43]:
evaluate(sample_df)

{'mae': 2.05, 'mse': 5.325, 'rmse': 2.2284347104730364}