### Make Class
- DssSm : 유사도 클래스
- DssEvaluate : 성능평가 클래스
- DssRecommend : 추천 클래스

### structure


- DssSm

    - `intersect_nonzeroc` : 두개의 벡터에서 0이 아닌 index를 찾고 index에 해당하는 데이터만 남김
    - `euclidean_similarity` : 두벡터의 유클리디안 거리 유사도를 구함 (두 벡터중에서 0인 index 데이터는 제거)
    - `cosine_similarity` : 두벡터의 코사인 유사도를 구함 (두 벡터중에서 0인 index 데이터는 제거)
    
    
- DssEvaluate

    - `__preprocessing` : 샘플 데이터 프레임과 예측 데이터 프레임에서 0인 데이터를 제외하고 계산하도록 전처리
    - `mse` : mse 값을 구함
    - `rmse` : rmse 값을 구함
    - `mae` : mae 값을 구함
    
    
- DssRecommend(DssSm, DssEvaluate)

    - `similarity_matrix` : 유사도 매트릭스를 구함
    - `__pred_score` : 한 user에 대한 예측 벡터를 구함
    - `pred_matrix` : 전체 user에 대한 예측 매트릭스를 만듦
    - `recommand_matrix` : 예측 데이터가 순서대로 들어있는 데이터 프레임을 만듦
    - `recommand_user` : 사용자를 넣으면 사용자에 맞는 컨텐츠를 추천순으로 출력
    - `auto` : 위의 과정을 자동으로 수행
    - `__repr__` : 객체 정보를 보여줌

In [1]:
import numpy as np
import pandas as pd
from scipy import spatial

In [2]:
columns = ["article_1","article_2","article_3","article_4","article_5","article_6"]
index = ["user_1", "user_2", "user_3", "user_4", "user_5"]

data = np.array([
    [5,3,0,0,2,3],
    [2,0,0,1,4,1],
    [0,0,5,3,1,2],
    [4,0,4,5,0,5],
    [0,0,1,2,0,0],
])

sample_df = pd.DataFrame(data, columns=columns, index=index)
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,5,3,0,0,2,3
user_2,2,0,0,1,4,1
user_3,0,0,5,3,1,2
user_4,4,0,4,5,0,5
user_5,0,0,1,2,0,0


In [3]:
class DssSm:
    
    def intersect_nonzero(self, vector_1, vector_2):
        idx = np.intersect1d(vector_1.nonzero(), vector_2.nonzero())
        vector_1 = vector_1[idx]
        vector_2 = vector_2[idx]
        return vector_1, vector_2
    
    def euclidean_similarity(self, vector_1, vector_2):  
        vector_1, vector_2 = self.intersect_nonzero(vector_1, vector_2)
        if not(len(vector_1) or len(vector_2)):
            return None
        return np.linalg.norm(vector_1 - vector_2)
    
    def cosine_similarity(self, vector_1, vector_2):
        vector_1, vector_2 = self.intersect_nonzero(vector_1, vector_2)
        if not(len(vector_1) or len(vector_2)):
            return -1
        return 1 - spatial.distance.cosine(vector_1, vector_2)

In [4]:
# test code
DS = DssSm()
v1, v2 = sample_df.loc["user_1"].values, sample_df.loc["user_2"].values
print(v1, v2)
DS.cosine_similarity(v1, v2)

[5 3 0 0 2 3] [2 0 0 1 4 1]


0.7433919416750282

In [5]:
class DssEvaluate:

    # preprocessing - filtering prediction able datas 
    def __preprocessing(self, sample, pred):
        zero_matrix = np.logical_and(sample != 0, pred != 0)
        counts = np.sum(zero_matrix, axis=1)
        c_sample = sample.copy()
        c_pred = pred.copy()
        c_sample[zero_matrix == False] = 0
        c_pred[zero_matrix == False] = 0
        return c_sample, c_pred, counts
        
    def mse(self, sample, pred):
        sample, pred, counts = self.__preprocessing(sample, pred)
        return np.average(((sample - pred) ** 2).sum(axis=1) / counts)
    
    def rmse(self, sample, pred):
        sample, pred, counts = self.__preprocessing(sample, pred)
        return np.average(np.sqrt(((sample - pred) ** 2).sum(axis=1)) / counts)
    
    def mae(self, sample, pred):
        sample, pred, counts = self.__preprocessing(sample, pred)
        return np.average(np.absolute(sample - pred).sum(axis=1) / counts) 

In [6]:
class DssRecommend(DssSm, DssEvaluate):
    
    def __init__(self, sample_df):
        self.sample_df = sample_df    
        self.pred_df = None
        self.evaluate_df = None
        self.rm_df = None
        self.is_pred = False
        self.is_evaluate = False
        self.is_rm = False
        
    def similarity_matrix(self, similarity="cosin"):
        
        if similarity == "cosin":
            similarity_func = self.cosine_similarity
        elif similarity == "euclidean":
            similarity_func = self.euclidean_similarity
            
        matrix = []

        for idx1, row1 in self.sample_df.iterrows():
            row = []
            for idx2, row2 in self.sample_df.iterrows():
                row.append(similarity_func(row1.values, row2.values))
            matrix.append(row)

        sm_df = pd.DataFrame(matrix, columns=self.sample_df.index, index=self.sample_df.index) 
        sm_df.fillna(sm_df.max().max(), inplace=True)
        
        return sm_df
    
    def __pred_score(self, sm_df, user, closer_count):

        user_vec = self.sample_df.loc[user]

        ms_df = sm_df.drop(user)
        ms_df = ms_df.sort_values(user, ascending=False)
        ms_df = ms_df[:closer_count]
        ms_df = self.sample_df.loc[ms_df.index]

        mean_vec = []
        for idx, column in ms_df.items():
            non_zero_count = len(np.nonzero(column.values)[0])
            mean = 0 if non_zero_count == 0 else sum(column.values) / non_zero_count
            mean_vec.append(mean)

        pred_df = pd.DataFrame(columns=self.sample_df.columns)
        pred_df.loc["user"] = self.sample_df.loc[user]
        pred_df.loc["pred"] = mean_vec

        return pred_df

      
    def pred_matrix(self, similarity="cosin", closer_count=2):
        
        sm_df = self.similarity_matrix(similarity)
        users = self.sample_df.index
        
        pred_vecs_1 = []
        pred_vecs_2 = []

        for user in users:
            pred_df = self.__pred_score(sm_df, user, closer_count)
            pred_vecs_1.append(pred_df.loc["pred"].copy())
            idx = pred_df.loc["user"].nonzero()[0]
            pred_df.loc["pred"][idx] = 0
            pred_vecs_2.append(pred_df.loc["pred"])
            
        non_zero_df = pd.DataFrame(pred_vecs_1, columns=self.sample_df.columns, index=self.sample_df.index) 
        is_zero_df = pd.DataFrame(pred_vecs_2, columns=self.sample_df.columns, index=self.sample_df.index)
        
        self.evaluate_df = non_zero_df
        self.pred_df = is_zero_df
        self.is_pred = True
        self.is_evaluate = True
        
        return non_zero_df, is_zero_df
    
    def recommand_matrix(self):
        
        def recommand_result(user):
            idx = self.pred_df.loc[user].sort_values(ascending=False) > 0
            return list(idx[idx == True].index)
        
        recommand_dict = {}
        for user in self.sample_df.index:
            recommand_dict[user] = str(recommand_result(user))[1:-1].replace("'","")
        
        self.rm_df = pd.DataFrame(recommand_dict, index=["recommend"]).T
        self.is_rm = True
            
        return self.rm_df
    
    def recommand_user(self, user):
        return self.rm_df.loc[user].values[0].split(",")        

    def auto(self, similarity="cosin", closer_count=2):
        self.pred_matrix(similarity, closer_count)
        self.recommand_matrix()
        
    def evaluate(self):
        return {
            "mse": self.mse(self.sample_df, self.evaluate_df),
            "rmse": self.rmse(self.sample_df, self.evaluate_df),
            "mae": self.mae(self.sample_df, self.evaluate_df),
        }
        
    def __repr__(self):
        return "<DssRecommend sample_df:{}, pred_df:{}, evaluate_df:{}, rm_df:{}>".format(
            len(self.sample_df), self.is_pred, self.is_evaluate, self.is_rm
        )

In [7]:
DR = DssRecommend(sample_df)
DR

<DssRecommend sample_df:5, pred_df:False, evaluate_df:False, rm_df:False>

In [8]:
euclidean_sm = DR.similarity_matrix("euclidean")
euclidean_sm

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,0.0,4.123106,1.414214,2.236068,6.0
user_2,4.123106,0.0,3.741657,6.0,1.0
user_3,1.414214,3.741657,0.0,3.741657,4.123106
user_4,2.236068,6.0,3.741657,0.0,4.242641
user_5,6.0,1.0,4.123106,4.242641,0.0


In [9]:
cosin_sm = DR.similarity_matrix("cosin")
cosin_sm

Unnamed: 0,user_1,user_2,user_3,user_4,user_5
user_1,1.0,0.743392,0.992278,0.937425,-1.0
user_2,0.743392,1.0,0.566947,0.904534,1.0
user_3,0.992278,0.566947,1.0,0.898563,0.843661
user_4,0.937425,0.904534,0.898563,1.0,0.977802
user_5,-1.0,1.0,0.843661,0.977802,1.0


In [10]:
pred_df_1, pred_df_2 = DR.pred_matrix()

In [11]:
pred_df_1

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,4.0,0.0,4.5,4.0,1.0,3.5
user_2,4.0,0.0,2.5,3.5,0.0,5.0
user_3,4.5,3.0,4.0,5.0,2.0,4.0
user_4,5.0,3.0,1.0,2.0,2.0,3.0
user_5,3.0,0.0,4.0,3.0,4.0,3.0


In [12]:
pred_df_2

Unnamed: 0,article_1,article_2,article_3,article_4,article_5,article_6
user_1,0.0,0.0,4.5,4.0,0.0,0.0
user_2,0.0,0.0,2.5,0.0,0.0,0.0
user_3,4.5,3.0,0.0,0.0,0.0,0.0
user_4,0.0,3.0,0.0,0.0,2.0,0.0
user_5,3.0,0.0,0.0,0.0,4.0,3.0


In [13]:
rm_df = DR.recommand_matrix()
rm_df

Unnamed: 0,recommend
user_1,"article_3, article_4"
user_2,article_3
user_3,"article_1, article_2"
user_4,"article_2, article_5"
user_5,"article_5, article_6, article_1"


In [14]:
DR.recommand_user("user_5")

['article_5', ' article_6', ' article_1']

In [15]:
DR = DssRecommend(sample_df)
DR.auto()

In [16]:
DR

<DssRecommend sample_df:5, pred_df:True, evaluate_df:True, rm_df:True>

In [17]:
DR.evaluate()

{'mae': 1.8833333333333335, 'mse': 4.55, 'rmse': 1.1556982507228795}