# Simple User-based Nearest Neighbor
Berdasarkan artikel: https://blog.ariflaksito.net/2021/07/memahami-collaborative-filtering-di.html

In [1]:
import pandas as pd

In [2]:
# Load the raw rating dump as [user, item, rating] records.
# Each user's ratings are listed in item order (Item_1, Item_2, ...);
# Arif has no Item_5 rating, so his list has only four entries.
ratings_by_user = [
    ('Arif',  [5, 4, 1, 4]),
    ('Bob',   [3, 1, 2, 3, 3]),
    ('Clark', [4, 3, 4, 3, 5]),
    ('Don',   [3, 3, 1, 5, 4]),
]
data = [
    [user, 'Item_{}'.format(position), rating]
    for user, user_ratings in ratings_by_user
    for position, rating in enumerate(user_ratings, start=1)
]
ratings = pd.DataFrame(data, columns=['User', 'Item', 'Rating'])

In [3]:
# Display the long-format ratings table (one row per user/item rating)
ratings

Unnamed: 0,User,Item,Rating
0,Arif,Item_1,5
1,Arif,Item_2,4
2,Arif,Item_3,1
3,Arif,Item_4,4
4,Bob,Item_1,3
5,Bob,Item_2,1
6,Bob,Item_3,2
7,Bob,Item_4,3
8,Bob,Item_5,3
9,Clark,Item_1,4


In [4]:
# Reshape the long ratings table into a user x item matrix.
# Ratings a user has not given come out of the pivot as NaN and are
# replaced with 0 here.
rating_matrix = (
    ratings
    .pivot_table(values='Rating', index='User', columns='Item')
    .fillna(0)
)
rating_matrix

Item,Item_1,Item_2,Item_3,Item_4,Item_5
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arif,5.0,4.0,1.0,4.0,0.0
Bob,3.0,1.0,2.0,3.0,3.0
Clark,4.0,3.0,4.0,3.0,5.0
Don,3.0,3.0,1.0,5.0,4.0


In [5]:
# Mean rating of each user, taken over the first four item columns only
# (the last column is left out of the average).
first_four_items = rating_matrix.iloc[:, :4]
user_mean = first_four_items.mean(axis=1)
user_mean

User
Arif     3.50
Bob      2.25
Clark    3.50
Don      3.00
dtype: float64

In [6]:
# Build a mean-centred matrix: each user's rating minus that user's mean,
# again restricted to the first four item columns.
rated_items = rating_matrix.iloc[:, :4]
new_rating_matrix = rated_items.sub(user_mean, axis='index')
new_rating_matrix

Item,Item_1,Item_2,Item_3,Item_4
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arif,1.5,0.5,-2.5,0.5
Bob,0.75,-1.25,-0.25,0.75
Clark,0.5,-0.5,0.5,-0.5
Don,0.0,0.0,-2.0,2.0


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import operator

# Find the users most similar to a given user
def similar_users(user_id, matrix, k=2):
    """Return the ``k`` users whose rows are most cosine-similar to ``user_id``.

    Parameters
    ----------
    user_id : label
        Index label of the target user in ``matrix``.
    matrix : pandas.DataFrame
        One row per user (indexed by user label); rows are compared with
        ``cosine_similarity``.
    k : int, default 2
        Maximum number of neighbours to return.

    Returns
    -------
    (users, scores) : tuple of two lists
        Index labels of the selected users and their similarity to
        ``user_id``, both ordered from most to least similar. Both lists are
        empty when ``matrix`` contains no user other than ``user_id``.

    Raises
    ------
    ValueError
        If ``user_id`` does not appear in ``matrix.index``.
    """
    is_target = matrix.index == user_id
    if not is_target.any():
        # Fail early with a readable message instead of handing an empty
        # row to cosine_similarity.
        raise ValueError(
            "user_id {!r} not found in matrix index".format(user_id)
        )

    user = matrix[is_target]
    other_users = matrix[~is_target]
    if len(other_users) == 0:
        # Nobody to compare against.
        return [], []

    similarities = cosine_similarity(user, other_users)[0].tolist()
    indices = other_users.index.tolist()

    # Keep (label, score) pairs in a list rather than a dict so repeated
    # index labels are not silently collapsed into one entry.
    index_similarity = list(zip(indices, similarities))

    # Sort ascending, then reverse (instead of reverse=True) so that users
    # with tied scores keep the same relative order as before.
    index_similarity_sorted = sorted(index_similarity, key=operator.itemgetter(1))
    index_similarity_sorted.reverse()

    top_users_similarities = index_similarity_sorted[:k]
    users = [label for label, _ in top_users_similarities]
    scores = [score for _, score in top_users_similarities]

    return users, scores

In [10]:
# The two users closest to Arif on the mean-centred ratings, most similar first
similarity = similar_users(user_id='Arif', matrix=new_rating_matrix, k=2)
similarity

(['Don', 'Bob'], [0.7071067811865476, 0.30151134457776363])

In [13]:
import numpy as np

def predict(ratings, similarity, k_item=4):
    """Predict last-column ratings from similarity-weighted rating deviations.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User x item matrix. The first ``k_item`` columns are averaged to get
        each user's mean rating; the last column is the item being predicted.
    similarity : array-like, shape (n_users, n_users)
        User-user similarity matrix whose rows follow the row order of
        ``ratings``. Only its first column (similarity to the first user) is
        used.
    k_item : int, default 4
        Number of leading item columns used for the per-user mean.

    Returns
    -------
    numpy.ndarray, shape (n_users, 1)
        One value per row of ``ratings``.

    Notes
    -----
    NOTE(review): the weights always come from the first column of
    ``similarity``, and the first user's own term is cancelled by adding the
    mean back and subtracting 1 from the weight sum. That cancellation is
    exact only for the first row, and only when its last-column entry is 0
    and its self-similarity is 1. The remaining rows reuse the first user's
    weights and should presumably not be read as predictions for those
    users; confirm before relying on them.
    """
    sim = np.asarray(similarity)

    # Work on plain NumPy arrays: indexing a pandas Series with
    # [:, np.newaxis] is deprecated and raises on recent pandas versions.
    mean_user_rating = ratings.iloc[:, :k_item].mean(axis=1).to_numpy()
    mean_column = mean_user_rating[:, np.newaxis]          # (n_users, 1)

    # Deviation of each user's last-column rating from that user's mean.
    rating_diff = ratings.iloc[:, -1].to_numpy() - mean_user_rating

    # Similarity of every user to the first user, as a (1, n_users) row.
    fsim = sim[:, :1].T
    dot_rating_diff = fsim.dot(rating_diff) + mean_column

    # The weight sum minus 1 drops the first user's self-similarity; it is
    # zero (division by zero) if the absolute similarities sum to exactly 1.
    pred = mean_column + dot_rating_diff / (np.abs(fsim).sum(axis=1) - 1)

    return pred

In [14]:
# Predict without Clark's ratings, because he does not make the k-neighbour cut
ratings_without_clark = rating_matrix.drop('Clark')
similarity_without_clark = cosine_similarity(new_rating_matrix.drop('Clark'))
predict(ratings_without_clark, similarity_without_clark)

  
  # Remove the CWD from sys.path while we load stuff.


array([[4.42526623],
       [1.93594684],
       [3.42953847]])