In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

# Import Data

In [3]:
ratings = pd.read_csv('rtg.csv')
movies = pd.read_csv('movies.csv')

In [4]:
movie_data = movies[['movieId', 'title']]
movie_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
llor_matrix = ratings[['userId', 'movieId', 'S']]

In [6]:
llor_matrix.head()

Unnamed: 0,userId,movieId,S
0,1,1,3.3
1,1,3,4.1
2,1,6,6.1
3,1,47,4.5
4,1,50,3.5


In [7]:
merge_book_ratings = pd.merge(llor_matrix, movie_data, on=['movieId'], how='inner')
merge_book_ratings

Unnamed: 0,userId,movieId,S,title
0,1,1,3.300000,Toy Story (1995)
1,5,1,2.806738,Toy Story (1995)
2,7,1,5.000000,Toy Story (1995)
3,15,1,2.000000,Toy Story (1995)
4,17,1,5.850000,Toy Story (1995)
5,18,1,4.550000,Toy Story (1995)
6,19,1,6.000000,Toy Story (1995)
7,21,1,4.900000,Toy Story (1995)
8,27,1,2.100000,Toy Story (1995)
9,31,1,3.500000,Toy Story (1995)


# Observasi Data

Menghitung banyaknya konten id yang paling sering di rating oleh pengguna

In [8]:
rating_count = pd.DataFrame(llor_matrix.groupby('movieId')['S'].count())
rating_count.sort_values('S', ascending=False).head()

Unnamed: 0_level_0,S
movieId,Unnamed: 1_level_1
356,329
318,317
296,307
593,279
2571,278


Menghitung rata2 dari nilai S dari data

In [9]:
average_rating = pd.DataFrame(llor_matrix.groupby('movieId')['S'].mean())
average_rating['ratingCount'] = pd.DataFrame(llor_matrix.groupby('movieId')['S'].count())
average_rating.sort_values('ratingCount', ascending=False).head()

Unnamed: 0_level_0,S,ratingCount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.523208,329
318,4.717935,317
296,4.563681,307
593,4.466129,279
2571,4.377698,278


Ubah data menjadi matrix 2D untuk diproses menggunakan kNN

In [10]:
ratings_pivot = llor_matrix.pivot(index='movieId', columns='userId', values='S').fillna(0)
ratings_pivot_matrix = csr_matrix(ratings_pivot.values)

In [11]:
ratings_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.3,0.0,0.0,0.0,2.806738,0.0,5.0,0.0,0.0,0.0,...,5.3,0.0,2.8,3.6,5.2,2.0,6.0,1.75,3.5,7.5
2,0.0,0.0,0.0,0.0,0.0,3.3,0.0,5.2,0.0,0.0,...,0.0,6.0,0.0,6.0,1.75,0.0,0.0,1.8,0.0,0.0
3,4.1,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,6.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


# Menghitung Similarity User-User

In [140]:
# konversi data menjadi numpy array
ratings_data = ratings_pivot.T
ratings_data_array = ratings_data.values

In [68]:
# mendapatkan similarity dari tiap user-user
user_similarity = pairwise_distances(ratings_data_array, metric='cosine')
pd.DataFrame(user_similarity).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,0.0,0.964411,0.956095,0.810081,0.883501,0.879927,0.852225,0.903215,0.940318,0.978408,...,0.950336,0.858962,0.785803,0.934751,0.856711,0.839131,0.764002,0.733569,0.911836,0.864302
1,0.964411,0.0,1.0,0.996529,0.979274,0.976359,0.965777,0.966424,1.0,0.931499,...,0.834188,0.974649,0.984538,1.0,1.0,0.970164,0.990256,0.955526,0.970249,0.890105
2,0.956095,1.0,0.0,0.997158,0.994729,0.993758,1.0,0.993169,1.0,1.0,...,0.996614,0.995137,0.969776,1.0,0.99544,0.987582,0.968203,0.978519,1.0,0.966506
3,0.810081,0.996529,0.997158,0.0,0.852985,0.91367,0.890809,0.946888,0.993457,0.975955,...,0.924201,0.876086,0.706898,0.947117,0.916295,0.816079,0.882989,0.866015,0.960371,0.895524
4,0.883501,0.979274,0.994729,0.852985,0.0,0.702546,0.889687,0.556787,1.0,0.963089,...,0.938875,0.578644,0.87751,0.765854,0.88205,0.903695,0.864426,0.856693,0.705179,0.939093


In [194]:
# menghitung user 0
K = 2
pred_ratings = np.zeros(K)
# mengambil similarity paling kecil dari user 0
selected_similar = user_similarity[0][:]
sorted_similar, index_sorted = np.sort(selected_similar), np.argsort(selected_similar)
most_similar, index_similar = sorted_similar[:K], index_sorted[:K]

# mengambil rating user
ratings_similar_user = ratings_data_array[index_similar,:]
# hitung rata-rata dari tiap user
mean_user_rating = ratings_similar_user.mean(axis=1)
diff_ratings = ratings_similar_user - mean_user_rating[:, np.newaxis]

# menghitung prediksi untuk user 0 terhadap object 1
pred = mean_user_rating[0] + np.sum(most_similar * diff_ratings[:,2]) / np.sum(most_similar)
print(pred)
pd.DataFrame(diff_ratings)

-0.02439349603609625


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,3.189922,-0.110078,3.989922,-0.110078,-0.110078,5.989922,-0.110078,-0.110078,-0.110078,-0.110078,...,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078,-0.110078
1,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,4.865528,-0.134472,-0.134472,-0.134472,5.865528,...,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472,-0.134472


# Definisi Model kNN

In [10]:
model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
model.fit(ratings_pivot_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

# Testing Recommendation Model

In [28]:
# idx = np.random.choice(ratings_pivot.shape[0])
idx = 1
distances, index = model.kneighbors(ratings_pivot.iloc[idx, :].values.reshape(1, -1), n_neighbors=5)

In [29]:
print('Create recommendation for :',idx)
print('List Index :',index)

Create recommendation for : 1
List Index : [[  1 322 504 436 325]]


In [30]:
movie_data[movie_data['movieId'] == index.flatten()[4]]

Unnamed: 0,movieId,title
321,325,National Lampoon's Senior Trip (1995)


In [31]:
for i in range(len(distances.flatten())):
    if (i == 0):
        print('Recommendations for',movie_data[movie_data['movieId'] == idx].title.values[0])
    else:
        print('{0}: {1}, with distance : '.format(i, movie_data[movie_data['movieId'] == index.flatten()[i]].title.values[0]), distances.flatten()[i])

Recommendations for Toy Story (1995)
1: Swimming with Sharks (1995), with distance :  0.46860377711106704
2: No Escape (1994), with distance :  0.4751091229505844
3: Color of Night (1994), with distance :  0.48303745534699605
4: National Lampoon's Senior Trip (1995), with distance :  0.5033889183892293
