# Sistem Rekomendasi Item-Based Collaborative Filtering

### Dataset MovieLens

### Import Library & Load Dataset

In [1]:
# import library 

import pandas as pd
import numpy as np

In [2]:
# load dataset csv (ratings.csv) ke dataframe pandas

ratings = pd.read_csv('ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [5]:
ratings.shape

(100836, 4)

In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Train-test split

In [7]:
# Membagi data untuk prediksi dan evaluasi

from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(ratings, test_size = 0.30, random_state = 42)

print(X_train.shape)
print(X_test.shape)

(70585, 4)
(30251, 4)


In [8]:
# Mengubah bentuk data dengan baris berupa userId, kolom berupa movieId dan nilai berupa rating
user_data = X_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
user_data[:10]

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,4.0,0.0,3.0,0.0,0.0,4.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Membuat salinan dari dataset train dan test
Dataset akan digunakan untuk prediksi and evaluasi.

Dummy train akan digunakan untuk prediksi dari film yang belum diberi rating oleh pengguna. Untuk mengabaikan film yang telah diberi rating oleh pengguna, akan ditandai sebagai 0. Film yang belum diberi rating oleh pengguna ditandai sebagai 1.

Dummy test akan digunakan untuk evaluasi. Untuk evaluasi, hanya akan menggunakan data film yang telah diberi rating oleh pengguna. Jadi, kebalikan dari dummy train yaitu 0 untuk film yang belum diberi rating oleh pengguna dan 1 untuk film yang telah diberi rating oleh pengguna.

In [9]:
# Membuat salinan untuk data prediksi dan evaluasi
dummy_train = X_train.copy()
dummy_test = X_test.copy()

# Film yang belum diberi rating oleh pengguna ditandai sebagai 1 untuk prediksi 
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x > 0 else 1)
dummy_train = dummy_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(1)

# Film yang belum diberi rating oleh pengguna ditandai sebagai 0 untuk evaluasi
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x > 0 else 0)
dummy_test = dummy_test.pivot(index ='userId', columns = 'movieId', values = 'rating').fillna(0)

In [10]:
dummy_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
dummy_test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Item-Based Collaborative Filtering

In [12]:
movie_features = X_train.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,4.0,2.5,0.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


### Item-Item Similarity matrix

#### Using Cosine Similarity & Euclidean Distance

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Menggunakan Cosine Similarity sebagai similarity measure antar item
item_similarity_cs = cosine_similarity(movie_features)
item_similarity_cs[np.isnan(item_similarity_cs)] = 0
print(item_similarity_cs)
print("- "*10)
item_similarity_cs.shape

[[1.         0.3250275  0.20868811 ... 0.         0.         0.        ]
 [0.3250275  1.         0.16373912 ... 0.         0.         0.        ]
 [0.20868811 0.16373912 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
- - - - - - - - - - 


(8566, 8566)

In [14]:
from sklearn.metrics.pairwise import euclidean_distances

# Menggunakan Euclidean Distance sebagai similarity measure antar item
item_similarity_ed = euclidean_distances(movie_features)
item_similarity_ed[np.isnan(item_similarity_ed)] = 0
print(item_similarity_ed)
print("- "*10)
item_similarity_ed.shape

[[ 0.         49.06118629 49.03315205 ... 49.32544982 49.32544982
  49.36344802]
 [49.06118629  0.         34.34748899 ... 31.7411405  31.7411405
  31.80015723]
 [49.03315205 34.34748899  0.         ... 20.00624902 20.00624902
  20.09975124]
 ...
 [49.32544982 31.7411405  20.00624902 ...  0.          0.
   5.31507291]
 [49.32544982 31.7411405  20.00624902 ...  0.          0.
   5.31507291]
 [49.36344802 31.80015723 20.09975124 ...  5.31507291  5.31507291
   0.        ]]
- - - - - - - - - - 


(8566, 8566)

### Memprediksi rating yang diberikan oleh pengguna pada film

In [15]:
# Cosine Similarity
item_predicted_ratings_cs = np.dot(movie_features.T, item_similarity_cs)
print(item_predicted_ratings_cs)

[[1.70457674e+02 1.45736065e+02 1.43879168e+02 ... 4.90965204e-01
  4.90965204e-01 2.29232951e+00]
 [1.47046553e+01 1.27977597e+01 7.41114463e+00 ... 4.11059622e-01
  4.11059622e-01 2.92895445e+00]
 [4.64495934e+00 3.92745275e+00 3.15259091e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [3.43701501e+02 3.24195971e+02 2.65563257e+02 ... 1.03906576e+00
  1.03906576e+00 1.84582165e+01]
 [2.59031967e+01 2.12250531e+01 1.32545528e+01 ... 0.00000000e+00
  0.00000000e+00 3.54245954e-01]
 [5.33648012e+02 4.18264714e+02 2.11408804e+02 ... 1.91118832e+01
  1.91118832e+01 3.78219746e+01]]


In [16]:
# Euclidean Distance
item_predicted_ratings_ed = np.dot(movie_features.T, item_similarity_ed)
print(item_predicted_ratings_ed)

[[ 37774.80359374  28140.85336442  22615.45815686 ...  19238.3246162
   19238.3246162   19303.59553545]
 [  4714.14580887   3510.70261966   2898.40088646 ...   2334.83688981
    2334.83688981   2333.77773078]
 [  3147.01675215   2093.48573422   1428.05136766 ...    714.34294013
     714.34294013    727.80570087]
 ...
 [ 90621.57284695  64063.67194207  49621.14064453 ...  37921.24114981
   37921.24114981  38065.3258975 ]
 [  4928.96556057   3989.48612679   3559.04326674 ...   3118.93765621
    3118.93765621   3127.77116035]
 [165076.38517822 113527.11195207  83917.44893897 ...  51986.34714757
   51986.34714757  52500.25932455]]


In [17]:
print(item_predicted_ratings_cs.shape)
print(item_predicted_ratings_ed.shape)

(610, 8566)
(610, 8566)


In [18]:
dummy_train.shape

(610, 8566)

#### Memfilter data rating hanya untuk film yang belum diberi rating oleh pengguna untuk rekomendasi

In [19]:
# np.multiply untuk perkalian antar cell

# Cosine Similarity
item_final_ratings_cs = np.multiply(item_predicted_ratings_cs, dummy_train)
item_final_ratings_cs[:10]

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,145.736065,0.0,17.752252,60.018339,0.0,64.55558,56.220407,28.999703,143.817617,...,1.537469,0.490965,0.490965,0.490965,0.490965,0.490965,0.490965,0.490965,0.490965,2.29233
2,14.704655,12.79776,7.411145,0.26943,5.80407,10.243947,3.230076,4.735767,1.599833,10.96073,...,0.461311,0.41106,0.41106,0.41106,0.41106,0.41106,0.41106,0.41106,0.41106,2.928954
3,4.644959,3.927453,3.152591,0.227362,0.903036,4.159893,1.724414,0.864785,1.312176,4.381157,...,0.047166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,82.099482,70.871325,49.487513,10.082054,34.732576,65.274992,41.705394,29.677329,12.913699,70.165084,...,0.069251,0.629317,0.629317,0.629317,0.629317,0.629317,0.629317,0.629317,0.629317,1.033695
5,29.84575,30.143357,18.279888,9.367018,17.505036,23.594082,17.286535,13.69294,5.029116,31.156561,...,0.694836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.537539
6,110.149471,0.0,103.294299,0.0,86.985728,90.390343,0.0,0.0,47.563851,0.0,...,0.508641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.873823
7,0.0,66.250378,34.334041,4.493829,26.419984,55.013177,24.121834,18.931179,8.631865,57.934481,...,0.605949,0.343676,0.343676,0.343676,0.343676,0.343676,0.343676,0.343676,0.343676,5.604466
8,32.028368,0.0,18.92105,8.387268,17.824189,23.208145,17.049141,11.270996,5.624336,0.0,...,0.712729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.792048
9,21.477236,20.120541,11.916898,2.688664,8.34527,15.089861,8.533492,8.173293,1.813547,15.980694,...,0.231302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.837662
10,38.613229,36.842734,19.309977,1.32959,18.371483,22.967482,12.270699,10.142122,1.616975,23.813159,...,0.111485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.522305


In [20]:
# np.multiply untuk perkalian antar cell

# Euclidean Distance
item_final_ratings_ed = np.multiply(item_predicted_ratings_ed, dummy_train)
item_final_ratings_ed[:10]

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,28140.853364,0.0,19486.472752,22996.119255,0.0,24043.982756,19791.436524,20869.936551,29413.833297,...,19014.676975,19391.98945,19238.324616,19310.804014,19310.804014,19238.324616,19310.804014,19238.324616,19238.324616,19303.595535
2,4714.145809,3510.70262,2898.400886,2377.849601,2814.926004,3658.394408,2991.368658,2421.508907,2556.817739,3701.748743,...,2306.893834,2353.947096,2334.83689,2343.870043,2343.870043,2334.83689,2343.870043,2334.83689,2334.83689,2333.777731
3,3147.016752,2093.485734,1428.051368,774.204487,1347.359671,2173.409156,1477.48488,871.753733,1004.861963,2225.509343,...,670.772739,742.553499,714.34294,727.805701,727.805701,714.34294,727.805701,714.34294,714.34294,727.805701
4,24736.605954,17830.484535,14059.046128,10962.080461,13671.496018,18537.583981,14360.283148,11278.17333,12072.271725,18740.792281,...,10571.3199,10867.781943,10747.668878,10804.492579,10804.492579,10747.668878,10804.492579,10747.668878,10747.668878,10802.872256
5,6163.838618,4775.317941,4355.03261,3990.862771,4275.413277,5068.149227,4405.039242,4003.754831,4169.076385,4900.958911,...,3981.716633,4021.745612,4005.495102,4013.129104,4013.129104,4005.495102,4013.129104,4005.495102,4005.495102,4010.980655
6,38138.043544,0.0,19846.165514,0.0,18997.100096,27882.694353,0.0,0.0,16296.32235,0.0,...,13646.516933,14248.82544,14010.043806,14123.581172,14123.581172,14010.043806,14123.581172,14010.043806,14010.043806,14120.086594
7,0.0,11805.499585,10214.399867,8758.454878,10015.715069,12388.718632,10471.09967,8879.13165,9288.583015,12486.051988,...,8567.338813,8710.27727,8652.156234,8679.588264,8679.588264,8652.156234,8679.588264,8652.156234,8652.156234,8658.625016
8,5668.346231,0.0,4419.914672,4247.172174,4375.613676,4913.712304,4475.583701,4252.354511,4360.870169,0.0,...,4250.990007,4278.985383,4267.677707,4272.98351,4272.98351,4267.677707,4272.98351,4267.677707,4267.677707,4269.819302
9,6427.488397,4626.649498,3705.450088,2822.2384,3589.492379,4886.239305,3800.778795,2922.145888,3159.628614,4954.16385,...,2693.250566,2791.778269,2753.607715,2771.832454,2771.832454,2753.607715,2771.832454,2753.607715,2753.607715,2768.473881
10,14225.16754,9893.976398,7514.693969,5186.128075,7116.125036,10490.599834,7727.365645,5473.549103,6022.538247,10650.362937,...,4843.809255,5082.656079,4988.704412,5033.454947,5033.454947,4988.704412,5033.454947,4988.704412,4988.704412,5011.392455


In [21]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

Berdasarkan nilai rating diatas, dapat dilihat bahwa nilai rating memiliki nilai minimum 0.5 dan maksimum 5 sehingga kita perlu me-normalisasi nilai rating antara (0.5, 5)

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Cosine Similarity
X_train_cs = item_final_ratings_cs.copy() 
X_train_cs = X_train_cs[X_train_cs > 0] # abaikan nilai 0 yang merupakan data film yang telah diberi rating oleh pengguna

scaler_train_cs = MinMaxScaler(feature_range = (0.5, 5))
scaler_train_cs.fit(X_train_cs)
pred_train_cs = scaler_train_cs.transform(X_train_cs)

print(pred_train_cs)

print("- "*10)

# Euclidean Distance
X_train_ed = item_final_ratings_ed.copy() 
X_train_ed = X_train_ed[X_train_ed > 0] # abaikan nilai 0 yang merupakan data film yang telah diberi rating oleh pengguna

scaler_train_ed = MinMaxScaler(feature_range = (0.5, 5))
scaler_train_ed.fit(X_train_ed)
pred_train_ed = scaler_train_ed.transform(X_train_ed)

print(pred_train_ed)

[[       nan 1.86285746        nan ... 0.55575891 0.55575891 0.73202816]
 [0.59634342 0.5917273  0.56201214 ... 0.54566987 0.54566987 0.79781647]
 [0.51346817 0.50691115 0.51183678 ...        nan        nan        nan]
 ...
 [       nan 3.56925628        nan ... 0.62496325 0.62496325 2.40259787]
 [       nan 0.67230743 0.63086064 ...        nan        nan 0.53174817]
 [       nan 4.46872327 2.96556385 ... 2.90687526 2.90687526 4.403633  ]]
- - - - - - - - - - 
[[       nan 1.48668478        nan ... 1.39870441 1.39870441 1.39209265]
 [0.56673014 0.60199948 0.58884013 ... 0.59416601 0.59416601 0.59302631]
 [0.53862819 0.55109476 0.5344151  ... 0.51703698 0.51703698 0.51740509]
 ...
 [       nan 2.77698907        nan ... 2.28793651 2.28793651 2.27553567]
 [       nan 0.61919681 0.61329383 ... 0.63148607 0.63148607 0.63041347]
 [       nan 4.55365615 3.58776254 ... 2.9573793  2.9573793  2.95524064]]


In [23]:
pred_train_df_cs = np.multiply(pred_train_cs, dummy_train).fillna(0)
pred_train_df_cs[:10]

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.862857,0.0,1.4747,1.644108,0.0,1.353577,1.349279,1.449874,1.883248,...,1.485256,0.555759,0.555759,0.555759,0.555759,0.555759,0.555759,0.555759,0.555759,0.732028
2,0.596343,0.591727,0.562012,0.505272,0.594854,0.559812,0.522697,0.55939,0.546684,0.575536,...,0.773796,0.54567,0.54567,0.54567,0.54567,0.54567,0.54567,0.54567,0.54567,0.797816
3,0.513468,0.506911,0.511837,0.50294,0.5,0.516156,0.502297,0.5,0.537202,0.510773,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.151565,1.147015,1.057766,1.049385,1.154731,0.954685,1.043987,0.942049,0.919626,1.158285,...,0.514601,0.573228,0.573228,0.573228,0.573228,0.573228,0.573228,0.573228,0.573228,0.601962
5,0.721081,0.757583,0.69007,1.009736,0.821312,0.655605,0.713143,0.696813,0.659724,0.774324,...,0.928182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55069
6,1.382651,0.0,1.691732,0.0,2.16603,1.134899,0.0,0.0,2.06181,0.0,...,0.805087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585441
7,0.0,1.102831,0.879224,0.739517,0.993851,0.881052,0.805753,0.777179,0.778483,1.037899,...,0.869418,0.537162,0.537162,0.537162,0.537162,0.537162,0.537162,0.537162,0.537162,1.074302
8,0.739062,0.0,0.697625,0.955409,0.827489,0.652836,0.709927,0.659655,0.679345,0.0,...,0.940012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57699
9,0.652138,0.661746,0.6151,0.63942,0.644036,0.594583,0.594551,0.612129,0.553728,0.624948,...,0.621734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.581704
10,0.793311,0.821641,0.702207,0.564059,0.838081,0.651109,0.645185,0.642335,0.547249,0.702043,...,0.542522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.065811


In [24]:
pred_train_df_ed = np.multiply(pred_train_ed, dummy_train).fillna(0)
pred_train_df_ed[:10]

movieId,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.486685,0.0,1.367933,1.450328,0.0,1.1701,1.576664,1.277757,1.213218,...,1.421561,1.384924,1.398704,1.392022,1.392022,1.398704,1.392022,1.398704,1.398704,1.392093
2,0.56673,0.601999,0.58884,0.591503,0.59893,0.559783,0.570936,0.614016,0.581509,0.574334,...,0.596624,0.592674,0.594166,0.593444,0.593444,0.594166,0.593444,0.594166,0.594166,0.593026
3,0.538628,0.551095,0.534415,0.518725,0.537017,0.529586,0.527851,0.528128,0.522505,0.537653,...,0.515841,0.517746,0.517037,0.517381,0.517381,0.517037,0.517381,0.517037,0.517037,0.517405
4,0.925775,1.116349,1.001952,0.981076,1.056944,0.862349,0.894499,1.104855,0.943278,0.948019,...,1.004675,0.988558,0.994583,0.991658,0.991658,0.994583,0.991658,0.994583,0.994583,0.991815
5,0.592726,0.647423,0.642757,0.664705,0.660545,0.58845,0.61117,0.701704,0.642806,0.604132,...,0.679317,0.670225,0.673683,0.672011,0.672011,0.673683,0.672011,0.673683,0.673683,0.672002
6,1.166092,0.0,1.216162,0.0,1.281619,1.05238,0.0,0.0,1.103872,0.0,...,1.156511,1.145772,1.149859,1.147877,1.147877,1.149859,1.147877,1.149859,1.149859,1.148014
7,0.0,0.899939,0.859642,0.88107,0.902715,0.737313,0.783812,0.9719,0.837445,0.792603,...,0.90573,0.888236,0.894845,0.891646,0.891646,0.894845,0.891646,0.894845,0.894845,0.890848
8,0.583841,0.0,0.645159,0.676337,0.664772,0.58531,0.613178,0.715482,0.650098,0.0,...,0.692612,0.682186,0.686161,0.684241,0.684241,0.686161,0.684241,0.686161,0.686161,0.68419
9,0.597454,0.642083,0.618713,0.61167,0.631608,0.584751,0.593972,0.641761,0.604428,0.605454,...,0.6157,0.613033,0.614098,0.613587,0.613587,0.614098,0.613587,0.614098,0.614098,0.613495
10,0.737283,0.831279,0.759712,0.718949,0.780388,0.698715,0.705724,0.783161,0.713273,0.746991,...,0.721882,0.719556,0.720479,0.720034,0.720034,0.720479,0.720034,0.720479,0.720479,0.719108


### Hasil rekomendasi film (5 teratas)

In [25]:
# User 1
pred_train_df_cs.iloc[0].sort_values(ascending = False)[0:10]

movieId
1516    3.830940
1170    3.408577
1998    3.375800
2415    3.280950
2264    3.280950
3143    3.280950
2812    3.161027
476     3.161027
2659    2.768163
40      2.766582
Name: 1, dtype: float64

In [26]:
# User 1
pred_train_df_ed.iloc[0].sort_values(ascending = False)[0:10]

movieId
1515    2.015507
5428    2.012789
6039    1.984656
6062    1.969766
8119    1.968658
4890    1.965492
5247    1.961049
4587    1.959211
4132    1.958595
4564    1.944334
Name: 1, dtype: float64

## Evaluasi

Proses evaluasi akan sama seperti proses prediksi, yang membedakan adalah data yang akan dievaluasi adalah data film yang telah diberi rating oleh pengguna, berbeda dengan proses prediksi yang menggunakan data film yang belum diberi rating oleh pengguna.

### Menggunakan Item-Item Similarity 

In [27]:
# Cosine Similarity

test_item_features_cs = X_test.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
test_item_similarity_cs = cosine_similarity(test_item_features_cs)
test_item_similarity_cs[np.isnan(test_item_similarity_cs)] = 0 

print(test_item_similarity_cs)
print("- "*10)
print(test_item_similarity_cs.shape)

[[1.         0.15732306 0.09932171 ... 0.         0.         0.        ]
 [0.15732306 1.         0.         ... 0.         0.         0.        ]
 [0.09932171 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         1.        ]
 [0.         0.         0.         ... 0.         1.         1.        ]]
- - - - - - - - - - 
(6124, 6124)


In [28]:
# Euclidean Distance

test_item_features_ed = X_test.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
test_item_similarity_ed = euclidean_distances(test_item_features_ed)
test_item_similarity_ed[np.isnan(test_item_similarity_ed)] = 0 

print(test_item_similarity_ed)
print("- "*10)
print(test_item_similarity_ed.shape)

[[ 0.         34.93207695 34.05877273 ... 32.16753021 32.29163978
  32.34192326]
 [34.93207695  0.         24.6221445  ... 19.6468827  19.84943324
  19.93113143]
 [34.05877273 24.6221445   0.         ... 14.90805152 15.17399091
  15.28070679]
 ...
 [32.16753021 19.6468827  14.90805152 ...  0.          3.16227766
   3.64005494]
 [32.29163978 19.84943324 15.17399091 ...  3.16227766  0.
   0.5       ]
 [32.34192326 19.93113143 15.28070679 ...  3.64005494  0.5
   0.        ]]
- - - - - - - - - - 
(6124, 6124)


In [29]:
# Cosine Similarity

item_predicted_ratings_test_cs = np.dot(test_item_features_cs.T, test_item_similarity_cs)
item_predicted_ratings_test_cs

array([[1.90435509e+01, 1.37292388e+01, 1.20305765e+01, ...,
        4.19602222e-01, 0.00000000e+00, 0.00000000e+00],
       [9.33282962e-01, 1.09150271e+00, 6.76481425e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.34037111e+00, 3.35148768e-01, 5.76138709e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.10452867e+01, 6.86734298e+01, 3.74823847e+01, ...,
        4.19602222e-01, 0.00000000e+00, 0.00000000e+00],
       [1.59448344e+00, 1.48111314e+00, 1.51270481e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.50933268e+01, 5.62696921e+01, 2.60485764e+01, ...,
        0.00000000e+00, 8.98355095e+00, 8.98355095e+00]])

In [30]:
# Euclidean Distance

item_predicted_ratings_test_ed = np.dot(test_item_features_ed.T, test_item_similarity_ed)
item_predicted_ratings_test_ed

array([[ 9155.16988336,  6420.58248999,  5493.09528195, ...,
         3826.70865392,  3917.12565559,  3952.35877548],
       [  872.30645236,   610.40946016,   538.38695739, ...,
          384.87976223,   392.1745262 ,   395.06447657],
       [ 1048.33740478,   693.41758158,   558.78978791, ...,
          291.59607906,   308.38219196,   314.71226664],
       ...,
       [27422.25395155, 18892.06026726, 16357.44570977, ...,
        11242.90330025, 11528.70392035, 11639.3321545 ],
       [  863.51387771,   589.61020977,   499.69887968, ...,
          313.45615551,   326.30724614,   331.0792156 ],
       [49858.95436607, 33317.38809492, 27766.10292808, ...,
        15761.94696933, 16496.9233935 , 16776.6675996 ]])

### Testing pada data film yang telah diberi rating oleh pengguna

In [31]:
# Cosine Similarity
test_item_final_rating_cs = np.multiply(item_predicted_ratings_test_cs, dummy_test)
test_item_final_rating_cs.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9.480468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Euclidean Distance
test_item_final_rating_ed = np.multiply(item_predicted_ratings_test_ed, dummy_test)
test_item_final_rating_ed.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1435.768521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

Berdasarkan nilai rating diatas, dapat dilihat bahwa nilai rating memiliki nilai minimum 0.5 dan maksimum 5 sehingga kita perlu me-normalisasi nilai rating antara (0.5, 5)

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Cosine Similarity
X_cs = test_item_final_rating_cs.copy() 
X_cs = X_cs[X_cs > 0] # abaikan nilai 0 yang merupakan data film yang belum diberi rating oleh pengguna

scaler_cs = MinMaxScaler(feature_range = (0.5, 5))
scaler_cs.fit(X_cs)
pred_cs = scaler_cs.transform(X_cs)

print(pred_cs)

print("- "*10)

# Euclidean Distance
X_ed = test_item_final_rating_ed.copy() 
X_ed = X_ed[X_ed > 0] # abaikan nilai 0 yang merupakan data film yang belum diberi rating oleh pengguna

scaler_ed = MinMaxScaler(feature_range = (0.5, 5))
scaler_ed.fit(X_ed)
pred_ed = scaler_ed.transform(X_ed)

print(pred_ed)

[[       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 ...
 [       nan 2.06051833        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]
- - - - - - - - - - 
[[       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 ...
 [       nan 2.88712639        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]


In [35]:
# Total nilai non-NaN

# Cosine Similarity
total_non_nan_cs = np.count_nonzero(~np.isnan(pred_cs))
print(total_non_nan_cs)

# Euclidean Distance
total_non_nan_ed = np.count_nonzero(~np.isnan(pred_ed))
print(total_non_nan_ed)

30251
30251


In [36]:
test = X_test.pivot(index = 'userId', columns = 'movieId', values = 'rating')
test

movieId,1,2,3,4,5,6,7,8,9,10,...,187595,189043,189111,189333,189547,189713,190213,190219,193567,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,,2.0,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [37]:
test.T.loc[test.T[1].notnull(), 1][:10]

movieId
151     5.0
223     3.0
423     3.0
593     4.0
596     5.0
673     3.0
1029    5.0
1030    3.0
1198    5.0
1214    4.0
Name: 1, dtype: float64

In [38]:
pred_cs[0][~np.isnan(pred_cs[0])][:10]

array([5.        , 1.18042832, 0.5       , 1.34504247, 2.45657639,
       0.92409448, 0.92110018, 0.91189502, 1.28712752, 1.05678354])

In [39]:
pred_ed[0][~np.isnan(pred_ed[0])][:10]

array([5.        , 1.21706239, 0.5       , 1.22840972, 1.55774108,
       0.7567311 , 0.79828351, 0.77964661, 1.03547158, 1.02855324])

In [40]:
# Root Mean Square Error

# Cosine Similarity
diff_sqr_matrix_cs = (test - pred_cs)**2
sum_of_squares_err_cs = diff_sqr_matrix_cs.sum().sum()

rmse_cs = np.sqrt(sum_of_squares_err_cs/total_non_nan_cs)
print(rmse_cs)

# Euclidean Distance
diff_sqr_matrix_ed = (test - pred_ed)**2
sum_of_squares_err_ed = diff_sqr_matrix_ed.sum().sum()

rmse_ed = np.sqrt(sum_of_squares_err_ed/total_non_nan_ed)
print(rmse_ed)

2.512699212653213
2.552274503980821


Algoritma Cosine Similarity menghasilkan nilai Root Mean Square Error 2.51 sedangkan algoritma Euclidean Distance menghasilkan nilai Root Mean Square Error 2.55

In [41]:
# Mean Absolute Error

# Cosine Similarity
mae_cs = np.abs(pred_cs - test).sum().sum()/total_non_nan_cs
print(mae_cs)

# Euclidean Distance
mae_ed = np.abs(pred_ed - test).sum().sum()/total_non_nan_ed
print(mae_ed)

2.215407217950911
2.249909078617775


Algoritma Cosine Similarity menghasilkan nilai Mean Absolute Error 2.21 sedangkan algoritma Euclidean Distance menghasilkan nilai Mean Absolute Error 2.24

## Kesimpulan


Dari hasil tersebut, dapat disimpulkan bahwa Algoritma Cosine Similarity lebih baik daripada Algoritma Euclidean Distance karena memiliki nilai RMSE dan MAE yang lebih kecil