<a href="https://colab.research.google.com/github/bvaisakh/rec_sys/blob/master/Collaborative_Filtering_Memory_Based_%5BCosine_Similarity%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [0]:
# defines the predict function

def predict(ratings, similarity, type='user'):
    if type == 'user':
        pred = similarity.dot(ratings) / np.array([similarity.sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([similarity.sum(axis=1)])
    return pred

In [3]:
# loads the training data

r_cols = ['user_id', 'product_id', 'rating', 'unix_timestamp']

ratings_train = pd.read_csv('ratings_train.data', sep='\t', names=r_cols,encoding='latin-1')
print("\nRatings Data :")
print("shape : ", ratings_train.shape)
print(ratings_train.head())

data_matrix_training = np.array(ratings_train.pivot(index = 'user_id', columns ='product_id', values = 'rating').fillna(0))
print("\nRatings Matrix :")
print("shape : ", data_matrix_training.shape)
print(data_matrix_training)


Ratings Data :
shape :  (90570, 4)
   user_id  product_id  rating  unix_timestamp
0        1           1       5       874965758
1        1           2       3       876893171
2        1           3       4       878542960
3        1           4       3       876893119
4        1           5       3       889751712

Ratings Matrix :
shape :  (943, 1680)
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


In [4]:
# creates similarity matrices
user_similarity = np.abs(pairwise_distances(data_matrix_training, metric='cosine') - 1)
print("\nUser Similarity Matrix :")
print("shape : ", user_similarity.shape)
print(user_similarity)

item_similarity = np.abs(pairwise_distances(data_matrix_training.T, metric='cosine') - 1)
print("\nItem Similarity Matrix :")
print("shape : ", item_similarity.shape)
print(item_similarity)


User Similarity Matrix :
shape :  (943, 943)
[[1.         0.14675076 0.0506765  ... 0.03870478 0.1727177  0.38039608]
 [0.14675076 1.         0.12580785 ... 0.17370692 0.17318465 0.08094333]
 [0.0506765  0.12580785 1.         ... 0.02798846 0.12481628 0.02969262]
 ...
 [0.03870478 0.17370692 0.02798846 ... 1.         0.03995129 0.01914385]
 [0.1727177  0.17318465 0.12481628 ... 0.03995129 1.         0.14471056]
 [0.38039608 0.08094333 0.02969262 ... 0.01914385 0.14471056 1.        ]]

Item Similarity Matrix :
shape :  (1680, 1680)
[[1.         0.40295926 0.33326137 ... 0.         0.05080415 0.05080415]
 [0.40295926 1.         0.2691851  ... 0.         0.08155909 0.08155909]
 [0.33326137 0.2691851  1.         ... 0.         0.         0.09901475]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.05080415 0.08155909 0.         ... 0.         1.         0.        ]
 [0.05080415 0.08155909 0.09901475 ... 0.         0.         1.        ]]


In [0]:
# generates the prediction matrix (the model)

user_prediction = predict(data_matrix_training, user_similarity, type='user')
item_prediction = predict(data_matrix_training, item_similarity, type='item')

In [6]:
# loads the test data

ratings_test = pd.read_csv('ratings_test.data', sep='\t', names=r_cols,encoding='latin-1')
print("\nTest Data:")
print("shape : ", ratings_test.shape)
print(ratings_test.head())

data_matrix_test = np.array(ratings_test.pivot(index = 'user_id', columns ='product_id', values = 'rating').fillna(0))


Test Data:
shape :  (9430, 4)
   user_id  product_id  rating  unix_timestamp
0        1          20       4       887431883
1        1          33       4       878542699
2        1          61       4       878542420
3        1         117       3       874965739
4        1         155       2       878542201


In [7]:
# generates the test results and measures the accuracy

# user-user filtering
test_results_user_filtering = [
(i, j, data_matrix_test[i, j], user_prediction[i, j])
for i in range(data_matrix_test.shape[0])
for j in range(data_matrix_test.shape[0])
if data_matrix_test[i, j] > 0
]

# calculates the RMSE
targets = np.asarray([result[2] for result in test_results_user_filtering])
predictions = np.asarray([result[3] for result in test_results_user_filtering])
rmse = np.sqrt(((predictions - targets) ** 2).mean())

print("RMSE for User-User filtering: {}".format(rmse))


# item - item filtering
test_results_item_filtering = [
(i, j, data_matrix_test[i, j], item_prediction[i, j])
for i in range(data_matrix_test.shape[0])
for j in range(data_matrix_test.shape[0])
if data_matrix_test[i, j] > 0
]

# calculates the RMSE
targets = np.asarray([result[2] for result in test_results_item_filtering])
predictions = np.asarray([result[3] for result in test_results_item_filtering])
rmse = np.sqrt(((predictions - targets) ** 2).mean())

print("RMSE for Item-Item filtering: {}".format(rmse))


RMSE for User-User filtering: 3.21611814887489
RMSE for Item-Item filtering: 3.4483009646221983


In [8]:
test_results_user_filtering

[(0, 18, 4.0, 0.3426062841086195),
 (0, 31, 4.0, 0.536115953630958),
 (0, 54, 4.0, 0.9429303563341433),
 (0, 106, 3.0, 0.1989788760467733),
 (0, 143, 2.0, 1.568722170362042),
 (0, 147, 4.0, 0.5203434641984005),
 (0, 158, 5.0, 0.585789652832404),
 (0, 176, 3.0, 0.8607626149804786),
 (0, 189, 5.0, 0.8541344359469806),
 (0, 251, 4.0, 0.642306348977414),
 (1, 12, 4.0, 0.7116388467227549),
 (1, 43, 5.0, 0.26396414058807766),
 (1, 237, 5.0, 0.8201500544378739),
 (1, 266, 3.0, 0.02974132470022596),
 (1, 267, 3.0, 0.9755003740400586),
 (1, 276, 3.0, 0.36578902328154395),
 (1, 278, 4.0, 0.1361067647668947),
 (1, 283, 4.0, 0.7543345612670136),
 (1, 298, 3.0, 0.2404611317950934),
 (1, 300, 1.0, 0.8962568353432606),
 (2, 232, 1.0, 0.2909622995291951),
 (2, 280, 2.0, 0.3832197542187937),
 (2, 308, 2.0, 0.08897374999236868),
 (2, 313, 5.0, 0.005926051617564392),
 (2, 316, 4.0, 0.38212788179982615),
 (2, 317, 1.0, 1.1680354768586285),
 (2, 319, 3.0, 0.1432268249363448),
 (2, 320, 1.0, 0.7290909460315