## User-Based Collaborative Filtering

### Import necessary modules

In [10]:
#data analysis libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Silence every warning for the rest of the session.
# Note: this also hides deprecation and numerical warnings raised by later cells.
import warnings
warnings.filterwarnings('ignore')

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [11]:
# Load training dataset which contains the ratings for movies by different users.
# The dataset directory can be overridden with the SERENDIPITY_DATA_DIR environment
# variable so the notebook is runnable on other machines; when the variable is not
# set, the original location is used unchanged.
import os

DATA_DIR = os.environ.get(
    "SERENDIPITY_DATA_DIR",
    "/Users/ankitadeshmukh/Desktop/SJSU/Academic/Fall22/CMPE257/Project/Dataset/serendipity-sac2018",
)
training_full = pd.read_csv(os.path.join(DATA_DIR, "training.csv"))
training_full.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,142882,91658,2.5,1515209647000
1,142882,4344,1.0,1515209646000
2,142882,45720,2.0,1515209643000
3,142882,4734,2.0,1515209641000
4,142882,91542,2.0,1515209637000


In [12]:
# Drop unnecessary columns.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this cell
# safe to re-run: once 'timestamp' is gone, a second execution is a no-op rather
# than a KeyError.
cols_to_drop = ['timestamp']
training_full = training_full.drop(columns = cols_to_drop, errors = 'ignore')
training_full.head()

Unnamed: 0,userId,movieId,rating
0,142882,91658,2.5
1,142882,4344,1.0
2,142882,45720,2.0
3,142882,4734,2.0
4,142882,91542,2.0


In [13]:
# (rows, columns) of the ratings table
training_full.shape

(9997850, 3)

In [14]:
# Count the distinct user ids and movie ids present in the full ratings table
n_users, n_movies = (
    training_full[id_column].nunique() for id_column in ('userId', 'movieId')
)

print('Number of users:', n_users)
print('Number of movies:', n_movies)

Number of users: 104661
Number of movies: 49151


In [15]:
# Work with the first one million rating rows only (taken in file order, not sampled at random)
train_sample_df = training_full.head(1_000_000)
train_sample_df.shape

(1000000, 3)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the sampled rating rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test = train_test_split(
    train_sample_df,
    random_state = 42,
    test_size = 0.30,
)

for split_df in (X_train, X_test):
    print(split_df.shape)

(700000, 3)
(300000, 3)


In [17]:
# Wide user x movie matrix of training ratings: one row per userId, one column per
# movieId; (user, movie) pairs without a rating in X_train are filled with 0
user_data = pd.pivot(
    X_train,
    index = 'userId',
    columns = 'movieId',
    values = 'rating',
).fillna(0)
user_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Independent copies of the train and test rating frames; changes made to these
# copies do not touch X_train / X_test
dummy_train = X_train.copy()
dummy_test = X_test.copy()

In [19]:
# Build the 0/1 indicator matrices directly from the split frames. Nothing is
# mutated and the inputs are never the (re-bound) outputs, so the cell can be
# re-run safely; the comparisons are vectorised instead of a per-row lambda.

# Training mask: 0 where the user has a positive rating in X_train, 1 everywhere else.
# The movies not rated by user is marked as 1 for prediction
dummy_train = (
    X_train.assign(rating = (~X_train['rating'].gt(0)).astype(int))
    .pivot(index = 'userId', columns = 'movieId', values = 'rating')
    .fillna(1)
)

# Test mask: 1 where the user has a positive rating in X_test, 0 everywhere else.
# The movies not rated by user is marked as 0 for evaluation
dummy_test = (
    X_test.assign(rating = X_test['rating'].gt(0).astype(int))
    .pivot(index = 'userId', columns = 'movieId', values = 'rating')
    .fillna(0)
)

In [20]:
# Preview the training mask: 0 where the user has a positive rating in X_train, 1 otherwise
dummy_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100053,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100067,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100094,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100119,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Preview the test mask: 1 where the user has a positive rating in X_test, 0 otherwise
dummy_test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183245,183249,183253,183275,183283,183305,183317,183329,183331,183335
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# User-user similarity matrix, measured with cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# One row/column per user of user_data; entry (i, j) compares the rating rows of users i and j
user_similarity = cosine_similarity(user_data)
# Overwrite any NaN entries with 0, in place
np.putmask(user_similarity, np.isnan(user_similarity), 0)
print(user_similarity, user_similarity.shape, sep = "\n")

[[1.         0.09837613 0.         ... 0.09592137 0.         0.27320784]
 [0.09837613 1.         0.         ... 0.         0.         0.10838304]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.09592137 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.27320784 0.10838304 0.         ... 0.         0.         1.        ]]
(11499, 11499)


In [23]:
# Score every (user, movie) pair as the similarity-weighted sum of all users'
# training ratings for that movie.
# NOTE(review): the sum is not divided by the total similarity weight, so the
# values are ranking scores, not ratings on the original rating scale.
user_predicted_ratings = np.dot(user_similarity, user_data)
user_predicted_ratings

array([[5.82735714e+01, 2.86910721e+01, 1.96280660e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.38010924e-01],
       [8.86905118e+01, 4.40661056e+01, 3.97146277e+00, ...,
        2.21081510e-01, 0.00000000e+00, 5.06692670e-02],
       [6.41810064e+01, 2.16182552e+01, 2.64588690e+00, ...,
        1.25202719e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.01439894e+02, 3.77056415e+01, 3.75132624e+00, ...,
        9.68313009e-02, 0.00000000e+00, 1.15278169e-01],
       [3.08090142e+01, 1.61121896e+01, 6.16090006e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.16930089e+01, 2.70550696e+01, 2.31031393e+00, ...,
        2.94417089e-02, 0.00000000e+00, 1.75536038e-01]])

In [24]:
# (users, movies): one score per row/column of user_data
user_predicted_ratings.shape

(11499, 27003)

In [25]:
# Element-wise product with the training mask: scores of movies the user already
# rated are zeroed, scores of unrated movies are kept
user_final_ratings = dummy_train.mul(user_predicted_ratings)
user_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,58.273571,28.691072,1.962807,0.025532,2.866334,25.274669,1.843326,0.102204,0.885905,13.735097,...,0.046003,0.127331,0.077007,0.018401,0.327634,0.223859,0.255839,0.0,0.0,0.238011
100053,88.690512,44.066106,3.971463,0.335592,7.761842,28.758353,6.063443,0.57601,0.0,19.098019,...,0.022986,0.0,0.070083,0.009194,0.080732,0.031692,0.036219,0.221082,0.0,0.050669
100067,64.181006,21.618255,2.645887,0.188325,5.288246,32.311966,3.617316,0.469767,0.720776,11.975226,...,0.016706,0.0,0.178272,0.006682,0.0,0.0,0.0,0.125203,0.0,0.0
100094,210.444494,81.034522,5.179119,0.394406,9.672527,57.586697,6.926618,0.43032,1.878888,36.721813,...,0.040653,0.169367,0.046481,0.016261,0.178814,0.117103,0.133832,0.343577,0.275927,0.186507
100119,29.634581,16.639764,1.304534,0.313928,1.936053,15.185107,1.722519,0.071393,0.960991,8.398118,...,0.002416,0.113676,0.0,0.000966,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Five highest-scoring movies for the user stored at row position 42 (a position, not a userId)
user_final_ratings.iloc[42].sort_values(ascending = False).head(5)

movieId
79132    871.723099
2571     862.530299
318      808.526865
2959     772.055910
7153     756.256497
Name: 100726, dtype: float64

In [27]:
# Item-based collaborative filtering
# Wide movie x user matrix of training ratings; missing (movie, user) pairs are filled with 0
movie_features = pd.pivot(
    X_train,
    index = 'movieId',
    columns = 'userId',
    values = 'rating',
).fillna(0)
movie_features.head()

userId,100036,100053,100067,100094,100119,100143,100163,100165,100177,100200,...,206819,206868,206886,206903,206905,206916,206921,206951,206968,206981
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Item-item similarity matrix, measured with cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# One row/column per movie of movie_features; entry (i, j) compares the rating rows of movies i and j
item_similarity = cosine_similarity(movie_features)
# Overwrite any NaN entries with 0, in place
np.putmask(item_similarity, np.isnan(item_similarity), 0)
print(item_similarity, "- "*10, item_similarity.shape, sep = "\n")

[[1.         0.20269973 0.05105613 ... 0.         0.         0.        ]
 [0.20269973 1.         0.0915931  ... 0.04753553 0.         0.        ]
 [0.05105613 0.0915931  1.         ... 0.         0.         0.        ]
 ...
 [0.         0.04753553 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
- - - - - - - - - - 
(27003, 27003)


In [29]:
# Score every (user, movie) pair as the sum of the user's training ratings, each
# weighted by the similarity between the rated movie and the target movie.
# NOTE(review): the sum is not divided by the total similarity weight, so the
# values are ranking scores, not ratings on the original rating scale.
item_predicted_ratings = np.dot(movie_features.T, item_similarity)
item_predicted_ratings

array([[ 3.53008182,  3.12670017,  1.2091554 , ...,  0.        ,
         0.        ,  0.56846186],
       [16.32733536, 16.11575496,  8.53533497, ...,  2.78803086,
         0.        ,  0.36226178],
       [ 1.57277633,  1.19931441,  0.69510569, ...,  0.38339441,
         0.        ,  0.        ],
       ...,
       [16.02939205, 12.16780622,  6.54449357, ...,  0.7815803 ,
         0.        ,  0.64608384],
       [ 1.29770743,  0.96046596,  0.38921714, ...,  0.        ,
         0.        ,  0.        ],
       [ 4.81838685,  3.58117287,  2.20383578, ...,  0.12713013,
         0.        ,  0.43696281]])

In [30]:
# (users, movies) shape of the item-based score matrix
item_predicted_ratings.shape

(11499, 27003)

In [31]:
# Shape of the training mask, shown next to the score matrix shape it is multiplied with element-wise
dummy_train.shape

(11499, 27003)

In [32]:
# Keep scores only for movies the user has not rated yet (candidates for recommendation):
# the element-wise product with the training mask zeroes every already-rated movie
item_final_ratings = dummy_train.mul(item_predicted_ratings)
item_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,3.530082,3.1267,1.209155,0.01828,1.022254,3.16052,0.923887,0.216121,1.221178,2.330454,...,0.480304,0.094416,0.234528,0.480304,0.631089,0.20626,0.20626,0.0,0.0,0.568462
100053,16.327335,16.115755,8.535335,1.626797,9.517992,12.856148,10.332567,4.083972,0.0,11.835884,...,0.706653,0.0,0.399027,0.706653,0.309202,0.12767,0.12767,2.788031,0.0,0.362262
100067,1.572776,1.199314,0.695106,0.200643,0.816619,1.623315,0.689636,0.408053,0.532653,0.950356,...,0.115018,0.0,0.140078,0.115018,0.0,0.0,0.0,0.383394,0.0,0.0
100094,18.447946,14.336008,5.563386,0.937638,5.318855,12.112356,5.352042,1.108791,4.825224,10.41828,...,0.967765,0.302391,0.117195,0.967765,0.509925,0.235819,0.235819,4.027897,0.211496,0.711656
100119,3.643663,3.769186,1.397114,0.453144,1.433038,4.125124,1.589977,0.140143,2.824516,3.328336,...,0.048513,0.338986,0.0,0.048513,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Top 5 movie recommendations for the user stored at row position 42
# (iloc is positional, so this is not the user whose userId equals 42)
item_final_ratings.iloc[42].sort_values(ascending = False).head(5)

movieId
77561    509.339876
3793     506.887889
8644     502.147128
34048    501.595442
87232    501.317144
Name: 100726, dtype: float64

### Evaluation
#### Evaluation follows the same steps as the prediction above. The only difference is that the scores are evaluated on the movies the user has already rated, instead of being predicted for the movies the user has not rated.


In [34]:
# Using User-User similarity
# Wide user x movie matrix of the held-out ratings (missing pairs filled with 0);
# the similarity below is computed from these X_test ratings
test_user_features = pd.pivot(
    X_test,
    index = 'userId',
    columns = 'movieId',
    values = 'rating',
).fillna(0)
test_user_similarity = cosine_similarity(test_user_features)
# Overwrite any NaN entries with 0, in place
np.putmask(test_user_similarity, np.isnan(test_user_similarity), 0)

print(test_user_similarity, "- "*10, test_user_similarity.shape, sep = "\n")

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
- - - - - - - - - - 
(10779, 10779)


In [35]:
# Similarity-weighted sum of the test users' ratings for every (user, movie) pair.
# NOTE(review): both the similarity matrix and the ratings being weighted come from
# X_test, and each user's self-similarity is 1, so every score already contains the
# user's own held-out rating - confirm this is the intended evaluation protocol.
user_predicted_ratings_test = np.dot(test_user_similarity, test_user_features)
user_predicted_ratings_test

array([[ 9.50449617,  3.78883607,  0.39394224, ...,  0.        ,
         0.        ,  0.        ],
       [26.07139299, 11.34987756,  0.74237774, ...,  0.17034344,
         0.09540937,  0.        ],
       [23.07740193,  9.95847832,  0.76544873, ...,  0.        ,
         0.13897713,  0.        ],
       ...,
       [ 8.56347818,  5.44552254,  0.1575638 , ...,  0.        ,
         0.        ,  0.        ],
       [ 6.62310759,  3.29488303,  0.13146393, ...,  0.        ,
         0.        ,  0.        ],
       [14.78261935,  7.2003291 ,  0.41570155, ...,  0.        ,
         0.23328327,  0.        ]])

In [36]:
# Testing on the movies already rated by the user: the element-wise product with
# the test mask keeps scores only where X_test holds a rating and zeroes the rest
test_user_final_rating = dummy_test.mul(user_predicted_ratings_test)
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183245,183249,183253,183275,183283,183305,183317,183329,183331,183335
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Summary statistics of the sampled ratings; the min and max show the rating scale
train_sample_df['rating'].describe()

count    1000000.000000
mean           3.543143
std            1.090318
min            0.500000
25%            3.000000
50%            3.500000
75%            4.500000
max            5.000000
Name: rating, dtype: float64

In [38]:
# But we need to normalize the final rating values between range (0.5, 5)
from sklearn.preprocessing import MinMaxScaler

# Keep only the strictly positive scores; every other cell becomes NaN
# (0 means the user has not rated the movie in the test set)
X = test_user_final_rating.where(test_user_final_rating > 0)

# NOTE(review): MinMaxScaler rescales each column independently, i.e. per movie
# here, rather than applying one global rescale - confirm that this is intended.
scaler = MinMaxScaler(feature_range = (0.5, 5))
scaler.fit(X)
pred = scaler.transform(X)

print(pred)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [39]:
# Number of rescaled scores that are not NaN (used as the denominator of the error metrics)
total_non_nan = int(np.sum(~np.isnan(pred)))
total_non_nan

300000

In [40]:
# Wide user x movie matrix of the actual held-out ratings; unrated pairs stay NaN
test = pd.pivot(
    X_test,
    index = 'userId',
    columns = 'movieId',
    values = 'rating',
)
test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183245,183249,183253,183275,183283,183305,183317,183329,183331,183335
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,,,,,,,,,,,...,,,,,,,,,,
100053,,,,,,,,,,,...,,,,,,,,,,
100094,,,,,,,,,,,...,,,,,,,,,,
100119,,,,,,,,,,,...,,,,,,,,,,
100143,,,,,,,,,,,...,,,,,,,,,,


In [41]:
# RMSE score: root of the mean squared difference between actual and rescaled scores

diff_sqr_matrix = (test - pred).pow(2)
# Summing per column and then across columns skips NaN cells by default
sum_of_squares_err = diff_sqr_matrix.sum().sum()

rmse = np.sqrt(sum_of_squares_err / total_non_nan)
print(rmse)

1.5216169789671445


In [42]:
# Mean absolute error: average absolute difference between actual and rescaled scores
# (the column-wise then overall sum skips NaN cells by default)

mae = (test - pred).abs().sum().sum() / total_non_nan
print(mae)

1.2368013867194152


### Conclusion
#### This means that, on average, our user-based recommendation engine is off by about 1.2 rating points (MAE ≈ 1.24) when predicting users' ratings.