## Item-Based Collaborative Filtering

### Import necessary modules

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load training dataset which contains the ratings for movies by different users
training_full = pd.read_csv("/Users/Pranav/Desktop/ML257/Project/serendipity-sac2018/training.csv")
training_full.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,142882,91658,2.5,1515209647000
1,142882,4344,1.0,1515209646000
2,142882,45720,2.0,1515209643000
3,142882,4734,2.0,1515209641000
4,142882,91542,2.0,1515209637000


In [3]:
# Drop unnecessary columns
cols_to_drop = ['timestamp']
training_full.drop(cols_to_drop, axis = 1, inplace = True)
training_full.head()

Unnamed: 0,userId,movieId,rating
0,142882,91658,2.5
1,142882,4344,1.0
2,142882,45720,2.0
3,142882,4734,2.0
4,142882,91542,2.0


In [4]:
training_full.shape

(9997850, 3)

In [5]:
n_users = training_full['userId'].nunique()
n_movies = training_full['movieId'].nunique()

print('Number of users:', n_users)
print('Number of movies:', n_movies)

Number of users: 104661
Number of movies: 49151


In [6]:
train_sample_df = training_full.iloc[:1000000] 
train_sample_df.shape

(1000000, 3)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(train_sample_df, test_size = 0.30, random_state = 42)

print(X_train.shape)
print(X_test.shape)

(700000, 3)
(300000, 3)


In [8]:
# pivot ratings into movie features
user_data = X_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
user_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# make a copy of train and test datasets
dummy_train = X_train.copy()
dummy_test = X_test.copy()

In [10]:
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x > 0 else 1)
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x > 0 else 0)
# The movies not rated by user is marked as 1 for prediction 
dummy_train = dummy_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(1)
dummy_train.head()
# The movies not rated by user is marked as 0 for evaluation 
dummy_test = dummy_test.pivot(index ='userId', columns = 'movieId', values = 'rating').fillna(0)
dummy_test.head()

In [12]:
# Item-based collaborative filtering
movie_features = X_train.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
movie_features.head()

userId,100036,100053,100067,100094,100119,100143,100163,100165,100177,100200,...,206819,206868,206886,206903,206905,206916,206921,206951,206968,206981
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Item-Item Similarity matrix
# Using Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Item Similarity Matrix using Cosine similarity as a similarity measure between Items
item_similarity = cosine_similarity(movie_features)
item_similarity[np.isnan(item_similarity)] = 0
print(item_similarity)
print("- "*10)
print(item_similarity.shape)

[[1.         0.20269973 0.05105613 ... 0.         0.         0.        ]
 [0.20269973 1.         0.0915931  ... 0.04753553 0.         0.        ]
 [0.05105613 0.0915931  1.         ... 0.         0.         0.        ]
 ...
 [0.         0.04753553 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
- - - - - - - - - - 
(27003, 27003)


In [14]:
# Predicting the User ratings on the movies
item_predicted_ratings = np.dot(movie_features.T, item_similarity)
item_predicted_ratings

array([[ 3.53008182,  3.12670017,  1.2091554 , ...,  0.        ,
         0.        ,  0.56846186],
       [16.32733536, 16.11575496,  8.53533497, ...,  2.78803086,
         0.        ,  0.36226178],
       [ 1.57277633,  1.19931441,  0.69510569, ...,  0.38339441,
         0.        ,  0.        ],
       ...,
       [16.02939205, 12.16780622,  6.54449357, ...,  0.7815803 ,
         0.        ,  0.64608384],
       [ 1.29770743,  0.96046596,  0.38921714, ...,  0.        ,
         0.        ,  0.        ],
       [ 4.81838685,  3.58117287,  2.20383578, ...,  0.12713013,
         0.        ,  0.43696281]])

In [15]:
item_predicted_ratings.shape

(11499, 27003)

In [16]:
dummy_train.shape

(11499, 27003)

In [17]:
# Filtering the ratings only for the movies not already rated by the user for recommendation
# np.multiply for cell-by-cell multiplication 

item_final_ratings = np.multiply(item_predicted_ratings, dummy_train)
item_final_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183301,183303,183307,183311,183313,183319,183321,183325,183327,183333
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,3.530082,3.1267,1.209155,0.01828,1.022254,3.16052,0.923887,0.216121,1.221178,2.330454,...,0.480304,0.094416,0.234528,0.480304,0.631089,0.20626,0.20626,0.0,0.0,0.568462
100053,16.327335,16.115755,8.535335,1.626797,9.517992,12.856148,10.332567,4.083972,0.0,11.835884,...,0.706653,0.0,0.399027,0.706653,0.309202,0.12767,0.12767,2.788031,0.0,0.362262
100067,1.572776,1.199314,0.695106,0.200643,0.816619,1.623315,0.689636,0.408053,0.532653,0.950356,...,0.115018,0.0,0.140078,0.115018,0.0,0.0,0.0,0.383394,0.0,0.0
100094,18.447946,14.336008,5.563386,0.937638,5.318855,12.112356,5.352042,1.108791,4.825224,10.41828,...,0.967765,0.302391,0.117195,0.967765,0.509925,0.235819,0.235819,4.027897,0.211496,0.711656
100119,3.643663,3.769186,1.397114,0.453144,1.433038,4.125124,1.589977,0.140143,2.824516,3.328336,...,0.048513,0.338986,0.0,0.048513,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Top 5 movie recommendations for the User 42
item_final_ratings.iloc[42].sort_values(ascending = False)[0:5]

movieId
77561    509.339876
3793     506.887889
8644     502.147128
34048    501.595442
87232    501.317144
Name: 100726, dtype: float64

It means that on an average our User-based recommendation engine is making an error of 1.2 in predicting the User ratings.

Now, let's evaluate Item-based recommendation engine.

### Evaluation
#### Evaluation will we same as you have seen above for the prediction. The only difference being, you will evaluate for the movie already rated by the User instead of predicting it for the movie not rated by the user.


In [19]:
# Using Item-Item Similarity
test_item_features = X_test.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
test_item_similarity = cosine_similarity(test_item_features)
test_item_similarity[np.isnan(test_item_similarity)] = 0 

print(test_item_similarity)
print("- "*10)
print(test_item_similarity.shape)

[[1.         0.10496433 0.03844121 ... 0.         0.         0.        ]
 [0.10496433 1.         0.05255384 ... 0.         0.         0.        ]
 [0.03844121 0.05255384 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
- - - - - - - - - - 
(19084, 19084)


In [20]:
item_predicted_ratings_test = np.dot(test_item_features.T, test_item_similarity )
item_predicted_ratings_test

array([[0.53457312, 0.53706728, 0.18628149, ..., 0.        , 0.        ,
        0.        ],
       [4.66636881, 4.74204399, 1.47299046, ..., 0.27525158, 0.27530905,
        0.        ],
       [3.50786389, 3.57918612, 1.5000066 , ..., 0.        , 0.2726304 ,
        0.        ],
       ...,
       [1.80732022, 2.39943309, 0.48722066, ..., 0.        , 0.        ,
        0.        ],
       [0.26749019, 0.21995076, 0.03036355, ..., 0.        , 0.        ,
        0.        ],
       [0.67707766, 0.74707039, 0.1285358 , ..., 0.        , 0.2176607 ,
        0.        ]])

In [21]:
# Testing on the movies already rated by the user
test_item_final_rating = np.multiply(item_predicted_ratings_test, dummy_test)
test_item_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183245,183249,183253,183275,183283,183305,183317,183329,183331,183335
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
train_sample_df['rating'].describe()

count    1000000.000000
mean           3.543143
std            1.090318
min            0.500000
25%            3.000000
50%            3.500000
75%            4.500000
max            5.000000
Name: rating, dtype: float64

In [23]:
# But we need to normalize the final rating values between range (0.5, 5)

from sklearn.preprocessing import MinMaxScaler

X = test_item_final_rating.copy() 
X = X[X > 0] # only consider non-zero values as 0 means the user haven't rated the movies

scaler = MinMaxScaler(feature_range = (0.5, 5))
scaler.fit(X)
pred = scaler.transform(X)

print(pred)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [24]:
# total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(pred))
total_non_nan

300000

In [25]:
test = X_test.pivot(index = 'userId', columns = 'movieId', values = 'rating')
test.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,183245,183249,183253,183275,183283,183305,183317,183329,183331,183335
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100036,,,,,,,,,,,...,,,,,,,,,,
100053,,,,,,,,,,,...,,,,,,,,,,
100094,,,,,,,,,,,...,,,,,,,,,,
100119,,,,,,,,,,,...,,,,,,,,,,
100143,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# RMSE Score

diff_sqr_matrix = (test - pred)**2
sum_of_squares_err = diff_sqr_matrix.sum().sum() # df.sum().sum() by default ignores null values

rmse = np.sqrt(sum_of_squares_err/total_non_nan)
print(rmse)

2.691069364611445


In [27]:
# Mean abslute error

mae = np.abs(pred - test).sum().sum()/total_non_nan
print(mae)

2.436065813632872


### Conclusion
#### It means that on an average our Item-based recommendation engine is making an error of 2.4 in predicting the ratings for items.