# Recommendation System Notebook
- User based recommendation
- User based prediction
- Item based recommendation
- Item based prediction
- Evaluation

In [1]:
# import libraties
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
ratings = pd.read_csv('ratings.csv', encoding='latin-1')

In [2]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,12882,1,4.0,1147195252
1,12882,32,3.5,1147195307
2,12882,47,5.0,1147195343
3,12882,50,5.0,1147185499
4,12882,110,4.5,1147195239


## Dividing the dataset into train and test

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.30, random_state=31)

In [4]:
print(train.shape)
print(test.shape)

(185153, 4)
(79352, 4)


In [5]:
# pivot ratings into movie features
df_movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [6]:
df_movie_features.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,2.5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.5,0.0,0.0,0.0,3.0,4.5,3.5,3.0
910,0.0,4.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0


### Copy train and test dataset
These dataset will be used for prediction and evaluation. 
- Dummy train will be used later for prediction of the movies which has not been rated by the user. To ignore the movies rated by the user, we will mark it as 0 during prediction. The movies not rated by user is marked as 1 for prediction. 
- Dummy test will be used for evaluation. To evaluate, we will only make prediction on the movies rated by the user. So, this is marked as 1. This is just opposite of dummy_train

In [7]:
dummy_train = train.copy()
dummy_test = test.copy()

In [8]:
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x>=1 else 1)
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x>=1 else 0)

In [9]:
# The movies not rated by user is marked as 1 for prediction. 
dummy_train = dummy_train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
    #end
).fillna(1)

# The movies not rated by user is marked as 0 for evaluation. 
dummy_test = dummy_test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [10]:
dummy_train.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
320,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
370,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
910,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


In [11]:
dummy_test.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
370,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
910,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# User Similarity Matrix

## Using Cosine Similarity

In [12]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_movie_features, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[1.         0.27437326 0.20805722 ... 0.21701259 0.         0.15886221]
 [0.27437326 1.         0.16309164 ... 0.15044338 0.         0.13787661]
 [0.20805722 0.16309164 1.         ... 0.26569469 0.02849447 0.23968733]
 ...
 [0.21701259 0.15044338 0.26569469 ... 1.         0.044708   0.14407477]
 [0.         0.         0.02849447 ... 0.044708   1.         0.0174666 ]
 [0.15886221 0.13787661 0.23968733 ... 0.14407477 0.0174666  1.        ]]


In [13]:
user_correlation.shape

(862, 862)

## Using adjusted Cosine 

### Here, not removing the NaN values and calculating the mean only for the movies rated by the user

In [14]:
movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [15]:
movie_features.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,2.5,,,,,,2.0,,2.5,,...,,,,,,,,,,
320,,,,,,,,,,,...,,,,,,,,,,
359,,,,,,,,,4.0,4.0,...,,,,,,,,,,
370,,,,,,,,,,,...,,3.0,4.5,,,,3.0,4.5,3.5,3.0
910,,4.0,,,,3.5,,,,4.0,...,,,,,,,,4.5,,


### Normalising the rating of the movie for each user aroung 0 mean

In [16]:
mean = np.nanmean(movie_features, axis=1)
df_subtracted = (movie_features.T-mean).T

In [17]:
df_subtracted.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,-0.847368,,,,,,-1.347368,,-0.847368,,...,,,,,,,,,,
320,,,,,,,,,,,...,,,,,,,,,,
359,,,,,,,,,0.3157,0.3157,...,,,,,,,,,,
370,,,,,,,,,,,...,,-0.790441,0.709559,,,,-0.790441,0.709559,-0.290441,-0.790441
910,,0.087607,,,,-0.412393,,,,0.087607,...,,,,,,,,0.587607,,


### Finding cosine similarity

In [18]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[ 1.          0.113453    0.11494838 ...  0.07731154  0.
  -0.04190499]
 [ 0.113453    1.          0.07188267 ...  0.07301337  0.
   0.02765892]
 [ 0.11494838  0.07188267  1.         ...  0.06492798  0.08561282
  -0.01730643]
 ...
 [ 0.07731154  0.07301337  0.06492798 ...  1.          0.05669439
   0.03825774]
 [ 0.          0.          0.08561282 ...  0.05669439  1.
  -0.00102823]
 [-0.04190499  0.02765892 -0.01730643 ...  0.03825774 -0.00102823
   1.        ]]


## Prediction

Doing the prediction for the users which are positively related with other users, and not the users which are negatively related as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0. 

In [19]:
user_correlation[user_correlation<0]=0
user_correlation

array([[1.        , 0.113453  , 0.11494838, ..., 0.07731154, 0.        ,
        0.        ],
       [0.113453  , 1.        , 0.07188267, ..., 0.07301337, 0.        ,
        0.02765892],
       [0.11494838, 0.07188267, 1.        , ..., 0.06492798, 0.08561282,
        0.        ],
       ...,
       [0.07731154, 0.07301337, 0.06492798, ..., 1.        , 0.05669439,
        0.03825774],
       [0.        , 0.        , 0.08561282, ..., 0.05669439, 1.        ,
        0.        ],
       [0.        , 0.02765892, 0.        , ..., 0.03825774, 0.        ,
        1.        ]])

Rating predicted by the user (for movies rated as well as not rated) is the weighted sum of correlation with the movie rating (as present in the rating dataset). 

In [20]:
user_predicted_ratings = np.dot(user_correlation, movie_features.fillna(0))
user_predicted_ratings

array([[50.25431628, 23.21431421,  6.92774725, ..., 10.29310194,
         5.18963663, 10.30393691],
       [63.1668671 , 28.66715852,  7.70238561, ..., 10.94090127,
         7.23656686, 13.43793905],
       [65.8132159 , 31.28951501,  8.87591053, ..., 13.18816596,
         7.40423075, 13.93635509],
       ...,
       [90.84386199, 45.29729897, 14.90281743, ..., 14.53319001,
        10.03775996, 17.92253468],
       [40.08352   , 17.48819858,  5.61950399, ...,  5.25427855,
         4.14222761,  5.40077794],
       [42.21672064, 21.86231082,  5.81298622, ...,  5.1213186 ,
         4.37796843,  6.21908569]])

In [21]:
user_predicted_ratings.shape

(862, 2500)

Since we are interested only in the movies not rated by the user, we will ignore the movies rated by the user by making it zero. 

In [22]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,23.214314,6.927747,0.549306,6.464053,23.535059,0.0,2.235682,0.0,10.623249,...,7.328997,7.604004,7.196259,8.23946,9.917107,7.476424,7.68381,10.293102,5.189637,10.303937
320,63.166867,28.667159,7.702386,1.141707,7.491681,30.55314,9.110414,2.12739,33.996904,11.592103,...,9.891204,9.906554,11.141293,12.7072,14.315709,11.806246,8.871086,10.940901,7.236567,13.437939
359,65.813216,31.289515,8.875911,0.875243,9.167411,33.674505,10.068789,2.86145,0.0,0.0,...,11.181333,10.368695,11.039097,12.413749,13.442644,10.815282,11.196572,13.188166,7.404231,13.936355
370,54.51687,27.369566,5.724738,0.685778,6.437963,26.894367,7.839079,2.315056,29.568027,10.9155,...,10.899825,0.0,0.0,12.529633,13.884563,12.486731,0.0,0.0,0.0,0.0
910,31.147689,0.0,4.950286,0.461965,5.406109,0.0,6.736494,1.663783,15.6144,0.0,...,5.734458,4.455202,4.900093,5.007222,5.803241,4.223923,4.302043,0.0,4.06361,5.529644


### Finding the top 5 recommendation for the user 1 

In [23]:
user_final_rating.iloc[1].sort_values(ascending=False)[0:5]

movieId
296     107.287371
356      93.704186
7153     90.188210
318      87.105961
5952     86.462804
Name: 320, dtype: float64

# Item Based Similarity

Using Correlation

Taking the transpose of the rating matrix to normalize the rating around the mean for different movie ID. In the user based similarity, we had taken mean for each user intead of each movie. 

In [24]:
movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

movie_features.head()

userId,316,320,359,370,910,975,1015,1387,1447,1588,...,137118,137209,137227,137446,137559,137609,137805,138072,138176,138200
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,,,,,,4.0,3.0,4.5,,...,3.0,,,,3.5,,,3.0,,3.0
2,,,,,4.0,,3.0,3.0,,,...,,3.5,,,,,2.0,1.0,,
3,,,,,,,,,,,...,,,,,,,,1.0,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


Normalising the movie rating for each movie

In [25]:
mean = np.nanmean(movie_features, axis=1)
df_subtracted = (movie_features.T-mean).T

In [26]:
df_subtracted.head()

userId,316,320,359,370,910,975,1015,1387,1447,1588,...,137118,137209,137227,137446,137559,137609,137805,138072,138176,138200
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.275148,,,,,,0.224852,-0.775148,0.724852,,...,-0.775148,,,,-0.275148,,,-0.775148,,-0.775148
2,,,,,0.963351,,-0.036649,-0.036649,,,...,,0.463351,,,,,-1.036649,-2.036649,,
3,,,,,,,,,,,...,,,,,,,,-1.827273,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


Finding the cosine similarity. Note that since the data is normalised, both the cosine metric and correlation metric will give the same value. 

In [27]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

[[ 1.          0.19611408  0.10223599 ...  0.09934463  0.02130102
   0.0145184 ]
 [ 0.19611408  1.          0.18851304 ...  0.0281184  -0.00746555
   0.0101341 ]
 [ 0.10223599  0.18851304  1.         ...  0.01574747 -0.007927
  -0.00541087]
 ...
 [ 0.09934463  0.0281184   0.01574747 ...  1.          0.08416291
   0.03814656]
 [ 0.02130102 -0.00746555 -0.007927   ...  0.08416291  1.
   0.13139561]
 [ 0.0145184   0.0101341  -0.00541087 ...  0.03814656  0.13139561
   1.        ]]


Filtering the correlation only for which the value is greater than 0. (Positively correlated)

In [28]:
item_correlation[item_correlation<0]=0
item_correlation

array([[1.        , 0.19611408, 0.10223599, ..., 0.09934463, 0.02130102,
        0.0145184 ],
       [0.19611408, 1.        , 0.18851304, ..., 0.0281184 , 0.        ,
        0.0101341 ],
       [0.10223599, 0.18851304, 1.        , ..., 0.01574747, 0.        ,
        0.        ],
       ...,
       [0.09934463, 0.0281184 , 0.01574747, ..., 1.        , 0.08416291,
        0.03814656],
       [0.02130102, 0.        , 0.        , ..., 0.08416291, 1.        ,
        0.13139561],
       [0.0145184 , 0.0101341 , 0.        , ..., 0.03814656, 0.13139561,
        1.        ]])

# Prediction

In [29]:
item_predicted_ratings = np.dot((movie_features.fillna(0).T),item_correlation)
item_predicted_ratings

array([[ 35.72244621,  33.61943773,  17.61957634, ...,  14.07831105,
         12.37426444,  10.58864866],
       [ 12.30429899,  11.07593323,   6.31819286, ...,   6.27027129,
          5.48612867,   3.21837321],
       [145.6692098 , 159.97680608,  97.55572597, ...,  64.77522177,
         67.92537345,  59.61531021],
       ...,
       [ 76.88225353,  99.7533036 ,  70.63454659, ...,  29.17762969,
         33.19190502,  31.47341465],
       [  2.55313278,   2.67010587,   1.49637968, ...,   0.91907785,
          0.20122135,   0.43094328],
       [ 70.45775693,  81.7468927 ,  39.37750036, ...,  43.49277964,
         32.15301569,  24.95317062]])

In [30]:
item_predicted_ratings.shape

(862, 2500)

In [31]:
dummy_train.shape

(862, 2500)

### Filtering the rating only for the movies not rated by the user for recommendation

In [32]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,33.619438,17.619576,8.914504,11.67471,27.737117,0.0,9.70467,0.0,19.740081,...,11.030932,10.800044,14.493394,5.000271,7.729876,8.923475,9.503463,14.078311,12.374264,10.588649
320,12.304299,11.075933,6.318193,4.108668,6.103119,12.63356,10.225404,3.749199,11.348623,8.639603,...,3.566254,3.321089,6.654742,2.251768,5.984891,4.864561,4.378397,6.270271,5.486129,3.218373
359,145.66921,159.976806,97.555726,59.044311,77.82268,151.312676,136.346168,60.309664,0.0,0.0,...,66.171481,84.546042,92.141129,41.721983,52.477514,55.186842,83.740062,64.775222,67.925373,59.61531
370,66.146818,62.323377,30.49144,24.956732,30.636299,73.89134,51.145885,22.90937,62.989541,53.878952,...,33.101405,0.0,0.0,22.652241,41.779748,32.508788,0.0,0.0,0.0,0.0
910,122.927954,0.0,79.654806,51.80493,73.719938,0.0,122.394418,53.41339,128.275675,0.0,...,55.120072,67.510038,72.927416,38.281436,46.971501,44.801488,68.694545,0.0,60.740312,47.538129


### Top 5 prediction for the user -1

In [33]:
item_final_rating.iloc[1].sort_values(ascending=False)[0:5]

movieId
5989    17.543053
5952    17.044737
1961    16.815261
2028    15.954765
1210    15.805789
Name: 320, dtype: float64

# Evaluation

Evaluation will we same as you have seen above for the prediction. The only difference being, you will evaluate for the movie already rated by the user insead of predicting it for the movie not rated by the user. 

## Using User Similarity

In [34]:
test_movie_features = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)
mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

# User Similarity Matrix
test_user_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_user_correlation[np.isnan(test_user_correlation)] = 0
print(test_user_correlation)

[[ 1.00000000e+00  4.91024889e-02  7.64197890e-03 ... -8.07095335e-03
   0.00000000e+00  3.09176081e-02]
 [ 4.91024889e-02  1.00000000e+00 -3.05475844e-02 ... -4.53168966e-03
   0.00000000e+00  3.81566866e-02]
 [ 7.64197890e-03 -3.05475844e-02  1.00000000e+00 ...  6.25580296e-02
   4.27777475e-02  1.78353303e-02]
 ...
 [-8.07095335e-03 -4.53168966e-03  6.25580296e-02 ...  1.00000000e+00
   0.00000000e+00  2.28426444e-05]
 [ 0.00000000e+00  0.00000000e+00  4.27777475e-02 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [ 3.09176081e-02  3.81566866e-02  1.78353303e-02 ...  2.28426444e-05
   0.00000000e+00  1.00000000e+00]]


In [35]:
test_user_correlation[test_user_correlation<0]=0
test_user_predicted_ratings = np.dot(test_user_correlation, test_movie_features.fillna(0))
test_user_predicted_ratings

array([[11.24326336,  5.96031871,  1.46781278, ...,  1.35966561,
         1.22654022,  1.01885626],
       [16.14894831,  7.1246858 ,  2.01042495, ...,  2.6824788 ,
         1.02510671,  2.71741776],
       [18.26993904,  6.83632805,  2.21944733, ...,  2.59095562,
         1.49795726,  1.89435702],
       ...,
       [16.56078836,  9.82294673,  3.09482881, ...,  3.20570977,
         1.63083896,  3.07683335],
       [ 6.71623131,  2.79143628,  1.45412467, ...,  1.63741472,
         1.77063201,  1.6795732 ],
       [11.52541632,  6.18519973,  1.55625762, ...,  1.67140647,
         1.96862719,  1.71481242]])

### Doing prediction for the movies rated by the user

In [36]:
test_user_final_rating = np.multiply(test_user_predicted_ratings,dummy_test)

In [37]:
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,18.269939,0.0,0.0,0.0,0.0,13.420117,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
370,21.182567,10.914147,0.0,0.0,0.0,13.035304,0.0,0.0,0.0,0.0,...,5.614698,0.0,0.0,5.586026,0.0,0.0,0.0,0.0,0.0,0.0
910,22.282618,0.0,5.521608,0.0,4.644921,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.019657,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating the RMSE for only the movies rated by user. For  RMSE, normalising the rating to (1,5) range. 

In [38]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_user_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

MinMaxScaler(copy=True, feature_range=(1, 5))
[[       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [2.72027362        nan        nan ...        nan        nan        nan]
 ...
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]


In [39]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [40]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [41]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

1.1414096577656148


## Using Item similarity

In [42]:
test_movie_features = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

test_item_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_item_correlation[np.isnan(test_item_correlation)] = 0
test_item_correlation[test_item_correlation<0]=0

In [43]:
test_item_correlation.shape

(2500, 2500)

In [44]:
test_movie_features.shape

(2500, 862)

In [45]:
test_item_predicted_ratings = (np.dot(test_item_correlation, test_movie_features.fillna(0))).T
test_item_final_rating = np.multiply(test_item_predicted_ratings,dummy_test)
test_item_final_rating.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,33.633893,0.0,0.0,0.0,0.0,45.786436,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
370,25.4048,26.507249,0.0,0.0,0.0,29.99403,0.0,0.0,0.0,0.0,...,12.183829,0.0,0.0,14.921445,0.0,0.0,0.0,0.0,0.0,0.0
910,41.421834,0.0,33.654179,0.0,31.550066,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,13.76225,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [47]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_item_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))


test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

MinMaxScaler(copy=True, feature_range=(1, 5))


### Finding RMSE

In [48]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

1.8089391863621964


Thank-you