In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances     # This is used for calculating the similarities between the users based on distances
from scipy.spatial.distance import cosine,correlation

In [3]:
movies = pd.read_csv("Movie.csv")

In [4]:
movies.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [5]:
movies.shape

(8992, 3)

In [6]:
# As the userid column has repeated values we ae suppose to extract distinct values from that column so we will do that with the help of unique function. And also we are required to find how many unique values are there so we will use length function
# we are extracting unique values because the dataset size is very large we can make it small by working on unique values only

In [7]:
len(movies.userId.unique())

4081

In [8]:
# extracting unique movies from the dataset

In [9]:
len(movies.movie.unique())

10

In [10]:
# Creating Collaboration matrix using PIVOT function

In [11]:
movies_collab = movies.pivot(index = 'userId',
                             columns = 'movie',
                             values = 'rating').reset_index(drop=True)

In [12]:
movies_collab

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,,,,,3.5,,,,,
1,,,4.0,,,,,,,
2,,,,,,,,,4.0,
3,,4.0,,3.0,,,,,,
4,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,,,,,,,,,
4077,3.5,,,,,,,,4.0,
4078,,3.0,4.0,5.0,,3.0,1.0,,4.0,
4079,,,,,,,,,5.0,


In [13]:
# Mapping the userids at the place of index column hence used .index there

In [14]:
movies_collab.index = movies.userId.unique()

In [15]:
movies_collab

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,,,,,3.5,,,,,
6,,,4.0,,,,,,,
8,,,,,,,,,4.0,
10,,4.0,,3.0,,,,,,
11,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7044,4.0,,,,,,,,,
7070,3.5,,,,,,,,4.0,
7080,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7087,,,,,,,,,5.0,


In [16]:
# As we are required to calculate similarities so we cannot calculate as there are NaN values in our data, for calculating similarities we will be needing real values so we will substitute 0 in place of all the NaN values by using fillna() method

In [17]:
movies_collab.fillna(0,inplace=True)

In [18]:
movies_collab

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
10,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7044,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7070,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7080,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [None]:
# Calculating cosine similarities between the users. 
# Similarities will be not calculated directly first we are required to calculate distance using pairwise function and then we can calculate similarities as 1-distance

In [19]:
user_sim = 1 - pairwise_distances(movies_collab.values,metric = 'cosine')

In [20]:
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [52]:
# converting the calculated similarities to dataframe format

In [21]:
user_sim_df = pd.DataFrame(user_sim)

In [22]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0


In [23]:
# Mapping the index and columns to actual unique user ids

In [24]:
user_sim_df.index = movies.userId.unique()
user_sim_df.columns = movies.userId.unique()

In [25]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [26]:
# After reading the above matrix we can see that there is a maximum similarity between the same users for eg.: similartity between 3 and 3 is 1, likewise between 4 and 4 it is 1, we dont want to calculate teh silirity between same people we are required to camculate it between different users so we will fill the diagonal elements of that matrix with zero by using fill_diagonal function from numpy

In [27]:
np.fill_diagonal(user_sim,0)

In [28]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [29]:
# finding the most similar users by using idxmax function

In [30]:
user_sim_df.idxmax(axis = 1)

3         11
6        168
8         16
10      4047
11         3
        ... 
7044      80
7070    1808
7080     708
7087       8
7105    4110
Length: 4081, dtype: int64

Inference : User 3 and 11 has maximum similarity 
    user 6 and 168 are similar 
    user 8 and 16 are similar and so on..

In [31]:
# Displaying the similar records 

In [34]:
movies[(movies['userId'] == 6) | (movies['userId'] ==11)]

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
4,11,Toy Story (1995),4.5
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0
7446,11,GoldenEye (1995),2.5


In [74]:
user_1 = movies[movies.userId==11]

In [75]:
user_2 = movies[movies.userId==39]

In [None]:
# If I want to give recommendation to user2 based on similarties, I will create recommendation matrix 

In [77]:
pd.merge(user_1,user_2, on ='movie', how = 'outer')

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,11.0,Toy Story (1995),4.5,39.0,5.0
1,11.0,GoldenEye (1995),2.5,,
2,,Heat (1995),,39.0,5.0


Inference: User 11th and user 39th both watched movie Toy story and also rated, user 11th watched Golden eye but user 39th did not watched Golden eye so we can recommend user 39th to watch golden eye and similarly we can recommend user 11th to watch heat.