Recommendation System

In [1]:
import pandas as pd
import numpy as np

In [2]:
movie = pd.read_csv('Movie.csv')

In [3]:
movie.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [4]:
movie.sort_values('userId')

Unnamed: 0,userId,movie,rating
2569,1,Jumanji (1995),3.5
3724,2,Grumpier Old Men (1995),4.0
0,3,Toy Story (1995),4.0
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0
...,...,...,...
6463,7117,Heat (1995),5.0
2567,7119,Toy Story (1995),5.0
2568,7120,Toy Story (1995),4.5
3723,7120,Jumanji (1995),4.0


In [5]:
# Number of unique users in the dataset
len(movie.userId.unique())

4081

In [6]:
movie['rating'].value_counts().sort_index()

0.5      57
1.0     212
1.5      61
2.0     542
2.5     277
3.0    2736
3.5     679
4.0    2660
4.5     374
5.0    1394
Name: rating, dtype: int64

In [7]:
len(movie.movie.unique())

10

In [8]:
movie.movie.value_counts()

Toy Story (1995)                      2569
GoldenEye (1995)                      1548
Heat (1995)                           1260
Jumanji (1995)                        1155
Sabrina (1995)                         700
Grumpier Old Men (1995)                685
Father of the Bride Part II (1995)     657
Sudden Death (1995)                    202
Waiting to Exhale (1995)               138
Tom and Huck (1995)                     78
Name: movie, dtype: int64

In [10]:
user_movies_df = movie.pivot(index = 'userId',
                                columns = 'movie',
                                values = 'rating')
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [11]:
user_movies_df.index = movie.userId.unique()

In [12]:
# Impute Those NANs with 0 values
user_movies_df.fillna(0,inplace = True)

In [13]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
3,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
10,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7044,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7070,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7080,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [14]:
# Calculating Cosine similarity between users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine ,correlation

In [15]:
user_sim = 1 - pairwise_distances(user_movies_df.values, metric = 'cosine')
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [16]:
# Store the results in a DataFrame
user_sim_df = pd.DataFrame(user_sim)

In [17]:
# Set th index and columns names to userIds
user_sim_df.index = movie.userId.unique()

In [18]:
user_sim_df.columns = movie.userId.unique()

In [19]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [21]:
np.fill_diagonal(user_sim , 0)

In [22]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [23]:
# Most simmilar Users
user_sim_df.idxmax(axis = 1)[0:5]

3       11
6      168
8       16
10    4047
11       3
dtype: int64

In [24]:
movie[(movie['userId'] == 6)|(movie['userId'] == 168)]

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
60,168,Toy Story (1995),4.5
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0


In [25]:
user_1 = movie[movie['userId'] == 6]

In [26]:
user_2 = movie[movie['userId'] == 168]

In [27]:
user_2.movie

60    Toy Story (1995)
Name: movie, dtype: object

In [28]:
user_1.movie

1              Toy Story (1995)
3725    Grumpier Old Men (1995)
6464             Sabrina (1995)
Name: movie, dtype: object

In [29]:
pd.merge(user_1, user_2, on = 'movie',how = 'outer')

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,6,Toy Story (1995),5.0,168.0,4.5
1,6,Grumpier Old Men (1995),3.0,,
2,6,Sabrina (1995),5.0,,
