# Recommendation Systems

In [1]:
# import numpy and pandas
import numpy as np
import pandas as pd

In [2]:
# Read Movie.csv into a DataFrame (source noted as kaggle.com).
# Described as the MovieLens dataset - https://grouplens.org/datasets/movielens/100k/
df = pd.read_csv(filepath_or_buffer='Movie.csv')
df.head(n=5)

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [3]:
# Order the ratings by ascending userId, then preview the first rows.
df = df.sort_values('userId', ascending=True)
df.head(n=5)

Unnamed: 0,userId,movie,rating
2569,1,Jumanji (1995),3.5
3724,2,Grumpier Old Men (1995),4.0
0,3,Toy Story (1995),4.0
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0


In [4]:
# Count the distinct users and the distinct movie titles
# (movies are identified by their title in the 'movie' column).
n_users, n_movies = df['userId'].nunique(), df['movie'].nunique()
(n_users, n_movies)

(4081, 10)

In [5]:
# How often each rating value occurs, most frequent first.
df['rating'].value_counts(ascending=False)

3.0    2736
4.0    2660
5.0    1394
3.5     679
2.0     542
4.5     374
2.5     277
1.0     212
1.5      61
0.5      57
Name: rating, dtype: int64

In [6]:
# Number of ratings each movie received, most-rated first.
df['movie'].value_counts(ascending=False)

Toy Story (1995)                      2569
GoldenEye (1995)                      1548
Heat (1995)                           1260
Jumanji (1995)                        1155
Sabrina (1995)                         700
Grumpier Old Men (1995)                685
Father of the Bride Part II (1995)     657
Sudden Death (1995)                    202
Waiting to Exhale (1995)               138
Tom and Huck (1995)                     78
Name: movie, dtype: int64

In [7]:
# Reshape the long ratings table into a user x movie matrix:
# one row per userId, one column per movie title, each cell holding the rating
# (NaN where the user has not rated that movie).
df_pivot = df.pivot(
    values='rating',
    index='userId',
    columns='movie',
)
df_pivot.head(n=5)

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,


In [8]:
# Replace missing ratings with 0 so the matrix is fully numeric.
# Note: after this step an unrated movie is indistinguishable from a rating of 0.
# Rebind the name instead of using inplace=True, so this cell does not mutate
# the frame object that the previous cell displayed.
df_pivot = df_pivot.fillna(0)

In [9]:
# Preview the user x movie matrix after the missing ratings were filled.
df_pivot.head(n=5)

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Build the item-oriented view: movies as rows, users as columns.
# DataFrame.transpose() is the long form of the .T accessor.
item_pivot = df_pivot.transpose()
item_pivot.head(n=5)

userId,1,2,3,4,5,6,7,8,10,11,...,7105,7107,7108,7110,7113,7115,7116,7117,7119,7120
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,3.5,0.0,0.0,0.0
GoldenEye (1995),0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,2.5,...,2.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0
Grumpier Old Men (1995),0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
Heat (1995),0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0
Jumanji (1995),3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0


In [10]:
# import function to calculate pairwise distance
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [11]:
# Pairwise cosine *distance* between users (the rows of df_pivot).
# Cosine distance is 1 - cosine similarity, so 0 means the two users' rating
# vectors point in the same direction and smaller values mean more similar users.
# The values are shown rounded to 2 decimal places.
user_similarity = pairwise_distances(df_pivot, metric='cosine')
np.round(user_similarity, 2)

array([[0.  , 1.  , 1.  , ..., 1.  , 1.  , 0.45],
       [1.  , 0.  , 1.  , ..., 0.54, 1.  , 1.  ],
       [1.  , 1.  , 0.  , ..., 0.54, 0.  , 0.38],
       ...,
       [1.  , 0.54, 0.54, ..., 0.  , 0.54, 0.52],
       [1.  , 1.  , 0.  , ..., 0.54, 0.  , 0.38],
       [0.45, 1.  , 0.38, ..., 0.52, 0.38, 0.  ]])

In [12]:
# Dimensions of the user-by-user distance matrix.
np.shape(user_similarity)

(4081, 4081)

In [13]:
# Wrap the distance array in a DataFrame (default positional labels on both axes).
user_similarity_df = pd.DataFrame(data=user_similarity)

In [14]:
# Display the DataFrame built from the user-by-user distance array
user_similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,0.000000,1.000000,1.000000,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,0.000000,0.292893,1.000000,1.000000,1.000000e+00,1.000000e+00,1.000000,4.466284e-01
1,1.000000,0.000000,1.000000,1.000000,1.000000,0.609433,0.292893,0.384543,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000e+00,5.411685e-01,1.000000,1.000000e+00
2,1.000000,1.000000,0.000000,1.000000,1.000000,0.349055,1.000000,0.507634,0.000000,0.125843,...,1.000000,0.000000,1.000000,0.292893,1.000000,1.000000,2.474233e-01,5.411685e-01,0.000000,3.774570e-01
3,1.000000,1.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.384543,1.000000,0.611486,...,0.200000,1.000000,1.000000,1.000000,0.010051,1.000000,1.000000e+00,3.805775e-01,1.000000,1.000000e+00
4,0.000000,1.000000,1.000000,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,0.000000,0.292893,1.000000,1.000000,1.000000e+00,1.000000e+00,1.000000,4.466284e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,3.414954e-01,1.000000e+00,1.000000,1.000000e+00
4077,1.000000,1.000000,0.247423,1.000000,1.000000,0.510114,1.000000,0.629457,0.247423,0.342130,...,1.000000,0.247423,1.000000,0.467848,1.000000,0.341495,1.110223e-16,6.546941e-01,0.247423,5.314886e-01
4078,1.000000,0.541169,0.541169,0.380578,1.000000,0.298116,0.432225,0.110468,0.541169,0.431788,...,0.655876,0.541169,1.000000,0.675557,0.351114,1.000000,6.546941e-01,1.110223e-16,0.541169,5.239295e-01
4079,1.000000,1.000000,0.000000,1.000000,1.000000,0.349055,1.000000,0.507634,0.000000,0.125843,...,1.000000,0.000000,1.000000,0.292893,1.000000,1.000000,2.474233e-01,5.411685e-01,0.000000,3.774570e-01


In [15]:
# Label the rows and the columns of the matrix with the user ids.
# The matrix was computed from df_pivot, so its rows follow df_pivot's row order.
# Take the labels from df_pivot.index rather than from df['userId'].unique(),
# whose order only matches when df happens to be sorted by userId.
user_ids = df_pivot.index.to_numpy()
user_similarity_df.index = user_ids
user_similarity_df.columns = user_ids

In [16]:
# Top-left 5 x 5 corner of the labelled matrix.
user_similarity_df.head(5).iloc[:, :5]

Unnamed: 0,1,2,3,4,5
1,0.0,1.0,1.0,1.0,0.0
2,1.0,0.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0
4,1.0,1.0,1.0,0.0,1.0
5,0.0,1.0,1.0,1.0,0.0


In [17]:
# Set the diagonal (each user's distance to themselves, 0) to 1 so that a user
# is never returned as their own nearest neighbour when taking the row minimum.
np.fill_diagonal(user_similarity, 1)

# Rebuild the DataFrame from the modified array, keeping the existing labels.
# Relying on the DataFrame being a view of `user_similarity` is not safe: with
# pandas Copy-on-Write the constructor copies the array, so the in-place change
# above would not show up in `user_similarity_df`.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_similarity_df.index,
    columns=user_similarity_df.columns,
)
user_similarity_df.iloc[:5, :5]

Unnamed: 0,1,2,3,4,5
1,1.0,1.0,1.0,1.0,0.0
2,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0
5,0.0,1.0,1.0,1.0,1.0


In [19]:
# Most similar user for each user: the column label holding the smallest
# distance in that user's row. Only the first five users are shown.
user_similarity_df.idxmin(axis='columns').head(5)

1       5
2     112
3      10
4    2577
5       1
dtype: int64

In [20]:
# Inspect the raw ratings of one matched pair of users, e.g. user 3 and user 10.
df[df['userId'].isin([3, 10])]

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0


In [21]:
# Same inspection for another matched pair: user 4 and user 2577.
df[df['userId'].isin([4, 2577])]

Unnamed: 0,userId,movie,rating
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0
8015,2577,GoldenEye (1995),4.0
5662,2577,Heat (1995),3.0


Based on the user similarity matrix, for a specific user we can recommend movies that have been highly rated by the other users who are most similar to that user. This is the basic idea behind a user-based (collaborative filtering) recommendation system.