In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the movie metadata (movieId, title, genres) into a DataFrame
movies_data = pd.read_csv(filepath_or_buffer="datasets/movies.csv")
movies_data.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Read the per-user ratings (userId, movieId, rating, timestamp) into a DataFrame
ratings_data = pd.read_csv(filepath_or_buffer="datasets/ratings.csv")
ratings_data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


#### Creating a user-movie matrix that records the rating each user has given to each movie. If a user has not rated a movie, the corresponding entry is a null value.

In [7]:
# Build the user x movie rating matrix: one row per userId, one column per
# movieId, with NaN wherever the user has not rated the movie
user_rating_df = ratings_data.pivot(
    values="rating",
    index="userId",
    columns="movieId",
)
user_rating_df.head(n=5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


#### Mean-centering the matrix per user. This puts the ratings given by all users on the same scale. It is done by subtracting the mean of a user's ratings from every rating given by that particular user.

In [8]:
# Mean-center each user's ratings: subtract the user's own average rating
# (NaN entries are skipped when averaging) from every rating that user gave.
# `sub(..., axis=0)` aligns the per-user means on the userId index, so no
# transposes or positional NumPy broadcasting are needed. Unrated entries
# stay NaN.
user_rating_df = user_rating_df.sub(user_rating_df.mean(axis=1), axis=0)

In [9]:
# Display the mean-centered matrix; unrated entries are still NaN at this point
user_rating_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


#### Replacing the null values with zero, as we have to calculate the similarity among users

In [10]:
# Replace the null (unrated) entries with zero so the similarity can be
# computed on a fully numeric matrix. `fillna` returns a new frame instead of
# mutating the existing one through a boolean mask.
user_rating_df = user_rating_df.fillna(0)
user_rating_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,-0.366379,0.0,0.0,-0.366379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
user_rating_df.sum(axis = 1) # since it is mean centered, each user's ratings sum to zero (up to floating-point rounding error)

userId
1      4.263256e-14
2      6.217249e-15
3      7.105427e-15
4      4.440892e-14
5      4.884981e-15
           ...     
606   -1.847411e-13
607   -3.552714e-15
608    1.278977e-13
609    3.552714e-15
610   -1.598721e-13
Length: 610, dtype: float64

# User-User Cosine Similarity Matrix

In [15]:
# Pairwise cosine similarity between the users' rating vectors, labelled by
# userId on both the rows and the columns
cos_sim_users = pd.DataFrame(
    cosine_similarity(user_rating_df),
    index=user_rating_df.index,
    columns=user_rating_df.index,
)

In [16]:
# Preview the user-user similarity matrix (rows and columns are both userId)
cos_sim_users.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.01955
4,0.048419,-0.017164,-0.01126,1.0,-0.02962,0.013956,0.058091,0.002065,-0.005874,0.05159,...,-0.037687,0.063122,0.02764,-0.013782,0.040037,0.02059,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.02962,1.0,0.009111,0.010117,-0.012284,0.0,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278


In [17]:
# Per-user summary statistics of the similarity scores
cos_sim_users.describe()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
count,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,...,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0
mean,0.015605,-0.005365,-0.002438,0.002604,0.017619,0.01427,0.024487,0.028104,0.011973,-0.009937,...,0.024978,0.037299,0.016117,0.017626,0.014432,0.029681,0.014784,0.033802,0.032449,0.032738
std,0.052842,0.047632,0.044891,0.050042,0.064098,0.059536,0.054187,0.065923,0.04941,0.053461,...,0.059205,0.067727,0.04952,0.062077,0.054678,0.053299,0.054402,0.0559,0.065978,0.05665
min,-0.105003,-0.166806,-0.070858,-0.108871,-0.190382,-0.107733,-0.14314,-0.153665,-0.157403,-0.138436,...,-0.216858,-0.177311,-0.073246,-0.16779,-0.188196,-0.084901,-0.106672,-0.077451,-0.126257,-0.065244
25%,-0.00588,-0.017924,-0.01367,-0.016809,-0.006918,-0.010556,0.000557,-0.000663,0.0,-0.032654,...,-0.00055,0.0,-0.004283,-0.003801,-0.00799,0.00441,-0.010152,0.004512,0.0,0.003201
50%,0.012211,0.0,0.0,2.6e-05,0.006508,0.004331,0.021885,0.020416,0.000487,-0.008209,...,0.021402,0.028802,0.009693,0.00977,0.008377,0.023997,0.010346,0.027625,0.022965,0.022511
75%,0.035697,0.000552,0.0,0.020005,0.03106,0.025372,0.04514,0.048283,0.023638,0.006355,...,0.049322,0.060898,0.030079,0.035431,0.034203,0.048854,0.036642,0.056185,0.051926,0.047181
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Keep only the id and title columns, used to look up movie names by movieId
movie_list = movies_data.loc[:, ["movieId", "title"]]
movie_list.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [22]:
def similar_user_movies_recommendation(user, top = 5, *, similarity = None, ratings = None, movies = None):
    """Recommend movies rated by the users most similar to ``user``.

    The ``top`` users with the highest similarity to ``user`` (excluding
    ``user`` itself) are selected, and every movie title rated by at least one
    of them that ``user`` has not rated is returned.

    Parameters
    ----------
    user : label
        The userId to recommend for. Must be a column of the similarity matrix.
    top : int, default 5
        Number of most-similar users to draw recommendations from. Values
        below 1 yield an empty recommendation set.
    similarity : pandas.DataFrame, optional
        Square user-user similarity matrix labelled by userId on both axes.
        Defaults to the notebook-level ``cos_sim_users``.
    ratings : pandas.DataFrame, optional
        Frame with at least ``userId`` and ``movieId`` columns. Defaults to the
        notebook-level ``ratings_data``.
    movies : pandas.DataFrame, optional
        Frame with at least ``movieId`` and ``title`` columns. Defaults to the
        notebook-level ``movie_list``.

    Returns
    -------
    set of str
        Titles rated by the similar users but not by ``user``.

    Raises
    ------
    KeyError
        If ``user`` is not a column of the similarity matrix.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    similarity = cos_sim_users if similarity is None else similarity
    ratings = ratings_data if ratings is None else ratings
    movies = movie_list if movies is None else movies

    # Remove the user explicitly before ranking. Relying on the user sorting to
    # position 0 breaks when another user ties at similarity 1.0: the user can
    # then end up as their own neighbour and a real neighbour is dropped.
    neighbours = (
        similarity[user]
        .drop(labels=user, errors="ignore")
        .nlargest(max(top, 0))
        .index
    )

    # One movieId -> title lookup table (first title wins for a repeated id),
    # instead of scanning the whole movie frame once per rated movie.
    title_by_id = movies.drop_duplicates(subset="movieId").set_index("movieId")["title"]

    def _titles_rated_by(user_ids):
        """Return the set of titles rated by any of ``user_ids``."""
        rated_ids = ratings.loc[ratings["userId"].isin(user_ids), "movieId"].unique()
        # Rated movieIds missing from the movie table are skipped rather than
        # raising, since they have no title to recommend.
        return set(title_by_id.reindex(rated_ids).dropna())

    return _titles_rated_by(neighbours) - _titles_rated_by([user])

In [23]:
# Example: recommend titles for user 5 using top = 5 similar users
similar_user_movies_recommendation(5, 5)

{'Ace Ventura: When Nature Calls (1995)',
 'Adventures of Priscilla, Queen of the Desert, The (1994)',
 'American President, The (1995)',
 'Aristocats, The (1970)',
 'Beverly Hillbillies, The (1993)',
 'Beverly Hills Cop III (1994)',
 'Billy Madison (1995)',
 'Birdcage, The (1996)',
 'Boys on the Side (1995)',
 'Circle of Friends (1995)',
 'Clerks (1994)',
 'Client, The (1994)',
 'Cliffhanger (1993)',
 'Coneheads (1993)',
 'Congo (1995)',
 'Copycat (1995)',
 'Crimson Tide (1995)',
 'Crow, The (1994)',
 'Dangerous Minds (1995)',
 'Dave (1993)',
 'Demolition Man (1993)',
 'Desperado (1995)',
 'Die Hard: With a Vengeance (1995)',
 'Disclosure (1994)',
 'Don Juan DeMarco (1995)',
 'Drop Zone (1994)',
 'Dumb & Dumber (Dumb and Dumber) (1994)',
 'Ed Wood (1994)',
 'Exit to Eden (1994)',
 'Father of the Bride Part II (1995)',
 'Firm, The (1993)',
 'First Knight (1995)',
 'Forget Paris (1995)',
 'Forrest Gump (1994)',
 'Free Willy (1993)',
 'French Kiss (1995)',
 'Ghost (1990)',
 'Grumpier Old

# Observations :-

* If we use only the raw ratings given by users, without mean-centering them, we won't be able to differentiate between an easy movie rater and a tough movie rater.
* Problems with mean-centered ratings :- If a user A has given the same rating, say 5 (or any other number), to all the movies they watched, mean-centering will make all of their ratings zero. This makes it impossible to find users similar to A with cosine similarity, because A will have a cosine similarity of zero with all other users. Another problem with this approach: if users A and B have watched the same movies, and A has given 5 to all of them while B has given 1 to all of them, both become all-zero vectors after mean-centering and are treated identically, even though their opinions are opposite. In a nutshell, all users who give only one kind of rating (1-5) to every movie they watch become indistinguishable from each other.