In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# loading the movies dataset
movies_data = pd.read_csv("datasets/movies.csv")
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Since the year of movie is given in title, extracting the year and making a new column of it. This will help in finding the similarity between the movies as the user may be interested in watching movie of a paricular year or more specifically a paricular time period. A new column in the dataset named 'year' is created.

In [3]:
movies_data['title'] = movies_data['title'].apply(lambda x: x.strip())  # removing whitespace at the end of title

In [4]:
# Extracting the year from movie title in a list
year_list = []
for movie in movies_data["title"]:
    year_list.append(movie[-5:-1])

In [5]:
# year column is added in the dataset
movies_data["year"] = year_list
movies_data.sample(5)

Unnamed: 0,movieId,title,genres,year
3186,4294,"5,000 Fingers of Dr. T, The (1953)",Children|Fantasy|Musical,1953
886,1183,"English Patient, The (1996)",Drama|Romance|War,1996
5825,32179,Elevator to the Gallows (a.k.a. Frantic) (Asce...,Crime|Drama|Thriller,1958
5922,33834,Land of the Dead (2005),Action|Horror|Thriller,2005
3212,4338,Twelve O'Clock High (1949),Drama|War,1949


In [6]:
movies_data.iloc[10,3]          # year column is in string

'1995'

### The year column is in string. Converting into the int format. For some rows it will give error because some movie title are not having year in it. So the above logic of extracting year will extract characters which cannot be converted into int for those movies. Therefore, those movies are taken in another list by using try and except.

In [7]:
# Those years which cannot be converted in string is taken in another list
not_converted = []
for i in range(len(year_list)):
    try:
        year_list[i] = int(year_list[i])  # which can be converted into int
    except:
        not_converted.append(i)           # which cannot be converted into int
len(not_converted)

12

### There are 12 movies whose year is not present. Therefore, we will have to do some imputations.

In [8]:
not_converted

[6059, 9031, 9091, 9138, 9179, 9259, 9367, 9448, 9514, 9515, 9525, 9611]

In [9]:
movies_data.iloc[6059]

movieId        40697
title      Babylon 5
genres        Sci-Fi
year            lon 
Name: 6059, dtype: object

In [9]:
# index and title of movies whose year is not converted into int
movies_without_year = []
for i in not_converted:
    movies_without_year.append(movies_data.iloc[i,1])
    print(i, movies_data.iloc[i,1])

6059 Babylon 5
9031 Ready Player One
9091 Hyena Road
9138 The Adventures of Sherlock Holmes and Doctor Watson
9179 Nocturnal Animals
9259 Paterson
9367 Moonlight
9448 The OA
9514 Cosmos
9515 Maria Bamford: Old Baby
9525 Generation Iron 2
9611 Black Mirror


### The movies which do not have year, for them imputation is done with the median value of year.

In [10]:
# finding median of years to impute missing values
med = []
for i in year_list:
    if type(i) is not str:
        med.append(i)
np.median(med)

1999.0

In [11]:
for i in not_converted:     # imputing the missing values with median of years
    year_list[i] = 1999

In [12]:
movies_data['year'] = year_list

In [13]:
movies_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [14]:
movies_data.iloc[6059]

movieId        40697
title      Babylon 5
genres        Sci-Fi
year            1999
Name: 6059, dtype: object

### Now extracting all the genres which we have in the dataset which will help in finding the similarity among the movies as movies with the same genres are simillar to each other.

In [15]:
# Extracting list of uniqe genres in the entire genre columns
genres_list = []
for genre in movies_data['genres']:
    all_genres = genre.split('|') # splitting by '|' as diffent genres are seperated by '|' in 'generes' column
    for item in all_genres:
        if item not in genres_list:
            genres_list.append(item)    # appends the genre which is not present in list
            
print(genres_list)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir', '(no genres listed)']


In [18]:
# Initial values is set to zero for all new genre columns
for genre in genres_list :
    movies_data[genre] = 0
movies_data.sample()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
5475,26231,Performance (1970),Crime|Drama|Thriller,1970,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Movie[movie, Genre] = 1, if Genre is present in movie
for index , row in movies_data.iterrows():
    for current_genre in genres_list:
        if current_genre in row['genres']:
            movies_data.loc[index, current_genre] = 1

In [21]:
movies_data.sample(5)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
1673,2252,Hero (1992),Comedy|Drama,1992,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6659,57147,Cassandra's Dream (2007),Crime|Drama|Thriller,2007,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4360,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1402,1921,Pi (1998),Drama|Sci-Fi|Thriller,1998,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9124,146028,The Adventures of Sherlock Holmes and Dr. Wats...,Crime|Mystery,1981,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Taking the part of dataframe to find the similarity between the movies

In [22]:
movies_sub = movies_data.iloc[:,3:]
movies_sub.sample(2)

Unnamed: 0,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
7933,2009,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2384,1999,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Cosine similarity

In [72]:
cos_sim_movies = pd.DataFrame(cosine_similarity(movies_sub))        # cosine similarity
cos_sim_movies.index = movies_data["title"]
cos_sim_movies.columns = movies_data["title"]

In [73]:
cos_sim_movies.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,0.999999,0.999999,...,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,1.0,0.999999,0.999999,0.999999
Jumanji (1995),1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,0.999999,0.999999,...,0.999999,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,1.0,0.999999,0.999999
Grumpier Old Men (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,0.999999,1.0,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Waiting to Exhale (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,0.999999,0.999999,0.999999,...,0.999999,1.0,1.0,0.999999,0.999999,0.999999,1.0,1.0,0.999999,1.0
Father of the Bride Part II (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,1.0,1.0,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Since the value of year is much higher than the value of genres(0,1), year is almost singlehandedly deciding the cosine similarity. Therefore, standardizing the year of movies to bring it to almost same scale as of the genres to get the meaningful result.

## Standardizing year of movie

In [23]:
movies_sub["year"] = (movies_sub["year"] - movies_sub["year"].mean())/(movies_sub["year"].std())

In [25]:
movies_sub.head()

Unnamed: 0,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.020498,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.020498,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.020498,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.020498,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.020498,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# cosine similarity on standardize year data
cos_sim_movies = pd.DataFrame(cosine_similarity(movies_sub))
cos_sim_movies.index = movies_data["title"]
cos_sim_movies.columns = movies_data["title"]

In [27]:
cos_sim_movies.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.774618,0.316314,0.258278,0.447289,0.000108,0.316314,0.632495,0.000188,0.258278,...,0.416535,0.264123,0.259665,0.307571,0.006783,0.578904,0.640529,0.007061,0.242026,0.437133
Jumanji (1995),0.774618,1.0,0.000171,0.00014,0.000243,0.00014,0.000171,0.816525,0.000243,0.333427,...,0.004537,0.006796,0.007038,0.008757,0.008757,0.253193,0.280145,0.009116,0.007879,-0.00227
Grumpier Old Men (1995),0.316314,0.000171,1.0,0.816525,0.707181,0.000171,1.0,0.00021,0.000297,0.000171,...,0.332057,0.008324,0.410541,0.010724,0.010724,0.310086,0.343095,0.011164,0.00965,0.691125
Waiting to Exhale (1995),0.258278,0.00014,0.816525,1.0,0.577431,0.00014,0.816525,0.000171,0.000243,0.00014,...,0.271133,0.340971,0.663397,0.008757,0.008757,0.253193,0.280145,0.377223,0.007879,0.564321
Father of the Bride Part II (1995),0.447289,0.000243,0.707181,0.577431,1.0,0.000243,0.707181,0.000297,0.00042,0.000243,...,0.46955,0.01177,0.580532,0.015165,0.015165,0.438481,0.485158,0.015787,0.013646,0.977295


# Reading the Ratings Dataset

In [28]:
ratings_data = pd.read_csv("datasets/ratings.csv")
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Content-Based Recommender System

### Finding user preferences :-  In this function, finding what the user likes to watch i.e. the movies which are rated high by the user.

### Recently watched :- Preference is given to the recently watched movies which are highly rated by user by using timestamp data. Preference is given to recently watched movies because they are the latest choice of the user which can be different what user use to watch in past.

### Finding Similar movies :- Finding the similar movies with respect to the movies which are highly rated by the user based on cosine similarity.

### Watchlist of user :- finding the watchlist of the user as it is not good idea to recommend the movies which are already watched by the user.

### => Sorting the ratings dataset in descending order of rating and timestamp, it will be helpful to pick recently watched movies which are rated highly by the user.

In [45]:
sorted_data = ratings_data.sort_values(by=['rating','timestamp'], ascending=False)

In [46]:
sorted_data.loc[sorted_data.userId==1, :]

Unnamed: 0,userId,movieId,rating,timestamp
31,1,553,5.0,964984153
9,1,157,5.0,964984100
90,1,1298,5.0,964984086
201,1,3053,5.0,964984086
214,1,3448,5.0,964984054
...,...,...,...,...
76,1,1219,2.0,964983393
152,1,2389,2.0,964983094
170,1,2617,2.0,964982588
143,1,2253,2.0,964981775


### Creating a function content_based_recommendation which will take the user_id and no_of_movies_to_be_recommended as arguments. If no. of movies is not given by the user, by default 10 movies will be recommended. In the function, based on the user id, top 10 recently watched high rated movies by that user is picked. Then for the selected movies, similar movies  are selected which are having cosine similarity > 0.8. After that the movies which are already watched by user is removed. And rest movies are recommended to the user based on how many movies to be recommended.

In [78]:
def content_based_recommendation(user_id, no_of_movies_to_be_recommended = 10):
    
    ######### using the sorted data on timestamp so latest watched movies will be picked first #################
    max_rating = max(sorted_data.loc[(sorted_data.userId==user_id), "rating"]) 
    if max_rating < 4:
        print("None liked so far!")
        movie_rated = []
    else :
        movie_5 = sorted_data.loc[(sorted_data.userId == user_id) & (sorted_data.rating == 5), "movieId"] 
        movie_4 = sorted_data.loc[(sorted_data.userId == user_id) & (sorted_data.rating == 4), "movieId"]
        
        
        ################### Taking only top 10 rated movies by the user in this function ################
        
        # If number of 5-rated movies are more than 10, then return a list of 10 recently watched movie
        if len(movie_5) > 10:
            movie_rated = list(movie_5)[:10]
        
        # If the total number of 5-rated movies & 4-rated movies are less than 10, then return a list of all movies rated 4 and 5
        elif len(movie_5) + len(movie_4) < 10:
            movie_rated = list(movie_5) + list(movie4)
        
        # otherwise return a list containing all the 5-rated movies and  rest recently watched 4 rated movies
        else: 
            movie_rated = list(movie_5) + list(movie_4)[0:10-len(movie_5)]
            

    #################### Finding similar movies of the movies which user has rated high ##################
    
    sim_movies = []
    for movie_id in movie_rated:
        # Get the movie title for the given movie id
        title = movies_data.loc[movies_data.movieId == movie_id, "title"]
        title = np.array(title)[0]
        
        # Simillar movies in which cosine similarity value is greater than 0.8
        temp = list(cos_sim_movies.loc[title, :][cos_sim_movies.loc[title,:]> 0.8].index)
        for k in temp:
            sim_movies.append(k)

    ############################ Movies to be recommended to the user ########################################
    
    watched_id = list(sorted_data.loc[sorted_data.userId==user_id, "movieId"])  # id's of all movies watched by a user
    watched_movies = list()
    
    # appending movies watched by user in the list
    for j in watched_id :
        watched_movies.append((movies_data.loc[movies_data.movieId == j, ["title"]].iloc[0,0])) # appending the movie titles watched by user in list

    # movies which are similar to the movies watched by user and to be recommended
    # This list excludes the movies which are already watched by user
    not_watched = set(sim_movies)-set(watched_movies)
    
    recommended_movies = list(not_watched)
    
    # how many movies to be recommended
    if no_of_movies_to_be_recommended <= len(recommended_movies):
        return recommended_movies[0:no_of_movies_to_be_recommended]
    else:
        return recommended_movies
    

In [81]:
content_based_recommendation(2)

['Mutant Chronicles (2008)',
 'Fahrenheit 9/11 (2004)',
 'China Blue (2005)',
 'Wag the Dog (1997)',
 'Edukators, The (Die Fetten Jahre sind vorbei) (2004)',
 'Fifty Shades of Grey (2015)',
 'Kidnapping Mr. Heineken (2015)',
 'Chasers (1994)',
 'Before Midnight (2013)',
 'Civil Action, A (1998)']

In [80]:
content_based_recommendation(28,8)

['Wag the Dog (1997)',
 'Ordet (Word, The) (1955)',
 'Fifty Shades of Grey (2015)',
 'Vie en Rose, La (Môme, La) (2007)',
 'Skipped Parts (2000)',
 'Eat Drink Man Woman (Yin shi nan nu) (1994)',
 'Before Midnight (2013)',
 'Mark of Zorro, The (1940)']

### Further Improvement :- While selecting the similar movies to a movie watched by user, movies can be picked in decreasing order of there average rating, so that while recommending the movies top average rated movies get the priority.

# Collaborative Filtering Recommender System :-

### Creating a user-movie matrix which will give us the information that a user has given what ratings to all the movies. If the user has not rated the movie it will come as null value.

In [82]:
# Creating user-movie matrix
user_rating_df = ratings_data.pivot(index = 'userId', columns ='movieId', values = 'rating')
user_rating_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### Making the matrix mean centered corresponding to a user. It will scale the ratings given by all the users on same level. It is done by subtracting the mean of ratings given by the user from the all the ratings given by that particular user.

In [83]:
# making the matrix mean centered
user_rating_df = (user_rating_df.T - np.array(user_rating_df.mean(axis=1))).T

In [84]:
user_rating_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


### Replacing the null values with zero as we have to calculate similarity among users

In [85]:
# replacing the null values with zero
user_rating_df[user_rating_df.isnull()] = 0
user_rating_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,-0.366379,0.0,0.0,-0.366379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
user_rating_df.sum(axis = 1) # since it is mean centered, sum of all the ratings given by a user is zero

userId
1      4.263256e-14
2      6.217249e-15
3      7.105427e-15
4      4.440892e-14
5      4.884981e-15
           ...     
606   -1.847411e-13
607   -3.552714e-15
608    1.278977e-13
609    3.552714e-15
610   -1.598721e-13
Length: 610, dtype: float64

# User-User Cosine Similarity Matrix

In [87]:
cos_sim_users = pd.DataFrame(cosine_similarity(user_rating_df))
cos_sim_users.index = user_rating_df.index
cos_sim_users.columns = user_rating_df.index

In [88]:
cos_sim_users.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.01955
4,0.048419,-0.017164,-0.01126,1.0,-0.02962,0.013956,0.058091,0.002065,-0.005874,0.05159,...,-0.037687,0.063122,0.02764,-0.013782,0.040037,0.02059,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.02962,1.0,0.009111,0.010117,-0.012284,0.0,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278


In [89]:
# Extracting the movies in different dataset
movie_list = movies_data[["movieId", "title"]] 
movie_list.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


### Creating a fuction collaborative_filter_recommendation which is taking the user_id, no_of_similar_users and no. of movies to be recommended as argument. Firstly, for the given user, similar users are taken in order of most similar to less. Number of similar user is given in the function or by default it is 5. After this, top rated movies of these users are picked in a list. The movies which are already watched by the user are removed from the list. Rest all are recommended to the user on the basis of no. of movies to be recommended which is given in function or by default is 10.

In [94]:
def collaborative_filter_recommendation(user, no_of_similar_users = 5, no_of_movies_to_be_recommended = 10):

    # Get the user similar to the given user using cosine similarity,
    sim_user = list(cos_sim_users.sort_values(by = [user], ascending = False).index[1:no_of_similar_users+1])

    watched_movies = set()
    for i in sim_user:
        # taking only 5 and 4 star rated movie of similar user
        watched_id = list(ratings_data.loc[(ratings_data.userId == i) & (ratings_data.rating > 3), "movieId"])
        temp = list()
        for j in watched_id :
            temp.append((movie_list.loc[movie_list.movieId == j, ["title"]].iloc[0,0]))
        watched_movies = set(watched_movies).union(set(temp))    # wont allow insertion of already present movie
    
    # movies which are already watched by user
    watched_id = list(ratings_data.loc[ratings_data.userId == user, "movieId"])
    watched_title = list()
    for j in watched_id:
        watched_title.append((movie_list.loc[movie_list.movieId == j, ["title"]].iloc[0,0]))
    watched_user = set(watched_title)
    
    # taking movies which are not watched by the user but watched by the similar users who have rated 4 and 5
    recommended_movies =  watched_movies - watched_user
    recommended_movies = list(recommended_movies)
    
    # how many movies to be recommended
    if no_of_movies_to_be_recommended <= len(recommended_movies):
        return recommended_movies[0:no_of_movies_to_be_recommended]
    else:
        return recommended_movies

In [95]:
collaborative_filter_recommendation(2)

['Madagascar: Escape 2 Africa (2008)',
 'Batman: The Dark Knight Returns, Part 2 (2013)',
 'Tron: Legacy (2010)',
 'Sky Captain and the World of Tomorrow (2004)',
 'Triplets of Belleville, The (Les triplettes de Belleville) (2003)',
 'Incredibles, The (2004)',
 'Inside Out (2015)',
 'Social Network, The (2010)',
 'Good bye, Lenin! (2003)',
 'Elizabeth (1998)']

In [96]:
collaborative_filter_recommendation(28, 4, 5)

['Fahrenheit 9/11 (2004)',
 'Drop Zone (1994)',
 'Star Trek III: The Search for Spock (1984)',
 'Born on the Fourth of July (1989)',
 'Natural, The (1984)']

## Observations in my approach of collaborative Filtering :-

### If we are taking only the ratings given by users, and not making it mean centered, then we won't be able to differentiate between an easy movie rater and a tough movie rater.
### Problem with mean-centered ratings :- If there is a user A which has given rating 5(or any other number) to all the movies he watched, mean centering will make all the ratings zero. This will create problem in finding its similar user using cosine similarity as it will have cosine similarity of zero with all other users. Another problem in this approach is if user A and B has watched same movies, user A has given 5 to all movies while user B has given 1 to all those movies, they will be considered similar. In nutshell, if there are users which gives only 1 kind of rating(1-5) to all the movies they watch, they will be considered as similar.