In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [250]:
#loading the movies dataset
movies_data = pd.read_csv("movies.csv")
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Since the year of movie is given in title, extracting the year and making a new column of it. This will help in finding the similarity between the movies as the user may be interested in watching movie of a paricular year or more specifically a paricular time period. A new column in the dataset named 'year' is created.

In [251]:
movies_data['title'] = movies_data['title'].apply(lambda x: x.strip())  # removing whitespace at the end of title

In [252]:
# Extracting the year from movie title in a list
year_list = []
for movie in movies_data["title"]:
    year_list.append(movie[-5:-1])

In [253]:
# year column is added in the dataset
movies_data["year"] = year_list
movies_data.sample(5)

Unnamed: 0,movieId,title,genres,year
242,280,Murder in the First (1995),Drama|Thriller,1995
3683,5073,"Son's Room, The (Stanza del figlio, La) (2001)",Drama,2001
9060,142115,The Blue Planet (2001),Documentary,2001
1581,2120,Needful Things (1993),Drama|Horror,1993
2653,3551,Marathon Man (1976),Crime|Drama|Thriller,1976


In [254]:
movies_data.iloc[10,3]          # year column is in string

'1995'

#### The year column is in string. Converting into the int format. For some rows it will give error because some movie title are not having year in it. So the above logic of extracting year will extract characters which cannot be converted into int for those movies. Therefore, those movies are taken in another list by using try and except.

In [255]:
# Those years which cannot be converted in string is taken in another list
not_converted = []
for i in range(len(year_list)):
    try:
        year_list[i] = int(year_list[i])  # which can be converted into int
    except:
        not_converted.append(i)           # which cannot be converted into int
len(not_converted)

12

In [256]:
not_converted

[6059, 9031, 9091, 9138, 9179, 9259, 9367, 9448, 9514, 9515, 9525, 9611]

In [257]:
movies_data.iloc[6059]

movieId        40697
title      Babylon 5
genres        Sci-Fi
year            lon 
Name: 6059, dtype: object

In [258]:
# index and title of movies whose year is not converted into int
movies_without_year = []
for i in not_converted:
    movies_without_year.append(movies_data.iloc[i,1])
    print(i)
movies_without_year

6059
9031
9091
9138
9179
9259
9367
9448
9514
9515
9525
9611


['Babylon 5',
 'Ready Player One',
 'Hyena Road',
 'The Adventures of Sherlock Holmes and Doctor Watson',
 'Nocturnal Animals',
 'Paterson',
 'Moonlight',
 'The OA',
 'Cosmos',
 'Maria Bamford: Old Baby',
 'Generation Iron 2',
 'Black Mirror']

#### The movies which do not have year, for them imputation is done with the median value of year.

In [293]:
# finding median of years to impute missing values
med = []
for i in year_list:
    if type(i) is not str:
        med.append(i)
np.median(med)

1999.0

In [259]:
for i in not_converted:     # imputing the missing values with median of years
    year_list[i] = 1999

In [260]:
movies_data['year'] = year_list

In [261]:
movies_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [262]:
movies_data.iloc[6059]

movieId        40697
title      Babylon 5
genres        Sci-Fi
year            1999
Name: 6059, dtype: object

#### Now extracting all the genres which we have in the dataset which will help in finding the similarity among the movies as movies with the same genres are simillar to each other.

In [263]:
# Extracting list of uniqe genres in the entire genre columns
genres_list = []
for genre in movies_data['genres'] :
    genre_splitted = genre.split('|') # splitting by '|' as diffent genres are seperated by '|' in 'generes' column
    for item in genre_splitted :
        if item not in genres_list :
            genres_list.append(item)
            
print(genres_list)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir', '(no genres listed)']


In [264]:
# Initial values is set to zero for all new genre columns
for genre in genres_list :
    movies_data[genre] = 0
movies_data.sample(2)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
5613,27036,Merlin (1998),Action|Adventure|Drama|Fantasy|Romance,1998,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8770,128842,Dragonheart 3: The Sorcerer's Curse (2015),Action|Adventure|Fantasy,2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [265]:
# Movie[movie, Genre] = 1, if Genre is present in movie
for index , row in movies_data.iterrows() :
    for current_genre in genres_list :
        if current_genre in row['genres'] :
            movies_data.loc[index, current_genre] = 1

In [266]:
movies_data.sample(2)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
763,1005,D3: The Mighty Ducks (1996),Children|Comedy,1996,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7924,95497,Dragon Ball Z: Super Android 13! (Doragon bôru...,Action|Adventure|Animation,1992,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Taking the part of dataframe to find the similarity between the movies

In [267]:
movies_sub = movies_data.iloc[:,3:]
movies_sub.sample(2)

Unnamed: 0,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
2475,2000,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1582,1983,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


# Cosine similarity

In [268]:
cos_sim_movies = pd.DataFrame(cosine_similarity(movies_sub))
cos_sim_movies.index = movies_data["title"]
cos_sim_movies.columns = movies_data["title"]

In [269]:
cos_sim_movies.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,0.999999,0.999999,...,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,1.0,0.999999,0.999999,0.999999
Jumanji (1995),1.0,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,0.999999,0.999999,...,0.999999,0.999999,0.999999,0.999999,0.999999,0.999999,1.0,1.0,0.999999,0.999999
Grumpier Old Men (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,0.999999,1.0,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Waiting to Exhale (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,0.999999,0.999999,0.999999,...,0.999999,1.0,1.0,0.999999,0.999999,0.999999,1.0,1.0,0.999999,1.0
Father of the Bride Part II (1995),0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,1.0,1.0,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Since the value of year is much higher than the value of genres(0,1), year is almost singlehandedly deciding the cosine similarity. Therefore, standardizing the year of movies to bring it to almost same scale as of the genres to get the meaningful result.

### Standardizing year of movie

In [270]:
movies_sub["year"] = (movies_sub["year"] - movies_sub["year"].mean())/(movies_sub["year"].std())

In [271]:
movies_sub.head(5)

Unnamed: 0,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.020498,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.020498,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.020498,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.020498,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.020498,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [272]:
# cosine similarity on standardize year data
cos_sim_movies = pd.DataFrame(cosine_similarity(movies_sub))
cos_sim_movies.index = movies_data["title"]
cos_sim_movies.columns = movies_data["title"]

In [273]:
cos_sim_movies.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.774618,0.316314,0.258278,0.447289,0.000108,0.316314,0.632495,0.000188,0.258278,...,0.416535,0.264123,0.259665,0.307571,0.006783,0.578904,0.640529,0.007061,0.242026,0.437133
Jumanji (1995),0.774618,1.0,0.000171,0.00014,0.000243,0.00014,0.000171,0.816525,0.000243,0.333427,...,0.004537,0.006796,0.007038,0.008757,0.008757,0.253193,0.280145,0.009116,0.007879,-0.00227
Grumpier Old Men (1995),0.316314,0.000171,1.0,0.816525,0.707181,0.000171,1.0,0.00021,0.000297,0.000171,...,0.332057,0.008324,0.410541,0.010724,0.010724,0.310086,0.343095,0.011164,0.00965,0.691125
Waiting to Exhale (1995),0.258278,0.00014,0.816525,1.0,0.577431,0.00014,0.816525,0.000171,0.000243,0.00014,...,0.271133,0.340971,0.663397,0.008757,0.008757,0.253193,0.280145,0.377223,0.007879,0.564321
Father of the Bride Part II (1995),0.447289,0.000243,0.707181,0.577431,1.0,0.000243,0.707181,0.000297,0.00042,0.000243,...,0.46955,0.01177,0.580532,0.015165,0.015165,0.438481,0.485158,0.015787,0.013646,0.977295


# Reading the Ratings Dataset

In [274]:
ratings_data = pd.read_csv("ratings.csv")
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Finding user preferences :-
#### In this function, finding what the user likes to watch i.e. the movies which are rated high by the user.

In [275]:
def top_rated_watched_movies(user_id, top_n_movies=5):
    max_rating = max(ratings_data.loc[(ratings_data.userId==user_id), "rating"]) 
    if max_rating < 4:
        print("None liked so far!")
        return None
    else :
        movie_5 = ratings_data.loc[(ratings_data.userId==user_id) & (ratings_data.rating==5), "movieId"] 
        movie_4 = ratings_data.loc[(ratings_data.userId==user_id) & (ratings_data.rating==4), "movieId"]
        
        # If number of 5-rated movies are more than top_n_movies, then return a list of randomly selected movie
        if len(movie_5) > top_n_movies:
            return (list(np.random.choice(movie_5,top_n_movies, replace=False)))
        
        # If the total number of 5-rated movies & 4-rated movies are less than top_n_movies, then return a list of movie rated 4  and 5
        elif len(movie_5) + len(movie_4) < top_n_movies:
            return (list(movie_5)+list(movie_4))
        
        # otherwise return a list containing all the 5-rated movies and a few random selection 
        else: 
            return (list(movie_5) + list(np.random.choice (movie_4, top_n_movies-len(movie_5), replace=False)))


In [276]:
top_rated_watched_movies(10,6)   # top rated 6 movies of user 10

[33794, 71579, 8533, 79091, 81845, 91529]

# Finding Similar movies :-
#### In this function, finding the similar movies with respect to the given movie based on cosine similarity.

In [277]:
def get_similar_movies(movie_id, sim=0.95):

    # Get the movie title for the given movie id
    title= movies_data.loc[movies_data.movieId==movie_id, "title"]
    title = np.array(title)[0]

    # Extract the row containing the movie ID from the similarity matrix 
    similarity= cos_sim_movies.loc[title,:]

    # scores of all movies
    scores = np.array(similarity)

    # Related movies
    related= cos_sim_movies.loc[title, :][cos_sim_movies.loc[title,:]> sim].index

    return list(related)

In [278]:
get_similar_movies(10)

['GoldenEye (1995)',
 'Broken Arrow (1996)',
 'Cliffhanger (1993)',
 'Executive Decision (1996)',
 'Surviving the Game (1994)',
 'Rock, The (1996)',
 'Chain Reaction (1996)',
 'Maximum Risk (1996)',
 'Die Hard 2 (1990)',
 'Anaconda (1997)',
 'Con Air (1997)',
 'Hunt for Red October, The (1990)',
 'Tomorrow Never Dies (1997)',
 'View to a Kill, A (1985)',
 'Rambo: First Blood Part II (1985)',
 'Licence to Kill (1989)',
 'World Is Not Enough, The (1999)',
 'Mission: Impossible II (2000)',
 'Living Daylights, The (1987)',
 'Die Another Day (2002)',
 'Raiders of the Lost Ark: The Adaptation (1989)']

# Watchlist of user :-
#### In this function, finding the watchlist of the user it is not good idea to recommend the movies which are already watched by the user.

In [279]:
# The below function will return all the movies watched by an user 
def user_watch_list(user_id): 

    id = list(ratings_data.loc[ratings_data.userId==user_id, "movieId"])  # id's of all movies watched by a user
    watched_user = list()
    for j in id :
        watched_user.append((movies_data.loc[movies_data.movieId == j, ["title"]].iloc[0,0])) # appending the movie titles watched by user in list
    
    return watched_user

In [280]:
user_watch_list(2)

['Shawshank Redemption, The (1994)',
 'Tommy Boy (1995)',
 'Good Will Hunting (1997)',
 'Gladiator (2000)',
 'Kill Bill: Vol. 1 (2003)',
 'Collateral (2004)',
 'Talladega Nights: The Ballad of Ricky Bobby (2006)',
 'Departed, The (2006)',
 'Dark Knight, The (2008)',
 'Step Brothers (2008)',
 'Inglourious Basterds (2009)',
 'Zombieland (2009)',
 'Shutter Island (2010)',
 'Exit Through the Gift Shop (2010)',
 'Inception (2010)',
 'Town, The (2010)',
 'Inside Job (2010)',
 'Louis C.K.: Hilarious (2010)',
 'Warrior (2011)',
 'Dark Knight Rises, The (2012)',
 'Girl with the Dragon Tattoo, The (2011)',
 'Django Unchained (2012)',
 'Wolf of Wall Street, The (2013)',
 'Interstellar (2014)',
 'Whiplash (2014)',
 'The Drop (2014)',
 'Ex Machina (2015)',
 'Mad Max: Fury Road (2015)',
 'The Jinx: The Life and Deaths of Robert Durst (2015)']

# Content-Based Recommender System

In [281]:
# Creating a dataset which only has movieid and title
movie_list = movies_data[["movieId", "title"]] 
movie_list.head() 

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [282]:
# For a given user get the list of top-N rated watched movies
N=5
User = 2
top = top_rated_watched_movies(User, N)
print(top)

[60756, 122882, 106782, 131724, 89774]


In [283]:
movie_list.loc[movie_list.movieId==131724,:]

Unnamed: 0,movieId,title
8828,131724,The Jinx: The Life and Deaths of Robert Durst ...


In [284]:
# Get the most similar movie corresponding to the list of above movies 
i = 1 
sim_movies = get_similar_movies(top[i]) 
sim_movies

['Toy Story (1995)',
 'Antz (1998)',
 'Toy Story 2 (1999)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 "Emperor's New Groove, The (2000)",
 'Monsters, Inc. (2001)',
 'Wild, The (2006)',
 'Shrek the Third (2007)',
 'Tale of Despereaux, The (2008)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)']

In [285]:
# Get the watchlist of the user
user_watch_list(User)

['Shawshank Redemption, The (1994)',
 'Tommy Boy (1995)',
 'Good Will Hunting (1997)',
 'Gladiator (2000)',
 'Kill Bill: Vol. 1 (2003)',
 'Collateral (2004)',
 'Talladega Nights: The Ballad of Ricky Bobby (2006)',
 'Departed, The (2006)',
 'Dark Knight, The (2008)',
 'Step Brothers (2008)',
 'Inglourious Basterds (2009)',
 'Zombieland (2009)',
 'Shutter Island (2010)',
 'Exit Through the Gift Shop (2010)',
 'Inception (2010)',
 'Town, The (2010)',
 'Inside Job (2010)',
 'Louis C.K.: Hilarious (2010)',
 'Warrior (2011)',
 'Dark Knight Rises, The (2012)',
 'Girl with the Dragon Tattoo, The (2011)',
 'Django Unchained (2012)',
 'Wolf of Wall Street, The (2013)',
 'Interstellar (2014)',
 'Whiplash (2014)',
 'The Drop (2014)',
 'Ex Machina (2015)',
 'Mad Max: Fury Road (2015)',
 'The Jinx: The Life and Deaths of Robert Durst (2015)']

In [286]:
# movies in the similarity list which the user has not watched yet
not_watched = set(sim_movies)-set(user_watch_list(User))

In [289]:
not_watched = list(not_watched)
not_watched

['Shrek the Third (2007)',
 "Emperor's New Groove, The (2000)",
 'Monsters, Inc. (2001)',
 'Toy Story (1995)',
 'Tale of Despereaux, The (2008)',
 'Antz (1998)',
 'Wild, The (2006)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Toy Story 2 (1999)']

In [290]:
#Get the scores of the movies in the not_watched list from the similarity matrix WRT the selected movie

#movie_name = np.array(movie_list.loc[movie_list.movieId==top[i], "title"])[0] 
#cos_sim_movies.loc[movie_name, not_watched]

title
Shrek the Third (2007)                                     0.325736
Emperor's New Groove, The (2000)                           0.256366
Monsters, Inc. (2001)                                      0.267007
Toy Story (1995)                                           0.200330
Tale of Despereaux, The (2008)                             0.334587
Antz (1998)                                                0.234468
Wild, The (2006)                                           0.316606
Asterix and the Vikings (Astérix et les Vikings) (2006)    0.316606
Adventures of Rocky and Bullwinkle, The (2000)             0.256366
Toy Story 2 (1999)                                         0.245515
Name: Mad Max: Fury Road (2015), dtype: float64