In [1]:
import pandas as pd
from math import pow, sqrt
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')

movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# users_id has list of unique userids, same goes to movies_id
users_id = ratings.userId.unique().tolist()
movies_id = ratings.movieId.unique().tolist()
print("# of users: ", len(users_id))
print('# of movies: ', len(movies_id))

# of users:  610
# of movies:  9724


In [4]:
# given user id and movie id, it returns its corresponding rating
def get_rating(userId, movieId):
    return (ratings.loc[(ratings.userId==userId) & (ratings.movieId==movieId), 'rating'].iloc[0]) # --> 4.0
print(get_rating(1,1))

4.0


In [5]:
# given user id, returns list of all movies(movieId) the user has rated
def get_movieIds(userId):
    return (ratings.loc[(ratings.userId==userId), 'movieId']).tolist()
print(get_movieIds(2))

[318, 333, 1704, 3578, 6874, 8798, 46970, 48516, 58559, 60756, 68157, 71535, 74458, 77455, 79132, 80489, 80906, 86345, 89774, 91529, 91658, 99114, 106782, 109487, 112552, 114060, 115713, 122882, 131724]


In [6]:
# given movie id, returns corresponding movie title
def get_movie_title(movieId):
    return (movies.loc[(movies.movieId == movieId), 'title'].iloc[0])
print(get_movie_title(2))

Jumanji (1995)


In [7]:
# calculate euclidean distance between 2 users by finding the common movies they have rated and applying euclidean
# distance formula between the 2 users' ratings.
from scipy.spatial import distance
def euclidean_dist(user1_id, user2_id):
    user1_movie_list = ratings.loc[ratings.userId == user1_id, 'movieId'].tolist()
    user2_movie_list = ratings.loc[ratings.userId == user2_id, 'movieId'].tolist()
    dist = 0
    for movie in user1_movie_list:
        if movie in user2_movie_list:
            user1_rating = get_rating(user1_id, movie)
            user2_rating = get_rating(user2_id, movie)
            dist += pow((user1_rating-user2_rating),2) # (x-y)^2
    dist = sqrt(dist)
    return dist
            
print("distance=",euclidean_dist(1,500))

distance= 7.810249675906654


In [8]:
# calculate pearson coefficient between 2 users by finding the common movies they have rated and applying pearson
# formula between the 2 users' ratings.
def pearson_coeff(user1_id, user2_id):
    movies_common  = []
    user1_movie_list = ratings.loc[ratings.userId == user1_id, 'movieId'].tolist()
    user2_movie_list = ratings.loc[ratings.userId == user2_id, 'movieId'].tolist()
    for movie in user1_movie_list:
        if movie in user2_movie_list:
            movies_common.append(movie)
    
    
    
    n = len(movies_common)
    if (n == 0):
        return 0
    sum_x = sum([get_rating(user1_id, movie) for movie in movies_common])
    sum_y = sum([get_rating(user2_id, movie) for movie in movies_common])
    sum_x2 = sum([pow(get_rating(user1_id, movie),2) for movie in movies_common])
    sum_y2 = sum([pow(get_rating(user2_id, movie),2) for movie in movies_common])
    
    numerator = sum([get_rating(user1_id, movie) * get_rating(user2_id, movie) for movie in movies_common]) - ((sum_x*sum_y)/n)
    denominator = sqrt((sum_x2-pow(sum_x, 2)/n) * (sum_y2 - pow(sum_y,2)/n))
    
    if denominator == 0:
        return 0
    return numerator/denominator        

print('{0}'.format(pearson_coeff(11, 30)))
    

0.3333333333333333


In [9]:
# returns recommended list of movies according to pearson coefficient by calculating similarity between the given users
# and all the other users and then sorting the list in the reverse order to get movies with highest correlations first.
def movie_recommendation(user_id):
    user_list = ratings.userId.unique().tolist()
    movies_watched_by_user_id = get_movieIds(user_id)
    # print("movies watched by user: ")
    # for movie in movies_watched_by_user_id:
    #     print(get_movie_title(movie))
    total = {}
    similarity_sum = {}
    for user in user_list[:100]:
        if user != user_id:
            r = pearson_coeff(user_id, user)
            if r > 0:
                for movie in get_movieIds(user):
                    # get movies that are not watched by user_id
                    if movie not in movies_watched_by_user_id or get_rating(user_id, movie) == 0:
                        total[movie] = 0
                        total[movie] += get_rating(user, movie) * r
                        similarity_sum[movie] = 0
                        similarity_sum[movie] += r
    ranking = [(tot/similarity_sum[movie],movie) for movie,tot in total.items()]
    # print(ranking)
    ranking.sort()
    # print(ranking)
    ranking.reverse()
    # print(ranking)
    recommendations = [get_movie_title(movie) for r, movie in ranking]
    return recommendations[:10]                        

In [10]:
# returns recommended list of movies according to euclidean distance by calculating similarity between the given users
# and all the other users and then sorting the list to get movies with lowest distance first.
def movie_recommendation_euclidean(user_id):
    user_list = ratings.userId.unique().tolist()
    movies_watched_by_user_id = get_movieIds(user_id)
    total = {}
    similarity_sum = {}
    for user in user_list[:100]:
        if user != user_id:
            r = euclidean_dist(user_id, user)
            if r > 0:
                for movie in get_movieIds(user):
                    # get movies that are not watched by user_id
                    if movie not in movies_watched_by_user_id or get_rating(user_id, movie) == 0:
                        total[movie] = 0
                        total[movie] += get_rating(user, movie) * r
                        similarity_sum[movie] = 0
                        similarity_sum[movie] += r
    ranking = [(tot/similarity_sum[movie],movie) for movie,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    recommendations = [get_movie_title(movie) for r, movie in ranking]
    return recommendations[:10]
    return 0

print("euclidean recommendation example")
print(movie_recommendation_euclidean(2))

euclidean recommendation example
['Deadpool 2 (2018)', 'Sherlock - A Study in Pink (2010)', 'Wonder (2017)', 'Coco (2017)', 'Three Billboards Outside Ebbing, Missouri (2017)', 'Black Mirror', 'Black Mirror: White Christmas (2014)', 'The Godfather Trilogy: 1972-1990 (1992)', 'Arrival (2016)', 'All Yours (2016)']


## Content Based Filtering
Below code we use to cauculate the similarity between two movies using cosine similarity.We take the genre as the feature.We combine it to a single string seperated by space and apply CountVectorizer on it. We make use of Cosine_similarity from sklearn to create a similarity matrix for each movie.
The diagnol elements are 1,as the movie is similar to itself.We index to the matrix for each movie and obtain the similarity vector to all movies.We sort it by the value in descending order and return the top 10 similar movies ,again getting the title via the index.
We Call this method for the all the movies in the user watched list.

In [11]:
#Reading movie csv file ,into a different dataframe
movies2 = pd.read_csv('./ml-latest-small/movies.csv')

#Cleaning and removing the year from the movie titles
split_values = movies2['title'].str.split("(", n = 1, expand = True)
movies2.title= split_values[0]

#Iterating through the rows and removing any white space characters at the end and processing genre ,to replace '|'
#with white space and converting it to lower characters.
for index,row in movies2.iterrows():
    movies2.loc[index,'title']=row['title'].rstrip()
    movies2.loc[index,'genres']=row['genres'].replace('|',' ').lower()
movies2

Unnamed: 0,movieId,title,genres
0,1,Toy Story,adventure animation children comedy fantasy
1,2,Jumanji,adventure children fantasy
2,3,Grumpier Old Men,comedy romance
3,4,Waiting to Exhale,comedy drama romance
4,5,Father of the Bride Part II,comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,action animation comedy fantasy
9738,193583,No Game No Life: Zero,animation comedy fantasy
9739,193585,Flint,drama
9740,193587,Bungo Stray Dogs: Dead Apple,action animation


In [12]:
#We set the index to title and delete other columns ,and have only one column genre which is used for vectorization.
del movies2['movieId']
movies2.set_index('title',inplace=True)
movies2

Unnamed: 0_level_0,genres
title,Unnamed: 1_level_1
Toy Story,adventure animation children comedy fantasy
Jumanji,adventure children fantasy
Grumpier Old Men,comedy romance
Waiting to Exhale,comedy drama romance
Father of the Bride Part II,comedy
...,...
Black Butler: Book of the Atlantic,action animation comedy fantasy
No Game No Life: Zero,animation comedy fantasy
Flint,drama
Bungo Stray Dogs: Dead Apple,action animation


In [13]:
#Calling CountVectorizer from sklearn and calculating cosine similarity which is stored in a matrix.Diagnol elments are one because 
#the movie is similar to itself.
count = CountVectorizer()
count_matrix = count.fit_transform(movies2['genres'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [14]:
#Creating a series for the movie titles so that they are matched with ordered numerical list used later
indices = pd.Series(movies2.index)

In [15]:
def content_recommendation(title, cosine_sim = cosine_sim):
    recommended_movies=[]
    
    #Finding the index of the title in the series created initially.
    idx = indices[indices == title].index[0]
    
    #Indexing into the similarity matrix and sorting the values in descending order.
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    #Selecting the top 10 most similar movies ,excluding the first index,as it is the movie itself.
    top_10_indexes = list(score_series.iloc[1:11].index)

    #Appending the movie titles and returning 
    for i in top_10_indexes:
        recommended_movies.append(list(movies2.index)[i])
    
    return recommended_movies

In [17]:
#Merging user rating and movies table,so that we get the movie title along with the movieId and the rating
usr_rat=pd.merge(ratings,movies,how='left',on='movieId')[['userId','movieId','rating','title']]
usr_rat

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"
...,...,...,...,...
100831,610,166534,4.0,Split (2017)
100832,610,168248,5.0,John Wick: Chapter Two (2017)
100833,610,168250,5.0,Get Out (2017)
100834,610,168252,5.0,Logan (2017)


In [18]:
#Function to add new user to the rating table ,which takes a userId movieIds and rating for the repective movies.
#We are adding a new user with id 612 who likes crime movies and has watched Goodfellas and Heat
def add_user(userid,movies,usr_rating):
    for mov_idx in range(len(movies)):
        row=[]
        row.append(userid)
        row.append(movies[mov_idx])
        row.append(usr_rating[mov_idx])
        row.append(time.time())
        ratings.loc[ratings.index.max() + 1]=row
#Adding the User
add_user(612,[1213,6],[4,5])
#Outputing the respective user in ratings table
ratings.loc[ratings['userId']==612]

Unnamed: 0,userId,movieId,rating,timestamp
100836,612.0,1213.0,4.0,1606710000.0
100837,612.0,6.0,5.0,1606710000.0


In [19]:
#Making recommendation for the new user using collaborative filtering.
movie_recommendation(612)

['The Martian (2015)',
 'Mad Max: Fury Road (2015)',
 'Star Trek Into Darkness (2013)',
 '21 Jump Street (2012)',
 'The Hunger Games (2012)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Inception (2010)',
 'How to Train Your Dragon (2010)',
 "She's Out of My League (2010)",
 'Avatar (2009)']

In [20]:
#Making recommendatio using content based approach
content_recommendation('Goodfellas')

['Gotti',
 'Virgin Spring, The',
 "Man Who Wasn't There, The",
 'Heist',
 'No Country for Old Men',
 'Pickpocket',
 'American Buffalo',
 'Bonnie and Clyde',
 'Shattered Glass',
 'Battles Without Honor & Humanity']

In [21]:
content_recommendation('Heat')

['Punisher, The',
 'Shaft',
 'xXx',
 'Dead Pool, The',
 'Bourne Ultimatum, The',
 'Ninja: Shadow of a Tear',
 'Furious 7',
 'Poker Night',
 'Headhunters',
 'Riki-Oh: The Story of Ricky']

In [22]:
movies=['Goodfellas','Heat']
final=[]
#Taking 5 movies from each movies watched
for mov in movies:
    final=final+content_recommendation(mov)[:6]
for mov in final:
    print(mov)

Gotti
Virgin Spring, The
Man Who Wasn't There, The
Heist
No Country for Old Men
Pickpocket
Punisher, The
Shaft
xXx
Dead Pool, The
Bourne Ultimatum, The
Ninja: Shadow of a Tear
