In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Inline movie catalogue used by the content-based recommenders below.
# One CSV row per film: movie_id, title, genre, director, actors
# (multi-valued fields are space-separated inside a single column).
_MOVIES_CSV = """
movie_id,title,genre,director,actors
1,The Shawshank Redemption,Drama,Frank Darabont,Tim Robbins Morgan Freeman Bob Gunton
2,The Godfather,Crime Drama,Francis Ford Coppola,Marlon Brando Al Pacino James Caan
3,The Dark Knight,Action Crime Drama,Christopher Nolan,Christian Bale Heath Ledger Morgan Freeman
4,Pulp Fiction,Crime Drama,Quentin Tarantino,John Travolta Uma Thurman Samuel L. Jackson
5,The Lord of the Rings: The Return of the King,Adventure Fantasy,Peter Jackson,Elijah Wood Ian McKellen Viggo Mortensen
6,The Good the Bad and the Ugly,Western,Sergio Leone,Clint Eastwood Eli Wallach Lee Van Cleef
7,The Matrix,Science Fiction Action,The Wachowskis,Keanu Reeves Laurence Fishburne Carrie-Anne Moss
8,Inception,Science Fiction Action,Christopher Nolan,Leonardo DiCaprio Ken Watanabe Joseph Gordon-Levitt
9,The Silence of the Lambs,Thriller Crime,Jonathan Demme,Jodie Foster Anthony Hopkins Scott Glenn
10,Forrest Gump,Drama Romance,Robert Zemeckis,Tom Hanks Robin Wright Gary Sinise
11,Jurassic Park,Science Fiction Adventure,Steven Spielberg,Sam Neill Laura Dern Jeff Goldblum
12,The Lion King,Animation Musical Drama,Roger Allers Rob Minkoff,Matthew Broderick James Earl Jones Jeremy Irons
13,The Prestige,Drama Mystery Thriller,Christopher Nolan,Hugh Jackman Christian Bale Michael Caine
14,The Dark Knight Rises,Crime Thriller Action,Christopher Nolan,Christian Bale Tom Hardy Morgan Freeman
15,The Departed,Crime Drama Thriller,Martin Scorsese,Leonardo DiCaprio Matt Damon Jack Nicholson
16,A Few Dollars More,Western,Sergio Leone,Clint Eastwood Lee Van Cleef Eli Wallach
17,The Pianist,Drama War,Roman Polanski,Adrien Brody Thomas Kretschmann Frank Finlay
18,Gladiator,Action Adventure Drama,Ridley Scott,Russell Crowe Joaquin Phoenix Connie Nielsen
19,The Green Mile,Fantasy Crime Drama,Frank Darabont,Tom Hanks David Morse Michael Clarke Duncan
20,The Lord of the Rings: The Fellowship of the Ring,Fantasy Adventure,Peter Jackson,Elijah Wood Ian McKellen Orlando Bloom
21,The Lord of the Rings: The Two Towers,Fantasy Adventure,Peter Jackson,Elijah Wood Ian McKellen Viggo Mortensen
22,The Incredibles,Animation Action Adventure,Brad Bird,Craig T. Nelson Holly Hunter Samuel L. Jackson
23,Finding Nemo,Animation Adventure Comedy,Andrew Stanton Lee Unkrich,Albert Brooks Ellen DeGeneres Alexander Gould
24,The Passion of the Christ,Drama,Mel Gibson,Jim Caviezel Monica Bellucci Maia Morgenstern
25,The Aviator,Drama Biography,Martin Scorsese,Leonardo DiCaprio Cate Blanchett Kate Beckinsale
26,The Descent,Adventure Horror Thriller,Neil Marshall,Shauna Macdonald Natalie Mendoza Alex Reid
    """

# Parse the embedded CSV text into the DataFrame every later cell reads from.
df = pd.read_csv(StringIO(_MOVIES_CSV))

In [3]:
# Represent every movie's genre string as a TF-IDF vector (unigrams + bigrams).
# min_df=1 keeps every term, exactly like the former min_df=0, but recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['genre'])

# Pairwise cosine similarity between the genre vectors of all movies
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Later cells rebind `similarity`; keep a genre-specific reference so that
# genre_recommendations keeps ranking by genre no matter which cells ran since.
genre_similarity = similarity


def genre_recommendations(title, top_n=24):
    """Return the titles of the movies whose genres are most similar to `title`.

    Args:
        title (str): Exact title of a movie present in ``df['title']``. If the
            title occurs more than once, the first occurrence is used.
        top_n (int): Maximum number of recommendations to return. Default is 24.

    Returns:
        pandas.Series: Movie titles ordered from most to least similar
        (ties keep dataset order). The queried movie itself is never included.

    Raises:
        KeyError: If `title` is not in the dataset.
    """
    titles = df['title']
    # Work with row positions: rows of the similarity matrix follow the row
    # order of df, and the result is selected with .iloc.
    positions = np.flatnonzero((titles == title).to_numpy())
    if positions.size == 0:
        raise KeyError(f"No movie titled {title!r} in the dataset")
    idx = int(positions[0])

    # Stable descending sort so equally similar movies stay in dataset order.
    ranked = np.argsort(-genre_similarity[idx], kind='stable')
    # Exclude the queried movie explicitly. Dropping the first sorted entry is
    # wrong whenever another movie ties with it at similarity 1.0.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]


# What other movies are like LOTR: ROTK that I should watch?
genre_recommendations('The Lord of the Rings: The Return of the King').head(5)

19    The Lord of the Rings: The Fellowship of the Ring
20                The Lord of the Rings: The Two Towers
18                                       The Green Mile
17                                            Gladiator
21                                      The Incredibles
Name: title, dtype: object

In [4]:
# Represent every movie's director string as a TF-IDF vector (unigrams + bigrams).
# min_df=1 keeps every term, exactly like the former min_df=0, but recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['director'])

# Pairwise cosine similarity between the director vectors of all movies
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Other cells rebind `similarity`; keep a director-specific reference so that
# director_recommendations keeps ranking by director no matter which cells ran since.
director_similarity = similarity


def director_recommendations(title, top_n=20):
    """Return the titles of the movies whose directors are most similar to `title`.

    Args:
        title (str): Exact title of a movie present in ``df['title']``. If the
            title occurs more than once, the first occurrence is used.
        top_n (int): Maximum number of recommendations to return. Default is 20.

    Returns:
        pandas.Series: Movie titles ordered from most to least similar
        (ties keep dataset order). The queried movie itself is never included.

    Raises:
        KeyError: If `title` is not in the dataset.
    """
    titles = df['title']
    # Work with row positions: rows of the similarity matrix follow the row
    # order of df, and the result is selected with .iloc.
    positions = np.flatnonzero((titles == title).to_numpy())
    if positions.size == 0:
        raise KeyError(f"No movie titled {title!r} in the dataset")
    idx = int(positions[0])

    # Stable descending sort so equally similar movies stay in dataset order.
    ranked = np.argsort(-director_similarity[idx], kind='stable')
    # Exclude the queried movie explicitly. Movies sharing a director tie at
    # similarity 1.0, so dropping the first sorted entry could discard a real
    # recommendation and return the queried movie itself.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]


# What other movies directed by Christopher Nolan should I watch?
director_recommendations('The Dark Knight').head(3)

7                 Inception
12             The Prestige
13    The Dark Knight Rises
Name: title, dtype: object

In [5]:
# Represent every movie's cast string as a TF-IDF vector (unigrams + bigrams).
# min_df=1 keeps every term, exactly like the former min_df=0, but recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['actors'])

# Pairwise cosine similarity between the cast vectors of all movies
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Other cells rebind `similarity`; keep an actors-specific reference so that
# actors_recommendations keeps ranking by cast no matter which cells ran since.
actors_similarity = similarity


def actors_recommendations(title, top_n=20):
    """Return the titles of the movies whose casts are most similar to `title`.

    Args:
        title (str): Exact title of a movie present in ``df['title']``. If the
            title occurs more than once, the first occurrence is used.
        top_n (int): Maximum number of recommendations to return. Default is 20.

    Returns:
        pandas.Series: Movie titles ordered from most to least similar
        (ties keep dataset order). The queried movie itself is never included.

    Raises:
        KeyError: If `title` is not in the dataset.
    """
    titles = df['title']
    # Work with row positions: rows of the similarity matrix follow the row
    # order of df, and the result is selected with .iloc.
    positions = np.flatnonzero((titles == title).to_numpy())
    if positions.size == 0:
        raise KeyError(f"No movie titled {title!r} in the dataset")
    idx = int(positions[0])

    # Stable descending sort so equally similar movies stay in dataset order.
    ranked = np.argsort(-actors_similarity[idx], kind='stable')
    # Exclude the queried movie explicitly. Dropping the first sorted entry is
    # wrong whenever another movie ties with it at similarity 1.0.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]


# What other movies featuring Samuel Jackson should I watch?
actors_recommendations('The Incredibles').head(1)

3    Pulp Fiction
Name: title, dtype: object

In [7]:
from collections import defaultdict
from surprise import Dataset, SVD


def get_top_n(predictions, n=10):
    """
    Group rating predictions by user and keep each user's n best estimates.

    Args:
        predictions(iterable of 5-item records): Each record unpacks as
            (user id, item id, true rating, estimated rating, details), e.g.
            the list returned by the test method of an algorithm.
        n(int): How many recommendations to keep per user. Default is 10.
    Returns:
        A defaultdict(list) mapping each user id to a list of at most n
        (item id, estimated rating) tuples, highest estimate first. Equal
        estimates keep the order in which they appeared in `predictions`.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for prediction in predictions:
        uid, iid, _true_rating, estimate, _details = prediction
        per_user[uid].append((iid, estimate))

    # Rank each bucket by estimate, best first, and truncate it to n entries.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

# First train an SVD algorithm on the full MovieLens 100k dataset.
data = Dataset.load_builtin("ml-100k")
trainset = data.build_full_trainset()
# SVD starts from randomly initialised factors; a fixed seed makes the
# recommendations printed below reproducible across Restart & Run All.
algo = SVD(random_state=42)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

# We can see why user 6, who rated exciting movies like
# Star Wars, Shawshank Redemption, and Fargo a 5/5,
# would be recommended a movie like Courage Under Fire:
# it was highly rated by other users who rated exciting movies 5/5.

196 ['114', '408', '169', '963', '657', '275', '483', '318', '513', '694']
186 ['318', '114', '313', '604', '286', '64', '190', '603', '513', '480']
22 ['357', '135', '64', '98', '169', '408', '197', '480', '114', '199']
244 ['137', '127', '483', '134', '603', '12', '478', '511', '474', '606']
166 ['64', '272', '12', '169', '408', '318', '603', '511', '316', '479']
298 ['64', '169', '408', '313', '272', '165', '114', '657', '963', '12']
115 ['134', '258', '408', '474', '285', '272', '483', '179', '135', '923']
253 ['174', '251', '515', '169', '178', '313', '479', '520', '408', '657']
305 ['603', '114', '657', '513', '124', '1194', '647', '137', '606', '514']
6 ['603', '657', '661', '114', '963', '428', '251', '652', '150', '647']
62 ['408', '647', '615', '484', '1142', '661', '137', '923', '124', '317']
286 ['515', '479', '178', '194', '496', '657', '482', '427', '589', '197']
200 ['603', '480', '178', '64', '144', '302', '190', '19', '510', '651']
210 ['169', '64', '199', '408', '511'