Exercise Prompt Hints
- Key step: how to combine movie data into a single string
- Recall TfidfVectorizer expects one string per 'document'
- Transform (into a vectors) the strings using TF-IDF
- Assume the query is always an existing movie in the database
- E.g. query = "Scream 3", then recommend other movies based on this
- Get the TF-IDF representation of Scream 3
- Compute similarity between Scream 3 and all other vectors
- Sort by similarity
- Print out the top 5 closest movies
- Try movies from other genres

In [None]:
# Gathering data
!wget https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

In [None]:
# creating data frame
import numpy as np
import pandas as pd

df = pd.read_csv('tmdb_5000_movies.csv')
df.head()

In [27]:
# Creating a function to extract string to vectorize
import json

def extract_movie_string(df, movie_title):
    row_index = df[df["title"] == movie_title].index[0]
    movie_string = (
        " ".join(str(genre["name"]) for genre in json.loads(df.loc[row_index, "genres"]))
        + " " + " ".join(str(keyword["name"]) for keyword in json.loads(df.loc[row_index, "keywords"]))
        + " " + str(df.loc[row_index, "overview"])
        + " " + str(df.loc[row_index, "tagline"])
    )
    return movie_string


    

In [28]:
# Example
print(extract_movie_string(df, "Avatar"))


Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Enter the World of Pandora.


In [None]:
# Getting the movie strings

movie_strings = pd.DataFrame(
    [extract_movie_string(df, movie) for movie in df["title"]], 
    columns=["movie_string"]
) 
movie_strings.head()
    


In [33]:
# Creating vectorizer object
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

In [34]:
# Vectorizing movie strings
movie_vectors = tfidf.fit_transform(movie_strings["movie_string"])


(4803, 23490)


In [55]:
# Evaluating similarity
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def get_recommendations(movie_title, n_recommendations):
    movie_index = df[df["title"] == movie_title].index[0]
    
    distances = abs(cosine_similarity(movie_vectors[movie_index], movie_vectors).reshape((-1,)))
    movie_indices = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:n_recommendations+1]
    recommended_movies = [df.iloc[i[0]]["title"] for i in movie_indices]
    return recommended_movies

In [56]:
# Testing
print(get_recommendations("Avatar", 5))
print(get_recommendations("Pirates of the Caribbean: At World's End", 5))
print(get_recommendations("The Dark Knight Rises", 5))
print(get_recommendations("Spectre", 5))
print(get_recommendations("The Avengers", 5))



['Mission to Mars', 'Aliens', 'Moonraker', 'Alien³', 'Alien']
["Pirates of the Caribbean: Dead Man's Chest", 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Pirates of the Caribbean: On Stranger Tides', 'The Pirates! In an Adventure with Scientists!', "Nim's Island"]
['The Dark Knight', 'Batman Returns', 'Batman', 'Batman', 'Batman Forever']
['Restless', 'Never Say Never Again', 'Dr. No', 'Skyfall', 'From Russia with Love']
['Avengers: Age of Ultron', 'X-Men', 'Fantastic Four', 'Captain America: Civil War', 'X-Men: Apocalypse']


In [57]:
# Refactorting

class Recommender:    
    def __init__(self, df):
        self.df = df
        self.tfidf = TfidfVectorizer(stop_words="english")
        self.__create_movie_vectors()
        
    def __extract_movie_string(self, movie_title):
        row_index = self.df[self.df["title"] == movie_title].index[0]
        movie_string = (
            " ".join(str(genre["name"]) for genre in json.loads(self.df.loc[row_index, "genres"]))
            + " " + " ".join(str(keyword["name"]) for keyword in json.loads(self.df.loc[row_index, "keywords"]))
            + " " + str(self.df.loc[row_index, "overview"])
            + " " + str(self.df.loc[row_index, "tagline"])
        )
        return movie_string
    
    def __create_movie_strings(self):
        self.movie_strings = pd.DataFrame(
            [self.__extract_movie_string(movie) for movie in self.df["title"]], 
            columns=["movie_string"]
        ) 
    
    def __create_movie_vectors(self):
        self.__create_movie_strings()
        self.movie_vectors = self.tfidf.fit_transform(self.movie_strings["movie_string"])
        
    def get_recommendations(self, movie_title, n_recommendations):
        movie_index = self.df[self.df["title"] == movie_title].index[0]
        
        distances = abs(cosine_similarity(self.movie_vectors[movie_index], self.movie_vectors).reshape((-1,)))
        movie_indices = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:n_recommendations+1]
        recommended_movies = [self.df.iloc[i[0]]["title"] for i in movie_indices]
        return recommended_movies

In [58]:
# Testing
recommender = Recommender(df)
print(recommender.get_recommendations("Avatar", 5))
print(recommender.get_recommendations("Pirates of the Caribbean: At World's End", 5))
print(recommender.get_recommendations("The Dark Knight Rises", 5))
print(recommender.get_recommendations("Spectre", 5))
print(recommender.get_recommendations("The Avengers", 5))

['Mission to Mars', 'Aliens', 'Moonraker', 'Alien³', 'Alien']
["Pirates of the Caribbean: Dead Man's Chest", 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Pirates of the Caribbean: On Stranger Tides', 'The Pirates! In an Adventure with Scientists!', "Nim's Island"]
['The Dark Knight', 'Batman Returns', 'Batman', 'Batman', 'Batman Forever']
['Restless', 'Never Say Never Again', 'Dr. No', 'Skyfall', 'From Russia with Love']
['Avengers: Age of Ultron', 'X-Men', 'Fantastic Four', 'Captain America: Civil War', 'X-Men: Apocalypse']
