# Importing Libraries and Data


In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# load data 
credits = pd.read_csv("data/tmdb_5000_credits.csv")
movies = pd.read_csv("data/tmdb_5000_movies.csv")

# Data Exploration

In [3]:
# Check few observations of credits dataframe
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# Check few observations of movies dataframe
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
# Check shape of both dataframes
credits.shape, movies.shape

((4803, 4), (4803, 20))

# Merge Dataframes

In [6]:
# Find common columns in both dataframes
def compare_column_names(df1, df2):
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    return cols1.intersection(cols2)

print(compare_column_names(movies, credits))

{'title'}


In [7]:
# Merge two dataframes on the column 'title'
df = movies.merge(credits, on='title')

# view
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# Preprocessing Data

In [8]:
# Thought process what columns are needed for recommendation
# Consider each coloum
# Write coloumns that are needed.
# ['movie_id', 'title', 'overview', cast', 'genres', 'keywords', 'crew']

In [9]:
# Select required columns
df = df[['movie_id', 'title', 'overview', 'cast', 'genres', 'keywords', 'crew']]
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
# Check for missing values
df.isnull().sum()

movie_id    0
title       0
overview    3
cast        0
genres      0
keywords    0
crew        0
dtype: int64

In [11]:
# Drop rows with missing values
df.dropna(inplace=True)
df.isnull().sum()

movie_id    0
title       0
overview    0
cast        0
genres      0
keywords    0
crew        0
dtype: int64

In [12]:
# check duplicat values
df.duplicated().sum()

0

In [13]:
# precprocessing steps
# cast column has dictionary in list of list 
# so make function to get key 'name' value 
# check fist list
df.cast[0]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [14]:
# Define a function to extract the 'name' key from the cast column
import ast 

def convert1(text):
    """
    Extract the first three 'name' keys from a list of dictionaries stored in the 'cast' column.
    
    Args:
    text (str): A string representation of a list of dictionaries
    
    Returns:
    list: A list containing the first three 'name' keys from the dictionaries in the input list
    """
    names_list = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            names_list.append(i['name'])
        counter += 1
    return names_list

In [15]:
# Apply the function and save in the same column
df['cast'] = df['cast'].apply(convert1)

In [16]:
# view
df.head(2)

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [17]:
# preprocessing 'genres' column
df.genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [18]:
# Define a function to extract the 'name' key from the genres column
def convert2(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


In [19]:
# Define a function to extract the 'name' key from the genres column
def convert2(text):
    return [i['name'] for i in ast.literal_eval(text)]


In [20]:
# Apply the function and save in the same column
df['genres'] = df['genres'].apply(convert2)

# view
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [21]:
# Preprocess 'keywords' column
df["keywords"] = df.keywords.apply(convert2)

# view
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [22]:
# Define a function to extract the 'name' key for the 'Director' job from the crew column
def fetch_director(text):
    """
    Extracts the 'name' key from the 'Director' job in a list of dictionaries 
    stored in the input text string. 
    
    Args: 
    text (str): A string representation of a list of dictionaries
    
    Returns: 
    list: A list containing the 'name' keys from the input list where the job is 'Director'
    """
    return [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director']


In [23]:
# Apply the function and save in the same column
df["crew"] = df.crew.apply(fetch_director)

# view
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[Andrew Stanton]


In [24]:
# Preprocess 'overview' column
df.overview[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [25]:
# Split the 'overview' column into a list of words
df["overview"] = df.overview.apply(lambda x: x.split())

# view
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[Andrew Stanton]


In [26]:
# Define a function to remove spaces from each word in a list
def remove_spaces_from_words(word_list):
    """
    Removes spaces from each word in a list and returns a new list.
    
    Args: 
    word_list (list): A list of strings
    
    Returns: 
    list: A list containing the strings from the input list with spaces removed
    """
    return [word.replace(" ", "") for word in word_list]


In [27]:
# Apply the function to the 'genres', 'cast', 'keywords', and 'crew' columns
df["genres"] = df.genres.apply(remove_spaces_from_words)
df["cast"] = df.cast.apply(remove_spaces_from_words)
df["keywords"] = df.keywords.apply(remove_spaces_from_words)
df["crew"] = df.crew.apply(remove_spaces_from_words)
# view
df.head()

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[AndrewStanton]


In [28]:
# Combine the columns 'overview', 'cast', 'genres', 'keywords', and 'crew'
df["tags"] = df.overview + df.cast + df.genres + df.keywords + df.crew

# view
df.head(2)

Unnamed: 0,movie_id,title,overview,cast,genres,keywords,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."


In [29]:
# drop columns 
new_df = df.drop(columns=["overview", "cast" , "genres", "keywords", "crew"])
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [30]:
# remove list
new_df["tags"] = df.tags.apply(lambda x: " ".join(x))
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [31]:
new_df["tags"] = new_df.tags.str.lower()
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [32]:
# create CounterVector objet
cv = CountVectorizer(max_features=5000, stop_words="english")

In [33]:
# apply CounterVector

vector = cv.fit_transform(new_df.tags).toarray()
vector.shape

(4806, 5000)

In [34]:
# words are not to its base form so need to convert to base form
# get feature names
feature_names = cv.vocabulary_
feature_names

{'century': 730,
 'marine': 2779,
 'dispatched': 1279,
 'moon': 2984,
 'unique': 4698,
 'mission': 2946,
 'torn': 4564,
 'following': 1722,
 'orders': 3201,
 'protecting': 3522,
 'alien': 158,
 'civilization': 833,
 'zoesaldana': 4994,
 'sigourneyweaver': 4080,
 'action': 76,
 'adventure': 108,
 'fantasy': 1611,
 'sciencefiction': 3925,
 'cultureclash': 1072,
 'future': 1794,
 'society': 4147,
 'spacetravel': 4185,
 'futuristic': 1795,
 'romance': 3808,
 'space': 4179,
 'tribe': 4625,
 'soldier': 4152,
 'battle': 422,
 '3d': 40,
 'captain': 662,
 'long': 2661,
 'believed': 451,
 'dead': 1141,
 'come': 902,
 'life': 2612,
 'headed': 2009,
 'edge': 1390,
 'earth': 1379,
 'turner': 4647,
 'elizabeth': 1414,
 'quite': 3569,
 'johnnydepp': 2393,
 'orlandobloom': 3209,
 'keiraknightley': 2461,
 'ocean': 3162,
 'drugabuse': 1345,
 'exoticisland': 1547,
 'loveofone': 2700,
 'slife': 4130,
 'traitor': 4599,
 'shipwreck': 4053,
 'ship': 4052,
 'alliance': 165,
 'afterlife': 121,
 'fighter': 1663

In [35]:
# apply PorterStemmer to reduce words to base form
ps = PorterStemmer()

In [36]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [37]:
new_df["tags"] = new_df.tags.apply(stem)

In [38]:
# check stem function
stem("killer killed killing")

'killer kill kill'

In [39]:
# apply CounterVector again

vector = cv.fit_transform(new_df.tags).toarray()
vector.shape

(4806, 5000)

In [40]:
# check words are converted to base form
feature_names = cv.vocabulary_
feature_names

{'century': 736,
 'marin': 2803,
 'dispatch': 1286,
 'moon': 3013,
 'pandora': 3278,
 'uniqu': 4680,
 'mission': 2980,
 'becom': 441,
 'torn': 4552,
 'follow': 1729,
 'order': 3231,
 'protect': 3530,
 'alien': 157,
 'zoesaldana': 4993,
 'sigourneyweav': 4074,
 'action': 79,
 'adventur': 106,
 'fantasi': 1629,
 'sciencefict': 3924,
 'cultureclash': 1084,
 'futur': 1807,
 'societi': 4147,
 'spacetravel': 4193,
 'futurist': 1809,
 'romanc': 3805,
 'space': 4187,
 'tribe': 4603,
 'alienplanet': 160,
 'soldier': 4154,
 'battl': 424,
 '3d': 47,
 'jamescameron': 2327,
 'captain': 676,
 'long': 2696,
 'believ': 453,
 'dead': 1150,
 'ha': 1962,
 'come': 913,
 'life': 2647,
 'head': 2020,
 'edg': 1407,
 'earth': 1395,
 'turner': 4626,
 'elizabeth': 1435,
 'noth': 3168,
 'quit': 3568,
 'johnnydepp': 2429,
 'orlandobloom': 3238,
 'keiraknightley': 2499,
 'ocean': 3194,
 'drugabus': 1357,
 'exoticisland': 1571,
 'loveofone': 2729,
 'slif': 4126,
 'traitor': 4582,
 'shipwreck': 4050,
 'ship': 4049,


In [41]:
# create similarity object 
similarity = cosine_similarity(vector)
similarity.shape

(4806, 4806)

In [42]:
# check index with similarity
similarity[2]

array([0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
       0.        ])

In [43]:
# now find index of movie name in data frame
new_df[new_df["title"] == "The Lego Movie"].index[0]

744

In [44]:
# now give index of movie to similarity it return cosine distance with other movies
similarity[new_df[new_df["title"] == "The Lego Movie"].index[0]]

array([0.06488857, 0.06859943, 0.03535534, ..., 0.02773501, 0.        ,
       0.        ])

In [45]:
# now sort these score in decsendind order but keep indx position and also in key sort bases of second position
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])

[(0, 1.0000000000000002),
 (1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574),
 (582, 0.24511108480187255),
 (1204, 0.24455799402225925),
 (1194, 0.23179316248638276),
 (778, 0.23174488732966073),
 (4048, 0.2278389747471728),
 (1920, 0.2252817784447915),
 (61, 0.22269966704152225),
 (2786, 0.21853668936906193),
 (172, 0.21239769762143662),
 (972, 0.2108663315950723),
 (322, 0.2105263157894737),
 (2333, 0.20443988269091456),
 (3608, 0.20437977982832192),
 (260, 0.20395079136182276),
 (151, 0.2029530274475215),
 (4192, 0.2029530274475215),
 (1444, 0.20277677641345318),
 (74, 0.2024645717996314),
 (1089, 0.2020475485519274),
 (3675, 0.1979082783981174),
 (973, 0.19767387315371682),
 (577, 0.1976738731537168),
 (47, 0.19672236884115843),
 (2971, 0.1925214071641298),
 (942, 0.19134594929397597),
 (495, 0.19088542889273336),
 (1201, 0.19088542889273336),
 (305, 0.19007487139298027),
 (4, 0.18929940971

In [46]:
# define recommendation function
def recommender(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)
    

In [47]:
recommender('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [48]:
# save model
pickle.dump(new_df, open('model/movie_list.pkl', 'wb'))
pickle.dump(similarity, open('model/similarity.pkl', 'wb'))
