In [0]:
#description: Build a content based movie recommendation engine


In [0]:
#Import Libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [0]:
#load the data
df = pd.read_csv('https://raw.githubusercontent.com/randerson112358/Python/master/Movie_Recommender/movie_dataset.csv')

In [28]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6 Dan...
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan,dc comics crime fighter terrorist secret ident...
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton,based on novel mars medallion space travel pri...


In [5]:
#get the number of rows and columns in the given dataset
df.shape

(4803, 24)

In [11]:
#create a list of important column to keep
features =['keywords','cast','genres','director']

df[features].head(3)

Unnamed: 0,keywords,cast,genres,director
0,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes


In [0]:
#clean and process a data

for feature in features:
  df[feature] = df[feature].fillna('') #filling the missing value with empty string
  

In [0]:
#combine text in all columns and store them in new column
#create a function to combine the values of important columns into a single string
def combine_features(row):
  return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']


In [0]:
#apply function to each row in dataset and store the combined string into a new columns called combined_features
df['combined_features'] = df.apply(combine_features,axis=1)


In [0]:
#print the new combined data set
df.head(3)

In [0]:
#convert a collection of text to matrix of token counts
count_matrix = CountVectorizer().fit_transform(df['combined_features'])

In [0]:
print(count_matrix)

In [15]:

#get the cosine similarity matrix from the count_matrix
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.10540926 0.12038585 ... 0.         0.         0.        ]
 [0.10540926 1.         0.0761387  ... 0.03651484 0.         0.        ]
 [0.12038585 0.0761387  1.         ... 0.         0.11145564 0.        ]
 ...
 [0.         0.03651484 0.         ... 1.         0.         0.04264014]
 [0.         0.         0.11145564 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.04264014 0.         1.        ]]


In [16]:
#get the number of rows and columns in cosine_sim

cosine_sim.shape

(4803, 4803)

In [0]:
#helper function to get title from the index

def get_title_from_index(index):
  return df[df.index==index]['title'].values[0]

#helper function to get index from the title

def get_index_from_title(title):
  return df[df.title==title]['index'].values[0]

In [0]:
#get the title of the movies that the user likes

movie_user_like ='The Amazing Spider-Man'

#find that movies index
movie_index = get_index_from_title(movie_user_like)


In [21]:
movie_index

20

In [0]:
#Enumerate through all the similarity scores of 'The Amazing Spider-Man'
#and make a tuple of movie index and similarity scores.

#NOTE: we will return a list in the form of (movie index, similarity_score)

similar_movies = list(enumerate(cosine_sim[movie_index]))
similar_movies


In [0]:
#sort the list of similar movies according to the similarity scores in descending order

sorted_similar_movies = sorted(similar_movies, key= lambda x:x[1], reverse=True)[1:]  #except for the first element which is the movie itself




In [0]:
#print
sorted_similar_movies

In [32]:
#create a loop to print first 5 entries from the sorted similar movies list

i=0
print('top 5 movies similar to'+movie_user_like)
for element in sorted_similar_movies:
  print(get_title_from_index(element[0]))
  i+=1
  if i>5:
    break

top 5 movies similar toThe Amazing Spider-Man
The Amazing Spider-Man 2
Duma
Highlander: Endgame
Cold Mountain
Spider-Man 2
The Croods
