In [1]:
#import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Image

In [2]:
# Load the movie metadata from disk and preview the first two records.
movies_csv = "movie_dataset.csv"
df = pd.read_csv(movies_csv)
df.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [3]:
# Show the name of every column in the dataset.
column_names = df.columns.values
print(column_names)

['index' 'budget' 'genres' 'homepage' 'id' 'keywords' 'original_language'
 'original_title' 'overview' 'popularity' 'production_companies'
 'production_countries' 'release_date' 'revenue' 'runtime'
 'spoken_languages' 'status' 'tagline' 'title' 'vote_average' 'vote_count'
 'cast' 'crew' 'director']


In [4]:
# Dataset dimensions as (number of rows, i.e. movies, number of columns).
df.shape

(4803, 24)

In [5]:
# Columns that carry the main content of a movie; only these are used to
# build the text that movies are compared on.
features = [
    'keywords',
    'cast',
    'genres',
    'director',
]

In [6]:
# Report whether any cell in the selected feature columns is missing.
missing_mask = df[features].isnull()
missing_mask.values.any()

True

In [7]:
# Clean and preprocess the data: replace missing values in every feature
# column with an empty string (all feature columns handled in one assignment).
df[features] = df[features].fillna('')

In [8]:
# Merge the descriptive columns of one movie into a single piece of text.
def combine_features(row):
    """Join the keywords, cast, genres and director of ``row`` with single spaces.

    ``row`` must support lookup by column name and hold string values for
    the four fields; the fields appear in the result in the order
    keywords, cast, genres, director.
    """
    parts = (row['keywords'], row['cast'], row['genres'], row["director"])
    return " ".join(parts)

In [9]:
# Add a combined_features column: for every row, the merged keywords, cast,
# genres and director text produced by combine_features.
combined_text = df.apply(combine_features, axis=1)
df["combined_features"] = combined_text

In [10]:
# Preview the first three rows, now including the combined_features column.
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6 Dan...


In [11]:
# Convert the combined text of every movie into a matrix of token counts
# (one row per movie).
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(df["combined_features"])

In [12]:
# Compute the pairwise cosine similarity (cos(theta)) between the rows of the
# count matrix, i.e. between every pair of movies.
cosine_sim = cosine_similarity(count_matrix)

# Print the cosine similarity matrix.
print(cosine_sim)

[[1.         0.10540926 0.12038585 ... 0.         0.         0.        ]
 [0.10540926 1.         0.0761387  ... 0.03651484 0.         0.        ]
 [0.12038585 0.0761387  1.         ... 0.         0.11145564 0.        ]
 ...
 [0.         0.03651484 0.         ... 1.         0.         0.04264014]
 [0.         0.         0.11145564 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.04264014 0.         1.        ]]


In [13]:
# Get the number of rows and columns of the similarity matrix.
cosine_sim.shape

(4803, 4803)

<img src="cosine.JPG">

In [14]:
# Helper: look up a movie's title from its DataFrame index label.
def get_title_from_index(index):
    """Return the ``title`` of the first row of ``df`` whose index label equals ``index``.

    Raises IndexError when no row has that label.
    """
    matching_rows = df.loc[df.index == index]
    return matching_rows["title"].values[0]

#helper function to get the index from the title
def get_index_from_title(title):
    """Return the ``index`` column value of the first movie titled ``title``.

    An exact match is tried first. If there is none, the lookup falls back to
    a comparison that ignores letter case and surrounding whitespace, so
    hand-typed input such as " avatar" still resolves.

    Parameters:
        title: the movie title to look for (normally the text the user typed).

    Returns:
        The value stored in the ``index`` column of the first matching row.

    Raises:
        IndexError: if no movie matches, with a message naming the title.
    """
    matches = df["title"] == title
    if not matches.any():
        # Fallback: forgiving comparison. Missing titles compare as False.
        wanted = str(title).strip().lower()
        matches = df["title"].str.strip().str.lower() == wanted
    found = df.loc[matches, "index"]
    if found.empty:
        raise IndexError("No movie titled {!r} was found in the dataset".format(title))
    # NOTE(review): the result is used to address rows of the similarity
    # matrix, which assumes the ``index`` column equals the row position.
    return found.values[0]

def get_director_name(index):
    """Return the ``director`` of the first row of ``df`` whose index label equals ``index``.

    Raises IndexError when no row has that label.
    """
    matching_rows = df.loc[df.index == index]
    return matching_rows["director"].values[0]

In [15]:
# Ask for the title of a movie the user likes. Surrounding whitespace is
# stripped so a stray space does not make the title lookup fail.
movie_user_likes = input("Enter the Movie: ").strip()

Enter the Movie: Avatar


In [16]:
# Resolve the chosen title to the index used to select that movie's row of
# the similarity matrix.
movie_index = get_index_from_title(movie_user_likes)

In [17]:
# Pair every movie's position with its similarity score to the chosen movie.
similarity_row = cosine_sim[movie_index]
similar_movies = [(position, score) for position, score in enumerate(similarity_row)]

In [18]:
# Sort the list of similar movies by similarity score in descending order.
# The chosen movie is removed by its index, not by dropping the first entry:
# it is not guaranteed to sort first. Another movie with identical features
# ties with it at the top, and a movie whose combined text produced no tokens
# has similarity 0 to everything, itself included.

sorted_similar_movies = [
    entry
    for entry in sorted(similar_movies, key=lambda x: x[1], reverse=True)
    if entry[0] != movie_index
]

In [19]:
# Print the first entries of the sorted list of similar movies.
# Slicing (rather than indexing 0..4) keeps this cell from raising IndexError
# when fewer candidates than requested are available.
top_n = 5  # number of recommendations to display
print("Top " + str(top_n) + " Similar Movies To " + movie_user_likes + " are:-")
for rank, (candidate_index, _score) in enumerate(sorted_similar_movies[:top_n], start=1):
    print("{}. {} by {}".format(
        rank,
        get_title_from_index(candidate_index),
        get_director_name(candidate_index),
    ))

Top 5 Similar Movies To Avatar are:-
1. Guardians of the Galaxy by James Gunn
2. Aliens by James Cameron
3. Star Wars: Clone Wars: Volume 1 by Genndy Tartakovsky
4. Star Trek Into Darkness by J.J. Abrams
5. Star Trek Beyond by Justin Lin
