# Importing dependencies

In [2]:
import numpy as np
import pandas as pd
import difflib # To find the initial closest match of user input movie from the dataset
from sklearn.feature_extraction.text import TfidfVectorizer # To convert textual vector to numerical vector
from sklearn.metrics.pairwise import cosine_similarity # To use cosine similarity

# Data Collection and preprocessing


In [11]:
# Load the two TMDB tables and join them into a single frame keyed on the movie id.
df_credits = pd.read_csv("../Movie_recommendation_system/tmdb_5000_credits.csv")

# The movies table names its key "id"; rename it so that it lines up with the
# "movie_id" key used for the join below.
df_movies = pd.read_csv("../Movie_recommendation_system/tmdb_5000_movies.csv").rename(
    columns={"id": "movie_id"}
)

# Join on the shared key (pandas' default inner join), then discard the two
# suffixed title columns; later cells work with "original_title" instead.
movie_df = df_movies.merge(df_credits, on="movie_id").drop(columns=["title_x", "title_y"])

movie_df


# Fallback list: the ten titles with the highest vote counts, intended to be
# shown when the movie the user asks for is not present in the data set.
rating_df = movie_df.sort_values("vote_count", ascending=False)
top_10_movies = list(rating_df["original_title"].iloc[:10])
top_10_movies

['Inception',
 'The Dark Knight',
 'Avatar',
 'The Avengers',
 'Deadpool',
 'Interstellar',
 'Django Unchained',
 'Guardians of the Galaxy',
 'The Hunger Games',
 'Mad Max: Fury Road']

In [3]:
# Feature columns earmarked for the recommender.
# NOTE(review): no visible cell reads this list, and the cell that builds
# `combined_features` uses a different set of columns (it adds "overview" and
# "production_companies" and has no "director") — confirm which set is intended.
relevant_features = ['genres','keywords','tagline','cast','director']


# Replace missing values with an empty string so the text columns can be
# concatenated later. Only non-numeric columns are touched: writing '' into a
# numeric column would silently turn it into a mixed-type object column and
# break any later arithmetic or sorting on it.
text_columns = movie_df.select_dtypes(exclude="number").columns
movie_df[text_columns] = movie_df[text_columns].fillna('')


In [4]:
# Build one text blob per movie by joining the chosen columns with single
# spaces, in the order listed here.
feature_columns = ["genres", "keywords", "overview", "tagline", "cast", "production_companies"]

combined_features = movie_df[feature_columns[0]]
for feature_column in feature_columns[1:]:
    combined_features = combined_features + " " + movie_df[feature_column]



In [5]:
# Converting the textual data to numerical data
# The vectorizer is created with its default settings; it is fitted on the
# combined feature text in a later cell.
vectorizer = TfidfVectorizer()

In [6]:
# Fit the TF-IDF vectorizer on the combined text and transform that same text
# into the numerical feature matrix used for the similarity computation.
feature_numerical_vectors = vectorizer.fit_transform(combined_features)


# Applying cosine similarity

In [7]:
# Getting the similarity scores using cosine similarity
# Only one matrix is passed, so its rows are compared with each other; later
# cells read row i as the scores of movie i against every movie.
similarity = cosine_similarity(feature_numerical_vectors)


In [9]:
# Ask the user for a movie title. A later cell fuzzy-matches it against the
# titles in the data set, so it does not have to be typed exactly.
favt_movie_name = input("Enter your favorite movie name:")

Enter your favorite movie name:The dark knight


In [10]:
# Getting a list of all the movie names mentioned in the dataset
# (a plain Python list of "original_title" values, used as the candidate pool
# for the fuzzy title match).
list_of_all_movies = movie_df["original_title"].tolist()


In [11]:
# Finding closest match for the movie name given by the user.
# difflib returns the candidates best-first, and an empty list when no title
# is similar enough to the input.
find_match_movies = difflib.get_close_matches(favt_movie_name,list_of_all_movies)

if not find_match_movies:
    # Indexing an empty result would fail with a bare
    # "IndexError: list index out of range"; stop with a message the user can
    # act on, suggesting the most voted titles as alternatives.
    raise ValueError(
        "No movie similar to {!r} was found in the dataset. "
        "Try one of the most voted titles instead: {}".format(
            favt_movie_name, ", ".join(top_10_movies)
        )
    )

closest_match_movie = find_match_movies[0]
closest_match_movie

'The Dark Knight'

In [12]:
# Locate the matched title in the frame and keep the index of its first
# occurrence; that index selects the movie's row in the similarity matrix.
title_matches = (movie_df["original_title"] == closest_match_movie).to_numpy()
index_of_favt_movie = movie_df.index[title_matches][0]
index_of_favt_movie

65

In [13]:
# Take the similarity row that belongs to the user's movie (index_of_favt_movie).

# enumerate() pairs every score with its position in that row, giving a list of
# (movie index, similarity score) tuples, so the movie index is still known
# after the list is sorted by score.
similarity_score_of_favt_movie = list(enumerate(similarity[index_of_favt_movie]))



In [14]:
# Order the (index, score) pairs from highest to lowest score so the most
# similar movies come first. The unsorted list is left untouched.
sorted_similar_movies = list(similarity_score_of_favt_movie)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)


# Retrieving the movies most similar to the given movie

In [15]:
# Walk the ranked (index, score) pairs and print the first 15 titles together
# with their scores, skipping any entry whose title equals the matched movie.
print("Similar movies recommended for you!!\n\n")

all_titles = movie_df["original_title"]
printed_count = 0

for movie_position, similarity_value in sorted_similar_movies:
    candidate_title = all_titles.iloc[movie_position]

    # The movie the user asked about would otherwise rank first against itself.
    if candidate_title == closest_match_movie:
        continue

    if printed_count >= 15:
        break

    print(candidate_title, similarity_value)
    printed_count += 1

Similar movies recommended for you!!


The Dark Knight Rises 0.7428899434846056
American Gangster 0.7403957715940724
Spider-Man 3 0.729440031728997
The Life of David Gale 0.7257238624001825
Batman v Superman: Dawn of Justice 0.7206832420838649
Inherent Vice 0.7120768970580503
Django Unchained 0.7099627374198296
GoodFellas 0.7068113127335884
Gone Girl 0.7052289414861093
The Avengers 0.7018465420151908
15 Minutes 0.7016544600686186
Batman Begins 0.7009026288916638
Suicide Squad 0.7004651051039957
Hot Pursuit 0.6999328030553839
The Hangover 0.6998411983775945


# Designing the GUI part