# Movie Recommendation System

### Import all dependencies

In [4]:
import numpy as np
import pandas as pd
import difflib 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Data Collection and Preprocessing

In [5]:
# Read the TMDB movies CSV into a DataFrame.
# Dataset source: https://kaggle.com/datasets/muhammetgamal5/tmdbmoviescsv
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer="tmdb-movies.csv")

In [6]:
# preview the first five rows of the loaded dataset
data.head(n=5)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/15,5562,6.5,2015,137999900.0,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/15,6185,7.1,2015,137999900.0,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/15,2480,6.3,2015,101200000.0,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/15,5292,7.5,2015,183999900.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/15,2947,7.3,2015,174799900.0,1385749000.0


In [7]:
# report the dataset dimensions as (rows, columns)
print(f"Shape of the dataset: {data.shape}")

Shape of the dataset: (10866, 21)


In [8]:
# count the missing values in every column
# The heading and the counts are printed separately: passing both to a single
# print() call inserts a space after the newline, which shifts the first row
# of the table out of alignment.
print("Missing values per column:")
print(data.isnull().sum())

Missing values per column:
 id                         0
imdb_id                   10
popularity                 0
budget                     0
revenue                    0
original_title             0
cast                      76
homepage                7930
director                  44
tagline                 2824
keywords                1493
overview                   4
runtime                    0
genres                    23
production_companies    1030
release_date               0
vote_count                 0
vote_average               0
release_year               0
budget_adj                 0
revenue_adj                0
dtype: int64


In [9]:
# Potential Features in the Dataset:
# id, imdb_id, popularity, budget, revenue, original_title,
# cast, homepage, director, tagline, keywords, overview,
# runtime, genres, production_companies, release_date,
# vote_count, vote_average, release_year, budget_adj, revenue_adj
#
# After evaluation, we will use:
# 'original_title', 'genres', 'keywords', 'tagline', 'cast', 'director', 'overview'
selected_features = ['original_title', 'genres', 'keywords', 'tagline', 'cast', 'director', 'overview']
# Take an explicit copy: the following cells assign columns on `data`, and
# assigning into a column selection of another DataFrame triggers pandas'
# SettingWithCopyWarning. The copy makes `data` an independent frame.
data = data[selected_features].copy()
data.head()

Unnamed: 0,original_title,genres,keywords,tagline,cast,director,overview
0,Jurassic World,Action|Adventure|Science Fiction|Thriller,monster|dna|tyrannosaurus rex|velociraptor|island,The park is open.,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...
1,Mad Max: Fury Road,Action|Adventure|Science Fiction|Thriller,future|chase|post-apocalyptic|dystopia|australia,What a Lovely Day.,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,An apocalyptic story set in the furthest reach...
2,Insurgent,Adventure|Science Fiction|Thriller,based on novel|revolution|dystopia|sequel|dyst...,One Choice Can Destroy You,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,Beatrice Prior must confront her inner demons ...
3,Star Wars: The Force Awakens,Action|Adventure|Science Fiction|Fantasy,android|spaceship|jedi|space opera|3d,Every generation has a story.,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Thirty years after defeating the Galactic Empi...
4,Furious 7,Action|Crime|Thriller,car race|speed|revenge|suspense|car,Vengeance Hits Home,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Deckard Shaw seeks revenge against Dominic Tor...


In [10]:
# replace missing values in the selected columns with empty strings
# A single DataFrame.fillna call (column -> fill value mapping) returns a new
# frame, so nothing is assigned column by column into what may be a slice of
# the original data, and re-running the cell gives the same result.
data = data.fillna({feature: '' for feature in selected_features})

In [11]:
# build one text field per movie by joining the descriptive columns with single
# spaces, starting from the genres (the title is not part of the combined text)
combined_text = data['genres']
for column_name in ['keywords', 'tagline', 'cast', 'director', 'overview']:
    combined_text = combined_text + ' ' + data[column_name]
data['combined_features'] = combined_text

In [12]:
# preview each title next to its combined feature text
preview_columns = ['original_title', 'combined_features']
data[preview_columns].head()

Unnamed: 0,original_title,combined_features
0,Jurassic World,Action|Adventure|Science Fiction|Thriller mons...
1,Mad Max: Fury Road,Action|Adventure|Science Fiction|Thriller futu...
2,Insurgent,Adventure|Science Fiction|Thriller based on no...
3,Star Wars: The Force Awakens,Action|Adventure|Science Fiction|Fantasy andro...
4,Furious 7,Action|Crime|Thriller car race|speed|revenge|s...


### Feature Extraction

In [13]:
# perform feature extraction

In [14]:
# turn the combined text of every movie into a numerical TF-IDF vector
vectorizer = TfidfVectorizer()
movie_texts = data['combined_features']
feature_vectors = vectorizer.fit_transform(movie_texts)

In [15]:
# print the TF-IDF feature vectors; the output lists one
# (row, column) -> weight entry per stored value
print(feature_vectors)

  (np.int32(0), np.int32(820))	0.05242389129226209
  (np.int32(0), np.int32(1014))	0.0612464834689844
  (np.int32(0), np.int32(37875))	0.06649923117571706
  (np.int32(0), np.int32(15559))	0.06659941635011578
  (np.int32(0), np.int32(42903))	0.04873600831207453
  (np.int32(0), np.int32(28745))	0.10996054276352246
  (np.int32(0), np.int32(12435))	0.1552290296890786
  (np.int32(0), np.int32(44181))	0.1632447364825671
  (np.int32(0), np.int32(36010))	0.14703851636957438
  (np.int32(0), np.int32(45161))	0.18814068177590537
  (np.int32(0), np.int32(21900))	0.10378646283040217
  (np.int32(0), np.int32(42724))	0.04700903946401648
  (np.int32(0), np.int32(31796))	0.35616420217525824
  (np.int32(0), np.int32(21860))	0.03533035180653477
  (np.int32(0), np.int32(30940))	0.12594725624220293
  (np.int32(0), np.int32(8185))	0.08706172972048314
  (np.int32(0), np.int32(33487))	0.15029453861851236
  (np.int32(0), np.int32(6308))	0.1552290296890786
  (np.int32(0), np.int32(10639))	0.14115143860557566
  

### Cosine Similarity

In [16]:
# cosine similarity of the feature vectors against themselves
# (no second matrix is passed, so every movie is compared with every movie)
similarity = cosine_similarity(X=feature_vectors)

In [17]:
# report the dimensions of the similarity matrix
print(f"Similarity matrix shape: {similarity.shape}")

Similarity matrix shape: (10866, 10866)


In [18]:
# input movie name: the title to find recommendations for. It is fuzzy-matched
# against the dataset titles further down, so it need not be spelled exactly.
movie_name = "Avatar"

In [19]:
# collect every movie title of the dataset in a plain Python list
title_column = data['original_title']
list_of_all_titles = list(title_column)

In [20]:
# print a preview of the movie list
# Printing the entire list would flood the notebook output with every title in
# the dataset, so only the count and the first few titles are shown.
print(f"{len(list_of_all_titles)} titles, e.g.: {list_of_all_titles[:10]}")



In [21]:
# finding the close match with input
# difflib returns the candidate titles ordered from most to least similar,
# or an empty list when nothing is similar enough.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
if not find_close_match:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down and throws away everything computed so far, whereas an exception
    # stops execution with a clear message and keeps the session usable.
    raise ValueError("No close matches found for the given movie.")
close_match = find_close_match[0]


In [29]:
# print matched list: the candidate titles returned by the close-match lookup
# (this cell previously stored a filtered DataFrame in `index_of_movie`, which
# printed nothing and was overwritten by the index lookup right after it)
print(find_close_match)

In [None]:
# locate the matched title in the dataset and keep the index label of its
# first occurrence
title_mask = data['original_title'] == close_match
index_of_movie = data.index[title_mask][0]
print(index_of_movie)

In [24]:
# finding index in dataset


In [25]:
# calculate similarity score and display similarity list


In [26]:
# sort this list to have the highest similarity score


In [27]:
# code to print similar movies


### Recommendation System

In [28]:
# take movie name


# finding the close match with input


# closest match - the searched movie most of the time


# index of closest match


# make a list of similarity scores and indices for that movie


# sort to get most similar movies at first


# print similar movies
