In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import warnings

plt.style.use("fivethirtyeight")
warnings.filterwarnings("ignore")

In [87]:
movies_md = pd.read_csv("data/movies_metadata.csv")
movies_keywords = pd.read_csv("data/keywords.csv")
movies_credits = pd.read_csv("data/credits.csv")

In [55]:
movies_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [56]:
movies_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [57]:
movies_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [88]:
movies_md = movies_md[movies_md['vote_count'] >= 1500]

In [89]:
movies_md = movies_md[['id', 'original_title', 'overview', 'genres']]

In [90]:
movies_md.head()

Unnamed: 0,id,original_title,overview,genres
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
31,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'..."
46,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na..."


In [91]:
movies_md['title'] = movies_md['original_title']
movies_md = movies_md[movies_md['id'].str.isnumeric()]
movies_md['id'] = movies_md['id'].astype(int)
movies_df = pd.merge(movies_md, movies_keywords, on='id', how='left')

In [92]:
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ..."
4,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy..."


In [93]:
movies_df.reset_index(inplace=True, drop=True)
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ..."
4,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy..."


In [94]:

movies_df['genre'] = movies_df['genres'].apply(lambda x: [i['name'] for i in eval(x)])
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,genre
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[Animation, Comedy, Family]"
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[Adventure, Fantasy, Family]"
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[Action, Crime, Drama, Thriller]"
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ...","[Science Fiction, Thriller, Mystery]"
4,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy...","[Crime, Mystery, Thriller]"


In [95]:
movies_df['genre'] = movies_df['genre'].apply(lambda x: [i.replace(" ", "") for i in x])

In [96]:
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,genre
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[Animation, Comedy, Family]"
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[Adventure, Fantasy, Family]"
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[Action, Crime, Drama, Thriller]"
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ...","[ScienceFiction, Thriller, Mystery]"
4,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy...","[Crime, Mystery, Thriller]"


In [97]:
movies_df.isnull().sum()

id                0
original_title    0
overview          0
genres            0
title             0
keywords          0
genre             0
dtype: int64

In [98]:
movies_df['keywords'].fillna('[]', inplace=True)
movies_df['genre'] = movies_df['genre'].apply(lambda x: ' '.join(x))

In [99]:
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,genre
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",Animation Comedy Family
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",Adventure Fantasy Family
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",Action Crime Drama Thriller
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ...",ScienceFiction Thriller Mystery
4,807,Se7en,Two homicide detectives are on a desperate hun...,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy...",Crime Mystery Thriller


In [100]:
movies_df.drop('genres', axis=1, inplace=True)
movies_df.head()

Unnamed: 0,id,original_title,overview,title,keywords,genre
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",Animation Comedy Family
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",Adventure Fantasy Family
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",Action Crime Drama Thriller
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ...",ScienceFiction Thriller Mystery
4,807,Se7en,Two homicide detectives are on a desperate hun...,Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy...",Crime Mystery Thriller


In [101]:
movies_df = pd.merge(movies_df, movies_credits, on='id', how='left')
movies_df.reset_index(inplace=True, drop=True)
movies_df

Unnamed: 0,id,original_title,overview,title,keywords,genre,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",Animation Comedy Family,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",Adventure Fantasy Family,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",Action Crime Drama Thriller,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",Twelve Monkeys,"[{'id': 222, 'name': 'schizophrenia'}, {'id': ...",ScienceFiction Thriller Mystery,"[{'cast_id': 41, 'character': 'James Cole', 'c...","[{'credit_id': '52fe4212c3a36847f8001ac7', 'de..."
4,807,Se7en,Two homicide detectives are on a desperate hun...,Se7en,"[{'id': 476, 'name': 'self-fulfilling prophecy...",Crime Mystery Thriller,"[{'cast_id': 17, 'character': 'Detective David...","[{'credit_id': '52fe4279c3a36847f802176f', 'de..."
...,...,...,...,...,...,...,...,...
740,337339,The Fate of the Furious,When a mysterious woman seduces Dom into the w...,The Fate of the Furious,"[{'id': 339, 'name': 'submarine'}, {'id': 9663...",Action Crime Thriller,"[{'cast_id': 0, 'character': 'Dominic Toretto'...","[{'credit_id': '553ea93f92514138a900b0a5', 'de..."
741,339403,Baby Driver,After being coerced into working for a crime b...,Baby Driver,"[{'id': 642, 'name': 'robbery'}, {'id': 2076, ...",Action Crime,"[{'cast_id': 9, 'character': 'Baby', 'credit_i...","[{'credit_id': '55c58c009251417a21000e5d', 'de..."
742,324852,Despicable Me 3,Gru and his wife Lucy must stop former '80s ch...,Despicable Me 3,"[{'id': 190999, 'name': 'minions'}]",Action Animation Adventure Family Comedy,"[{'cast_id': 0, 'character': 'Gru / Dru (voice...","[{'credit_id': '57e71794c3a368222700add6', 'de..."
743,281338,War for the Planet of the Apes,Caesar and his apes are forced into a deadly c...,War for the Planet of the Apes,"[{'id': 818, 'name': 'based on novel'}, {'id':...",Drama ScienceFiction War,"[{'cast_id': 6, 'character': 'Caesar', 'credit...","[{'credit_id': '591f036fc3a368774e036079', 'de..."


In [102]:
movies_df['cast'] = movies_df['cast'].apply(lambda x: [i['name'] for i in eval(x)])
movies_df['cast'] = movies_df['cast'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))
movies_df.head()

Unnamed: 0,id,original_title,overview,title,keywords,genre,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Toy Story,jealousy toy boy friendship friends rivalry bo...,Animation Comedy Family,TomHanks TimAllen DonRickles JimVarney Wallace...,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Jumanji,boardgame disappearance basedonchildren'sbook ...,Adventure Fantasy Family,RobinWilliams JonathanHyde KirstenDunst Bradle...,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Heat,robbery detective bank obsession chase shootin...,Action Crime Drama Thriller,AlPacino RobertDeNiro ValKilmer JonVoight TomS...,"[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",Twelve Monkeys,schizophrenia philadelphia cassandrasyndrom st...,ScienceFiction Thriller Mystery,BruceWillis MadeleineStowe BradPitt Christophe...,"[{'credit_id': '52fe4212c3a36847f8001ac7', 'de..."
4,807,Se7en,Two homicide detectives are on a desperate hun...,Se7en,self-fulfillingprophecy detective s.w.a.t. dru...,Crime Mystery Thriller,BradPitt MorganFreeman GwynethPaltrow JohnC.Mc...,"[{'credit_id': '52fe4279c3a36847f802176f', 'de..."


In [103]:
movies_df['tags'] = movies_df['overview']+' '+movies_df['keywords']+' '
+movies_df['cast']+' '+movies_df['genre']+' '+movies_df['original_title']

0      TomHanks TimAllen DonRickles JimVarney Wallace...
1      RobinWilliams JonathanHyde KirstenDunst Bradle...
2      AlPacino RobertDeNiro ValKilmer JonVoight TomS...
3      BruceWillis MadeleineStowe BradPitt Christophe...
4      BradPitt MorganFreeman GwynethPaltrow JohnC.Mc...
                             ...                        
740    VinDiesel JasonStatham DwayneJohnson MichelleR...
741    AnselElgort LilyJames KevinSpacey JamieFoxx Jo...
742    SteveCarell KristenWiig TreyParker MirandaCosg...
743    AndySerkis WoodyHarrelson SteveZahn KarinKonov...
744    FionnWhitehead TomGlynn-Carney JackLowden Harr...
Length: 745, dtype: object

In [104]:
movies_df.drop(['genre', 'original_title', 'keywords', 'cast', 'overview', 'crew'], axis=1, inplace=True)

In [105]:
movies_df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant..."
4,807,Se7en,Two homicide detectives are on a desperate hun...


In [106]:
movies_df.isnull().sum()
movies_df.drop(movies_df[movies_df['tags'].isnull()].index, inplace=True)

In [107]:
movies_df.shape

(745, 3)

In [108]:
movies_df.drop_duplicates(inplace=True)
movies_df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
3,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant..."
4,807,Se7en,Two homicide detectives are on a desperate hun...


In [109]:
movies_df.shape

(740, 3)

In [110]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies_df['tags'] = movies_df['tags'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_df['tags'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(740, 10001)

In [None]:
vectorized_data = tfidf.fit_transform(movies_df['tags'].values)

In [30]:
# we can dicrease 
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3000)

reduced_data = svd.fit_transform(vectorized_dataframe)
reduced_data.shape

(8595, 3000)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(reduced_data)

In [34]:
similarity.shape

(8595, 8595)

In [35]:
def recomendation_system(movie):
    # find id of movie by its title
    id_of_movie = movies_df[movies_df['title']==movie].index[0]
    # Get the similarity scores of all movies with that movie
    distances = similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:15]
    for movie_id in movie_list:
        print(movies_df.iloc[movie_id[0]].title)

In [47]:
recomendation_system('The Dark Knight Rises')

Flipper
The Goods: Live Hard, Sell Hard
Dolphin Tale 2
Smokey and the Bandit II
Escape from Planet Earth
The Cove
Nine to Five
1408
Alien Abduction
Ace Ventura: Pet Detective
Sala samobójców
Payback
Little Boy
In a Heartbeat


In [111]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [112]:
def get_recommendations(movie, cosine_similarity=cosine_similarity):
    # Get the index of the movie that matches the title
    idx = movies_df[movies_df['title']==movie].index[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_similarity[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [113]:
get_recommendations('The Matrix')

206                The Matrix Revolutions
193                   The Matrix Reloaded
734                    Ghost in the Shell
68                         The Terminator
293                 Live Free or Die Hard
475                           Cloud Atlas
45                  2001: A Space Odyssey
567                         Transcendence
560                          Interstellar
200    Terminator 3: Rise of the Machines
Name: title, dtype: object

In [114]:
get_recommendations('Memento')

212                           50 First Dates
213    Eternal Sunshine of the Spotless Mind
691                             Finding Dory
175                      The Bourne Identity
701                             Jason Bourne
594                          The Maze Runner
28                              Blade Runner
211                     The Butterfly Effect
245                                 Sin City
437                        The Expendables 2
Name: title, dtype: object