## Imports

In [1]:
!pip install ydata-profiling



In [2]:
import numpy as np
import pandas as pd
import ast

import warnings
warnings.filterwarnings('ignore')

## Basic Info about data frame

In [3]:
def displayDetails(df, name):
    """Print a quick textual overview of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Heading printed before the summary.

    Prints, in order: the heading, a random sample of up to 10 rows, the
    column index, the ``df.info()`` report and the per-column count of
    missing values. Returns ``None``.
    """
    print(name)
    # Cap the sample size so frames with fewer than 10 rows do not raise.
    print(df.sample(min(10, len(df))), end='\n\n')

    print("Columns:")
    print(df.columns, end='\n\n')

    print("Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    df.info()
    print()

    print("Na values:")
    print(df.isna().sum(), end='\n\n')

In [4]:
# Kaggle input directory that holds both TMDB CSV files; change it here when
# running the notebook somewhere else.
DATA_DIR = '/kaggle/input/tmdb-movie-metadata'

df_movies = pd.read_csv(DATA_DIR + '/tmdb_5000_movies.csv')
df_credits = pd.read_csv(DATA_DIR + '/tmdb_5000_credits.csv')

In [5]:
# Summarise the raw movies table (sample, columns, info, missing values).
displayDetails(df=df_movies, name="Movies:")

Movies:
        budget                                             genres  \
1618  30000000  [{"id": 53, "name": "Thriller"}, {"id": 80, "n...   
639   65000000                     [{"id": 35, "name": "Comedy"}]   
2380   2700000  [{"id": 10749, "name": "Romance"}, {"id": 878,...   
3465         0  [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...   
4254         0           [{"id": 878, "name": "Science Fiction"}]   
782   60000000  [{"id": 28, "name": "Action"}, {"id": 35, "nam...   
894   52000000  [{"id": 28, "name": "Action"}, {"id": 35, "nam...   
2280  19000000  [{"id": 18, "name": "Drama"}, {"id": 80, "name...   
1552  26000000                     [{"id": 35, "name": "Comedy"}]   
2084  15000000  [{"id": 28, "name": "Action"}, {"id": 53, "nam...   

                                homepage      id  \
1618  http://www.sideeffectsmayvary.com/  109421   
639                                  NaN   12133   
2380                                 NaN   10017   
3465                    

In [6]:
# Summarise the raw credits table (sample, columns, info, missing values).
displayDetails(df=df_credits, name="Credits:")

Credits:
      movie_id                      title  \
595       8814                       Doom   
3661     19615                  Flying By   
2786      8952  I Love You Phillip Morris   
258      77931               The Smurfs 2   
4644    325123            Teeth and Blood   
2583    254904           The November Man   
2331     10934       Under the Tuscan Sun   
4251     46729              Fetching Cody   
2819     75674               Act of Valor   
878       3683       Flags of Our Fathers   

                                                   cast  \
595   [{"cast_id": 1, "character": "Sarge", "credit_...   
3661  [{"cast_id": 1, "character": "George", "credit...   
2786  [{"cast_id": 4, "character": "Steven Russell",...   
258   [{"cast_id": 5, "character": "Gargamel", "cred...   
4644  [{"cast_id": 0, "character": "Vincent Augustin...   
2583  [{"cast_id": 4, "character": "Peter H. Deverea...   
2331  [{"cast_id": 1, "character": "Frances", "credi...   
4251  [{"cast_id": 7, "

In [7]:
# Columns available in df_movies:
# ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
#  'original_title', 'overview', 'popularity', 'production_companies',
#  'production_countries', 'release_date', 'revenue', 'runtime',
#  'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
#  'vote_count']

# A notebook cell only displays its last expression, so evaluating three
# value_counts() calls in a row silently discards the first two results.
# Print each distribution explicitly instead.
for column in ('original_language', 'production_countries', 'spoken_languages'):
    print(column + ":")
    print(df_movies[column].value_counts(), end='\n\n')

spoken_languages
[{"iso_639_1": "en", "name": "English"}]                                                                                                                                                                                                                                                                                                                                           3171
[{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "Espa\u00f1ol"}]                                                                                                                                                                                                                                                                                               127
[{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "fr", "name": "Fran\u00e7ais"}]                                                                                                                                                 

## Feature Selection and Engineering

In [8]:
# English and the US are by far the most frequent language / country values,
# so those columns carry little signal and are left out; only the columns
# used further down are kept.
kept_columns = ['id', 'title', 'genres', 'keywords', 'overview', 'vote_average', 'vote_count']
df_movies = df_movies.loc[:, kept_columns]

df_movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",6.9,4500
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,6.3,4466
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,7.6,9106
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",6.1,2124


In [9]:
# Keep the join key plus cast/crew and rename the key to match df_movies.
# rename() is chained (no inplace=True) so a brand-new frame is produced
# rather than mutating a slice of df_credits, which is the pattern behind
# pandas' SettingWithCopyWarning.
credits_req = df_credits[['movie_id', 'cast', 'crew']].rename(columns={'movie_id': 'id'})
credits_req.head()

Unnamed: 0,id,cast,crew
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


#### joining the two data frames

In [10]:
# Inner join on the shared 'id' column: only movies present in both tables
# are kept.
movies = pd.merge(df_movies, credits_req, how='inner', on='id')

movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [11]:
# Drop movies that have no overview: the overview text is split into words
# further down, which would fail on missing (NaN) values.
# NOTE: dropna() keeps the original row labels, so the index has gaps after
# this step (labels no longer equal row positions).
movies = movies.dropna(subset=['overview'])

Convert the stringified lists of dictionaries into plain lists of names for easier access and joins.

In [12]:
def dict_to_list(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    `obj` is a string holding a Python-literal list of dictionaries (parsed
    with ast.literal_eval); the result is the list of each dictionary's
    'name' value, in the original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [13]:
# Replace the stringified dict lists with plain lists of names.
movies['genres'] = movies['genres'].map(dict_to_list)
movies['keywords'] = movies['keywords'].map(dict_to_list)

In [14]:
def dict_to_list_cast(obj, max_names=6):
    """Return the names of the leading cast entries of a stringified list.

    Parameters
    ----------
    obj : str
        String holding a Python-literal list of dictionaries, each with a
        'name' key (parsed with ast.literal_eval).
    max_names : int, optional
        Maximum number of names to keep, counted from the start of the list.
        Must be non-negative. The default of 6 reproduces the previous
        hard-coded behaviour (the old loop tested ``len(...) <= 5`` before
        appending, which admits six names).
        NOTE(review): if "top 5" was intended, pass max_names=5.

    Returns
    -------
    list
        The 'name' values of the first `max_names` entries, in order.
    """
    entries = ast.literal_eval(obj)
    # Slice first so entries beyond the limit are never inspected.
    return [entry['name'] for entry in entries[:max_names]]

In [15]:
# Keep only the leading cast names of every movie.
movies['cast'] = movies['cast'].map(dict_to_list_cast)

In [16]:
def dict_to_list_crew(obj):
    """Return the names of all directors in a stringified crew list.

    `obj` is a string holding a Python-literal list of dictionaries (parsed
    with ast.literal_eval). Every entry whose 'job' equals "Director"
    contributes its 'name'; all other entries are skipped.
    """
    crew_members = ast.literal_eval(obj)
    return [member['name'] for member in crew_members if member['job'] == "Director"]

In [17]:
# Reduce the crew column to the director name(s) of every movie.
movies['crew'] = movies['crew'].map(dict_to_list_crew)

In [18]:
# Turn each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].map(str.split)

In [19]:
# Preview the first rows now that genres, keywords, cast, crew and overview
# hold Python lists.
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...",6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...",6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...",7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...",6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


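Remove the spaces inside multi-word entries (e.g. join first name and surname) so that each person, genre or keyword is treated as a single token and is not confused with others that share a word.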

In [20]:
# Strip the spaces inside every entry of the list-valued columns so that a
# multi-word value ("Science Fiction", "Sam Worthington") becomes one token.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].map(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [21]:
# Preview the first rows after spaces were removed from the genre, keyword,
# cast and crew entries.
movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",7.6,9106,"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",6.1,2124,"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


## Final dataset creation and saving

In [22]:
# Concatenate the per-movie token lists, in this order:
# overview, genres, keywords, cast, crew.
tag_tokens = movies['overview']
for column in ('genres', 'keywords', 'cast', 'crew'):
    # `+` on object Series concatenates the two lists of each row.
    tag_tokens = tag_tokens + movies[column]
movies['tag'] = tag_tokens

movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count,cast,crew,tag
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",7.6,9106,"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",6.1,2124,"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [23]:
# Collapse each token list into one space-separated string.
movies['tag'] = movies['tag'].map(" ".join)

movies.head()

Unnamed: 0,id,title,genres,keywords,overview,vote_average,vote_count,cast,crew,tag
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",7.2,11800,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",6.9,4500,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",6.3,4466,"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",7.6,9106,"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],Following the death of District Attorney Harve...
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",6.1,2124,"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"John Carter is a war-weary, former military ca..."


In [24]:
# Final frame used for vectorisation, lookup and export.
# - reset_index(drop=True): rows were dropped earlier (missing overviews), so
#   the row labels have gaps. Later cells look rows up in the similarity
#   matrix by label, which is only correct when labels equal row positions.
# - assign(...) builds the lower-cased 'tag' on a fresh frame instead of
#   writing into a slice of `movies` (the SettingWithCopyWarning pattern).
df = (
    movies[['id', 'title', 'tag', 'vote_average', 'vote_count']]
    .reset_index(drop=True)
    .assign(tag=lambda frame: frame['tag'].str.lower())
)

df.head()

Unnamed: 0,id,title,tag,vote_average,vote_count
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",6.9,4500
2,206647,Spectre,a cryptic message from bond’s past sends him o...,6.3,4466
3,49026,The Dark Knight Rises,following the death of district attorney harve...,7.6,9106
4,49529,John Carter,"john carter is a war-weary, former military ca...",6.1,2124


saving the dataset

In [25]:
# Write the final frame to movies.csv; index=False leaves the row labels out
# of the file.
df.to_csv('movies.csv', index=False)

## Vectorization and Recommendation

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
# Vectorise the tag strings with TF-IDF, ignoring scikit-learn's built-in
# English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# One row per entry of df['tag'], in the same order; one column per
# vocabulary term learned during fitting.
tfidf_matrix = tfidf.fit_transform(df['tag'])
print(tfidf_matrix.shape)

(4800, 41117)


In [28]:
# Pairwise cosine similarity between all rows of tfidf_matrix: entry [i, j]
# compares the i-th and j-th rows, i.e. it is addressed by row POSITION.
cosine_sim = cosine_similarity(tfidf_matrix)

In [29]:
def findMovie(movie_title):
    """Look up every row of the global `df` whose title equals `movie_title`.

    Returns the matching rows as a DataFrame (several rows when different
    movies share a title), or the string "Movie not found!" when no title
    matches exactly.
    """
    title_matches = df['title'] == movie_title

    # Guard clause: nothing matched the exact title.
    if not title_matches.any():
        return "Movie not found!"

    return df[title_matches]

In [30]:
def recommend(movie_title, cosine_sim, n_recommendations=5):
    """Recommend the movies most similar to `movie_title`.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in the global `df`. When several movies share
        the title, the first one (in row order) is used.
    cosine_sim : 2-D array-like
        Square similarity matrix whose i-th row/column corresponds to the
        i-th ROW POSITION of `df`.
    n_recommendations : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list of [id, title] pairs ordered from most to least similar, or the
    string "Movie not found!" when no title matches.
    """
    # Work with row positions, not index labels: `cosine_sim` is positional,
    # while the labels of `df` may have gaps (rows are dropped upstream), so
    # mixing the two selects the wrong rows or raises.
    matches = np.flatnonzero((df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        return "Movie not found!"

    idx = int(matches[0])
    scores = np.asarray(cosine_sim[idx])

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded explicitly instead of assuming it sorts to the front, which is
    # not guaranteed when another row ties with it.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    movie_positions = ranked[:n_recommendations]

    # Positional lookup to stay consistent with the similarity matrix.
    return df[['id', 'title']].iloc[movie_positions].values.tolist()

In [31]:
# Show every row whose title is exactly "Batman".
findMovie(movie_title="Batman")

Unnamed: 0,id,title,tag,vote_average,vote_count
1359,268,Batman,the dark knight of gotham city begins his war ...,7.0,2096
4267,2661,Batman,the dynamic duo faces four super-villains who ...,6.1,203


In [32]:
# Ten most similar movies to the first "Batman" entry.
recommend(movie_title="Batman", cosine_sim=cosine_sim, n_recommendations=10)

[[415, 'Batman & Robin'],
 [364, 'Batman Returns'],
 [49026, 'The Dark Knight Rises'],
 [155, 'The Dark Knight'],
 [272, 'Batman Begins'],
 [414, 'Batman Forever'],
 [209112, 'Batman v Superman: Dawn of Justice'],
 [39303, '2:13'],
 [1924, 'Superman'],
 [44912, 'Green Lantern']]

## dumping the required things as pkl

In [33]:
import pickle

In [34]:
# Serialise the final movie frame. The context manager closes (and flushes)
# the file handle even if dump() raises; the bare open() call never closed it.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(df, movies_file)

In [35]:
# Serialise the similarity matrix. The context manager closes (and flushes)
# the file handle even if dump() raises; the bare open() call never closed it.
with open('cosine_sim.pkl', 'wb') as cosine_sim_file:
    pickle.dump(cosine_sim, cosine_sim_file)