In [125]:
# Importing required libraries
import numpy as np
import pandas as pd

In [161]:
# Loading the datasets
credits=pd.read_csv('tmdb_5000_credits.csv')          # Contains crew and cast details
movies=pd.read_csv('tmdb_5000_movies.csv')            # Contains movie metadata

In [129]:
# Checking the first row of each dataset
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [7]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [131]:
# Checking the shape (rows, columns) before merging
movies.shape

(4803, 20)

In [11]:
credits.shape

(4803, 4)

## Making new dataframe after merging two datasets movies and credits

In [163]:
movies=movies.merge(credits,on='title')             # Merging both datasets on the common column 'title'

In [137]:
movies.shape                                        # Shape after merging

(4809, 23)

In [139]:
movies['original_language'].value_counts()          # Checking the number of movies per language

original_language
en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: count, dtype: int64

## Updating Dataframe movies

In [165]:
# Selecting only useful columns for the recommendation system
# These will help us capture the essence of a movie

movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [167]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [15]:
## Checking Missing Values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [169]:
movies.dropna(inplace=True)                              # Dropping rows with null values

In [171]:
movies.isnull().sum()                                    # Confirm no nulls remain

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [173]:
# Checking for duplicate records
movies.duplicated().sum()

0

In [175]:
# Inspecting the format of the genres column (it's in stringified list format)
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [93]:
## Updating Genres Column

In [177]:
# Importing ast to convert stringified lists to actual lists
import ast

# Function to extract only the names from genres/keywords fields
def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

In [179]:
# Applying the function to genres and keywords columns
movies['genres']=movies['genres'].apply(convert)
movies['keywords']=movies['keywords'].apply(convert)

In [181]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [89]:
## Updating Cast

In [183]:
# Function to extract the top 3 cast members
def convert3(obj):
    l=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter !=3:
            l.append(i['name'])
        else:
            break
    return l 

In [185]:
# Applying the function to cast column
movies['cast']=movies['cast'].apply(convert3)

In [187]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [189]:
# Function to extract director from crew
def fetch_director(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [191]:
# Applying the function to crew column
movies['crew']=movies['crew'].apply(fetch_director)

In [193]:
movies.head()                           # Displaying cleaned data

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [47]:
movies['overview'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [195]:
# Converting overview from string to list of words 
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [111]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]


In [197]:
# Removing spaces in multi-word names (like "Science Fiction" → "ScienceFiction")
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","")for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [101]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [199]:
# Creating a new 'tags' column by combining all useful text fields
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [105]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [201]:
# Creating a final dataframe with only the necessary columns
new_df=movies[['movie_id','title','tags']]

In [203]:
# Converting the list of tags into a single string
new_df['tags']= new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x:" ".join(x))


In [137]:
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [139]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez GiovanniRibisi JoelDavidMoore CCHPounder WesStudi LazAlonso DileepRao MattGerald SeanAnthonyMoran JasonWhyte ScottLawrence KellyKilgour JamesPatrickPitt SeanPatrickMurphy PeterDillon KevinDorman KelsonHenderson DavidVanHorn JacobTomuri MichaelBlain-Rozgay JonCurry LukeHawker WoodySchultz PeterMensah SoniaYee JahnelCurfman IlramChoi KylaWarren LisaRoumain DebraWilson ChrisMala TaylorKibby JodieLandau JulieLamm CullenB.Madden JosephBradyMadden FrankieTorres AustinWilson SaraWilson TamicaWashington-Miller LucyBriant NathanM

In [205]:
# Converting all text to lowercase for consistency
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [207]:
# Displaying the final dataframe
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [231]:
# Now We need to classifying simalarities between tags for we can easily recomend movies on the basis of similarities
# For this we need to perform vectorization on all tags
# for vectorization we need to remove sentence formation words like -[is , a , are , with etc ] we only use actual words

# Importing CountVectorizer to convert text into numerical vectors
from sklearn.feature_extraction.text import CountVectorizer

# Keeping only top 5000 frequent words and removing common English stopwords
cv=CountVectorizer(max_features=5000,stop_words='english')

In [233]:
# Transforming the 'tags' column into vector form
vectors=cv.fit_transform(new_df['tags']).toarray()

In [235]:
vectors[0]   # Now every movie converted into a vector form

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [215]:
# Checking the shape of the vectorized array (rows = movies, cols = unique words)
vectors.shape

(4806, 5000)

In [237]:
# Fetching all words (features) extracted
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zoo', 'zooeydeschanel', 'zoëkravitz'],
      dtype=object)

In [239]:
cv.get_feature_names_out().size                   # Should be 5000

5000

In [241]:
# OPTIONAL: Displaying all keywords (for analysis/debugging)
for i in cv.get_feature_names_out():
    print(i)

000
007
10
100
11
12
13
14
15
16
17
18
18th
19
1930
1940
1950
1960
1960s
1970
1970s
1980
1990
19th
19thcenturi
20
20th
24
25
30
3d
40
50
60
70
aaron
aaroneckhart
aarontaylor
aasifmandvi
abandon
abduct
abigailbreslin
abil
abl
abov
abus
academi
accept
access
accid
accident
accompani
accomplish
account
accus
ace
achiev
act
action
activ
activist
actor
actress
actual
adam
adambrodi
adamgoldberg
adamlefevr
adamsandl
adamscott
adamshankman
adapt
add
addict
adewaleakinnuoye
adjust
admir
admit
adolesc
adopt
ador
adrianmartinez
adrienbrodi
adult
adulteri
adulthood
advanc
adventur
adventure
advertis
advic
advis
affair
affect
afghanistan
africa
african
aftercreditssting
afterlif
aftermath
ag
agbaj
age
agediffer
agency
agenda
agent
aggress
ago
agre
ahead
aid
aidanquinn
ail
aim
air
airplan
airport
al
alanalda
alanarkin
alancum
alanrickman
alantudyk
alaska
albert
albertbrook
albertfinney
alcohol
alecbaldwin
alessandronivola
alex
alexandersiddig
alexapenavega
alexborstein
alexisbledel
alfredhitchcock


In [223]:
# =========================
#   STEMMING FOR CLEANING
# =========================

In [91]:
# We can see in above program there are many similar words So, we have to remove and replace these words to one common word
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [225]:
# Function to reduce words to their root form (e.g., "loved", "loving" → "love")
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [227]:
# Applying stemming to all tags
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [245]:
# =========================
#  RE-CALCULATE VECTORS
# =========================

In [243]:
# Re-vectorize after stemming to ensure better accuracy
vectors = cv.fit_transform(new_df['tags']).toarray()

In [247]:
# Calculating cosine similarity between all movie vectors
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)

In [249]:
# Shape = (number of movies, number of movies), showing similarity between each pair
similarity.shape

(4806, 4806)

In [109]:
similarity[0]

array([1.        , 0.10881351, 0.07927124, ..., 0.03464015, 0.01900543,
       0.01536191])

In [251]:
# Getting top 5 similar movies to the first movie
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(1214, 0.27028123880866767),
 (582, 0.23162743094465488),
 (2329, 0.22996655275195002),
 (3728, 0.22498852128662875),
 (507, 0.22438727760202976)]

In [253]:
# =========================
#  RECOMMENDATION FUNCTION
# =========================

In [255]:
def recommend(movie):
    movie_index=new_df[new_df['title']==movie].index[0]               # Get index of the movie
    distances=similarity[movie_index]                                 # Similarities with all other movies
    movie_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]                # Top 5 matches
    
    for i in movie_list:
        print(new_df.iloc[i[0]].title)                 # Print titles of recommended movies

In [259]:
# Example call
recommend('Spider-Man')

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
The Amazing Spider-Man


In [261]:
# Saving the final dataframe and similarity matrix for use in web app

In [261]:
import pickle
pickle.dump(new_df,open('movies.pkl','wb'))

In [263]:
pickle.dump(similarity,open('similarity_1.pkl','wb'))