In [37]:
# First we are reading the two datasets in python
import pandas as pd
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [38]:
# Understanding the columns of the "movies" dataset
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


1) budget : The total amount of money spent to make and distribute the movie
2) genres : The type of the movie
3) homepage : The website of the movie
4) id : The unique tmdb_id of the movie in the "tmdb" database
5) keywords : The important words to describe the movie (Like for inception, the keywords can be like : dream, action, subconscious, idea etc.)
6) original_language : The original language of the movie (as movie is telecasted in different speaking_languages in different countries)
7) original_title : The original title of the movie (as movie is telecasted in different title-languages in different countries)
8) overview : The short summary of the movie
9) popularity : The popularity number of the movie
10) production_companies : The companies involved in producing the movie
11) production_countries : The countries where the movie is shooted
12) release_date : The date when the movie was released
13) revenue : The income of the movie
14) runtime : How long the duration of the movie is
15) spoken_languages : Which languages were told in the movie
16) status : The movie is either released or not
17) tagline : The texts written in the banner of the movie to specify it besides the title
18) title : The title of the movie
19) vote_average : The average vote received in the tmdb site for the movie
20) vote_count : The total no. of people voted in the tmdb site for the movie

In [39]:
# Understanding the columns of the "credits" dataset
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


1) movie_id : The unique tmdb_id of the movie in the "tmdb" database (same as "id" of the "movies" dataset above)
2) title : The title of the movie (same as "title" of the "movies" dataset above)
3) cast : The details of the persons who were in the movie (whom we can see in the movie : actor, actress, villain etc.)
4) crew : The details of the persons who were behind the movie (who helped in creating the movie like : producer, writer, editor, sound designer etc.)

In [40]:
# Merging the two dataframes together into a single dataframe
movies_merged=pd.merge(movies,credits,on='title')

In [42]:
# Checking the no. of rows and columns in the "movies_merged" dataframe
movies_merged.shape

(4809, 23)

In [43]:
# Visualizing the column names of the "movies_merged" dataframe
movies_merged.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [44]:
# Why we are dropping "original_language" column from the "movies_merged" dataframe
movies_merged['original_language'].value_counts()

original_language
en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
id       2
cs       2
ta       2
ro       2
ar       2
te       1
hu       1
xx       1
af       1
is       1
tr       1
vi       1
pl       1
nb       1
ky       1
no       1
sl       1
ps       1
el       1
Name: count, dtype: int64

Nearabout 94% movies in the "movies_new" dataframe are in english language, so we don't require this column as it is biased to only one element.

In [27]:
# Why we are dropping "original_title" column from the "movies_merged" dataframe

It can be in any languages according to the region the movie is telecasted, so it's better to keep only the "title" removing the "original_title" of the movie

In [28]:
# Important columns to keep from this "movies_merged" dataframe
# 1) movie_id (Used to fetch the posters of the movies in the last stage while creating the website)
# 2) title
# 3) overview (Two different movies with almost same overview or summary can be treated as similar ones)
# 4) genres
# 5) keywords
# 6) cast (We prefer to see movies of same actor sometimes)
# 7) crew (We prefer to see movies of same director sometimes)

In [45]:
# Extracting only required variables from our "movies_merged" dataframe to "movies_new" dataframe
movies_new=movies_merged[['movie_id','title','overview','genres','keywords','cast','crew']]
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [46]:
# Checking if there are any null values in the "movies_new" dataframe
movies_new.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [47]:
# Dropping the rows containing the null values from the "movies_new" dataframe
movies_new.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new.dropna(inplace=True)


In [49]:
# Checking if there are any null values in the "movies_new" dataframe now
movies_new.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [50]:
# Checking if there are any duplicated values in the "movies_new" dataframe
print(movies_new.duplicated().sum())

0


In [51]:
## Simplifying the "genres" column of the "movies_new" dataframe
# Visualizing the first record of "genres" column to see how it's
movies_new.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [52]:
# Defining a function "convert" which will convert the above string into a list of genres like ['Action','Adventure','Fantasy','Science Fiction']
def convert(obj):
    L=[]
    for i in obj:
        L.append(i['name'])
    return L

In [53]:
# Trying to apply the convert function on the first record of "genres" column
convert(movies_new.iloc[0]['genres'])

TypeError: string indices must be integers

We have to convert the string '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]' to a list for the "convert" function to run

In [54]:
# Converting the above string into a list
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [55]:
# Redifining the "convert" function again
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [56]:
# Applying the convert function on the first record of "genres" column
convert(movies_new.iloc[0]['genres'])

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [57]:
# Applying the convert function on the whole "genres" column
movies_new['genres'].apply(convert)

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [58]:
# Replacing the above convert function applied whole 'genres' column into the original 'genres' column
movies_new['genres']=movies_new['genres'].apply(convert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['genres']=movies_new['genres'].apply(convert)


In [59]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [60]:
# Simplifying the "keywords" column of the "movies_new" dataframe in a similar way like before
# Visualizing the first record of "keywords" column to see how it's
movies_new.iloc[0]['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [61]:
# Applying the convert function on the first record of "keywords" column
convert(movies_new.iloc[0]['keywords'])

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [62]:
# Applying the convert function on the whole "keywords" column
movies_new['keywords'].apply(convert)

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [63]:
# Replacing the above convert function applied whole 'keywords' column into the original 'keywords' column
movies_new['keywords']=movies_new['keywords'].apply(convert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['keywords']=movies_new['keywords'].apply(convert)


In [64]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [65]:
# Simplifying the "cast" column of the "movies_new" dataframe in a similar way like before
# Visualizing the first record of "cast" column to see how it's
movies_new.iloc[0]['cast']

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [66]:
# Defining a function "convert1" which will convert the above string into a list of casts like ['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver'] (Taking only first three casts)
def convert1(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [67]:
# Applying the convert1 function on the first record of "cast" column
convert1(movies_new.iloc[0]['cast'])

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [68]:
# Applying the convert1 function on the whole "cast" column
movies_new['cast'].apply(convert1)

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [69]:
# Replacing the above convert1 function applied whole 'cast' column into the original 'cast' column
movies_new['cast']=movies_new['cast'].apply(convert1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['cast']=movies_new['cast'].apply(convert1)


In [70]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [71]:
# Simplifying the "crew" column of the "movies_new" dataframe in a similar way like before
# Visualizing the first record of "crew" column to see how it's
movies_new.iloc[0]['crew']

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [72]:
# Defining a function "fetch_director" which will fetch the director name from the above string
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [73]:
# Applying the fetch_director function on the first record of "crew" column
fetch_director(movies_new.iloc[0]['crew'])

['James Cameron']

In [74]:
# Applying the fetch_director function on the whole "crew" column
movies_new['crew'].apply(fetch_director)

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
4804     [Robert Rodriguez]
4805         [Edward Burns]
4806          [Scott Smith]
4807          [Daniel Hsia]
4808     [Brian Herzlinger]
Name: crew, Length: 4806, dtype: object

In [75]:
# Replacing the above fetch_director function applied whole 'crew' column into the original 'crew' column
movies_new['crew']=movies_new['crew'].apply(fetch_director)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['crew']=movies_new['crew'].apply(fetch_director)


In [76]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [77]:
# Simplifying the "overview" column of the "movies_new" dataframe in a similar way like before
# Visualizing the first record of "overview" column to see how it's
movies_new.iloc[0]['overview']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [78]:
# Changing each of the strings into list of sub-strings of the whole "overview" column
movies_new['overview'].apply(lambda x:x.split())

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

In [79]:
# Replacing the above lambda function applied whole 'overview' column into the original 'overview' column
movies_new['overview']=movies_new['overview'].apply(lambda x:x.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['overview']=movies_new['overview'].apply(lambda x:x.split())


In [80]:
# Visualizing the first 3 records of the "movies_new" dataframe now
movies_new.head(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


Now, we don't want spaces in between the words under any of the columns "overview","genres","keywords","cast" and "crew" as it can confuse our model (The first record under "cast" column contains "Sam Worthington" and while model training, model will treat "Sam" & "Worthington" differently. Again the third record under "crew" column contains "Sam Mendes" and while model training, model will treat "Sam" & "Mendes" differently. Now, model will get confused what output movie to recommend when it's getting the keyword "Sam" from any input movie.)

In [81]:
# Removing the spaces from the "genres" column
movies_new['genres'].apply(lambda x:[i.replace(" ","") for i in x])

0       [Action, Adventure, Fantasy, ScienceFiction]
1                       [Adventure, Fantasy, Action]
2                         [Action, Adventure, Crime]
3                   [Action, Crime, Drama, Thriller]
4                [Action, Adventure, ScienceFiction]
                            ...                     
4804                       [Action, Crime, Thriller]
4805                               [Comedy, Romance]
4806               [Comedy, Drama, Romance, TVMovie]
4807                                              []
4808                                   [Documentary]
Name: genres, Length: 4806, dtype: object

In [82]:
# Removing the spaces from each of the columns from "overview" to end
movies_new['genres']=movies_new['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_new['keywords']=movies_new['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_new['cast']=movies_new['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_new['crew']=movies_new['crew'].apply(lambda x:[i.replace(" ","") for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['genres']=movies_new['genres'].apply(lambda x:[i.replace(" ","") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['keywords']=movies_new['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['cast']=movi

In [83]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [84]:
# Creating a tags column by joining all the columns from overview to end
movies_new['tags']=movies_new['overview']+movies_new['genres']+movies_new['keywords']+movies_new['cast']+movies_new['crew']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_new['tags']=movies_new['overview']+movies_new['genres']+movies_new['keywords']+movies_new['cast']+movies_new['crew']


In [85]:
# Visualizing the first record of the "movies_new" dataframe now
movies_new.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [86]:
# Removing the "overview", "genres", "keywords", "cast" and "crew" columns from the "movies_new" dataframe
final_df=movies_new[['movie_id','title','tags']]
final_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [87]:
# Python string join (concept to use next)
numlist = ['1', '3', '5', '7', '9']
separator = ' -> '

print('List before Join():', numlist)
print('List after Join():' , separator.join(numlist))


List before Join(): ['1', '3', '5', '7', '9']
List after Join(): 1 -> 3 -> 5 -> 7 -> 9


In [88]:
# Bringing the elements under the tags column in it's original string format from the list format
final_df['tags'].apply(lambda x:" ".join(x))

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [89]:
# Replacing the above lambda function applied whole 'tags' column into the original 'tags column
final_df['tags']=final_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags']=final_df['tags'].apply(lambda x:" ".join(x))


In [90]:
# Visualizing the first record of the "final_df" dataframe now
final_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [91]:
# Bringing the elements under the tags column in lower case everywhere
final_df['tags'].apply(lambda x:x.lower())

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4804    el mariachi just wants to play his guitar and ...
4805    a newlywed couple's honeymoon is upended by th...
4806    "signed, sealed, delivered" introduces a dedic...
4807    when ambitious new york attorney sam is sent t...
4808    ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [92]:
# Replacing the above lambda function applied whole 'tags' column into the original 'tags column
final_df['tags']=final_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags']=final_df['tags'].apply(lambda x:x.lower())


In [93]:
# Visualizing the first record of the "final_df" dataframe now
final_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [94]:
# Creating the system of generating a matrix of mostly used top 5000 words (eliminating the stop words : words that are not the keywords, rather used to make the sentences like - for, the, at, in, after etc.) in the columns (taken from the whole "tags" column) and 5000 movies in the rows
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [96]:
# Appling the system on the tags column to generate the matrix of mostly used top 5000 words (eliminating the stop words : words that are not the keywords rather used to make the sentences like - for, the, at, in, after etc.) in the columns and 5000 movies in the rows
vectors=cv.fit_transform(final_df['tags'])
vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 133891 stored elements and shape (4806, 5000)>

In [99]:
# Showing the created "vectors" matrix above
vectors=cv.fit_transform(final_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4806, 5000))

In [104]:
# Showing the top 5000 mostly used words in the whole "tags" column
list(cv.get_feature_names_out())

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '18thcentury',
 '19',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1980',
 '1980s',
 '1985',
 '1990s',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2009',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 '70s',
 'aaron',
 'aaroneckhart',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accomplish',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'acting',
 'action',
 'actionhero',
 'actions',
 'activist',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adams',
 'adamsandler',
 'adamshankman',
 'adaptation',
 'adapted',
 'addict',
 'addicted',
 'addiction',
 'adolescence',

In [106]:
## Converting similar types of words (in the 5000 words) into the parent word (Like converting the words "accept", "accepted", "accepts" into the parent word "accept")
# Importing nltk in python ("NLTK" stands for "Natural Language Toolkit")
import nltk

In [108]:
# Stemming is a method in text processing that eliminates prefixes and suffixes from words, transforming them into their fundamental or root form
# Importing PorterStemmer in python ("Porter’s Stemmer" is one of the most popular stemming methods)
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [110]:
# Use of "split" function in splitting a whole line (in the form of string) into the collection of words (in the form of list)
txt = "welcome to the jungle"

x = txt.split()

print(x)

['welcome', 'to', 'the', 'jungle']


In [112]:
# Stemming a text
ps.stem('loving')

'love'

In [114]:
# Defining the "stem" function to apply on any line (in the form of string)
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [117]:
# Applying the stem function on a line (in the form of string)
stem("I am addicted to her")

'i am addict to her'

In [118]:
# Applying the stem function on the whole "tags" column
final_df['tags'].apply(stem)

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

In [119]:
# Replacing the above stem function applied whole 'tags' column into the original 'tags column
final_df['tags']=final_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags']=final_df['tags'].apply(stem)


In [120]:
# Visualizing the first record of the "final_df" dataframe now
final_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."


In [123]:
# Calculating cosine similarity between each and every vectors of the "vectors" matrix
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)
similarity

array([[1.        , 0.08740748, 0.05827165, ..., 0.02418254, 0.02564946,
        0.        ],
       [0.08740748, 1.        , 0.06451613, ..., 0.02677398, 0.        ,
        0.        ],
       [0.05827165, 0.06451613, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.02418254, 0.02677398, 0.02677398, ..., 1.        , 0.07071068,
        0.04836508],
       [0.02564946, 0.        , 0.        , ..., 0.07071068, 1.        ,
        0.05129892],
       [0.        , 0.        , 0.        , ..., 0.04836508, 0.05129892,
        1.        ]], shape=(4806, 4806))

In [125]:
# Checking how many no. of rows and columns are there in the "vectors" matrix above
similarity.shape

(4806, 4806)

In [167]:
# Extracting an index of a movie from the "final_df" dataframe (Applied within the "recommend" function)
movie_ind=final_df[final_df['title']=='Inception'].index[0]
movie_ind

np.int64(96)

In [176]:
# Finding the cosine similarity vector for the above indexed movie (Applied within the "recommend" function)
dist=similarity[movie_ind]
dist

array([0.0860309 , 0.0952501 , 0.06350006, ..., 0.02635231, 0.02795085,
       0.05735393], shape=(4806,))

In [189]:
# Trying to sort the above vector-elements for the above index from highest to lowest
sorted(dist,reverse=True)

[np.float64(0.9999999999999999),
 np.float64(0.22097086912079608),
 np.float64(0.2041241452319315),
 np.float64(0.20073876713674155),
 np.float64(0.1976423537605237),
 np.float64(0.19611613513818402),
 np.float64(0.1862113249355328),
 np.float64(0.18257418583505533),
 np.float64(0.18190171877724967),
 np.float64(0.18042195912175807),
 np.float64(0.17928429140015903),
 np.float64(0.17677669529663687),
 np.float64(0.17677669529663687),
 np.float64(0.17677669529663687),
 np.float64(0.17677669529663687),
 np.float64(0.17677669529663687),
 np.float64(0.175),
 np.float64(0.17334381132038412),
 np.float64(0.17334381132038412),
 np.float64(0.17160161824591103),
 np.float64(0.17149858514250882),
 np.float64(0.16997502591252123),
 np.float64(0.1670382761952652),
 np.float64(0.1670382761952652),
 np.float64(0.16666666666666666),
 np.float64(0.16413304107465318),
 np.float64(0.16413304107465318),
 np.float64(0.16413304107465318),
 np.float64(0.16222142113076252),
 np.float64(0.16222142113076252),


But here the elements are shifted without the movie_ids, which is not meaningful

In [179]:
# Trying to sort the above vector-elements for the above index from highest to lowest keeping the "movie name-similarity score" combo
sorted(enumerate(dist),reverse=True)

[(4805, np.float64(0.057353933467640436)),
 (4804, np.float64(0.02795084971874737)),
 (4803, np.float64(0.026352313834736494)),
 (4802, np.float64(0.0)),
 (4801, np.float64(0.052128603514268686)),
 (4800, np.float64(0.021128856368212913)),
 (4799, np.float64(0.06154574548966637)),
 (4798, np.float64(0.022271770159368695)),
 (4797, np.float64(0.041666666666666664)),
 (4796, np.float64(0.0)),
 (4795, np.float64(0.05521576303742327)),
 (4794, np.float64(0.04564354645876383)),
 (4793, np.float64(0.0)),
 (4792, np.float64(0.0)),
 (4791, np.float64(0.032826608214930636)),
 (4790, np.float64(0.06681531047810609)),
 (4789, np.float64(0.024056261216234404)),
 (4788, np.float64(0.04055535528269063)),
 (4787, np.float64(0.049029033784546004)),
 (4786, np.float64(0.04490132550669372)),
 (4785, np.float64(0.02795084971874737)),
 (4784, np.float64(0.0472455591261534)),
 (4783, np.float64(0.08838834764831843)),
 (4782, np.float64(0.05661385170722978)),
 (4781, np.float64(0.13363062095621217)),
 (4780

Here the data is being sorted w.r.t. the movie_id column, but we want the sorting w.r.t. the cosine score column

In [186]:
# Sorting the above vector-elements for the above index from highest to lowest keeping the "movie name-similarity score" combo
sorted(enumerate(dist),reverse=True,key=lambda x:x[1])

[(96, np.float64(0.9999999999999999)),
 (1268, np.float64(0.22097086912079608)),
 (4405, np.float64(0.2041241452319315)),
 (2816, np.float64(0.20073876713674155)),
 (1717, np.float64(0.1976423537605237)),
 (3976, np.float64(0.19611613513818402)),
 (1570, np.float64(0.1862113249355328)),
 (1433, np.float64(0.18257418583505533)),
 (2899, np.float64(0.18190171877724967)),
 (35, np.float64(0.18042195912175807)),
 (1788, np.float64(0.17928429140015903)),
 (134, np.float64(0.17677669529663687)),
 (370, np.float64(0.17677669529663687)),
 (920, np.float64(0.17677669529663687)),
 (3966, np.float64(0.17677669529663687)),
 (4008, np.float64(0.17677669529663687)),
 (2158, np.float64(0.175)),
 (122, np.float64(0.17334381132038412)),
 (711, np.float64(0.17334381132038412)),
 (2209, np.float64(0.17160161824591103)),
 (1321, np.float64(0.17149858514250882)),
 (2400, np.float64(0.16997502591252123)),
 (922, np.float64(0.1670382761952652)),
 (1427, np.float64(0.1670382761952652)),
 (1472, np.float64(0.1

In [190]:
# Extracting the top 5 movies from above with the highest similarity score
sorted(enumerate(dist),reverse=True,key=lambda x:x[1])[1:6]

[(1268, np.float64(0.22097086912079608)),
 (4405, np.float64(0.2041241452319315)),
 (2816, np.float64(0.20073876713674155)),
 (1717, np.float64(0.1976423537605237)),
 (3976, np.float64(0.19611613513818402))]

In [215]:
# Defining the main "recommend" function, which will recommend the similar output movie indexes for a single input movie
def recommend(movie):
    movie_index=final_df[final_df['title']==movie].index[0]
    distances=similarity[movie_index]   
    movies_list=sorted(enumerate(distances),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        print(i[0])
    return

In [216]:
recommend('Titanic')

818
1561
2145
310
104


In [226]:
final_df.iloc[818].title

'Captain Phillips'

In [228]:
# Defining the main "recommend" function, which will recommend the similar output movies for a single input movie
def recommend(movie):
    movie_index=final_df[final_df['title']==movie].index[0]
    distances=similarity[movie_index]   
    movies_list=sorted(enumerate(distances),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        print(final_df.iloc[i[0]].title)
    return

In [229]:
recommend('Titanic')

Captain Phillips
The Notebook
Ghost Ship
In the Heart of the Sea
Poseidon


In [210]:
# Exporting the final_df dataframe in binary mode to form our website
import pickle
pickle.dump(final_df,open('movies.pkl','wb'))

In [211]:
# Exporting the similarity array in binary mode to form our website
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))