In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
movie = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
movie.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credit = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
credit.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### We will merge both the datasets into a single dataset to make our further process a bit easy.

In [6]:
df = movie.merge(credit, on = 'title')

In [7]:
df.head(1).columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [8]:
df.shape

(4809, 23)

## Now, let's decide which all columns we will be using for the model.
Since we want to make a content-based recommendation system, we have to check for those features which influence the content.

#### Features we will use:
1. genres
2. production_companies
3. cast
4. keywords
5. title
6. overview
7. crew

In [9]:
df = df[['genres','production_companies','cast','keywords','title','overview','crew','id']]

In [10]:
df.head(1)

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",19995


In [11]:
# Check for missing data
df.isnull().sum()

genres                  0
production_companies    0
cast                    0
keywords                0
title                   0
overview                3
crew                    0
id                      0
dtype: int64

In [12]:
# Since only 3 values are missing, we can also drop those rows..
df.dropna(inplace = True)

In [13]:
df.isnull().sum()

genres                  0
production_companies    0
cast                    0
keywords                0
title                   0
overview                0
crew                    0
id                      0
dtype: int64

In [14]:
# Check if there are any duplicate rows
df.duplicated().sum()

0

In [15]:
df.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

### We will now modify the genres column.
['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [16]:
import ast

In [17]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [18]:
df['genres'] = df['genres'].apply(convert)
# Converted into the format we wanted!

In [19]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, Science Fiction]","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",19995
1,"[Adventure, Fantasy, Action]","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",285
2,"[Action, Adventure, Crime]","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,"[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",206647
3,"[Action, Crime, Drama, Thriller]","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",49026
4,"[Action, Adventure, Science Fiction]","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter,"John Carter is a war-weary, former military ca...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",49529


In [20]:
# Same concept we will apply for keywords column.
df['keywords'] = df['keywords'].apply(convert)

In [21]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, Science Fiction]","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",19995
1,"[Adventure, Fantasy, Action]","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",285
2,"[Action, Adventure, Crime]","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",206647
3,"[Action, Crime, Drama, Thriller]","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",49026
4,"[Action, Adventure, Science Fiction]","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",49529


In [22]:
# Same concept we will apply for production_companies column.
df['production_companies'] = df['production_companies'].apply(convert)

In [23]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",19995
1,"[Adventure, Fantasy, Action]","[Walt Disney Pictures, Jerry Bruckheimer Films...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",285
2,"[Action, Adventure, Crime]","[Columbia Pictures, Danjaq, B24]","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",206647
3,"[Action, Crime, Drama, Thriller]","[Legendary Pictures, Warner Bros., DC Entertai...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",49026
4,"[Action, Adventure, Science Fiction]",[Walt Disney Pictures],"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",49529


In [24]:
# Now, let's focus on cleaning cast column.
df['cast']

0       [{"cast_id": 242, "character": "Jake Sully", "...
1       [{"cast_id": 4, "character": "Captain Jack Spa...
2       [{"cast_id": 1, "character": "James Bond", "cr...
3       [{"cast_id": 2, "character": "Bruce Wayne / Ba...
4       [{"cast_id": 5, "character": "John Carter", "c...
                              ...                        
4804    [{"cast_id": 1, "character": "El Mariachi", "c...
4805    [{"cast_id": 1, "character": "Buzzy", "credit_...
4806    [{"cast_id": 8, "character": "Oliver O\u2019To...
4807    [{"cast_id": 3, "character": "Sam", "credit_id...
4808    [{"cast_id": 3, "character": "Herself", "credi...
Name: cast, Length: 4806, dtype: object

In [25]:
# We will focus on getting top 3 actors in the movie..
def convert_top(obj):
    L = []
    cnt = 0
    for i in ast.literal_eval(obj):
        if cnt != 3:
            L.append(i['name'])
            cnt += 1
    return L

In [26]:
df['cast'] = df['cast'].apply(convert_top)

In [27]:
df['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [28]:
# Similar for crew.. But we want 'Director' in particular

In [29]:
def director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [30]:
df['crew'] = df['crew'].apply(director)

In [31]:
df['crew']

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
4804     [Robert Rodriguez]
4805         [Edward Burns]
4806          [Scott Smith]
4807          [Daniel Hsia]
4808     [Brian Herzlinger]
Name: crew, Length: 4806, dtype: object

In [32]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",[James Cameron],19995
1,"[Adventure, Fantasy, Action]","[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",[Gore Verbinski],285
2,"[Action, Adventure, Crime]","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,[Sam Mendes],206647
3,"[Action, Crime, Drama, Thriller]","[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,[Christopher Nolan],49026
4,"[Action, Adventure, Science Fiction]",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...",[Andrew Stanton],49529


In [33]:
# overview is a string, so we will also convert it to a list
df['overview'] = df['overview'].apply(lambda x:x.split())

In [34]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",[James Cameron],19995
1,"[Adventure, Fantasy, Action]","[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",[Gore Verbinski],285
2,"[Action, Adventure, Crime]","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[spy, based on novel, secret agent, sequel, mi...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...",[Sam Mendes],206647
3,"[Action, Crime, Drama, Thriller]","[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",[Christopher Nolan],49026
4,"[Action, Adventure, Science Fiction]",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[based on novel, mars, medallion, space travel...",John Carter,"[John, Carter, is, a, war-weary,, former, mili...",[Andrew Stanton],49529


### Now, we have in a more clean format!!

In [35]:
# We will have to do transformation to remove spaces in the list.. 
df['cast'] = df['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
df['crew'] = df['crew'].apply(lambda x: [i.replace(' ', '') for i in x])
df['keywords'] = df['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])
df['production_companies'] = df['production_companies'].apply(lambda x: [i.replace(' ', '') for i in x])
df['genres'] = df['genres'].apply(lambda x: [i.replace(' ', '') for i in x])

In [36]:
df.head()

Unnamed: 0,genres,production_companies,cast,keywords,title,overview,crew,id
0,"[Action, Adventure, Fantasy, ScienceFiction]","[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",[JamesCameron],19995
1,"[Adventure, Fantasy, Action]","[WaltDisneyPictures, JerryBruckheimerFilms, Se...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",[GoreVerbinski],285
2,"[Action, Adventure, Crime]","[ColumbiaPictures, Danjaq, B24]","[DanielCraig, ChristophWaltz, LéaSeydoux]","[spy, basedonnovel, secretagent, sequel, mi6, ...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...",[SamMendes],206647
3,"[Action, Crime, Drama, Thriller]","[LegendaryPictures, WarnerBros., DCEntertainme...","[ChristianBale, MichaelCaine, GaryOldman]","[dccomics, crimefighter, terrorist, secretiden...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",[ChristopherNolan],49026
4,"[Action, Adventure, ScienceFiction]",[WaltDisneyPictures],"[TaylorKitsch, LynnCollins, SamanthaMorton]","[basedonnovel, mars, medallion, spacetravel, p...",John Carter,"[John, Carter, is, a, war-weary,, former, mili...",[AndrewStanton],49529


In [37]:
# Now we will make the dataframe into 3 columns : 'title', 'id', 'tag'

In [38]:
df['tags'] = df['overview'] + df['cast'] + df['crew'] + df['genres'] + df['production_companies']

In [39]:
new_df = df[['id', 'title', 'tags']]

In [40]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [41]:
# Now we will convert the tags into string and also convert to lower case for better use case purpose
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
# new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [42]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [59]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      4806 non-null   int64 
 1   title   4806 non-null   object
 2   tags    4806 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.2+ KB


In [60]:
# Lowercase for better purpose
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [None]:
new_df.info

In [44]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


#### Now, we have all the things what we wanted for futher process!!

## Vectorization

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [46]:
vec = cv.fit_transform(new_df['tags']).toarray()

In [47]:
vec[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [48]:
# Now we need to check for the similar words and attach them to a root word
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [49]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [50]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


### To recommend similar movies, we will now use cosine distance to check for similarity

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
sim = cosine_similarity(vec)
sim.shape

(4806, 4806)

In [53]:
sorted(list(enumerate(sim[0])), reverse = True, key = lambda x:x[1])[1:6]

[(1654, 0.28721347895177635),
 (942, 0.26804385337361925),
 (292, 0.26111648393354675),
 (1214, 0.2519516299301343),
 (33, 0.25087260300212727)]

## Main Function for Recommendation
1. We will first try to get the index of the movie which we called.

In [54]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distance = sim[movie_index]
    movie_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x:x[1])[1:6]    
    
    for i in movie_list:
        print(new_df.iloc[i[0]].title)
        

In [55]:
recommend('Batman Begins')

The Dark Knight Rises
The Dark Knight
Batman
Amidst the Devil's Wings
Batman


### Now we need this file to be connected with the frontend.

In [56]:
import pickle

In [57]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [58]:
pickle.dump(sim, open('sim.pkl', 'wb'))