In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings

In [3]:
warnings.filterwarnings("ignore")

In [4]:
movies = pd.read_csv('tmdb_5000_movies.csv',sep=',')
credits = pd.read_csv('tmdb_5000_credits.csv',sep=',')

In [5]:
movies.shape

(4803, 20)

In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [7]:
credits.shape

(4803, 4)

In [8]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


### Merging Datasets

In [9]:
movies = movies.merge(credits, on='title')

In [10]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [11]:
movies.shape

(4809, 23)

In [12]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

### Truncated DataFrame

In [13]:
movies = movies[['movie_id','title','overview','genres','keywords','production_companies','cast','crew']]
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,production_companies,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Preprocessing data

In [14]:
movies.isnull().sum()

movie_id                0
title                   0
overview                3
genres                  0
keywords                0
production_companies    0
cast                    0
crew                    0
dtype: int64

In [15]:
movies.dropna(inplace=True)

In [16]:
movies.duplicated().sum()

0

In [17]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4806 entries, 0 to 4808
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   movie_id              4806 non-null   int64 
 1   title                 4806 non-null   object
 2   overview              4806 non-null   object
 3   genres                4806 non-null   object
 4   keywords              4806 non-null   object
 5   production_companies  4806 non-null   object
 6   cast                  4806 non-null   object
 7   crew                  4806 non-null   object
dtypes: int64(1), object(7)
memory usage: 337.9+ KB


### Column Conversion
#### -Genres and Keywords
An example of genres look like :

In [18]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [19]:
def convert(obj):
    li = []
    for i in ast.literal_eval(obj):
        li.append(i['name'])
    return li

In [20]:
movies['genres'] = movies['genres'].apply(convert)
movies['genres'][0:6]

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
2                       [Action, Adventure, Crime]
3                 [Action, Crime, Drama, Thriller]
4             [Action, Adventure, Science Fiction]
5                     [Fantasy, Action, Adventure]
Name: genres, dtype: object

In [21]:
movies['keywords'] = movies['keywords'].apply(convert)
movies['keywords'][0:6]

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
2    [spy, based on novel, secret agent, sequel, mi...
3    [dc comics, crime fighter, terrorist, secret i...
4    [based on novel, mars, medallion, space travel...
5    [dual identity, amnesia, sandstorm, love of on...
Name: keywords, dtype: object

Here we can see that our dataframe is strating to look like a little better

In [22]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,production_companies,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


#### -Production Companies

In [23]:
movies['production_companies'][0]

'[{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporation", "id": 306}, {"name": "Dune Entertainment", "id": 444}, {"name": "Lightstorm Entertainment", "id": 574}]'

In [24]:
def convert_prod(obj):
    li=[]
    counter = 0
    for i in ast.literal_eval(obj):
        if counter < 4 :
            li.append(i['name'])
            counter += 1
    return li

In [25]:
movies['production_companies'] = movies['production_companies'].apply(convert_prod)
movies['production_companies'][0:6]

0    [Ingenious Film Partners, Twentieth Century Fo...
1    [Walt Disney Pictures, Jerry Bruckheimer Films...
2                     [Columbia Pictures, Danjaq, B24]
3    [Legendary Pictures, Warner Bros., DC Entertai...
4                               [Walt Disney Pictures]
5    [Columbia Pictures, Laura Ziskin Productions, ...
Name: production_companies, dtype: object

#### -Cast:

In [26]:
movies['cast'][0][:500]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "c'

In [27]:
def convert_cast(obj):
    li=[]
    counter = 0
    for i in ast.literal_eval(obj):
        if counter < 3 :
            li.append(i['name'])
            counter += 1
    return li

In [28]:
movies['cast'] = movies['cast'].apply(convert_cast)
movies['cast'][0:6]

0    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1       [Johnny Depp, Orlando Bloom, Keira Knightley]
2        [Daniel Craig, Christoph Waltz, Léa Seydoux]
3        [Christian Bale, Michael Caine, Gary Oldman]
4      [Taylor Kitsch, Lynn Collins, Samantha Morton]
5        [Tobey Maguire, Kirsten Dunst, James Franco]
Name: cast, dtype: object

#### -Crew :

In [29]:
movies['crew'][0][:500]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0,'

In [30]:
def convert_crew(obj):
    crew_set = set()
    crew_list = []
    
    for i in ast.literal_eval(obj):
        if i['job'] in ['Director', 'Screenplay', 'Producer']:
            name = i['name']
            if name not in crew_set:
                crew_set.add(name)
                crew_list.append(name)
    return crew_list

In [31]:
movies['crew'] = movies['crew'].apply(convert_crew)
movies['crew'][0:6]

0                          [James Cameron, Jon Landau]
1    [Gore Verbinski, Jerry Bruckheimer, Ted Elliot...
2    [Sam Mendes, John Logan, Barbara Broccoli, Rob...
3    [Charles Roven, Christopher Nolan, Jonathan No...
4    [Andrew Stanton, Colin Wilson, Jim Morris, Lin...
5    [Sam Raimi, Laura Ziskin, Avi Arad, Alvin Sarg...
Name: crew, dtype: object

In [32]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,production_companies,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[James Cameron, Jon Landau]"
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Gore Verbinski, Jerry Bruckheimer, Ted Elliot..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Sam Mendes, John Logan, Barbara Broccoli, Rob..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman]","[Charles Roven, Christopher Nolan, Jonathan No..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Andrew Stanton, Colin Wilson, Jim Morris, Lin..."


#### Overview

In [33]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['overview'][0:6]

0    [In, the, 22nd, century,, a, paraplegic, Marin...
1    [Captain, Barbossa,, long, believed, to, be, d...
2    [A, cryptic, message, from, Bond’s, past, send...
3    [Following, the, death, of, District, Attorney...
4    [John, Carter, is, a, war-weary,, former, mili...
5    [The, seemingly, invincible, Spider-Man, goes,...
Name: overview, dtype: object

We will save a copy of this dataframe to create our website.

In [34]:
cleaned_movies = movies.copy()
cleaned_movies.to_csv('cleaned_movies.csv')

#### Feature Transformation
We will remove the spaces between strings for each value in 'genres', 'keywords', 'production_companies','cast' and 'crew'.
The purpose of this to create only one tag per feature instead of two or more.
Example : 'Daniel Craig' will be 'DanielCreig'

In [35]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(' ', '') for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(' ', '') for i in x])
movies['production_companies']=movies['production_companies'].apply(lambda x:[i.replace(' ', '') for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(' ', '') for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(' ', '') for i in x])

In [36]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,production_companies,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[JamesCameron, JonLandau]"
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[GoreVerbinski, JerryBruckheimer, TedElliott, ..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[ColumbiaPictures, Danjaq, B24]","[DanielCraig, ChristophWaltz, LéaSeydoux]","[SamMendes, JohnLogan, BarbaraBroccoli, Robert..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[LegendaryPictures, WarnerBros., DCEntertainme...","[ChristianBale, MichaelCaine, GaryOldman]","[CharlesRoven, ChristopherNolan, JonathanNolan..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[WaltDisneyPictures],"[TaylorKitsch, LynnCollins, SamanthaMorton]","[AndrewStanton, ColinWilson, JimMorris, Lindse..."


In [37]:
### Final Dataframe

In [38]:
movies['tags']=movies['overview'] + movies['genres'] + movies['keywords'] + movies['production_companies'] + movies['cast'] + movies['crew']
movies

Unnamed: 0,movie_id,title,overview,genres,keywords,production_companies,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[JamesCameron, JonLandau]","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]","[GoreVerbinski, JerryBruckheimer, TedElliott, ...","[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[ColumbiaPictures, Danjaq, B24]","[DanielCraig, ChristophWaltz, LéaSeydoux]","[SamMendes, JohnLogan, BarbaraBroccoli, Robert...","[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[LegendaryPictures, WarnerBros., DCEntertainme...","[ChristianBale, MichaelCaine, GaryOldman]","[CharlesRoven, ChristopherNolan, JonathanNolan...","[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[WaltDisneyPictures],"[TaylorKitsch, LynnCollins, SamanthaMorton]","[AndrewStanton, ColinWilson, JimMorris, Lindse...","[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...,...,...,...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui...","[Action, Crime, Thriller]","[unitedstates–mexicobarrier, legs, arms, paper...",[ColumbiaPictures],"[CarlosGallardo, JaimedeHoyos, PeterMarquardt]","[RobertRodriguez, CarlosGallardo]","[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended...","[Comedy, Romance]",[],[],"[EdwardBurns, KerryBishé, MarshaDietlein]","[EdwardBurns, WilliamRexer, AaronLubin]","[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,...","[Comedy, Drama, Romance, TVMovie]","[date, loveatfirstsight, narration, investigat...","[FrontStreetPictures, MuseEntertainmentEnterpr...","[EricMabius, KristinBooth, CrystalLowe]","[HarveyKahn, ScottSmith]","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is...",[],[],[],"[DanielHenney, ElizaCoupe, BillPaxton]",[DanielHsia],"[When, ambitious, New, York, attorney, Sam, is..."


In [39]:
movies_df = movies[['movie_id','title','tags']]
movies_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [40]:
movies_df['tags'] = movies_df['tags'].apply(lambda x:' '.join(x))
movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [41]:
movies_df['tags'] = movies_df['tags'].apply(lambda x:x.lower())

In [42]:
movies_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d ingeniousfilmpartners twentiethcenturyfoxfilmcorporation duneentertainment lightstormentertainment samworthington zoesaldana sigourneyweaver jamescameron jonlandau'

### Preparing the system

In [43]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
vectors.shape

(4806, 5000)

In [45]:
cv.get_feature_names_out()[:101]

array(['000', '007', '10', '100', '11', '12', '13', '14', '1492pictures',
       '15', '16', '17', '18', '18th', '19', '1930s', '1940s', '1950s',
       '1960s', '1970s', '1980', '1980s', '1985', '1990s', '19th',
       '19thcentury', '20', '200', '2009', '20th', '21lapsentertainment',
       '24', '25', '2929productions', '30', '300', '3artsentertainment',
       '3d', '40', '40acres', '50', '500', '60', '60s', '70', 'aaron',
       'aaroneckhart', 'abandoned', 'abducted', 'abigailbreslin',
       'abilities', 'ability', 'able', 'aboard', 'abrams', 'abuse',
       'abusive', 'academy', 'accept', 'accepted', 'accepts', 'access',
       'accident', 'accidental', 'accidentally', 'accompanied',
       'accomplish', 'account', 'accountant', 'accused', 'ace', 'achieve',
       'act', 'acting', 'action', 'actionhero', 'actions', 'activist',
       'activities', 'activity', 'actor', 'actors', 'actress', 'acts',
       'actual', 'actually', 'adam', 'adammckay', 'adams', 'adamsandler',
       '

### Stemming Features

In [46]:
ps = PorterStemmer()

In [47]:
def stemming(text):
    li=[]
    for i in text.split():
        li.append(ps.stem(i))
    return ' '.join(li)

In [48]:
movies_df['tags'] = movies_df['tags'].apply(stemming)

### Similarities

In [49]:
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.08006408, 0.05337605, ..., 0.02414023, 0.02668803,
        0.        ],
       [0.08006408, 1.        , 0.05555556, ..., 0.02512595, 0.        ,
        0.        ],
       [0.05337605, 0.05555556, 1.        , ..., 0.02512595, 0.        ,
        0.        ],
       ...,
       [0.02414023, 0.02512595, 0.02512595, ..., 1.        , 0.07537784,
        0.04956816],
       [0.02668803, 0.        , 0.        , ..., 0.07537784, 1.        ,
        0.05479966],
       [0.        , 0.        , 0.        , ..., 0.04956816, 0.05479966,
        1.        ]])

In [50]:
similarity.shape

(4806, 4806)

In [51]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

[(539, 0.2668802563418119),
 (1216, 0.2656722567395829),
 (507, 0.26148818018424536),
 (2409, 0.2470831055537004),
 (220, 0.2300789234172203)]

### Recommendation Function

In [52]:
def recommend(movie):
    movies_index = movies_df[movies_df['title'] == movie].index[0]
    distances = similarity[movies_index]
    movies_list= sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(movies_df.iloc[i[0]].title)

#### Example:

In [53]:
recommend('Avatar')

Titan A.E.
Aliens vs Predator: Requiem
Independence Day
Aliens
Prometheus


In [54]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman v Superman: Dawn of Justice
Batman
Batman & Robin


In [55]:
recommend('Serenity')

The Helix... Loaded
The Black Hole
Alien
Equilibrium
Star Trek: Generations


### Pickling

In [56]:
pickle.dump(movies_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [57]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))