In [830]:
import numpy as np 
import pandas as pd
import ast
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [767]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [768]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [769]:
movies.shape

(4803, 20)

In [770]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [771]:
credits.shape

(4803, 4)

In [772]:
movies = movies.merge(credits,on='title')

In [773]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [774]:
movies.shape

(4809, 23)

In [775]:
# Keeping important columns for recommendation
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [776]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [777]:
movies.dropna(inplace=True)

In [778]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [779]:
movies.shape

(4806, 7)

In [780]:
movies.duplicated().sum()

0

In [781]:
movies.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [782]:
def movies_genres(genres):
    lst = []
    for i in ast.literal_eval(genres):
        lst.append(i["name"])
    return lst

In [783]:
movies["genres"] = movies["genres"].apply(movies_genres)
movies["genres"][0 : 5]

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
2                       [Action, Adventure, Crime]
3                 [Action, Crime, Drama, Thriller]
4             [Action, Adventure, Science Fiction]
Name: genres, dtype: object

In [784]:
type(movies['keywords'][0])

str

In [785]:
movies.iloc[0]['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [786]:
movies["keywords"] = movies["keywords"].apply(movies_genres)
movies["keywords"][0 : 5]

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
2    [spy, based on novel, secret agent, sequel, mi...
3    [dc comics, crime fighter, terrorist, secret i...
4    [based on novel, mars, medallion, space travel...
Name: keywords, dtype: object

In [787]:
movies.iloc[0]['cast']

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [788]:
def movies_cast(cast):
    lst = []
    count = 0
    for i in ast.literal_eval(movies.iloc[0]['cast']):
        if count <= 2:
            lst.append(i["name"])
            count += 1
        else:
            break
    return lst

In [789]:
movies["cast"] = movies["cast"].apply(movies_cast)
movies["cast"][0 : 5]

0    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
2    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
3    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
4    [Sam Worthington, Zoe Saldana, Sigourney Weaver]
Name: cast, dtype: object

In [790]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [791]:
movies.iloc[0]['crew']

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [792]:
def movies_crew(crew):
    lst = []
    for i in ast.literal_eval(crew):
        if i["job"] == "Director":
            lst.append(i["name"])
        if i["job"] == "Screenplay":
            lst.append(i["name"])
    return lst

In [793]:
movies["crew"] = movies["crew"].apply(movies_crew)
movies["crew"][0 : 10]

0                       [James Cameron, James Cameron]
1          [Gore Verbinski, Ted Elliott, Terry Rossio]
2    [Sam Mendes, John Logan, Robert Wade, Neal Pur...
3    [Christopher Nolan, Christopher Nolan, Jonatha...
4    [Andrew Stanton, Andrew Stanton, Michael Chabo...
5    [Sam Raimi, Sam Raimi, Alvin Sargent, Ivan Raimi]
6           [Dan Fogelman, Byron Howard, Nathan Greno]
7                                        [Joss Whedon]
8                          [Steve Kloves, David Yates]
9          [David S. Goyer, Zack Snyder, Chris Terrio]
Name: crew, dtype: object

In [794]:
movies.iloc[0]['overview']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [795]:
lst = movies.iloc[0]['overview'].split(" ")

In [796]:
def movies_overview(overview):
    lst = overview.split(" ")
    return lst

In [797]:
movies["overview"] = movies["overview"].apply(movies_overview)
movies["overview"][0 : 5]

0    [In, the, 22nd, century,, a, paraplegic, Marin...
1    [Captain, Barbossa,, long, believed, to, be, d...
2    [A, cryptic, message, from, Bond’s, past, send...
3    [Following, the, death, of, District, Attorney...
4    [John, Carter, is, a, war-weary,, former, mili...
Name: overview, dtype: object

In [798]:
def remove_space(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [799]:
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

In [800]:
movies["keywords"][0 : 5]

0    [cultureclash, future, spacewar, spacecolony, ...
1    [ocean, drugabuse, exoticisland, eastindiatrad...
2    [spy, basedonnovel, secretagent, sequel, mi6, ...
3    [dccomics, crimefighter, terrorist, secretiden...
4    [basedonnovel, mars, medallion, spacetravel, p...
Name: keywords, dtype: object

In [801]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[JamesCameron, JamesCameron]"
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[GoreVerbinski, TedElliott, TerryRossio]"
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[SamMendes, JohnLogan, RobertWade, NealPurvis,..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[ChristopherNolan, ChristopherNolan, JonathanN..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[SamWorthington, ZoeSaldana, SigourneyWeaver]","[AndrewStanton, AndrewStanton, MichaelChabon, ..."


In [802]:
def lower_case(value):
    str = " "
    for i in value:
        i = i.lower()
        str += i+" "
    return str

In [803]:
movies['cast'] = movies['cast'].apply(lower_case)
movies['crew'] = movies['crew'].apply(lower_case)
movies['genres'] = movies['genres'].apply(lower_case)
movies['keywords'] = movies['keywords'].apply(lower_case)
movies['overview'] = movies['overview'].apply(lower_case)

In [804]:
movies["crew"]

0                              jamescameron jamescameron 
1                   goreverbinski tedelliott terryrossio 
2        sammendes johnlogan robertwade nealpurvis jez...
3        christophernolan christophernolan jonathannolan 
4        andrewstanton andrewstanton michaelchabon mar...
                              ...                        
4804                                     robertrodriguez 
4805                                         edwardburns 
4806                                          scottsmith 
4807                                          danielhsia 
4808                   brianherzlinger jongunn brettwinn 
Name: crew, Length: 4806, dtype: object

In [805]:
movies['overview']

0        in the 22nd century, a paraplegic marine is d...
1        captain barbossa, long believed to be dead, h...
2        a cryptic message from bond’s past sends him ...
3        following the death of district attorney harv...
4        john carter is a war-weary, former military c...
                              ...                        
4804     el mariachi just wants to play his guitar and...
4805     a newlywed couple's honeymoon is upended by t...
4806     "signed, sealed, delivered" introduces a dedi...
4807     when ambitious new york attorney sam is sent ...
4808     ever since the second grade when he first saw...
Name: overview, Length: 4806, dtype: object

In [806]:
movies['tags'] = movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['overview']

In [807]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is d...",action adventure fantasy sciencefiction,cultureclash future spacewar spacecolony soci...,samworthington zoesaldana sigourneyweaver,jamescameron jamescameron,action adventure fantasy sciencefiction cult...
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, h...",adventure fantasy action,ocean drugabuse exoticisland eastindiatrading...,samworthington zoesaldana sigourneyweaver,goreverbinski tedelliott terryrossio,adventure fantasy action ocean drugabuse exo...
2,206647,Spectre,a cryptic message from bond’s past sends him ...,action adventure crime,spy basedonnovel secretagent sequel mi6 briti...,samworthington zoesaldana sigourneyweaver,sammendes johnlogan robertwade nealpurvis jez...,action adventure crime spy basedonnovel secr...
3,49026,The Dark Knight Rises,following the death of district attorney harv...,action crime drama thriller,dccomics crimefighter terrorist secretidentit...,samworthington zoesaldana sigourneyweaver,christophernolan christophernolan jonathannolan,action crime drama thriller dccomics crimefi...
4,49529,John Carter,"john carter is a war-weary, former military c...",action adventure sciencefiction,basedonnovel mars medallion spacetravel princ...,samworthington zoesaldana sigourneyweaver,andrewstanton andrewstanton michaelchabon mar...,action adventure sciencefiction basedonnovel...


In [808]:
new_df = movies[['movie_id','title','tags']]

In [809]:
new_df["tags"][0]

' action adventure fantasy sciencefiction  cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d  samworthington zoesaldana sigourneyweaver  jamescameron jamescameron  in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. '

In [810]:
ps = PorterStemmer()

In [811]:
def stem(text):
    lst = text.split(" ")
    str = ""
    for i in lst:
        str = str + ps.stem(i) + " "
    return str

In [812]:
new_df["tags"] = new_df["tags"].apply(stem)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [813]:
cv = CountVectorizer(max_features=5000,stop_words='english')

In [832]:
cv

CountVectorizer(max_features=5000, stop_words='english')

In [814]:
vector = cv.fit_transform(new_df['tags']).toarray()

In [815]:
len(vector[1])

5000

In [816]:
vector.shape

(4806, 5000)

In [817]:
similarity = cosine_similarity(vector)

In [818]:
similarity.shape

(4806, 4806)

In [819]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x : x[1])
    for i in distances[0:6]:
        print(new_df.iloc[i[0]].title)

In [829]:
recommend("Spider-Man")

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
The Helix... Loaded


In [831]:
pickle.dump(new_df,open('artifacts/movie_list.pkl','wb'))
pickle.dump(similarity,open('artifacts/similarity.pkl','wb'))

In [833]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturec...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exoti...
2,206647,Spectre,action adventur crime spi basedonnovel secre...
3,49026,The Dark Knight Rises,action crime drama thriller dccomic crimefig...
4,49529,John Carter,action adventur sciencefict basedonnovel mar...
