In [2]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import pickle

In [3]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies=movies.merge(credits,on='title')

In [5]:
movies.shape

(4809, 23)

In [6]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
#remove budget, homepage, orignal_language, popularity, production_companies, release_date, revenue, runtime,
#spoken_languages, status, tagline, vote_average, vote_count, movie_id

In [8]:
movies=movies[['id','title','genres','overview','cast','crew','keywords']]
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [9]:
movies=movies.dropna() #drop null values

In [10]:
def reformat(obj):
  L=[]
  for l in ast.literal_eval(obj):
      L.append(l['name'])
  return L

In [11]:
movies['genres']=movies['genres'].apply(reformat)
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [12]:
movies['keywords']=movies['keywords'].apply(reformat)

In [13]:
def fetch_cast(obj):
  L=[]
  cnt=0
  for l in ast.literal_eval(obj):
      if cnt!=3:
        L.append(l['name'])
        cnt+=1
      else:
        break
  return L

In [14]:
movies['cast']=movies['cast'].apply(fetch_cast)

In [15]:
movies['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [16]:
def fetch_director(obj):
  L=[]
  for l in ast.literal_eval(obj):
      if l['job']=='Director':
        L.append(l['name'])
        break
  return L

In [17]:
movies['crew']=movies['crew'].apply(fetch_director)

In [18]:
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[based on novel, mars, medallion, space travel..."


In [20]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [21]:
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[based on novel, mars, medallion, space travel..."


In [22]:
movies['tags']=movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [23]:
movies['tags']=movies['tags'].apply(lambda x:[i.replace(" ","") for i in x])

In [24]:
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon...","[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[ocean, drug abuse, exotic island, east india ...","[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[spy, based on novel, secret agent, sequel, mi...","[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[dc comics, crime fighter, terrorist, secret i...","[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[based on novel, mars, medallion, space travel...","[Action, Adventure, ScienceFiction, basedonnov..."


In [25]:
movies['tags']=movies['overview']+movies['tags']

In [26]:
movies.head()

Unnamed: 0,id,title,genres,overview,cast,crew,keywords,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili..."


In [27]:
new_df = movies[['id','title','tags','genres','cast','crew']]

In [28]:
new_df.head()

Unnamed: 0,id,title,tags,genres,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [29]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [30]:
new_df.head()

Unnamed: 0,id,title,tags,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [31]:
ps=PorterStemmer()
def stem(text):
  y=[]
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [32]:
stem('playing football')

'play footbal'

In [33]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [34]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [35]:
new_df.head()

Unnamed: 0,id,title,tags,genres,cast,crew
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,a cryptic messag from bond’ past send him on a...,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"john carter is a war-weary, former militari ca...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [36]:
# cv= CountVectorizer(max_features=5000,stop_words='english')
vectorizer = TfidfVectorizer(max_features=5000,stop_words='english')

In [37]:
# vectors=cv.fit_transform(new_df['tags']).toarray()
vectors=vectorizer.fit_transform(new_df['tags'])

In [38]:
vectors.shape

(4806, 5000)

In [39]:
similarity=cosine_similarity(vectors)

In [40]:
similarity[0]

array([1.        , 0.02203984, 0.03038546, ..., 0.02319866, 0.        ,
       0.        ])

In [41]:
def recommend(movie):
  movie_idx=new_df[new_df['title']==movie].index[0]
  distances=similarity[movie_idx]
  movies_list=sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:26]

  for i in movies_list:
    print(new_df.iloc[i[0]].title)

In [42]:
recommend('Batman')

Batman
Batman & Robin
Batman Returns
The Dark Knight Rises
Batman Begins
Batman
Batman Forever
The Dark Knight
Batman v Superman: Dawn of Justice
Superman
Spider-Man 2
Superman Returns
Chill Factor
Superman II
Mirrormask
Man of Steel
Green Lantern
Defendor
Superman IV: The Quest for Peace
Griff the Invisible
The Talented Mr. Ripley
The Secret in Their Eyes
Free State of Jones
Kick-Ass 2
Fantastic 4: Rise of the Silver Surfer


In [43]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])

[(0, 1.0000000000000002),
 (2409, 0.2519930130215996),
 (3730, 0.22559347561842458),
 (582, 0.19388378323858985),
 (1216, 0.18848538168904172),
 (3608, 0.17684269115072765),
 (47, 0.1730458195053612),
 (778, 0.16564313221878),
 (1204, 0.16306831442162661),
 (539, 0.1562998465801078),
 (942, 0.15459411606089474),
 (1920, 0.1542577009440469),
 (557, 0.14657202013357098),
 (507, 0.14551199601641027),
 (438, 0.1371678375038985),
 (3538, 0.1364285511289609),
 (1089, 0.13514407456470146),
 (2786, 0.1343107019292942),
 (311, 0.13380103355567788),
 (260, 0.13234936195955957),
 (74, 0.13139220820095845),
 (4048, 0.1313315488611379),
 (4336, 0.13047808864882732),
 (838, 0.12558586190032173),
 (94, 0.12356432056424219),
 (513, 0.1212433467438935),
 (322, 0.12123636936430861),
 (3675, 0.12063560345370145),
 (1071, 0.12049350870113182),
 (1348, 0.1200478497854614),
 (184, 0.11971910559078813),
 (373, 0.1194669342459123),
 (3162, 0.1191927407051544),
 (1201, 0.11897803487602246),
 (3628, 0.118572591

In [44]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [45]:
pickle.dump(similarity,open('similarity.pkl','wb'))