# Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Merging movies and credits dataset

In [2]:
# read the two TMDB CSV files; the paths are relative, so both files must sit
# in the current working directory
movies = pd.read_csv('tmdb_5000_movies.csv')
# NOTE(review): the name `credits` shadows the interactive built-in `credits`
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.shape

(4803, 20)

In [4]:
credits.shape

(4803, 4)

In [5]:
# NOTE(review): titles are not unique keys — both inputs have 4803 rows but the
# merged frame has 4809, so some titles match more than one row.
# Consider joining on the movie id instead — TODO confirm the id column names in both files.
movies = movies.merge(credits, on='title')

In [6]:
movies.shape

(4809, 23)

### Preprocessing stage

In [7]:
# keeping only the required columns : genres, keywords, overview, title, movie_id, cast, crew
# .copy() makes the selection an independent frame, so the later dropna and
# column assignments do not act on a slice of the merged frame
# (avoids SettingWithCopyWarning on pandas versions without copy-on-write)
movies = movies[['genres', 'keywords', 'overview', 'title', 'movie_id', 'cast', 'crew']].copy()

In [8]:
# counting null values per column (rows containing nulls are dropped in the next cell)
movies.isnull().sum()

genres      0
keywords    0
overview    3
title       0
movie_id    0
cast        0
crew        0
dtype: int64

In [9]:
# drop every row that contains a null value; the result is rebound instead of
# using inplace=True so that a column-sliced frame is never mutated in place.
# The surviving rows keep their original index labels, so the index has gaps.
movies = movies.dropna()

In [10]:
#checking for duplicate rows
movies.duplicated().sum()

0

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
#to convert string into list
import ast

In [13]:
def convert(obj):
    """Parse a stringified list of dicts and return each dict's 'name' value, in order."""
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [14]:
# parse every genres string and keep only the 'name' of each entry.
# Not re-runnable: once the column holds lists, ast.literal_eval in convert() rejects them.
movies['genres'] = movies['genres'].apply(convert)
movies['genres']

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [15]:
# same conversion for keywords: stringified list of dicts -> list of 'name' values
movies['keywords'] = movies['keywords'].apply(convert)
movies['keywords']

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [16]:
def top3cast(obj):
    """Return the 'name' values of the first three entries of a stringified list of dicts.

    Fewer than three names are returned when the list is shorter; entries past
    the third are never inspected.
    """
    names = []
    for position, member in enumerate(ast.literal_eval(obj)):
        if position >= 3:
            break
        names.append(member['name'])
    return names

In [17]:
# reduce each cast string to the names of its first three entries
movies['cast'] = movies['cast'].apply(top3cast)
movies['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [18]:
def fetch_director(obj):
    """Return a one-element list with the name of the first entry whose job is 'Director'.

    The input is a stringified list of dicts; an empty list is returned when no
    entry has that job.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [19]:
# keep only the first director of each crew string (as a list with zero or one name)
movies['crew'] = movies['crew'].apply(fetch_director)
movies['crew']

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
4804     [Robert Rodriguez]
4805         [Edward Burns]
4806          [Scott Smith]
4807          [Daniel Hsia]
4808     [Brian Herzlinger]
Name: crew, Length: 4806, dtype: object

In [20]:
# tokenize every overview on whitespace so it becomes a list like the other columns
# (punctuation stays attached to the neighbouring word)
movies['overview'] = movies['overview'].map(str.split)
movies['overview']

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

In [21]:
movies.head()

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,285,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...",Spectre,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,49026,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...",John Carter,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [22]:
# remove the spaces inside every multi-word entry so that each genre, keyword
# and person name becomes a single token (e.g. "Science Fiction" -> "ScienceFiction")
for column in ['genres', 'keywords', 'cast', 'crew']:
    movies[column] = movies[column].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [23]:
# row by row, concatenate the five token lists into one combined list per movie
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['cast'] + movies['crew']

In [24]:
movies.head()

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew,tags
0,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,19995,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,285,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",Spectre,206647,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,49026,"[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",John Carter,49529,"[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[Action, Adventure, ScienceFiction, basedonnov..."


In [25]:
# build a separate frame holding only 'movie_id', 'title' and 'tags' (in that order);
# the source frame `movies` is left unchanged
new_movies = movies[['movie_id', 'title', 'tags']].copy()

In [26]:
# collapse each token list into one space-separated string and lowercase it
new_movies['tags'] = new_movies['tags'].apply(lambda tokens: " ".join(tokens).lower())

In [27]:
new_movies

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction cultur...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,206647,Spectre,action adventure crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,49529,John Carter,action adventure sciencefiction basedonnovel m...
...,...,...,...
4804,9367,El Mariachi,action crime thriller unitedstates–mexicobarri...
4805,72766,Newlyweds,comedy romance a newlywed couple's honeymoon i...
4806,231617,"Signed, Sealed, Delivered",comedy drama romance tvmovie date loveatfirsts...
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


### Text vectorization (here: bag of words; alternatives: word2vec, TF-IDF)

In [28]:
# importing the Natural Language Toolkit; its Porter stemmer is used to stem all the tags
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# importing scikit-learn for text vectorization
from sklearn.feature_extraction.text import CountVectorizer
# bag-of-words counts: vocabulary capped at 5000 features, English stop words removed
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [29]:
def stem(text):
    """Stem every whitespace-separated word of `text` with the module-level stemmer `ps`.

    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [30]:
# replace every word of each tag string with its stem, as produced by stem()
new_movies['tags'] = new_movies['tags'].apply(stem)
new_movies['tags']

0       action adventur fantasi sciencefict culturecla...
1       adventur fantasi action ocean drugabus exotici...
2       action adventur crime spi basedonnovel secreta...
3       action crime drama thriller dccomic crimefight...
4       action adventur sciencefict basedonnovel mar m...
                              ...                        
4804    action crime thriller unitedstates–mexicobarri...
4805    comedi romanc a newlyw couple' honeymoon is up...
4806    comedi drama romanc tvmovi date loveatfirstsig...
4807    when ambiti new york attorney sam is sent to s...
4808    documentari obsess camcord crush dreamgirl eve...
Name: tags, Length: 4806, dtype: object

In [31]:
# learn the vocabulary from the tags and turn every movie into a row of word counts;
# .toarray() converts the result into a dense NumPy array (one row per movie)
vectors = cv.fit_transform(new_movies['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [33]:
# computing the pairwise cosine similarity between all tag vectors
# (higher means more similar; the diagonal is each movie compared with itself)
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [34]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

### Steps to recommend a movie
#### 1. Fetch the index of the movie
#### 2. Sort the similarity array of that index
#### 3. Return the top similar movies

In [35]:
def recommend(movie):
    """Print the titles of the four movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title as stored in ``new_movies['title']``. When several rows
        share the title, the first one is used.

    Raises
    ------
    IndexError
        If no row of ``new_movies`` has that title.
    """
    # `similarity` is a plain NumPy array and must be addressed by row position.
    # `new_movies` still carries the index labels left over after dropna
    # (4806 rows, labels up to 4808), so an index label is not a valid row
    # number; the position is looked up explicitly instead.
    matches = (new_movies['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in new_movies")
    movie_pos = int(matches[0])

    scores = similarity[movie_pos]
    # rank every other movie by similarity, highest first; the queried movie is
    # excluded by position rather than assuming it always sorts to the front
    ranked = sorted(
        (pair for pair in enumerate(scores) if pair[0] != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:4]

    for position, _score in ranked:
        print(new_movies.iloc[position].title)

In [36]:
# recommend(input("Enter the movie name : "))

In [37]:
#Thank You

In [42]:
# exporting new_movies and similarity as pickle files
import pickle

# the context managers flush and close each file even if pickling raises
with open('new_movies.pkl', 'wb') as movies_file:
    pickle.dump(new_movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)