In [1]:
# Importing required libraries

import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing datasets

movies=pd.read_csv('data/tmdb_5000_movies.csv')

credits=pd.read_csv('data/tmdb_5000_credits.csv')

In [4]:
# credits=credits.rename(columns={'movie_id':'id'})

In [5]:
movies=movies.merge(credits)

In [6]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
#Selecting only relevent columns

select_cols=['id','genres','title','overview','keywords','cast','crew']

In [9]:
movies=movies[select_cols]

In [10]:
movies.isnull().sum()

id          0
genres      0
title       0
overview    3
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
movies.dropna(inplace=True)

In [12]:
movies.duplicated().sum()

0

In [13]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [14]:
#function to get keywords and genres for a movie, using ast to get in dict form

def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [15]:
movies['genres']=movies['genres'].apply(convert)

In [16]:
movies['keywords']=movies['keywords'].apply(convert)

In [17]:
#function to get top 3 cast from cast list

def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [18]:
movies['cast']=movies['cast'].apply(convert3)

In [19]:
# Function to get the director name from the crew list

def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Direcor':
            L.append(i['name'])
            break
    return L

In [20]:
movies['crew']=movies['crew'].apply(fetch_director)

In [21]:
#Storing summary of movie in a list

movies['overview']=movies['overview'].apply(lambda x:x.split())

In [22]:
#Removing white spaces between these features values, so that they can be used as one entity

movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [23]:
#Creating tag, combining all relevent features

movies['tags']=movies['genres']+movies['cast']+movies['keywords']+movies['crew']+movies['overview']

In [24]:
df=movies[['id','title','tags']]

In [25]:
df['tags']=df['tags'].apply(lambda x:" ".join(x))

In [26]:
df['tags']=df['tags'].apply(lambda x:x.lower())

In [27]:
#Stemming the words using nltk library

ps=PorterStemmer()

In [28]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [29]:
df['tags']=df['tags'].apply(stem)

In [30]:
vectorizer = TfidfVectorizer(max_features=5000,stop_words='english')

In [79]:
vectors = vectorizer.fit_transform(df['tags'])
vectors=vectors.toarray()

In [80]:
knn=NearestNeighbors(n_neighbors=6,metric='cosine')

knn.fit(vectors)

In [81]:
def recommend(movie):
    movie_idx=df[df['title']==movie].index[0]
    distances, indices = knn.kneighbors(vectors[movie_idx].reshape(1, 5000))
    movies_list=indices[0][1:] #saving index of 5 closest movies
    for i in movies_list:
        print(df.iloc[i]['id'],df.iloc[i].title)
    return None


In [82]:
recommend('Avatar')

270938 Falcon Rising
679 Aliens
44943 Battle: Los Angeles
440 Aliens vs Predator: Requiem
50357 Apollo 18


In [83]:
# saving dependency files for app

In [84]:
#Saving movie ids and titles

df_id_name=df[['id','title']]
pickle.dump(df_id_name,open('dependencies/movies.pkl','wb'))

In [85]:
#Saving vectors of movie tags

pickle.dump(vectors,open('dependencies/vectors.pkl','wb'))

In [88]:
#Saving knn object to get recommended movies

pickle.dump(knn,open('dependencies/knn.pkl','wb'))

In [87]:
knn