Importing Libraries

In [47]:
import numpy as np
import pandas as pd

Importing Dataset

In [48]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

Merging the two datasets based on common column title

In [49]:
movies = movies.merge(credits,on='title')

In [50]:
# genres
# id
# keywords
# title
# overview
# cast
# crew
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [51]:
import ast

In [52]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [53]:
movies.dropna(inplace=True)

In [54]:
movies['genres']=movies['genres'].apply(convert)

In [55]:
movies['keywords']=movies['keywords'].apply(convert)

In [56]:
def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [57]:
movies['cast']=movies['cast'].apply(convert3)

In [58]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [59]:
movies['crew']=movies['crew'].apply(fetch_director)

Splitting the overview in the form of a string so that we can concatenate and form tags

In [60]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

Replacing all the spaces in the names

In [61]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","")for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [62]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

A new dataset with only required values

In [63]:
new_df = movies[['movie_id','title','tags']]

In [64]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [65]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


Vectorization-- We are making all the tags vectors

In [66]:
# Removing Stopwords
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [67]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [68]:
import nltk

In [69]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [70]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [71]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [72]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
similarity = cosine_similarity(vectors)

In [74]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(539, 0.26089696604360174),
 (1192, 0.2581988897471611),
 (507, 0.25302403842552984),
 (260, 0.25110592822973776),
 (1214, 0.24944382578492943)]

In [75]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)
    return

In [76]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem


In [77]:
 import pickle

In [78]:
pickle.dump(new_df,open('movies.pkl','wb'))

In [79]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [80]:
pickle.dump(similarity,open('similarity.pkl','wb'))