In [None]:
import pandas as pd
import numpy as np
import ast
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import pickle
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Load the two raw CSV files; both paths are relative to the notebook's working directory.
# NOTE(review): the name `credits` shadows the interactive `credits` builtin for the rest of the session.
movies_metadata=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Join the two tables on their shared `title` column, then keep only the
# fields the rest of the notebook uses.
# NOTE(review): if a title occurs more than once in either file, every
# same-titled pair is emitted by the join. Confirm titles are unique, or join
# on an id key instead.
kept_columns = [
    'movie_id', 'title', 'overview', 'genres', 'keywords',
    'cast', 'crew', 'popularity', 'vote_average', 'vote_count',
]
movies = pd.merge(movies_metadata, credits, on='title')[kept_columns]

Data Pre-processing

In [None]:
# Display (rows, columns) of the merged, column-trimmed frame.
movies.shape

In [None]:
# Preview the first rows before the string-encoded columns are parsed.
movies.head()

In [None]:
def convert(text):
    """Parse a string holding a Python-literal list of dicts and return every 'name' value.

    Args:
        text: String that `ast.literal_eval` can turn into an iterable of
            mappings, each having a 'name' key.

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
# Turn the string-encoded genre and keyword columns into plain lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [None]:
def convert3(text):
    """Parse a string holding a Python-literal list of dicts and return the first three 'name' values.

    Args:
        text: String that `ast.literal_eval` can turn into an iterable of
            mappings; only the first three need a 'name' key.

    Returns:
        List of at most three names, in their original order.
    """
    # zip() against range(3) stops after three entries, so later entries are
    # never inspected.
    return [entry['name'] for _, entry in zip(range(3), ast.literal_eval(text))]

In [None]:
# Keep only the first three names listed in each cast entry (see convert3).
movies['cast']=movies['cast'].apply(convert3)

In [None]:
def fetch_director(obj):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        obj: String that `ast.literal_eval` can turn into an iterable of
            mappings with 'job' and 'name' keys.

    Returns:
        A one-element list with the director's name, or an empty list when no
        entry has job == 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first match is wanted; stop scanning immediately.
            return [member['name']]
    return []

In [None]:
# Replace each crew string with a list holding the first 'Director' name (empty list if none).
movies['crew']=movies['crew'].apply(fetch_director)

In [None]:
# Preview the frame now that genres, keywords, cast and crew hold lists of names.
movies.head()

In [None]:
# The crew column now holds only the director, so give it a matching name.
movies = movies.rename(columns={'crew': 'Director'})

In [None]:
# Count missing values per column to see what the following dropna will remove.
movies.isnull().sum()

In [None]:
# Drop rows with any missing value, then renumber the index 0..n-1.
# Later cells take `.index[0]` of a title match and use it as a row position
# in the similarity matrices and with `.iloc`. Without the reset, any dropped
# row leaves a gap and those lookups point at the wrong movie.
movies = movies.dropna().reset_index(drop=True)

In [None]:
def _strip_spaces(items):
    """Return the items with every space removed, so multi-word names become one token."""
    return [item.replace(" ", "") for item in items]


# Collapse multi-word names into single tokens in every list-valued column.
for column in ('genres', 'keywords', 'cast', 'Director'):
    movies[column] = movies[column].apply(_strip_spaces)

# The overview is free text: split it on whitespace into a list of words.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

In [None]:
# Concatenate the three token lists into one list per movie.
# cast and Director are not part of `tags`.
movies['tags']=movies['genres']+movies['keywords']+movies['overview']

In [None]:
# Preview the frame including the new `tags` column.
movies.head()

Based on Content

In [None]:
# Frame for the content-based model. `.copy()` makes it independent of
# `movies`, so the later assignment to new_df['tags'] neither raises
# SettingWithCopyWarning nor risks touching `movies`.
new_df = movies[['movie_id', 'title', 'tags']].copy()

In [None]:
# Display (rows, columns) of the content-model frame.
new_df.shape

In [None]:
# Turn each movie's token list into one lowercase, space-separated string.
# The value is derived from movies['tags'] (same index as new_df), so
# re-running this cell gives the same result instead of re-joining an
# already-joined string. `assign` returns a new frame, which also avoids
# writing into a slice of `movies`.
new_df = new_df.assign(
    tags=movies['tags'].apply(lambda tokens: " ".join(tokens).lower())
)

In [None]:
# Shape check after the tags were converted to strings.
new_df.shape

In [None]:
# Fit a TF-IDF model on the tag strings, using scikit-learn's built-in English stop-word list,
# and transform every movie into its TF-IDF vector.
vectorizer=TfidfVectorizer(stop_words='english')
vectors=vectorizer.fit_transform(new_df['tags'])

In [None]:
# Display the dimensions of the TF-IDF matrix (one row per movie).
vectors.shape

In [None]:
# Dot product between every pair of TF-IDF rows, giving a square movie-by-movie matrix.
# NOTE(review): this equals cosine similarity only if the rows are L2-normalised,
# which is believed to be TfidfVectorizer's default — confirm.
similarity=linear_kernel(vectors,vectors)

In [None]:
# Ask the user for a free-text title.
# NOTE: input() blocks until answered, so an unattended "Run All" stalls here.
movie_name=input('Enter your favourite movie name')

In [None]:
# Plain list of titles for fuzzy matching; the order follows the rows of `movies`.
movie_titles=movies['title'].tolist()

In [None]:
# Fuzzy-match the user's input against the known titles and keep the best hit.
# get_close_matches returns an empty list when nothing is similar enough, so
# fail with an explicit message rather than an IndexError from `[0]`.
candidate_titles = difflib.get_close_matches(movie_name, movie_titles)
if not candidate_titles:
    raise ValueError(
        f"No movie title similar to {movie_name!r} was found; check the spelling and try again."
    )
close_match = candidate_titles[0]

In [None]:
# Show which title the fuzzy match resolved to.
close_match

In [None]:
# Row POSITION of the matched title (first occurrence).
# The similarity matrices and the later `.iloc` lookups are positional, while
# `movies.index` keeps its original labels after dropna, so label and position
# can differ. `movie_titles` was built from movies['title'] in row order, so
# its list position is the row position.
movie_index = movie_titles.index(close_match)

In [None]:
# Pair every position in the chosen movie's similarity row with its score: [(position, score), ...].
similarity_score=list(enumerate(similarity[movie_index]))

In [None]:
# Rank the (position, score) pairs by score, highest first.
sorted_similar_movies=sorted(similarity_score,key=lambda x:x[1],reverse=True)
# Display how many candidates were ranked (one per entry of the similarity row).
len(sorted_similar_movies)

In [None]:
# Print the ten most similar movies by tag similarity.
# The queried movie is skipped: its own position is `movie_index`, because the
# ranked row was taken from similarity[movie_index]. Only the ten rows that
# are printed get looked up, rather than every row in the frame.
print("Movies suggested for you:\n")
top_positions = [
    position for position, _score in sorted_similar_movies if position != movie_index
][:10]
for i, index in enumerate(top_positions, start=1):
    title_of_index = new_df.iloc[index].title
    print(i, ' ', title_of_index)


In [None]:
# Save the content-model frame and its similarity matrix.
# The `with` blocks guarantee each file is flushed and closed, even if
# pickling fails part-way.
with open('content_movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new_df, movie_list_file)
with open('content_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

Based on cast and crew

In [None]:
# Frame for the cast/director model. `.copy()` makes it independent of
# `movies`, so adding the `people` column below neither raises
# SettingWithCopyWarning nor risks touching `movies`.
new_df1 = movies[['movie_id', 'title', 'Director', 'cast']].copy()

In [None]:
# One list per movie: the director entry (if any) followed by the cast names kept earlier.
new_df1['people']=new_df1['Director']+new_df1['cast']

In [None]:
# Turn each movie's people list into one lowercase, space-separated string.
# The value is rebuilt from the Director and cast columns (the same
# concatenation that defines `people`), so re-running this cell gives the same
# result instead of re-joining an already-joined string. `assign` returns a
# new frame, which also avoids writing into a slice of `movies`.
new_df1 = new_df1.assign(
    people=(new_df1['Director'] + new_df1['cast']).apply(
        lambda names: " ".join(names).lower()
    )
)

In [None]:
# Display (rows, columns) of the cast/director frame, including the new `people` column.
new_df1.shape

In [None]:
# Fit a second TF-IDF model on the people strings. No stop-word list is passed here,
# unlike the tags vectorizer above.
vectorizer1=TfidfVectorizer()
vectors1=vectorizer1.fit_transform(new_df1['people'])

In [None]:
# Dot product between every pair of people vectors, giving a square movie-by-movie matrix.
similarity1=linear_kernel(vectors1,vectors1)

In [None]:
# Pair every position in the chosen movie's row with its score.
# NOTE: reuses `movie_index` computed in the content-based section, so that section must be run first.
similarity_score1=list(enumerate(similarity1[movie_index]))

In [None]:
# Rank the (position, score) pairs by score, highest first.
sorted_similar_movies1=sorted(similarity_score1,key=lambda x:x[1],reverse=True)

In [None]:
# Print the ten most similar movies by shared director and cast.
# The queried movie is skipped: its own position is `movie_index`, because the
# ranked row was taken from similarity1[movie_index]. Only the ten rows that
# are printed get looked up, rather than every row in the frame.
print("Movies suggested for you:\n")
top_positions1 = [
    position for position, _score in sorted_similar_movies1 if position != movie_index
][:10]
for i, index in enumerate(top_positions1, start=1):
    title_of_index = new_df1.iloc[index].title
    print(i, ' ', title_of_index)

In [None]:
# Shape check of the cast/director frame before it is saved.
new_df1.shape

In [None]:
# Save the cast/director frame and its similarity matrix.
# The `with` blocks guarantee each file is flushed and closed, even if
# pickling fails part-way.
with open('cast_movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new_df1, movie_list_file)
with open('cast_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity1, similarity_file)

Popular among people

In [None]:
# Display (rows, columns) of the fully pre-processed frame.
movies.shape

In [None]:
# Frame for the popularity ranking. `.copy()` makes it independent of
# `movies`, so adding `weighted_average` below neither raises
# SettingWithCopyWarning nor risks touching `movies`.
df2 = movies[['movie_id', 'title', 'tags', 'popularity', 'vote_average', 'vote_count']].copy()

In [None]:
# Display (rows, columns) of the popularity frame.
df2.shape

In [None]:
# Preview the rows ordered by popularity, highest first.
# The sorted result is only displayed; df2 itself is not reordered because nothing is assigned.
df2.sort_values(by="popularity",ascending=False,kind="mergesort")

In [None]:
# Inputs to a vote-count-weighted rating.
V = df2['vote_count']  # number of votes for each movie
R = df2['vote_average']  # each movie's own average vote
C = df2['vote_average'].mean()  # mean of vote_average over all rows of df2
m = df2['vote_count'].quantile(0.9)  # 90th-percentile vote count, used as the weighting threshold

# Blend each movie's own average R with the overall mean C: the larger V is relative to m,
# the more weight R gets; movies with few votes are pulled toward C.
df2['weighted_average'] = (V/(V+m) * R) + (m/(m+V) * C)

In [None]:
# Shape check: one more column (`weighted_average`) than before.
df2.shape

In [None]:
# Keep only movies with at least `m` votes (the 90th-percentile threshold).
# The mask comes from df2's own column, not from `movies`, so the filter does
# not depend on the two frames sharing an index. `.copy()` gives an
# independent frame for the steps that follow.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
q_movies.shape

In [None]:
# Order the qualifying movies from best to worst weighted rating and collect
# the ten leading titles as a plain list.
q_movies = q_movies.sort_values(by='weighted_average', ascending=False)
li = q_movies['title'].iloc[:10].tolist()

In [None]:
# Show the ten top-rated titles collected above.
li

In [None]:
# Same top ten, shown with their weighted ratings.
q_movies[['title','weighted_average']].head(10)

In [None]:
# Save the ranked popularity frame. The `with` block guarantees the file is
# flushed and closed, even if pickling fails part-way.
with open('popular_df.pkl', 'wb') as popular_file:
    pickle.dump(q_movies, popular_file)