In [38]:
# importing necessary libraries
import pandas as pd
import  numpy as np
import ast
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
# reading the data sets
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# merging data frames
movies = movies.merge(credit, on = 'title')

In [4]:
#dropping un-necessary columns

un_necessary_columns = ['budget', 'id', 'homepage', 'original_language', 'original_title', 'popularity', 'production_companies', 'production_countries',
                       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'vote_average',
                       'vote_count']
movies.drop(columns = un_necessary_columns, inplace= True)

In [5]:
#checking for null values
#movies.isnull().sum()

In [6]:
#dropping null values
movies.dropna(inplace= True)

In [7]:
#checking duplicate value
#movies.duplicated().sum()

In [8]:
# extracting tags from genre
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)

In [9]:
#converting keywords to tags
movies['keywords'] = movies['keywords'].apply(convert)

In [10]:
# converting cast to tags
def convert_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter < 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L
movies['cast'] = movies['cast'].apply(convert_cast)

In [11]:
# to extract director from crew
def extract_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L
movies['crew'] = movies['crew'].apply(extract_director)

In [12]:
# converting overview to list
movies['overview'] = movies['overview'].str.split(' ')

In [13]:
# removing space rom tags so that each name have a unique tag
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ','') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ','') for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ','') for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(' ','') for i in x])

In [14]:
# combing all tags under the new columns "tags"
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['cast'] + movies['crew']

In [15]:
# re arranging the columns and discarding rest
new_df = movies[['movie_id', 'title', 'tags']]

In [16]:
#converting list to string using a space
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [17]:
#converting str to lower
new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())

In [42]:
# now we want to stem i.e. loved, loving, love all these will be converted to love, love, love (i.e. root words)
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return ' '.join(y)

new_df['tags'] = new_df['tags'].apply(stem)

In [43]:
cv = CountVectorizer(max_features= 5000, stop_words='english') #  we are defining max words aand removing stop words of english

In [44]:
vectors = cv.fit_transform(new_df['tags']).toarray()   # we are giving tags from df and converting scipi matrix to an array

In [45]:
# to check the words in cv
#cv.get_feature_names_out()

In [47]:
# now we have to calculate cosine distance between vectors.
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
similarity = cosine_similarity(vectors)

In [57]:
sorted(enumerate(similarity[0]), reverse= True, key= lambda x: x[1])[1:6]

[(1216, 0.29061909685954823),
 (3730, 0.26401000024165),
 (507, 0.25903973506580724),
 (539, 0.2537477434955704),
 (2409, 0.2507061052819501)]

In [60]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(enumerate(distances), reverse= True, key= lambda x: x[1])[1:6]
    for i in movie_list:
        print (new_df.iloc[i[0]]['title'])

In [62]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf
