Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import ast

Importing dataset

In [None]:
# Dataset imported from https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv') 

In [None]:
# movies.head()
# credits.head(2)['crew'].values

Merging the datasets

In [None]:
movies = movies.merge(credits,on = 'title')
movies.shape

(4809, 23)

In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

Removing columns that are not necessary

In [None]:
# Columns necessary: Genre, Homepage, Keywords,Title, Overview, Cast, Crew, 
movies = movies[['movie_id','genres','keywords', 'title', 'overview','cast', 'crew']]

Pre-processing

In [None]:
# new dataframe would only contain movie_id, title and tags (containing overview + tags + top 3 cast and director in crew)
# movies.isnull().sum()
movies.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [None]:
movies.isnull().sum()

movie_id    0
genres      0
keywords    0
title       0
overview    0
cast        0
crew        0
dtype: int64

In [None]:
movies.duplicated().sum()

0

In [None]:
# To just fetch the name of the json objects (for genres and keywords)
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

In [None]:
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
# to draw out top 3 casts
def convert_cast(obj):
  L = []
  c = 0
  for i in ast.literal_eval(obj):
    if(c!=3):
     L.append(i['name'])
     c+=1
    else:
      break
  return L

In [None]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# to draw out only directors name
def convert_crew(obj):
  L = []
  for i in ast.literal_eval(obj):
    if(i['job']=='Director'):
      L.append(i['name'])
      break
  return L

In [None]:
movies['crew'] = movies['crew'].apply(convert_crew)

In [None]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [None]:
# now we need to remove spaces between words in cast, crew, keywords and genres. For eg: Daniel Craig - is same entity but NLP will consider them as different words.
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['cast'] + movies['crew']

In [None]:
new_df = movies[['movie_id', 'title', 'tags']]

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Vectorisation
# Need to convert tags of each movie a vector (text -> vector : text vectorisation) --> Bag of words
# These vectors wont contain stopwords
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# Apply stemming to fetch the root form of words
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# At higher dimensions, eucledian distance is not a good way to measure distances
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors)

In [None]:
sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x: x[1])[1:6]

[(2999, 0.25854384499750954),
 (4405, 0.24618298195866548),
 (61, 0.23262105259961768),
 (1444, 0.2279211529192759),
 (942, 0.22498852128662875)]

In [None]:
def recommend(movie):
  movie_index = new_df[new_df['title'] == movie].index[0]
  distances = similarity[movie_index]
  movies_list = sorted(list(enumerate(distances)), reverse=True, key = lambda x: x[1])[1:8]

  for i in movies_list:
    print(new_df.iloc[i[0]].title)

In [None]:
recommend('The Matrix')

The Matrix Revolutions
Hackers
The Helix... Loaded
Red Planet
Transcendence
Jupiter Ascending
U.F.O.


In [None]:
# new_df.iloc[4405].title