In [1]:
# Code explanation in the README markdown.

In [1]:
# import block
import pandas as pd
import numpy as np

In [2]:
# Path is relative to the notebook's working directory. Forward slashes are used because
# they work on Windows, macOS and Linux, whereas a backslash is a separator only on Windows.
metadata = pd.read_csv('Dataset/movies_metadata.csv', low_memory = False)

In [None]:
metadata.head(3)

In [None]:
metadata.columns

# Simple Recommenders

In [None]:
# Calculating C = mean of 'vote_average' across all the movies in metadata
C = metadata['vote_average'].mean()
print(C)

In [None]:
# Calculate the min no. of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m) # this is your threshold hyperparameter

Now filter out any movie that has fewer than m votes, i.e. keep only the movies with `vote_count >= m`.

In [None]:
# Keep only the movies that received at least m votes.
# Filtering first and copying the result still yields an independent frame (columns added
# to q_movies later do not touch `metadata`) without duplicating rows that are discarded anyway.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape, metadata.shape # size after filtering vs. size of the original dataset

In [8]:
# To calculate weighted average rating
def weighted_rating(x, m = m, C = C):
    """Blend a movie's own rating with the global mean, weighted by its vote count.

    x : mapping with 'vote_count' and 'vote_average' entries (e.g. one DataFrame row).
    m : vote-count threshold; defaults to the module-level `m` at definition time.
    C : mean rating; defaults to the module-level `C` at definition time.

    Returns (v / (v + m)) * R + (m / (v + m)) * C, where v is the vote count
    and R the movie's average vote.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m

    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part


In [9]:
q_movies['score'] = q_movies.apply(weighted_rating, axis = 1) # axis = 1 is columns axis, AKA each row.

In [None]:
#Sort movies based on score calculated above in descending order
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(15)
#------------------------- Simple Recommenders Complete! -----------------------------------

# Content Based Recommender

In [None]:
metadata[['title', 'overview']].head()

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

# TF-IDF vectorizer configured to ignore English stop words.
tfidf = TfidfVectorizer(stop_words = 'english')

# Replace NaN with an empty string so every overview is a string the vectorizer can process
metadata['overview'] = metadata['overview'].fillna('')

# Construct the TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
# The result holds one row per movie overview and one column per vocabulary term.

In [6]:
# replacing the NaN values in the 'overview' column with empty string
metadata['overview'] = metadata['overview'].fillna('')

In [None]:
tfidf_matrix.shape # 45466 movies, 75827 words

In [None]:
# Array mapping from feature integer indices to feature name
tfidf.get_feature_names_out()[5000:5010]

In [15]:
# Compute the similarity score.
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

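`linear_kernel(X, Y)` computes the dot products `X @ Y.T`. Because `TfidfVectorizer` L2-normalises every row by default, these dot products are exactly the cosine similarities between the movie overviews.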

In [None]:
cosine_sim.shape

In [None]:
cosine_sim[1]

In [8]:
# Construct a reverse map from movie title to row index.
# Several movies can share a title; keep only the first occurrence of each title so that
# indices[title] always yields a single integer. Series.drop_duplicates() is not suitable
# here: it compares the values (the row indices, which are already unique), not the titles.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
indices[:10]

In [9]:
def get_recommendations(title, cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to `title`.

    Relies on two module-level objects: `indices` (title -> row position) and
    `metadata` (the DataFrame whose 'title' column is returned).

    Parameters
    ----------
    title : str
        Title to look up in `indices`.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row i is read as the scores of movie i
        against every movie, in the same order as the rows of `metadata`.

    Returns
    -------
    pandas.Series
        Entries of metadata['title'] for the most similar movies, best first.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # get the index of the movie that matches the title
    idx = indices[title]
    # A repeated title makes the lookup return a Series of positions; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of all the movies with that movie. The movie itself is
    # removed by position rather than by assuming it sorts first: with tied scores (or an
    # all-zero row) a different movie could otherwise be dropped in its place.
    sim_scores = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]

    # sort based on the scores, highest first (stable, so ties keep row order)
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # keep the 10 most similar movies
    sim_scores = sim_scores[:10]

    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [None]:
# get_recommendations has no default similarity matrix, so pass the TF-IDF based one explicitly.
get_recommendations('The Dark Knight Rises', cosine_sim)

## Credits, Genres and Keywords Based Recommender

* Use richer metadata to capture more of the finer details of each movie.
* Features being used: Top 3 Actors, directors, related genres, and the movie plot keywords.

Keyword, cast and crew data are not available in your current dataset, so the first step is to load them and merge them into your main DataFrame, `metadata`.

In [10]:
# Load keywords and credits from the Dataset folder, relative to the working directory
# (the same folder movies_metadata.csv is read from), instead of a machine-specific
# absolute path that only exists on one computer.
credits = pd.read_csv('Dataset/credits.csv')
keywords = pd.read_csv('Dataset/keywords.csv')

In [11]:
# Remove rows with bad IDs and convert IDs to int. Required for merging.
# Bad rows are detected by value (an 'id' that cannot be parsed as a number) rather than
# by hard-coded row labels, which presumably pointed at those same rows. Fixed labels are
# unsafe on a re-run: once the merges have renumbered the rows, dropping them again
# would remove valid movies.
metadata = (
    metadata
    .assign(id=pd.to_numeric(metadata['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': 'int'})
)

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [12]:
metadata = metadata.merge(credits, on = 'id')
metadata = metadata.merge(keywords, on = 'id')

In [None]:
metadata.head(2)
metadata['cast'][0]

In [14]:
# parse the stringified features into their corresponding python object
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [None]:
metadata['cast'][0]

In [16]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    x : iterable of dicts, each with 'job' and 'name' keys (one per crew member).

    Returns the matching member's 'name', or np.nan when no member is a
    director (including when `x` is empty).
    """
    for member in x:
        if member['job'] == 'Director':
            return member['name']

    # Reached only after every crew member has been checked. Returning from inside the
    # loop would give up after the first member and miss directors listed later.
    return np.nan

In [17]:
def get_list(x):
    """Return the 'name' of at most the first three entries of `x`.

    x : expected to be a list of dicts that each carry a 'name' key.
    Anything that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep the leading three.
    every_name = [entry['name'] for entry in x]
    return every_name[:3]

In [18]:
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [None]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

* Convert the text to lowercase and remove all spaces.

Removing the spaces between words is an important preprocessing step. It is done so that your vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same. After this processing step, the aforementioned actors will be represented as "johnnydepp" and "johnnygalecki" and will be distinct to your vectorizer.

In [20]:
def clean_data(x):
    """Lower-case `x` and remove its spaces.

    A list is cleaned element by element, a string is cleaned directly,
    and any other value is replaced by an empty string.
    """
    if isinstance(x, list):
        return [item.replace(' ', '').lower() for item in x]

    if isinstance(x, str):
        return x.replace(' ', '').lower()

    return ''

In [21]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

### Metadata Soup
This is a string that contains all the metadata you want to feed to your vectorizer (keywords, actors, director, genres).

This create_soup function will join all the required columns by a space. This will be fed to the vector model.

In [22]:
def create_soup(x):
    """Build one space-separated string from a row's keywords, cast, director and genres.

    x : mapping where 'keywords', 'cast' and 'genres' are lists of strings
        and 'director' is a string.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)

In [23]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [None]:
metadata[['soup']].head(2)

In [25]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(metadata['soup'])

In [None]:
count_matrix.shape

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Convert count_matrix to sparse format (if not already sparse)
count_matrix_sparse = csr_matrix(count_matrix, dtype = np.float32)

In [None]:
count_matrix_sparse, type(count_matrix_sparse)

In [29]:
# Compute cosine similarity using sparse matrix
cosine_sim2_new = cosine_similarity(count_matrix_sparse, count_matrix_sparse, dense_output = True)

In [31]:
# Reset index of main DataFrame and construct the reverse mapping (title -> row position)
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index = metadata['title'])
# Keep one entry per title (the first): a repeated title would otherwise make
# indices[title] return a Series of positions instead of a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
get_recommendations('The Dark Knight Rises', cosine_sim2_new)

In [None]:
get_recommendations('The Godfather', cosine_sim2_new)