In [1]:
# import block
import pandas as pd
import numpy as np

In [2]:
metadata = pd.read_csv(r'Dataset\movies_metadata.csv', low_memory = False)

In [3]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

# Simple Recommenders

### Most Basic metric to decide top 250: RATINGS; Why NOT to use this?
* Doesn't take into consideration the movie popularity. Hence, a movie rated 9 by 10 people would be ranked higher than the movie rated 8.5 by 1000 people.
* This could also mean tho, that the movie just came out.

To tackle this, we need to come up with a weighted rating that considers the average rating and the number of votes it accumulated.

Weighted Rating (WR) = (v/(v+m)*R + m/(v+m)*C)

* v = total number of votes for movie
* m = min votes required to be listed in the chart
* R = average rating of movie
* C = mean vote across the whole report

### How the formula works?

The formula combines the movie’s own average rating (R) with the overall average rating across all movies (C). But it doesn’t treat all movies equally; instead, it weights the rating based on how many people have rated the movie.

* (v/(v+m)) * R:
This part of the formula gives **weight to the movie’s average rating (R)**. The more votes (v) a movie has, the more weight its own rating has in the final score.

* (m/(v+m)) * C:
This part of the formula adds in the average rating of all movies (C). For **movies with fewer votes, this overall average has more influence. It helps to balance the score**, especially if there are too few votes to trust the movie's average rating alone.

**m** is a hyperparameter. You can choose accordingly as there is no correct value for m. It is subjective. This will remove the movies that have votes lesser than a certain threshold and not be taken into consideration.

This is in the form of a percentile. Considering 'x'th percentile would mean that the cirteria for the movie to be considered is to have more votes than x% of the movies in the overall list.

In [5]:
# Calulating C = mean rating across all the movies
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [6]:
# Calculate the min no. of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m) # this is your threshold hyperparameter

160.0


Now filter out any movie that has number of votes <= m.

In [7]:
# creating a copy of the dataset
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape, metadata.shape

((4555, 24), (45466, 24))

In [8]:
# To calculate weighted average rating
def weighted_rating(x, m = m, C = C):
    v = x['vote_count']
    R = x['vote_average']

    return (v/(v+m) * R) + (m/(m+v) * C)


In [9]:
q_movies['score'] = q_movies.apply(weighted_rating, axis = 1)

In [10]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)
#------------------------- Simple Recommenders Complete! -----------------------------------

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content Based Recommender

In [11]:
metadata[['title', 'overview']].head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


We will be using pairwise cosine similarity scores for all the movies based on their plot descriptions and recommend movies based on the similarity score threshold.

In simple terms, we're going to generate scores of how similar some movies are to each other. If they are similar beyond a certain score threshold, the movies will be recommended along with movie in case.

This is a **Natural Language Processing (NLP)** problem. Hence, there needs to be feature extraction, that I assume would be word embeddings for the words in the summary or plor description. Yup, we need to get word vectors of each overview or document.

**word vectors are vectorized representation of words in a document. The vectors carry a semantic meaning with it.** For example, man & king will have vector representations close to each other while man & woman would have representation far from each other.

Compute the **Term Frequency-Inverse Document Frequency**

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

# TF-IDF Vecotrizer object and remove the stop words.
tfidf = TfidfVectorizer(stop_words = 'english')

# Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

# Construct the TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
# This does the tf-idf value for each word in the 45k+ movies.

In [13]:
tfidf_matrix.shape # 45466 movies, 75827 words

(45466, 75827)

In [14]:
# Array mapping from feature integer indices to feature name
tfidf.get_feature_names_out()[5000:5010]

array(['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon',
       'avant', 'avanthika', 'avanti', 'avaracious'], dtype=object)

In [15]:
# Compute the similarity score.
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

This linear kernel helps compare 2 matrices, X @ Y.T

In [16]:
cosine_sim.shape

(45466, 45466)

In [17]:
cosine_sim[1]

array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411])

In [18]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [19]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

### Recommendation Function
* Get the index of the movie given its title.
* Get the list of cosine similarity scores for that movie, with the rest of the movies.
* Convert into a list of tuples where the first element is its position, second is the similarity score. So **[(position, similarity score)]**.
* Sort the mentioned list of tuples based on similarity scores (second element).
* Get the top 10 elements of the list. Ignore the first element as it refers to self.
* Return the titles.

In [20]:
def get_recommendations(title, cosine_sim = cosine_sim):
    # get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all the movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort based on the scores
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # get the top 10 most similar scores
    sim_scores = sim_scores[1:11]

    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # return the top 10 mist similar movies
    return metadata['title'].iloc[movie_indices]

In [21]:
get_recommendations('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

## Credits, Genres and Keywords Based Recommender

* Usage of better metadata and by capturing more of the finer details.
* Features being used: Top 3 Actors, directors, related genres, and the movie plot keywords.

Keyword, cast and crew data are not available in your current dattaset, so the first step would be to load and merge them into your main DataFrame metadata.

In [22]:
# Load keywords and credits
credits = pd.read_csv(r'F:\Projects\Recommendation Systems\Dataset\credits.csv')
keywords = pd.read_csv(r'F:\Projects\Recommendation Systems\Dataset\keywords.csv')

In [23]:
# remove rows with bad IDs
metadata = metadata.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [24]:
metadata = metadata.merge(credits, on = 'id')
metadata = metadata.merge(keywords, on = 'id')

In [25]:
metadata.head(2)
metadata['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [26]:
# parse the stringified features into their corresponding python object
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [27]:
metadata['cast'][0]

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'},
 {'cast_id': 16,
  'character': 'Mr. Potato Head (voice)',
  'credit_id': '52fe4284c3a36847f8024f9d',
  'gender': 2,
  'id': 7167,
  'name': 'Don Rickles',
  'order': 2,
  'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'},
 {'cast_id': 17,
  'character': 'Slinky Dog (voice)',
  'credit_id': '52fe4284c3a36847f8024fa1',
  'gender': 2,
  'id': 12899,
  'name': 'Jim Varney',
  'order': 3,
  'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'},
 {'cast_id': 18,
  'character': 'Rex (voice)',
  'credit_id': '52fe4284c3a36847f8024fa5',
  'gender': 2,
  'id': 12900,
 

In [28]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
        
        return np.nan

In [29]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]

        # check if more than 3 elements exist. If yes, return only first three
        if len(names) > 3:
            names = names[:3]
        return names
    
    return []

In [30]:
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [31]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


* Convert the text into lowercase and strip all whitespaces.

Removing the spaces between words is an important preprocessing step. It is done so that your vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same. After this processing step, the aforementioned actors will be represented as "johnnydepp" and "johnnygalecki" and will be distinct to your vectorizer.

In [32]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
    
        else:
            return ''

In [33]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

### Metadata Soup
This is a string that contains all the metadata you want to feed your vecotrizer (actors, director, keywords).

This create_soup function will join all the required columns by a space. This will be fed to the vector model.

In [34]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [35]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46628 entries, 0 to 46627
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  46628 non-null  object 
 1   belongs_to_collection  4574 non-null   object 
 2   budget                 46628 non-null  object 
 3   genres                 46628 non-null  object 
 4   homepage               8009 non-null   object 
 5   id                     46628 non-null  int64  
 6   imdb_id                46611 non-null  object 
 7   original_language      46617 non-null  object 
 8   original_title         46628 non-null  object 
 9   overview               46628 non-null  object 
 10  popularity             46624 non-null  object 
 11  poster_path            46229 non-null  object 
 12  production_companies   46624 non-null  object 
 13  production_countries   46624 non-null  object 
 14  release_date           46540 non-null  object 
 15  re

In [36]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [37]:
metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [38]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words = 'english')
count_matrix_new = count.fit_transform(metadata['soup'])

In [39]:
count_matrix_new.shape

(46628, 69735)

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Convert count_matrix to sparse format (if not already sparse)
count_matrix_sparse = csr_matrix(count_matrix_new)

# Compute cosine similarity using sparse matrix
cosine_sim2_new = cosine_similarity(count_matrix_sparse, count_matrix_sparse, dense_output=False)


In [41]:
cosine_sim2_new.shape

(46628, 46628)