In [31]:
#%pip install altair vega vega_datasets pandas scikit-learn sentence-transformers streamlit

## Visualization Project: Movie Recommendation System

#### Dataset: The Movies Dataset
This system uses The Movies Dataset (https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset). It contains metadata on over 45,000 movies, along with 26 million ratings from over 270,000 users.

#### Objective
The objective of this project is to build a movie recommendation system using the metadata of the movies dataset. The system will recommend movies with the appropriate visualizations based on the user's preferences.

In [32]:
import pandas as pd
import altair as alt
from ast import literal_eval
# Load the movie metadata table. low_memory=False makes pandas parse the file
# in one pass so column dtypes are not inferred chunk by chunk.
movies_metadata = pd.read_csv(
    './The_Movies_Dataset/movies_metadata.csv',
    low_memory=False,
)

In [33]:
# Work on an independent copy holding only the columns needed for genre analysis.
movies_metadata_genres = movies_metadata[['id', 'title', 'genres']].copy()

# The raw 'genres' cell is text: missing values become '[]', the text is parsed
# with literal_eval, and list results are reduced to the 'name' of each element
# (anything that does not parse to a list becomes an empty list).
raw_genres = movies_metadata_genres['genres'].fillna('[]')
parsed_genres = raw_genres.apply(literal_eval)
movies_metadata_genres['genres'] = parsed_genres.apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [34]:
# Count movies per genre: explode yields one row per (movie, genre) pair,
# which is then tallied and ordered from most to least frequent.
genres = (
    movies_metadata_genres
    .explode('genres')
    .groupby('genres')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Share of each genre among all counted genre tags, in percent.
genres['percentage'] = genres['count'].div(genres['count'].sum()).mul(100)

In [35]:
# Pie chart of the genre shares: one arc per genre sized by its percentage,
# with the genre name and share shown on hover.
alt.Chart(genres, title='Genres Distribution').mark_arc().encode(
    alt.Theta('percentage:Q', stack=True),
    alt.Color('genres:N'),
    tooltip=[
        alt.Tooltip('genres', title='Genre'),
        alt.Tooltip('percentage', title='Percentage', format='.2f'),
    ],
).interactive()

In [36]:
# Load the credits table. low_memory=False makes pandas parse the file in one
# pass so column dtypes are not inferred chunk by chunk.
credits = pd.read_csv(
    './The_Movies_Dataset/credits.csv',
    low_memory=False,
)

In [37]:
# Give both tables a numeric 'id' key (values that cannot be parsed become NaN),
# then join them on it with pandas' default inner merge.
for _frame in (movies_metadata, credits):
    _frame['id'] = pd.to_numeric(_frame['id'], errors='coerce')
movies_metadata_credits = pd.merge(movies_metadata, credits, on='id')

In [38]:
# Calculating the average rating for each movie is not enough. We need to calculate the weighted rating for each movie.
# https://math.stackexchange.com/questions/169032/understanding-the-imdb-weighted-rating-function-for-usage-on-my-own-website
# Weighted Rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

# v = number of votes for the movie
# m = minimum votes required to be listed in the chart
# R = average rating of the movie
# C = mean vote across the whole report

# NOTE: this cell overwrites movies_metadata_credits with the filtered frame, so
# re-running it without re-running the merge recomputes C and m on data that has
# already been filtered.
C = movies_metadata_credits['vote_average'].mean()
m = movies_metadata_credits['vote_count'].quantile(0.90)

# filter out movies that have less than m votes
# .copy() makes the result an independent frame, so adding the 'score' column
# below is not a chained assignment on a slice (SettingWithCopyWarning).
movies_metadata_credits = movies_metadata_credits[movies_metadata_credits['vote_count'] >= m].copy()

# calculate the score, using the same symbols as the formula above
v = movies_metadata_credits['vote_count']
R = movies_metadata_credits['vote_average']
movies_metadata_credits['score'] = (v / (v + m)) * R + (m / (v + m)) * C

# best-scored movies first
movies_metadata_credits = movies_metadata_credits.sort_values('score', ascending=False)

In [39]:
# Grouping by genres
genres = movies_metadata_genres.explode('genres')
genres = genres.groupby('genres').size().reset_index(name='count')
genres = genres.sort_values('count', ascending=False)
# calculate the percentage
genres['percentage'] = genres['count'] / genres['count'].sum() * 100

# Top 10 movies per genre, ranked by the weighted 'score' column of
# movies_metadata_credits. Previously this took the first ten titles of each
# genre in file order, which is not a ranking.
scored_movies = (
    movies_metadata_credits[['id', 'title', 'genres', 'score']]
    .drop_duplicates(subset='id')  # one row per movie id
    .copy()
)
# 'genres' in the merged frame is still the raw text; parse it the same way as
# movies_metadata_genres.
scored_movies['genres'] = (
    scored_movies['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)
top_movies_by_genre = (
    scored_movies
    .explode('genres')
    .dropna(subset=['genres', 'title'])
    .sort_values('score', ascending=False)
    # groupby keeps the row order inside each group, so head(10) is the ten best-scored titles
    .groupby('genres')['title']
    .apply(lambda titles: ' '.join(f"{i+1}. {title}" for i, title in enumerate(titles.astype(str).head(10))))
)
# Genres without any scored movie get an empty tooltip string instead of NaN.
genres['top_movies'] = genres['genres'].map(top_movies_by_genre).fillna('')

In [40]:
# Pie chart of the genre shares, drawn in descending count order; hovering an
# arc shows the genre, its share and its 'top_movies' text.
alt.Chart(genres, title='Genres Distribution and Top Movies').mark_arc().encode(
    alt.Theta('percentage:Q', stack=True),
    alt.Color('genres:N'),
    alt.Order('count', sort='descending'),
    tooltip=[
        alt.Tooltip('genres', title='Genre'),
        alt.Tooltip('percentage', title='Percentage', format='.2f'),
        alt.Tooltip('top_movies', title='Top Movies'),
    ],
).interactive()

In [41]:
import os
from sentence_transformers import SentenceTransformer

model_name = 'all-MiniLM-L6-v2'
model_directory = './models'
model_path = os.path.join(model_directory, model_name)

# Make sure the local model folder exists before reading from or writing to it
os.makedirs(model_directory, exist_ok=True)

if not os.path.exists(model_path):
    # No local copy yet: load the model by name and store it for later runs
    model = SentenceTransformer(model_name)
    model.save(model_path)
else:
    # Reuse the copy previously saved on disk
    model = SentenceTransformer(model_path)

In [42]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

movies_metadata_overview = movies_metadata[['id', 'title', 'overview', 'popularity', 'vote_average']].copy()

# cache the embeddings in a folder
embeddings_directory = './embeddings'
os.makedirs(embeddings_directory, exist_ok=True)
embeddings_path = os.path.join(embeddings_directory, 'movies_metadata_overview_embeddings.npy')

# Load the cached embeddings when present
embeddings = np.load(embeddings_path) if os.path.exists(embeddings_path) else None

# Row i of the embeddings is looked up by the position of row i in
# movies_metadata_overview, so a cache with a different number of rows is stale
# and must be rebuilt rather than reused.
if embeddings is None or len(embeddings) != len(movies_metadata_overview):
    # Generate the embeddings
    embeddings = model.encode(movies_metadata_overview['overview'].astype(str).tolist())
    # Save the embeddings to disk
    np.save(embeddings_path, embeddings)

In [43]:
# Function to find similar movies
def find_similar_movies(movie_title, top_n=10):
    """Return the movies whose overview embeddings are closest to a given movie.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``movies_metadata_overview['title']``. When several
        rows share the title, the first one is used as the query.
    top_n : int, default 10
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies_metadata_overview`` ordered from most to least similar
        (cosine similarity of the rows of ``embeddings``), excluding the query
        row itself.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    title_mask = (movies_metadata_overview['title'] == movie_title).to_numpy()
    if not title_mask.any():
        raise IndexError(f"No movie titled {movie_title!r} in movies_metadata_overview")
    # Use the row *position* (not the index label): embeddings is a plain array
    # addressed by position, and the results are fetched with iloc.
    idx = int(title_mask.argmax())
    query_embedding = embeddings[idx].reshape(1, -1)

    # Calculate cosine similarity between the query and all movie embeddings
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Rank every movie from most to least similar, then drop the query row
    # explicitly: with tied similarities (identical embeddings) the query is not
    # guaranteed to be ranked first, so skipping position 0 is not reliable.
    ranked_indices = similarities.argsort()[::-1]
    similar_indices = ranked_indices[ranked_indices != idx][:top_n]

    return movies_metadata_overview.iloc[similar_indices]

# pick a random movie title
# Rows without a title are skipped: a missing title cannot be matched by the
# lookup and cannot be concatenated into the chart title below.
movie_title = movies_metadata_overview['title'].dropna().sample(1).values[0]
similar_movies = find_similar_movies(movie_title)
similar_movies = similar_movies.copy()
# Coerce instead of astype(float) so a non-numeric popularity value becomes NaN
# rather than raising (presumably the column is read as text, given the cast
# the original code needed; confirm against the CSV).
similar_movies['popularity'] = pd.to_numeric(similar_movies['popularity'], errors='coerce').round(2)

# display the similar movies on a chart
alt.Chart(similar_movies).mark_point().encode(
    alt.X('popularity', title='Popularity'),
    alt.Y('vote_average', title='Vote Average'),
    alt.Color('title', title='Title'),
    tooltip=['title', 'overview']
).properties(
    title='Similar Movies to ' + movie_title,
    width=600,
    height=400,
).interactive()

In [44]:
# Rebuild the genre frame, this time keeping the release date so genres can be
# counted per year.
movies_metadata_genres = movies_metadata[['id', 'title', 'genres', 'release_date']].copy()

# convert the genres column to a list
movies_metadata_genres['genres'] = (
    movies_metadata_genres['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# Unparseable dates become NaT and those rows are dropped.
movies_metadata_genres['release_date'] = pd.to_datetime(movies_metadata_genres['release_date'], errors='coerce')
# .copy() makes the filtered result an independent frame, so adding 'year'
# below is not a chained assignment on a slice (SettingWithCopyWarning).
movies_metadata_genres = movies_metadata_genres[movies_metadata_genres['release_date'].notna()].copy()
movies_metadata_genres['year'] = movies_metadata_genres['release_date'].dt.year.astype(int)

# One row per (genre, year) with the number of movies, largest counts first.
genres = movies_metadata_genres.explode('genres')
genres = genres.groupby(['genres', 'year']).size().reset_index(name='count')
genres = genres.sort_values('count', ascending=False)

# Create a selection object for the slider
# The bounds are converted to plain Python ints so the chart spec does not
# carry numpy scalar types.
year_min = int(genres['year'].min())
year_max = int(genres['year'].max())
year_slider = alt.binding_range(min=year_min, max=year_max, step=1, name='Year:')
year_select = alt.selection_point(fields=['year'], bind=year_slider, value={'year': year_min})

# calculate the percentage
# NOTE: this is each (genre, year) count as a share of the total over all
# years; it is not used by the chart below.
genres['percentage'] = genres['count'] / genres['count'].sum() * 100
# plot the genres distribution for the year chosen with the slider
chart = alt.Chart(genres).mark_bar().encode(
    x=alt.X('genres', title='Genre', sort='-y'),
    y=alt.Y('count', title='Count'),
    color=alt.Color('genres', title='Genre'),
    # counts are whole numbers, so show them without decimals
    tooltip=[alt.Tooltip('genres', title='Genre'), alt.Tooltip('count', format='d', title='Count')]
).add_params(
    year_select
).transform_filter(
    year_select
).properties(
    title='Genres Distribution',
    width=600,
    height=400
).interactive()

chart