In [15]:
#%pip install altair vega vega_datasets

## Visualization Project: Movie Recommendation System

#### Dataset: The Movies Dataset
This system uses The Movies Dataset from Kaggle (https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset). It contains metadata on over 45,000 movies, along with 26 million ratings from over 270,000 users.

#### Objective
The objective of this project is to build a movie recommendation system using the metadata of the movies dataset. The system will recommend movies based on the user's preferences and present the recommendations with appropriate visualizations.

In [1]:
import pandas as pd
import altair as alt
from ast import literal_eval
# Path to the movie metadata table, resolved relative to the current working directory.
MOVIES_METADATA_CSV = './The_Movies_Dataset/movies_metadata.csv'

# low_memory=False: parse the whole file at once instead of in internal chunks,
# so each column's dtype is inferred from all of its values.
movies_metadata = pd.read_csv(MOVIES_METADATA_CSV, low_memory=False)

In [2]:
def _genre_names(raw_genres):
    """Turn one raw `genres` cell into a plain list of genre names.

    `raw_genres` is the string form of a Python literal; it is evaluated with
    `literal_eval`. When the result is a list, the 'name' entry of every
    element is collected. Any other literal yields an empty list.
    """
    parsed = literal_eval(raw_genres)
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]


# Keep only the columns needed for the genre analysis and replace the raw
# `genres` strings with lists of genre names. Missing cells are treated as
# the empty-list literal '[]' so they parse to an empty list.
movies_metadata_genres = (
    movies_metadata[['id', 'title', 'genres']]
    .assign(genres=lambda frame: frame['genres'].fillna('[]').apply(_genre_names))
)

movies_metadata_genres.head()

Unnamed: 0,id,title,genres
0,862,Toy Story,"[Animation, Comedy, Family]"
1,8844,Jumanji,"[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,"[Romance, Comedy]"
3,31357,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,[Comedy]


In [3]:
# Genre frequency table, built in a single pass:
#   1. explode   -> one row per (movie, genre) pair, so a movie with several
#                   genres contributes to each of them
#   2. groupby   -> number of rows per genre, stored in `count`
#   3. sort      -> most frequent genre first
#   4. assign    -> `percentage` is each genre's share of the summed counts
#                   (i.e. of all genre assignments), scaled to 0-100
genres = (
    movies_metadata_genres
    .explode('genres')
    .groupby('genres')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .assign(percentage=lambda counts: counts['count'] / counts['count'].sum() * 100)
)

In [4]:
# Pie chart of the genre table: one arc per genre, with the arc angle taken
# from `percentage` and the fill colour from the genre name. Hovering an arc
# shows the genre and its percentage rounded to two decimals.
(
    alt.Chart(genres)
    .properties(title='Genres Distribution')
    .mark_arc()
    .encode(
        theta=alt.Theta('percentage:Q', stack=True),
        color=alt.Color('genres:N'),
        tooltip=[
            alt.Tooltip('genres', title='Genre'),
            alt.Tooltip('percentage', title='Percentage', format='.2f'),
        ],
    )
)