## Visualization Project: Movie Recommendation System

### Overview
This project is part of MSDS course work (Course: DTSA 5304: Fundamentals of Data Visualization). The course uses the Altair library to explain visualization concepts.

### Project Description
The project uses the movies dataset to analyze multiple aspects of the dataset.

### Dataset: The Movies Dataset
This system uses The Movies Dataset (https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset). It contains metadata on over 45,000 movies and 26 million ratings from over 270,000 users.

### Required Libraries
First we install the required libraries. This only needs to be done **once**.

In [203]:
#%pip install altair vega vega_datasets pandas scikit-learn sentence-transformers streamlit

### Visualization #1: Genres Distribution
The first visualization tries to analyze the distribution of genres in the movies dataset.

In [204]:
import pandas as pd
import altair as alt
from ast import literal_eval

# Read the full metadata table. low_memory=False parses the file in one pass
# instead of in chunks, which avoids mixed-dtype inference within a column.
movies_metadata = pd.read_csv('./The_Movies_Dataset/movies_metadata.csv', low_memory=False)

# Independent copy holding only the columns needed for the genre analysis.
genre_columns = ['id', 'title', 'genres']
movies_metadata_genres = movies_metadata.loc[:, genre_columns].copy()
movies_metadata_genres.head()

Unnamed: 0,id,title,genres
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [205]:
# Each raw `genres` value is the string form of a list of dicts, e.g.
# "[{'id': 16, 'name': 'Animation'}, ...]". Reduce it to a list of genre names.
def _genre_names(raw_value):
    """Parse one raw `genres` string and return the list of genre names."""
    parsed = literal_eval(raw_value)
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]


# Missing values are replaced by '[]' so they parse to an empty list.
raw_genres = movies_metadata_genres['genres'].fillna('[]')
movies_metadata_genres['genres'] = raw_genres.apply(_genre_names)
movies_metadata_genres.head()

Unnamed: 0,id,title,genres
0,862,Toy Story,"[Animation, Comedy, Family]"
1,8844,Jumanji,"[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,"[Romance, Comedy]"
3,31357,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,[Comedy]


In [206]:
# One row per (movie, genre) pair, then count the rows for each genre and
# order the genres from most to least frequent.
genre_counts = (
    movies_metadata_genres
    .explode('genres')
    .groupby('genres')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
# Share of each genre among all (movie, genre) pairs, in percent.
total_genre_tags = genre_counts['count'].sum()
genre_counts['percentage'] = genre_counts['count'] / total_genre_tags * 100
movies_metadata_genres_exploded = genre_counts
movies_metadata_genres_exploded.head()

Unnamed: 0,genres,count,percentage
9,Drama,20265,22.243321
6,Comedy,13182,14.46886
28,Thriller,7624,8.368274
22,Romance,6735,7.392488
0,Action,6596,7.239918


In [207]:
# Pie chart: one slice per genre, sized by that genre's percentage share.
genre_share = alt.Theta('percentage:Q', stack=True)
genre_colour = alt.Color('genres:N')
genre_tooltip = [
    alt.Tooltip('genres', title='Genre'),
    alt.Tooltip('percentage', format='.2f', title='Percentage'),
]

genres_pie = (
    alt.Chart(movies_metadata_genres_exploded)
    .mark_arc()
    .encode(theta=genre_share, color=genre_colour, tooltip=genre_tooltip)
    .properties(title='Genres Distribution')
    .interactive()
)
# Leave the chart as the cell's last expression so the notebook renders it.
genres_pie

### Visualization #2: Genres Distribution and Top Movies

In [208]:
credits = pd.read_csv('./The_Movies_Dataset/credits.csv', low_memory=False)
# Coerce both join keys to numbers so the merge compares values of the same
# type; ids that cannot be parsed become NaN.
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
movies_metadata_credits = movies_metadata.merge(credits, on='id')
# Calculating the average rating for each movie is not enough. We need to calculate the weighted rating for each movie.
# https://math.stackexchange.com/questions/169032/understanding-the-imdb-weighted-rating-function-for-usage-on-my-own-website
# Weighted Rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

# v = number of votes for the movie
# m = minimum votes required to be listed in the chart
# R = average rating of the movie
# C = mean vote across the whole report

# C and m are computed on the full merged table, before any filtering.
C = movies_metadata_credits['vote_average'].mean()
m = movies_metadata_credits['vote_count'].quantile(0.90)

# Filter out movies that have fewer than m votes. The explicit .copy() makes
# the result an independent frame, so adding the 'score' column below does not
# write into a slice of the merged table (SettingWithCopyWarning).
has_enough_votes = movies_metadata_credits['vote_count'] >= m
movies_metadata_credits = movies_metadata_credits[has_enough_votes].copy()

# calculate the score
votes = movies_metadata_credits['vote_count']
movies_metadata_credits['score'] = (
    (votes / (votes + m)) * movies_metadata_credits['vote_average'] +
    (m / (votes + m)) * C
)

In [209]:
# Turn the raw `genres` strings (string form of a list of dicts) into lists of
# genre names. `assign` builds a new frame instead of writing a column into
# the filtered frame produced by the previous cell.
movies_metadata_credits = movies_metadata_credits.assign(
    genres=(
        movies_metadata_credits['genres']
        .fillna('[]')
        .apply(literal_eval)
        .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    )
)

# One row per (movie, genre) pair. Every per-genre statistic below is derived
# from this single exploded frame rather than rescanning all movies once per
# genre.
movies_by_genre = movies_metadata_credits.explode('genres')
grouped_by_genre = movies_by_genre.groupby('genres')

genres = grouped_by_genre.size().reset_index(name='count')
# Mean weighted score of the movies tagged with each genre.
genres['score'] = genres['genres'].map(grouped_by_genre['score'].mean())
genres['percentage'] = genres['count'] / genres['count'].sum() * 100

# Ten best-scored titles per genre as one numbered string ("1. A 2. B ...").
# groupby keeps the row order inside each group, so sorting once up front is
# enough.
top_titles_by_genre = (
    movies_by_genre
    .sort_values('score', ascending=False)
    .groupby('genres')['title']
    .apply(
        lambda titles: ' '.join(
            f"{i + 1}. {title}" for i, title in enumerate(titles.head(10))
        )
    )
)
genres['top_movies'] = genres['genres'].map(top_titles_by_genre)

In [210]:
# Pie chart of genre shares; hovering a slice also shows the genre's
# top-scored titles. Slices are ordered by descending count.
slice_tooltip = [
    alt.Tooltip('genres', title='Genre'),
    alt.Tooltip('percentage', format='.2f', title='Percentage'),
    alt.Tooltip('top_movies', title='Top Movies'),
]

top_movies_pie = (
    alt.Chart(genres)
    .mark_arc()
    .encode(
        theta=alt.Theta('percentage:Q', stack=True),
        color=alt.Color('genres:N'),
        tooltip=slice_tooltip,
        order=alt.Order('count', sort='descending'),
    )
    .properties(title='Genres Distribution and Top Movies')
    .interactive()
)
# Leave the chart as the cell's last expression so the notebook renders it.
top_movies_pie

### Visualization #3: Recommendation System

In [211]:
import os
from sentence_transformers import SentenceTransformer

model_name = 'all-MiniLM-L6-v2'
model_directory = './models'
model_path = os.path.join(model_directory, model_name)

# Make sure the local model folder exists before reading from or saving to it.
os.makedirs(model_directory, exist_ok=True)

# Use the copy on disk when there is one; otherwise resolve the model by name
# and store it under model_path for the next run.
model_is_cached = os.path.exists(model_path)
model = SentenceTransformer(model_path if model_is_cached else model_name)
if not model_is_cached:
    model.save(model_path)

In [212]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

overview_columns = ['id', 'title', 'overview', 'popularity', 'vote_average']
movies_metadata_overview = movies_metadata[overview_columns].copy()

# Embeddings are cached on disk so the encoder only runs when no cache exists.
embeddings_directory = './embeddings'
os.makedirs(embeddings_directory, exist_ok=True)
embeddings_path = os.path.join(embeddings_directory, 'movies_metadata_overview_embeddings.npy')

if not os.path.exists(embeddings_path):
    # Encode every overview in row order (values are cast to str first),
    # then persist the result.
    overview_texts = movies_metadata_overview['overview'].astype(str).tolist()
    embeddings = model.encode(overview_texts)
    np.save(embeddings_path, embeddings)
else:
    embeddings = np.load(embeddings_path)

In [213]:
# Function to find similar movies
def find_similar_movies(movie_title, top_n=10):
    """Return the ``top_n`` movies whose overview embeddings are closest to a title.

    The query is the first row of ``movies_metadata_overview`` whose ``title``
    equals ``movie_title``. Candidates are ranked by cosine similarity between
    that row's embedding and every row of ``embeddings``; the query row itself
    is always left out of the result.

    Args:
        movie_title: Exact title of the query movie.
        top_n: Maximum number of similar movies to return.

    Returns:
        A DataFrame slice of ``movies_metadata_overview``, most similar first.

    Raises:
        IndexError: If no movie has exactly this title.
    """
    # Work with the row *position*, because `embeddings` is a plain array
    # indexed by position, not by DataFrame label.
    # NOTE(review): assumes row i of `embeddings` belongs to row i of
    # `movies_metadata_overview` - confirm if the embeddings cache can be stale.
    title_matches = np.flatnonzero(
        (movies_metadata_overview['title'] == movie_title).to_numpy()
    )
    if title_matches.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} was found")
    query_position = title_matches[0]
    query_embedding = embeddings[query_position].reshape(1, -1)

    # Calculate cosine similarity between the query and all movie embeddings
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Rank all rows from most to least similar, then drop the query row
    # explicitly. Skipping "the first hit" is not reliable: rows with an
    # identical overview tie with the query and may be ranked ahead of it.
    ranked_positions = similarities.argsort()[::-1]
    similar_positions = ranked_positions[ranked_positions != query_position][:top_n]

    return movies_metadata_overview.iloc[similar_positions]

# Pick 5 query movies at random from the 100 best-scored movies. The frame is
# sorted by weighted score first: head(100) on its own only returns the first
# 100 rows in their existing order, not the top 100.
top_scored_movies = movies_metadata_credits.sort_values('score', ascending=False).head(100)
filtered_movies = top_scored_movies.sample(5)

# Collect the neighbours of every query movie, tagging each row with the query
# it belongs to so the dropdown can filter on it.
similar_movies = pd.concat(
    [
        find_similar_movies(movie_title).assign(query_title=movie_title)
        for movie_title in filtered_movies['title']
    ]
)
# Coerce rather than cast, so a non-numeric popularity value becomes NaN
# instead of aborting the cell.
similar_movies['popularity'] = pd.to_numeric(similar_movies['popularity'], errors='coerce').round(2)

dropdown_symbol = alt.binding_select(options=list(similar_movies["query_title"].unique()), name=" ")
# The initial selection is given as a field -> value mapping, the same form
# the decade slider uses further down in this notebook.
selection_symbol = alt.selection_point(
    fields=["query_title"],
    bind=dropdown_symbol,
    value={'query_title': filtered_movies['title'].values[0]},
)

# display the similar movies on a chart
alt.Chart(similar_movies).mark_point().encode(
    alt.X('popularity', title='Popularity'),
    alt.Y('vote_average', title='Vote Average'),
    alt.Color('title', title='Title'),
    tooltip=['title', 'overview']
).add_params(
    selection_symbol
).transform_filter(
    selection_symbol
).properties(
    title='Similar Movies to Selected Movie',
    width=600,
    height=400,
).interactive()

### Visualization #4: Genres Distribution by Decade

In [214]:
movies_metadata_genres = movies_metadata[['id', 'title', 'genres', 'release_date']].copy()

# convert the genres column to a list
movies_metadata_genres['genres'] = (
    movies_metadata_genres['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# Unparseable release dates become NaT and are dropped. The explicit .copy()
# makes the filtered result an independent frame, so adding the 'year' column
# below does not write into a slice (SettingWithCopyWarning).
movies_metadata_genres['release_date'] = pd.to_datetime(movies_metadata_genres['release_date'], errors='coerce')
movies_metadata_genres = movies_metadata_genres[movies_metadata_genres['release_date'].notna()].copy()
# convert year to decade (e.g. 1994 -> 1990); the column keeps the name 'year'
movies_metadata_genres['year'] = (movies_metadata_genres['release_date'].dt.year // 10 * 10).astype(int)

genres = movies_metadata_genres.explode('genres')
genres = genres.groupby(['genres', 'year']).size().reset_index(name='count')
genres = genres.sort_values('count', ascending=False)

# Create a selection object for the slider. Only decades between 1900 and 2010
# can be selected; rows outside that range are never shown.
year_slider = alt.binding_range(min=1900, max=2010, step=10, name='Year:')
year_select = alt.selection_point(fields=['year'], bind=year_slider, value={'year': 1900})

# calculate the percentage (relative to all genre/decade pairs together)
genres['percentage'] = genres['count'] / genres['count'].sum() * 100
# plot the genres distribution
chart = alt.Chart(genres).mark_bar().encode(
    x=alt.X('genres', title='Genre', sort='-y'),
    y=alt.Y('count', title='Count'),
    color=alt.Color('genres', title='Genre'),
    # 'count' is a whole number, so it is formatted as an integer.
    tooltip=[alt.Tooltip('genres', title='Genre'), alt.Tooltip('count', format='d', title='Count')]
).add_params(
    year_select
).transform_filter(
    year_select
).properties(
    title='Genres Distribution',
    width=600,
    height=400,
).interactive()

chart

### Visualization #5: Countries Distribution

In [215]:
import time
import json
from ast import literal_eval
import requests

# `production_countries` holds the string form of a list of dicts; reduce each
# value to the list of country names it mentions.
def _country_names(raw_value):
    """Parse one raw `production_countries` string into a list of country names."""
    parsed = literal_eval(raw_value)
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]


country_columns = ['id', 'title', 'production_countries']
movies_metadata_countries = movies_metadata[country_columns].copy()

# Missing values are replaced by '[]' so they parse to an empty list.
raw_countries = movies_metadata_countries['production_countries'].fillna('[]')
movies_metadata_countries['production_countries'] = raw_countries.apply(_country_names)

# One row per (movie, country) pair, counted per country, most frequent first.
movies_metadata_countries = (
    movies_metadata_countries
    .explode('production_countries')
    .groupby('production_countries')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

def fetch_lat_long_data(country):
    """Return the geocoding search results for ``country``, cached on disk.

    Results are stored as JSON under ``./data/<country>.json`` (lower-cased,
    spaces replaced by underscores). When that file exists it is returned
    without any network access. Otherwise the geocode.maps.co search endpoint
    is queried, the response is cached, and the function sleeps for 5 seconds
    before returning so that consecutive lookups are spaced out.

    Args:
        country: Country name to look up.

    Returns:
        The decoded JSON response, or ``None`` when the service does not
        answer with HTTP 200.
    """
    data_directory = './data'
    country_filename = country.replace(' ', '_').lower() + '.json'
    cache_path = os.path.join(data_directory, country_filename)

    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    # NOTE(review): an API key is committed in this notebook. It should be
    # rotated and supplied only through the GEOCODE_API_KEY environment
    # variable; the literal is kept as a fallback so existing runs still work.
    api_key = os.environ.get('GEOCODE_API_KEY', '6668f5eb3d464108527128ndi77d74c')

    print('Fetching data for:', country)
    # `params` lets requests URL-encode the country name (not only spaces),
    # and the timeout keeps one stalled request from hanging the whole run.
    response = requests.get(
        'https://geocode.maps.co/search',
        params={'q': country, 'api_key': api_key},
        timeout=30,
    )
    if response.status_code != 200:
        return None

    data = response.json()
    # Create the cache folder on first use; writing into a missing folder
    # would raise FileNotFoundError.
    os.makedirs(data_directory, exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        json.dump(data, f)
    print('Waiting for 5 seconds...')
    time.sleep(5)
    return data

def get_lat_long(country):
    """Return the first geocoding hit for ``country`` as a "lat,lon" string.

    Returns ``None`` when ``fetch_lat_long_data`` yields nothing usable
    (``None`` or an empty result).
    """
    results = fetch_lat_long_data(country)
    if not results:
        return None
    first_hit = results[0]
    return ','.join([first_hit['lat'], first_hit['lon']])

# Geocode every country; each result is a "lat,lon" string (or None).
country_names = movies_metadata_countries['production_countries']
movies_metadata_countries['lat_long'] = country_names.apply(get_lat_long)

# Split the combined string into separate 'lat' and 'long' columns, then
# discard the temporary combined column.
lat_long_parts = movies_metadata_countries['lat_long'].str.split(',', expand=True)
movies_metadata_countries[['lat', 'long']] = lat_long_parts
movies_metadata_countries = movies_metadata_countries.drop(columns=['lat_long'])

# write the data to a csv file
movies_metadata_countries.to_csv('./movies_metadata_countries.csv', index=False)

In [216]:
import altair as alt
from vega_datasets import data

# Load the world country outlines (TopoJSON 'countries' feature) for drawing
base = alt.topo_feature(data.world_110m.url, 'countries')

# Create a base map
base_map = alt.Chart(base).mark_geoshape(
    fill='lightblue',
    stroke='white'
).project('equirectangular')

# Create a scatter plot layer: one circle per production country, placed at
# its geocoded position; both size and colour encode the number of movies.
points = alt.Chart(movies_metadata_countries).mark_circle().encode(
    longitude='long:Q',
    latitude='lat:Q',
    size=alt.Size('count:Q', title='Count', scale=alt.Scale(range=[50, 250])),
    color=alt.Color('count:Q', title='Count', scale=alt.Scale(range=['blue', 'darkblue'])),
    # 'count' is a whole number, so it is formatted as an integer.
    tooltip=[alt.Tooltip('production_countries', title='Country'), alt.Tooltip('count', format='d', title='Count')]
).project('equirectangular')

# Combine the base map and scatter plot
chart = base_map + points

# Set the chart properties
chart = chart.properties(
    title='Countries Distribution',
    width=800,
    height=600
).interactive()

chart