In [1]:
import altair as alt
import pandas as pd
import os
import pathlib

In [2]:
# Root directory of the shared datasets, read from the DATASETS environment
# variable. An unset variable raises KeyError here, before any data is loaded.
DATASETS = pathlib.Path(
    os.environ['DATASETS'],
)

In [3]:
# Load the preprocessed ratings, indexed by the (userId, movieId) pair.
# NOTE(review): this path is relative to the kernel's working directory,
# whereas u.item is read from under DATASETS -- confirm that is intended.
ratings = pd.read_csv(
    'data/ml-100k/preprocessed_ratings.csv',
    index_col=['userId', 'movieId'],
)

In [4]:
# Names of the per-genre flag columns, in the order they are passed to
# read_csv for u.item.
genre_cols = [
    "Unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Full column list for u.item: five metadata fields followed by the genre flags.
movies_cols = [
    'movieId',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
    *genre_cols,
]

In [5]:
# Read the movie table (u.item) from under DATASETS. The file is
# pipe-separated and latin-1 encoded; column names are supplied explicitly
# through movies_cols, and movieId becomes the index.
movies = pd.read_csv(
    DATASETS / 'recommender/movies/ml-100k/u.item',
    names=movies_cols,
    sep='|',
    encoding='latin-1',
    index_col=['movieId'],
)

In [6]:
# Keep only the genre flag columns; title, release_date, video_release_date
# and imdb_url are not used by the analysis below.
# Selecting the columns and rebinding (instead of drop(..., inplace=True))
# makes this cell idempotent: re-running it no longer raises KeyError because
# the metadata columns were already removed by a previous run.
movies = movies[genre_cols]

In [7]:
# Attach each movie's genre flags to its ratings by joining the two frames
# on movieId (pandas' default join type applies, since `how` is not given).
merged = pd.merge(
    ratings,
    movies,
    left_on='movieId',
    right_on='movieId',
)

In [8]:
# One (genre, mean rating) pair per genre: the mean of `rating` over the rows
# whose flag for that genre equals 1. A genre with no matching rows yields NaN.
averages = [
    (genre, merged.loc[merged[genre] == 1, 'rating'].mean())
    for genre in genre_cols
]

In [9]:
# Tabulate the (genre, mean) pairs as a two-column frame for plotting.
data = pd.DataFrame(
    averages,
    columns=['genre', 'avg_rating'],
)

In [10]:
# Interactive bar chart of the mean rating per genre.
# Dragging vertically selects a range of genres; bars inside the selection are
# drawn fully opaque and the red rule moves to the mean of the selected bars.

# selection_interval is the dedicated constructor for an interval selection;
# the generic alt.selection(type='interval') form is deprecated in Altair 5.
brush = alt.selection_interval(encodings=['y'])

# Encoding types are stated explicitly (N = nominal, Q = quantitative) rather
# than inferred, because these charts only receive their data at the
# alt.layer(...) call below.
bars = alt.Chart().mark_bar().encode(
    y=alt.Y('genre:N', title='Genre'),
    x=alt.X('avg_rating:Q', title='Average rating'),
    opacity=alt.condition(brush, alt.OpacityValue(1), alt.OpacityValue(0.7))
).add_selection(
    brush
)

# Rule at the unweighted mean of the per-genre averages that fall inside the
# brush. It carries the same axis title as the bars so the shared x axis shows
# a single label.
line = alt.Chart().mark_rule(color='firebrick').encode(
    x=alt.X('mean(avg_rating):Q', title='Average rating'),
    size=alt.SizeValue(3)
).transform_filter(
    brush
)

# Both layers share `data`; the layered chart is the cell's displayed output.
alt.layer(bars, line, data=data).properties(
    title='Average rating by genre'
)