# My IMDb

In [None]:
from functools import reduce
import os
import numpy as np
import pandas as pd
import altair as alt

# Relative location of the IMDb CSV export that is loaded with `pd.read_csv` below.
path = '../data/imdb.csv'

In [None]:
# Load the raw export. It is read with `header=None`, so the column names are
# supplied explicitly here.
imdb = pd.read_csv(
    path,
    header=None,
    names=['title', 'release_year', 'genre', 'rating', 'user_rating', 'votes', 'range_year'],
)

# Surrogate key (row position) used to join the derived tables back to `imdb`.
imdb['id'] = pd.Series(range(len(imdb)), name='id')
imdb['user_rating'] = imdb['user_rating'].astype('float')

# Genres table: one row per (id, genre). The `genre` column holds a
# comma-separated list, so each entry is split and stripped of whitespace.
# A flat comprehension is used instead of folding lists together with `+`,
# which re-copies the accumulated list on every step.
genres = pd.DataFrame([
    dict(id=title_id, genre=g.strip())
    for title_id, genre_list in zip(imdb['id'], imdb['genre'])
    for g in genre_list.split(',')
])
genres['genre'] = genres['genre'].astype('category')

# Ratings table in long form: one row per (id, src), where `src` says whether
# the value came from the `rating` column ('IMDb') or `user_rating` ('User').
ratings = imdb[['id', 'rating']].copy()
ratings['src'] = 'IMDb'
user_ratings = imdb[['id', 'user_rating']].rename(columns=dict(user_rating='rating'))
user_ratings['src'] = 'User'
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the two
# frames the same way (original row labels are kept, as `append` did).
ratings = pd.concat([ratings, user_ratings])
ratings['src'] = ratings['src'].astype('category')

# These columns now live in `genres` / `ratings`; drop them from the main table.
imdb = imdb.drop(columns=['genre', 'rating', 'user_rating'])

## Counts

In [None]:
# One row per (title, genre), carrying the title's release year.
imdb_join_genre = pd.merge(
    imdb[['id', 'release_year']],
    genres,
    how='inner',
    on='id',
)

### By "Release Year"

In [None]:
# Bar chart of the number of titles per release year.
hist = (
    alt.Chart(imdb)
    .mark_bar()
    .encode(
        alt.X('release_year:N', title='Release Year'),
        alt.Y('count(id)', title='Count'),
    )
)
# Same encodings as the bars, drawn as text shifted 10px up, showing the count.
text = hist.mark_text(dy=-10).encode(alt.Text('count(id)'))

alt.layer(hist, text)

### By "Genre" every "Release Year"

In [None]:
# Shared base: count rows per (release_year, genre) into field `N`, then place
# release year on x and genre on y.
base = (
    alt.Chart(imdb_join_genre)
    .transform_aggregate(N='count()', groupby=['release_year', 'genre'])
    .encode(
        alt.X('release_year:N', title='Release Year'),
        alt.Y('genre', title='Genre'),
    )
)
# Coloured cells plus the count written inside each cell.
heatmap = base.mark_rect().encode(alt.Color('N:Q'))
text = base.mark_text(baseline='middle', color='gray').encode(alt.Text('N:Q'))

alt.layer(heatmap, text)

## Votes

### Histogram of "Votes"

In [None]:
# Histogram of `votes`, binned into at most 50 bins, with vertical axis labels.
chart = (
    alt.Chart(imdb)
    .mark_bar()
    .encode(
        alt.X(
            'votes',
            bin=alt.Bin(maxbins=50),
            axis=alt.Axis(title='Total Votes', labelAngle=-90),
        ),
        alt.Y('count()', title='Bin Count'),
    )
)

# Bin counts as text, shifted 10px up from the bar position.
text = chart.mark_text(dy=-10).encode(alt.Text('count()'))

alt.layer(chart, text).properties(width=875)

### "Votes" by "Year"

In [None]:
# Box plot of vote counts per release year, limited to titles released after 1980.
(
    alt.Chart(imdb.loc[imdb.release_year > 1980])
    .mark_boxplot()
    .encode(
        alt.X('release_year:N', title='Release Year'),
        alt.Y('votes', title='Number of votes'),
    )
)

## Ratings

In [None]:
# Attach each title's release year to every one of its rating rows.
all_year_ratings = pd.merge(
    imdb[['id', 'release_year']],
    ratings,
    how='inner',
    on='id',
)

### Distribution over all "Year"s

In [None]:
# One horizontal box plot per rating source, pooled over all titles released
# after 1980.
(
    alt.Chart(all_year_ratings.loc[all_year_ratings.release_year > 1980])
    .mark_boxplot()
    .encode(
        alt.X('rating', title='Ratings'),
        alt.Y('src', title='Source'),
        alt.Color('src', title='Source'),
    )
)

### Distribution for every "Release Year"

In [None]:
# Box plots of ratings per source, faceted into one row per release year
# (titles released after 1980 only).
alt.Chart(all_year_ratings[all_year_ratings['release_year'] > 1980]).mark_boxplot().encode(
  y=alt.Y('src', title='Source'),
  x=alt.X('rating', title='Ratings'),
  # Declared nominal (`:N`) like the other release-year encodings in this
  # notebook; a facet channel needs a discrete field, not an inferred type.
  row=alt.Row('release_year:N', title='Release Year'),
  color=alt.Color('src', title='Source'))

### Distribution by "Genre"

In [None]:
# One row per (title, genre, rating source): ids joined to genres, then to ratings.
all_genre_ratings = pd.merge(
    pd.merge(imdb[['id']], genres, how='inner', on='id'),
    ratings,
    how='inner',
    on='id',
)

In [None]:
# Box plots of ratings per source, faceted into one row per genre. The chart
# is written to 'chart.json' rather than returned as the cell's value.
(
    alt.Chart(all_genre_ratings)
    .mark_boxplot()
    .encode(
        alt.X('rating', title='Ratings'),
        alt.Y('src', title='Source'),
        alt.Row('genre', title='Genre'),
        alt.Color('src', title='Source'),
    )
    .save('chart.json')
)