In [1]:
import altair as alt
import pandas as pd
import numpy as np

import utils

In [None]:
# Load the movie data through the project-local `utils` helper.
# Source: https://www.kaggle.com/tmdb/tmdb-movie-metadata#tmdb_5000_movies.csv
# NOTE(review): later cells read `genre` and `release_year` columns; presumably
# the helper derives them from the raw TMDB file -- confirm in utils.
df = utils.read_movie_data()
# Quick preview of the loaded data (presumably a pandas DataFrame -- confirm).
df.head()

# What's the relationship between vote_count and vote_average?

In [None]:
# Scatter plot: one point per row of df, vote_count on x and vote_average on y.
relationship = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x=alt.X('vote_count'),
        y=alt.Y('vote_average'),
    )
)
# Shown as a static chart here; `.interactive()` is demonstrated in a later cell.
relationship

# How many movies were made per genre?

In [None]:
# Bar chart: number of rows in df for each genre.
genre = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('genre'),
        y=alt.Y('count()'),
    )
)
genre

# What kind of budget do different genres have?

In [None]:
# Bar chart: mean of the budget column for each genre.
genre_budget = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('genre'),
        y=alt.Y('mean(budget)'),
    )
)
genre_budget

In [None]:
# Place the mean-budget chart and the per-genre count chart side by side.
alt.hconcat(genre_budget, genre)

# How are people voting?

In [None]:
# Histogram of vote_average: binned on x, row count per bin on y.
votes = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('vote_average', bin=True),
        y=alt.Y('count()'),
    )
)
votes

In [None]:
# Repeat the vote histogram as one column panel per genre.
votes.encode(
    column=alt.Column('genre'),
)

In [None]:
# Bar chart: mean vote_average for each genre.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        y=alt.Y('mean(vote_average)'),
        x=alt.X('genre'),
    )
)

# Let's compare Action vs Drama specifically

In [None]:
# Shared semi-transparent histogram of vote_average; each genre below reuses it
# with its own filter so the two distributions can be overlaid.
genre_hist = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('vote_average', bin=True, title='Average Vote'),
        y=alt.Y('count()'),
        opacity=alt.value(0.7),
    )
)

# Action keeps the default bar colour.
action_genre = genre_hist.transform_filter('datum.genre == "Action"')

# Drama is drawn in firebrick so it can be told apart in the overlay.
drama_genre = (
    genre_hist
    .encode(color=alt.value('firebrick'))
    .transform_filter('datum.genre == "Drama"')
)

# Layer the two histograms on the same axes.
alt.layer(action_genre, drama_genre)

# Popularity (vote average) vs Budget and Revenue

In [None]:
# Interval (brush) selection; with nothing brushed, every point counts as selected.
selection = alt.selection_interval(empty='all')

# Shared scatter definition: vote_average on x, title as tooltip, and points
# coloured firebrick when inside the brush, light gray otherwise.
base = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('vote_average'),
        tooltip='title',
        color=alt.condition(
            selection,
            alt.value('firebrick'),
            alt.value('lightgray'),
        ),
    )
    .properties(selection=selection)
)

# Two linked panels that differ only in the y channel: budget (left), revenue (right).
alt.hconcat(
    base.encode(y=alt.Y('budget')),
    base.encode(y=alt.Y('revenue')),
)

# Budget vs Revenue - Does a large budget equal large revenue?

In [None]:
# Common budget (x) vs revenue (y) point chart that both layers build on.
budget_revenue_base = (
    alt.Chart(df)
    .mark_point()
    .encode(
        x=alt.X('budget'),
        y=alt.Y('revenue'),
    )
)

# Layer 1: point size encodes the binned vote_average.
vote_average = budget_revenue_base.encode(
    size=alt.Size('vote_average', bin=True, title='Average Vote')
)

# Layer 2: small black title labels, shifted 10px to the right of each point.
title_text = (
    budget_revenue_base
    .mark_text(align='left', baseline='middle', dx=10, fontSize=8)
    .encode(
        text=alt.Text('title'),
        color=alt.value('black'),
    )
)

# Overlay both layers and make the result interactive.
alt.layer(vote_average, title_text).interactive()

# Let's build a report!

In [None]:
# Multi-selection keyed on the genre field; it is attached to the `releases`
# chart and used as a filter by the two per-year charts.
selection = alt.selection_multi(fields=['genre'])

# Mean vote per release year, restricted to the selected genre(s).
vote_average = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        y=alt.Y('release_year:O', title='Release Year'),
        x=alt.X('mean(vote_average)', title='Mean Vote'),
    )
    .transform_filter(selection)
)

# Mean profit per release year, where profit is computed as revenue - budget,
# restricted to the selected genre(s).
profit = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        y=alt.Y('release_year:O', title='Release Year'),
        x=alt.X('mean(profits):Q', title='Mean Profit'),
    )
    .transform_calculate(profits='datum.revenue - datum.budget')
    .transform_filter(selection)
)

# Movie count per genre; this chart carries the selection, and bars outside
# the selection are faded to 10% opacity.
releases = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('genre', title='Genre'),
        y=alt.Y('count()', title='Number of movies'),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
    )
    .properties(
        height=200,
        width=860,
        selection=selection,
    )
)

# Layout: profit and vote charts side by side on top, genre selector underneath.
our_report = (profit | vote_average) & releases
our_report

In [None]:
from jinja2 import FileSystemLoader, Environment

In [None]:
# Jinja2 environment that looks templates up in the local ./templates/ folder.
fs = FileSystemLoader(searchpath='./templates/')
env = Environment(
    loader=fs,
)

In [None]:
# Free-text commentary passed to the report template. The text contains literal
# <br> tags and starts and ends with a newline.
comments = (
    "\n"
    "Drama is the most populous, but profits have been swingy. Critical acclaim does not seem to correspond to the profits made.<br>\n"
    "Comedies have seen a steady increase in profits over the years - it is also the third most populous genre among the movies analysed.<br>\n"
    "Action movies have generated large profits and seem to consistently be crowd pleasers\n"
)

In [None]:
# Heading text passed to the report template as `chart_title`.
chart_title = 'Comparing Profits and Votes per Genre'

In [None]:
# Render the HTML report from the Jinja2 template and write it to output.html.
template = env.get_template('template.html')

# Serialise the chart and render the page *before* opening the output file:
# opening with 'w' truncates it, so a failure in to_json()/render() would
# otherwise leave an empty output.html behind.
chart_data = our_report.to_json()
report_html = template.render(chart=chart_data, comments=comments, chart_title=chart_title)

# Write with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(report_html)