In [None]:
# Usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Movies

In [None]:
# Load the table of actors and their careers from CSV, then display it
actors = Table.read_table("actors.csv")
actors

In [None]:
# Is the number of movies an actor has made related to various outcomes?
# First outcome: Total Gross, scattered against Number of Movies
actors.scatter("Number of Movies", "Total Gross")

In [None]:
# Second outcome: Average per Movie, scattered against Number of Movies
actors.scatter("Number of Movies", "Average per Movie")

In [None]:
# One actor has only a few, hugely successful movies: who is it?
# Show the rows with fewer than 10 movies
actors.where("Number of Movies", are.below(10))

In [None]:
# Treat the actor with fewer than 10 movies as an outlier and redo the plot.
# Keep everyone with 10 or more movies: are.above(10) is strictly greater-than,
# so it would also drop any actor with exactly 10 movies, not just the outlier.
no_outlier = actors.where('Number of Movies', are.above_or_equal_to(10))
no_outlier.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Which actors have appeared in the most movies (more than 50)?
actors.where("Number of Movies", are.above(50))

In [None]:
# Load a second dataset, movies by year, and display it
movies = Table.read_table("movies_by_year.csv")
movies

In [None]:
# Line plot of Number of Movies against Year
movies.plot("Year", "Number of Movies")

In [None]:
# Restrict to the 21st century (years above 1999) and plot the number of movies again
century_21 = movies.where("Year", are.above(1999))
century_21.plot("Year", "Number of Movies")

In [None]:
# Plot the Total Gross by Year for the 21st-century rows
century_21.plot("Year", "Total Gross")

In [None]:
# What is the 2009 movie?
# (A plain value passed to where() keeps the rows equal to that value.)
century_21.where("Year", 2009)

## Plotting Categorical Distributions

In [None]:
# Load the top-movies table and display it
top = Table.read_table("top_movies_2021.csv")
top
# Note the order!

In [None]:
# Keep only the first 5 rows so the bar charts are easier to read
top5 = top.take(np.arange(0, 5))
top5.barh("Title", "Gross")

In [None]:
# Same kind of bar chart, this time for Gross (Adjusted)
top5.barh("Title", "Gross (Adjusted)")

In [None]:
# Columns can be given by index instead of by name:
# much less typing, but much harder to understand
top5.barh(0, 3)

In [None]:
# What does Gross look like for these five?
# Select Title and Gross, then chart by Title
top5.select("Title", "Gross").barh("Title")

In [None]:
# Plot several quantities on one chart: Gross and Gross (Adjusted) by Title
top5.select("Title", "Gross", "Gross (Adjusted)").barh("Title")

In [None]:
# Compute the ratio Gross (Adjusted) / Gross for every movie, order the rows by Year,
# and plot the ratio against Year
result = top.with_column(
    "ratio", top.column("Gross (Adjusted)") / top.column("Gross")
).sort("Year")
plots.plot(result.column("Year"), result.column("ratio"));
# Any theories on what's going on?

In [None]:
# Look at some of the movies from "low" years like 1950
top.where("Year", are.equal_to(1950))
# Try a few other years, and some "high" ones.  Is there a pattern to the difference?

In [None]:
# Group the movies by Studio to see which studio has produced the most
studios = top.group("Studio")
studios

In [None]:
# Order the studios from highest to lowest count, then draw a bar chart
studios.sort("count", descending=True).barh("Studio")