In [None]:
from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to every plot in the notebook
plots.style.use('fivethirtyeight')
# Force an edge color on patches (bars, histogram rectangles)
plots.rcParams["patch.force_edgecolor"] = True

## Lecture 8 ##

## Categorical Distribution ##

In [None]:
# Load the video game sales data from the CSV file into a Table
vgsales = Table.read_table('vgsales.csv')
# Display the table
vgsales

In [None]:
# Years elapsed between each game's release year and 2022
release_years = vgsales.column('Year')
years_since = 2022 - release_years

# Attach the elapsed years to the table as a new column
vgsales = vgsales.with_column('Years Since Release', years_since)

# Rank all games by global sales (largest first) and keep the first 10 rows.
# NOTE(review): no publisher filter is applied here, despite the variable name.
ranked_by_sales = vgsales.sort('Global_Sales', descending=True)
top10_nintendo = ranked_by_sales.take(np.arange(10))

# Horizontal bar chart of years since release, one bar per game title
top10_nintendo.barh('Name', 'Years Since Release')


In [None]:
# Keep only the 'Genre' column as its own one-column table
genre = vgsales.select('Genre')
# Display the table
genre

In [None]:
# Group rows by 'Genre'; the result has one row per genre with a 'count' column
genre_distribution = genre.group('Genre')

In [None]:
# Display the genre counts
genre_distribution

In [None]:
# Total of the per-genre counts
sum(genre_distribution.column('count'))

## Bar Charts ##

In [None]:
# Horizontal bar chart with one bar per genre
genre_distribution.barh('Genre')

In [None]:
# Same bar chart, with the genres ordered from largest to smallest count
genres_by_count = genre_distribution.sort('count', descending=True)
genres_by_count.barh('Genre')

## Numerical Distribution - Binning ##

In [None]:
# Display the table, which now includes the 'Years Since Release' column
vgsales

In [None]:
# Smallest and largest years-since-release values (useful when picking bin edges)
min(years_since), max(years_since)

In [None]:
# Hand-picked bin edges with unequal widths: 5, 5, 5, 5, 10 and 13
my_bins = make_array(0, 5, 10, 15, 20, 30, 43)

In [None]:
# Count the 'Years Since Release' values falling in each bin defined by my_bins
binned_data = vgsales.bin('Years Since Release', bins = my_bins)
# Display the bin edges and their counts
binned_data

In [None]:
# Total number of values counted across all bins
sum(binned_data.column('Years Since Release count'))

In [None]:
# Show the rows ordered from most to fewest years since release
vgsales.sort('Years Since Release', descending=True)

In [None]:
# Equal-width bins of 5 with edges 0, 5, ..., 40 (np.arange stops before 45)
vgsales.bin('Years Since Release', bins = np.arange(0, 45, 5))

In [None]:
# Equal-width bins of 5 with edges 0, 5, 10, 15, 20 (np.arange stops before 21)
vgsales.bin('Years Since Release', bins = np.arange(0, 21, 5))

In [None]:
# Number of rows whose 'Years Since Release' is 20 or more
vgsales.where('Years Since Release', are.above_or_equal_to(20)).num_rows

## Histograms ##

In [None]:
# Display the hand-picked bin edges again
my_bins

In [None]:
# Display the per-bin counts again
binned_data

In [None]:
# Let's make our first histogram!
# Uses the unequal-width bin edges stored in my_bins
vgsales.hist('Years Since Release', bins = my_bins)

In [None]:
# Let's try equally spaced bins instead.
# Bin edges are 0, 5, ..., 40 (np.arange stops before 45)
vgsales.hist('Years Since Release', bins = np.arange(0, 45, 5))

In [None]:
# Let's try not specifying any bins!
# With no bins argument, hist falls back to its default binning
vgsales.hist('Years Since Release')

In [None]:
# Add a 'Percent' column: each bin's count as a percentage of the total binned count
bin_counts = binned_data.column('Years Since Release count')
binned_data = binned_data.with_column('Percent', 100*bin_counts/sum(bin_counts))

In [None]:
# Display the bins together with the new 'Percent' column
binned_data

In [None]:
# Sanity check: the percentages across all bins should add up to 100
sum(binned_data.column('Percent'))

## Height ##

### Question: What is the height of the [20, 30) bin?

In [None]:
# Step 1: Look up the percent of games in the [20, 30) bin,
#         i.e. the row whose 'bin' (left edge) value is 20
row_20_to_30 = binned_data.where('bin', 20)
percent = row_20_to_30.column('Percent').item(0)
percent

In [None]:
# Step 2: Calculate the width of the 20-30 bin
#         (right edge minus left edge)
width = 30 - 20

In [None]:
# Step 3: Area of rectangle = height * width
#         --> height = percent / width
# The bar's area is the bin's percent, so its height is percent per unit of width
height = percent / width
height

In [None]:
# Redraw the histogram to compare the [20, 30) bar against the computed height
vgsales.hist('Years Since Release', bins = my_bins)

### What are the heights of the rest of the bins?

In [None]:
# Get the bin lefts
# Keep every row except the last one, leaving one row per bin width computed below
# NOTE(review): presumably the dropped last row only marks the final right edge — confirm
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))

In [None]:
# Width of each bin = difference between consecutive bin edges
bin_edges = binned_data.column('bin')
bin_widths = np.diff(bin_edges)
bin_lefts = bin_lefts.with_column('Width', bin_widths)

In [None]:
# Get the bin heights
# Height = percent in the bin divided by the bin's width (element-wise)
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)

In [None]:
# Display each bin with its 'Percent', 'Width' and 'Height'
bin_lefts

In [None]:
# Redraw the histogram to compare every bar against the computed heights
vgsales.hist('Years Since Release', bins = my_bins)