# Lecture 5 Demo: Building Tables

In [1]:
from datascience import *
import numpy as np

## Columns of Tables are Arrays

In [None]:
# Read the nba_salaries data from a csv file in the local directory into a Table,
# then display the table
nba = Table.read_table('nba_salaries.csv')
nba

In [None]:
# A column can be fetched by its integer index; the result is an array
nba.column(3)

In [None]:
# A column can also be fetched by its label; again the result is an array
nba.column('team')

In [None]:
# Find the maximum season (year) in the data
# (np.max is applied to the array of 'season' values)
np.max(nba.column('season'))

In [None]:
# Restrict our table rows to the 2020 season
# (this rebinds the name nba to the filtered table, so the full table is
# no longer available under this name)
nba = nba.where('season', 2020)
nba

In [None]:
# Now we can drop the season; every row is for the 2020 season
# Note: after this cell nba has no 'season' column, so the cells above that
# use 'season' are not expected to work if re-run, unless the csv file is
# read again first
nba = nba.drop('season')
nba

In [None]:
# Our table has a number of rows (here: only the 2020-season rows)
nba.num_rows

In [None]:
# And a number of columns (the 'season' column was dropped above)
nba.num_columns

In [None]:
# And a sequence of column labels, one label per column
nba.labels

In [None]:
# Make a new table just for the Pacers (2020 season):
# keep the rows of nba whose 'team' value is 'Indiana Pacers'
pacers = nba.where('team', 'Indiana Pacers')
# show() is called with no row limit here, to display the table's rows
pacers.show()

In [None]:
# We can make a new table with the 'salary' column relabeled as '$'
# (the new table is only displayed here; it is not assigned to a name)
pacers.relabeled('salary', '$')

In [None]:
# But notice, the value of pacers is unchanged:
# relabeled made a new table rather than modifying pacers
pacers

In [None]:
# A common coding pattern when working with tables is to chain together multiple
# method calls in a single statement; each method is called on the result of
# the call before it
# Let's make a table for the Detroit Pistons, 2019 season, from the original csv file
pistons = Table.read_table('nba_salaries.csv').where('season', 2019).drop('season').where('team', 'Detroit Pistons').relabeled('salary', '$')

pistons

In [None]:
# Having such a long line of code makes it hard to read
# Here's a trick for splitting the line of code over multiple notebook lines
# (introduce parentheses around the expression on the right-hand side, then add line breaks)
# The chain of method calls is the same as in the one-line version
pistons = (
    Table.read_table('nba_salaries.csv')
    .where('season', 2019)
    .drop('season')
    .where('team', 'Detroit Pistons')
    .relabeled('salary', '$')
)
pistons

In [None]:
# Let's select one column of the pistons table (the '$' column)
# and give the result a name
pistons_salaries_2019 = pistons.select('$')
pistons_salaries_2019

In [None]:
# What type of thing is this? (i.e., what kind of object did select give us?)
type(pistons_salaries_2019)

In [None]:
# select returns a new Table object, not an array
# If we try to add up the salary numbers using np.sum(), what happens?
# (np.sum is being handed a whole Table here, not an array of numbers)
np.sum(pistons.select('$'))

In [None]:
# Since np.sum is meant to take an array as its argument, it's better to
# find the sum like this (column gives us the array of '$' values):
np.sum(pistons.column('$'))

In [None]:
# Takeaway: t.select('some_label') returns a new table
#           t.column('some_label') returns an array
# For example, the array of values in the 'name' column:
pistons.column('name')

In [None]:
# column works the same way for any label, e.g. 'rank'
pistons.column('rank')

In [None]:
# What was the average Phoenix Suns salary in the 2020 season
# (for players in this nba table)?
# Start by looking at the table again
nba

In [None]:
# First step: keep only the rows whose 'team' value is 'Phoenix Suns'
nba.where('team', 'Phoenix Suns')


In [None]:
# Keep just the Suns' rows, then just the 'name' and 'salary' columns
phoenix_salaries_2020 = (
    nba
    .where('team', 'Phoenix Suns')
    .select('name', 'salary')
)
phoenix_salaries_2020.show()

In [None]:
# Average the array of 'salary' values, then round the result for display
suns_avg_salary_2020 = np.mean(phoenix_salaries_2020.column('salary'))
round(suns_avg_salary_2020)

Back to slides...

## Ranges (np.arange)

In [None]:
# One way to get an array of numbers is to use np.arange
# With one argument, the values start at 0 and stop before the argument: 0 through 6
np.arange(7)

In [None]:
# print shows the array's values as plain text
print(np.arange(7))

In [None]:
# What type of thing does np.arange give us? (a NumPy array)
type(np.arange(7))

In [None]:
# np.arange call with two arguments: np.arange(start, end)
# The values run from start up to, but not including, end
# The number of values is end minus start (when start and end are integers)
np.arange(5, 12)  # 12 - 5 = 7; 7 values in this range

In [None]:
# np.arange call with three arguments: np.arange(start, end, stepsize)
# Generates the sequence: start; start + stepsize; start + stepsize + stepsize; etc.
# end is NOT included
np.arange(0, 20, 5)  # 0, 5, 10, 15 (20 is the end, so it is left out)

In [None]:
# end does not have to be one of the steps: still 0, 5, 10, 15
np.arange(0, 19, 5)

In [None]:
# Now 20 comes before the end (21), so it is included: 0, 5, 10, 15, 20
np.arange(0, 21, 5)

In [None]:
# stepsize can be negative: count down from 10 by twos, stopping before -10
np.arange(10, -10, -2)

In [None]:
# stepsize can be a float: 0, 0.3, 0.6, ... stopping before 2
np.arange(0, 2, .3)

In [None]:
# start and end can be floats
# (no stepsize is given, so the step is 1: 1.5, 2.5, ... stopping before 7.3)
np.arange(1.5, 7.3)

In [None]:
# start, end, and stepsize can all be floats: 1.5, 1.9, 2.3, ... stopping before 7.3
np.arange(1.5, 7.3, 0.4)

Back to slides...

## Creating Tables

In [None]:
# Recall the pistons table: Detroit Pistons, 2019 season, with 'salary' relabeled as '$'
pistons

In [None]:
# take(0) makes a table containing only the row at index 0 (the first row)
pistons.take(0)

In [None]:
# take makes a table using a sequence of row indexes
# The indexes do not need to be in increasing order
pistons.take(5, 3, 0, 2)

In [None]:
# The row indexes can also be given as an array, such as one made by np.arange
pistons.take(np.arange(5))  # same as pistons.take(0, 1, 2, 3, 4)

In [None]:
# Take every third row, starting at index 0: same as pistons.take(0, 3, 6, ...)
# The range stops at num_rows (not num_rows + 1): the last valid row index is
# num_rows - 1, and an end of num_rows + 1 would produce the out-of-range index
# num_rows whenever num_rows is a multiple of 3.
pistons.take(np.arange(0, pistons.num_rows, 3))

In [None]:
# Compare and contrast: t.take() versus t.select()
# take(0, 1, 2) picks ROWS by index
pistons.take(0, 1, 2)

In [None]:
# select(0, 1, 2) picks COLUMNS by index
pistons.select(0, 1, 2)

In [None]:
# select can also pick columns by their labels
pistons.select('rank', 'name', 'position')

In [None]:
# Compare and contrast: t.take() versus t.where()
# take picks rows by position: after sorting by '$' in descending order,
# indexes 0 through 4 are the five rows with the largest '$' values
pistons.sort('$', descending=True).take(np.arange(5))

In [None]:
# where picks rows by a condition on their values:
# every row whose '$' value is at least 7e6 (7 million), however many there are
pistons.where('$', are.above_or_equal_to(7e6))

In [None]:
# Compare and contrast: t.take(0, 1, 2) versus t.show(3)
# take makes a new table, so another method (such as sort) can be chained onto it
pistons.take(0, 1, 2).sort('position')

In [None]:
# show(3) only *displays* the first 3 rows; it does not give back a table,
# so there is nothing to chain .sort() onto and Python raises an error.
# The error is caught and printed so that this demonstration does not halt
# a "Run All" of the notebook.
try:
    pistons.show(3).sort('position')
except AttributeError as err:
    print('AttributeError:', err)

## More Ways to Create Tables

In [None]:
# At the top of this notebook, we saw how to create a table using Table.read_table(filename)
# (this re-reads the csv file, so nba once again names the full table, all seasons)
nba = Table.read_table('nba_salaries.csv')
nba.show(3)

In [None]:
# Another way to make a new table from scratch is to start with an empty
# table, then add one or more columns of values.
# The values for a column can be collected up in an array.

last = make_array('Abu', 'Berry', 'Boyles', 'Denton', 'King')
first = make_array('Peter', 'Lydia', 'Zach', 'Spencer', 'Kaela')
year = make_array(2026, 2026, 2025, 2025, 2027)
units = make_array(13, 15.5, 22.25, 23.5, 4.5)

# with_columns takes alternating labels and arrays and adds the columns in
# the order given
students = Table().with_columns(
    'last name', last,
    'first name', first,
    'grad year', year,
    'units earned', units,
)

students

Clearly, if we have more than a few rows of data, it's much easier to create the table in a spreadsheet program, save it in .csv format, and then bring it into the Jupyter Notebook environment using `Table.read_table(filename)`.

In [None]:
# What is the length of a table?
# (len counts the table's columns, not its rows)
len(students)

The students table is a sequence of length 4, i.e., a sequence of four columns. Each column is an array of values. The length of the array for a column equals the number of rows in the table.

In [None]:
# num_rows counts rows (contrast with len(students), which counts columns)
students.num_rows

In [None]:
# The array for any one column has one value per row, so its length equals num_rows
len(students.column('last name'))

In [None]:
# Notice we could rearrange the columns by giving select the column indexes
# in a new order: first name, last name, units earned, grad year
students.select(1, 0, 3, 2)

In [None]:
# We can sort the rows based on one of the columns, such as 'units earned'
# (sort makes a new table; students itself is not reassigned here)
students.sort('units earned')

In [None]:
# Can sort into descending order as well, by passing descending=True
# (here the rows are ordered by 'last name')
students.sort('last name', descending=True)

Back to slides...

# An Example

Discussion Question: Use the table functions we learned this week, and the W.E.B. DuBois data, to find the income bracket (“class”) that spent the highest proportion of their income on food.

In [2]:
# Read the Du Bois data from du_bois.csv into a Table, then display it
du_bois = Table.read_table('du_bois.csv')
du_bois

CLASS,ACTUAL AVERAGE,RENT,FOOD,CLOTHES,TAXES,OTHER,STATUS
100-200,139.1,0.19,0.43,0.28,0.001,0.099,POOR
200-300,249.45,0.22,0.47,0.23,0.04,0.04,POOR
300-400,335.66,0.23,0.43,0.18,0.045,0.115,FAIR
400-500,433.82,0.18,0.37,0.15,0.055,0.245,FAIR
500-750,547.0,0.13,0.31,0.17,0.05,0.34,COMFORTABLE
750-1000,880.0,0.0,0.37,0.19,0.08,0.36,COMFORTABLE
1000 and over,1125.0,0.0,0.29,0.16,0.045,0.505,WELL-TO-DO


In [3]:
# We need to consider the actual average income for each class
# (column gives the 'ACTUAL AVERAGE' values as an array, one value per class)
du_bois.column('ACTUAL AVERAGE')

array([  139.1 ,   249.45,   335.66,   433.82,   547.  ,   880.  ,  1125.  ])

In [4]:
# We also need to consider the proportion of income spent on food for each class
# (again as an array, one value per class)
du_bois.column('FOOD')

array([ 0.43,  0.47,  0.43,  0.37,  0.31,  0.37,  0.29])

In [5]:
# Give those arrays names so that the code that follows is easier to read
income_avg = du_bois.column('ACTUAL AVERAGE')
income_avg

array([  139.1 ,   249.45,   335.66,   433.82,   547.  ,   880.  ,  1125.  ])

In [6]:
# Likewise, name the array of food proportions
food_proportion = du_bois.column('FOOD')
food_proportion

array([ 0.43,  0.47,  0.43,  0.37,  0.31,  0.37,  0.29])

In [7]:
# Multiply each class's income_avg by its food_proportion to learn total
# dollars spent on food each year (on average).
# Multiplying the two arrays pairs up the values class by class.
food_dollars = income_avg * food_proportion
food_dollars

array([  59.813 ,  117.2415,  144.3338,  160.5134,  169.57  ,  325.6   ,
        326.25  ])

In [8]:
# Put the food_dollars figures into a table so they can be read in context.
# Keep three columns, chosen by label -- class, food (proportion), status --
# and append food (dollars) as a fourth.
du_bois.select('CLASS', 'FOOD', 'STATUS').with_column('FOOD (DOLLARS)', food_dollars)

CLASS,FOOD,STATUS,FOOD (DOLLARS)
100-200,0.43,POOR,59.813
200-300,0.47,POOR,117.241
300-400,0.43,FAIR,144.334
400-500,0.37,FAIR,160.513
500-750,0.31,COMFORTABLE,169.57
750-1000,0.37,COMFORTABLE,325.6
1000 and over,0.29,WELL-TO-DO,326.25


In [9]:
# Build the four-column table, relabel its second column from 'FOOD' to
# 'FOOD (PROPORTION)', and assign the result a name
du_bois_2 = (
    du_bois
    .select('CLASS', 'FOOD', 'STATUS')
    .with_column('FOOD (DOLLARS)', food_dollars)
    .relabeled('FOOD', 'FOOD (PROPORTION)')
)
du_bois_2

CLASS,FOOD (PROPORTION),STATUS,FOOD (DOLLARS)
100-200,0.43,POOR,59.813
200-300,0.47,POOR,117.241
300-400,0.43,FAIR,144.334
400-500,0.37,FAIR,160.513
500-750,0.31,COMFORTABLE,169.57
750-1000,0.37,COMFORTABLE,325.6
1000 and over,0.29,WELL-TO-DO,326.25


In [10]:
# Swap the second and third columns, so that STATUS comes right after CLASS
# (the columns are named by label, in the order we want them)
du_bois_3 = du_bois_2.select('CLASS', 'STATUS', 'FOOD (PROPORTION)', 'FOOD (DOLLARS)')
du_bois_3

CLASS,STATUS,FOOD (PROPORTION),FOOD (DOLLARS)
100-200,POOR,0.43,59.813
200-300,POOR,0.47,117.241
300-400,FAIR,0.43,144.334
400-500,FAIR,0.37,160.513
500-750,COMFORTABLE,0.31,169.57
750-1000,COMFORTABLE,0.37,325.6
1000 and over,WELL-TO-DO,0.29,326.25


In [11]:
# Sort by 'FOOD (DOLLARS)' in descending order: the largest dollar amount comes first
du_bois_3.sort('FOOD (DOLLARS)', descending=True)

CLASS,STATUS,FOOD (PROPORTION),FOOD (DOLLARS)
1000 and over,WELL-TO-DO,0.29,326.25
750-1000,COMFORTABLE,0.37,325.6
500-750,COMFORTABLE,0.31,169.57
400-500,FAIR,0.37,160.513
300-400,FAIR,0.43,144.334
200-300,POOR,0.47,117.241
100-200,POOR,0.43,59.813


In [12]:
# Now sort by 'FOOD (PROPORTION)' instead, again in descending order:
# the class that spent the highest proportion of income on food comes first
du_bois_3.sort('FOOD (PROPORTION)', descending=True)

CLASS,STATUS,FOOD (PROPORTION),FOOD (DOLLARS)
200-300,POOR,0.47,117.241
100-200,POOR,0.43,59.813
300-400,FAIR,0.43,144.334
400-500,FAIR,0.37,160.513
750-1000,COMFORTABLE,0.37,325.6
500-750,COMFORTABLE,0.31,169.57
1000 and over,WELL-TO-DO,0.29,326.25


In 1900, well-to-do Black Americans spent \\$326.25 on food, per year, on average (about \\$27 per month). Those with the lowest incomes spent about \\$60 on food, per year, on average (about \\$5 per month).

Even though they spent the most on food (in terms of dollars), those with the highest incomes spent the smallest *proportion* of their income on food (29%).