In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 10 #

## Functions with multiple arguments and review of tb.apply()

In [None]:
ages = Table().with_columns(
    'Person', make_array('A', 'B', 'C', 'D'),
    'Age', make_array(23, 110, 28, 102)
)
ages

In [None]:
# define the cut_off function that takes two arguments
def cut_off(z, cut_off_value = 100):
    """Cap a value at a ceiling.

    Parameters:
        z: the value to cap.
        cut_off_value: the largest value that may be returned (default 100).

    Returns:
        `z` when it does not exceed `cut_off_value`, otherwise `cut_off_value`.
    """
    # The cap only wins when it is strictly smaller than z (same rule as min).
    if cut_off_value < z:
        return cut_off_value
    return z

In [None]:
# get its type
type(cut_off)

In [None]:
# try out the function
cut_off(3)

In [None]:
cut_off(107)

In [None]:
cut_off(57, 50)

In [None]:
# apply the function to a table
cut_age_array = ages.apply(cut_off, 'Age')
cut_age_array

In [None]:
ages.with_column('Cut off ages', cut_age_array)

In [None]:
# define another derived function called cut_off_50
def cut_off_50(x):
    """Cap `x` at 50 by delegating to `cut_off` with a fixed cut-off value."""
    capped = cut_off(x, cut_off_value=50)
    return capped

In [None]:
cut_off_50(67)

In [None]:
ages.apply(cut_off_50, 'Age')

## Prediction example ##

To practice applying functions to data, let's look at data collected by Francis Galton.


In [None]:
galton = Table.read_table('galton.csv')

In [None]:
# Each row corresponds to one adult child. The columns are:
#   - family: family indicator
#   - father's height (inches)
#   - mother's height (inches)
#   - "midparent height": weighted average of the parents' heights
#   - children: number of children in the family
#   - childNum: the child's birth rank (1 = oldest)
#   - gender
#   - height (inches)
galton

In [None]:
# Keep only the two columns we need, selected by position (index 3 and index 7),
# and give them shorter labels.
# NOTE(review): presumably these are the midparent height and the child's height,
# matching the new labels — confirm against the column order of galton.csv.
heights = galton.select(3, 7).relabeled(0, 'MidParent').relabeled(1, 'Child')

In [None]:
heights

In [None]:
# Side note:  overlapping histogram 
heights.hist(bins=20, unit='inches')

In [None]:
# create a scatter plot of the data
heights.scatter('MidParent', 'Child')

In [None]:
# add vertical lines in a particular range of values
heights.scatter('MidParent', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
# calculate the mean of the children's heights for rows whose MidParent value falls in a particular range
nearby = heights.where('MidParent', are.between(67.5, 68.5))
nearby.column('Child').mean()

In [None]:
# plot the mean child height on the scatter plot
# The gold marker's height is computed from `nearby` (the rows with MidParent in the
# 67.5-68.5 band, built in an earlier cell) rather than hard-coded as a rounded copy of
# that cell's output, so the marker always matches the data.
heights.scatter('MidParent', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby.column('Child').mean(), color='gold', s=50);

In [None]:
# write a function to predict a child's height based on their (average) parents' heights that fall in a particular range
def predict_child(h):
    """Predict a child's height from a midparent height `h`.

    Selects the rows of the global `heights` table whose 'MidParent' value
    satisfies `are.between(h - 0.5, h + 0.5)` and returns the mean of their
    'Child' values.
    """
    lower, upper = h - 0.5, h + 0.5
    close_rows = heights.where('MidParent', are.between(lower, upper))
    child_heights = close_rows.column('Child')
    return child_heights.mean()

predict_child(68)

In [None]:
# try to make some predictions
predict_child(70)

In [None]:
predict_child(72)

In [None]:
# predict all values in our data set
predicted = heights.apply(predict_child, 'MidParent')
predicted

In [None]:
# add predictions to our heights table and plot the results
heights = heights.with_column('Predicted child', predicted)
heights

In [None]:
heights.scatter('MidParent')

## Apply with Multiple Columns ##

If we have a function that takes two arguments, we can apply it to values in two columns. 

Let's examine the range of temperatures that occur on different days.


In [None]:
temperatures = Table.read_table('temperatures.csv')
temperatures

In [None]:
# plot the high and the low temperature as a function of the day
temperatures.plot('Day')

In [None]:
# plot the high and the low temperature overlapping histograms
temperatures.select('Low', 'High').hist(bins=np.arange(30, 105, 5))

In [None]:
# create a scatter plot of the high and the low temperature
temperatures.scatter('Low', 'High')

In [None]:
# Difference between high temp and low temp
def difference(x, y):
    """Return `x` minus `y`."""
    gap = x - y
    return gap

difference(65, 54)

In [None]:
# calculate the spread of temperatures on each day and add it to our table
daily_spread = temperatures.apply(difference, 'High', 'Low')
temperatures = temperatures.with_column('Spread', daily_spread)
temperatures

In [None]:
# create a histogram of the spreads of temperatures
temperatures.hist('Spread', bins=np.arange(0, 40, 4))

In [None]:
# calculate the proportion of days that have more than a 20 degree spread in temperatures
temperatures.where('Spread', are.above(20)).num_rows / temperatures.num_rows

## Grouping by Category ##

Previously we've seen how we can use the `tb.group()` method to calculate the number of values in each group. We will now also use it to apply a function separately to each group. 


In [None]:
all_cones = Table.read_table('cones.csv')
all_cones

In [None]:
# remove the Color column for now, and also leave out the row at index 5
cones = all_cones.drop('Color').exclude(5)
cones

In [None]:
# count how many cones there are of each Flavor
cones.group('Flavor')

In [None]:
# Calculate the minimum price for each Flavor
cones.group('Flavor', min)

In [None]:
# Get the average price for each Flavor
cones.group('Flavor', np.average)

In [None]:
# Get a list of prices for each Flavor
cones.group('Flavor', list)

In [None]:
# User-defined function
def spread(arr):
    """Return the range of the values in `arr`: its largest value minus its smallest."""
    largest = max(arr)
    smallest = min(arr)
    return largest - smallest

spread(make_array(7, 10, 2))

In [None]:
# Use your own function in .group
cones.group('Flavor', spread)

## Grouping by multiple columns

We can also group by multiple columns. This will give us separate values for each combination of values in the columns we group by. 

In [None]:
cones

In [None]:
all_cones

In [None]:
# Count how many items have a particular Flavor and Color combination
all_cones.group(['Flavor', 'Color'])

In [None]:
# Take the average price for each particular Flavor and Color combination
all_cones.group(['Flavor', 'Color'], np.average)

## Example - NBA salaries ##

In [None]:
nba = Table.read_table('nba_salaries.csv').relabeled(3, 'SALARY')
nba

In [None]:
# Total salary paid by each team, highest first
nba.select('TEAM', 'SALARY').group('TEAM', sum).sort('SALARY sum', descending=True)

In [None]:
# what happens if we don't select only numerical columns first?
nba.group('TEAM', sum)

In [None]:
# Average salary paid for each position
nba.select('POSITION', 'SALARY').group('POSITION', np.average)

In [None]:
# For each team, average salary paid for each position
nba.drop('PLAYER').group(['TEAM', 'POSITION'], np.average)