In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action='ignore',category=np.VisibleDeprecationWarning)

# Lecture 11 #

### Picking up where we left off last time
How could we take the sex of the child into account when making predictions? 
Does that lead to smaller errors?

In [None]:
# Read in data again and redefine variables and functions we used last time
families = Table.read_table('family_heights.csv')
# Element-wise midpoint of the 'father' and 'mother' columns (one value per row).
parent_avgs = (families.column('father') + families.column('mother'))/2
def difference(x, y):
    """Return `x` minus `y` (negative when `x` is the smaller value)."""
    gap = x - y
    return gap
# Assemble the working table: the parents' average computed above, plus the
# 'child' and 'sex' columns copied from `families` under capitalized labels.
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
    'Sex', families.column('sex')
)
heights

In [None]:
def predict_smarter(h, s, window=0.5):
    """Predict a child's height from the parents' average height and the child's sex.

    The prediction is the mean 'Child' value over the rows of `heights` whose
    'Parent Average' is close to `h` and whose 'Sex' equals `s`.

    Parameters
    ----------
    h : number
        Parents' average height, in the same units as the 'Parent Average' column.
    s : str
        Value to match in the 'Sex' column (e.g. 'female' or 'male').
    window : number, optional
        Half-width of the neighbourhood around `h`; rows with
        h - window <= 'Parent Average' < h + window are used.
        NOTE(review): 0.5 is chosen here; confirm it matches the predictor
        from the previous lecture.

    Returns
    -------
    float
        Average 'Child' height among the matching rows.
    """
    # Rows whose parents' average is within `window` of h.
    nearby = heights.where('Parent Average', are.between(h - window, h + window))
    # Of those, keep only children of the requested sex.
    nearby_same_sex = nearby.where('Sex', s)
    return np.average(nearby_same_sex.column('Child'))

In [None]:
predict_smarter(68, 'female')

In [None]:
predict_smarter(68, 'male')

In [None]:
# Call predict_smarter once per row, passing that row's 'Parent Average' and 'Sex'.
smarter_predicted_heights = heights.apply(predict_smarter, 'Parent Average', 'Sex')
# Attach the predictions as a new column next to the observed heights.
heights = heights.with_column('Smarter Prediction', smarter_predicted_heights)
heights

In [None]:
# Error = observed 'Child' height minus 'Smarter Prediction', row by row
# (positive when the child is taller than predicted).
smarter_pred_errs = heights.apply(difference, 'Child', 'Smarter Prediction')
heights = heights.with_column('Smarter Errors', smarter_pred_errs)
heights

In [None]:
# Histogram of the prediction errors, drawn separately for each value of 'Sex'.
heights.hist('Smarter Errors', group='Sex')

## Lists

In [None]:
[1, 5, 'hello', 5.0, make_array(1,2,3)]

### Rows from lists

In [None]:
# Start from a table that has column labels but no rows yet.
drinks = Table(['Drink', 'Cafe', 'Price'])
drinks

In [None]:
# Each inner list is one row; its values follow the order of the column
# labels given when the table was created: Drink, Cafe, Price.
drinks = drinks.with_rows([
    ['Milk Tea', 'Asha', 5.5],
    ['Espresso', 'Strada',  1.75],
    ['Latte',    'Strada',  3.25],
    ['Espresso', "FSM",   2]
])
drinks

## Grouping by One Column ##

In [None]:
# Load the cones data and discard the 'Color' column before grouping.
cones = Table.read_table('cones.csv').drop('Color')
cones

In [None]:
cones.group('Flavor')

In [None]:
cones.group('Flavor', np.average)

In [None]:
# Any function that reduces a collection to one value can be the collector;
# here it is Python's built-in min rather than a NumPy function.
cones.group('Flavor', min)

## Grouping By One Column: Welcome Survey ##

In [None]:
# Load the welcome-survey responses and preview the first three rows.
survey = Table.read_table('welcome_survey_fa23.csv')
survey.show(3)
def remove_plus(string):
    """Return `string` without a trailing ' or more', trimmed of outer whitespace.

    For example, '10 or more' becomes '10', while '7' is returned unchanged.

    `str.strip(' or more')` is deliberately not used: its argument is a *set of
    characters* to remove from both ends, not a suffix, so it would also eat
    any leading or trailing 'o', 'r', 'm', 'e' or space characters that belong
    to the value itself.
    """
    suffix = ' or more'
    if string.endswith(suffix):
        string = string[:-len(suffix)]
    # Keep trimming surrounding spaces, as the original call did.
    return string.strip()
# Replace 'Number of textees' with integers: remove_plus cleans each string,
# then the resulting array of strings is cast to int.
survey = survey.with_columns('Number of textees',survey.apply(remove_plus,'Number of textees').astype(int))
# Keep only the rows whose 'Handedness' value is not the string 'nan'.
survey = survey.where('Handedness',are.not_equal_to('nan'))

In [None]:
survey.hist('Extroversion',bins = np.arange(0,11,1))

In [None]:
survey.group('Extroversion')

In [None]:
# One row per distinct 'Extroversion' value, with np.average used as the
# collector for the remaining columns.
by_extra = survey.group('Extroversion', np.average)
by_extra

In [None]:
# The aggregated column is labelled 'Number of textees average' in `by_extra`
# (the original label followed by the collector's name).
by_extra.plot('Extroversion', 'Number of textees average')

In [None]:
# np.average is used as the collector for every column other than
# 'Sleep position'. NOTE(review): columns that cannot be averaged (e.g. text)
# presumably come back blank — confirm against the rendered output.
survey.group("Sleep position",np.average)

In [None]:
# Select just the two relevant columns first, then group by column index 0
# (i.e. 'Sleep position') and average the remaining 'Hours of sleep' column.
survey.select("Sleep position", "Hours of sleep").group(0, np.average)

## Grouping by Two Columns ##

![Do right-handed people tend to sleep on their left side and left-handed people sleep on their right?](handed.png)

In [None]:
survey.group(['Handedness', 'Sleep position'])

In [None]:
# Group on the pair ('Handedness', 'Sleep position') and use np.average as the
# collector for the other columns.
survey.group(['Handedness', 'Sleep position'],
             np.average)

## Pivot Tables

In [None]:
survey.group('Handedness')

In [None]:
# pivot(columns, rows): one column per 'Sleep position' value and one row per
# 'Handedness' value. With no `values`/`collect`, each cell is a row count.
survey.pivot('Sleep position', 'Handedness')

In [None]:
# Same layout as the counting pivot, but each cell now holds np.average of the
# 'Hours of sleep' values for that (Handedness, Sleep position) combination.
survey.pivot('Sleep position', 
             'Handedness', 
             values='Hours of sleep', 
             collect=np.average)

In [None]:
# This call is an error on purpose: `values` and `collect` are optional, but
# pivot needs either both of them or neither. Here only `collect` is given.
# The exception is caught and printed so the demonstration is still visible
# without stopping a Restart & Run All before the cells that follow.
try:
    survey.pivot('Sleep position',
                 'Handedness',
                 collect=np.average)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

## Activity ##

In [None]:
# From the CORGIS Dataset Project
# By Austin Cory Bart acbart@vt.edu
# Version 2.0.0, created 3/22/2016
# https://corgis-edu.github.io/corgis/csv/skyscrapers/

sky = Table.read_table('skyscrapers.csv')
# Add an 'age' column computed as 2022 minus the 'completed' column
# (presumably a completion year — the reference year 2022 is hard-coded),
# then drop 'completed'.
sky = (sky.with_column('age', 2022 - sky.column('completed'))
          .drop('completed'))
sky.show(3)

1. For each city, what’s the tallest building for each material?

In [None]:
# 1. For each city, what’s the tallest building for each material?






2. For each city, what’s the height difference between the tallest steel building and the tallest concrete building?

In [None]:
# 2. For each city, what’s the height difference between the tallest 
#    steel building and the tallest concrete building?









3. Generate a table of the names of the oldest buildings for each material for each city:

In [None]:
# 3. Generate a table of the names of the oldest buildings for each 
#    material for each city:

# Hint: You can use sort to find the name of the oldest building in the dataset
sky.sort('age', descending=True).column('name').item(0)


# Put your solution here








(No peeking! A solution appears below.)

In [None]:
def first(s):
    """Give back the element at position 0 of the array `s`."""
    head = s.item(0)
    return head

# Sort oldest-first, then pivot with `first` as the collector so each
# (city, material) cell keeps the first 'name' it sees — the oldest building.
# NOTE(review): relies on pivot preserving the sorted row order within each
# group — confirm.
(sky
 .sort('age', descending=True)
 .pivot('material', 'city', 'name', first)
)

## Joins ##

In [None]:
drinks

In [None]:
# Coupons keyed by cafe name. 'Asha' appears twice (10% and 5%), and no coupon
# is listed for 'FSM', which does appear in `drinks`.
discounts = Table().with_columns(
    'Coupon % off', make_array(10, 25, 5),
    'Location', make_array('Asha', 'Strada', 'Asha')
)
discounts

In [None]:
# Match rows of `drinks` to rows of `discounts` where drinks' 'Cafe' equals
# discounts' 'Location'.
# NOTE(review): rows without a match (e.g. the 'FSM' drink) are expected to be
# absent from the result — confirm against the rendered output.
combined = drinks.join('Cafe', discounts, 'Location')
combined

In [None]:
# Turn "percent off" into the fraction of the price still paid
# (e.g. 25% off -> 0.75), one value per row of `combined`.
discounted_frac = 1 - combined.column('Coupon % off') / 100
# with_column returns a new table; `combined` itself is left unchanged.
combined.with_column(
    'Discounted Price', 
    combined.column('Price') * discounted_frac
)

In [None]:
# Join `drinks` with itself on 'Cafe', pairing rows that share a cafe.
drinks.join('Cafe', drinks, 'Cafe')