In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [3]:
# Load the county-level COVID-19 table from the New York Times GitHub repository.
# Each row is one (date, county) record; columns: date, county, state, fips, cases, deaths.
covid_table = Table.read_table("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid_table

date,county,state,fips,cases,deaths
2020-01-21,Snohomish,Washington,53061,1,0
2020-01-22,Snohomish,Washington,53061,1,0
2020-01-23,Snohomish,Washington,53061,1,0
2020-01-24,Cook,Illinois,17031,1,0
2020-01-24,Snohomish,Washington,53061,1,0
2020-01-25,Orange,California,6059,1,0
2020-01-25,Cook,Illinois,17031,1,0
2020-01-25,Snohomish,Washington,53061,1,0
2020-01-26,Maricopa,Arizona,4013,1,0
2020-01-26,Los Angeles,California,6037,1,0


In [4]:
# Load a table giving one latitude/longitude per county (columns: county, state, fips, lat, lon).
# Some rows have no fips code (e.g. "New York City" and "Unknown" in the preview below).
county_geo = Table.read_table("https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv") 
county_geo

county,state,fips,lat,lon
New York City,New York,,40.7146,-74.0071
Westchester,New York,36119.0,41.1191,-73.7887
Nassau,New York,36059.0,42.5164,-73.6113
Suffolk,New York,36103.0,40.9601,-72.8343
Cook,Illinois,17031.0,41.8139,-87.6155
King,Washington,53033.0,47.4325,-121.959
Unknown,New Jersey,,0.0,0.0
Wayne,Michigan,26163.0,42.2852,-83.3836
Los Angeles,California,6037.0,34.0536,-118.246
Bergen,New Jersey,34003.0,40.9476,-74.0276


### Process the recent data

First we will calculate the average number of new cases per day in each county, for a subset of states. 

In [5]:
# Rows dated strictly after this date are treated as "recent".
first_date = '2021-02-20'

# Some subsets of states to visualize.
# This list has 48 states; Alaska and Hawaii are not in it.
continental_states = [
    'Pennsylvania', 'Arizona', 'Connecticut', 'Florida', 'Wisconsin', 'South Dakota',
    'Colorado', 'New Jersey', 'California', 'Ohio', 'West Virginia', 'Oregon',
    'Alabama', 'Maine', 'Vermont', 'Montana', 'Nevada', 'Washington',
    'Massachusetts', 'Maryland', 'Iowa', 'North Dakota', 'Kentucky', 'Delaware',
    'Mississippi', 'Tennessee', 'Virginia', 'Kansas', 'Missouri', 'Utah',
    'North Carolina', 'Louisiana', 'South Carolina', 'Minnesota', 'Arkansas', 'Indiana',
    'New York', 'Wyoming', 'New Mexico', 'Rhode Island', 'Michigan', 'Nebraska',
    'New Hampshire', 'Georgia', 'Texas', 'Illinois', 'Oklahoma', 'Idaho',
]

states = continental_states

# Two filters in sequence: first by date, then by state.
# The dates are 'YYYY-MM-DD' strings, so comparing them as strings orders them chronologically.
recent_data = covid_table.where('date', are.above(first_date))
recent_state_data = recent_data.where('state', are.contained_in(states))
recent_state_data

date,county,state,fips,cases,deaths
2021-02-21,Autauga,Alabama,1001,6117,85
2021-02-21,Baldwin,Alabama,1003,19433,262
2021-02-21,Barbour,Alabama,1005,2070,50
2021-02-21,Bibb,Alabama,1007,2416,58
2021-02-21,Blount,Alabama,1009,6042,125
2021-02-21,Bullock,Alabama,1011,1151,33
2021-02-21,Butler,Alabama,1013,1940,64
2021-02-21,Calhoun,Alabama,1015,12940,278
2021-02-21,Chambers,Alabama,1017,3367,99
2021-02-21,Cherokee,Alabama,1019,1753,37


In [6]:
# Keep only the two columns needed below. Selecting them by name (instead of
# dropping every other column one by one) keeps this cell working even if the
# source table gains extra columns.
data = recent_state_data.select('fips', 'cases')

# exclude rows where fips is not known
data = data.where('fips', are.above(0))

# now, group by fips and form a list of the cumulative cases for each county
# NOTE: np.diff below assumes each list is in date order, i.e. that the rows of
# the source table are sorted by date (as they are in the previews above).
data = data.group('fips', list)

# apply the difference function np.diff to get the new cases on each day
data = data.with_column('new cases', data.apply(np.diff, 'cases list'))
data = data.drop('cases list')


def mean_or_zero(values):
    """Return the mean of `values`, or 0.0 when `values` is empty.

    A county with only one recent row has no day-to-day differences. np.mean of
    an empty array is nan (and emits a warning), so such a county is reported
    as having 0 average new cases instead.
    """
    return np.mean(values) if len(values) > 0 else 0.0


# Now average to get the average new cases per day in each county over the
# recent period (all dates after first_date).
# We add a small amount .001 to avoid zeros, which the graphics handles badly
new_cases = Table().with_columns('fips', data['fips'],
                                 'new cases', data.apply(mean_or_zero, 'new cases') + .001)
new_cases

fips,new cases
1001,19.126
1003,40.626
1005,5.751
1007,4.251
1009,7.501
1011,2.251
1013,3.501
1015,45.001
1017,5.751
1019,2.126


In [7]:
# Restrict the geo table to the chosen states and order it by fips code.
state_geo = (
    county_geo
    .where('state', are.contained_in(states))
    .sort('fips')
)
state_geo

county,state,fips,lat,lon
Autauga,Alabama,1001,32.5077,-86.651
Baldwin,Alabama,1003,30.7698,-87.7827
Blount,Alabama,1009,34.0128,-86.5337
Bullock,Alabama,1011,32.0927,-85.7129
Butler,Alabama,1013,32.0894,-88.2213
Calhoun,Alabama,1015,33.7623,-85.8421
Chambers,Alabama,1017,32.9188,-85.3938
Cherokee,Alabama,1019,34.7555,-87.9734
Chilton,Alabama,1021,32.866,-86.6652
Choctaw,Alabama,1023,32.004,-88.2858


Now we join the table of average new cases with the geo table, so that we have latitude and longitude 
for each county.


In [8]:
# Match each county's location with its average new cases via the shared
# 'fips' column; the fips code is only the join key, so drop it afterwards.
new_cases_geo = state_geo.join('fips', new_cases).drop('fips')
new_cases_geo


county,state,lat,lon,new cases
Autauga,Alabama,32.5077,-86.651,19.126
Baldwin,Alabama,30.7698,-87.7827,40.626
Blount,Alabama,34.0128,-86.5337,7.501
Bullock,Alabama,32.0927,-85.7129,2.251
Butler,Alabama,32.0894,-88.2213,3.501
Calhoun,Alabama,33.7623,-85.8421,45.001
Chambers,Alabama,32.9188,-85.3938,5.751
Cherokee,Alabama,34.7555,-87.9734,2.126
Chilton,Alabama,32.866,-86.6652,11.376
Choctaw,Alabama,32.004,-88.2858,0.626


Finally, we can create a map where we show a circle at each county location, with area
proportional to the average number of new cases per day over the recent period (the dates after `first_date`).

In [None]:
# Pick a random row index into new_cases_geo.
# A Table exposes its row count as `num_rows`; it has no attribute `n`.
n = np.random.choice(np.arange(new_cases_geo.num_rows))

In [18]:
# Assemble the table handed to Circle.map_table: latitude and longitude come
# first, followed by each circle's label, area and color.
# Each area is 5 times the county's average new cases; every circle is red.
dat = Table().with_columns(
    'lat', new_cases_geo.column('lat'),
    'long', new_cases_geo.column('lon'),
    'labels', new_cases_geo.column('county'),
    'areas', new_cases_geo.column('new cases') * 5,
    'colors', 'red',
)
dat.show(10)
Circle.map_table(dat, weight=1)

lat,long,labels,areas,colors
32.5077,-86.651,Autauga,105.005,red
30.7698,-87.7827,Baldwin,213.576,red
34.0128,-86.5337,Blount,39.2907,red
32.0927,-85.7129,Bullock,12.8621,red
32.0894,-88.2213,Butler,15.005,red
33.7623,-85.8421,Calhoun,239.291,red
32.9188,-85.3938,Chambers,30.7193,red
34.7555,-87.9734,Cherokee,12.1479,red
32.866,-86.6652,Chilton,60.7193,red
32.004,-88.2858,Choctaw,2.86214,red


## Table examples

In [19]:
# A small example table: one row per (drink, cafe) pair with its price.
# Note that Espresso is sold at two different cafes.
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Book Trader Cafe', 4],
    ['Espresso', "Willoughby's",  2],
    ['Coffee',   "Willoughby's",  3],
    ['Espresso', "Blue State Coffee",   2]
])
drinks

Drink,Cafe,Price
Milk Tea,Book Trader Cafe,4
Espresso,Willoughby's,2
Coffee,Willoughby's,3
Espresso,Blue State Coffee,2


In [20]:
# Coupons available at each cafe, as a percentage off.
# Willoughby's appears twice (5% and 25%), so joining on it yields two rows per drink.
discounts = Table().with_columns(
    'Coupon % off', make_array(5, 50, 25, 0),
    'Location', make_array("Willoughby's", "Blue State Coffee", "Willoughby's", "Book Trader Cafe")
)
discounts

Coupon % off,Location
5,Willoughby's
50,Blue State Coffee
25,Willoughby's
0,Book Trader Cafe


In [21]:
# Discussion question:  Generate a table with one row per cafe that
# has the name and discounted price of its cheapest discounted drink

# Step 1: pair every drink with every coupon offered at its cafe, matching
# the 'Cafe' column of drinks against the 'Location' column of discounts.
combined = drinks.join('Cafe', discounts, 'Location')
combined.show()

# Step 2: convert each coupon percentage into the price actually paid.
discounted_prices = combined['Price'] * (1 - combined['Coupon % off']/100)

# Step 3: attach those prices as a new column.
discounted_drinks = combined.with_column('Discounted price', discounted_prices)
discounted_drinks


Cafe,Drink,Price,Coupon % off
Blue State Coffee,Espresso,2,50
Book Trader Cafe,Milk Tea,4,0
Willoughby's,Espresso,2,5
Willoughby's,Espresso,2,25
Willoughby's,Coffee,3,5
Willoughby's,Coffee,3,25


Cafe,Drink,Price,Coupon % off,Discounted price
Blue State Coffee,Espresso,2,50,1.0
Book Trader Cafe,Milk Tea,4,0,4.0
Willoughby's,Espresso,2,5,1.9
Willoughby's,Espresso,2,25,1.5
Willoughby's,Coffee,3,5,2.85
Willoughby's,Coffee,3,25,2.25


In [22]:
# Correct: after sorting by 'Discounted price', the one row that distinct=True
# keeps for each cafe is its cheapest drink (Espresso at 1.5 for Willoughby's).
discounted_drinks.sort('Discounted price').sort('Cafe', distinct=True) 

Cafe,Drink,Price,Coupon % off,Discounted price
Blue State Coffee,Espresso,2,50,1.0
Book Trader Cafe,Milk Tea,4,0,4.0
Willoughby's,Espresso,2,25,1.5


In [23]:
# Incorrect - need to sort by "Discounted price" first. Without that sort,
# distinct=True keeps whichever row of each cafe comes first in the table
# (Willoughby's Espresso at 1.9 rather than the cheaper 1.5).
discounted_drinks.sort('Cafe', distinct=True) 

Cafe,Drink,Price,Coupon % off,Discounted price
Blue State Coffee,Espresso,2,50,1.0
Book Trader Cafe,Milk Tea,4,0,4.0
Willoughby's,Espresso,2,5,1.9


In [14]:
# Incorrect: min is taken over each column separately, so the values in a row
# no longer belong together. For Willoughby's, 'Coffee' (first alphabetically)
# ends up next to the price 1.5, which is actually the Espresso price.
discounted_drinks.group('Cafe', min) 

Cafe,Drink min,Price min,Coupon % off min,Discounted price min
Blue State Coffee,Espresso,2,50,1.0
Book Trader Cafe,Milk Tea,4,0,4.0
Willoughby's,Coffee,2,5,1.5


## Sample midterm question

Challenge yourself and try to solve these on your own before looking at the solutions!

In [24]:
# Read the raw trip records, then keep just the three columns used in the
# questions below, under shorter labels.
trip0 = Table.read_table("trip.csv")
trip = Table().with_columns(
    "Start", trip0["Start Station"],
    "End", trip0["End Station"],
    "Duration", trip0["Duration"],
)
trip.show(3)

Start,End,Duration
Harry Bridges Plaza (Ferry Building),San Francisco Caltrain (Townsend at 4th),765
San Antonio Shopping Center,Mountain View City Hall,1036
Post at Kearny,2nd at South Park,307


In [16]:
# The name of the station where the most rentals ended 
#(assume no ties).

In [17]:
# The number of stations for which the average duration ending 
# at that station was more than 10 minutes.

In [18]:
# The number of stations that have more than 500 starts 
# AND more than 500 ends

In [19]:
# Which station had the most rentals end there? (assume no ties)
# Count the rentals ending at each station, put the largest count first,
# and read the station name off the top row.
trip.group('End').sort('count', descending=True).column('End').item(0)





'San Francisco Caltrain (Townsend at 4th)'

In [20]:
# How many stations have an average duration, over the rentals ending there,
# of more than 10 minutes?

# Average every column per end station, keep the stations whose average
# duration is above the limit, and count the rows that remain.
# The limit is written as 10*60, so Duration is taken to be in seconds.
trip.group('End', np.average).where('Duration average', are.above(10*60)).num_rows





68

In [21]:
# How many stations have more than 500 starts AND more than 500 ends?

# Count the rentals starting at each station; name the key column 'Station'
# so that it can be matched against the end counts.
starting = trip.group('Start').relabeled(['Start', 'count'], ['Station', 'Start count'])
# Same for the rentals ending at each station.
ending = trip.group('End').relabeled(['End', 'count'], ['Station', 'End count'])
# Line the two counts up per station, apply both thresholds, count what is left.
(
    starting.join('Station', ending)
    .where('Start count', are.above(500))
    .where('End count', are.above(500))
    .num_rows
)




56

## Comparison ##

In [26]:
type(3 > 1)

bool

In [23]:
type(3 > 1)

bool

In [27]:
3 < 1

False

In [25]:
True

True

In [29]:
3 == 4

False

In [32]:
# Intentional error: True is a keyword, so Python refuses to assign to it.
True = 3

SyntaxError: can't assign to keyword (<ipython-input-32-679355094e21>, line 1)

In [34]:
x = 14
y = 3

In [35]:
x > 10

True

In [36]:
12 < x < 18

True

In [31]:
12 < x

True

In [32]:
x < 18

True

In [37]:
12 < x-y < 18

False

In [38]:
x > 10 and y > 5

False

## Comparisons with arrays

In [39]:
pets = make_array('cat', 'dog', 'cat', 'cat', 'dog', 'rabbit')
pets

array(['cat', 'dog', 'cat', 'cat', 'dog', 'rabbit'], dtype='<U6')

In [41]:
pets == 'dog'

array([False,  True, False, False,  True, False])

In [43]:
0 + 1 + 0 + 0 + 1 + 0

2

In [44]:
sum(make_array(False, True, False, False, True, False))

2

In [45]:
sum(pets == 'dog')

2

In [40]:
np.count_nonzero(pets == 'dog')

2

In [41]:
# Comparing an array of strings with a string works element by element:
# 'dog' and 'rabbit' compare greater than 'cat', while 'cat' itself does not.
pets > 'cat'

array([False,  True, False, False,  True,  True])

In [42]:
sum(pets > 'cat')

3

In [47]:
"cat" < "batastrophe"

False

## Predicates and advanced `where`

In [48]:
terms = Table().with_column('Semester', np.arange(1, 9))
terms

Semester
1
2
3
4
5
6
7
8


In [49]:
terms.where('Semester', are.above(6))

Semester
7
8


In [51]:
is_senior = are.above(6)

In [52]:
type(is_senior)

datascience.predicates._combinable

In [54]:
is_senior(100)

True

In [55]:
def also_is_senior(x):
    """Return whether semester number `x` is past the sixth semester.

    Works on a single number (giving a bool) and, element by element, on an
    array of numbers (giving an array of bools).
    """
    last_non_senior_semester = 6
    return x > last_non_senior_semester

In [57]:
also_is_senior(100)

True

In [58]:
terms.apply(also_is_senior, 'Semester')

array([False, False, False, False, False, False,  True,  True])

In [59]:
terms.where('Semester', are.above(6))

Semester
7
8


In [60]:
terms.where('Semester', is_senior)

Semester
7
8


In [61]:
terms.where('Semester', also_is_senior)

Semester
7
8


In [62]:
# where also accepts an array of booleans (one per row) in place of a column
# label and predicate; the rows marked True are kept.
terms.where(terms.apply(also_is_senior, 'Semester'))

Semester
7
8


In [64]:
b = terms.apply(also_is_senior, 'Semester')
b

array([False, False, False, False, False, False,  True,  True])

In [65]:
terms.where(b)

Semester
7
8


In [73]:
s = 'this      is a test'
s.split()

['this', 'is', 'a', 'test']