In [None]:
# The star import is what brings Table, make_array, are, Marker and Circle
# into scope for the cells below.
from datascience import *
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): path_data is not referenced by any cell visible in this notebook;
# the CSV files below are read by bare filename -- confirm which is intended.
path_data = '../../../assets/data/'
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')
import numpy as np


# Lecture 12: Table Examples

## A Join Example ##

In [None]:
# Reload the census data seen previously and preview its first three rows
full = Table.read_table('nc-est2019-agesex-res.csv')
full.show(3)

# Keep only the three columns we need: sex code, age, and the 2019 population estimate
census = full.select(
    'SEX',
    'AGE',
    'POPESTIMATE2019',
)
census.show(3)

In [None]:
# SEX holds numeric codes; this small lookup table spells out what each code means
sex_codes = Table().with_columns(
    'SEX CODE',        make_array(0, 1, 2),
    'CODE DEFINITION', make_array('All', 'Selected Male', 'Selected Female'),
)
sex_codes

In [None]:
# Match 'SEX CODE' in the lookup table against 'SEX' in the census table so that
# every census row is annotated with its code definition; then sort by age
# and display the first nine rows.
(
    sex_codes
    .join('SEX CODE', census, 'SEX')
    .sort('AGE')
    .show(9)
)

In [None]:
# Same join with the roles of the two tables swapped -- compare this output
# with the previous cell to see what changes.
(
    census
    .join('SEX', sex_codes, 'SEX CODE')
    .sort('AGE')
    .show(9)
)

**Back to Slides...**

## Bike Sharing ##

In [None]:
# Load the data on hourly bike sharing in the Bay Area -- over 243,000 bike trips!
# `trip` is the table every later cell in this section works from;
# show(3) previews its first three rows.
trip = Table.read_table('trip.csv')
trip.show(3)

The trip begins when the rider checks the bike out from the start station, and the trip ends when they check the bike in at the end station.

## Distribution of Durations ##

In [None]:
# To investigate the distribution of trip durations, we make a histogram
# of the 'Duration' column over all trips, with no custom bins.
trip.hist('Duration')

This often happens -- we plot a histogram for unfamiliar data, and the result is strange. 

We need to investigate: Why is the x-axis being scaled out to 2 million? Are there some extremely large 'Duration' values? What is the unit label for Duration, anyway?

In [None]:
# To get a peek at the largest duration values, we sort into descending order by Duration.
# The sorted table is the cell's last expression, so the notebook displays it.
trip.sort('Duration', descending=True)

In [None]:
# Seconds in a 30-day month: (seconds/minute) * (minutes/hour) * (hours/day) * days
seconds_per_month = 60 * 60 * 24 * 30
print(f"{seconds_per_month} seconds per month")

# For comparison: seconds in a 30-minute trip
print(30 * 60, "seconds per half hour")

In [None]:
# Most trips were under 30 minutes: subscribers could ride for free
# as long as the trip was not more than 30 minutes.

# Zoom in on the short trips only: Duration below 30 minutes (30 * 60 = 1800 seconds).
commute = trip.where('Duration', are.below(30 * 60))
commute.hist('Duration')

plt.title("Duration in Seconds for Trips of Less than 30 Minutes");

In [None]:
# What percentage of all the trips were "short"?
# `commute` holds only the trips with Duration below 1800 seconds, so its
# row count divided by the row count of `trip` (all trips) is the share of
# short trips; multiplying by 100 expresses it as a percentage.
commute.num_rows / trip.num_rows * 100

In [None]:
# Explicit bin edges give a clearer histogram: one edge every 250 seconds.
# The `unit` keyword names the unit of the plotted column for the plot.
# NOTE(review): np.arange(0, 1800, 250) stops at 1750, so durations from 1750 up to
# 1800 seconds lie beyond the last bin edge -- confirm that is acceptable.
commute.hist(
    'Duration',
    bins=np.arange(0, 1800, 250),
    unit='Second',
)
plt.title("Duration in Seconds for Trips of Less than 30 Minutes");

The rectangle for bin \[250, 500) is about 0.15 in height. How do we find the area of that rectangle, and how do we interpret that area?

In [None]:
# Area of a histogram bar = width * height.
# Width is the bin's span in seconds and height is read off the plot,
# so the product comes out in percent.
base, height = 500 - 250, 0.15
area = base * height
print(f"About {area} percent of short trips last between 250 and 500 seconds.")

In [None]:
# Verify the estimate against the data itself.
# `commute` contains the short trips only; count those whose Duration
# is selected by are.between(250, 500).
bin_count = commute.where('Duration', are.between(250, 500)).num_rows
bin_count

In [None]:
# Express the bin count as a percentage of all short trips (rows of `commute`),
# to compare with the area estimated from the histogram.
bin_count / commute.num_rows * 100

In fact, 37.95% of short trips are at least 250 seconds long and less than 500 seconds long. Our estimate was pretty close.

Why is our histogram so ugly and "blocky" looking? Can we show more detail of the distribution?

In [None]:
# We can set bins to 60, and we'll get 60 bins/rectangles.
# NOTE(review): 1800/60 assumes the bins span exactly 0 to 1800 seconds; with an
# integer `bins` the edges presumably follow the data's own minimum and maximum,
# so the printed width is likely approximate -- confirm.
print("Width of each bin is", 1800/60, "seconds")
commute.hist('Duration', bins=60, unit='Second')

Notice that the y-scale is unchanged, because it's scaled by density.

## Start and End Stations ##

Discuss with your neighbor: How can we find the most common starting station?

In [None]:
# Count trips per start station with `group`, then put the busiest station first
starts = (
    trip
    .group('Start Station')
    .sort('count', descending=True)
)
starts

In [None]:
# Use `column` and `item` to get the name of the most frequent start station.
# `starts` is sorted by count in descending order, so item 0 is the top station.
starts.column('Start Station').item(0)

If we go to Google Maps and search for 'San Francisco Caltrain Townsend at 4th', we can see it on a map of San Francisco.

In [None]:
# Numbers of trips between pairs of stations -- a natural use of `pivot`.
# With only two labels given, each cell of the result is a count of trips
# for that combination of start and end station.
trip.pivot('Start Station', 'End Station')

In [None]:
# Same pivot layout as before, but each cell now aggregates the 'Duration'
# values for that pair of stations with np.average instead of counting trips.
trip.pivot(
    'Start Station',
    'End Station',
    values='Duration',
    collect=np.average,
)

Question: Does it make sense to use average trip duration as a "proxy" for the actual distance between the starting and ending stations? Why or why not?

A better proxy might be the time for the *fastest* trip between each pair of stations.

## Fastest Trips between Stations ##

How can we find the fastest trip ever between each pair of stations?

In [None]:
# We take the `group` route here (other methods would work as well).
# First narrow the table down to the three columns this question needs.
duration = trip.select(
    'Start Station',
    'End Station',
    'Duration',
)
duration

In [None]:
# Cross-tabulate by grouping on the (start, end) station pair: one row per pair,
# with the remaining column (Duration) collapsed to its minimum for that pair.
shortest = duration.group(
    ['Start Station', 'End Station'],
    min,
)
shortest.show(5)

In [None]:
# Let's see a few more rows of the per-pair minimum durations
shortest.show(12)

The pivot table we drew previously had lots of 0s. Now they have disappeared. Why?

## Discussion question

BART is the "Bay Area Rapid Transit" system. Find the 5 stations closest to Civic Center BART by minimum trip time.

In [None]:
# Rows of `shortest` that start at Civic Center BART, quickest destination first
cc = 'Civic Center BART (7th at Market)'
from_cc = shortest.where('Start Station', cc).sort('Duration min')
# NOTE(review): if the data contains trips that both start and end at Civic Center,
# that station itself will appear among these rows -- confirm whether it should be excluded.
# Display the first five rows of the sorted table.
from_cc.take(np.arange(5))

Here's another fun thing we can do: use maps to visualize data! 

## Maps ##

In [None]:
# Geographical data on the stations
# NOTE(review): drop(4, 6) removes two columns by position, which depends on the
# column order of station.csv -- confirm which columns these are; dropping by
# label would be more robust.
stations = Table.read_table('station.csv').drop(4, 6)
stations

In [None]:
# Restrict to the bike share stations whose 'landmark' is San Francisco.
# Passing a plain value to `where` selects the rows equal to that value.
sf_stations = stations.where('landmark', 'San Francisco')
sf_stations.show(5)

In [None]:
# Build the "map data" table: keep the latitude, longitude, and name columns,
# then relabel 'name' as 'labels'.
sf_stations_map_data = sf_stations.select('lat', 'long', 'name').relabeled('name', 'labels')

sf_stations_map_data.show(5)

In [None]:
# The Marker object is defined in the datascience module
# We call `map_table` on it and provide our map data
# (the table with 'lat', 'long', and 'labels' columns built above).
Marker.map_table(sf_stations_map_data)

In [None]:
# We can use `Circle` instead of `Marker`, with the same map data table
Circle.map_table(sf_stations_map_data)

### Discussion question

Map all stations within 4 minutes (minimum ride time) of Civic Center BART.

In [None]:
# from_cc is our table of trips starting at Civic Center, sorted so the
# stations with the shortest minimum ride time come first
from_cc.show(4)

In [None]:
# The map_data table holds the information needed to put markers on the map:
# 'lat', 'long', and 'labels' for each San Francisco station
sf_stations_map_data.show(4)

In [None]:
# We'll need to join the tables somehow to get all the information in one table

# Start by keeping only the rows of from_cc whose minimum ride time from
# Civic Center is below 4 minutes (4 * 60 seconds)
close_cc = from_cc.where('Duration min', are.below(4 * 60))
close_cc.show(4)

In [None]:
# Join the two tables by matching "labels" (map data) with "End Station" (close_cc),
# then keep only the columns a map marker needs. The result is saved as close_markers.
close_markers = (
    sf_stations_map_data
    .join('labels', close_cc, 'End Station')
    .select('lat', 'long', 'labels')
)

# Plot one marker per station that survived the join
Marker.map_table(close_markers)

In [None]:
# To help understand how the map was made, check out the join result
# before the `select` step trims it down to the marker columns
sf_stations_map_data.join('labels', close_cc, 'End Station')