In [None]:
# usual imports
from datascience import *
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plots
#plots.style.use('fivethirtyeight')

# Configure for presentation
#np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
#mpl.rc('font', size=16)

## Bikes

In [None]:
 # Read a dataset from a bike-rental firm containing 354k rentals
trips = Table.read_table("trip.csv")
# Displaying the table shows which columns this data set provides.
trips

In [None]:
# Treat rentals with a Duration below 1800 as the "commuter" subsample.
# Open question from the original author: are there significant rentals above that cutoff?
is_short_rental = are.below(1800)
commute = trips.where("Duration", is_short_rental)
commute.hist("Duration")

In [None]:
# Same histogram, tidied up: 60 bins and the unit named on the axis.
commute.hist("Duration", unit="second", bins=60)

In [None]:
# Bin edges at every integer from 0 through 1800, i.e. bins of width one.
# (The full trips data set has about 354K rows.)
duration_bin_edges = np.arange(1801)
commute.hist("Duration", bins=duration_bin_edges, unit="second")

In [None]:
# Count commuter rentals per start station, then list the busiest stations first.
start_counts = commute.group("Start Station")
starts = start_counts.sort("count", descending=True)
starts

In [None]:
# Cross-tabulate the commuter trips: the values of 'Start Station' become the
# columns, the values of 'End Station' become the rows, and each cell counts
# the trips for that start -> end combination.
pivot = commute.pivot(columns="Start Station", rows="End Station")
pivot

In [None]:
# It's easier to interpret this graphically - the heat plot
plots.rcParams['figure.figsize'] = (11., 11.)  # make a square plot
plots.figure()
# pandas computation of the same pivot table. The first crosstab argument
# becomes the rows and the second the columns, so End Station goes first to
# match `pivot` (one row per end station, one column per start station).
ct = pd.crosstab(commute['End Station'], commute['Start Station'])
plots.grid(False)
# pcolor draws the frame's columns along x and its rows along y, so with the
# orientation above x is the start station and y is the end station.
plots.pcolor(ct)    # plot that dataframe as color spectrum
plots.xlabel('Start Station Index')
plots.ylabel('End Station Index')
plots.show()

In [None]:
# With an area of interest picked out, read the values straight from the pivot table.
# Slices include the start index and exclude the stop index.
row_50 = pivot.row(50)
row_50[:1], row_50[60:71]  # the row's first entry, then columns 60 through 70 (upper right)

In [None]:
# Keep only the three columns needed to study trip durations between stations.
duration = trips.select("Start Station", "End Station", "Duration")
duration

In [None]:
# For every (start station, end station) combination, collapse the remaining
# column with min, i.e. keep the shortest duration seen on that route.
route_keys = ['Start Station', 'End Station']
shortest = duration.group(route_keys, min)
shortest

In [None]:
# Routes whose start station name contains 'Civic Center BART', quickest first.
starts_at_civic_center = are.containing('Civic Center BART')
routes_from_cc = shortest.where('Start Station', starts_at_civic_center)
from_cc = routes_from_cc.sort('Duration min')
from_cc

## Maps

In [None]:
# Load the table of station locations.
stations = Table.read_table("station.csv")
stations  # 'landmark' is the town containing the station

In [None]:
# Put a marker on the map for every station.
station_points = stations.select("lat", "long", "name")
Marker.map_table(station_points)

In [None]:
# Restrict to the stations whose landmark is San Francisco and draw them as green circles.
sf = stations.where("landmark", are.equal_to("San Francisco"))
sf_points = sf.select("lat", "long", "name")
Circle.map_table(sf_points, area=100, color="green")

In [None]:
# Define colors for the various areas ("landmarks").
# group('landmark') yields one row per distinct landmark; the palette is
# cycled (np.resize repeats or truncates it) to exactly that many rows, so the
# cell keeps working when the data does not contain exactly five landmarks.
palette = make_array('blue', 'red', 'green', 'orange', 'purple')
landmark_counts = stations.group('landmark')
colors = landmark_counts.with_column(
    'color', np.resize(palette, landmark_counts.num_rows))
colors

In [None]:
# Attach each station's landmark color, then map the color-coded markers.
stations_with_color = stations.join('landmark', colors)
colored = stations_with_color.select('lat', 'long', 'name', 'color')
Marker.map_table(colored)

In [None]:
# Attach the per-station start counts to the station locations by matching
# the 'name' column of stations against the 'Start Station' column of starts.
station_starts = stations.join("name", starts, "Start Station")
station_starts

In [None]:
# Show how many trips start from each location.
start_circles = station_starts.select('lat', 'long', 'name').with_columns(
    'color', 'blue',                                # every circle is drawn in blue
    'area', 0.1 * station_starts.column('count'),   # circle area scales with the number of starts
)
Circle.map_table(start_circles)

In [None]:
# Shortest-duration routes whose start station name contains the Ferry Building station name.
ferry_building = 'Harry Bridges Plaza (Ferry Building)'
from_ft = shortest.where('Start Station', are.containing(ferry_building))

In [None]:
# Look up each destination's station record by matching 'End Station' to the station 'name'.
from_ft_dest = from_ft.join('End Station', stations, 'name')
display(from_ft_dest)

In [None]:
# Where do rentals from the Ferry Building end up? One circle per destination station.
ft_destinations = from_ft_dest.select("lat", "long", "End Station")
Circle.map_table(ft_destinations, area=100)

In [None]:
# Some exercises:
# Where do trips that start in the San Jose landmark end up?

In [None]:
# first create a table with start station and landmark
starts = shortest.join("Start Station", stations, "name")
starts

In [None]:
# keep the San Jose landmark
sjstarts = starts.where(starts['landmark'] == "San Jose")
sjstarts

In [None]:
# finally, group and display these
sjstarts.group("End Station").sort('End Station')  # alphabetize

In [None]:
# Where do the longest commuter trips start? End?
# How do you want to display that information?

In [None]:
# A straightforward way to do this: narrow the "commuters" selection from the
# top of the notebook to durations above 1750 (and still below 1800), then
# rerun the notebook!
commute = (
    trips
    .where('Duration', are.below(1800))
    .where('Duration', are.above(1750))
)
commute.hist('Duration')
commute


In [None]:
# "group" to find starts:
# count the selected trips per start station, most frequent first
commute.group('Start Station').sort('count', descending=True)

In [None]:
# "group" to find ends:
# count the selected trips per end station, most frequent first
commute.group('End Station').sort('count', descending=True)

In [None]:
# The pivot table for this narrow selection shows lots of empty cells
# (start stations as columns, end stations as rows).
commute.pivot(columns='Start Station', rows='End Station')

In [None]:
# so remake the heat plot
plots.rcParams['figure.figsize'] = (11., 11.)  # make a square plot
plots.figure()
# pandas computation of the same pivot table. The first crosstab argument
# becomes the rows and the second the columns, so End Station goes first to
# match commute.pivot('Start Station', 'End Station'), which has one row per
# end station and one column per start station.
ct = pd.crosstab(commute['End Station'], commute['Start Station'])
plots.grid(False)
# pcolor draws the frame's columns along x and its rows along y, so with the
# orientation above x is the start station and y is the end station.
plots.pcolor(ct)    # plot that dataframe as color spectrum
plots.xlabel('Start Station Index')
plots.ylabel('End Station Index')
plots.show()

In [None]:
# Make a table of locations of interest to you, for example
# (search e.g. "krakow latitude longitude" for the values).
local_labels = ['name', 'lat', 'long', 'landmark']
local_rows = [
    ['Downtown', 50.0647, 19.9450, 'Krakow'],
    ['Hotel', 50.0699, 19.8975, 'Krakow'],
]
local = Table(local_labels).with_rows(local_rows)

In [None]:
# and make some plots. Maybe add another column for a 1-10 star rating and show that as circle size?

In [None]:
# Same locations of interest, now with an extra 'rating' column.
rated_labels = ['name', 'lat', 'long', 'landmark', 'rating']
rated_rows = [
    ['Downtown', 50.0647, 19.9450, 'Krakow', 9],
    ['Hotel', 50.0699, 19.8975, 'Krakow', 5],
]
local = Table(rated_labels).with_rows(rated_rows)

In [None]:
# Map the local places as blue circles whose area is the squared rating times 20.
ratings = local.column('rating')
local_circles = local.select('lat', 'long', 'name').with_columns(
    'color', 'blue',                  # every circle is drawn in blue
    'area', ratings * ratings * 20,   # circle size comes from the rating
)
Circle.map_table(local_circles)