In [1]:
from collections import OrderedDict

import pandas as pd
import numpy as np

import blaze as bz

from bokeh.sampledata.us_states import data as states
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.io import output_notebook, show

# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [2]:
# Data source:
# https://catalog.data.gov/dataset/impaired-driving-death-rate-by-age-and-gender-2012-all-states-587fd
# Impaired driving death rate by age and gender, 2012.
# Rate of deaths by age/gender (per 100,000 population) for people killed in
# crashes involving a driver with BAC >= 0.08%, 2012. Source:
# Fatality Analysis Reporting System (FARS). Note: Blank cells indicate data are
# suppressed. Fatality rates based on fewer than 20 deaths are suppressed.
s1 = 'https://data.cdc.gov/api/views/ebbj-sh54/rows.csv?accessType=DOWNLOAD'
# Wrap the remote CSV in a blaze data object; later cells query it through `d`.
d = bz.data(s1)

The `data` interface automatically downloads and caches the `*.CSV` file to a temporary location on disk:
  `bz.data('https://data.cdc.gov/api/views/ebbj-sh54/rows.csv?accessType=DOWNLOAD')`
  
This interactive interface supports data exploration: blaze queries the cached file and prints the result each time an operation such as `head()`, `nrows`, or `sort()` is executed.

Q: When does the temporary file get cleaned up? Does blaze do this automatically?

In [3]:
# Preview the first rows of the dataset.
d.head()

Unnamed: 0,State,Ages 0-20,Ages 21-34,Ages 35+,All Ages,Male,Female,Location
0,Maine,,,2.8,3.8,6.6,,"Maine\n(45.25422910300006, -68.98502586699993)"
1,Kentucky,1.8,7.2,3.7,4.0,6.5,1.5,"Kentucky\n(37.64597313300004, -84.77496771599994)"
2,Louisiana,2.8,8.6,5.1,5.2,8.3,2.4,"Louisiana\n(31.312663324000027, -92.4456775049..."
3,Florida,1.2,8.0,3.3,3.7,5.7,1.7,"Florida\n(28.932042722000062, -81.92895768899996)"
4,New Jersey,,4.6,1.4,1.8,2.9,0.9,"New Jersey\n(40.130570807000026, -74.273687969..."
5,District of Columbia,,,,,,,"District of Columbia\n(38.89037258400003, -77...."
6,North Dakota,,15.3,12.0,11.3,17.4,,"North Dakota\n(47.475318609000055, -100.118427..."
7,New Hampshire,,,2.9,2.4,4.1,,"New Hampshire\n(43.65595283500005, -71.5003569..."
8,Minnesota,,3.7,2.2,2.1,3.2,1.2,"Minnesota\n(46.35564896600005, -94.79419983199..."
9,South Carolina,2.8,14.2,7.6,7.6,12.3,3.2,"South Carolina\n(33.99882216300006, -81.045365..."


In [4]:
# Five rows with the highest 'Male' rate, restricted to the summary columns.
ranked_by_male = d.sort('Male', ascending=False)
ranked_by_male[['State', 'Male', 'Female', 'All Ages']].head(5)

Unnamed: 0,State,Male,Female,All Ages
6,North Dakota,17.4,,11.3
49,Montana,14.9,4.0,9.4
9,South Carolina,12.3,3.2,7.6
42,Wyoming,11.3,,7.1
17,Mississippi,9.8,2.7,6.1


In [5]:
# Five rows with the highest 'Female' rate, restricted to the summary columns.
ranked_by_female = d.sort('Female', ascending=False)
ranked_by_female[['State', 'Male', 'Female', 'All Ages']].head(5)

Unnamed: 0,State,Male,Female,All Ages
49,Montana,14.9,4.0,9.4
9,South Carolina,12.3,3.2,7.6
36,New Mexico,6.9,2.9,4.8
30,Alabama,8.4,2.9,5.5
17,Mississippi,9.8,2.7,6.1


In [6]:
# Five rows with the highest 'Ages 0-20' rate, shown next to the overall figures.
ranked_by_youth = d.sort('Ages 0-20', ascending=False)
ranked_by_youth[['State', 'Ages 0-20', 'Male', 'Female', 'All Ages']].head(5)

Unnamed: 0,State,Ages 0-20,Male,Female,All Ages
9,South Carolina,2.8,12.3,3.2,7.6
2,Louisiana,2.8,8.3,2.4,5.2
30,Alabama,2.7,8.4,2.9,5.5
17,Mississippi,2.6,9.8,2.7,6.1
34,Kansas,2.3,6.0,,3.4


In [7]:
# Sum the 'Male' column over every row of the dataset.
d.Male.sum()

In [8]:
# Sum the 'Female' column over every row of the dataset.
d.Female.sum()

In [9]:
# Add a derived `ratio` column: the 'Female' rate divided by the 'Male' rate.
# NOTE(review): this rebinds `d`, so re-running the cell applies the merge to
# an expression that already carries `ratio` -- presumably an error or a
# duplicate column; restart the kernel before re-running. TODO confirm.
d = bz.merge(d, ratio=d.Female/d.Male)

In [10]:
# Evaluate the blaze expression into a concrete result; the cells below
# treat `df` as a pandas DataFrame.
df = bz.compute(d)
df.head()

Unnamed: 0,State,Ages 0-20,Ages 21-34,Ages 35+,All Ages,Male,Female,Location,ratio
0,Maine,,,2.8,3.8,6.6,,"Maine\n(45.25422910300006, -68.98502586699993)",
1,Kentucky,1.8,7.2,3.7,4.0,6.5,1.5,"Kentucky\n(37.64597313300004, -84.77496771599994)",0.230769
2,Louisiana,2.8,8.6,5.1,5.2,8.3,2.4,"Louisiana\n(31.312663324000027, -92.4456775049...",0.289157
3,Florida,1.2,8.0,3.3,3.7,5.7,1.7,"Florida\n(28.932042722000062, -81.92895768899996)",0.298246
4,New Jersey,,4.6,1.4,1.8,2.9,0.9,"New Jersey\n(40.130570807000026, -74.273687969...",0.310345


In [11]:
# Map each state's full name (as it appears in the 'State' column) to the
# key used by bokeh's `states` sample data.
name_abv = {val['name']: abv for abv, val in states.items()}

# The 'United States' row is set aside in `us` and removed before the
# per-state outline lookup (presumably it is the national aggregate).
is_national = df['State'] == 'United States'
us = df[is_national]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the computed result (avoids SettingWithCopyWarning).
df = df[~is_national].copy()

# Attach each state's boundary coordinates; one list of points per row.
outlines = [states[name_abv[name]] for name in df['State']]
df['lons'] = [outline["lons"] for outline in outlines]
df['lats'] = [outline["lats"] for outline in outlines]

colors = ["#F1EEF6", "#D4B9DA", "#C994C7", "#DF65B0", "#DD1C77"]
c_vals = np.linspace(df['All Ages'].min(), df['All Ages'].max(), 5)

# Vectorised colour lookup. For each rate, pick the index of the first edge
# in `c_vals` that is >= the rate -- the same mapping as
# np.argmin(rate > c_vals) applied row by row.
# NOTE(review): with five edges the lightest colour is only assigned to rows
# equal to the minimum; the other four colours split the remaining range.
rates = df['All Ages'].values.astype(float)
has_rate = ~np.isnan(rates)
bin_ix = np.searchsorted(c_vals, rates[has_rate], side='left')
bin_ix = np.minimum(bin_ix, len(colors) - 1)  # guard against an out-of-range index

# Rows with a suppressed (NaN) rate stay white.
color_column = np.full(len(df), "#FFFFFF", dtype=object)
color_column[has_rate] = np.asarray(colors, dtype=object)[bin_ix]
df['color'] = color_column

In [12]:
# Separate Alaska and Hawaii from the remaining rows; each group is plotted
# on its own figure.
is_alaska_or_hawaii = df['State'].isin(['Alaska', 'Hawaii'])
c_df = df[~is_alaska_or_hawaii]
nc_df = df[is_alaska_or_hawaii]

In [13]:
# Wrap each frame in a ColumnDataSource so the glyphs and the hover tool can
# refer to columns by name.
c_source, nc_source = (
    ColumnDataSource(data=frame) for frame in (c_df, nc_df)
)

In [14]:
# Toolbar tools shared by both map figures.
# NOTE(review): the "resize" tool may not be available in newer bokeh
# releases -- confirm against the installed version.
TOOLS = "pan,wheel_zoom,box_zoom,reset,resize,hover"

# Figure for the states other than Alaska and Hawaii.
p = figure(
    tools=TOOLS,
    toolbar_location="left",
    title="US Impaired Driving Death Rate 2012",
    plot_width=1000,
    plot_height=600,
)

In [15]:
# One filled polygon per row of c_source, coloured by the precomputed
# 'color' column.
p.patches(
    "lons", "lats",
    source=c_source,
    fill_color="color", fill_alpha=0.7,
    line_color="#884444", line_width=2, line_alpha=0.3,
)

# Configure the hover tool that was requested through TOOLS.
hover = p.select({"type": HoverTool})
tooltip_fields = OrderedDict()
tooltip_fields["State"] = "@State"
tooltip_fields["All Ages"] = "@{All Ages}"
tooltip_fields["F/M Ratio"] = "@ratio"
hover.tooltips = tooltip_fields

show(p)

In [16]:
# Second figure: the Alaska and Hawaii rows held in nc_source.
p2 = figure(
    tools=TOOLS,
    toolbar_location="left",
    title="US Impaired Driving Death Rate 2012",
    plot_width=1000,
    plot_height=500,
)
p2.patches(
    "lons", "lats",
    source=nc_source,
    fill_color="color", fill_alpha=0.7,
    line_color="#884444", line_width=2, line_alpha=0.3,
)

# Same tooltip layout as the first map.
hover = p2.select({"type": HoverTool})
tooltip_pairs = (
    ("State", "@State"),
    ("All Ages", "@{All Ages}"),
    ("F/M Ratio", "@ratio"),
)
hover.tooltips = OrderedDict(tooltip_pairs)

show(p2)