In [1]:
# Notebook setup: imports, the data location, and Bokeh's notebook output mode.
import pandas as pd
from tqdm import tqdm
from os.path import join as oj
# Directory holding the AHRF 2018-2019 data, relative to this notebook;
# later cells read from its 'processed' subdirectory.
data_dir = '../data/hrsa/data_AHRF_2018-2019/'
import re
import numpy as np
from bokeh.sampledata import us_states, us_counties
from bokeh.plotting import figure, show, output_notebook
from bokeh import palettes
from bokeh.models import ColorBar,HoverTool,LinearColorMapper,ColumnDataSource,FixedTicker
# Configure Bokeh so that show() renders figures inline in notebook cells.
output_notebook()

# analyze data

In [2]:
# Load the preprocessed county table and give the placeholder 'Blank'
# column a meaningful name in a single chained expression.
df = (
    pd.read_pickle(oj(data_dir, 'processed', 'df_renamed.pkl'))
    .rename(columns={'Blank': 'id'})
)
# Preview the first rows.
df.head()

Unnamed: 0,id,Header-FIPSStandCtyCode,EntityofFile,SecondaryEntityOfFile,DateofFile,DateofCreation,FileLength,StateName,StateNameAbbreviation,CountyName,...,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2014,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2013,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2012,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2011,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2010,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2009,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2008,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2007,Daysw/8-hrAvgOzoneovrNAAQSMaximum8-hourAverageOzone2006,ElevationFeet1976
0,,1001,AHRF,1001,2019,19212.0,31661.0,Alabama,AL,Autauga,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,3.0,290.0
1,,1003,AHRF,1003,2019,19212.0,31661.0,Alabama,AL,Baldwin,...,2.0,1.0,0.0,2.0,2.0,1.0,2.0,7.0,14.0,155.0
2,,1005,AHRF,1005,2019,19212.0,31661.0,Alabama,AL,Barbour,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,220.0
3,,1007,AHRF,1007,2019,19212.0,31661.0,Alabama,AL,Bibb,...,0.0,0.0,0.0,0.0,2.0,0.0,2.0,12.0,7.0,224.0
4,,1009,AHRF,1009,2019,19212.0,31661.0,Alabama,AL,Blount,...,0.0,0.0,2.0,1.0,1.0,0.0,4.0,10.0,7.0,870.0


# make interactive plots

In [3]:
def _column_key(name):
    """Strip every non-word character so the name can be used as a Bokeh ``@field``."""
    return re.sub(r"[^\w]", "", name)


def _normalize_fips(code):
    """Return a county FIPS code as a zero-padded 5-character string.

    ``1001``, ``1001.0``, ``"1001"`` and ``"01001"`` all become ``"01001"``.
    Values that are not numeric are stringified, so they simply never match.
    """
    try:
        return str(int(float(code))).zfill(5)
    except (TypeError, ValueError, OverflowError):
        return str(code).strip().zfill(5)


def _bin_index(value, bin_edges):
    """Return the index of the largest lower bin edge that does not exceed ``value``.

    Ties resolve to the first such edge; 0 is returned if no edge qualifies.
    """
    eligible = [i for i, edge in enumerate(bin_edges) if value >= edge]
    if not eligible:
        return 0
    return max(eligible, key=lambda i: bin_edges[i])


def plot_counties(df, variable_to_distribute, variables_to_display, state=None):
    """Draw a county choropleth of one variable, with extra variables shown on hover.

    Params
    ------
    df
        County-level data frame. It must contain a "Header-FIPSStandCtyCode"
        column and every column named by the two arguments below. The color
        scale spans the min/max of ``variable_to_distribute`` within ``df``,
        so pass a pre-filtered frame to get a per-state scale.
    variable_to_distribute
        Column whose value determines each county's fill color.
    variables_to_display
        Columns shown in the hover tooltip for each county. The list is not
        modified; ``variable_to_distribute`` is skipped if it appears in it
        because it is always displayed.
    state
        State abbreviation such as 'NY' (any case), matched against the
        "state" field of Bokeh's county sample data. ``None`` keeps every
        county in the sample data.

    Returns
    -------
    Bokeh figure with the county patches, hover tool and color bar attached.

    Notes
    -----
    Counties that are missing from ``df`` or whose value is not strictly
    positive are drawn gray and report 0.0 for every variable.
    """
    from bokeh.sampledata.us_counties import data as counties

    missing_color = "#A9A9A9"

    # Only filter when a state was requested; the original called
    # state.lower() unconditionally and crashed on the default state=None.
    if state is not None:
        wanted_state = state.lower()
        counties = {
            code: county for code, county in counties.items()
            if county["state"] == wanted_state
        }

    county_xs = [county["lons"] for county in counties.values()]
    county_ys = [county["lats"] for county in counties.values()]

    # Work on a filtered copy instead of calling .remove() on the caller's list.
    display_vars = [vd for vd in variables_to_display if vd != variable_to_distribute]

    colors = palettes.viridis(4)
    min_value = df[variable_to_distribute].min()
    max_value = df[variable_to_distribute].max()
    gran = (max_value - min_value) / float(len(colors))
    # Lower edge of each color bin; also used as the color-bar tick positions.
    index_range = [min_value + x * gran for x in range(len(colors))]

    dist_key = _column_key(variable_to_distribute)
    display_keys = [_column_key(vd) for vd in display_vars]

    # Build the FIPS -> row-position lookup once instead of rescanning the
    # frame for every county. Both sides are normalized to a 5-character
    # zero-padded string so integer or unpadded codes (e.g. 1001) still match
    # Bokeh's (state_id, county_id) keys. The first row wins on duplicates.
    row_of_fips = {}
    for pos, code in enumerate(df["Header-FIPSStandCtyCode"].values):
        row_of_fips.setdefault(_normalize_fips(code), pos)
    dist_values = df[variable_to_distribute].values
    display_values = [df[vd].values for vd in display_vars]

    variable_dictionary = {}
    variable_dictionary["county_names"] = [county['name'] for county in counties.values()]
    variable_dictionary["x"] = county_xs
    variable_dictionary["y"] = county_ys
    variable_dictionary[dist_key] = []
    for key in display_keys:
        variable_dictionary[key] = []

    county_colors = []
    for county_id in counties:
        fips = str(county_id[0]).zfill(2) + str(county_id[1]).zfill(3)
        pos = row_of_fips.get(fips)
        temp_var = dist_values[pos] if pos is not None else None

        if temp_var is not None and temp_var > 0.0:
            variable_dictionary[dist_key].append(temp_var)
            for key, values in zip(display_keys, display_values):
                variable_dictionary[key].append(round(float(values[pos]), 2))
            county_colors.append(colors[_bin_index(temp_var, index_range)])
        else:
            # No usable value: gray fill and zeros so every column keeps
            # one entry per county.
            variable_dictionary[dist_key].append(0.0)
            for key in display_keys:
                variable_dictionary[key].append(0.0)
            county_colors.append(missing_color)

    variable_dictionary["color"] = county_colors
    source = ColumnDataSource(data=variable_dictionary)
    TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"

    mapper = LinearColorMapper(palette=colors, low=min_value, high=max_value)

    color_bar = ColorBar(color_mapper=mapper, location=(0, 0), orientation='horizontal',
                         title=variable_to_distribute, ticker=FixedTicker(ticks=index_range))

    # NOTE(review): newer Bokeh releases renamed plot_width/plot_height to
    # width/height — confirm against the installed version.
    p = figure(title=variable_to_distribute, toolbar_location="left", tools=TOOLS,
               plot_width=1100, plot_height=700, x_axis_location=None, y_axis_location=None)

    p.patches('x', 'y', source=source, fill_alpha=0.7, fill_color='color',
              line_color="#884444", line_width=2)

    hover = p.select_one(HoverTool)
    hover.point_policy = "follow_mouse"
    tool_tips = [("County ", "@county_names")]
    for key in variable_dictionary.keys():
        if key not in ["x", "y", "color", "county_names"]:
            # Keys were already sanitized when the dictionary was built.
            tool_tips.append((key, "@" + key + "{1.11}"))
    hover.tooltips = tool_tips

    p.add_layout(color_bar, 'below')

    return p

In [None]:
# filter by state
from bokeh.plotting import figure, output_file, save
state = 'NY'
d = df[df["StateNameAbbreviation"] == state]

# Color counties by 2010 census population; the listed variables appear on hover.
p = plot_counties(
    d,
    variable_to_distribute="CensusPopulation2010",
    variables_to_display=[
        "UnemploymentRate,16+2014",
        "PerCapitaPersonalIncome2014",
        "%<65withoutHealthInsurance2014",
    ],
    state=state,
)
show(p)
# Name the saved map after the selected state so that changing `state`
# above does not overwrite another state's HTML file.
output_file("../results/" + state + ".html", mode='inline')
save(p)

# identify useful predictors

In [12]:
# Print every column name in alphabetical order, one per line, to scan for
# candidate predictors.
ks = sorted(df.columns)
for k in ks:
    print(k)

#16+UnpaidFamilyWorkers2011-15
#16+UnpaidFamilyWorkers2013-17
#16+Workers,Carpool2011-15
#16+Workers,Carpool2013-17
#16+Workers,DriveAlone2011-15
#16+Workers,DriveAlone2013-17
#16+Workers,MeanTravelTime2011-15
#16+Workers,MeanTravelTime2013-17
#16+Workers,PublicTrans2011-15
#16+Workers,PublicTrans2013-17
#16+Workers,WalktoWork2011-15
#16+Workers,WalktoWork2013-17
#16+Workers,WorkatHome2011-15
#16+Workers,WorkatHome2013-17
#16+Workers10-14mintoWork2011-15
#16+Workers10-14mintoWork2013-17
#16+Workers15-19mintoWork2011-15
#16+Workers15-19mintoWork2013-17
#16+Workers20-29mintoWork2011-15
#16+Workers20-29mintoWork2013-17
#16+Workers30-44mintoWork2011-15
#16+Workers30-44mintoWork2013-17
#16+Workers45-59mintoWork2011-15
#16+Workers45-59mintoWork2013-17
#16+Workers5-9mintoWork2011-15
#16+Workers5-9mintoWork2013-17
#16+Workers60-89mintoWork2011-15
#16+Workers60-89mintoWork2013-17
#16+Workers90+mintoWork2011-15
#16+Workers90+mintoWork2013-17
#16+Workers<5mintoWork2011-15
#16+Workers<5mintoWork20