### Visualization of collected datasets
Before running this notebook:
- Install `plotly` and `pyshp`
- Add Plotly API credentials to `config.py`

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
#import geopandas as gpd
#import seaborn as sns
import plotly.plotly as py
import plotly.figure_factory as ff
import numpy as np

In [2]:
# Plotly API: save the credentials taken from config.py, then sign in with them
import plotly
from config import plotly_api_key, plotly_username

plotly_credentials = {"username": plotly_username, "api_key": plotly_api_key}
plotly.tools.set_credentials_file(**plotly_credentials)
py.sign_in(**plotly_credentials)

### Hospital Survey Data

In [3]:
# Load the HCAHPS patient-survey file and preview its first rows
survey_csv = "../Resources/Patient_survey__HCAHPS__-_Hospital.csv"
df = pd.read_csv(survey_csv)
df.head()

Unnamed: 0,Provider ID,Hospital Name,Address,City,State,ZIP Code,County Name,Phone Number,HCAHPS Measure ID,HCAHPS Question,...,HCAHPS Answer Percent,HCAHPS Answer Percent Footnote,HCAHPS Linear Mean Value,Number of Completed Surveys,Number of Completed Surveys Footnote,Survey Response Rate Percent,Survey Response Rate Percent Footnote,Measure Start Date,Measure End Date,Location
0,240018,MAYO CLINIC HEALTH SYSTEM - RED WING,"701 HEWITT BOULEVARD, PO BOX 95",RED WING,MN,55066,GOODHUE,6512675000,H_COMP_3_LINEAR_SCORE,Staff responsiveness - linear mean score,...,Not Applicable,,93,347,,33,,07/01/2016,06/30/2017,"701 HEWITT BOULEVARD, PO BOX 95\nRED WING, MN\..."
1,231334,PROMEDICA HERRICK HOSPITAL,500 E POTTAWATAMIE STREET,TECUMSEH,MI,49286,LENAWEE,5174243000,H_QUIET_HSP_SN_P,Patients who reported that the area around the...,...,Not Available,1 - The number of cases/patients is too few to...,Not Applicable,Not Available,1 - The number of cases/patients is too few to...,Not Available,1 - The number of cases/patients is too few to...,07/01/2016,06/30/2017,"500 E POTTAWATAMIE STREET\nTECUMSEH, MI\n(42.0..."
2,231332,BRONSON LAKEVIEW HOSPITAL,408 HAZEN STREET,PAW PAW,MI,49079,VAN BUREN,2696571400,H_COMP_1_A_P,"Patients who reported that their nurses ""Alway...",...,85,11 - There were discrepancies in the data coll...,Not Applicable,130,11 - There were discrepancies in the data coll...,30,11 - There were discrepancies in the data coll...,07/01/2016,06/30/2017,"408 HAZEN STREET\nPAW PAW, MI\n(42.221009, -85..."
3,240010,MAYO CLINIC HOSPITAL ROCHESTER,1216 SECOND STREET SOUTHWEST,ROCHESTER,MN,55902,OLMSTED,5072555123,H_RECMND_PY,"Patients who reported YES, they would probably...",...,15,,Not Applicable,708,,38,,07/01/2016,06/30/2017,"1216 SECOND STREET SOUTHWEST\nROCHESTER, MN\n(..."
4,231330,MARLETTE REGIONAL HOSPITAL,2770 MAIN STREET,MARLETTE,MI,48453,SANILAC,9896354000,H_COMP_5_LINEAR_SCORE,Communication about medicines - linear mean score,...,Not Applicable,,76,119,,38,,07/01/2016,06/30/2017,"2770 MAIN STREET\nMARLETTE, MI\n(43.332579, -8..."


In [4]:
# Survey rows grouped by (state, county)
grouper = df.groupby(["State", "County Name"])

# Count the distinct hospital names in each county, then tidy the column
# names and row order in one chain
hospital = (
    grouper["Hospital Name"]
    .nunique()
    .reset_index()
    .rename(columns={"Hospital Name": "Hospital", "County Name": "County"})
    .sort_values(by=["State", "County"])
)
# County names are title-cased here (the raw file has them in upper case)
hospital["County"] = hospital["County"].map(str.title)

hospital.head()

Unnamed: 0,State,County,Hospital
0,AK,Anchorage,3
1,AK,Bethel,1
2,AK,Dillingham,1
3,AK,Fairbanks North Star,1
4,AK,Juneau,1


In [5]:
# Clean up survey data: de-duplicated (state, county, hospital, count) rows
# with the survey count stored as an integer
hospital_keys = ["State", "County Name", "Hospital Name"]
survey_count_col = "Number of Completed Surveys"

df_survey = (
    df[hospital_keys + [survey_count_col]]
    .sort_values(by=hospital_keys)
    .drop_duplicates()
)

# Rows whose count is reported as "Not Available" are left out
has_count = df_survey[survey_count_col] != "Not Available"
df_survey = df_survey.loc[has_count]

# "FEWER THAN 50" is replaced by 50 so the whole column can be cast to int
int_counts = df_survey[survey_count_col].replace({"FEWER THAN 50": 50}).astype(int)
df_survey = df_survey.assign(**{survey_count_col: int_counts}).reset_index(drop=True)
df_survey.head()

Unnamed: 0,State,County Name,Hospital Name,Number of Completed Surveys
0,AK,ANCHORAGE,ALASKA NATIVE MEDICAL CENTER,946
1,AK,ANCHORAGE,ALASKA REGIONAL HOSPITAL,995
2,AK,ANCHORAGE,PROVIDENCE ALASKA MEDICAL CENTER,1734
3,AK,BETHEL,YUKON KUSKOKWIM DELTA REG HOSPITAL,265
4,AK,FAIRBANKS NORTH STAR,FAIRBANKS MEMORIAL HOSPITAL,666


In [6]:
# Cleaned survey rows grouped by (state, county)
grouper_survey = df_survey.groupby(["State", "County Name"])

# Total completed surveys per county, with tidy column names and row order
survey = (
    grouper_survey["Number of Completed Surveys"]
    .sum()
    .reset_index()
    .rename(columns={"County Name": "County"})
    .sort_values(by=["State", "County"])
)
# Same title-casing as used for the hospital counts
survey["County"] = survey["County"].map(str.title)

survey.head(10)

Unnamed: 0,State,County,Number of Completed Surveys
0,AK,Anchorage,3675
1,AK,Bethel,265
2,AK,Fairbanks North Star,666
3,AK,Juneau,270
4,AK,Ketchikan Gateway,123
5,AK,Kodiak Island,89
6,AK,Matanuska Susitna,633
7,AK,Nome,61
8,AK,Sitka,50
9,AL,Autauga,579


In [7]:
# Combine the per-county hospital counts with the per-county survey totals;
# only (state, county) pairs present in both tables are kept
hospital_survey = hospital.merge(survey, on=["State", "County"], how="inner")
# Hyphens in county names become spaces
hospital_survey["County"] = hospital_survey["County"].str.replace("-", " ")
hospital_survey.head()

Unnamed: 0,State,County,Hospital,Number of Completed Surveys
0,AK,Anchorage,3,3675
1,AK,Bethel,1,265
2,AK,Fairbanks North Star,1,666
3,AK,Juneau,1,270
4,AK,Ketchikan Gateway,1,123


In [8]:
# Write the merged hospital/survey table to disk, without the index column
hospital_survey_csv = "../Results/Map/hospital_survey.csv"
hospital_survey.to_csv(hospital_survey_csv, index=False, header=True)

### FIPS Data
FIPS code 2016, collected from [United States Census Bureau](https://www.census.gov/geographies/reference-files/2016/demo/popest/2016-fips.html)

In [9]:
# FIPS by County: the file has no header row, and every column is read as
# text so the code parts can be concatenated as strings
fips = pd.read_csv(
    "../Resources/county_fips.csv",
    dtype=str,
    header=None,
    names=["State", "fips1", "fips2", "County", "col5"],
)

# Designators removed from the title-cased county names, applied in sequence
county_designators = [" County", " Borough", " Census Area", " Municipality",
                      " City And", "/Census Area"]


def _bare_county_name(raw):
    """Return the stripped, title-cased name with each designator removed."""
    name = str(raw).strip().title()
    for designator in county_designators:
        name = name.replace(designator, "")
    return name


fips["County"] = fips["County"].apply(_bare_county_name)
# The FIPS code is the two code columns joined as text
fips["FIPS"] = fips["fips1"] + fips["fips2"]
fips = fips[["State", "County", "FIPS"]].sort_values(by=["State", "County"])
fips.head()

Unnamed: 0,State,County,FIPS
67,AK,Aleutians East,2013
68,AK,Aleutians West,2016
69,AK,Anchorage,2020
70,AK,Bethel,2050
71,AK,Bristol Bay,2060


In [10]:
# Correct for mismatch of county expression between fips and pop.
# The three lists are parallel: entry k names a state, the county spelling
# currently in fips, and the spelling it is replaced with.
state_mis = ["IN", "IN", "LA", "NM", "PA"]
county_mis = ["Dekalb", "Laporte", "La Salle Parish", "De Baca", "Mckean"]
county_cor = ["De Kalb", "La Porte", "Lasalle Parish", "Debaca", "Mc Kean"]

for state, wrong_name, right_name in zip(state_mis, county_mis, county_cor):
    in_state = fips["State"] == state
    misspelled = fips["County"] == wrong_name
    # Only the first matching row is renamed; IndexError if there is none
    first_hit = fips[in_state & misspelled].index[0]
    fips.loc[first_hit, "County"] = right_name


In [11]:
# Write the cleaned FIPS lookup table to disk, without the index column
fips_csv = "../Results/Map/fips.csv"
fips.to_csv(fips_csv, index=False, header=True)

### Population and Mortality Data

In [12]:
# Population and Mortality by County
pop = pd.read_csv("../Resources/mortality_2016_final.csv")
# Everything except the first column is kept
pop = pop[pop.columns[1:]]

# Normalise the key columns: trimmed state code, trimmed title-cased county
pop["State"] = pop["State"].map(lambda value: str(value).strip())
pop["County"] = pop["County"].map(lambda value: str(value).strip().title())

# Remove the county designators one after another
for designator in (" County", " Borough", " Census Area", " Municipality",
                   " City And", "/Census Area"):
    pop["County"] = pop["County"].str.replace(designator, "")

# A missing county name became the text "Nan" in the str()/title() step above;
# those rows, and rows with population "Missing", are dropped
usable = (pop["County"] != "Nan") & (pop["Population"] != "Missing")
pop = pop.loc[usable].sort_values(by=["State", "County"])
pop["Population"] = pop["Population"].astype(int)
pop.head()

Unnamed: 0,State,County,Deaths,Population,Crude Rate,Age Adjusted Rate,% of Total Deaths
67,AK,Aleutians East,12,3296,Unreliable,Unreliable,0.00%
68,AK,Aleutians West,22,5647,389.59,594.32,0.00%
69,AK,Anchorage,1724,298192,578.15,717.44,0.06%
70,AK,Bethel,122,17968,678.98,1146.51,0.00%
71,AK,Bristol Bay,Suppressed,898,Suppressed,Suppressed,Suppressed


In [13]:
# Write the cleaned population/mortality table to disk, without the index column
population_mortality_csv = "../Results/Map/population_mortality.csv"
pop.to_csv(population_mortality_csv, index=False, header=True)

### Land Area Data
Land area by county, collected from [United States Census Bureau](https://www.census.gov/support/USACdataDownloads.html#LND)

In [14]:
# Land Area by County: read everything as text so the STCOU codes stay
# strings, then convert only the area column to float
land_area = pd.read_csv("../Resources/county_land_area.csv", dtype=str)
land_area["LND110210D"] = land_area["LND110210D"].astype(float)

# Drop the row coded '00000' (presumably the national total - TODO confirm);
# state-level rows such as "ALABAMA" are still present afterwards
not_all_zero_code = land_area["STCOU"] != '00000'
land_area = land_area.loc[not_all_zero_code, ["Areaname", "STCOU", "LND110210D"]]

# Land area in square miles 2010
land_area = land_area.rename(columns={"LND110210D": "Land Area (Square Miles)",
                                      "STCOU": "FIPS"})
land_area.head()

Unnamed: 0,Areaname,FIPS,Land Area (Square Miles)
1,ALABAMA,1000,50645.33
2,"Autauga, AL",1001,594.44
3,"Baldwin, AL",1003,1589.78
4,"Barbour, AL",1005,884.88
5,"Bibb, AL",1007,622.58


In [15]:
# Write the land-area table to disk, without the index column
land_area_csv = "../Results/Map/land_area.csv"
land_area.to_csv(land_area_csv, index=False, header=True)

### Merging Population, FIPS, Land Area, and Hospital Survey
1. Population by FIPS (County)

In [16]:
# Attach a FIPS code to every population row; the left join keeps population
# rows that have no matching (state, county) in fips
pop_fips = pop.merge(fips, on=["State", "County"], how="left")
# Saved with the default settings, so the row index is written as well
pop_fips.to_csv("../Results/Map/merge_pop_fips.csv")
pop_fips.head()

Unnamed: 0,State,County,Deaths,Population,Crude Rate,Age Adjusted Rate,% of Total Deaths,FIPS
0,AK,Aleutians East,12,3296,Unreliable,Unreliable,0.00%,2013
1,AK,Aleutians West,22,5647,389.59,594.32,0.00%,2016
2,AK,Anchorage,1724,298192,578.15,717.44,0.06%,2020
3,AK,Bethel,122,17968,678.98,1146.51,0.00%,2050
4,AK,Bristol Bay,Suppressed,898,Suppressed,Suppressed,Suppressed,2060


2. Population per Land Area by FIPS (County)

In [17]:
# Add land area to the population/FIPS table (left join on the FIPS code)
pop_fips_area = pop_fips.merge(land_area, on=["FIPS"], how="left")

# Population density = population divided by land area in square miles
density = pop_fips_area["Population"].div(pop_fips_area["Land Area (Square Miles)"])
pop_fips_area["Population per Square Mile"] = density

# Saved with the default settings, so the row index is written as well
pop_fips_area.to_csv("../Results/Map/merge_pop_fips_area.csv")
pop_fips_area.head()

Unnamed: 0,State,County,Deaths,Population,Crude Rate,Age Adjusted Rate,% of Total Deaths,FIPS,Areaname,Land Area (Square Miles),Population per Square Mile
0,AK,Aleutians East,12,3296,Unreliable,Unreliable,0.00%,2013,"Aleutians East, AK",6981.94,0.472075
1,AK,Aleutians West,22,5647,389.59,594.32,0.00%,2016,"Aleutians West, AK",4390.28,1.286251
2,AK,Anchorage,1724,298192,578.15,717.44,0.06%,2020,"Anchorage, AK",1704.68,174.925499
3,AK,Bethel,122,17968,678.98,1146.51,0.00%,2050,"Bethel, AK",40570.0,0.442889
4,AK,Bristol Bay,Suppressed,898,Suppressed,Suppressed,Suppressed,2060,"Bristol Bay, AK",503.84,1.782312


3. Number of Completed Surveys by FIPS (County)

In [18]:
# Modify fips to have consistent county names as hospital_survey.
# Work on an independent copy: a plain `fips_for_hospital = fips` only creates
# a second name for the same DataFrame, so the renames below (and the repairs
# applied to fips_for_hospital in later cells) would also rewrite `fips` and
# change the result of re-running any earlier cell that reads it.
fips_for_hospital = fips.copy()
# Same normalisation as hospital_survey (hyphen -> space), plus spelling out
# the "St." abbreviation as "Saint"
fips_for_hospital["County"] = fips_for_hospital["County"].apply(
    lambda x: x.replace("-", " ").replace("St.", "Saint"))

In [19]:
# Trial merge of hospital_survey with FIPS, used to find counties without a match
tst_merge1 = hospital_survey.merge(fips_for_hospital, on=['State', 'County'], how='left')

# Rows that received no FIPS code, and the Louisiana county names among them
troublesome_county = tst_merge1.loc[tst_merge1["FIPS"].astype(str) == "nan"]
in_louisiana = troublesome_county["State"] == "LA"
troublesome_la_county = troublesome_county.loc[in_louisiana]["County"].tolist()

# Build the name each of those counties is expected to have in fips:
# "<name> Parish". Names containing "Saint" and the ones listed below keep
# their spacing; any other name has its spaces removed and is re-title-cased.
keeps_spacing = ["West Carroll", "De Soto", "East Baton Rouge", "East Carroll", "Jefferson Davis"]
fips_la_county = []
for name in troublesome_la_county:
    if ("Saint" in name) or (name in keeps_spacing):
        fips_la_county.append(name + " Parish")
    else:
        fips_la_county.append(name.replace(" ", "").title() + " Parish")

# Rename the matching fips row to the hospital_survey spelling.
# IndexError is raised if an expected parish name is not found.
is_la_row = fips_for_hospital["State"] == "LA"
for county, parish_name in zip(troublesome_la_county, fips_la_county):
    has_parish_name = fips_for_hospital["County"] == parish_name
    ind = fips_for_hospital.loc[is_la_row & has_parish_name].index.tolist()[0]
    fips_for_hospital.loc[ind, "County"] = county

In [20]:
# Second trial merge of hospital_survey with FIPS (after the Louisiana parish
# repair) to find the county names that still have no match
tst_merge2 = pd.merge(hospital_survey, fips_for_hospital, left_on = ['State', 'County'], 
                    right_on = ['State', 'County'], how = 'left')

# Edit fips_for_hospital to correct for mismatch between hospital_survey and fips_for_hospital
# Rows of the trial merge that received no FIPS code
troublesome = tst_merge2.loc[tst_merge2["FIPS"].astype(str)=="nan"]
# State of each unmatched row, in row order
state_cor = troublesome["State"].tolist()
# Hand-written list of the spellings currently used in fips_for_hospital.
# NOTE(review): it is paired with the unmatched rows purely by position, so it
# must contain one entry per unmatched row, in the same order as `troublesome`;
# any change in the input data that adds, removes or reorders unmatched rows
# silently mis-pairs the names or raises IndexError below.
county_err = ["Dekalb", "Desoto", "O'Brien", "De Witt", "Lasalle", "Saint Joseph", "Prince George'S",
             "Saint Mary'S", "Ste. Genevieve", "Mc Kean", "Dewitt", "Salem City"]
# Spelling used by hospital_survey for each unmatched row (the rename target)
county_cor = troublesome["County"].tolist()

# For the i-th unmatched row: find the fips_for_hospital row with the same state
# and the i-th name from county_err, then rename it to the hospital_survey spelling.
# `county_err[i]` raises IndexError if there are more unmatched rows than listed
# names; `.tolist()[0]` raises IndexError if the listed name is not found in that state.
for i, state in enumerate(state_cor):
    ind = fips_for_hospital.loc[(fips_for_hospital["State"]==state) & 
                          (fips_for_hospital["County"]==county_err[i])].index.tolist()[0]
    fips_for_hospital.loc[ind, "County"] = county_cor[i]

In [21]:
# Merge number of completed surveys and FIPS.
# The outer join keeps every row of both tables; fillna(0) then puts 0 in
# every cell left empty by the join (all columns, not only the counts).
merged_survey_fips = hospital_survey.merge(
    fips_for_hospital, on=['State', 'County'], how='outer')
hospital_survey_fips = merged_survey_fips.fillna(0)
# Saved with the default settings, so the row index is written as well
hospital_survey_fips.to_csv("../Results/Map/merge_survey_fips.csv")
hospital_survey_fips.head()

Unnamed: 0,State,County,Hospital,Number of Completed Surveys,FIPS
0,AK,Anchorage,3.0,3675.0,2020
1,AK,Bethel,1.0,265.0,2050
2,AK,Fairbanks North Star,1.0,666.0,2090
3,AK,Juneau,1.0,270.0,2110
4,AK,Ketchikan Gateway,1.0,123.0,2130


### Visualize County Population on Choropleth Maps
1. County Population

In [22]:
# Choropleth inputs: one population value per county FIPS code
fips_pop = pop_fips_area['FIPS'].tolist()
values = pop_fips_area['Population'].tolist()
scope = ["usa"]
# Five colors for the five bins delimited by the four endpoints below
colorscale = ['rgb(239,239,239)', 'rgb(195, 196, 222)', 'rgb(144,148,194)',
              'rgb(101,104,168)', 'rgb(65, 53, 132)']

choropleth_options = dict(
    fips=fips_pop,
    values=values,
    scope=scope,
    binning_endpoints=[1000, 10000, 100000, 1000000],
    colorscale=colorscale,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.1},
    state_outline={'width': 0},
    round_legend_values=True,
    legend_title='Population',
    title='Population by County',
)
fig = ff.create_choropleth(**choropleth_options)
py.iplot(fig, filename='population_by_county')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.





The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



2. County Population per Square Mile

In [23]:
# Choropleth inputs: one population-density value per county FIPS code
fips_pop = pop_fips_area['FIPS'].tolist()
values = pop_fips_area['Population per Square Mile'].tolist()
scope = ["usa"]
# Five colors for the five bins delimited by the four endpoints below
colorscale = ['rgb(239,239,239)', 'rgb(195, 196, 222)', 'rgb(144,148,194)',
              'rgb(101,104,168)', 'rgb(65, 53, 132)']

choropleth_options = dict(
    fips=fips_pop,
    values=values,
    scope=scope,
    binning_endpoints=[1, 10, 100, 1000],
    colorscale=colorscale,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.1},
    state_outline={'width': 0},
    round_legend_values=True,
    legend_title='Population per Square Mile',
    title='Population per Square Mile by County',
    geo=dict(projection=dict(type='albers usa')),
)
fig = ff.create_choropleth(**choropleth_options)
py.iplot(fig, filename='population_per_square_mile_by_county')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.





The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



### Visualize Hospital Survey Data
1. Number of Completed Surveys by County

In [24]:
# Choropleth inputs: completed-survey total per county FIPS code
fips_pop = hospital_survey_fips['FIPS'].tolist()
values = hospital_survey_fips['Number of Completed Surveys'].tolist()
scope = ["usa"]
# Six-step blue ramp, one color for each bin delimited by the five endpoints below
colorscale = ["#f7fbff", "#d2e3f3", "#9ecae1", "#57a0ce", "#2171b5", "#0b4083"]

choropleth_options = dict(
    fips=fips_pop,
    values=values,
    scope=scope,
    binning_endpoints=[1, 10, 100, 1000, 10000],
    colorscale=colorscale,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.1},
    state_outline={'width': 0},
    round_legend_values=True,
    legend_title='Number of Completed Surveys',
    title='Number of Completed Surveys by County',
    geo=dict(projection=dict(type='albers usa')),
)
fig = ff.create_choropleth(**choropleth_options)
py.iplot(fig, filename='number_of_completed_surveys_by_county')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unrecognized FIPS Values

Whoops! It looks like you are trying to pass at least one FIPS value that is not in our shapefile of FIPS and data for the counties. Your choropleth will still show up but these counties cannot be shown.
Unrecognized FIPS are: [74300]



The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



2. Number of Surveyed Hospitals by County

In [25]:
# Choropleth inputs: number of surveyed hospitals per county FIPS code
fips_pop = hospital_survey_fips['FIPS'].tolist()
values = hospital_survey_fips['Hospital'].tolist()
scope = ["usa"]
# Six-step blue ramp, one color for each bin delimited by the five endpoints below
colorscale = ["#f7fbff", "#d2e3f3", "#9ecae1", "#57a0ce", "#2171b5", "#0b4083"]

choropleth_options = dict(
    fips=fips_pop,
    values=values,
    scope=scope,
    binning_endpoints=[1, 5, 10, 20, 50],
    colorscale=colorscale,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.1},
    state_outline={'width': 0},
    round_legend_values=True,
    legend_title='Number of Surveyed Hospitals',
    title='Number of Surveyed Hospitals by County',
    geo=dict(projection=dict(type='albers usa')),
)
fig = ff.create_choropleth(**choropleth_options)
py.iplot(fig, filename='number_of_surveyed_hospitals_by_county')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unrecognized FIPS Values

Whoops! It looks like you are trying to pass at least one FIPS value that is not in our shapefile of FIPS and data for the counties. Your choropleth will still show up but these counties cannot be shown.
Unrecognized FIPS are: [74300]


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points



Woah there! Look at all those points! Due to browser limitations, the Plotly SVG d

The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long

