### Get dataset from source:

In [239]:
import pandas as pd
import altair as alt
import numpy as np

data_url_counties = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
data_url_states = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
us_counties = pd.read_csv(data_url_counties, error_bad_lines=False)
display(us_counties.head())
display(us_counties.info())

us_states = pd.read_csv(data_url_states, error_bad_lines=False)
display(us_states.head())
display(us_states.info())

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35698 entries, 0 to 35697
Data columns (total 6 columns):
date      35698 non-null object
county    35698 non-null object
state     35698 non-null object
fips      35180 non-null float64
cases     35698 non-null int64
deaths    35698 non-null int64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.6+ MB


None

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1884 entries, 0 to 1883
Data columns (total 5 columns):
date      1884 non-null object
state     1884 non-null object
fips      1884 non-null int64
cases     1884 non-null int64
deaths    1884 non-null int64
dtypes: int64(3), object(2)
memory usage: 73.7+ KB


None

### Get basic information of cases in each state on current day

In [240]:
from datetime import datetime, timedelta

#get the date range of the data
total_dates = us_states["date"].unique()
date_range = [total_dates[1],total_dates[len(total_dates)-1]]
print("Date range of data: " + date_range[0] + " to " +date_range[1])

#extract the latest data
latest_date = date_range[1]
latest_cases = us_states[us_states["date"]==latest_date]
display(latest_cases)

#get the latest total cases
total_cases = latest_cases["cases"].sum()
print("Total cases on " + latest_date + ": "+  str(total_cases))


Date range of data: 2020-01-22 to 2020-04-05


Unnamed: 0,date,state,fips,cases,deaths
1829,2020-04-05,Alabama,1,1841,45
1830,2020-04-05,Alaska,2,185,4
1831,2020-04-05,Arizona,4,2269,64
1832,2020-04-05,Arkansas,5,853,16
1833,2020-04-05,California,6,15076,349
1834,2020-04-05,Colorado,8,4950,140
1835,2020-04-05,Connecticut,9,5675,189
1836,2020-04-05,Delaware,10,673,14
1837,2020-04-05,District of Columbia,11,998,22
1838,2020-04-05,Florida,12,12342,220


Total cases on 2020-04-05: 336776


### Case growth curve in cases for "hot spot" states
We defined 'hot spot' as cases that have more than 9000 cases on the latest date of the data


In [241]:
us_states['date'] = pd.to_datetime(us_states['date'], format="%Y/%m/%d")

recent_cases = us_states[us_states['date']>"2020-03-10"]

states_slow_growth = recent_cases[recent_cases['date']==latest_date]
states_slow_growth = states_slow_growth[states_slow_growth['cases']<9000]
states_slow_growth= states_slow_growth["state"]
recent_cases_hot_spot = recent_cases[~recent_cases.state.isin(states_slow_growth)]

scales = alt.selection_interval(bind='scales')
highlight = alt.selection(type='single', on='mouseover', fields=['state'], nearest=True)

base = alt.Chart(recent_cases_hot_spot).mark_line(point=True).encode(
    x='date:T',
    y='cases:Q',
    color='state:N',
    tooltip=['state', 'cases', 'date']
)


points = base.mark_circle().encode(
    opacity=alt.value(1)
).add_selection(
    highlight
).properties(
    width=600
)


lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1.25), alt.value(3))
)

points + lines



### Examining cases in New Jersey
We will take a look at how cases are distributed by county in New Jersey

In [262]:
counties = alt.topo_feature(data.us_10m.url, 'counties')
airports = data.airports.url


nj_county_coord = pd.read_csv("nj_county_coordinates.csv", error_bad_lines=False)
nj_county_coord=nj_county_coord.set_index('county')


jersey_cases = us_counties.loc[(us_counties['date']==latest_date) & (us_counties['state']=='New Jersey') ]
jersey_cases= jersey_cases.set_index('county')


jersey_cases['lat']= nj_county_coord['lat']
jersey_cases['long']= nj_county_coord['long']
jersey_cases=jersey_cases.drop(['Unknown'])
jersey_cases=jersey_cases.reset_index()
# display(jersey_cases)


map_jersey =(
    alt.Chart(counties).mark_geoshape(
        fill='#fffafc',
        stroke='darkgray'
    ).transform_calculate(state_id = "(datum.id / 1000)|0")
    .transform_filter((alt.datum.state_id)==34)
).properties(
    width=800,
    height=600
).project('albersUsa')



# airport positions on background
points = alt.Chart(jersey_cases).mark_circle().encode(
    longitude='long:Q',
    latitude='lat:Q',
    size=alt.Size('cases:Q', title='Number of Cases'),
#     color=alt.value('steelblue'),
#     color=alt.condition(brush, 'cases:Q', alt.value('grey')),
    color=alt.Color('cases:Q', scale=alt.Scale(scheme='yelloworangered')),
    tooltip=['cases:N','county:N']
).properties(
    title='Cases in NJ'
)

map_jersey + points


### Heatmap of America

In [3]:
!pip install --upgrade folium

Requirement already up-to-date: folium in /Users/annagodin/opt/anaconda3/lib/python3.7/site-packages (0.10.1)


In [7]:
import folium
import folium.plugins # The Folium Javascript Map Library

USA_COORDINATES = (39.8283, -98.5795)
usa_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
state_geo = 'us-states.json'
bins = list(latest_cases['cases'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=latest_cases,
    columns=['state', 'cases'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    bins=bins,
    legend_name='Number of COVID-19 Cases' 
).add_to(usa_map)

folium.LayerControl().add_to(usa_map)

usa_map


New York has been aggressively testing and therefore has 67,180 more cases than the state with the second highest number of cases. Let's disregard New York to get a better look at how the other states compare. 

In [92]:
#Find NY row
indexNames = latest_cases[latest_cases['state'] == 'New York' ].index
 
# Delete this row from dataFrame
data_minus_NY = latest_cases.drop(indexNames)

bins_minus_NY = list(data_minus_NY['cases'].quantile([0, 0.25, 0.5, 0.75, 1]))

no_ny_map = folium.Map(location=USA_COORDINATES,zoom_start=3)

folium.Choropleth(
    geo_data=state_geo,
    data=data_minus_NY,
    columns=['state', 'cases'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    #bins=bins_minus_NY,
    legend_name='Number of COVID-19 Cases' 
).add_to(no_ny_map)

folium.LayerControl().add_to(no_ny_map)

no_ny_map


Now we'll plot deaths. The numbers should be as affected by the different in state governement approaches to testing.

In [93]:
usa_death_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
death_bins = list(latest_cases['deaths'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=latest_cases,
    columns=['state', 'deaths'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    bins=death_bins,
    legend_name='Number of COVID-19 Deaths' 
).add_to(usa_death_map)

folium.LayerControl().add_to(usa_death_map)

usa_death_map

Let's remove NY again, which currently has the worst outbreak of COVID-19.

In [95]:
no_ny_death_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
no_ny_death_bins = list(data_minus_NY['deaths'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=data_minus_NY,
    columns=['state', 'deaths'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name='Number of COVID-19 Deaths' 
).add_to(no_ny_death_map)

folium.LayerControl().add_to(no_ny_death_map)

no_ny_death_map

Note the difference between putting the values into 6 evenly spaced bins versus putting them into bins based on quantiles. Because states like NY and NJ have such high relative case values (and have more deaths in general because the outbreak is so bad there), it's hard to pick bins that make both the visualization (the actual map) AND the legend valuable.