## 1 - Retrieving the dataset from source:

In [371]:
import pandas as pd
import altair as alt
import numpy as np

data_url_counties = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
data_url_states = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
us_counties = pd.read_csv(data_url_counties, error_bad_lines=False)
display(us_counties.head())
display(us_counties.info())

us_states = pd.read_csv(data_url_states, error_bad_lines=False)
display(us_states.head())
display(us_states.info())

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40720 entries, 0 to 40719
Data columns (total 6 columns):
date      40720 non-null object
county    40720 non-null object
state     40720 non-null object
fips      40148 non-null float64
cases     40720 non-null int64
deaths    40720 non-null int64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.9+ MB


None

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-21,Washington,53,1,0
1,2020-01-22,Washington,53,1,0
2,2020-01-23,Washington,53,1,0
3,2020-01-24,Illinois,17,1,0
4,2020-01-24,Washington,53,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 5 columns):
date      1994 non-null object
state     1994 non-null object
fips      1994 non-null int64
cases     1994 non-null int64
deaths    1994 non-null int64
dtypes: int64(3), object(2)
memory usage: 78.0+ KB


None

## 2 - Get basic information of cases in each state on current day
We retrieve the latest date that the data includes, and display the distribution of cases for each state on the latest day, as well as the total number of cases on the latest day

In [372]:
from datetime import datetime, timedelta



#get the date range of the data
total_dates = us_states["date"].unique()
date_range = [total_dates[1],total_dates[len(total_dates)-1]]
print("Date range of data: " + str(date_range[0]) + " to " + str(date_range[1]))

#extract the latest data
latest_date = date_range[1]
latest_cases = us_states[us_states["date"]==latest_date]
latest_cases = latest_cases[latest_cases.state != 'Guam']
latest_cases = latest_cases[latest_cases.state != 'Northern Mariana Islands']

display(latest_cases)

#get the latest total cases
total_cases = latest_cases["cases"].sum()
print("Total cases on " + str(latest_date) + ": "+  str(total_cases))



Date range of data: 2020-01-22 to 2020-04-07


Unnamed: 0,date,state,fips,cases,deaths
1939,2020-04-07,Alabama,1,2197,64
1940,2020-04-07,Alaska,2,211,4
1941,2020-04-07,Arizona,4,2575,77
1942,2020-04-07,Arkansas,5,997,18
1943,2020-04-07,California,6,17540,447
1944,2020-04-07,Colorado,8,5429,179
1945,2020-04-07,Connecticut,9,7781,277
1946,2020-04-07,Delaware,10,928,16
1947,2020-04-07,District of Columbia,11,1211,22
1948,2020-04-07,Florida,12,14739,295


Total cases on 2020-04-07: 397472


## 3 - Examining case growth curves in "hot spot" states
We defined 'hot spot' states as those who make up more than 4% of the country's cases on the latest date recorded


In [373]:
us_states['date'] = pd.to_datetime(us_states['date'], format="%Y/%m/%d")

recent_cases = us_states[us_states['date']>"2020-03-1"]
hotspot_threshold=total_cases*.04

states_slow_growth = recent_cases[recent_cases['date']==latest_date]
states_slow_growth = states_slow_growth[states_slow_growth['cases']<hotspot_threshold]
states_slow_growth= states_slow_growth["state"]
recent_cases_hot_spot = recent_cases[~recent_cases.state.isin(states_slow_growth)]

scales = alt.selection_interval(bind='scales')
highlight = alt.selection(type='single', on='mouseover', fields=['state'], nearest=True)

base = alt.Chart(recent_cases_hot_spot).mark_line(point=True).encode(
    y=alt.Y('cases:Q', axis=alt.Axis(title='Total Cases')),
#     x='date:T',
    x=alt.X('date:T', axis=alt.Axis(format='%b %d', title='Date')),
#     y='cases:Q',
    color='state:N',
    tooltip=['state', 'cases', 'date']
)


points = base.mark_circle().encode(
    opacity=alt.value(1)
).add_selection(
    highlight
).properties(
    width=600,
    title='Total cases in hot spot states growth curve'
)


lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1.25), alt.value(3))
)

display(points + lines)


## 4 - Examining percentage of cases compared to all states
The below chart maps which states have the majority of cases at a given date. We can see how Washington and California originally had majority of cases in the U.S., and how quickly New York took over, and continues to make up a large percentage of cases in the U.S.

In [374]:
chart = alt.Chart(recent_cases).mark_area().encode(
    x=alt.X('date:T', axis=alt.Axis(format='%b %d', title='Date')),
    y=alt.Y('cases:Q', stack="normalize", axis=alt.Axis(format='%', title='Percent of Total Cases')), 
#     x="date:T",
#     y=alt.Y("cases:Q", stack="normalize"),
    tooltip=['state', 'cases', 'date'],
    color= alt.Color('state:N', scale=alt.Scale(scheme='category20b'))

).properties(
    width=700,
    height=550,
    title="Distribution of cases among all states"
)

display(chart)

## 5 - Examining cases in New Jersey
We will take a look at how cases are distributed by county in New Jersey.

The map below shows an interesting distrubution of cases in the sense that the more north-east the county is, the higher the number of cases. This could be attributed to the fact that north Jersey is very densely populated, as well as its proximity to New York City.

TODO eventually:
https://towardsdatascience.com/how-to-build-a-time-series-dashboard-in-python-with-panel-altair-and-a-jupyter-notebook-c0ed40f02289

https://nextjournal.com/sdanisch/data-types-graphical-marks-and-visual-encoding-channels

In [375]:
counties = alt.topo_feature(data.us_10m.url, 'counties')
airports = data.airports.url


nj_county_coord = pd.read_csv("nj_county_coordinates.csv", error_bad_lines=False)
nj_county_coord=nj_county_coord.set_index('county')


jersey_cases = us_counties.loc[(us_counties['date']==latest_date) & (us_counties['state']=='New Jersey') ]
jersey_cases= jersey_cases.set_index('county')


jersey_cases['lat']= nj_county_coord['lat']
jersey_cases['long']= nj_county_coord['long']
jersey_cases=jersey_cases.drop(['Unknown'])
jersey_cases=jersey_cases.reset_index()
# display(jersey_cases)


map_jersey =(
    alt.Chart(counties).mark_geoshape(
        fill='#fffafc',
        stroke='darkgray'
    ).transform_calculate(state_id = "(datum.id / 1000)|0")
    .transform_filter((alt.datum.state_id)==34)
).properties(
    width=800,
    height=600
).project('albersUsa')


# county positions on background
points = alt.Chart(jersey_cases).mark_point(filled=True, size=200).encode(
    longitude='long:Q',
    latitude='lat:Q',
    size=alt.Size('cases:Q', title='Number of Cases'),
    color=alt.Color('cases:Q', scale=alt.Scale(scheme='yelloworangered')),
    tooltip=['cases:N','county:N']
).properties(
    title='Cases in NJ'
)
# .transform_filter((datum.symbol == ticker) # this ties in the filter 
# .add_selection(select_date).transform_filter(select_date)

map_jersey + points


## 6 - Heatmap of cases in America

In [362]:
!pip install --upgrade folium

Requirement already up-to-date: folium in /Users/annagodin/opt/anaconda3/lib/python3.7/site-packages (0.10.1)


In [376]:
import folium
import folium.plugins # The Folium Javascript Map Library

USA_COORDINATES = (39.8283, -98.5795)
usa_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
state_geo = 'us-states.json'
bins = list(latest_cases['cases'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=latest_cases,
    columns=['state', 'cases'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    bins=bins,
    legend_name='Number of COVID-19 Cases' 
).add_to(usa_map)

folium.LayerControl().add_to(usa_map)

usa_map


### 6.1 - Excluding New York
New York has been aggressively testing and therefore has 67,180 more cases than the state with the second highest number of cases. Let's disregard New York to get a better look at how the other states compare. 

In [377]:
#Find NY row
indexNames = latest_cases[latest_cases['state'] == 'New York' ].index
 
# Delete this row from dataFrame
data_minus_NY = latest_cases.drop(indexNames)

bins_minus_NY = list(data_minus_NY['cases'].quantile([0, 0.25, 0.5, 0.75, 1]))

no_ny_map = folium.Map(location=USA_COORDINATES,zoom_start=3)

folium.Choropleth(
    geo_data=state_geo,
    data=data_minus_NY,
    columns=['state', 'cases'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    #bins=bins_minus_NY,
    legend_name='Number of COVID-19 Cases' 
).add_to(no_ny_map)

folium.LayerControl().add_to(no_ny_map)

no_ny_map


## 7 - Heatmap of deaths in America
Now we'll plot deaths. The numbers should be as affected by the difference in state governement approaches to testing.

In [378]:
usa_death_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
death_bins = list(latest_cases['deaths'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=latest_cases,
    columns=['state', 'deaths'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    bins=death_bins,
    legend_name='Number of COVID-19 Deaths' 
).add_to(usa_death_map)

folium.LayerControl().add_to(usa_death_map)

usa_death_map

### 7.1 Excluding New York again
Let's remove NY again, which currently has the worst outbreak of COVID-19.

In [379]:
no_ny_death_map = folium.Map(location=USA_COORDINATES,zoom_start=3)
no_ny_death_bins = list(data_minus_NY['deaths'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    data=data_minus_NY,
    columns=['state', 'deaths'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name='Number of COVID-19 Deaths' 
).add_to(no_ny_death_map)

folium.LayerControl().add_to(no_ny_death_map)

no_ny_death_map

Note the difference between putting the values into 6 evenly spaced bins versus putting them into bins based on quantiles. Because states like NY and NJ have such high relative case values (and have more deaths in general because the outbreak is so bad there), it's hard to pick bins that make both the visualization (the actual map) AND the legend valuable.