In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, Legend, LogColorMapper, GeoJSONDataSource
from bokeh.models.widgets import Slider
from bokeh.layouts import widgetbox, column
from bokeh.plotting import figure, curdoc, save
from bokeh.transform import factor_cmap
from bokeh.palettes import RdYlBu, Category10, Viridis6 
from IPython.core.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sn
import geopandas as gpd

# Keep rendered DataFrames compact: show at most 8 rows and 20 columns.
pd.options.display.max_rows = 8
pd.options.display.max_columns = 20

# Descriptive Analytics

## United States Summary

In [2]:
# Load the prepared U.S. and global case tables. The first CSV column is
# dropped positionally (presumably a saved index column - TODO confirm).
cases_US = (
    pd.read_csv('../data/US_all_vars.csv')
    .iloc[:, 1:]
)
cases_global = (
    pd.read_csv('../data/Global_all_vars.csv')
    .iloc[:, 1:]
)

# Parse the U.S. dates so later cells can compare them against pd.Timestamp.
cases_US['Date'] = pd.to_datetime(cases_US['Date'], cache=True)

In [3]:
# Pearson correlation matrix (the DataFrame.corr default) of the U.S. variables.
# FIPS is dropped first (presumably because it is an identifier, not a measure).
# Numeric/bool columns are selected explicitly: pandas >= 2.0 no longer discards
# string or datetime columns silently inside corr() and raises instead, while
# older versions dropped them - so this keeps the result the same on both.
corr_US = (
    cases_US
    .drop('FIPS', axis=1)
    .select_dtypes(include=[np.number, 'bool'])
    .corr()
)
# sn.heatmap(corr_US, annot=True)
# sn.set(rc={'figure.figsize':(20,20)})

display(HTML("<p style='font-size: 15px'>U.S. Correlation Plot</p>"))

![img](assets/img/Seaborn/Corr_Plot.png)

***The figure above is a correlation plot of all numeric variables for U.S. cases, computed with the Pearson correlation coefficient. Some interesting correlations with cases (and with cases 2 weeks later) are protest count (.51/.54), population (.49/.51), and percent foreign born (.27/.28). It is interesting to point out that most variables correlate more strongly with cases 2 weeks later than with cases on the same date.***

In [4]:
def assign_color(row):
    """Return the household-size bucket label for one row.

    Reads ``row['Avg.Person.Per.Household']`` and maps it to one of
    ``'< 2.3'``, ``'>= 2.3 & <= 2.6'`` or ``'> 2.6'``. When none of the
    comparisons holds (e.g. the value is NaN) the result is ``None``.
    """
    household_size = row['Avg.Person.Per.Household']
    if household_size < 2.3:
        return '< 2.3'
    # Anything reaching this point is not below 2.3, so only the upper bound
    # of the middle bucket still needs checking.
    if household_size <= 2.6:
        return '>= 2.3 & <= 2.6'
    if household_size > 2.6:
        return '> 2.6'
    return None

# Scatter plot: 2019 population vs. total cases on 2020-06-30, one point per
# row of the snapshot, coloured by household-size bucket.

# Fixed bucket order so every bucket always receives the same palette colour.
# (Deriving the factors from .unique() made the colour assignment depend on the
# row order of the data and could put None into the factor list.)
household_levels = ['< 2.3', '>= 2.3 & <= 2.6', '> 2.6']

# Only the 2020-06-30 rows are plotted, so the bucket column is computed on a
# copy of that snapshot; cases_US itself never gains the temporary column.
snapshot = cases_US[cases_US['Date'] == pd.Timestamp("2020-06-30")].copy()
snapshot['Avg.Person.Per.Household.FCT'] = snapshot.apply(assign_color, axis=1)

hover = HoverTool(tooltips=[
    ("County", "@County"),
    ("State", "@State"),
    ("Avg Person Per Household", "@{Avg.Person.Per.Household}"),
    ("Population", "@{POP.2019}{0.00}"),
    ("Total_Cases", "@Total_Cases")
    ])

p = figure(title = "Total Population vs Total Cases", plot_height=500, plot_width=750,
           tools=[hover, "pan,reset,wheel_zoom"])

colors = factor_cmap('Avg.Person.Per.Household.FCT', palette=['#EC7063','#A569BD','#007BCC'],
                     factors=household_levels)

p.circle('POP.2019','Total_Cases', legend='Avg.Person.Per.Household.FCT',
         fill_alpha=0.1, size=10, fill_color=colors, line_color=colors,
         source=ColumnDataSource(snapshot))
p.xaxis.axis_label = 'Population 2019 (thousands)'
p.yaxis.axis_label = 'Total COVID-19 Cases (06/30/2020)'
p.legend.title = 'Avg Person Per Household'
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

# Standalone HTML written on show(); it is embedded via the iframe below.
output_file("../assets/img/Bokeh/Total_Pop_Plot.html")

show(p)

<iframe src="assets/img/Bokeh/Total_Pop_Plot.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above plots total COVID-19 cases as of June 30th against the 2019 population estimate for each county in the US, colored by the average number of persons per household. There is a slight positive trend. Something to point out is that the counties with a higher average person per household than the national average (~2.5-2.6) seem to account for most of the outliers.***

***You may hover over points for more detailed information.***

In [5]:
# Scatter plot: BLM protest count vs. total cases on 2020-06-30, coloured by the
# party of the state's governor. The data column is spelled 'Governer.Party' and
# must stay that way; only the text shown to the reader is corrected to
# "Governor".
snapshot = cases_US[cases_US['Date'] == pd.Timestamp("2020-06-30")]

hover = HoverTool(tooltips=[
    ("County", "@County"),
    ("State", "@State"),
    ("Protest_Count", "@Protest_Count"),
    ("Total Cases", "@Total_Cases"),
    ("Governor Party", "@{Governer.Party}")
    ])

p = figure(title = "Total Cases of COVID-19 (as of 6/30/20) vs BLM Protest Count by County",
           plot_height=500, plot_width=750, tools=[hover, "pan,reset,wheel_zoom"])

# NOTE(review): the palette is matched to the factors positionally and the
# factor order comes from .unique(), i.e. from whichever party appears first in
# the data. Confirm that 'red'/'blue' land on the intended parties, or pin the
# factor order explicitly once the column's exact values are known.
colors = factor_cmap('Governer.Party', palette=['red', 'blue'],
                     factors=cases_US['Governer.Party'].unique())

p.circle('Protest_Count','Total_Cases', legend='Governer.Party',
         fill_alpha=0.1, size=10, fill_color=colors, line_color=colors,
         source=ColumnDataSource(snapshot))

p.legend.title = 'Political Party of Governor'
p.legend.location = "top_left"
p.xaxis.axis_label = 'BLM Protest Count'
p.yaxis.axis_label = 'Total COVID-19 Cases (06/30/2020)'

# Standalone HTML written on show(); it is embedded via the iframe below.
output_file("../assets/img/Bokeh/BLM_Gov_Cases_Plot.html")
show(p)

<iframe src="assets/img/Bokeh/BLM_Gov_Cases_Plot.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***This figure plots total cases against protest count for each county, colored by the political party of the governor of the state. There again seems to be a slight positive correlation, though smaller than the correlation with population. It should also be noted that there is a similar number of Republican and Democratic outliers.***

***You may hover over points for more detailed information.***

In [6]:
def assign_density(row):
    """Return the population-density bucket label for one row.

    Reads ``row['PopDensity']`` and maps it to one of ``'< 500'``,
    ``'>= 500 & <= 1000'`` or ``'> 1000'``. When none of the comparisons
    holds (e.g. the value is NaN) the result is ``None``.
    """
    density = row['PopDensity']
    if density < 500:
        return '< 500'
    # Values reaching this point are not below 500, so only the upper bound
    # of the middle bucket still needs checking.
    if density <= 1000:
        return '>= 500 & <= 1000'
    if density > 1000:
        return '> 1000'
    return None

# Bar chart: the 20 counties with the highest Total_Cases, coloured by
# population-density bucket.

# Bucket every row by density; this column stays on cases_US after the cell.
cases_US['PopDensity.FCT'] = cases_US.apply(assign_density, axis=1)

# One row per county: maximum Total_Cases, most common density bucket and mean
# density across that county's rows.
cases_US_bar = cases_US.groupby(['County', 'State']).agg({
    'Total_Cases': 'max',
    'PopDensity.FCT': pd.Series.mode,
    'PopDensity': 'mean'
}).reset_index()

# "County, State" label used as the categorical x value.
cases_US_bar['Location'] = cases_US_bar['County'] + ', ' + cases_US_bar['State']

# Keep the 20 counties with the most cases, largest first.
cases_US_bar = cases_US_bar.sort_values(by=['Total_Cases'], ascending=False).head(20)

Location = cases_US_bar['Location'].unique()

# Density buckets in descending order, restricted to those present in the top
# 20. An explicit order replaces swapping the first two .unique() entries, which
# only produced a descending order for one particular row ordering and raised
# IndexError when fewer than two buckets were present.
density_order = ['> 1000', '>= 500 & <= 1000', '< 500']
present_levels = set(cases_US_bar['PopDensity.FCT'].unique())
Density = [level for level in density_order if level in present_levels]

hover = HoverTool(tooltips=[
    ("County/State", "@Location"),
    ("Density", "@PopDensity{0.00}"),
    ("Total Cases", "@Total_Cases")
    ])

pal = RdYlBu[10]
# Total_Cases is divided by 1000 so the bars match the "(Thousands)" axis label.
source = ColumnDataSource(data=dict(Location=cases_US_bar['Location'],
                                    PopDensityFCT=cases_US_bar['PopDensity.FCT'],
                                    PopDensity=cases_US_bar['PopDensity'],
                                    Total_Cases=cases_US_bar['Total_Cases']/1000))

# The y-range leaves 25 (thousand) of headroom above the tallest bar.
p = figure(x_range=Location, y_range=(0,max(cases_US_bar['Total_Cases']/1000)+25),
           plot_height=500, title="Total COVID-19 Cases by Top 20 Counties as of 6/30/20",
           toolbar_location=None, tools=[hover])

p.vbar(x='Location', top='Total_Cases', width=0.9, legend="PopDensityFCT",source=source,
       fill_color=factor_cmap('PopDensityFCT', palette=pal, factors=Density))

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = "vertical"

p.legend.title = 'People per square km'
p.xaxis.axis_label = 'U.S. County'
p.yaxis.axis_label = 'Total COVID-19 Cases (Thousands)'

# Standalone HTML written on show(); it is embedded via the iframe below.
output_file("../assets/img/Bokeh/County_Density_Plot.html")
show(p)

<iframe src="assets/img/Bokeh/County_Density_Plot.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above shows the 20 counties with the most cases in the US as of June 30th. It should be noted that the top 2 counties are significantly higher in cases (by at least 40k) than the rest.***

***You may hover over the bars for more detailed information.***

In [7]:
# Bar chart: the 20 states with the most total cases on 2020-06-30, coloured by
# the state's restriction rating.
plot_name = "Lockdown_Rating_Plot"

# Collapse the 2020-06-30 snapshot to one row per county, then roll the counties
# up to one row per state (cases and population summed, modal rating kept).
cases_US_bar = cases_US[cases_US['Date'] == pd.Timestamp('2020-06-30')]
cases_US_bar = cases_US_bar.groupby(['County','State']).agg({
    'Total_Cases': 'max',
    'POP.2019': 'mean',
    'Restriction Rating': pd.Series.mode
}).reset_index().groupby(['State']).agg({
    'Total_Cases': 'sum',
    'POP.2019': 'sum',
    'Restriction Rating': pd.Series.mode
}).reset_index()

# Rescale both measures by 1000 to match the axis label and tooltip units.
cases_US_bar['Total_Cases'] = cases_US_bar['Total_Cases'] / 1000
cases_US_bar['POP.2019'] = cases_US_bar['POP.2019'] / 1000

# Keep the 20 states with the most cases, largest first.
cases_US_bar = cases_US_bar.sort_values(by=['Total_Cases'], ascending=False).head(20)

State = cases_US_bar['State'].unique()
Ratings = cases_US_bar['Restriction Rating'].unique()

hover = HoverTool(tooltips=[
    ("State", "@State"),
    ("Restriction Rating", "@{Restrictions}"),
    ("Population (mil)", "@Population{0.000}"),
    ("Total Cases (thous)", "@Total_Cases")
    ])

pal = RdYlBu[5]
source = ColumnDataSource(data=dict(State=cases_US_bar['State'],
                                    Restrictions=cases_US_bar['Restriction Rating'],
                                    Population=cases_US_bar['POP.2019'],
                                    Total_Cases=cases_US_bar['Total_Cases']))

# The y-range leaves 25 (thousand) of headroom above the tallest bar.
p = figure(x_range=State, y_range=(0,max(cases_US_bar['Total_Cases'])+25),
           plot_height=450, title="Total COVID-19 Cases by Top 20 States as of 6/30/20",
           toolbar_location=None, tools=[hover])

p.vbar(x='State', top='Total_Cases', width=0.9, legend="Restrictions",
       fill_color=factor_cmap('Restrictions', palette=pal, factors=Ratings), source=source)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = "vertical"

p.legend.title = 'State Lockdown Restrictions'
# The bars are states; the previous 'U.S. County' label was carried over from
# the county chart.
p.xaxis.axis_label = 'U.S. State'
p.yaxis.axis_label = 'Total COVID-19 Cases (Thousands)'

# Resolves to ../assets/img/Bokeh/Lockdown_Rating_Plot.html, the file embedded
# via the iframe below.
output_file("../assets/img/Bokeh/{}.html".format(plot_name))
show(p)

<iframe src="assets/img/Bokeh/Lockdown_Rating_Plot.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***This figure shows the 20 states with the most cases divided by their restriction ratings. This graph shows there is actually some diversity in the extent to which the lockdown was restrictive. This is likely due to other factors influencing cases especially in larger states.***

***You may hover over the bars for more detailed information.***

In [8]:
# Line chart: daily new cases over time for the five states with the largest
# summed Cases_Delta.

# Daily new cases per state and date, together with the modal value of each
# reopening-phase column within that state/date group.
time_series_chart = cases_US.groupby(['State', 'Date']).agg({
    'Cases_Delta': 'sum',
    'Phase.0': pd.Series.mode,
    'Phase.1': pd.Series.mode,
    'Phase.2': pd.Series.mode,
    'Phase.3': pd.Series.mode
}).reset_index()

# Rank states by summed Cases_Delta (ascending). Only that column is summed:
# summing every column fails on the datetime 'Date' column with pandas >= 2.0.
States = (
    cases_US.groupby('State')['Cases_Delta'].sum()
    .reset_index()
    .sort_values('Cases_Delta')
)
# The last five rows of the ascending ranking are the five largest states.
time_series_chart = time_series_chart[time_series_chart['State'].isin(States.iloc[-5:, 0])]

# State population totals built from per-county means (not used by this plot).
counties_pop = cases_US.groupby(['County', 'State']).agg({'POP.2019': 'mean'}).reset_index()
states_pop = counties_pop.groupby(['State']).agg({'POP.2019': 'sum'}).reset_index()

p = figure(plot_width=800, plot_height=500, x_axis_type="datetime",
          title="Daily COVID-19 Cases by 5 most infected States")

# Iterate over the already-filtered frame, grouping by the column name (a plain
# string, not a list). Grouping by ['State'] yields 1-tuple keys on
# pandas >= 2.0, which made every `== State` filter empty so no line was drawn.
# Groups come out in sorted state order, so colours are assigned as before.
for color_idx, (State, df) in enumerate(time_series_chart.groupby('State')):
    # Drop each state's final date, matching the 1/22/20 - 6/29/20 range stated
    # on the x-axis label.
    date = df['Date'].iloc[:-1]
    y = df['Cases_Delta'].iloc[:-1]
    line = p.line(date, y, line_width=2, color=Category10[10][color_idx], alpha=0.8, legend=State)

    # Each line gets its own hover tool so the state name and its phase values
    # can be baked into the tooltip as literal text. str() keeps the tooltip
    # valid even when the stored phase value is not already a string.
    # NOTE(review): newer Bokeh releases expect the formatter key to carry the
    # '@' prefix ('@x'); confirm against the Bokeh version in use.
    hover = HoverTool(tooltips=[
        ('Date', '@x{%F}'),
        ('State', State),
        ('New Cases', '@y'),
        ('Phase 0:', str(df['Phase.0'].unique()[0])),
        ('Phase 1:', str(df['Phase.1'].unique()[0])),
        ('Phase 2:', str(df['Phase.2'].unique()[0])),
        ('Phase 3:', str(df['Phase.3'].unique()[0]))
    ],
    renderers=[line], formatters={'x': 'datetime'})
    p.add_tools(hover)

p.legend.location = "top_left"
p.legend.click_policy="hide"
p.xaxis.axis_label = 'Date (1/22/20 - 6/29/20)'
p.yaxis.axis_label = 'Daily COVID-19 cases by State'

# Standalone HTML written on show(); it is embedded via the iframe below.
output_file("../assets/img/Bokeh/Time_Series_US.html")

show(p)

<iframe src="assets/img/Bokeh/Time_Series_US.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***This chart shows the progression of daily cases of the 5 states with the most total cases. Take notice that the y-axis does not represent the number of total cases but the number of new cases. It should be noted that around mid-May is the time California and Florida start a strong positive trend. Many people may attribute this to moving to phase 2 too early, although it is interesting that New York moved to phase 2 on a similar timeline with more active cases, yet sees a decline while Florida sees an increase. This means phase is likely not the only main factor in predicting cases.***
***You may hover over a line for more detail or click on the legend to filter states.***

In [9]:
# Map-style plot of Texas county case counts on 2020-06-30, written to TX.html.
temp = cases_US[(cases_US['Date'] == pd.Timestamp('2020-06-30')) &
                (cases_US['State'] == 'Texas')]

# Reversed Viridis6 on a log scale, applied to the 'cases' field below.
palette = tuple(reversed(Viridis6))
color_mapper = LogColorMapper(palette=palette)


source = ColumnDataSource(data=dict(
    x=temp['Long'],
    y=temp['Lat'],
    name=temp['County'],
    cases=temp['Total_Cases'],
))

TOOLS = "pan,wheel_zoom,reset,hover,save"

# The rows are filtered to Texas and the output file is TX.html, so the title
# names Texas (it previously read "California").
p = figure(
    title="Texas Cases, 6/30/20", tools=TOOLS,
    x_axis_location=None, y_axis_location=None,
    tooltips=[
        ("Name", "@name"), ("Cases", "@cases"), ("(Long, Lat)", "($x, $y)")
    ])
p.grid.grid_line_color = None
p.hover.point_policy = "follow_mouse"

# NOTE(review): patches() draws one polygon per row from a sequence of x and y
# coordinates, but 'x'/'y' here come straight from the Long/Lat columns, which
# look like a single point per county - confirm whether county boundary
# geometry was intended here.
p.patches('x', 'y', source=source,
          fill_color={'field': 'cases', 'transform': color_mapper},
          fill_alpha=0.7, line_color="black", line_width=0.5)

output_file("../assets/img/Bokeh/TX.html")

show(p)

In [38]:
# counties = gpd.read_file(r"../data/location/US/cb_2018_us_county_500k.shp")
# counties = counties[counties['NAME'].isin(cases_US['County'])]
# counties = counties[counties['STATEFP'].astype(int) <= 56]
# CRS = counties.crs

# # Convert GeoDataFrames into GeoJSONDataSource objects (similar to ColumnDataSource)
# roads_source = GeoJSONDataSource(geojson=counties.to_json())

# # Initialize our plot figure
# p = figure(title="A test map", width=500)

# # Add the lines to the map from our GeoJSONDataSource -object (it is important to specify the columns as 'xs' and 'ys')
# p.multi_line('xs', 'ys', source=roads_source, color='gray', line_width=1)

# output_file("../assets/img/Bokeh/TX.html")

# show(p)

In [39]:
# sorted(counties.STATEFP.astype(int).unique())
