In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, Legend
from bokeh.layouts import widgetbox, column
from bokeh.plotting import figure, curdoc
from bokeh.transform import factor_cmap
from bokeh.palettes import RdYlBu, Category10, Viridis6
from IPython.core.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sn

# Keep rendered DataFrame previews compact: show at most 8 rows and 20 columns.
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 20)

In [2]:
# Load the global dataset, discard the first CSV column (presumably a saved
# index -- TODO confirm) and parse the Date column into datetimes.
cases_global = (
    pd.read_csv('../data/Global_all_vars.csv')
    .iloc[:, 1:]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], cache=True))
)

In [3]:
# The commented-out lines below are not executed. They look like the seaborn
# code that produced the static correlation heatmap embedded after this cell
# -- TODO confirm, and confirm that a 'FIPS' column exists in the global data.
# corr_global = cases_global.drop('FIPS',axis=1).corr()
# sn.heatmap(corr_global, annot=True, annot_kws={"size":15})
# sn.set_context("poster",font_scale=1.5)
# sn.set(rc={'figure.figsize':(15,15)})
# Render the caption for the static image as HTML.
display(HTML("<p style='font-size: 15px'>Global Correlation Plot</p>"))

![img](assets/img/Seaborn/Corr_Plot_Global.png)

***The figure above is a correlation plot of all numeric variables for global cases, calculated using Pearson's correlation coefficient. Some interesting correlations with total cases include area per square kilometer (0.38) and the number of protests (0.62), although the protest correlation is heavily influenced by the US.***

In [4]:
def assign_color(row):
    """Return the temperature-band label for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'Avg_Temp' entry.

    Returns:
        '< 10', '>= 10 & <= 20' or '> 20' depending on row['Avg_Temp'];
        None when none of the comparisons holds (e.g. a NaN temperature).
    """
    temperature = row['Avg_Temp']
    if temperature < 10:
        return '< 10'
    if 10 <= temperature <= 20:
        return '>= 10 & <= 20'
    if temperature > 20:
        return '> 20'
    # Every comparison was false (NaN compares false to everything).
    return None

# Label every row with its temperature band by applying assign_color row-wise;
# the resulting Avg_TempFCT column drives the colors and legend of the scatter
# plot in the next cell.
cases_global['Avg_TempFCT'] = cases_global.apply(assign_color, axis=1)

In [5]:
# Scatter plot of land area vs. total cases on 2020-06-30, colored by the
# temperature band stored in Avg_TempFCT.
hover = HoverTool(tooltips=[
    ("Country/Region", "@Country_Region"),
    ("Area per sq km", "@{Area.sq.km}{0.00}"),
    ("Avg Temperature", "@Avg_Temp"),
    ("Total_Cases", "@Total_Cases")
    ])

p = figure(title = "Area per square kilometer vs Total Cases", plot_height=500, plot_width=750,
           tools=[hover,"pan,reset,wheel_zoom"])

# Spell the factors out in ascending temperature order instead of taking
# `.unique()` of the column: unique() follows the row order of the CSV, so the
# band-to-color pairing was arbitrary, and it can contain None for rows with a
# missing temperature. With this fixed order: cold = blue, mid = purple,
# hot = red (same three palette colors as before).
temp_bands = ['< 10', '>= 10 & <= 20', '> 20']
colors = factor_cmap('Avg_TempFCT', palette=['#007BCC','#A569BD','#EC7063'],
                     factors=temp_bands)

# Only the final day of the series is plotted.
area_temp_snapshot = cases_global[cases_global['Date'] == pd.Timestamp("2020-06-30")]

p.circle('Area.sq.km','Total_Cases', legend='Avg_TempFCT',
         fill_alpha=0.5, size=10, fill_color=colors, line_color=colors,
         source=ColumnDataSource(area_temp_snapshot))
p.xaxis.axis_label = 'Area per square kilometer (thousands)'
p.yaxis.axis_label = 'Total COVID-19 Cases (06/30/2020)'
p.legend.title = 'Avg Temperature'
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

# Write the standalone HTML that the iframe after this cell embeds.
output_file("../assets/img/Bokeh/Global_Area_Temp.html")

show(p)

<iframe src="assets/img/Bokeh/Global_Area_Temp.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="525"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above shows a weak positive correlation between cases and area per square kilometer. An interesting note here is that the countries with the most cases are above average in temperature.***

***You may hover over the points for more detailed information.***

In [6]:
def assign_class_pop(row):
    """Return the population-band label for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'PopTotal' entry.

    Returns:
        "< 80,000" when PopTotal is below 80000, ">= 80,000 & <= 100,000" when
        it lies in [80000, 100000], and "> 100,000" in every other case
        (including values for which both comparisons are false, such as NaN).
    """
    population = row['PopTotal']
    if population < 80000:
        return "< 80,000"
    if 80000 <= population <= 100000:
        return ">= 80,000 & <= 100,000"
    return "> 100,000"
    
# Scatter plot of population density vs. total cases on 2020-06-30, colored by
# the population band computed with assign_class_pop.
hover = HoverTool(tooltips=[
    ("Country/Region", "@Country_Region"),
    ("Density", "@PopDensity{0.00}"),
    ("Population", "@PopTotal{0.00}"),
    ("Total_Cases", "@Total_Cases")
    ])

p = figure(title = "Population Density vs Total Cases", plot_height=500, plot_width=750,
           tools=[hover,"pan,reset,wheel_zoom"])

# Fixed, ascending factor order so each band always gets the same color;
# `.unique()` would follow the row order of the CSV and make the pairing
# arbitrary. Labels must match the strings returned by assign_class_pop.
pop_bands = ["< 80,000", ">= 80,000 & <= 100,000", "> 100,000"]
colors = factor_cmap('PopTotalFCT', palette=['#007BCC','#EC7063','#A569BD'],
                     factors=pop_bands)

# Build the band column on the plotted snapshot only, so the shared
# cases_global frame is never given (and then stripped of) a helper column.
density_snapshot = cases_global[cases_global['Date'] == pd.Timestamp("2020-06-30")]
density_snapshot = density_snapshot.assign(
    PopTotalFCT=[assign_class_pop(row) for _, row in density_snapshot.iterrows()])

p.circle('PopDensity','Total_Cases', legend='PopTotalFCT',
         fill_alpha=0.5, size=10, fill_color=colors, line_color=colors,
         source=ColumnDataSource(density_snapshot))
p.xaxis.axis_label = 'Population per square kilometer'
p.yaxis.axis_label = 'Total COVID-19 Cases (06/30/2020)'
p.legend.title = 'Total Population'
p.legend.location = "top_right"

# Write the standalone HTML that the iframe after this cell embeds.
output_file("../assets/img/Bokeh/Global_Density_Temp.html")

show(p)

<iframe src="assets/img/Bokeh/Global_Density_Temp.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="525"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above shows surprisingly little to no correlation between population density and cases. This could be because overall population has more impact than density.***

***You may hover over the points for more detailed information.***

In [7]:
# Bar chart of the ten countries with the most cumulative cases on 2020-06-30.
# Bar heights (and therefore the hover values) are expressed in thousands.
hover = HoverTool(tooltips=[
    ("Country/Region", "@Country_Region"),
    ("Total Cases (thousands)", "@Total_Cases{0.00}")
    ])

cases_global_bar = (cases_global[cases_global['Date'] == pd.Timestamp('2020-06-30')]
                    .sort_values('Total_Cases', ascending=False)
                    .head(10))
Countries = cases_global_bar['Country_Region']
cases_in_thousands = cases_global_bar['Total_Cases'] / 1000

# NOTE(review): `pal` is not used anywhere in this cell; kept in case a later
# cell relies on it -- confirm and remove if not.
pal = RdYlBu[10]
source = ColumnDataSource(data=dict(Country_Region=Countries,
                                    Total_Cases=cases_in_thousands))

# The categorical x axis takes the country names in descending-case order;
# the y range leaves 25 (thousand) of headroom above the tallest bar.
p = figure(x_range=list(Countries), y_range=(0, cases_in_thousands.max() + 25),
           plot_height=500, title="Total COVID-19 Cases by Top 10 Countries as of 6/30/20",
           toolbar_location=None, tools=[hover])

p.vbar(x='Country_Region', top='Total_Cases', width=0.9, source=source)

p.xgrid.grid_line_color = None
p.xaxis.axis_label = 'Country'
p.yaxis.axis_label = 'Total COVID-19 Cases (Thousands)'

# The file name is kept as-is because the iframe after this cell embeds it.
output_file("../assets/img/Bokeh/County_Density_Plot.html")
show(p)

<iframe src="assets/img/Bokeh/County_Density_Plot.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="525"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above shows the top ten countries by cases. Units are in thousands. The top three seem to be significantly higher than the rest.***

In [8]:
# Time series of daily new cases for the five countries with the largest
# summed Cases_Delta.

# Per-country, per-day aggregates. String aggregator names are used instead of
# np.sum / np.mean, which pandas treats as the same reductions but deprecates
# passing as callables.
time_series_chart = cases_global.groupby(['Country_Region', 'Date']).agg({
    'Cases_Delta': 'sum',
    'Total_Cases': 'sum',
    'PopMale': 'mean',
    'PopFemale': 'mean',
    'PopTotal': 'mean',
    'Avg_Temp': 'mean'
}).reset_index()

# Rank countries by summed Cases_Delta. Only that column is summed: summing
# the whole frame also hits the datetime and string columns, which newer
# pandas versions no longer silently drop.
Country_Region = (cases_global.groupby('Country_Region')['Cases_Delta']
                  .sum()
                  .reset_index()
                  .sort_values('Cases_Delta'))
# Column 0 is Country_Region; the last five rows are the five largest totals.
time_series_chart = time_series_chart[time_series_chart['Country_Region'].isin(Country_Region.iloc[-5:, 0])]

# NOTE(review): countries_pop is not used in this cell; kept in case a later
# cell relies on it -- confirm.
countries_pop = cases_global.groupby(['Country_Region']).agg({'PopTotal': 'mean'}).reset_index()

p = figure(plot_width=800, plot_height=500, x_axis_type="datetime",
          title="Daily COVID-19 Cases by 5 most infected Countries")

color_idx = 0
# Group by a plain column name (not a one-element list) so the key is the
# country string on every pandas version; list keys yield 1-tuples on newer
# pandas and would never match the Country_Region values.
for Country, country_ts in time_series_chart.groupby('Country_Region'):
    if country_ts.empty:
        continue

    # Drop the last date of each series, matching the 1/22/20 - 6/29/20 axis label.
    country_ts = country_ts.iloc[:-1]

    source = ColumnDataSource(data=dict(
        y=country_ts['Cases_Delta'],
        date=country_ts['Date'],
        Total_Cases=country_ts['Total_Cases']))
    line = p.line(x='date', y='y', line_width=2, color=Category10[10][color_idx],
                  alpha=0.8, legend=Country, source=source)
    # One hover tool per line so the tooltip can show that line's country name.
    # NOTE(review): newer Bokeh releases expect the formatter key to carry the
    # '@' prefix ('@date') -- confirm against the installed version.
    hover = HoverTool(tooltips=[
        ('Date', '@date{%F}'),
        ('Country', Country),
        ('New Cases', '@y'),
        ('Total Cases', '@Total_Cases')
    ],
    renderers=[line], formatters={'date': 'datetime'})
    p.add_tools(hover)
    # Step by two through the 10-color palette so the five lines are well separated.
    color_idx += 2

p.legend.location = "top_left"
p.legend.click_policy="hide"
p.xaxis.axis_label = 'Date (1/22/20 - 6/29/20)'
p.yaxis.axis_label = 'Daily COVID-19 cases by Country'

# Write the standalone HTML that the iframe after this cell embeds.
output_file("../assets/img/Bokeh/Time_Series_Global.html")

show(p)

 <iframe src="assets/img/Bokeh/Time_Series_Global.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="525"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

***The figure above shows the progression of daily new cases for the five countries with the most cases. It is interesting to see that the US has two peaks, whereas the other countries have a more consistent progression. Another point is the huge dip for Spain on April 23rd, which can be explained by a correction in the Johns Hopkins dataset for cases on the 24th.***