# Enhancement

## 1. Correlation analysis

In the following we will create a correlation matrix to analyze if there exists a correaltion between the attributes of interest, including case fatality rate (CFR), infection rate over population (IR), population density (PD), and Tendency to wear mask (TWM). For this analysis, we will focus on the latest state-level data.

Upsert the MongoDB database

In [2]:
# !python3 data_acquire.py

In [21]:
import pandas as pd
import numpy as np

from utils import fip_to_county, fip_to_state, state_map_dict

In [3]:
from database import fetch_all_db_as_df

df_dict = fetch_all_db_as_df()

2020-12-06 15:47:54,777 [fetch_all_db]: 320 documents read from the db
2020-12-06 15:47:54,825 [fetch_all_db]: 15304 documents read from the db
2020-12-06 15:47:54,835 [fetch_all_db]: 3142 documents read from the db
2020-12-06 15:47:54,836 [fetch_all_db]: 55 documents read from the db
2020-12-06 15:47:54,869 [fetch_all_db]: 1877 documents read from the db
2020-12-06 15:47:54,874 [fetch_all_db]: 1933 documents read from the db
2020-12-06 15:47:54,876 [fetch_all_db]: 55 documents read from the db


### 1.1 Extract the latest state-level data 

In [4]:
## Extract the latest Data
df = df_dict['covid-us-state']
df.fips = df.fips.apply(lambda x: str(x).zfill(2))
df = df[df.date == max(df.date)]
df = df.drop(columns='date', axis=1).reset_index(drop=True)
df.head()

Unnamed: 0,state,fips,cases,deaths
0,Alabama,1,267589,3877
1,Alaska,2,36083,136
2,Arizona,4,361186,6940
3,Arkansas,5,169382,2620
4,California,6,1343145,19876


###  1.2 Read population and area data

In [5]:
state_pop = df_dict['state-population']
state_pop.head()

Unnamed: 0,state,total
0,Alabama,4903185
1,Alaska,731545
2,Arizona,7278717
3,Arkansas,3017804
4,California,39512223


In [6]:
## Area in sq. mile
state_area =  df_dict['state-area']
state_area.head()

Unnamed: 0,state,area
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


### 1.3 Aggregate the mask-use-by-county data to get the state-level mask-use data

*Note*: the mask use data only includes 51 states, so will the correlation ananlysis.

In [24]:
mask_use = df_dict['mask-use-by-county']
mask_use.countyfp = mask_use.countyfp.apply(lambda x: str(x).zfill(5))
mask_use['wear_mask_prob'] = 0.25 * mask_use['rarely'] + 0.5 * mask_use['sometimes'] + \
                0.75 * mask_use['frequently'] + 1.0 * mask_use['always']
mask_use['state_code'] = mask_use.apply(lambda x: fip_to_state(x.countyfp), axis=1)
mask_use['county'] = mask_use.apply(lambda x: fip_to_county(x.countyfp), axis=1)

In [63]:
df_agg = mask_use.groupby('state_code').agg(['mean'])
df_agg.columns = ["_".join(x) for x in df_agg.columns.ravel()]
df_agg.reset_index(inplace=True)
df_agg.rename(columns={'wear_mask_prob_mean' : 'wear_mask_prob'}, inplace=True)
df_agg = df_agg[['state_code', 'wear_mask_prob']]
df_agg.drop(df_agg[df_agg['state_code'] == 'N/A'].index, inplace = True)
df_agg.drop(df_agg[df_agg['state_code'] == 'DC'].index, inplace = True)
df_agg['state'] = df_agg['state_code'].apply(lambda x: state_map_dict[x])
df_agg = df_agg[['state', 'wear_mask_prob']]
df_agg.head()

Unnamed: 0,state,wear_mask_prob
0,Alaska,0.778924
1,Alabama,0.730399
2,Arkansas,0.71933
3,Arizona,0.826133
4,California,0.873116


### 1.4 Merge with Covid Data

In [64]:
from functools import reduce
data_frames = [df, state_pop, state_area, df_agg]

df_merged = reduce(lambda left, right: pd.merge(left,right,on=['state'],
                                            how='inner'), data_frames)
df_merged.head()

Unnamed: 0,state,fips,cases,deaths,total,area,wear_mask_prob
0,Alabama,1,267589,3877,4903185,52423,0.730399
1,Alaska,2,36083,136,731545,656425,0.778924
2,Arizona,4,361186,6940,7278717,114006,0.826133
3,Arkansas,5,169382,2620,3017804,53182,0.71933
4,California,6,1343145,19876,39512223,163707,0.873116


### 1.5 Compute Features of Interest

In [65]:
df_merged['CFR'] = df_merged['deaths'] / df_merged['cases'] 
df_merged['IR'] = df_merged['cases'] / df_merged['total']
df_merged['PD'] = df_merged['total'] / df_merged['area']
df_merged['WMP'] = df_merged['wear_mask_prob']
df_merged.head()

Unnamed: 0,state,fips,cases,deaths,total,area,wear_mask_prob,CFR,IR,PD,WMP
0,Alabama,1,267589,3877,4903185,52423,0.730399,0.014489,0.054575,93.531179,0.730399
1,Alaska,2,36083,136,731545,656425,0.778924,0.003769,0.049324,1.114438,0.778924
2,Arizona,4,361186,6940,7278717,114006,0.826133,0.019214,0.049622,63.845034,0.826133
3,Arkansas,5,169382,2620,3017804,53182,0.71933,0.015468,0.056128,56.744838,0.71933
4,California,6,1343145,19876,39512223,163707,0.873116,0.014798,0.033993,241.359398,0.873116


In [78]:
df_ana = df_merged[['state', 'CFR', 'IR', 'PD', 'WMP']]
df_ana[['CFR', 'IR', 'PD', 'WMP']] = np.round(df_ana[['CFR', 'IR', 'PD', 'WMP']], 3)

### 1.6 Feature Correlation 

Scatterplot matrix

In [152]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# fig = px.scatter_matrix(df_ana, dimensions=['CFR', 'IR', 'PD', 'WMP'],
#                        hover_name='state')

# fig.update_layout(
#     title='Scatter Matrix of Covid Features',
#     width=600,
#     height=600,
# )

# fig.show()

fig = go.Figure(data=go.Splom(
                dimensions=[dict(label='Fatality rate',
                                 values=df_ana['CFR']),
                            dict(label='Infection rate',
                                 values=df_ana['IR']),
                            dict(label='Population density',
                                 values=df_ana['PD']),
                            dict(label='Wear mask prob.',
                                 values=df_ana['WMP'])],
                text=df_ana['state'],
#                 hovertemplate="%{x}, %{y}",
                marker=dict(showscale=False, # colors encode categorical variables
                            line_color='white', line_width=0.5),
                showupperhalf=False,
                ))

fig.update_layout(
    title='Scatter Matrix of Covid Features',
    dragmode='select',
    width=720,
    height=720,
    hovermode='closest',
)

fig.show()

**Correlation matrix**

In [139]:
df_corr = df_ana[['CFR', 'IR', 'PD', 'WMP']].corr()
df_corr

Unnamed: 0,CFR,IR,PD,WMP
CFR,1.0,-0.325502,0.77004,0.620835
IR,-0.325502,1.0,-0.238312,-0.693229
PD,0.77004,-0.238312,1.0,0.62557
WMP,0.620835,-0.693229,0.62557,1.0


In [155]:
fig = go.Figure(data=go.Heatmap(z=df_corr, 
                                x=['CFR', 'IR', 'PD', 'WMP'], 
                                y=['CFR', 'IR', 'PD', 'WMP'],
                                colorscale='Blues',
                               hovertemplate=" Corr(%{x}, %{y}) = %{z:.2f}"),
               )
fig.update_layout(
    title='Correlation Matrix',
    height=500,
    width=500,
    )
fig.show()

# fig = px.imshow(df_corr, color_continuous_scale=px.colors.sequential.Blues,
#                title='Feature Correlation Matrix', height=600, width=600,
#                )

# fig.show()

In [153]:
from plotly.subplots import make_subplots
figall = make_subplots(rows=1, cols=2)
fig1 = go.Splom(
                dimensions=[dict(label='Fatality rate',
                                 values=df_ana['CFR']),
                            dict(label='Infection rate',
                                 values=df_ana['IR']),
                            dict(label='Population density',
                                 values=df_ana['PD']),
                            dict(label='Wear mask prob.',
                                 values=df_ana['WMP'])],
                text=df_ana['state'],
#                 hovertemplate="%{x}, %{y}",
                marker=dict(showscale=False, # colors encode categorical variables
                            line_color='white', line_width=0.5),
                showupperhalf=False,
                )
fig2 = go.Heatmap(z=df_corr, 
                                x=['CFR', 'IR', 'PD', 'WMP'], 
                                y=['CFR', 'IR', 'PD', 'WMP'],
                                colorscale='Blues',
                               hovertemplate=" Corr(%{x}, %{y}) = %{z:.2f}")
figall.add_trace(fig1, row=1, col=1)
figall.add_trace(fig2, row=1, col=2)
figall.show()

ValueError: Trace type 'splom' is not compatible with subplot type 'xy'
at grid position (1, 1) 

See the docstring for the specs argument to plotly.subplots.make_subplots 
for more information on subplot types

As we can see from the scatter matrix and correlation matrix, the Infection Rate and Wear Mask Probability has a relatively strong negative correlation. In addition, the Case Fatality Rate correlates strongly with the population density.