In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

import gc

import warnings
warnings.simplefilter("ignore")
plt.style.use("classic")


In [2]:
data = pd.read_csv('./COVID-19_Case_Surveillance_Public_Use_Data.csv', low_memory=False)
data.head(10)

Unnamed: 0.1,Unnamed: 0,Report Date,Pos Test Date,Onset Date,Current Status,Sex,Age Group,Race,Hospital Y/N,ICU Admission Y/N,Death Status,Underlying Disease
0,0,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Unknown,No,No
1,1,2020/11/14,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
2,2,2020/11/19,2020/11/10,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
3,3,2020/11/14,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,No,Missing
4,4,2020/11/13,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,Yes
5,5,2020/11/17,2020/11/10,2020/11/08,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing
6,6,2020/11/14,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Missing,Missing,Missing
7,7,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing
8,8,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Unknown,Unknown,No,Unknown
9,9,2020/11/17,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,Missing,Missing


In [3]:
del data['Unnamed: 0']
data.columns = ['Report Date','Pos Test Date', 'Onset Date','Current Status','Sex','Age Group','Race','Hospital Y/N','ICU Admission Y/N','Death Status','Underlying Disease']
data

Unnamed: 0,Report Date,Pos Test Date,Onset Date,Current Status,Sex,Age Group,Race,Hospital Y/N,ICU Admission Y/N,Death Status,Underlying Disease
0,2020/11/10,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Unknown,No,No
1,2020/11/14,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
2,2020/11/19,2020/11/10,2020/11/09,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
3,2020/11/14,2020/11/10,,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,No,Missing
4,2020/11/13,2020/11/10,2020/11/10,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...
8405074,2020/10/22,,,Probable Case,Missing,30 - 39 Years,Unknown,No,Unknown,No,Missing
8405075,2020/10/25,,,Laboratory-confirmed case,Missing,30 - 39 Years,Unknown,Missing,Missing,Missing,Missing
8405076,2020/07/18,,,Laboratory-confirmed case,Missing,30 - 39 Years,Unknown,Missing,Missing,Missing,Missing
8405077,2020/11/19,,,Laboratory-confirmed case,Missing,30 - 39 Years,Unknown,Missing,Missing,Missing,Missing


## What is the distribution of age groups in this study?

### Relative Frequency of Age Groups

In [4]:
age_freq = data.groupby(['Age Group']).count()['Report Date']
print(age_freq)
print(sum(age_freq))

Age Group
0 - 9 Years       299040
10 - 19 Years     841450
20 - 29 Years    1635264
30 - 39 Years    1372623
40 - 49 Years    1267350
50 - 59 Years    1227493
60 - 69 Years     863260
70 - 79 Years     483689
80+ Years         382869
Unknown            31952
Name: Report Date, dtype: int64
8404990


In [5]:
freq_ages = (age_freq / sum(age_freq)) * 100 + '%'


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')

In [None]:
data.describe()

In [None]:
data.groupby(['Age Group']).count()['Report Date']

In [None]:
values = data['Age Group'].value_counts()
names = ['20 - 29 Years', '30 - 39 Years', '40 - 49 Years', '50 - 59 Years',
         '60 - 69 Years', '10 - 19 Years', '70 - 79 Years', '80+ Years',
         '0 - 9 Years', 'Unknown']

fig = px.pie(
    names=names,
    values=values,
    title='Age Distribution',
    color_discrete_sequence=px.colors.diverging.RdYlBu,
)
fig.show()

In [None]:
values = data['Age Group'].value_counts().tolist()
names = ['20 - 29 Years', '30 - 39 Years', '40 - 49 Years', '50 - 59 Years',
         '60 - 69 Years', '10 - 19 Years', '70 - 79 Years', '80+ Years',
         '0 - 9 Years', 'Unknown']

fig = px.bar(
    x=names,
    y=values,
    title='Age Distribution',
    labels={
        'x':'Age Ranges',
        'y':'Number of Patients'
    },
    color=values
)
fig.show()

## How many confirmed cases are there?

In [None]:
data.groupby(['Current Status']).count()['Report Date']

In [None]:
values = data['Current Status'].value_counts().tolist()
names = ['Confirmed','Probable']

fig = px.pie(
    names=names,
    values=values,
    title='Confirmed Cases vs Probable Cases',
    color_discrete_sequence=px.colors.sequential.Rainbow,
)
fig.show()

In [None]:
confirmed = 0.941
probable = 0.0589

confirmed_cases = round(confirmed * 8405079)
confirmed_cases

## How many individuals died because of Covid-19?

In [None]:
data.groupby(['Death Status']).count()['Report Date']

In [None]:
values = data['Death Status'].value_counts().tolist()
names = ['Missing','No','Unknown','Yes']

fig = px.pie(
    names=names,
    values=values,
    title='Death Distribution',
    color_discrete_sequence=px.colors.diverging.Spectral,
)
fig.show()

In [None]:
data.describe()