In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

In [12]:
# CDC page that publishes the US case-count table scraped by this notebook.
url = 'https://www.cdc.gov/coronavirus/2019-ncov/cases-in-us.html'

### Extract date

In [13]:
# Download the page. A timeout keeps the notebook from hanging on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content

# Parse the raw bytes with the lxml parser.
soup = BeautifulSoup(content, 'lxml')

In [14]:
# Collect every text node that looks like "Updated <something> 202x".
# The pattern is a raw string so that `\d` reaches the regex engine untouched
# (a plain string triggers an invalid-escape warning on recent Pythons).
# `find_all(string=...)` is the current spelling of `findAll(text=...)`.
data = soup.find_all(string=re.compile(r"Updated .+ 202\d"))
data

['Updated February 28, 2020']

In [15]:
# Turn the "Updated <Month> <day>, <year>" banner into an ISO date string.
# Fail with an explicit message if the page layout changed, instead of an
# IndexError on data[0] or an AttributeError on a None match.
if not data:
    raise ValueError('No "Updated <date>" text was found on the page')

m = re.search(r"Updated (\w+ \d+, 202\d)", data[0])
if m is None:
    raise ValueError('Could not parse a date from: {!r}'.format(data[0]))

data_date_str = m.group(1)
# e.g. 'February 28, 2020' -> '2020-02-28'
data_date = datetime.strptime(data_date_str, '%B %d, %Y').strftime('%Y-%m-%d')
data_date

'2020-02-28'

### Extract data and load it into a DataFrame

In [16]:
# Let pandas fetch the page and parse every HTML table it contains
# into a list of DataFrames.
dfs = pd.read_html(io=url)

In [17]:
# Stop here when the page yielded no tables at all.
if not dfs:
    raise Exception('There is no dataframes detected')

In [20]:
# Work with the first table that was parsed from the page, and display it.
# NOTE(review): assumes the case-count table is the first one on the page — confirm.
df = dfs[0]
df

Unnamed: 0.1,Unnamed: 0,Confirmed,Presumptive Positive**
0,Travel-related,12,0.0
1,Person-to-person spread,3,0.0
2,Total confirmed cases,15,0.0
3,Total tested,459,


In [28]:
# Promote the table's first column (the row labels) to the index.
# The frame is derived from the parsed table `dfs[0]` rather than from `df`
# itself, so re-running this cell always yields the same result; the
# self-referential `df = df.set_index(df.columns[0])` form would promote a
# different column on every re-run.
raw_table = dfs[0]
col0 = raw_table.columns[0]
df = raw_table.set_index(col0)
df


Unnamed: 0_level_0,Confirmed,Presumptive Positive**
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
Travel-related,12,0.0
Person-to-person spread,3,0.0
Total confirmed cases,15,0.0
Total tested,459,


In [29]:
# Normalise the column labels: lowercase them and strip every
# non-alphanumeric character (e.g. 'Presumptive Positive**' -> 'presumptivepositive').
# The pattern is a raw string so `\W` is not treated as a string escape, and
# str(x) lets non-string labels through instead of raising a TypeError.
df.columns = df.columns.map(lambda x: re.sub(r'\W+', '', str(x).lower()))

# Put the scrape date in the first column. On a re-run the column already
# exists and df.insert would raise ValueError, so overwrite it in that case.
if 'date' in df.columns:
    df['date'] = data_date
else:
    df.insert(0, 'date', data_date)


In [30]:
# Display the frame after the column clean-up and the added date column.
df

Unnamed: 0_level_0,date,confirmed,presumptivepositive
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Travel-related,2020-02-28,12,0.0
Person-to-person spread,2020-02-28,3,0.0
Total confirmed cases,2020-02-28,15,0.0
Total tested,2020-02-28,459,


### Save to file

In [31]:
from pathlib import Path
csv_file = 'data/cdc-us-cases.csv'

# Merge the fresh scrape with the history already on disk, if there is one.
# The fresh rows go first so they win over stored rows for the same date
# in the de-duplication below.
if Path(csv_file).exists():
    # read out the old data
    old_df = pd.read_csv(csv_file)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, old_df])

# Keep only the first row seen for each date.
# NOTE(review): the scraped frame holds several rows (one per category) that
# all share the same date, so this keeps just the first of them — confirm
# that this is intended, or reshape to one row per date before merging.
df = df.drop_duplicates(subset=['date'])

In [32]:
# Store the history in chronological order.
df = df.sort_values(by=['date'])

# Create the output directory on a first run; to_csv does not create it.
Path(csv_file).parent.mkdir(parents=True, exist_ok=True)

# index=False: only the columns are written, the index is dropped.
df.to_csv(csv_file, index=False)

### Plot graph

In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

# Register pandas' date converters with matplotlib.
register_matplotlib_converters()

png_file = 'images/cdc-us-cases.png'

# Parse the ISO date strings and use the result as the frame's index;
# the original 'date' column is kept as-is.
parsed_dates = pd.to_datetime(df['date'], format="%Y-%m-%d")
df = df.assign(datetime_idx=parsed_dates).set_index('datetime_idx')
df

Unnamed: 0_level_0,date,confirmed,presumptivepositive,negative,pending,persontopersonspread,positive,total,totalconfirmedcases,totaltested,travelrelated
datetime_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-31,2020-01-31,,,114.0,121.0,,6.0,241.0,,,
2020-02-03,2020-02-03,,,167.0,82.0,,11.0,260.0,,,
2020-02-05,2020-02-05,,,206.0,76.0,,11.0,293.0,,,
2020-02-07,2020-02-07,,,225.0,100.0,,12.0,337.0,,,
2020-02-10,2020-02-10,,,318.0,68.0,,12.0,398.0,,,
2020-02-12,2020-02-12,,,347.0,66.0,,14.0,427.0,,,
2020-02-14,2020-02-14,,,347.0,81.0,,15.0,443.0,,,
2020-02-17,2020-02-17,,,392.0,60.0,,15.0,467.0,,,
2020-02-19,2020-02-19,,,412.0,52.0,,15.0,479.0,,,
2020-02-21,2020-02-21,,,,,2.0,,,14.0,414.0,12.0
