In [1]:
import re
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# CDC page that is scraped below for both the "Updated <date>" stamp and the
# case-count table.
url = 'https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/cases-in-us.html'

### Extract date

In [3]:
# Download the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops the run on an HTTP error instead of
# handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content
soup = BeautifulSoup(content, 'lxml')

In [4]:
# Collect every text node that looks like the page's "Updated <date>" stamp.
# find_all / string= are the current spellings of the deprecated
# findAll / text= API and select the same nodes.
data = soup.find_all(string=re.compile("Updated .*202"))
data

['Updated April 14, 2020']

In [5]:
# Parse the first "Updated <Month> <day>, <year>" stamp and normalise it to an
# ISO date string (YYYY-MM-DD).
if not data:
    # Fail with a clear message rather than an IndexError on data[0].
    raise ValueError('No "Updated <date>" text was found on the page')

# Raw string: "\w" and "\d" in a plain literal are invalid escape sequences.
m = re.search(r"Updated (\w+ \d+, 202\d)", data[0])
if m is None:
    # Fail with a clear message rather than an AttributeError on m.group.
    raise ValueError('Could not parse an update date from {!r}'.format(data[0]))

data_date_str = m.group(1)
data_date = datetime.strptime(data_date_str, '%B %d, %Y').strftime('%Y-%m-%d')
data_date

'2020-04-14'

### Extract data and load it into a DataFrame

In [6]:
# Have pandas fetch the page and parse its HTML tables into a list of
# DataFrames.
dfs = pd.read_html(io=url)

In [7]:
# Abort when no table was parsed from the page; nothing below can run.
if not dfs:
    raise Exception('There is no dataframes detected')

In [8]:
# Only the first table parsed from the page is used from here on.
df = dfs[0]
df

Unnamed: 0,0,1
0,Travel-related,6814
1,Close contact,14728
2,Under investigation,557463
3,Total cases,579005


In [9]:
# Use the first column (the case-category labels) as the index, then transpose
# so that each category becomes its own column.
col0 = df.columns[0]
df = df.set_index(keys=col0).transpose()
df



Unnamed: 0,Travel-related,Close contact,Under investigation,Total cases
1,6814,14728,557463,579005


In [10]:
# Normalise the column labels: lowercase them and strip every non-word
# character (e.g. 'Travel-related' -> 'travelrelated').
# The pattern is a raw string because a backslash-W in a plain literal is an
# invalid escape sequence; str(x) keeps the mapping from raising on a label
# that is not already a string.
df.columns = df.columns.map(lambda x: re.sub(r'\W+', '', str(x).lower()))

# Put the report date in the first column. Overwrite instead of inserting when
# the column already exists, so re-running this cell does not raise ValueError.
if 'date' in df.columns:
    df['date'] = data_date
else:
    df.insert(0, 'date', data_date)


In [11]:
df

Unnamed: 0,date,travelrelated,closecontact,underinvestigation,totalcases
1,2020-04-14,6814,14728,557463,579005


### Save to file

In [12]:
# CSV file holding the accumulated history (also written by the next cell).
csv_file = '../data/cdc-us-cases.csv'

if Path(csv_file).exists():
    # Read the previously saved rows.
    old_df = pd.read_csv(csv_file)
    # DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
    # supported replacement. The freshly scraped row goes first so that, on a
    # date collision, drop_duplicates (keep='first') retains the new values.
    # sort=False makes the column order explicit (current frame's columns
    # first) instead of relying on a pandas-version-dependent default.
    df = pd.concat([df, old_df], sort=False)

# Keep a single row per report date.
df = df.drop_duplicates(subset=['date'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [13]:
# Order the rows by the 'date' column before saving. The scraped dates are
# formatted as YYYY-MM-DD, which sorts chronologically as plain text.
df = df.sort_values(by=['date'])
df.to_csv(csv_file, index=False)

# Display the frame last: a bare expression is only rendered when it is the
# final statement of the cell.
df

### Plot graph

In [14]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# from pandas.plotting import register_matplotlib_converters
# register_matplotlib_converters()

# png_file = '../images/cdc-us-cases.png'

# # convert to pd.datetime
# df['datetime_idx'] = pd.to_datetime(df['date'], format="%Y-%m-%d")

# df.set_index('datetime_idx', inplace=True)
# df