In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import unicodedata

In [2]:
# CDC "Cases in the U.S." page that the cells below download and scrape.
url = (
    "https://www.cdc.gov/coronavirus/2019-ncov/"
    "cases-updates/cases-in-us.html"
)

### Extract date

In [3]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel forever, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the real data.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content

# Parse the raw bytes with the lxml parser; `soup` is searched by the cells below.
soup = BeautifulSoup(content, 'lxml')

In [4]:
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

# Collect every text node that mentions an "...pdated ... 202x" date.
# find_all(string=...) is the current Beautiful Soup spelling of the legacy
# findAll(text=...) call and returns the same matches.
data = soup.find_all(string=re.compile(".*pdated .*202"))
data


['Last updated on June 26, 2020',
 'Level of community transmission by jurisdiction — last updated May 18, 2020',
 'Total number of cases by day — last updated April 28, 2020',
 'Number of cases by source of exposure — last updated April 16, 2020',
 'Number of cases from Wuhan, China and the Diamond Princess cruise — last updated April 16, 2020',
 'Number of cases by illness start date — last updated April 15, 2020']

In [5]:
# Parse the date out of the first "updated" text node and normalise it to
# YYYY-MM-DD. Fall back to today's UTC date when the page has no such node OR
# when the node's wording does not fit the expected "... <Month> <day>, 202x"
# shape (re.search returns None in that case, which used to raise
# AttributeError on m.group).
m = None
if len(data) > 0:
    # Raw string: "\w" and "\d" are invalid escape sequences in a plain literal.
    m = re.search(r".*pdated .* (\w+ \d+, 202\d)", _removeNonAscii(data[0]))

if m is not None:
    data_date_str = m.group(1)
    data_date = datetime.strptime(data_date_str, '%B %d, %Y').strftime('%Y-%m-%d')
else:
    data_date = datetime.utcnow().strftime('%Y-%m-%d')
data_date

'2020-06-26'

### Extract data and add it to a DataFrame

In [6]:
# Find the text node(s) carrying both headline counters.
# The pattern is a raw string because "\d" is an invalid escape sequence in a
# plain literal; find_all(string=...) is the current spelling of findAll(text=...).
data = soup.find_all(string=re.compile(r"Total Cases = (\d+).*Total Deaths = (\d+)"))
data

['[{"@context":"https:\\/\\/schema.org","@type":"SpecialAnnouncement","name":"Cases in the U.S.","text":"<h3>U.S. At A Glance<\\/h3>\\r\\n<ul>\\r\\n \\t<li>Total Cases = 2,414,870\\r\\n<ul>\\r\\n \\t<li>New Cases = 40,588<\\/li>\\r\\n<\\/ul>\\r\\n<\\/li>\\r\\n \\t<li>Total Deaths = 124,325\\r\\n<ul>\\r\\n \\t<li>New Deaths = 2,516<\\/li>\\r\\n<\\/ul>\\r\\n<\\/li>\\r\\n<\\/ul>","category":"https:\\/\\/www.wikidata.org\\/wiki\\/Q81068910","datePosted":"2020-06-26T12:30","spatialCoverage":{"type":"Country","name":"USA"},"diseasePreventionInfo":"https:\\/\\/www.cdc.gov\\/coronavirus\\/2019-ncov\\/prevent-getting-sick\\/index.html","diseaseSpreadStatistics":"https:\\/\\/www.cdc.gov\\/coronavirus\\/2019-ncov\\/covid-data\\/covidview\\/index.html"}]']

In [7]:

# Pull both headline counters (digits with thousands separators) out of the
# first matching text node. Fail with an explicit message rather than an opaque
# IndexError / AttributeError when the page no longer carries the expected text.
if not data:
    raise ValueError("No 'Total Cases = ... Total Deaths = ...' text found on the page")
# Raw string: "\d" is an invalid escape sequence in a plain string literal.
m = re.search(r"Total Cases = ([,\d]+).*Total Deaths = ([,\d]+)", _removeNonAscii(data[0]))
if m is None:
    raise ValueError("Could not parse total cases / total deaths from the page text")


In [8]:
# First capture group holds the case count; drop the commas before converting.
total_cases = int("".join(m.group(1).split(",")))
total_cases

2414870

In [9]:
# Second capture group holds the death count; drop the commas before converting.
total_deaths = int("".join(m.group(2).split(",")))
total_deaths

124325

In [10]:
# One-row frame for this scrape: the page date plus both parsed counters.
df = pd.DataFrame([
    {
        'datetime': data_date,
        'total_cases': total_cases,
        'total_deaths': total_deaths,
    }
])

In [11]:
# Display the freshly scraped one-row frame.
df

Unnamed: 0,datetime,total_cases,total_deaths
0,2020-06-26,2414870,124325


### Merge with previously saved data and save to file


In [12]:
from pathlib import Path
csv_file = '../data/us-cdc-total-cases-deaths.csv'

if Path(csv_file).exists():
    # Read out the previously saved history and stack it below today's row.
    old_df = pd.read_csv(csv_file)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent and keeps each frame's own index.
    df = pd.concat([df, old_df])

# Keep one row per date. The freshly scraped row comes first in the frame, so
# the default keep="first" lets it win over an older saved row for the same date.
df = df.drop_duplicates(subset=['datetime'])
df

Unnamed: 0,datetime,total_cases,total_deaths
0,2020-06-26,2414870,124325
0,2020-05-07,1219066,73297
1,2020-05-08,1248040,75477
2,2020-05-09,1274036,77034
3,2020-05-10,1300696,78771
4,2020-05-11,1324488,79756
5,2020-05-12,1342594,80820
6,2020-05-13,1364061,82246
7,2020-05-14,1384930,83947
8,2020-05-15,1412121,85990


In [13]:
# Order the history by its date string, then overwrite the CSV without the index column.
df = df.sort_values(by=['datetime'])
df.to_csv(path_or_buf=csv_file, index=False)

In [14]:
# Display the merged, date-sorted history that was just written to csv_file.
df

Unnamed: 0,datetime,total_cases,total_deaths
0,2020-05-07,1219066,73297
1,2020-05-08,1248040,75477
2,2020-05-09,1274036,77034
3,2020-05-10,1300696,78771
4,2020-05-11,1324488,79756
5,2020-05-12,1342594,80820
6,2020-05-13,1364061,82246
7,2020-05-14,1384930,83947
8,2020-05-15,1412121,85990
9,2020-05-16,1435098,87315
