In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import unicodedata

In [2]:
# Display the current local timestamp (the value is not stored anywhere);
# presumably kept as a visible record of when the notebook was last run.
datetime.now()

datetime.datetime(2020, 2, 14, 6, 9, 8, 730461)

In [3]:
# Page to scrape: the health.gov.au "coronavirus update at a glance" news item.
url = 'https://www.health.gov.au/news/coronavirus-update-at-a-glance'

# Bound the request with a timeout so a stalled connection cannot hang the
# notebook, and fail loudly on an HTTP error status instead of handing an
# error page to the parsing cells.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content

In [4]:
# Name the parser explicitly: letting BeautifulSoup guess emits a warning and
# can yield different trees depending on which parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')

# Text nodes containing "As " followed by more text (the "As at ..." summary line).
# find_all(string=...) is the current spelling of the legacy findAll(text=...).
as_at_strings = soup.find_all(string=re.compile(r'As .+'))
# Text nodes containing "<something> in <state>" for the four states tracked here.
# The capture groups of the old pattern were unused, so they are dropped.
state_strings = soup.find_all(
    string=re.compile(r'.+ in (?:Queensland|Victoria|New South Wales|South Australia)')
)

# Join every matching text node into a single string for the later regex cells.
data = ' '.join(as_at_strings + state_strings)
data

'As at 06:00\xa0hrs on 14\xa0February 2020, we have confirmed 15\xa0cases of coronavirus (COVID-19) in Australia: 5 in Queensland 4 in New South Wales 4 in Victoria 2 in South Australia'

In [5]:
# NFKD compatibility normalization replaces the non-breaking spaces (\xa0) seen
# in the scraped text with ordinary spaces, so the regexes in the following
# cells can match on a plain ' '.
cases_str = unicodedata.normalize("NFKD", data)

In [6]:
# Display the normalized text to check it before extracting values from it.
cases_str

'As at 06:00 hrs on 14 February 2020, we have confirmed 15 cases of coronavirus (COVID-19) in Australia: 5 in Queensland 4 in New South Wales 4 in Victoria 2 in South Australia'

In [7]:
# Extract the report date ("<day> <month name> 202<digit>") from the "As ..." sentence.
# The pattern is a raw string so \d and \w are regex escapes rather than
# invalid (deprecated) string-literal escapes.
m = re.search(r'As .+ (\d+ \w+ 202\d)', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the report date in the scraped text')
date_time_str = m.group(1)
date_time_str

'14 February 2020'

In [8]:
# Parse the "day month-name year" text, then re-emit it as 'YYYY-MM-DD'.
parsed_date = datetime.strptime(date_time_str, '%d %B %Y')
data_date_time = parsed_date.strftime('%Y-%m-%d')
data_date_time

'2020-02-14'

In [9]:
# National total: the number preceding "cases ... in Australia".
# Raw string: '\d' in an ordinary string literal is an invalid escape sequence.
m = re.search(r'(\d+) cases .+ in Australia', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the Australia case count in the scraped text')
australia = int(m.group(1))


In [10]:
# New South Wales count: the number preceding "in New South Wales".
# Raw string: '\d' in an ordinary string literal is an invalid escape sequence.
m = re.search(r'(\d+) in New South Wales', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the New South Wales case count in the scraped text')
nsw = int(m.group(1))


In [11]:
# Victoria count: the number preceding "in Victoria".
# Raw string: '\d' in an ordinary string literal is an invalid escape sequence.
m = re.search(r'(\d+) in Victoria', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the Victoria case count in the scraped text')
victoria = int(m.group(1))

In [12]:
# South Australia count: the number preceding "in South Australia".
# Raw string: '\d' in an ordinary string literal is an invalid escape sequence.
m = re.search(r'(\d+) in South Australia', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the South Australia case count in the scraped text')
sa = int(m.group(1))

In [13]:
# Queensland count: the number preceding "in Queensland".
# Raw string: '\d' in an ordinary string literal is an invalid escape sequence.
m = re.search(r'(\d+) in Queensland', cases_str)
if m is None:
    # Fail with a readable message instead of AttributeError on None.group(...)
    raise ValueError('Could not find the Queensland case count in the scraped text')
qld = int(m.group(1))

### Dataframe

In [14]:
# Single-row frame (index 0) holding the scraped snapshot: the report date,
# the national total, and one column per state.
df = pd.DataFrame(
    {
        'date': data_date_time,
        'australia': australia,
        'nsw': nsw,
        'victoria': victoria,
        'sa': sa,
        'qld': qld,
    },
    index=[0],
)

In [15]:
# Display the single-row frame built in the previous cell.
df

Unnamed: 0,date,australia,nsw,victoria,sa,qld
0,2020-02-14,15,4,4,2,5


### Save to file

In [16]:
from pathlib import Path
csv_file = 'data/australia-cases.csv'

if Path(csv_file).exists():
    # Merge with the previously saved history. DataFrame.append was deprecated
    # in pandas 1.4 and removed in 2.0, so concatenate instead. The freshly
    # scraped row stays first so that drop_duplicates (keep='first' by default)
    # prefers it over an older row for the same date.
    old_df = pd.read_csv(csv_file)
    df = pd.concat([df, old_df])
# One row per date; rebinding instead of inplace=True keeps the cell easy to re-run.
df = df.drop_duplicates(subset=['date'])

In [17]:
# Keep the history in chronological order; the 'YYYY-MM-DD' date strings sort
# correctly as plain text.
df = df.sort_values(by=['date'])
# to_csv does not create missing parent folders, so make sure the output
# directory exists before the first save.
Path(csv_file).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_file, index=False)
df

Unnamed: 0,date,australia,nsw,victoria,sa,qld
0,2020-02-04,12,4,4,2,2
1,2020-02-05,13,4,4,2,3
2,2020-02-06,14,4,4,2,4
3,2020-02-07,15,4,4,2,5
4,2020-02-08,15,4,4,2,5
5,2020-02-09,15,4,4,2,5
6,2020-02-10,15,4,4,2,5
7,2020-02-11,15,4,4,2,5
8,2020-02-12,15,4,4,2,5
9,2020-02-13,15,4,4,2,5


### Plot graph

In [18]:
# TODO once there is more data